iflow-mcp_janspoerer-mcp_browser_use 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/METADATA +26 -0
  2. iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/RECORD +50 -0
  3. iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/WHEEL +5 -0
  4. iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/entry_points.txt +2 -0
  5. iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/licenses/LICENSE +201 -0
  6. iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/top_level.txt +1 -0
  7. mcp_browser_use/__init__.py +2 -0
  8. mcp_browser_use/__main__.py +1347 -0
  9. mcp_browser_use/actions/__init__.py +1 -0
  10. mcp_browser_use/actions/elements.py +173 -0
  11. mcp_browser_use/actions/extraction.py +864 -0
  12. mcp_browser_use/actions/keyboard.py +43 -0
  13. mcp_browser_use/actions/navigation.py +73 -0
  14. mcp_browser_use/actions/screenshots.py +85 -0
  15. mcp_browser_use/browser/__init__.py +1 -0
  16. mcp_browser_use/browser/chrome.py +150 -0
  17. mcp_browser_use/browser/chrome_executable.py +204 -0
  18. mcp_browser_use/browser/chrome_launcher.py +330 -0
  19. mcp_browser_use/browser/chrome_process.py +104 -0
  20. mcp_browser_use/browser/devtools.py +230 -0
  21. mcp_browser_use/browser/driver.py +322 -0
  22. mcp_browser_use/browser/process.py +133 -0
  23. mcp_browser_use/cleaners.py +530 -0
  24. mcp_browser_use/config/__init__.py +30 -0
  25. mcp_browser_use/config/environment.py +155 -0
  26. mcp_browser_use/config/paths.py +97 -0
  27. mcp_browser_use/constants.py +68 -0
  28. mcp_browser_use/context.py +150 -0
  29. mcp_browser_use/context_pack.py +85 -0
  30. mcp_browser_use/decorators/__init__.py +13 -0
  31. mcp_browser_use/decorators/ensure.py +84 -0
  32. mcp_browser_use/decorators/envelope.py +83 -0
  33. mcp_browser_use/decorators/locking.py +172 -0
  34. mcp_browser_use/helpers.py +173 -0
  35. mcp_browser_use/helpers_context.py +261 -0
  36. mcp_browser_use/locking/__init__.py +1 -0
  37. mcp_browser_use/locking/action_lock.py +190 -0
  38. mcp_browser_use/locking/file_mutex.py +139 -0
  39. mcp_browser_use/locking/window_registry.py +178 -0
  40. mcp_browser_use/tools/__init__.py +59 -0
  41. mcp_browser_use/tools/browser_management.py +260 -0
  42. mcp_browser_use/tools/debugging.py +195 -0
  43. mcp_browser_use/tools/extraction.py +58 -0
  44. mcp_browser_use/tools/interaction.py +323 -0
  45. mcp_browser_use/tools/navigation.py +84 -0
  46. mcp_browser_use/tools/screenshots.py +116 -0
  47. mcp_browser_use/utils/__init__.py +1 -0
  48. mcp_browser_use/utils/diagnostics.py +85 -0
  49. mcp_browser_use/utils/html_utils.py +118 -0
  50. mcp_browser_use/utils/retry.py +57 -0
@@ -0,0 +1,864 @@
+ """Element extraction functionality for fine-grained data collection."""
+
+ import json
+ import re
+ import time
+ from typing import Optional, List, Dict, Any
+ from selenium.common.exceptions import TimeoutException, NoSuchElementException
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from ..context import get_context
+ from .elements import find_element, get_by_selector
+ from .screenshots import _make_page_snapshot
+
+
+ async def extract_elements(
+     selectors: Optional[List[Dict[str, str]]] = None,
+     container_selector: Optional[str] = None,
+     fields: Optional[List[Dict[str, str]]] = None,
+     selector_type: str = "css",
+     wait_for_visible: bool = False,
+     timeout: int = 10,
+     max_items: Optional[int] = None,
+     offset: Optional[int] = None,
+     discover_containers: bool = False,
+     wait_for_content_loaded: Optional[Dict[str, Any]] = None,
+ ) -> str:
+     """
+     Extract content from specific elements on the current page.
+
+     Supports two extraction modes:
+
+     MODE 1: Simple extraction (using 'selectors' parameter)
+     - Extracts individual elements with CSS/XPath
+     - Returns a list of extracted elements
+
+     MODE 2: Structured extraction (using 'container_selector' + 'fields' parameters)
+     - Finds multiple containers (e.g., product items)
+     - Extracts named fields from each container
+     - Supports attribute extraction and regex cleaning
+     - Returns an array of structured objects
+
+     Args:
+         selectors: [MODE 1] Optional list of selector specifications. Each specification is a dict:
+             {
+                 "selector": str,              # The CSS selector or XPath expression
+                 "type": str,                  # "css" or "xpath" (default: "css")
+                 "format": str,                # "html" or "text" (default: "html")
+                 "name": str,                  # Optional: field name for the result
+                 "iframe_selector": str,       # Optional: selector for parent iframe
+                 "iframe_type": str,           # Optional: "css" or "xpath" for iframe
+                 "shadow_root_selector": str,  # Optional: selector for shadow root host
+                 "shadow_root_type": str,      # Optional: "css" or "xpath" for shadow root
+             }
+
+         container_selector: [MODE 2] CSS or XPath selector for container elements
+         fields: [MODE 2] List of field extractors, each with:
+             {
+                 "field_name": str,     # Output field name (e.g., "price_net")
+                 "selector": str,       # CSS or XPath relative to container
+                 "selector_type": str,  # "css" or "xpath" (default: "css")
+                 "attribute": str,      # Optional: extract attribute instead of text (e.g., "href")
+                 "regex": str,          # Optional: regex pattern to extract/clean value
+                 "fallback": str        # Optional: fallback value if extraction fails
+             }
+         selector_type: [MODE 2] Default selector type for container ("css" or "xpath")
+         wait_for_visible: [MODE 2] Wait for containers to be visible
+         timeout: [MODE 2] Timeout in seconds (default: 10)
+         max_items: [MODE 2] Limit the number of containers to extract (None = all).
+             Useful for testing selectors and preventing token explosions.
+             Recommended: 10 for testing, 50-100 for production.
+         offset: [MODE 2] Skip the first N containers before extracting (default: None = no skip).
+             Useful for pagination. Example: offset=10, max_items=10 gets items 11-20.
+         discover_containers: [MODE 2] If True, returns container analysis instead of extraction.
+             Use this to explore page structure and find correct selectors.
+             Fast (~5s) and lightweight (~1K tokens).
+         wait_for_content_loaded: [MODE 2] Smart wait for lazy-loaded content (e.g., async prices).
+             Dict with:
+             - selector: CSS/XPath to check for loaded content
+             - min_percentage: % of containers that must have content (default 80)
+             - timeout: Max wait time in seconds (default 60)
+             - check_interval: Seconds between checks (default 5)
+             - check_attribute: Optional attribute to check (default: text)
+             - min_length: Min length to consider loaded (default 1)
+             Polls periodically until min_percentage of containers have the
+             specified content loaded. Essential for Vue.js/React/Angular sites
+             with asynchronous data loading.
+
+     Returns:
+         JSON string with structure:
+
+         MODE 1 (simple):
+         {
+             "ok": bool,
+             "mode": "simple",
+             "extracted_elements": [{selector, found, content, ...}, ...],
+             "snapshot": {...}
+         }
+
+         MODE 2 (structured):
+         {
+             "ok": bool,
+             "mode": "structured",
+             "items": [{field_name: value, ...}, ...],
+             "count": int,
+             "snapshot": {...}
+         }
+
+     Examples:
+         # MODE 1: Simple extraction
+         selectors = [
+             {"selector": "span.price", "type": "css", "format": "text", "name": "price"},
+             {"selector": "div.stock-info", "type": "css", "format": "html"}
+         ]
+
+         # MODE 2: Structured extraction (products on a listing page)
+         container_selector = "article.product-item"
+         fields = [
+             {"field_name": "product_name", "selector": "h3.title", "selector_type": "css"},
+             {"field_name": "mpn", "selector": "span[data-mpn]", "attribute": "data-mpn"},
+             {"field_name": "price_brutto", "selector": ".price", "regex": r"[0-9,.]+"},
+             {"field_name": "url", "selector": "a.product-link", "attribute": "href"}
+         ]
+     """
+     ctx = get_context()  # Ensures a browser session exists before extracting
+
+     # Determine extraction mode
+     if container_selector:
+         if discover_containers:
+             # DISCOVERY MODE: Analyze containers without extracting fields
+             discovery = await _discover_containers(
+                 container_selector=container_selector,
+                 selector_type=selector_type,
+                 timeout=min(timeout, 5)  # Cap at 5s for fast discovery
+             )
+             snapshot = _make_page_snapshot()
+             return json.dumps({
+                 "ok": True,
+                 "mode": "discovery",
+                 **discovery,
+                 "snapshot": snapshot
+             })
+         else:
+             # MODE 2: Structured extraction (with or without fields)
+             # When fields is None/empty, extract the full text of each container
+             items = await _extract_structured(
+                 container_selector=container_selector,
+                 fields=fields,  # Can be None - will extract full text
+                 selector_type=selector_type,
+                 wait_for_visible=wait_for_visible,
+                 timeout=timeout,
+                 max_items=max_items,
+                 offset=offset,
+                 wait_for_content_loaded=wait_for_content_loaded
+             )
+             snapshot = _make_page_snapshot()
+             return json.dumps({
+                 "ok": True,
+                 "mode": "structured",
+                 "items": items,
+                 "count": len(items),
+                 "snapshot": snapshot
+             })
+     else:
+         # MODE 1: Simple extraction (existing behavior)
+         extracted_results: List[Dict[str, Any]] = []
+         if selectors:
+             for spec in selectors:
+                 result = await _extract_single_element(spec)
+                 extracted_results.append(result)
+
+         snapshot = _make_page_snapshot()
+         return json.dumps({
+             "ok": True,
+             "mode": "simple",
+             "extracted_elements": extracted_results,
+             "snapshot": snapshot
+         })
+
+
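For orientation before the helpers below, here is a minimal sketch of calling extract_elements in both modes. Only the keyword arguments come from the signature above; the selectors and the asyncio driver code are illustrative assumptions, and a live browser session via get_context() is required.

    import asyncio
    import json

    async def demo():
        # MODE 1: one-off elements (hypothetical selectors)
        simple = json.loads(await extract_elements(selectors=[
            {"selector": "h1", "type": "css", "format": "text", "name": "title"},
        ]))

        # MODE 2: container + named fields (assumed listing-page markup)
        structured = json.loads(await extract_elements(
            container_selector="article.product-item",
            fields=[
                {"field_name": "name", "selector": "h3.title"},
                {"field_name": "url", "selector": "a", "attribute": "href"},
            ],
            max_items=10,  # keep the first test run small
        ))
        print(simple["mode"], structured["count"])

    # asyncio.run(demo())  # needs an initialized browser session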
+ async def _discover_containers(
+     container_selector: str,
+     selector_type: Optional[str] = None,
+     timeout: int = 5,
+ ) -> Dict[str, Any]:
+     """
+     Discover and analyze containers without extracting fields.
+
+     Returns metadata about matching containers for agent exploration.
+
+     Args:
+         container_selector: Selector for container elements
+         selector_type: Type of selector (auto-detects if None)
+         timeout: Timeout in seconds (default: 5s for fast discovery)
+
+     Returns:
+         Dictionary with discovered_containers info
+     """
+     ctx = get_context()
+
+     # Auto-detect selector type
+     if selector_type is None:
+         if container_selector.startswith('//') or container_selector.startswith('/'):
+             selector_type = "xpath"
+         else:
+             selector_type = "css"
+
+     try:
+         by_type = get_by_selector(selector_type)
+         if not by_type:
+             return {
+                 "discovered_containers": {
+                     "selector": container_selector,
+                     "selector_type": selector_type,
+                     "count": 0,
+                     "error": f"Invalid selector_type: {selector_type}"
+                 }
+             }
+
+         # Quick check with short timeout
+         try:
+             WebDriverWait(ctx.driver, timeout).until(
+                 EC.presence_of_element_located((by_type, container_selector))
+             )
+         except TimeoutException:
+             return {
+                 "discovered_containers": {
+                     "selector": container_selector,
+                     "selector_type": selector_type,
+                     "count": 0,
+                     "error": f"No containers found within {timeout}s timeout"
+                 }
+             }
+
+         # Find all containers
+         containers = ctx.driver.find_elements(by_type, container_selector)
+         count = len(containers)
+
+         if count == 0:
+             return {
+                 "discovered_containers": {
+                     "selector": container_selector,
+                     "selector_type": selector_type,
+                     "count": 0,
+                     "error": "Selector matched but no elements found"
+                 }
+             }
+
+         # Analyze first container as sample
+         first_container = containers[0]
+
+         # Get sample HTML (truncated)
+         sample_html = ctx.driver.execute_script(
+             "return arguments[0].outerHTML;",
+             first_container
+         )
+         sample_html = sample_html[:500] + ("..." if len(sample_html) > 500 else "")
+
+         # Get sample text
+         sample_text = ctx.driver.execute_script(
+             "return arguments[0].textContent;",
+             first_container
+         )
+         if sample_text:
+             sample_text = ' '.join(sample_text.split())  # Normalize whitespace
+             sample_text = sample_text[:300] + ("..." if len(sample_text) > 300 else "")
+         else:
+             sample_text = ""
+
+         # Get common attributes
+         attrs = first_container.get_property('attributes')
+         common_attributes = [attr['name'] for attr in attrs] if attrs else []
+
+         # Analyze common child elements (helpful for field extraction)
+         common_child_selectors = _analyze_child_elements(first_container, ctx)
+
+         return {
+             "discovered_containers": {
+                 "selector": container_selector,
+                 "selector_type": selector_type,
+                 "count": count,
+                 "sample_html": sample_html,
+                 "sample_text": sample_text,
+                 "common_attributes": common_attributes,
+                 "common_child_selectors": common_child_selectors,
+                 "recommendation": (
+                     f"Found {count} containers. "
+                     f"Use max_items=10 to test extraction on first 10 items."
+                 )
+             }
+         }
+
+     except Exception as e:
+         return {
+             "discovered_containers": {
+                 "selector": container_selector,
+                 "selector_type": selector_type,
+                 "count": 0,
+                 "error": f"Discovery failed: {str(e)}"
+             }
+         }
+
+
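A plausible agent workflow on top of discovery mode, sketched under the assumption of a product-listing page; the selector and the field name are made up:

    import json

    async def scout_then_scrape():
        # 1. Cheap structural probe: counts containers, samples HTML/text,
        #    and lists candidate child selectors - no field extraction yet.
        probe = json.loads(await extract_elements(
            container_selector="article.product-item",  # guessed selector
            discover_containers=True,
        ))
        info = probe["discovered_containers"]
        if info["count"] == 0:
            raise RuntimeError(info.get("error", "no containers found"))

        # 2. Use common_child_selectors to pick field selectors, then
        #    test-extract a small batch before committing to a full run.
        sample = json.loads(await extract_elements(
            container_selector=info["selector"],
            fields=[{"field_name": "name", "selector": "h3"}],
            max_items=10,
        ))
        return sample["items"]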
+ def _analyze_child_elements(container, ctx) -> List[Dict[str, Any]]:
+     """
+     Analyze common child elements within a container.
+
+     Returns a list of common child selector patterns found in the container.
+     Helps agents understand the structure for field extraction.
+     """
+     try:
+         # Common selector patterns to check
+         patterns = [
+             # Headings
+             "h1", "h2", "h3", "h4", "h5", "h6",
+             # Common elements
+             "a", "span", "div", "p", "img",
+             # Common class patterns
+             "[class*='price']", "[class*='title']", "[class*='name']",
+             "[class*='stock']", "[class*='availability']", "[class*='description']",
+             # Data attributes
+             "[data-price]", "[data-id]", "[data-product]", "[data-mpn]"
+         ]
+
+         child_info = []
+         for pattern in patterns:
+             try:
+                 elements = container.find_elements(By.CSS_SELECTOR, pattern)
+                 if elements:
+                     # Get sample text from the first matching element
+                     sample = None
+                     try:
+                         text = elements[0].text
+                         if text:
+                             sample = text[:50] if len(text) > 50 else text
+                     except Exception:
+                         pass
+
+                     child_info.append({
+                         "selector": pattern,
+                         "count_per_container": len(elements),
+                         "sample_text": sample
+                     })
+             except Exception:
+                 continue
+
+         # Limit to the first 10 matching patterns
+         return child_info[:10]
+
+     except Exception:
+         return []
+
+
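The resulting common_child_selectors entry in the discovery payload then looks roughly like this (illustrative values, not real output):

    # Illustrative shape of _analyze_child_elements() output for one container
    [
        {"selector": "h3", "count_per_container": 1, "sample_text": "ACME Widget 3000"},
        {"selector": "a", "count_per_container": 2, "sample_text": "Details"},
        {"selector": "[class*='price']", "count_per_container": 1, "sample_text": "19,99 EUR"},
    ]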
+ async def _extract_structured(
+     container_selector: str,
+     fields: Optional[List[Dict[str, str]]] = None,
+     selector_type: Optional[str] = None,
+     wait_for_visible: bool = False,
+     timeout: int = 10,
+     max_items: Optional[int] = None,
+     offset: Optional[int] = None,
+     wait_for_content_loaded: Optional[Dict[str, Any]] = None,
+ ) -> List[Dict[str, Any]]:
+     """
+     Extract structured data from multiple containers on the page.
+
+     Args:
+         container_selector: Selector for container elements (e.g., product items)
+         fields: List of field extractors with field_name, selector, etc.
+         selector_type: Type of container_selector ("css" or "xpath").
+             If None, auto-detects from selector syntax:
+             - Starts with // or / -> xpath
+             - Otherwise -> css
+         wait_for_visible: Wait for containers to be visible
+         timeout: Timeout in seconds for finding containers
+         max_items: Optional maximum number of containers to extract
+         offset: Optional number of containers to skip before extracting
+         wait_for_content_loaded: Optional config for smart waiting on lazy-loaded content.
+             Dict with keys:
+             - selector: CSS/XPath to check for loaded content
+             - min_percentage: % of containers that must have content (default 80)
+             - timeout: Max wait time in seconds (default 60)
+             - check_interval: Seconds between checks (default 5)
+             - check_attribute: Optional attribute to check (default: text)
+             - min_length: Min length to consider loaded (default 1)
+
+     Returns:
+         List of dictionaries, each representing one container's extracted data.
+         A trailing metadata entry may be appended with limit/offset info and,
+         if smart waiting was used, a _wait_metadata field.
+     """
+     ctx = get_context()
+     items = []
+
+     try:
+         # Auto-detect selector type if not provided
+         if selector_type is None:
+             if container_selector.startswith('//') or container_selector.startswith('/'):
+                 selector_type = "xpath"
+             else:
+                 selector_type = "css"
+
+         # Resolve the Selenium By strategy for the container selector
+         by_type = get_by_selector(selector_type)
+         if not by_type:
+             return [{
+                 "_error": f"Invalid selector_type: {selector_type}"
+             }]
+
+         # Wait for containers to appear
+         if wait_for_visible:
+             WebDriverWait(ctx.driver, timeout).until(
+                 EC.visibility_of_element_located((by_type, container_selector))
+             )
+         else:
+             WebDriverWait(ctx.driver, timeout).until(
+                 EC.presence_of_element_located((by_type, container_selector))
+             )
+
+         # Find all containers
+         all_containers = ctx.driver.find_elements(by_type, container_selector)
+         total_count = len(all_containers)
+
+         # Apply offset if specified
+         offset_val = offset if offset is not None else 0
+         containers_after_offset = all_containers
+         if offset_val > 0:
+             containers_after_offset = all_containers[offset_val:]
+
+         # Apply max_items limit if specified
+         containers = containers_after_offset
+         if max_items is not None and max_items > 0:
+             containers = containers_after_offset[:max_items]
+         # Results are "limited" whenever offset/max_items left some containers out
+         limited = len(containers) < total_count
+
+         # Wait for lazy-loaded content if configured
+         wait_metadata = None
+         if wait_for_content_loaded:
+             wait_metadata = _wait_for_lazy_content(
+                 containers=containers,
+                 wait_config=wait_for_content_loaded,
+                 ctx=ctx
+             )
+
+         # Extract fields from each container
+         for idx, container in enumerate(containers):
+             item = {}
+             # _container_index reflects the actual position in the original full list
+             item["_container_index"] = offset_val + idx
+
+             if fields:
+                 # Extract the specified fields
+                 for field_idx, field_spec in enumerate(fields):
+                     field_name = field_spec.get("field_name", f"field_{field_idx}")
+                     value = _extract_field_from_container(container, field_spec, ctx)
+                     item[field_name] = value
+             else:
+                 # No fields specified - extract full text content of the container
+                 try:
+                     full_text = ctx.driver.execute_script("return arguments[0].textContent;", container)
+                     if full_text:
+                         # Clean and normalize whitespace
+                         full_text = full_text.replace('\x00', '').encode('utf-8', errors='ignore').decode('utf-8')
+                         full_text = ' '.join(full_text.split())
+                     item["full_text"] = full_text or ""
+                 except Exception as e:
+                     item["full_text"] = f"Error extracting text: {str(e)}"
+
+             items.append(item)
+
+         # Build a trailing metadata entry
+         metadata_entry = {}
+
+         # Add limit metadata if results were limited or an offset was used
+         if limited or offset_val > 0:
+             note_parts = []
+             if offset_val > 0:
+                 note_parts.append(f"Offset: {offset_val}")
+             if max_items is not None and max_items > 0:
+                 note_parts.append(f"Max items: {max_items}")
+             note_parts.append(f"Extracted: {len(containers)}, Total available: {total_count}")
+
+             metadata_entry["_note"] = ". ".join(note_parts)
+             metadata_entry["_limited"] = limited
+             metadata_entry["_offset"] = offset_val
+             metadata_entry["_extracted_count"] = len(containers)
+             metadata_entry["_total_count"] = total_count
+
+         # Add wait metadata if smart wait was used
+         if wait_metadata:
+             metadata_entry["_wait_metadata"] = wait_metadata
+
+         # Append the metadata entry if it has any content
+         if metadata_entry:
+             items.append(metadata_entry)
+
+     except TimeoutException:
+         items.append({
+             "_error": f"Container not found within {timeout}s timeout",
+             "_container_selector": container_selector
+         })
+     except Exception as e:
+         items.append({
+             "_error": f"Error during structured extraction: {str(e)}",
+             "_container_selector": container_selector
+         })
+
+     return items
+
+
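Because offset and max_items compose, this supports a simple windowed crawl through the public tool; a sketch, assuming the real items are exactly the entries carrying _container_index:

    import json

    async def crawl_all(container_selector: str, fields: list, batch: int = 50):
        """Pull containers in windows of `batch` until the page runs out."""
        rows, offset = [], 0
        while True:
            result = json.loads(await extract_elements(
                container_selector=container_selector,
                fields=fields,
                offset=offset,
                max_items=batch,
            ))
            # Real items carry _container_index; the trailing metadata entry does not
            items = [i for i in result["items"] if "_container_index" in i]
            rows.extend(items)
            if len(items) < batch:
                break
            offset += batch
        return rows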
+ def _detect_loading_indicators(container) -> bool:
+     """
+     Detect if a container is in a loading state.
+
+     Checks for common loading indicators:
+     - Classes: skeleton, loading, placeholder, spinner, shimmer, pending
+     - Aria attributes: aria-busy="true"
+     - Loading spinners among child elements
+
+     Returns:
+         True if loading indicators are detected, False otherwise
+     """
+     try:
+         # Check class names for loading indicators
+         class_attr = container.get_attribute("class") or ""
+         loading_keywords = ["skeleton", "loading", "placeholder", "spinner", "shimmer", "pending"]
+         if any(keyword in class_attr.lower() for keyword in loading_keywords):
+             return True
+
+         # Check the aria-busy attribute
+         if container.get_attribute("aria-busy") == "true":
+             return True
+
+         # Check for loading spinners as child elements
+         try:
+             spinners = container.find_elements(By.CSS_SELECTOR, ".spinner, .loading, [class*='spinner'], [class*='loading']")
+             if spinners:
+                 return True
+         except Exception:
+             pass
+
+         return False
+     except Exception:
+         return False
+
+
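To make the heuristic concrete, a made-up skeleton card like the following would be flagged as still loading:

    # Hypothetical container markup that _detect_loading_indicators() flags:
    #
    #   <article class="product-item skeleton" aria-busy="true">
    #       <div class="spinner"></div>
    #       <h3 class="title placeholder"></h3>
    #   </article>
    #
    # "skeleton" in the class list, aria-busy="true", and the .spinner child
    # would each independently trigger a True result.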
+ def _wait_for_lazy_content(
+     containers: List,
+     wait_config: Dict[str, Any],
+     ctx
+ ) -> Dict[str, Any]:
+     """
+     Wait for lazy-loaded content to appear in containers.
+
+     Polls containers periodically until a minimum percentage have the specified
+     content loaded, or until the timeout is reached. This is essential for modern
+     JavaScript-heavy sites that load prices, availability, and other data
+     asynchronously after the initial page render.
+
+     Args:
+         containers: List of container WebElements to check
+         wait_config: Configuration dict with:
+             - selector: CSS/XPath selector to check for content (e.g., ".price")
+             - selector_type: Optional "css" or "xpath" (auto-detects if not provided)
+             - min_percentage: Minimum % of containers that must have content (default 80)
+             - timeout: Maximum wait time in seconds (default 60)
+             - check_interval: Seconds between checks (default 5)
+             - check_attribute: Optional attribute to check (default checks text content)
+             - min_length: Minimum length of text/attribute to consider "loaded" (default 1)
+         ctx: Browser context
+
+     Returns:
+         Dict with loading metadata:
+         {
+             "waited": True,
+             "duration_seconds": 23.4,
+             "loaded_count": 21,
+             "total_count": 25,
+             "percentage": 84.0,
+             "timeout_reached": False,
+             "loading_indicators_found": 4,
+             "checks_performed": 5
+         }
+     """
+     # Parse config with defaults
+     selector = wait_config.get("selector")
+     if not selector:
+         return {"waited": False, "error": "No selector provided in wait_for_content_loaded"}
+
+     selector_type = wait_config.get("selector_type")
+     min_percentage = wait_config.get("min_percentage", 80)
+     timeout_seconds = wait_config.get("timeout", 60)
+     check_interval = wait_config.get("check_interval", 5)
+     check_attribute = wait_config.get("check_attribute")  # None = check text
+     min_length = wait_config.get("min_length", 1)
+
+     # Auto-detect selector type if not provided
+     if selector_type is None:
+         if selector.startswith('//') or selector.startswith('/'):
+             selector_type = "xpath"
+         else:
+             selector_type = "css"
+
+     by_type = get_by_selector(selector_type)
+     if not by_type:
+         return {"waited": False, "error": f"Invalid selector_type: {selector_type}"}
+
+     total_count = len(containers)
+     if total_count == 0:
+         return {"waited": False, "error": "No containers to check"}
+
+     start_time = time.time()
+     checks_performed = 0
+
+     while True:
+         checks_performed += 1
+         loaded_count = 0
+         loading_indicators = 0
+
+         # Check each container
+         for container in containers:
+             try:
+                 # Count containers still showing loading indicators
+                 if _detect_loading_indicators(container):
+                     loading_indicators += 1
+
+                 # Try to find the content element
+                 element = container.find_element(by_type, selector)
+
+                 # Extract the value to test
+                 if check_attribute:
+                     value = element.get_attribute(check_attribute)
+                 else:
+                     value = element.text or ctx.driver.execute_script("return arguments[0].textContent;", element)
+
+                 # Content counts as loaded when non-empty and at least min_length long
+                 if value and len(str(value).strip()) >= min_length:
+                     loaded_count += 1
+
+             except Exception:
+                 # Element not found or error extracting - not loaded yet
+                 continue
+
+         # Calculate the percentage loaded
+         percentage = (loaded_count / total_count) * 100
+
+         # Check whether the threshold has been met
+         if percentage >= min_percentage:
+             duration = time.time() - start_time
+             return {
+                 "waited": True,
+                 "duration_seconds": round(duration, 2),
+                 "loaded_count": loaded_count,
+                 "total_count": total_count,
+                 "percentage": round(percentage, 1),
+                 "timeout_reached": False,
+                 "loading_indicators_found": loading_indicators,
+                 "checks_performed": checks_performed
+             }
+
+         # Check the timeout
+         elapsed = time.time() - start_time
+         if elapsed >= timeout_seconds:
+             return {
+                 "waited": True,
+                 "duration_seconds": round(elapsed, 2),
+                 "loaded_count": loaded_count,
+                 "total_count": total_count,
+                 "percentage": round(percentage, 1),
+                 "timeout_reached": True,
+                 "loading_indicators_found": loading_indicators,
+                 "checks_performed": checks_performed
+             }
+
+         # Sleep before the next check
+         time.sleep(check_interval)
+
+
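A sketch of a wait_for_content_loaded config for a listing whose prices render asynchronously; the .price selector and the container markup are assumptions about the target page:

    wait_config = {
        "selector": ".price",   # element that appears once the async data lands
        "min_percentage": 80,   # proceed when 80% of containers show a price
        "timeout": 60,          # give up after 60s and report timeout_reached=True
        "check_interval": 5,    # re-poll every 5 seconds
        "min_length": 1,        # any non-empty text counts as loaded
    }

    result_json = await extract_elements(
        container_selector="article.product-item",  # assumed markup
        fields=[{"field_name": "price", "selector": ".price"}],
        wait_for_content_loaded=wait_config,
    )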
+ def _extract_field_from_container(
+     container,
+     field_spec: Dict[str, str],
+     ctx
+ ) -> Any:
+     """
+     Extract a single field value from a container element.
+
+     Args:
+         container: WebElement representing the container
+         field_spec: Field extraction specification
+         ctx: Browser context
+
+     Returns:
+         Extracted and cleaned value, or fallback/None if not found
+     """
+     selector = field_spec.get("selector", "")
+     field_selector_type = field_spec.get("selector_type", "css").lower()
+     attribute = field_spec.get("attribute")
+     regex_pattern = field_spec.get("regex")
+     fallback = field_spec.get("fallback")
+
+     try:
+         # Resolve the By strategy for the field selector
+         by_type = get_by_selector(field_selector_type)
+         if not by_type:
+             return fallback if fallback is not None else f"Invalid selector_type: {field_selector_type}"
+
+         # Find the element relative to the container
+         element = container.find_element(by_type, selector)
+
+         # Extract the value
+         if attribute:
+             # Extract from the attribute
+             value = element.get_attribute(attribute)
+         else:
+             # Extract text content
+             value = ctx.driver.execute_script("return arguments[0].textContent;", element)
+             if value:
+                 # Clean and normalize whitespace
+                 value = value.replace('\x00', '').encode('utf-8', errors='ignore').decode('utf-8')
+                 value = ' '.join(value.split())
+
+         # Apply the regex if specified
+         if value and regex_pattern:
+             try:
+                 match = re.search(regex_pattern, value)
+                 if match:
+                     # Return the first capturing group if present, otherwise the whole match
+                     value = match.group(1) if match.lastindex else match.group(0)
+                 else:
+                     # Regex didn't match; use the fallback if available
+                     value = fallback if fallback is not None else value
+             except re.error:
+                 # Invalid regex; keep the original value
+                 pass
+
+         return value if value is not None else fallback
+
+     except NoSuchElementException:
+         return fallback
+     except Exception as e:
+         return fallback if fallback is not None else f"Error: {str(e)}"
+
+
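The regex rule (first capturing group if present, otherwise the whole match) deserves a worked example; the price string is invented:

    import re

    raw = "UVP: 1.299,00 EUR inkl. MwSt."

    # Whole-match style: no groups, so group(0) is returned
    m = re.search(r"[0-9][0-9.,]*", raw)
    print(m.group(1) if m.lastindex else m.group(0))  # -> "1.299,00"

    # Capturing-group style: match.lastindex is set, so group(1) wins
    m = re.search(r"UVP:\s*([0-9][0-9.,]*)", raw)
    print(m.group(1) if m.lastindex else m.group(0))  # -> "1.299,00"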
+ async def _extract_single_element(spec: Dict[str, str]) -> Dict[str, Any]:
+     """
+     Extract content from a single element specification.
+
+     Args:
+         spec: Selector specification dictionary
+
+     Returns:
+         Dictionary with the extraction result
+     """
+     ctx = get_context()
+
+     # Parse the specification with defaults
+     selector = spec.get("selector")
+     selector_type = spec.get("type", "css").lower()
+     output_format = spec.get("format", "html").lower()
+     field_name = spec.get("name")  # Optional field name
+     iframe_selector = spec.get("iframe_selector")
+     iframe_type = spec.get("iframe_type", "css")
+     shadow_root_selector = spec.get("shadow_root_selector")
+     shadow_root_type = spec.get("shadow_root_type", "css")
+     timeout = int(spec.get("timeout", 10))
+
+     # Validate inputs
+     if not selector:
+         result = {
+             "selector": selector,
+             "selector_type": selector_type,
+             "found": False,
+             "content": None,
+             "format": output_format,
+             "error": "No selector provided"
+         }
+         if field_name:
+             result["name"] = field_name
+         return result
+
+     if selector_type not in ("css", "xpath"):
+         result = {
+             "selector": selector,
+             "selector_type": selector_type,
+             "found": False,
+             "content": None,
+             "format": output_format,
+             "error": f"Invalid selector_type: {selector_type}. Must be 'css' or 'xpath'"
+         }
+         if field_name:
+             result["name"] = field_name
+         return result
+
+     if output_format not in ("html", "text"):
+         output_format = "html"  # Default fallback
+
+     result = {
+         "selector": selector,
+         "selector_type": selector_type,
+         "found": False,
+         "content": None,
+         "format": output_format,
+         "error": None
+     }
+     if field_name:
+         result["name"] = field_name
+
+     try:
+         # Find the element
+         element = find_element(
+             driver=ctx.driver,
+             selector=selector,
+             selector_type=selector_type,
+             timeout=timeout,
+             visible_only=False,
+             iframe_selector=iframe_selector,
+             iframe_selector_type=iframe_type,
+             shadow_root_selector=shadow_root_selector,
+             shadow_root_selector_type=shadow_root_type,
+             stay_in_context=True,  # Stay in the iframe context for extraction
+         )
+
+         result["found"] = True
+
+         # Extract content based on the requested format
+         if output_format == "html":
+             # Get outerHTML
+             html = ctx.driver.execute_script("return arguments[0].outerHTML;", element)
+             # Clean invalid characters
+             html = html.replace('\x00', '').encode('utf-8', errors='ignore').decode('utf-8')
+             result["content"] = html
+         else:  # text
+             # Get textContent (preserves whitespace better than the .text property)
+             text = ctx.driver.execute_script("return arguments[0].textContent;", element)
+             # Clean and normalize
+             if text:
+                 text = text.replace('\x00', '').encode('utf-8', errors='ignore').decode('utf-8')
+                 # Basic whitespace normalization
+                 text = ' '.join(text.split())
+             result["content"] = text or ""
+
+     except TimeoutException:
+         result["error"] = f"Element not found within {timeout}s timeout"
+     except NoSuchElementException:
+         result["error"] = "Element not found"
+     except Exception as e:
+         result["error"] = f"Error extracting element: {str(e)}"
+     finally:
+         # Always switch back to the default content
+         try:
+             if ctx.is_driver_initialized():
+                 ctx.driver.switch_to.default_content()
+         except Exception:
+             pass
+
+     return result
+
+
+ __all__ = ['extract_elements']
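Finally, a hedged sketch of a MODE 1 spec that reaches into an iframe, exercising the iframe_selector plumbing above; the selectors describe an imaginary embedded checkout:

    import asyncio
    import json

    async def read_iframe_price():
        payload = json.loads(await extract_elements(selectors=[
            {
                "name": "iframe_price",
                "selector": "span.total",              # element inside the iframe (assumed)
                "type": "css",
                "format": "text",
                "iframe_selector": "iframe#checkout",  # parent iframe (assumed)
                "iframe_type": "css",
            },
        ]))
        element = payload["extracted_elements"][0]
        return element["content"] if element["found"] else element["error"]

    # asyncio.run(read_iframe_price())  # needs a live browser context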