iflow-mcp_janspoerer-mcp_browser_use 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/METADATA +26 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/RECORD +50 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/WHEEL +5 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/entry_points.txt +2 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/licenses/LICENSE +201 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/top_level.txt +1 -0
- mcp_browser_use/__init__.py +2 -0
- mcp_browser_use/__main__.py +1347 -0
- mcp_browser_use/actions/__init__.py +1 -0
- mcp_browser_use/actions/elements.py +173 -0
- mcp_browser_use/actions/extraction.py +864 -0
- mcp_browser_use/actions/keyboard.py +43 -0
- mcp_browser_use/actions/navigation.py +73 -0
- mcp_browser_use/actions/screenshots.py +85 -0
- mcp_browser_use/browser/__init__.py +1 -0
- mcp_browser_use/browser/chrome.py +150 -0
- mcp_browser_use/browser/chrome_executable.py +204 -0
- mcp_browser_use/browser/chrome_launcher.py +330 -0
- mcp_browser_use/browser/chrome_process.py +104 -0
- mcp_browser_use/browser/devtools.py +230 -0
- mcp_browser_use/browser/driver.py +322 -0
- mcp_browser_use/browser/process.py +133 -0
- mcp_browser_use/cleaners.py +530 -0
- mcp_browser_use/config/__init__.py +30 -0
- mcp_browser_use/config/environment.py +155 -0
- mcp_browser_use/config/paths.py +97 -0
- mcp_browser_use/constants.py +68 -0
- mcp_browser_use/context.py +150 -0
- mcp_browser_use/context_pack.py +85 -0
- mcp_browser_use/decorators/__init__.py +13 -0
- mcp_browser_use/decorators/ensure.py +84 -0
- mcp_browser_use/decorators/envelope.py +83 -0
- mcp_browser_use/decorators/locking.py +172 -0
- mcp_browser_use/helpers.py +173 -0
- mcp_browser_use/helpers_context.py +261 -0
- mcp_browser_use/locking/__init__.py +1 -0
- mcp_browser_use/locking/action_lock.py +190 -0
- mcp_browser_use/locking/file_mutex.py +139 -0
- mcp_browser_use/locking/window_registry.py +178 -0
- mcp_browser_use/tools/__init__.py +59 -0
- mcp_browser_use/tools/browser_management.py +260 -0
- mcp_browser_use/tools/debugging.py +195 -0
- mcp_browser_use/tools/extraction.py +58 -0
- mcp_browser_use/tools/interaction.py +323 -0
- mcp_browser_use/tools/navigation.py +84 -0
- mcp_browser_use/tools/screenshots.py +116 -0
- mcp_browser_use/utils/__init__.py +1 -0
- mcp_browser_use/utils/diagnostics.py +85 -0
- mcp_browser_use/utils/html_utils.py +118 -0
- mcp_browser_use/utils/retry.py +57 -0
mcp_browser_use/actions/extraction.py
@@ -0,0 +1,864 @@
"""Element extraction functionality for fine-grained data collection."""

import json
import re
from typing import Optional, List, Dict, Any
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from ..context import get_context
from .elements import find_element, get_by_selector
from .screenshots import _make_page_snapshot


async def extract_elements(
    selectors: Optional[List[Dict[str, str]]] = None,
    container_selector: Optional[str] = None,
    fields: Optional[List[Dict[str, str]]] = None,
    selector_type: str = "css",
    wait_for_visible: bool = False,
    timeout: int = 10,
    max_items: Optional[int] = None,
    offset: Optional[int] = None,
    discover_containers: bool = False,
    wait_for_content_loaded: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Extract content from specific elements on the current page.

    Supports two extraction modes:

    MODE 1: Simple extraction (using 'selectors' parameter)
    - Extract individual elements with CSS/XPath
    - Returns list of extracted elements

    MODE 2: Structured extraction (using 'container_selector' + 'fields' parameters)
    - Find multiple containers (e.g., product items)
    - Extract named fields from each container
    - Support attribute extraction and regex cleaning
    - Returns array of structured objects

    Args:
        selectors: [MODE 1] Optional list of selector specifications. Each specification is a dict:
            {
                "selector": str,              # The CSS selector or XPath expression
                "type": str,                  # "css" or "xpath" (default: "css")
                "format": str,                # "html" or "text" (default: "html")
                "name": str,                  # Optional: field name for the result
                "iframe_selector": str,      # Optional: selector for parent iframe
                "iframe_type": str,          # Optional: "css" or "xpath" for iframe
                "shadow_root_selector": str, # Optional: selector for shadow root host
                "shadow_root_type": str,     # Optional: "css" or "xpath" for shadow root
            }

        container_selector: [MODE 2] CSS or XPath selector for container elements
        fields: [MODE 2] List of field extractors, each with:
            {
                "field_name": str,    # Output field name (e.g., "price_net")
                "selector": str,      # CSS or XPath relative to container
                "selector_type": str, # "css" or "xpath" (default: "css")
                "attribute": str,     # Optional: extract attribute instead of text (e.g., "href")
                "regex": str,         # Optional: regex pattern to extract/clean value
                "fallback": str       # Optional: fallback value if extraction fails
            }
        selector_type: [MODE 2] Default selector type for container ("css" or "xpath")
        wait_for_visible: [MODE 2] Wait for containers to be visible
        timeout: [MODE 2] Timeout in seconds (default: 10s)
        max_items: [MODE 2] Limit number of containers to extract (None = all).
            Useful for testing selectors and preventing token explosions.
            Recommended: 10 for testing, 50-100 for production.
        offset: [MODE 2] Skip first N containers before extracting (default: None = no skip).
            Useful for pagination. Example: offset=10, max_items=10 gets items 11-20.
        discover_containers: [MODE 2] If True, returns container analysis instead of extraction.
            Use this to explore page structure and find correct selectors.
            Fast (~5s) and lightweight (~1K tokens).
        wait_for_content_loaded: [MODE 2] Smart wait for lazy-loaded content (e.g., async prices).
            Dict with:
            - selector: CSS/XPath to check for loaded content
            - min_percentage: % of containers that must have content (default 80)
            - timeout: Max wait time in seconds (default 60)
            - check_interval: Seconds between checks (default 5)
            - check_attribute: Optional attribute to check (default: text)
            - min_length: Min length to consider loaded (default 1)
            Polls periodically until min_percentage of containers have the
            specified content loaded. Essential for Vue.js/React/Angular sites
            with asynchronous data loading.

    Returns:
        JSON string with structure:

        MODE 1 (simple):
        {
            "ok": bool,
            "mode": "simple",
            "extracted_elements": [{selector, found, content, ...}, ...],
            "snapshot": {...}
        }

        MODE 2 (structured):
        {
            "ok": bool,
            "mode": "structured",
            "items": [{field_name: value, ...}, ...],
            "count": int,
            "snapshot": {...}
        }

    Examples:
        # MODE 1: Simple extraction
        selectors = [
            {"selector": "span.price", "type": "css", "format": "text", "name": "price"},
            {"selector": "div.stock-info", "type": "css", "format": "html"}
        ]

        # MODE 2: Structured extraction (products on a listing page)
        container_selector = "article.product-item"
        fields = [
            {"field_name": "product_name", "selector": "h3.title", "selector_type": "css"},
            {"field_name": "mpn", "selector": "span[data-mpn]", "attribute": "data-mpn"},
            {"field_name": "price_brutto", "selector": ".price", "regex": r"[0-9,.]+"},
            {"field_name": "url", "selector": "a.product-link", "attribute": "href"}
        ]
    """
    ctx = get_context()

    # Determine extraction mode
    if container_selector:
        if discover_containers:
            # DISCOVERY MODE: Analyze containers without extracting fields
            discovery = await _discover_containers(
                container_selector=container_selector,
                selector_type=selector_type,
                timeout=min(timeout, 5)  # Cap at 5s for fast discovery
            )
            snapshot = _make_page_snapshot()
            return json.dumps({
                "ok": True,
                "mode": "discovery",
                **discovery,
                "snapshot": snapshot
            })
        else:
            # MODE 2: Structured extraction (with or without fields)
            # When fields is None/empty, extract full text/HTML of each container
            items = await _extract_structured(
                container_selector=container_selector,
                fields=fields,  # Can be None - will extract full text/HTML
                selector_type=selector_type,
                wait_for_visible=wait_for_visible,
                timeout=timeout,
                max_items=max_items,
                offset=offset,
                wait_for_content_loaded=wait_for_content_loaded
            )
            snapshot = _make_page_snapshot()
            return json.dumps({
                "ok": True,
                "mode": "structured",
                "items": items,
                "count": len(items),
                "snapshot": snapshot
            })
    else:
        # MODE 1: Simple extraction (existing behavior)
        extracted_results: List[Dict[str, Any]] = []
        if selectors:
            for spec in selectors:
                result = await _extract_single_element(spec)
                extracted_results.append(result)

        snapshot = _make_page_snapshot()
        return json.dumps({
            "ok": True,
            "mode": "simple",
            "extracted_elements": extracted_results,
            "snapshot": snapshot
        })

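The signature and docstring above define the tool's whole public surface. A minimal sketch of a MODE 2 call as the docstring describes it, assuming the package's context already holds a live driver with a listing page loaded; the selectors and field names are hypothetical:

import asyncio
import json

from mcp_browser_use.actions.extraction import extract_elements

async def scrape_listing():
    # MODE 2: one dict per "article.product-item" container, with named fields.
    raw = await extract_elements(
        container_selector="article.product-item",  # hypothetical selector
        fields=[
            {"field_name": "product_name", "selector": "h3.title"},
            {"field_name": "url", "selector": "a.product-link", "attribute": "href"},
            {"field_name": "price", "selector": ".price", "regex": r"[0-9][0-9.,]*"},
        ],
        max_items=10,  # small first run, as the docstring recommends
    )
    payload = json.loads(raw)  # the tool returns a JSON string, not a dict
    return payload["items"]

# asyncio.run(scrape_listing())  # needs an initialized browser session first
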
async def _discover_containers(
    container_selector: str,
    selector_type: Optional[str] = None,
    timeout: int = 5,
) -> Dict[str, Any]:
    """
    Discover and analyze containers without extracting fields.

    Returns metadata about matching containers for agent exploration.

    Args:
        container_selector: Selector for container elements
        selector_type: Type of selector (auto-detects if None)
        timeout: Timeout in seconds (default: 5s for fast discovery)

    Returns:
        Dictionary with discovered_containers info
    """
    ctx = get_context()

    # Auto-detect selector type
    if selector_type is None:
        if container_selector.startswith('//') or container_selector.startswith('/'):
            selector_type = "xpath"
        else:
            selector_type = "css"

    try:
        by_type = get_by_selector(selector_type)
        if not by_type:
            return {
                "discovered_containers": {
                    "selector": container_selector,
                    "selector_type": selector_type,
                    "count": 0,
                    "error": f"Invalid selector_type: {selector_type}"
                }
            }

        # Quick check with short timeout
        try:
            WebDriverWait(ctx.driver, timeout).until(
                EC.presence_of_element_located((by_type, container_selector))
            )
        except TimeoutException:
            return {
                "discovered_containers": {
                    "selector": container_selector,
                    "selector_type": selector_type,
                    "count": 0,
                    "error": f"No containers found within {timeout}s timeout"
                }
            }

        # Find all containers
        containers = ctx.driver.find_elements(by_type, container_selector)
        count = len(containers)

        if count == 0:
            return {
                "discovered_containers": {
                    "selector": container_selector,
                    "selector_type": selector_type,
                    "count": 0,
                    "error": "Selector matched but no elements found"
                }
            }

        # Analyze first container as sample
        first_container = containers[0]

        # Get sample HTML (truncated)
        sample_html = ctx.driver.execute_script(
            "return arguments[0].outerHTML;",
            first_container
        )
        sample_html = sample_html[:500] + ("..." if len(sample_html) > 500 else "")

        # Get sample text
        sample_text = ctx.driver.execute_script(
            "return arguments[0].textContent;",
            first_container
        )
        if sample_text:
            sample_text = ' '.join(sample_text.split())  # Normalize whitespace
            sample_text = sample_text[:300] + ("..." if len(sample_text) > 300 else "")
        else:
            sample_text = ""

        # Get common attributes
        attrs = first_container.get_property('attributes')
        common_attributes = [attr['name'] for attr in attrs] if attrs else []

        # Analyze common child elements (helpful for field extraction)
        common_child_selectors = _analyze_child_elements(first_container, ctx)

        return {
            "discovered_containers": {
                "selector": container_selector,
                "selector_type": selector_type,
                "count": count,
                "sample_html": sample_html,
                "sample_text": sample_text,
                "common_attributes": common_attributes,
                "common_child_selectors": common_child_selectors,
                "recommendation": (
                    f"Found {count} containers. "
                    f"Use max_items=10 to test extraction on first 10 items."
                )
            }
        }

    except Exception as e:
        return {
            "discovered_containers": {
                "selector": container_selector,
                "selector_type": selector_type,
                "count": 0,
                "error": f"Discovery failed: {str(e)}"
            }
        }

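The intended workflow is to probe a candidate selector in discovery mode before committing to field extraction. A sketch of that probe, with a hypothetical selector and an illustrative (not captured) result shape:

import json

from mcp_browser_use.actions.extraction import extract_elements

async def probe(selector: str) -> dict:
    raw = await extract_elements(
        container_selector=selector,
        discover_containers=True,  # analysis only, no field extraction
    )
    return json.loads(raw)["discovered_containers"]

# A successful probe resembles (all values illustrative):
# {"selector": "article.product-item", "selector_type": "css", "count": 24,
#  "sample_html": "<article class=\"product-item\">...",
#  "sample_text": "Widget 9.99 EUR in stock ...",
#  "common_attributes": ["class", "data-id"],
#  "common_child_selectors": [{"selector": "h3", "count_per_container": 1,
#                              "sample_text": "Widget"}],
#  "recommendation": "Found 24 containers. Use max_items=10 to test ..."}
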
def _analyze_child_elements(container, ctx) -> List[Dict[str, Any]]:
    """
    Analyze common child elements within a container.

    Returns list of common child selector patterns found in the container.
    Helps agents understand the structure for field extraction.
    """
    try:
        # Common selector patterns to check
        patterns = [
            # Headings
            "h1", "h2", "h3", "h4", "h5", "h6",
            # Common elements
            "a", "span", "div", "p", "img",
            # Common class patterns
            "[class*='price']", "[class*='title']", "[class*='name']",
            "[class*='stock']", "[class*='availability']", "[class*='description']",
            # Data attributes
            "[data-price]", "[data-id]", "[data-product]", "[data-mpn]"
        ]

        child_info = []
        for pattern in patterns:
            try:
                elements = container.find_elements(By.CSS_SELECTOR, pattern)
                if elements:
                    # Get sample text from first element
                    sample = None
                    try:
                        text = elements[0].text
                        if text:
                            sample = text[:50] if len(text) > 50 else text
                    except Exception:
                        pass

                    child_info.append({
                        "selector": pattern,
                        "count_per_container": len(elements),
                        "sample_text": sample
                    })
            except Exception:
                continue

        # Limit to top 10 most relevant
        return child_info[:10]

    except Exception:
        return []

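Each entry this analysis reports can be copied verbatim into a MODE 2 'fields' spec. A hypothetical helper sketching that translation, assuming a probe result shaped like the one above; the mechanical field names are placeholders an agent would rename:

def fields_from_discovery(child_selectors):
    """Turn discovery hints into draft MODE 2 field extractors."""
    fields = []
    for i, child in enumerate(child_selectors):
        if child.get("sample_text"):  # skip patterns that matched empty elements
            fields.append({
                "field_name": f"field_{i}",     # placeholder name, rename by hand
                "selector": child["selector"],  # e.g. "[class*='price']"
            })
    return fields
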
async def _extract_structured(
    container_selector: str,
    fields: Optional[List[Dict[str, str]]] = None,
    selector_type: Optional[str] = None,
    wait_for_visible: bool = False,
    timeout: int = 10,
    max_items: Optional[int] = None,
    offset: Optional[int] = None,
    wait_for_content_loaded: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
    """
    Extract structured data from multiple containers on the page.

    Args:
        container_selector: Selector for container elements (e.g., product items)
        fields: List of field extractors with field_name, selector, etc.
        selector_type: Type of container_selector ("css" or "xpath").
            If None, auto-detects from selector syntax:
            - Starts with // or / -> xpath
            - Otherwise -> css
        wait_for_visible: Wait for containers to be visible
        timeout: Timeout in seconds for finding containers
        max_items: Optional maximum number of containers to extract
        offset: Optional number of containers to skip before extracting
        wait_for_content_loaded: Optional config for smart waiting on lazy-loaded content.
            Dict with keys:
            - selector: CSS/XPath to check for loaded content
            - min_percentage: % of containers that must have content (default 80)
            - timeout: Max wait time in seconds (default 60)
            - check_interval: Seconds between checks (default 5)
            - check_attribute: Optional attribute to check (default: text)
            - min_length: Min length to consider loaded (default 1)

    Returns:
        List of dictionaries, each representing one container's extracted data.
        Last item may include _wait_metadata with smart wait results.
    """
    ctx = get_context()
    items = []

    try:
        # Auto-detect selector type if not provided
        if selector_type is None:
            if container_selector.startswith('//') or container_selector.startswith('/'):
                selector_type = "xpath"
            else:
                selector_type = "css"

        # Find all container elements
        by_type = get_by_selector(selector_type)
        if not by_type:
            return [{
                "_error": f"Invalid selector_type: {selector_type}"
            }]

        # Wait for containers to appear
        if wait_for_visible:
            WebDriverWait(ctx.driver, timeout).until(
                EC.visibility_of_element_located((by_type, container_selector))
            )
        else:
            WebDriverWait(ctx.driver, timeout).until(
                EC.presence_of_element_located((by_type, container_selector))
            )

        # Find all containers
        all_containers = ctx.driver.find_elements(by_type, container_selector)
        total_count = len(all_containers)

        # Apply offset if specified
        offset_val = offset if offset is not None else 0
        containers_after_offset = all_containers
        if offset_val > 0:
            containers_after_offset = all_containers[offset_val:]

        # Apply max_items limit if specified
        limited = False
        if max_items is not None and max_items > 0:
            containers = containers_after_offset[:max_items]
            limited = True
        else:
            containers = containers_after_offset
            limited = (offset_val > 0) or (len(all_containers) > len(containers))

        # Wait for lazy-loaded content if configured
        wait_metadata = None
        if wait_for_content_loaded:
            wait_metadata = _wait_for_lazy_content(
                containers=containers,
                wait_config=wait_for_content_loaded,
                ctx=ctx
            )

        # Extract fields from each container
        for idx, container in enumerate(containers):
            item = {}
            # _container_index reflects the actual position in the original full list
            item["_container_index"] = offset_val + idx

            if fields:
                # Extract specified fields
                for field_spec in fields:
                    field_name = field_spec.get("field_name", f"field_{idx}")
                    value = _extract_field_from_container(container, field_spec, ctx)
                    item[field_name] = value
            else:
                # No fields specified - extract full text content of container
                try:
                    full_text = ctx.driver.execute_script("return arguments[0].textContent;", container)
                    if full_text:
                        # Clean and normalize whitespace
                        full_text = full_text.replace('\x00', '').encode('utf-8', errors='ignore').decode('utf-8')
                        full_text = ' '.join(full_text.split())
                    item["full_text"] = full_text or ""
                except Exception as e:
                    item["full_text"] = f"Error extracting text: {str(e)}"

            items.append(item)

        # Add metadata notes
        metadata_entry = {}

        # Add limit metadata if results were limited or offset was used
        if limited or offset_val > 0:
            note_parts = []
            if offset_val > 0:
                note_parts.append(f"Offset: {offset_val}")
            if max_items is not None and max_items > 0:
                note_parts.append(f"Max items: {max_items}")
            note_parts.append(f"Extracted: {len(containers)}, Total available: {total_count}")

            metadata_entry["_note"] = ". ".join(note_parts)
            metadata_entry["_limited"] = limited
            metadata_entry["_offset"] = offset_val
            metadata_entry["_extracted_count"] = len(containers)
            metadata_entry["_total_count"] = total_count

        # Add wait metadata if smart wait was used
        if wait_metadata:
            metadata_entry["_wait_metadata"] = wait_metadata

        # Add metadata entry if it has any content
        if metadata_entry:
            items.append(metadata_entry)

    except TimeoutException:
        items.append({
            "_error": f"Container not found within {timeout}s timeout",
            "_container_selector": container_selector
        })
    except Exception as e:
        items.append({
            "_error": f"Error during structured extraction: {str(e)}",
            "_container_selector": container_selector
        })

    return items

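Because a trailing metadata dict is appended to the items list whenever a limit or offset applies, pagination code has to separate it from the data rows. A sketch of paging through a long listing under that convention, with the same hypothetical selector as above:

import json

from mcp_browser_use.actions.extraction import extract_elements

async def page_through(selector: str, page_size: int = 10):
    """Yield batches of rows until the listing is exhausted."""
    offset = 0
    while True:
        raw = await extract_elements(
            container_selector=selector,
            max_items=page_size,
            offset=offset,
        )
        items = json.loads(raw)["items"]
        # Real rows carry _container_index; the trailing metadata dict does not.
        rows = [item for item in items if "_container_index" in item]
        if not rows:
            break
        yield rows
        offset += page_size

# async for rows in page_through("article.product-item"): ...
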
def _detect_loading_indicators(container) -> bool:
    """
    Detect if a container is in a loading state.

    Checks for common loading indicators:
    - Classes: skeleton, loading, placeholder, spinner, shimmer
    - Aria attributes: aria-busy="true"
    - Empty or placeholder data

    Returns:
        True if loading indicators detected, False otherwise
    """
    try:
        # Check class names for loading indicators
        class_attr = container.get_attribute("class") or ""
        loading_keywords = ["skeleton", "loading", "placeholder", "spinner", "shimmer", "pending"]
        if any(keyword in class_attr.lower() for keyword in loading_keywords):
            return True

        # Check aria-busy attribute
        if container.get_attribute("aria-busy") == "true":
            return True

        # Check for loading spinners as child elements
        try:
            spinners = container.find_elements(By.CSS_SELECTOR, ".spinner, .loading, [class*='spinner'], [class*='loading']")
            if spinners:
                return True
        except Exception:
            pass

        return False
    except Exception:
        return False

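The class check above is a plain case-insensitive substring scan, so any keyword anywhere in the class attribute trips it. A driver-free illustration of just that first check:

loading_keywords = ["skeleton", "loading", "placeholder", "spinner", "shimmer", "pending"]

def looks_like_loading(class_attr: str) -> bool:
    # Mirrors the first check above: case-insensitive substring match.
    return any(keyword in class_attr.lower() for keyword in loading_keywords)

assert looks_like_loading("product-card Skeleton--active")
assert not looks_like_loading("product-card loaded")
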
def _wait_for_lazy_content(
    containers: List,
    wait_config: Dict[str, Any],
    ctx
) -> Dict[str, Any]:
    """
    Wait for lazy-loaded content to appear in containers.

    Polls containers periodically until a minimum percentage have the specified
    content loaded, or until timeout is reached. This is essential for modern
    JavaScript-heavy sites that load prices, availability, and other data
    asynchronously after the initial page render.

    Args:
        containers: List of container WebElements to check
        wait_config: Configuration dict with:
            - selector: CSS/XPath selector to check for content (e.g., ".price")
            - selector_type: Optional "css" or "xpath" (auto-detects if not provided)
            - min_percentage: Minimum % of containers that must have content (default 80)
            - timeout: Maximum wait time in seconds (default 60)
            - check_interval: Seconds between checks (default 5)
            - check_attribute: Optional attribute to check (default checks text content)
            - min_length: Minimum length of text/attribute to consider "loaded" (default 1)
        ctx: Browser context

    Returns:
        Dict with loading metadata:
        {
            "waited": True,
            "duration_seconds": 23.4,
            "loaded_count": 21,
            "total_count": 25,
            "percentage": 84.0,
            "timeout_reached": False,
            "loading_indicators_found": 4,
            "checks_performed": 5
        }
    """
    import time

    # Parse config with defaults
    selector = wait_config.get("selector")
    if not selector:
        return {"waited": False, "error": "No selector provided in wait_for_content_loaded"}

    selector_type = wait_config.get("selector_type")
    min_percentage = wait_config.get("min_percentage", 80)
    timeout_seconds = wait_config.get("timeout", 60)
    check_interval = wait_config.get("check_interval", 5)
    check_attribute = wait_config.get("check_attribute")  # None = check text
    min_length = wait_config.get("min_length", 1)

    # Auto-detect selector type if not provided
    if selector_type is None:
        if selector.startswith('//') or selector.startswith('/'):
            selector_type = "xpath"
        else:
            selector_type = "css"

    by_type = get_by_selector(selector_type)
    if not by_type:
        return {"waited": False, "error": f"Invalid selector_type: {selector_type}"}

    total_count = len(containers)
    if total_count == 0:
        return {"waited": False, "error": "No containers to check"}

    start_time = time.time()
    checks_performed = 0

    while True:
        checks_performed += 1
        loaded_count = 0
        loading_indicators = 0

        # Check each container
        for container in containers:
            try:
                # Check for loading indicators
                if _detect_loading_indicators(container):
                    loading_indicators += 1

                # Try to find the content element
                element = container.find_element(by_type, selector)

                # Extract value
                if check_attribute:
                    value = element.get_attribute(check_attribute)
                else:
                    value = element.text or ctx.driver.execute_script("return arguments[0].textContent;", element)

                # Check if content is loaded (non-empty and meets min_length)
                if value and len(str(value).strip()) >= min_length:
                    loaded_count += 1

            except Exception:
                # Element not found or error extracting - not loaded yet
                continue

        # Calculate percentage loaded
        percentage = (loaded_count / total_count) * 100

        # Check if we've met the threshold
        if percentage >= min_percentage:
            duration = time.time() - start_time
            return {
                "waited": True,
                "duration_seconds": round(duration, 2),
                "loaded_count": loaded_count,
                "total_count": total_count,
                "percentage": round(percentage, 1),
                "timeout_reached": False,
                "loading_indicators_found": loading_indicators,
                "checks_performed": checks_performed
            }

        # Check timeout
        elapsed = time.time() - start_time
        if elapsed >= timeout_seconds:
            return {
                "waited": True,
                "duration_seconds": round(elapsed, 2),
                "loaded_count": loaded_count,
                "total_count": total_count,
                "percentage": round(percentage, 1),
                "timeout_reached": True,
                "loading_indicators_found": loading_indicators,
                "checks_performed": checks_performed
            }

        # Sleep before next check
        time.sleep(check_interval)

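From the caller's side this all collapses into one config dict on the public tool. A sketch for a page whose prices render asynchronously, with hypothetical selector values and thresholds:

import json

from mcp_browser_use.actions.extraction import extract_elements

async def scrape_when_prices_ready():
    raw = await extract_elements(
        container_selector="article.product-item",  # hypothetical
        fields=[{"field_name": "price", "selector": ".price"}],
        wait_for_content_loaded={
            "selector": ".price",   # polled inside each container
            "min_percentage": 90,   # proceed once 90% of containers show a price
            "timeout": 30,          # extract anyway after 30s
            "check_interval": 2,
        },
    )
    items = json.loads(raw)["items"]
    # When a smart wait ran, the final metadata entry carries _wait_metadata.
    return items
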
def _extract_field_from_container(
    container,
    field_spec: Dict[str, str],
    ctx
) -> Any:
    """
    Extract a single field value from a container element.

    Args:
        container: WebElement representing the container
        field_spec: Field extraction specification
        ctx: Browser context

    Returns:
        Extracted and cleaned value, or fallback/None if not found
    """
    selector = field_spec.get("selector", "")
    field_selector_type = field_spec.get("selector_type", "css").lower()
    attribute = field_spec.get("attribute")
    regex_pattern = field_spec.get("regex")
    fallback = field_spec.get("fallback")

    try:
        # Find element within container
        by_type = get_by_selector(field_selector_type)
        if not by_type:
            return fallback or f"Invalid selector_type: {field_selector_type}"

        # Find element relative to container
        element = container.find_element(by_type, selector)

        # Extract value
        if attribute:
            # Extract from attribute
            value = element.get_attribute(attribute)
        else:
            # Extract text content
            value = ctx.driver.execute_script("return arguments[0].textContent;", element)
            if value:
                # Clean and normalize whitespace
                value = value.replace('\x00', '').encode('utf-8', errors='ignore').decode('utf-8')
                value = ' '.join(value.split())

        # Apply regex if specified
        if value and regex_pattern:
            try:
                match = re.search(regex_pattern, value)
                if match:
                    # Return first capturing group if exists, otherwise whole match
                    value = match.group(1) if match.lastindex else match.group(0)
                else:
                    # Regex didn't match, use fallback if available
                    value = fallback if fallback is not None else value
            except re.error:
                # Invalid regex, keep original value
                pass

        return value if value is not None else fallback

    except NoSuchElementException:
        return fallback
    except Exception as e:
        return fallback if fallback is not None else f"Error: {str(e)}"

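Note the group-selection rule in the regex step: the first capturing group wins if the pattern has one, otherwise the whole match is used. A browser-free illustration of exactly that rule:

import re

def apply_field_regex(value: str, pattern: str) -> str:
    # Same group selection as above: first capturing group if present,
    # otherwise the entire match; unmatched patterns leave the value alone.
    match = re.search(pattern, value)
    if not match:
        return value
    return match.group(1) if match.lastindex else match.group(0)

assert apply_field_regex("ab 1.299,00 EUR", r"[0-9][0-9.,]*") == "1.299,00"
assert apply_field_regex("SKU: AB-123", r"SKU:\s*(\S+)") == "AB-123"
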
async def _extract_single_element(spec: Dict[str, str]) -> Dict[str, Any]:
    """
    Extract content from a single element specification.

    Args:
        spec: Selector specification dictionary

    Returns:
        Dictionary with extraction result
    """
    ctx = get_context()

    # Parse specification with defaults
    selector = spec.get("selector")
    selector_type = spec.get("type", "css").lower()
    output_format = spec.get("format", "html").lower()
    field_name = spec.get("name")  # Optional field name
    iframe_selector = spec.get("iframe_selector")
    iframe_type = spec.get("iframe_type", "css")
    shadow_root_selector = spec.get("shadow_root_selector")
    shadow_root_type = spec.get("shadow_root_type", "css")
    timeout = int(spec.get("timeout", 10))

    # Validate inputs
    if not selector:
        result = {
            "selector": selector,
            "selector_type": selector_type,
            "found": False,
            "content": None,
            "format": output_format,
            "error": "No selector provided"
        }
        if field_name:
            result["name"] = field_name
        return result

    if selector_type not in ("css", "xpath"):
        result = {
            "selector": selector,
            "selector_type": selector_type,
            "found": False,
            "content": None,
            "format": output_format,
            "error": f"Invalid selector_type: {selector_type}. Must be 'css' or 'xpath'"
        }
        if field_name:
            result["name"] = field_name
        return result

    if output_format not in ("html", "text"):
        output_format = "html"  # Default fallback

    result = {
        "selector": selector,
        "selector_type": selector_type,
        "found": False,
        "content": None,
        "format": output_format,
        "error": None
    }
    if field_name:
        result["name"] = field_name

    try:
        # Find the element
        element = find_element(
            driver=ctx.driver,
            selector=selector,
            selector_type=selector_type,
            timeout=timeout,
            visible_only=False,
            iframe_selector=iframe_selector,
            iframe_selector_type=iframe_type,
            shadow_root_selector=shadow_root_selector,
            shadow_root_selector_type=shadow_root_type,
            stay_in_context=True,  # Stay in iframe context for extraction
        )

        result["found"] = True

        # Extract content based on format
        if output_format == "html":
            # Get outerHTML
            html = ctx.driver.execute_script("return arguments[0].outerHTML;", element)
            # Clean invalid characters
            html = html.replace('\x00', '').encode('utf-8', errors='ignore').decode('utf-8')
            result["content"] = html
        else:  # text
            # Get textContent (preserves whitespace better than .text property)
            text = ctx.driver.execute_script("return arguments[0].textContent;", element)
            # Clean and normalize
            if text:
                text = text.replace('\x00', '').encode('utf-8', errors='ignore').decode('utf-8')
                # Basic whitespace normalization
                text = ' '.join(text.split())
            result["content"] = text or ""

    except TimeoutException:
        result["error"] = f"Element not found within {timeout}s timeout"
    except NoSuchElementException:
        result["error"] = "Element not found"
    except Exception as e:
        result["error"] = f"Error extracting element: {str(e)}"
    finally:
        # Always switch back to default content
        try:
            if ctx.is_driver_initialized():
                ctx.driver.switch_to.default_content()
        except Exception:
            pass

    return result


__all__ = ['extract_elements']
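MODE 1 remains the escape hatch for one-off grabs, including elements inside iframes or shadow roots. A closing sketch with hypothetical selectors; like the earlier ones it assumes an initialized browser context:

import json

from mcp_browser_use.actions.extraction import extract_elements

async def grab_single_elements():
    raw = await extract_elements(selectors=[
        {"selector": "span.price", "type": "css", "format": "text", "name": "price"},
        {
            "selector": "#consent-text",                # hypothetical element in an iframe
            "format": "text",
            "iframe_selector": "iframe#consent-frame",  # resolved before the lookup
        },
    ])
    return json.loads(raw)["extracted_elements"]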