phantomfetch 0.4.4__tar.gz → 0.4.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/PKG-INFO +1 -1
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/pyproject.toml +1 -1
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/src/phantomfetch/engines/browser/actions.py +133 -16
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/src/phantomfetch/engines/browser/cdp.py +151 -98
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/README.md +0 -0
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/src/phantomfetch/__init__.py +0 -0
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/src/phantomfetch/cache.py +0 -0
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/src/phantomfetch/captcha.py +0 -0
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/src/phantomfetch/engines/__init__.py +0 -0
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/src/phantomfetch/engines/base.py +0 -0
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/src/phantomfetch/engines/browser/__init__.py +0 -0
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/src/phantomfetch/engines/curl.py +0 -0
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/src/phantomfetch/fetch.py +0 -0
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/src/phantomfetch/pool.py +0 -0
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/src/phantomfetch/telemetry.py +0 -0
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/src/phantomfetch/tools/selector_builder.py +0 -0
- {phantomfetch-0.4.4 → phantomfetch-0.4.6}/src/phantomfetch/types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: phantomfetch
|
|
3
|
-
Version: 0.4.4
|
|
3
|
+
Version: 0.4.6
|
|
4
4
|
Summary: High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities
|
|
5
5
|
Keywords: web-scraping,playwright,curl-cffi,async,browser-automation,http-client,agentic,anti-detection
|
|
6
6
|
Author: CosmicBull
|
|
@@ -5,7 +5,7 @@ import time
|
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
7
|
if TYPE_CHECKING:
|
|
8
|
-
from
|
|
8
|
+
from playwright.async_api import Locator, Page
|
|
9
9
|
|
|
10
10
|
from ...types import ActionResult
|
|
11
11
|
|
|
@@ -71,7 +71,7 @@ async def execute_actions(
|
|
|
71
71
|
Returns:
|
|
72
72
|
List of ActionResult objects
|
|
73
73
|
"""
|
|
74
|
-
from
|
|
74
|
+
from playwright.async_api import Page
|
|
75
75
|
|
|
76
76
|
from ...types import ActionResult
|
|
77
77
|
|
|
@@ -172,23 +172,138 @@ async def execute_actions(
|
|
|
172
172
|
result.success = False
|
|
173
173
|
result.error = "Loop requires selector and child actions"
|
|
174
174
|
else:
|
|
175
|
-
# 1. Find all elements on the resolved context
|
|
176
|
-
elements = await ctx.locator(action.selector).all()
|
|
177
|
-
loop_results = []
|
|
178
|
-
|
|
179
175
|
limit = action.max_iterations or 100
|
|
180
176
|
logger.debug(
|
|
181
|
-
f"[browser]
|
|
177
|
+
f"[browser] Compiling loop for '{action.selector}' to native JS (limit={limit})"
|
|
182
178
|
)
|
|
183
179
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
180
|
+
serialized_actions = actions_to_payload(action.actions)
|
|
181
|
+
|
|
182
|
+
js_native_loop = """
|
|
183
|
+
async (nodeOrArgs, argsIfNode) => {
|
|
184
|
+
let rootNode = document;
|
|
185
|
+
let args = nodeOrArgs;
|
|
186
|
+
if (nodeOrArgs instanceof Node) {
|
|
187
|
+
rootNode = nodeOrArgs;
|
|
188
|
+
args = argsIfNode;
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
const { selector, actions, limit } = args;
|
|
192
|
+
const elements = Array.from(rootNode.querySelectorAll(selector)).slice(0, limit);
|
|
193
|
+
const results = [];
|
|
194
|
+
|
|
195
|
+
const sleep = ms => new Promise(r => setTimeout(r, ms));
|
|
196
|
+
|
|
197
|
+
const waitFor = async (sel, state, timeout) => {
|
|
198
|
+
const start = Date.now();
|
|
199
|
+
while (Date.now() - start < timeout) {
|
|
200
|
+
const el = document.querySelector(sel);
|
|
201
|
+
const isVis = el && el.offsetParent !== null && window.getComputedStyle(el).display !== 'none' && window.getComputedStyle(el).visibility !== 'hidden';
|
|
202
|
+
if (state === 'hidden' && !isVis) return true;
|
|
203
|
+
if (state !== 'hidden' && isVis) return true;
|
|
204
|
+
await sleep(100);
|
|
205
|
+
}
|
|
206
|
+
throw new Error(`Timeout waiting for ${sel} to be ${state}`);
|
|
207
|
+
};
|
|
190
208
|
|
|
191
|
-
|
|
209
|
+
const isVisible = (el) => el && el.offsetParent !== null && window.getComputedStyle(el).display !== 'none' && window.getComputedStyle(el).visibility !== 'hidden';
|
|
210
|
+
|
|
211
|
+
const extractSingle = (c, spec) => {
|
|
212
|
+
let sel = spec, op = "text", param = null, visibleOnly = false;
|
|
213
|
+
if (typeof spec === "object" && spec !== null && spec._selector) {
|
|
214
|
+
sel = spec._selector; visibleOnly = !!spec._visible_only;
|
|
215
|
+
} else if (typeof spec === "string") {
|
|
216
|
+
if (spec.includes(" :: ")) {
|
|
217
|
+
const parts = spec.split(" :: ");
|
|
218
|
+
sel = parts[0];
|
|
219
|
+
if (parts[1].startsWith("attr(")) { op = "attr"; param = parts[1].slice(5, -1); }
|
|
220
|
+
else { op = parts[1]; }
|
|
221
|
+
} else if (spec.trim().startsWith("::")) {
|
|
222
|
+
sel = null;
|
|
223
|
+
const parts = spec.trim().substring(2).trim();
|
|
224
|
+
if (parts.startsWith("attr(")) { op = "attr"; param = parts.slice(5, -1); }
|
|
225
|
+
else { op = parts; }
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
if (!sel || sel.trim() === "") sel = null;
|
|
229
|
+
let el = visibleOnly ? Array.from(sel ? c.querySelectorAll(sel) : [c]).find(isVisible) : (sel ? c.querySelector(sel) : c);
|
|
230
|
+
if (!el) return null;
|
|
231
|
+
if (op === "text") return el.innerText.trim();
|
|
232
|
+
if (op === "html") return el.outerHTML;
|
|
233
|
+
if (op === "attr" && param) return el.getAttribute(param);
|
|
234
|
+
return null;
|
|
235
|
+
};
|
|
236
|
+
|
|
237
|
+
const processSchema = (c, s) => {
|
|
238
|
+
const out = {};
|
|
239
|
+
for (const [key, val] of Object.entries(s)) {
|
|
240
|
+
if (typeof val === "string") out[key] = extractSingle(c, val);
|
|
241
|
+
else if (typeof val === "object" && val !== null) {
|
|
242
|
+
if (val._selector) out[key] = extractSingle(c, val);
|
|
243
|
+
else out[key] = processSchema(c, val);
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
return out;
|
|
247
|
+
};
|
|
248
|
+
|
|
249
|
+
for (let i = 0; i < elements.length; i++) {
|
|
250
|
+
const el = elements[i];
|
|
251
|
+
const iterResults = [];
|
|
252
|
+
|
|
253
|
+
for (const act of actions) {
|
|
254
|
+
const scopeCtx = act.scope === 'page' ? document : el;
|
|
255
|
+
const actionResult = { action: act, success: true, data: null };
|
|
256
|
+
|
|
257
|
+
try {
|
|
258
|
+
if (act.action === 'wait') {
|
|
259
|
+
if (act.selector) {
|
|
260
|
+
await waitFor(act.selector, act.state || 'visible', act.timeout || 5000);
|
|
261
|
+
} else if (act.timeout) {
|
|
262
|
+
await sleep(act.timeout);
|
|
263
|
+
}
|
|
264
|
+
} else if (act.action === 'click') {
|
|
265
|
+
const target = act.selector ? scopeCtx.querySelector(act.selector) : scopeCtx;
|
|
266
|
+
if (target) {
|
|
267
|
+
target.scrollIntoView({ behavior: 'instant', block: 'center' });
|
|
268
|
+
target.click();
|
|
269
|
+
await sleep(act.human_like ? Math.floor(Math.random() * 100) + 50 : 10);
|
|
270
|
+
} else throw new Error(`Click target not found: ${act.selector}`);
|
|
271
|
+
} else if (act.action === 'extract') {
|
|
272
|
+
const root = act.selector ? scopeCtx.querySelector(act.selector) : scopeCtx;
|
|
273
|
+
if (root) actionResult.data = processSchema(root, act.schema);
|
|
274
|
+
else throw new Error(`Extract target not found: ${act.selector}`);
|
|
275
|
+
} else if (act.action === 'evaluate') {
|
|
276
|
+
if (act.value) actionResult.data = eval(act.value);
|
|
277
|
+
}
|
|
278
|
+
} catch (err) {
|
|
279
|
+
actionResult.success = false;
|
|
280
|
+
actionResult.error = err.message;
|
|
281
|
+
if (act.fail_on_error) {
|
|
282
|
+
iterResults.push(actionResult);
|
|
283
|
+
throw err;
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
iterResults.push(actionResult);
|
|
287
|
+
}
|
|
288
|
+
results.push({ index: i, results: iterResults });
|
|
289
|
+
}
|
|
290
|
+
return results;
|
|
291
|
+
}
|
|
292
|
+
"""
|
|
293
|
+
|
|
294
|
+
try:
|
|
295
|
+
loop_results = await ctx.evaluate(
|
|
296
|
+
js_native_loop,
|
|
297
|
+
{
|
|
298
|
+
"selector": action.selector,
|
|
299
|
+
"actions": serialized_actions,
|
|
300
|
+
"limit": limit,
|
|
301
|
+
},
|
|
302
|
+
)
|
|
303
|
+
result.data = loop_results
|
|
304
|
+
except Exception as e:
|
|
305
|
+
result.success = False
|
|
306
|
+
result.error = f"Native JS Loop Execution Failed: {e}"
|
|
192
307
|
|
|
193
308
|
case "click":
|
|
194
309
|
if action.selector:
|
|
@@ -644,10 +759,12 @@ async def execute_actions(
|
|
|
644
759
|
|
|
645
760
|
finally:
|
|
646
761
|
result.duration = time.perf_counter() - start_time
|
|
647
|
-
|
|
762
|
+
|
|
648
763
|
# Enhanced OTel Attributes
|
|
649
764
|
span.set_attribute("phantomfetch.action.success", result.success)
|
|
650
|
-
span.set_attribute(
|
|
765
|
+
span.set_attribute(
|
|
766
|
+
"phantomfetch.action.duration_ms", result.duration * 1000
|
|
767
|
+
)
|
|
651
768
|
if result.error:
|
|
652
769
|
span.set_attribute("phantomfetch.action.error", str(result.error))
|
|
653
770
|
|
|
@@ -1,20 +1,25 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import re
|
|
1
3
|
import time
|
|
2
4
|
from typing import TYPE_CHECKING, Any, Optional
|
|
3
|
-
import sys
|
|
4
5
|
from urllib.parse import urlparse
|
|
5
6
|
|
|
6
7
|
from browserforge.fingerprints import FingerprintGenerator, Screen
|
|
7
8
|
from browserforge.injectors.playwright import AsyncNewContext
|
|
8
9
|
from loguru import logger
|
|
9
10
|
from opentelemetry import context
|
|
11
|
+
from undetected_playwright import stealth_async
|
|
10
12
|
|
|
11
13
|
from ...telemetry import get_tracer
|
|
12
14
|
from ...types import Action, Cookie, Proxy, Response
|
|
13
15
|
from .actions import execute_actions
|
|
14
|
-
from undetected_playwright import stealth_async
|
|
15
16
|
|
|
16
17
|
tracer = get_tracer()
|
|
17
18
|
|
|
19
|
+
BLOCK_DOMAINS = re.compile(
|
|
20
|
+
r"(google-analytics\.com|doubleclick\.net|googletagmanager\.com|facebook\.net|hotjar\.com)"
|
|
21
|
+
)
|
|
22
|
+
|
|
18
23
|
if TYPE_CHECKING:
|
|
19
24
|
# ... (omitted shared lines) ...
|
|
20
25
|
|
|
@@ -102,7 +107,11 @@ class CDPEngine:
|
|
|
102
107
|
self.browser_type = browser_type
|
|
103
108
|
|
|
104
109
|
# Map playwright browser type to BrowserForge constraint
|
|
105
|
-
if "browser" not in self.fingerprint_options and self.browser_type in (
|
|
110
|
+
if "browser" not in self.fingerprint_options and self.browser_type in (
|
|
111
|
+
"chromium",
|
|
112
|
+
"firefox",
|
|
113
|
+
"webkit",
|
|
114
|
+
):
|
|
106
115
|
bf_map = {"chromium": "chrome", "webkit": "safari", "firefox": "firefox"}
|
|
107
116
|
self.fingerprint_options["browser"] = bf_map[self.browser_type]
|
|
108
117
|
|
|
@@ -112,7 +121,9 @@ class CDPEngine:
|
|
|
112
121
|
|
|
113
122
|
# BrowserForge fingerprint generator — instantiated once, generate() per request
|
|
114
123
|
if fingerprint:
|
|
115
|
-
screen = Screen(
|
|
124
|
+
screen = Screen(
|
|
125
|
+
min_width=1280, max_width=1920, min_height=720, max_height=1080
|
|
126
|
+
)
|
|
116
127
|
self._fp_gen: FingerprintGenerator | None = FingerprintGenerator(
|
|
117
128
|
screen=screen,
|
|
118
129
|
mock_webrtc=True,
|
|
@@ -131,22 +142,41 @@ class CDPEngine:
|
|
|
131
142
|
from playwright.async_api import async_playwright
|
|
132
143
|
|
|
133
144
|
self._playwright = await async_playwright().start()
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
145
|
+
try:
|
|
146
|
+
if self.cdp_endpoint:
|
|
147
|
+
logger.info(f"[cdp] Connecting to: {self.cdp_endpoint}")
|
|
148
|
+
browser_obj = getattr(self._playwright, self.browser_type)
|
|
149
|
+
self._browser = await browser_obj.connect_over_cdp(self.cdp_endpoint)
|
|
150
|
+
|
|
151
|
+
if self.use_existing_page and self._browser.contexts:
|
|
152
|
+
self._existing_context = self._browser.contexts[0]
|
|
153
|
+
if self._existing_context.pages:
|
|
154
|
+
self._existing_page = self._existing_context.pages[0]
|
|
155
|
+
logger.info(
|
|
156
|
+
"[cdp] Using existing page for recording compatibility"
|
|
157
|
+
)
|
|
158
|
+
else:
|
|
159
|
+
logger.info(
|
|
160
|
+
f"[cdp] Launching local browser (headless={self.headless}, type={self.browser_type})"
|
|
161
|
+
)
|
|
162
|
+
base_args = []
|
|
163
|
+
if self.browser_type == "chromium":
|
|
164
|
+
base_args = [
|
|
165
|
+
"--disable-blink-features=AutomationControlled",
|
|
166
|
+
"--no-sandbox",
|
|
167
|
+
"--disable-setuid-sandbox",
|
|
168
|
+
]
|
|
169
|
+
extra = [a for a in self.launch_args if a not in base_args]
|
|
170
|
+
|
|
171
|
+
browser_obj = getattr(self._playwright, self.browser_type)
|
|
172
|
+
self._browser = await browser_obj.launch(
|
|
173
|
+
headless=self.headless,
|
|
174
|
+
args=base_args + extra,
|
|
175
|
+
)
|
|
176
|
+
except Exception as e:
|
|
177
|
+
await self._playwright.stop()
|
|
178
|
+
self._playwright = None
|
|
179
|
+
raise e
|
|
150
180
|
base_args = []
|
|
151
181
|
if self.browser_type == "chromium":
|
|
152
182
|
# --no-sandbox is required on Linux environments that don't have
|
|
@@ -158,7 +188,7 @@ class CDPEngine:
|
|
|
158
188
|
"--disable-setuid-sandbox",
|
|
159
189
|
]
|
|
160
190
|
extra = [a for a in self.launch_args if a not in base_args]
|
|
161
|
-
|
|
191
|
+
|
|
162
192
|
browser_obj = getattr(self._playwright, self.browser_type)
|
|
163
193
|
self._browser = await browser_obj.launch(
|
|
164
194
|
headless=self.headless,
|
|
@@ -348,13 +378,17 @@ class CDPEngine:
|
|
|
348
378
|
|
|
349
379
|
if proxy:
|
|
350
380
|
parsed = urlparse(proxy.url)
|
|
351
|
-
proxy_server =
|
|
381
|
+
proxy_server = (
|
|
382
|
+
f"{parsed.scheme}://{parsed.hostname}:{parsed.port}"
|
|
383
|
+
if parsed.port
|
|
384
|
+
else f"{parsed.scheme}://{parsed.hostname}"
|
|
385
|
+
)
|
|
352
386
|
proxy_dict = {"server": proxy_server}
|
|
353
387
|
if parsed.username:
|
|
354
388
|
proxy_dict["username"] = parsed.username
|
|
355
389
|
if parsed.password:
|
|
356
390
|
proxy_dict["password"] = parsed.password
|
|
357
|
-
|
|
391
|
+
|
|
358
392
|
context_opts["proxy"] = proxy_dict
|
|
359
393
|
|
|
360
394
|
# If we have basic cookies to set via context creation (simpler than add_cookies sometimes)
|
|
@@ -371,47 +405,50 @@ class CDPEngine:
|
|
|
371
405
|
page = self._existing_page
|
|
372
406
|
using_existing = True
|
|
373
407
|
logger.debug(f"[cdp] Reusing existing page for {url}")
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
)
|
|
408
|
+
# Create new context with BrowserForge fingerprint injection (if enabled)
|
|
409
|
+
elif self._fp_gen is not None:
|
|
410
|
+
# Generate a fingerprint with any per-instance constraints
|
|
411
|
+
fp = self._fp_gen.generate(**self.fingerprint_options)
|
|
412
|
+
|
|
413
|
+
# Sync UA and viewport FROM the fingerprint for full consistency.
|
|
414
|
+
# Explicit user_agent/viewport overrides take precedence.
|
|
415
|
+
if not self.user_agent:
|
|
416
|
+
context_opts["user_agent"] = fp.navigator.userAgent
|
|
417
|
+
if not self.viewport:
|
|
418
|
+
context_opts["viewport"] = {
|
|
419
|
+
"width": fp.screen.width,
|
|
420
|
+
"height": fp.screen.height,
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
logger.debug(
|
|
424
|
+
f"[cdp] BrowserForge fingerprint: UA={fp.navigator.userAgent[:60]}..."
|
|
425
|
+
)
|
|
393
426
|
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
427
|
+
# AsyncNewContext injects all fingerprint JS overrides
|
|
428
|
+
# (Canvas, WebGL, navigator, AudioContext, fonts, etc.)
|
|
429
|
+
browser_context = await AsyncNewContext(
|
|
430
|
+
self._browser,
|
|
431
|
+
fingerprint=fp,
|
|
432
|
+
**context_opts,
|
|
433
|
+
)
|
|
401
434
|
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
435
|
+
# Apply matching headers so HTTP-layer headers are consistent
|
|
436
|
+
# with the JS navigator (UA, sec-ch-ua*, Accept-Language, etc.)
|
|
437
|
+
if fp.headers:
|
|
438
|
+
# Merge: fingerprint headers are base; caller's explicit
|
|
439
|
+
# headers=... arg will be applied later via set_extra_http_headers.
|
|
440
|
+
# Here we only set the fingerprint-derived defaults.
|
|
441
|
+
fp_headers = dict(fp.headers)
|
|
442
|
+
# Remove headers that Playwright manages internally to avoid conflicts
|
|
443
|
+
for managed in (
|
|
444
|
+
"Host",
|
|
445
|
+
"Content-Length",
|
|
446
|
+
"Transfer-Encoding",
|
|
447
|
+
):
|
|
448
|
+
fp_headers.pop(managed, None)
|
|
449
|
+
await browser_context.set_extra_http_headers(fp_headers)
|
|
450
|
+
else:
|
|
451
|
+
browser_context = await self._browser.new_context(**context_opts)
|
|
415
452
|
|
|
416
453
|
except Exception as e:
|
|
417
454
|
span.record_exception(e)
|
|
@@ -457,23 +494,16 @@ class CDPEngine:
|
|
|
457
494
|
# Actually valid Playwright storage_state is best.
|
|
458
495
|
# If we use context.add_init_script, we can inject it.
|
|
459
496
|
if "origins" in storage_state:
|
|
497
|
+
import json
|
|
498
|
+
|
|
460
499
|
for origin_data in storage_state["origins"]:
|
|
461
500
|
origin = origin_data["origin"]
|
|
462
|
-
ls_data = origin_data[
|
|
463
|
-
|
|
464
|
-
] # list of {name, value}
|
|
465
|
-
# Construct JS to set items
|
|
466
|
-
js_setter = ""
|
|
467
|
-
for item in ls_data:
|
|
468
|
-
k = item["name"].replace('"', '\\"')
|
|
469
|
-
v = item["value"].replace('"', '\\"')
|
|
470
|
-
js_setter += (
|
|
471
|
-
f'window.localStorage.setItem("{k}", "{v}");'
|
|
472
|
-
)
|
|
473
|
-
|
|
501
|
+
ls_data = origin_data["localStorage"]
|
|
502
|
+
ls_json = json.dumps(ls_data)
|
|
474
503
|
script = f"""
|
|
475
504
|
if (window.location.origin === "{origin}") {{
|
|
476
|
-
{js_setter}
|
|
505
|
+
const data = {ls_json};
|
|
506
|
+
data.forEach(item => window.localStorage.setItem(item.name, item.value));
|
|
477
507
|
}}
|
|
478
508
|
"""
|
|
479
509
|
await browser_context.add_init_script(script)
|
|
@@ -538,12 +568,13 @@ class CDPEngine:
|
|
|
538
568
|
try:
|
|
539
569
|
req = response.request
|
|
540
570
|
if req.resource_type in ("xhr", "fetch"):
|
|
541
|
-
# Capture body safely
|
|
542
571
|
resp_body = None
|
|
543
572
|
try:
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
573
|
+
content_length = int(
|
|
574
|
+
response.headers.get("content-length", 0)
|
|
575
|
+
)
|
|
576
|
+
if content_length < 1024 * 1024:
|
|
577
|
+
body_bytes = await response.body()
|
|
547
578
|
resp_body = body_bytes.decode(
|
|
548
579
|
"utf-8", errors="replace"
|
|
549
580
|
)
|
|
@@ -609,10 +640,8 @@ class CDPEngine:
|
|
|
609
640
|
context.detach(token)
|
|
610
641
|
|
|
611
642
|
async def handle_route_with_context(route: "Route") -> None:
|
|
612
|
-
# Attach the captured context
|
|
613
643
|
token = context.attach(current_ctx)
|
|
614
644
|
try:
|
|
615
|
-
# Handle resource blocking
|
|
616
645
|
if (
|
|
617
646
|
block_resources
|
|
618
647
|
and route.request.resource_type in block_resources
|
|
@@ -620,7 +649,19 @@ class CDPEngine:
|
|
|
620
649
|
await route.abort()
|
|
621
650
|
return
|
|
622
651
|
|
|
652
|
+
if BLOCK_DOMAINS.search(route.request.url):
|
|
653
|
+
await route.abort()
|
|
654
|
+
return
|
|
655
|
+
|
|
623
656
|
await self._handle_route(route)
|
|
657
|
+
except Exception as e:
|
|
658
|
+
logger.error(
|
|
659
|
+
f"[cdp] Route handler catastrophically failed: {e}"
|
|
660
|
+
)
|
|
661
|
+
try:
|
|
662
|
+
await route.continue_()
|
|
663
|
+
except Exception:
|
|
664
|
+
pass
|
|
624
665
|
finally:
|
|
625
666
|
context.detach(token)
|
|
626
667
|
|
|
@@ -634,12 +675,18 @@ class CDPEngine:
|
|
|
634
675
|
|
|
635
676
|
# Navigate
|
|
636
677
|
span.add_event("navigation.start")
|
|
678
|
+
await browser_context.tracing.start(
|
|
679
|
+
screenshots=True, snapshots=True, sources=True
|
|
680
|
+
)
|
|
637
681
|
try:
|
|
638
682
|
response = await page.goto(
|
|
639
683
|
url, timeout=timeout_ms, wait_until=wait_until
|
|
640
684
|
)
|
|
641
685
|
except Exception as e:
|
|
642
686
|
span.record_exception(e)
|
|
687
|
+
await browser_context.tracing.stop(
|
|
688
|
+
path=f"/tmp/trace_{int(time.time())}.zip"
|
|
689
|
+
)
|
|
643
690
|
return Response(
|
|
644
691
|
url=url,
|
|
645
692
|
status=0,
|
|
@@ -660,9 +707,9 @@ class CDPEngine:
|
|
|
660
707
|
except Exception as e:
|
|
661
708
|
logger.warning(f"[cdp] wait_for_url failed: {e}")
|
|
662
709
|
span.record_exception(e)
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
710
|
+
await browser_context.tracing.stop(
|
|
711
|
+
path=f"/tmp/trace_{int(time.time())}.zip"
|
|
712
|
+
)
|
|
666
713
|
body_content = b""
|
|
667
714
|
try:
|
|
668
715
|
body_content = (await page.content()).encode("utf-8")
|
|
@@ -671,7 +718,7 @@ class CDPEngine:
|
|
|
671
718
|
|
|
672
719
|
return Response(
|
|
673
720
|
url=page.url,
|
|
674
|
-
status=0,
|
|
721
|
+
status=0,
|
|
675
722
|
body=body_content,
|
|
676
723
|
engine="browser",
|
|
677
724
|
error=f"Wait for URL failed: {wait_for_url}. Current URL: {page.url}",
|
|
@@ -686,10 +733,11 @@ class CDPEngine:
|
|
|
686
733
|
try:
|
|
687
734
|
action_results = await execute_actions(page, actions)
|
|
688
735
|
except RuntimeError as e:
|
|
689
|
-
# Catch Fail Fast exceptions
|
|
690
736
|
logger.warning(f"[cdp] Action fail_on_error triggered: {e}")
|
|
691
737
|
span.record_exception(e)
|
|
692
|
-
|
|
738
|
+
await browser_context.tracing.stop(
|
|
739
|
+
path=f"/tmp/trace_{int(time.time())}.zip"
|
|
740
|
+
)
|
|
693
741
|
body_content = b""
|
|
694
742
|
try:
|
|
695
743
|
body_content = (await page.content()).encode("utf-8")
|
|
@@ -703,11 +751,6 @@ class CDPEngine:
|
|
|
703
751
|
engine="browser",
|
|
704
752
|
error=f"Action Execution Failed: {e}",
|
|
705
753
|
network_log=network_log,
|
|
706
|
-
# We might have partial results if execute_actions appends before raising
|
|
707
|
-
# But execute_actions usually returns the list.
|
|
708
|
-
# In failure case, it raises, so we don't get the list return value.
|
|
709
|
-
# However, we modified execute_actions to raise, so we lose the list unless we modify it to attach results to exception.
|
|
710
|
-
# For now, simplistic error return is fine.
|
|
711
754
|
action_results=[],
|
|
712
755
|
)
|
|
713
756
|
span.add_event("actions.end")
|
|
@@ -741,7 +784,9 @@ class CDPEngine:
|
|
|
741
784
|
break
|
|
742
785
|
except Exception as e:
|
|
743
786
|
if "Unable to retrieve content" in str(e) and i < 2:
|
|
744
|
-
logger.debug(
|
|
787
|
+
logger.debug(
|
|
788
|
+
f"[cdp] Retrying content retrieval due to navigation ({i + 1}/3)"
|
|
789
|
+
)
|
|
745
790
|
await asyncio.sleep(0.5)
|
|
746
791
|
else:
|
|
747
792
|
logger.warning(f"[cdp] Failed to retrieve content: {e}")
|
|
@@ -796,6 +841,15 @@ class CDPEngine:
|
|
|
796
841
|
storage_state=current_storage_state,
|
|
797
842
|
)
|
|
798
843
|
|
|
844
|
+
except asyncio.CancelledError:
|
|
845
|
+
logger.warning("Scrape cancelled by orchestrator. Shielding teardown.")
|
|
846
|
+
if not using_existing and browser_context:
|
|
847
|
+
try:
|
|
848
|
+
await asyncio.shield(browser_context.close())
|
|
849
|
+
except Exception:
|
|
850
|
+
pass
|
|
851
|
+
raise
|
|
852
|
+
|
|
799
853
|
except Exception as e:
|
|
800
854
|
logger.error(f"[cdp] Error: {e}")
|
|
801
855
|
span.record_exception(e)
|
|
@@ -811,9 +865,8 @@ class CDPEngine:
|
|
|
811
865
|
)
|
|
812
866
|
|
|
813
867
|
finally:
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
if page:
|
|
817
|
-
await page.close()
|
|
818
|
-
if browser_context:
|
|
868
|
+
if not using_existing and browser_context:
|
|
869
|
+
try:
|
|
819
870
|
await browser_context.close()
|
|
871
|
+
except Exception:
|
|
872
|
+
pass
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|