phantomfetch 0.4.4__tar.gz → 0.4.6__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: phantomfetch
3
- Version: 0.4.4
3
+ Version: 0.4.6
4
4
  Summary: High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities
5
5
  Keywords: web-scraping,playwright,curl-cffi,async,browser-automation,http-client,agentic,anti-detection
6
6
  Author: CosmicBull
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "phantomfetch"
3
- version = "0.4.4"
3
+ version = "0.4.6"
4
4
  description = "High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.13"
@@ -5,7 +5,7 @@ import time
5
5
  from typing import TYPE_CHECKING, Any
6
6
 
7
7
  if TYPE_CHECKING:
8
- from rebrowser_playwright.async_api import Locator, Page
8
+ from playwright.async_api import Locator, Page
9
9
 
10
10
  from ...types import ActionResult
11
11
 
@@ -71,7 +71,7 @@ async def execute_actions(
71
71
  Returns:
72
72
  List of ActionResult objects
73
73
  """
74
- from rebrowser_playwright.async_api import Page
74
+ from playwright.async_api import Page
75
75
 
76
76
  from ...types import ActionResult
77
77
 
@@ -172,23 +172,138 @@ async def execute_actions(
172
172
  result.success = False
173
173
  result.error = "Loop requires selector and child actions"
174
174
  else:
175
- # 1. Find all elements on the resolved context
176
- elements = await ctx.locator(action.selector).all()
177
- loop_results = []
178
-
179
175
  limit = action.max_iterations or 100
180
176
  logger.debug(
181
- f"[browser] Looping over {len(elements)} elements (limit={limit})"
177
+ f"[browser] Compiling loop for '{action.selector}' to native JS (limit={limit})"
182
178
  )
183
179
 
184
- for i, el in enumerate(elements):
185
- if i >= limit:
186
- break
187
- # 2. Execute child actions on EACH element locator
188
- sub_res = await execute_actions(el, action.actions)
189
- loop_results.append({"index": i, "results": sub_res})
180
+ serialized_actions = actions_to_payload(action.actions)
181
+
182
+ js_native_loop = """
183
+ async (nodeOrArgs, argsIfNode) => {
184
+ let rootNode = document;
185
+ let args = nodeOrArgs;
186
+ if (nodeOrArgs instanceof Node) {
187
+ rootNode = nodeOrArgs;
188
+ args = argsIfNode;
189
+ }
190
+
191
+ const { selector, actions, limit } = args;
192
+ const elements = Array.from(rootNode.querySelectorAll(selector)).slice(0, limit);
193
+ const results = [];
194
+
195
+ const sleep = ms => new Promise(r => setTimeout(r, ms));
196
+
197
+ const waitFor = async (sel, state, timeout) => {
198
+ const start = Date.now();
199
+ while (Date.now() - start < timeout) {
200
+ const el = document.querySelector(sel);
201
+ const isVis = el && el.offsetParent !== null && window.getComputedStyle(el).display !== 'none' && window.getComputedStyle(el).visibility !== 'hidden';
202
+ if (state === 'hidden' && !isVis) return true;
203
+ if (state !== 'hidden' && isVis) return true;
204
+ await sleep(100);
205
+ }
206
+ throw new Error(`Timeout waiting for ${sel} to be ${state}`);
207
+ };
190
208
 
191
- result.data = loop_results
209
+ const isVisible = (el) => el && el.offsetParent !== null && window.getComputedStyle(el).display !== 'none' && window.getComputedStyle(el).visibility !== 'hidden';
210
+
211
+ const extractSingle = (c, spec) => {
212
+ let sel = spec, op = "text", param = null, visibleOnly = false;
213
+ if (typeof spec === "object" && spec !== null && spec._selector) {
214
+ sel = spec._selector; visibleOnly = !!spec._visible_only;
215
+ } else if (typeof spec === "string") {
216
+ if (spec.includes(" :: ")) {
217
+ const parts = spec.split(" :: ");
218
+ sel = parts[0];
219
+ if (parts[1].startsWith("attr(")) { op = "attr"; param = parts[1].slice(5, -1); }
220
+ else { op = parts[1]; }
221
+ } else if (spec.trim().startsWith("::")) {
222
+ sel = null;
223
+ const parts = spec.trim().substring(2).trim();
224
+ if (parts.startsWith("attr(")) { op = "attr"; param = parts.slice(5, -1); }
225
+ else { op = parts; }
226
+ }
227
+ }
228
+ if (!sel || sel.trim() === "") sel = null;
229
+ let el = visibleOnly ? Array.from(sel ? c.querySelectorAll(sel) : [c]).find(isVisible) : (sel ? c.querySelector(sel) : c);
230
+ if (!el) return null;
231
+ if (op === "text") return el.innerText.trim();
232
+ if (op === "html") return el.outerHTML;
233
+ if (op === "attr" && param) return el.getAttribute(param);
234
+ return null;
235
+ };
236
+
237
+ const processSchema = (c, s) => {
238
+ const out = {};
239
+ for (const [key, val] of Object.entries(s)) {
240
+ if (typeof val === "string") out[key] = extractSingle(c, val);
241
+ else if (typeof val === "object" && val !== null) {
242
+ if (val._selector) out[key] = extractSingle(c, val);
243
+ else out[key] = processSchema(c, val);
244
+ }
245
+ }
246
+ return out;
247
+ };
248
+
249
+ for (let i = 0; i < elements.length; i++) {
250
+ const el = elements[i];
251
+ const iterResults = [];
252
+
253
+ for (const act of actions) {
254
+ const scopeCtx = act.scope === 'page' ? document : el;
255
+ const actionResult = { action: act, success: true, data: null };
256
+
257
+ try {
258
+ if (act.action === 'wait') {
259
+ if (act.selector) {
260
+ await waitFor(act.selector, act.state || 'visible', act.timeout || 5000);
261
+ } else if (act.timeout) {
262
+ await sleep(act.timeout);
263
+ }
264
+ } else if (act.action === 'click') {
265
+ const target = act.selector ? scopeCtx.querySelector(act.selector) : scopeCtx;
266
+ if (target) {
267
+ target.scrollIntoView({ behavior: 'instant', block: 'center' });
268
+ target.click();
269
+ await sleep(act.human_like ? Math.floor(Math.random() * 100) + 50 : 10);
270
+ } else throw new Error(`Click target not found: ${act.selector}`);
271
+ } else if (act.action === 'extract') {
272
+ const root = act.selector ? scopeCtx.querySelector(act.selector) : scopeCtx;
273
+ if (root) actionResult.data = processSchema(root, act.schema);
274
+ else throw new Error(`Extract target not found: ${act.selector}`);
275
+ } else if (act.action === 'evaluate') {
276
+ if (act.value) actionResult.data = eval(act.value);
277
+ }
278
+ } catch (err) {
279
+ actionResult.success = false;
280
+ actionResult.error = err.message;
281
+ if (act.fail_on_error) {
282
+ iterResults.push(actionResult);
283
+ throw err;
284
+ }
285
+ }
286
+ iterResults.push(actionResult);
287
+ }
288
+ results.push({ index: i, results: iterResults });
289
+ }
290
+ return results;
291
+ }
292
+ """
293
+
294
+ try:
295
+ loop_results = await ctx.evaluate(
296
+ js_native_loop,
297
+ {
298
+ "selector": action.selector,
299
+ "actions": serialized_actions,
300
+ "limit": limit,
301
+ },
302
+ )
303
+ result.data = loop_results
304
+ except Exception as e:
305
+ result.success = False
306
+ result.error = f"Native JS Loop Execution Failed: {e}"
192
307
 
193
308
  case "click":
194
309
  if action.selector:
@@ -644,10 +759,12 @@ async def execute_actions(
644
759
 
645
760
  finally:
646
761
  result.duration = time.perf_counter() - start_time
647
-
762
+
648
763
  # Enhanced OTel Attributes
649
764
  span.set_attribute("phantomfetch.action.success", result.success)
650
- span.set_attribute("phantomfetch.action.duration_ms", result.duration * 1000)
765
+ span.set_attribute(
766
+ "phantomfetch.action.duration_ms", result.duration * 1000
767
+ )
651
768
  if result.error:
652
769
  span.set_attribute("phantomfetch.action.error", str(result.error))
653
770
 
@@ -1,20 +1,25 @@
1
+ import asyncio
2
+ import re
1
3
  import time
2
4
  from typing import TYPE_CHECKING, Any, Optional
3
- import sys
4
5
  from urllib.parse import urlparse
5
6
 
6
7
  from browserforge.fingerprints import FingerprintGenerator, Screen
7
8
  from browserforge.injectors.playwright import AsyncNewContext
8
9
  from loguru import logger
9
10
  from opentelemetry import context
11
+ from undetected_playwright import stealth_async
10
12
 
11
13
  from ...telemetry import get_tracer
12
14
  from ...types import Action, Cookie, Proxy, Response
13
15
  from .actions import execute_actions
14
- from undetected_playwright import stealth_async
15
16
 
16
17
  tracer = get_tracer()
17
18
 
19
+ BLOCK_DOMAINS = re.compile(
20
+ r"(google-analytics\.com|doubleclick\.net|googletagmanager\.com|facebook\.net|hotjar\.com)"
21
+ )
22
+
18
23
  if TYPE_CHECKING:
19
24
  # ... (omitted shared lines) ...
20
25
 
@@ -102,7 +107,11 @@ class CDPEngine:
102
107
  self.browser_type = browser_type
103
108
 
104
109
  # Map playwright browser type to BrowserForge constraint
105
- if "browser" not in self.fingerprint_options and self.browser_type in ("chromium", "firefox", "webkit"):
110
+ if "browser" not in self.fingerprint_options and self.browser_type in (
111
+ "chromium",
112
+ "firefox",
113
+ "webkit",
114
+ ):
106
115
  bf_map = {"chromium": "chrome", "webkit": "safari", "firefox": "firefox"}
107
116
  self.fingerprint_options["browser"] = bf_map[self.browser_type]
108
117
 
@@ -112,7 +121,9 @@ class CDPEngine:
112
121
 
113
122
  # BrowserForge fingerprint generator — instantiated once, generate() per request
114
123
  if fingerprint:
115
- screen = Screen(min_width=1280, max_width=1920, min_height=720, max_height=1080)
124
+ screen = Screen(
125
+ min_width=1280, max_width=1920, min_height=720, max_height=1080
126
+ )
116
127
  self._fp_gen: FingerprintGenerator | None = FingerprintGenerator(
117
128
  screen=screen,
118
129
  mock_webrtc=True,
@@ -131,22 +142,41 @@ class CDPEngine:
131
142
  from playwright.async_api import async_playwright
132
143
 
133
144
  self._playwright = await async_playwright().start()
134
-
135
- if self.cdp_endpoint:
136
- logger.info(f"[cdp] Connecting to: {self.cdp_endpoint}")
137
- browser_obj = getattr(self._playwright, self.browser_type)
138
- self._browser = await browser_obj.connect_over_cdp(
139
- self.cdp_endpoint
140
- )
141
-
142
- # Detect existing contexts/pages for recording compatibility
143
- if self.use_existing_page and self._browser.contexts:
144
- self._existing_context = self._browser.contexts[0]
145
- if self._existing_context.pages:
146
- self._existing_page = self._existing_context.pages[0]
147
- logger.info("[cdp] Using existing page for recording compatibility")
148
- else:
149
- logger.info(f"[cdp] Launching local browser (headless={self.headless}, type={self.browser_type})")
145
+ try:
146
+ if self.cdp_endpoint:
147
+ logger.info(f"[cdp] Connecting to: {self.cdp_endpoint}")
148
+ browser_obj = getattr(self._playwright, self.browser_type)
149
+ self._browser = await browser_obj.connect_over_cdp(self.cdp_endpoint)
150
+
151
+ if self.use_existing_page and self._browser.contexts:
152
+ self._existing_context = self._browser.contexts[0]
153
+ if self._existing_context.pages:
154
+ self._existing_page = self._existing_context.pages[0]
155
+ logger.info(
156
+ "[cdp] Using existing page for recording compatibility"
157
+ )
158
+ else:
159
+ logger.info(
160
+ f"[cdp] Launching local browser (headless={self.headless}, type={self.browser_type})"
161
+ )
162
+ base_args = []
163
+ if self.browser_type == "chromium":
164
+ base_args = [
165
+ "--disable-blink-features=AutomationControlled",
166
+ "--no-sandbox",
167
+ "--disable-setuid-sandbox",
168
+ ]
169
+ extra = [a for a in self.launch_args if a not in base_args]
170
+
171
+ browser_obj = getattr(self._playwright, self.browser_type)
172
+ self._browser = await browser_obj.launch(
173
+ headless=self.headless,
174
+ args=base_args + extra,
175
+ )
176
+ except Exception as e:
177
+ await self._playwright.stop()
178
+ self._playwright = None
179
+ raise e
150
180
  base_args = []
151
181
  if self.browser_type == "chromium":
152
182
  # --no-sandbox is required on Linux environments that don't have
@@ -158,7 +188,7 @@ class CDPEngine:
158
188
  "--disable-setuid-sandbox",
159
189
  ]
160
190
  extra = [a for a in self.launch_args if a not in base_args]
161
-
191
+
162
192
  browser_obj = getattr(self._playwright, self.browser_type)
163
193
  self._browser = await browser_obj.launch(
164
194
  headless=self.headless,
@@ -348,13 +378,17 @@ class CDPEngine:
348
378
 
349
379
  if proxy:
350
380
  parsed = urlparse(proxy.url)
351
- proxy_server = f"{parsed.scheme}://{parsed.hostname}:{parsed.port}" if parsed.port else f"{parsed.scheme}://{parsed.hostname}"
381
+ proxy_server = (
382
+ f"{parsed.scheme}://{parsed.hostname}:{parsed.port}"
383
+ if parsed.port
384
+ else f"{parsed.scheme}://{parsed.hostname}"
385
+ )
352
386
  proxy_dict = {"server": proxy_server}
353
387
  if parsed.username:
354
388
  proxy_dict["username"] = parsed.username
355
389
  if parsed.password:
356
390
  proxy_dict["password"] = parsed.password
357
-
391
+
358
392
  context_opts["proxy"] = proxy_dict
359
393
 
360
394
  # If we have basic cookies to set via context creation (simpler than add_cookies sometimes)
@@ -371,47 +405,50 @@ class CDPEngine:
371
405
  page = self._existing_page
372
406
  using_existing = True
373
407
  logger.debug(f"[cdp] Reusing existing page for {url}")
374
- else:
375
- # Create new context with BrowserForge fingerprint injection (if enabled)
376
- if self._fp_gen is not None:
377
- # Generate a fingerprint with any per-instance constraints
378
- fp = self._fp_gen.generate(**self.fingerprint_options)
379
-
380
- # Sync UA and viewport FROM the fingerprint for full consistency.
381
- # Explicit user_agent/viewport overrides take precedence.
382
- if not self.user_agent:
383
- context_opts["user_agent"] = fp.navigator.userAgent
384
- if not self.viewport:
385
- context_opts["viewport"] = {
386
- "width": fp.screen.width,
387
- "height": fp.screen.height,
388
- }
389
-
390
- logger.debug(
391
- f"[cdp] BrowserForge fingerprint: UA={fp.navigator.userAgent[:60]}..."
392
- )
408
+ # Create new context with BrowserForge fingerprint injection (if enabled)
409
+ elif self._fp_gen is not None:
410
+ # Generate a fingerprint with any per-instance constraints
411
+ fp = self._fp_gen.generate(**self.fingerprint_options)
412
+
413
+ # Sync UA and viewport FROM the fingerprint for full consistency.
414
+ # Explicit user_agent/viewport overrides take precedence.
415
+ if not self.user_agent:
416
+ context_opts["user_agent"] = fp.navigator.userAgent
417
+ if not self.viewport:
418
+ context_opts["viewport"] = {
419
+ "width": fp.screen.width,
420
+ "height": fp.screen.height,
421
+ }
422
+
423
+ logger.debug(
424
+ f"[cdp] BrowserForge fingerprint: UA={fp.navigator.userAgent[:60]}..."
425
+ )
393
426
 
394
- # AsyncNewContext injects all fingerprint JS overrides
395
- # (Canvas, WebGL, navigator, AudioContext, fonts, etc.)
396
- browser_context = await AsyncNewContext(
397
- self._browser,
398
- fingerprint=fp,
399
- **context_opts,
400
- )
427
+ # AsyncNewContext injects all fingerprint JS overrides
428
+ # (Canvas, WebGL, navigator, AudioContext, fonts, etc.)
429
+ browser_context = await AsyncNewContext(
430
+ self._browser,
431
+ fingerprint=fp,
432
+ **context_opts,
433
+ )
401
434
 
402
- # Apply matching headers so HTTP-layer headers are consistent
403
- # with the JS navigator (UA, sec-ch-ua*, Accept-Language, etc.)
404
- if fp.headers:
405
- # Merge: fingerprint headers are base; caller's explicit
406
- # headers=... arg will be applied later via set_extra_http_headers.
407
- # Here we only set the fingerprint-derived defaults.
408
- fp_headers = dict(fp.headers)
409
- # Remove headers that Playwright manages internally to avoid conflicts
410
- for managed in ("Host", "Content-Length", "Transfer-Encoding"):
411
- fp_headers.pop(managed, None)
412
- await browser_context.set_extra_http_headers(fp_headers)
413
- else:
414
- browser_context = await self._browser.new_context(**context_opts)
435
+ # Apply matching headers so HTTP-layer headers are consistent
436
+ # with the JS navigator (UA, sec-ch-ua*, Accept-Language, etc.)
437
+ if fp.headers:
438
+ # Merge: fingerprint headers are base; caller's explicit
439
+ # headers=... arg will be applied later via set_extra_http_headers.
440
+ # Here we only set the fingerprint-derived defaults.
441
+ fp_headers = dict(fp.headers)
442
+ # Remove headers that Playwright manages internally to avoid conflicts
443
+ for managed in (
444
+ "Host",
445
+ "Content-Length",
446
+ "Transfer-Encoding",
447
+ ):
448
+ fp_headers.pop(managed, None)
449
+ await browser_context.set_extra_http_headers(fp_headers)
450
+ else:
451
+ browser_context = await self._browser.new_context(**context_opts)
415
452
 
416
453
  except Exception as e:
417
454
  span.record_exception(e)
@@ -457,23 +494,16 @@ class CDPEngine:
457
494
  # Actually valid Playwright storage_state is best.
458
495
  # If we use context.add_init_script, we can inject it.
459
496
  if "origins" in storage_state:
497
+ import json
498
+
460
499
  for origin_data in storage_state["origins"]:
461
500
  origin = origin_data["origin"]
462
- ls_data = origin_data[
463
- "localStorage"
464
- ] # list of {name, value}
465
- # Construct JS to set items
466
- js_setter = ""
467
- for item in ls_data:
468
- k = item["name"].replace('"', '\\"')
469
- v = item["value"].replace('"', '\\"')
470
- js_setter += (
471
- f'window.localStorage.setItem("{k}", "{v}");'
472
- )
473
-
501
+ ls_data = origin_data["localStorage"]
502
+ ls_json = json.dumps(ls_data)
474
503
  script = f"""
475
504
  if (window.location.origin === "{origin}") {{
476
- {js_setter}
505
+ const data = {ls_json};
506
+ data.forEach(item => window.localStorage.setItem(item.name, item.value));
477
507
  }}
478
508
  """
479
509
  await browser_context.add_init_script(script)
@@ -538,12 +568,13 @@ class CDPEngine:
538
568
  try:
539
569
  req = response.request
540
570
  if req.resource_type in ("xhr", "fetch"):
541
- # Capture body safely
542
571
  resp_body = None
543
572
  try:
544
- # Limit body size capture to avoid memory issues
545
- body_bytes = await response.body()
546
- if len(body_bytes) < 1024 * 1024: # 1MB limit
573
+ content_length = int(
574
+ response.headers.get("content-length", 0)
575
+ )
576
+ if content_length < 1024 * 1024:
577
+ body_bytes = await response.body()
547
578
  resp_body = body_bytes.decode(
548
579
  "utf-8", errors="replace"
549
580
  )
@@ -609,10 +640,8 @@ class CDPEngine:
609
640
  context.detach(token)
610
641
 
611
642
  async def handle_route_with_context(route: "Route") -> None:
612
- # Attach the captured context
613
643
  token = context.attach(current_ctx)
614
644
  try:
615
- # Handle resource blocking
616
645
  if (
617
646
  block_resources
618
647
  and route.request.resource_type in block_resources
@@ -620,7 +649,19 @@ class CDPEngine:
620
649
  await route.abort()
621
650
  return
622
651
 
652
+ if BLOCK_DOMAINS.search(route.request.url):
653
+ await route.abort()
654
+ return
655
+
623
656
  await self._handle_route(route)
657
+ except Exception as e:
658
+ logger.error(
659
+ f"[cdp] Route handler catastrophically failed: {e}"
660
+ )
661
+ try:
662
+ await route.continue_()
663
+ except Exception:
664
+ pass
624
665
  finally:
625
666
  context.detach(token)
626
667
 
@@ -634,12 +675,18 @@ class CDPEngine:
634
675
 
635
676
  # Navigate
636
677
  span.add_event("navigation.start")
678
+ await browser_context.tracing.start(
679
+ screenshots=True, snapshots=True, sources=True
680
+ )
637
681
  try:
638
682
  response = await page.goto(
639
683
  url, timeout=timeout_ms, wait_until=wait_until
640
684
  )
641
685
  except Exception as e:
642
686
  span.record_exception(e)
687
+ await browser_context.tracing.stop(
688
+ path=f"/tmp/trace_{int(time.time())}.zip"
689
+ )
643
690
  return Response(
644
691
  url=url,
645
692
  status=0,
@@ -660,9 +707,9 @@ class CDPEngine:
660
707
  except Exception as e:
661
708
  logger.warning(f"[cdp] wait_for_url failed: {e}")
662
709
  span.record_exception(e)
663
- # Option 2: Strict Waiting - Return error response
664
-
665
- # Safely capture content
710
+ await browser_context.tracing.stop(
711
+ path=f"/tmp/trace_{int(time.time())}.zip"
712
+ )
666
713
  body_content = b""
667
714
  try:
668
715
  body_content = (await page.content()).encode("utf-8")
@@ -671,7 +718,7 @@ class CDPEngine:
671
718
 
672
719
  return Response(
673
720
  url=page.url,
674
- status=0, # Or appropriate error code
721
+ status=0,
675
722
  body=body_content,
676
723
  engine="browser",
677
724
  error=f"Wait for URL failed: {wait_for_url}. Current URL: {page.url}",
@@ -686,10 +733,11 @@ class CDPEngine:
686
733
  try:
687
734
  action_results = await execute_actions(page, actions)
688
735
  except RuntimeError as e:
689
- # Catch Fail Fast exceptions
690
736
  logger.warning(f"[cdp] Action fail_on_error triggered: {e}")
691
737
  span.record_exception(e)
692
- # Safely capture content
738
+ await browser_context.tracing.stop(
739
+ path=f"/tmp/trace_{int(time.time())}.zip"
740
+ )
693
741
  body_content = b""
694
742
  try:
695
743
  body_content = (await page.content()).encode("utf-8")
@@ -703,11 +751,6 @@ class CDPEngine:
703
751
  engine="browser",
704
752
  error=f"Action Execution Failed: {e}",
705
753
  network_log=network_log,
706
- # We might have partial results if execute_actions appends before raising
707
- # But execute_actions usually returns the list.
708
- # In failure case, it raises, so we don't get the list return value.
709
- # However, we modified execute_actions to raise, so we lose the list unless we modify it to attach results to exception.
710
- # For now, simplistic error return is fine.
711
754
  action_results=[],
712
755
  )
713
756
  span.add_event("actions.end")
@@ -741,7 +784,9 @@ class CDPEngine:
741
784
  break
742
785
  except Exception as e:
743
786
  if "Unable to retrieve content" in str(e) and i < 2:
744
- logger.debug(f"[cdp] Retrying content retrieval due to navigation ({i+1}/3)")
787
+ logger.debug(
788
+ f"[cdp] Retrying content retrieval due to navigation ({i + 1}/3)"
789
+ )
745
790
  await asyncio.sleep(0.5)
746
791
  else:
747
792
  logger.warning(f"[cdp] Failed to retrieve content: {e}")
@@ -796,6 +841,15 @@ class CDPEngine:
796
841
  storage_state=current_storage_state,
797
842
  )
798
843
 
844
+ except asyncio.CancelledError:
845
+ logger.warning("Scrape cancelled by orchestrator. Shielding teardown.")
846
+ if not using_existing and browser_context:
847
+ try:
848
+ await asyncio.shield(browser_context.close())
849
+ except Exception:
850
+ pass
851
+ raise
852
+
799
853
  except Exception as e:
800
854
  logger.error(f"[cdp] Error: {e}")
801
855
  span.record_exception(e)
@@ -811,9 +865,8 @@ class CDPEngine:
811
865
  )
812
866
 
813
867
  finally:
814
- # Only close if we created new page/context (not reusing existing)
815
- if not using_existing:
816
- if page:
817
- await page.close()
818
- if browser_context:
868
+ if not using_existing and browser_context:
869
+ try:
819
870
  await browser_context.close()
871
+ except Exception:
872
+ pass
File without changes