phantomfetch 0.4.7__tar.gz → 0.4.9__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: phantomfetch
3
- Version: 0.4.7
3
+ Version: 0.4.9
4
4
  Summary: High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities
5
5
  Keywords: web-scraping,playwright,curl-cffi,async,browser-automation,http-client,agentic,anti-detection
6
6
  Author: CosmicBull
@@ -25,12 +25,20 @@ Requires-Dist: opentelemetry-api>=1.38.0
25
25
  Requires-Dist: opentelemetry-sdk>=1.38.0
26
26
  Requires-Dist: loguru>=0.7.3
27
27
  Requires-Dist: beautifulsoup4>=4.14.3
28
+ Requires-Dist: cloakbrowser>=0.3.0 ; extra == 'all'
29
+ Requires-Dist: maxminddb>=2.0.0 ; extra == 'all'
30
+ Requires-Dist: cloakbrowser>=0.3.0 ; extra == 'cloakbrowser'
31
+ Requires-Dist: cloakbrowser[geoip]>=0.3.0 ; extra == 'geoip'
32
+ Requires-Dist: maxminddb>=2.0.0 ; extra == 'geoip'
28
33
  Requires-Python: >=3.13
29
34
  Project-URL: Homepage, https://github.com/iristech-systems/PhantomFetch
30
35
  Project-URL: Documentation, https://github.com/iristech-systems/PhantomFetch#readme
31
36
  Project-URL: Repository, https://github.com/iristech-systems/PhantomFetch
32
37
  Project-URL: Issues, https://github.com/iristech-systems/PhantomFetch/issues
33
38
  Project-URL: Changelog, https://github.com/iristech-systems/PhantomFetch/blob/main/CHANGELOG.md
39
+ Provides-Extra: all
40
+ Provides-Extra: cloakbrowser
41
+ Provides-Extra: geoip
34
42
  Description-Content-Type: text/markdown
35
43
 
36
44
  # PhantomFetch
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "phantomfetch"
3
- version = "0.4.7"
3
+ version = "0.4.9"
4
4
  description = "High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.13"
@@ -42,6 +42,11 @@ dependencies = [
42
42
  "beautifulsoup4>=4.14.3",
43
43
  ]
44
44
 
45
+ [project.optional-dependencies]
46
+ cloakbrowser = ["cloakbrowser>=0.3.0"]
47
+ geoip = ["cloakbrowser[geoip]>=0.3.0", "maxminddb>=2.0.0"]
48
+ all = ["cloakbrowser>=0.3.0", "maxminddb>=2.0.0"]
49
+
45
50
  [project.urls]
46
51
  Homepage = "https://github.com/iristech-systems/PhantomFetch"
47
52
  Documentation = "https://github.com/iristech-systems/PhantomFetch#readme"
@@ -1,11 +1,12 @@
1
1
  import asyncio
2
2
  import re
3
3
  import time
4
- from typing import TYPE_CHECKING, Any, Optional
4
+ from typing import TYPE_CHECKING, Any, Literal, Optional
5
5
  from urllib.parse import urlparse
6
6
 
7
7
  from browserforge.fingerprints import FingerprintGenerator, Screen
8
8
  from browserforge.injectors.playwright import AsyncNewContext
9
+ from browserforge.injectors.utils import InjectFunction, only_injectable_headers
9
10
  from loguru import logger
10
11
  from opentelemetry import context
11
12
  from undetected_playwright import stealth_async
@@ -73,6 +74,13 @@ class CDPEngine:
73
74
  launch_args: list[str] | None = None,
74
75
  browser_type: str = "chromium",
75
76
  cdp_connection_type: str = "cdp",
77
+ backend: Literal["rebrowser", "playwright", "patchright"] = "rebrowser",
78
+ cloak_browser: bool = False,
79
+ cloak_browser_humanize: bool = False,
80
+ cloak_browser_human_preset: str = "default",
81
+ cloak_browser_geoip: bool = False,
82
+ cloak_binary_path: str | None = None,
83
+ persistent_context_dir: str | None = None,
76
84
  ):
77
85
  """
78
86
  Args:
@@ -87,6 +95,7 @@ class CDPEngine:
87
95
  fingerprint: Enable BrowserForge fingerprint injection (default: True).
88
96
  Injects a realistic Canvas, WebGL, navigator, and screen
89
97
  fingerprint into every new browser context.
98
+ Automatically disabled when cloak_browser=True (binary handles it).
90
99
  fingerprint_options: Optional constraints forwarded to FingerprintGenerator
91
100
  (e.g. {"browser": "chrome", "os": "windows"}).
92
101
  launch_args: Additional Chromium launch flags for local browser launches
@@ -98,6 +107,23 @@ class CDPEngine:
98
107
  (connect_over_cdp) or "playwright" for Playwright wire protocol (connect).
99
108
  Defaults to "cdp". Use "playwright" for services like
100
109
  playwright run-server or Scrapeless recording.
110
+ backend: Which Playwright variant to use: "rebrowser" (rebrowser_playwright, default),
111
+ "playwright" (stock playwright), or "patchright" (patchright).
112
+ CloakBrowser integration requires "playwright" or "patchright".
113
+ cloak_browser: Use CloakBrowser's stealth Chromium binary instead of stock browser.
114
+ When True, fingerprint and stealth params are ignored (binary handles them).
115
+ Requires: pip install phantomfetch[cloakbrowser]
116
+ cloak_browser_humanize: Enable human-like mouse, keyboard, and scroll behavior
117
+ when using CloakBrowser. Adds Bezier curve mouse movement,
118
+ realistic typing delays, and scroll micro-steps.
119
+ cloak_browser_human_preset: Humanize preset — "default" (normal speed) or "careful" (slower).
120
+ cloak_browser_geoip: Auto-detect timezone and locale from proxy IP using MaxMind GeoIP.
121
+ Requires: pip install phantomfetch[cloakbrowser,geoip]
122
+ cloak_binary_path: Path to CloakBrowser binary. If None, uses CloakBrowser's
123
+ ensure_binary() to download/find it.
124
+ persistent_context_dir: Directory for persistent browser profile (cookies/localStorage
125
+ persist across sessions). When set, uses CloakBrowser's
126
+ persistent context API. Enables incognito bypass.
101
127
  """
102
128
  self.cdp_endpoint = cdp_endpoint
103
129
  self.headless = headless
@@ -111,6 +137,17 @@ class CDPEngine:
111
137
  self.launch_args = launch_args or []
112
138
  self.browser_type = browser_type
113
139
  self.cdp_connection_type = cdp_connection_type
140
+ self.backend = backend
141
+ self.cloak_browser = cloak_browser
142
+ self.cloak_browser_humanize = cloak_browser_humanize
143
+ self.cloak_browser_human_preset = cloak_browser_human_preset
144
+ self.cloak_browser_geoip = cloak_browser_geoip
145
+ self.cloak_binary_path = cloak_binary_path
146
+ self.persistent_context_dir = persistent_context_dir
147
+
148
+ self._cloak_browser_available = False
149
+ self._cloak_context: Any = None
150
+ self._cloak_human_config: Any = None
114
151
 
115
152
  # Map playwright browser type to BrowserForge constraint
116
153
  if "browser" not in self.fingerprint_options and self.browser_type in (
@@ -126,7 +163,8 @@ class CDPEngine:
126
163
  self.fingerprint_options["device"] = "desktop"
127
164
 
128
165
  # BrowserForge fingerprint generator — instantiated once, generate() per request
129
- if fingerprint:
166
+ # Disabled when using CloakBrowser (binary handles fingerprints natively)
167
+ if fingerprint and not cloak_browser:
130
168
  screen = Screen(
131
169
  min_width=1280, max_width=1920, min_height=720, max_height=1080
132
170
  )
@@ -144,14 +182,83 @@ class CDPEngine:
144
182
 
145
183
  async def connect(self) -> None:
146
184
  """Initialize Playwright and connect to browser."""
185
+ if self.cloak_browser:
186
+ await self._connect_cloakbrowser()
187
+ else:
188
+ await self._connect_playwright()
189
+
190
+ async def _connect_cloakbrowser(self) -> None:
191
+ """Connect using CloakBrowser's stealth Chromium binary.
192
+
193
+ Uses cloakbrowser.launch_async() + browser.new_context() for non-persistent,
194
+ or launch_persistent_context_async() for persistent. These handle binary
195
+ management, stealth args, geoip, and humanize patches internally.
196
+ """
197
+ try:
198
+ from cloakbrowser import launch_async, launch_persistent_context_async
199
+ except ImportError:
200
+ raise ImportError(
201
+ "cloakbrowser is not installed. "
202
+ "Install it with: pip install phantomfetch[cloakbrowser]"
203
+ )
204
+
205
+ logger.info(
206
+ f"[cdp] Connecting via CloakBrowser "
207
+ f"(humanize={self.cloak_browser_humanize}, "
208
+ f"persistent={bool(self.persistent_context_dir)})"
209
+ )
210
+
211
+ if self.persistent_context_dir:
212
+ logger.debug(
213
+ f"[cdp] CloakBrowser persistent context: {self.persistent_context_dir}"
214
+ )
215
+ launch_kwargs: dict[str, Any] = {
216
+ "headless": self.headless,
217
+ "user_agent": self.user_agent,
218
+ "viewport": self.viewport,
219
+ "stealth_args": False,
220
+ "humanize": self.cloak_browser_humanize,
221
+ "human_preset": self.cloak_browser_human_preset,
222
+ "backend": self.backend,
223
+ "geoip": self.cloak_browser_geoip,
224
+ }
225
+ self._cloak_context = await launch_persistent_context_async(
226
+ self.persistent_context_dir,
227
+ **launch_kwargs,
228
+ )
229
+ self._browser: Any = None
230
+ else:
231
+ logger.debug(
232
+ "[cdp] CloakBrowser non-persistent — browser created per fetch"
233
+ )
234
+ self._browser = await launch_async(
235
+ headless=self.headless,
236
+ stealth_args=False,
237
+ humanize=self.cloak_browser_humanize,
238
+ human_preset=self.cloak_browser_human_preset,
239
+ backend=self.backend,
240
+ geoip=self.cloak_browser_geoip,
241
+ )
242
+ self._cloak_context = None
147
243
 
148
- from playwright.async_api import async_playwright
244
+ self._cloak_browser_available = True
245
+
246
+ async def _connect_playwright(self) -> None:
247
+ """Connect using standard Playwright (rebrowser, stock, or patchright)."""
248
+ backend = self.backend
249
+ if backend == "rebrowser":
250
+ from rebrowser_playwright.async_api import async_playwright
251
+ elif backend == "patchright":
252
+ from patchright.async_api import async_playwright
253
+ else:
254
+ from playwright.async_api import async_playwright
149
255
 
150
256
  self._playwright = await async_playwright().start()
151
257
  try:
152
258
  if self.cdp_endpoint:
153
259
  logger.info(
154
- f"[cdp] Connecting to: {self.cdp_endpoint} (type={self.cdp_connection_type})"
260
+ f"[cdp] Connecting to: {self.cdp_endpoint} "
261
+ f"(type={self.cdp_connection_type}, backend={backend})"
155
262
  )
156
263
  browser_obj = getattr(self._playwright, self.browser_type)
157
264
 
@@ -171,7 +278,8 @@ class CDPEngine:
171
278
  )
172
279
  else:
173
280
  logger.info(
174
- f"[cdp] Launching local browser (headless={self.headless}, type={self.browser_type})"
281
+ f"[cdp] Launching local browser "
282
+ f"(headless={self.headless}, type={self.browser_type}, backend={backend})"
175
283
  )
176
284
  base_args = []
177
285
  if self.browser_type == "chromium":
@@ -191,26 +299,15 @@ class CDPEngine:
191
299
  await self._playwright.stop()
192
300
  self._playwright = None
193
301
  raise e
194
- base_args = []
195
- if self.browser_type == "chromium":
196
- # --no-sandbox is required on Linux environments that don't have
197
- # user namespace support (VMs, containers, CI). Without it,
198
- # Chromium hangs silently at launch.
199
- base_args = [
200
- "--disable-blink-features=AutomationControlled",
201
- "--no-sandbox",
202
- "--disable-setuid-sandbox",
203
- ]
204
- extra = [a for a in self.launch_args if a not in base_args]
205
-
206
- browser_obj = getattr(self._playwright, self.browser_type)
207
- self._browser = await browser_obj.launch(
208
- headless=self.headless,
209
- args=base_args + extra,
210
- )
211
302
 
212
303
  async def disconnect(self) -> None:
213
304
  """Close browser and Playwright."""
305
+ if self._cloak_context:
306
+ try:
307
+ await self._cloak_context.close()
308
+ except Exception:
309
+ pass
310
+ self._cloak_context = None
214
311
  if self._browser:
215
312
  await self._browser.close()
216
313
  self._browser = None
@@ -384,7 +481,7 @@ class CDPEngine:
384
481
  # unless we specifically want to share state (TODO: session support)
385
482
  # Update: storage_state passed in allows checking/setting state
386
483
 
387
- context_opts = {}
484
+ context_opts: dict[str, Any] = {}
388
485
  if self.user_agent:
389
486
  context_opts["user_agent"] = self.user_agent
390
487
  if self.viewport:
@@ -397,7 +494,7 @@ class CDPEngine:
397
494
  if parsed.port
398
495
  else f"{parsed.scheme}://{parsed.hostname}"
399
496
  )
400
- proxy_dict = {"server": proxy_server}
497
+ proxy_dict: dict[str, Any] = {"server": proxy_server}
401
498
  if parsed.username:
402
499
  proxy_dict["username"] = parsed.username
403
500
  if parsed.password:
@@ -408,59 +505,105 @@ class CDPEngine:
408
505
  # If we have basic cookies to set via context creation (simpler than add_cookies sometimes)
409
506
  # But better to use add_cookies for consistency with 'cookies' arg
410
507
 
411
- browser_context = None
412
- page = None
508
+ browser_context: Any = None
509
+ page: Any = None
413
510
  using_existing = False
511
+ using_persistent = False
414
512
 
415
513
  try:
416
- # Use existing page/context if available (for recording compatibility)
417
514
  if self._existing_page and self._existing_context:
418
515
  browser_context = self._existing_context
419
516
  page = self._existing_page
420
517
  using_existing = True
421
518
  logger.debug(f"[cdp] Reusing existing page for {url}")
422
- # Create new context with BrowserForge fingerprint injection (if enabled)
519
+
520
+ # CloakBrowser — binary-level stealth, no BrowserForge injection.
521
+ # CloakBrowser handles canvas/WebGL/GPU/UA/automation signals natively.
522
+ # Uses cloakbrowser.launch_async() + browser.new_context() for non-persistent,
523
+ # or launch_persistent_context_async() for persistent.
524
+ elif self._cloak_browser_available:
525
+ proxy_arg: dict[str, Any] | None = proxy_dict if proxy else None
526
+
527
+ if self.persistent_context_dir:
528
+ using_persistent = True
529
+ if self._cloak_context is None:
530
+ from cloakbrowser import launch_persistent_context_async
531
+
532
+ launch_kwargs: dict[str, Any] = {
533
+ "headless": self.headless,
534
+ "user_agent": self.user_agent,
535
+ "viewport": self.viewport,
536
+ "stealth_args": False,
537
+ "humanize": self.cloak_browser_humanize,
538
+ "human_preset": self.cloak_browser_human_preset,
539
+ "backend": self.backend,
540
+ "proxy": proxy_arg,
541
+ "geoip": self.cloak_browser_geoip,
542
+ }
543
+ self._cloak_context = await launch_persistent_context_async(
544
+ self.persistent_context_dir,
545
+ **launch_kwargs,
546
+ )
547
+ browser_context = self._cloak_context
548
+ page = await browser_context.new_page()
549
+ else:
550
+ browser_context = await self._browser.new_context(
551
+ user_agent=self.user_agent,
552
+ viewport=self.viewport,
553
+ proxy=proxy_arg,
554
+ )
555
+ page = await browser_context.new_page()
556
+
557
+ # BrowserForge fingerprint injection (rebrowser/stock playwright)
423
558
  elif self._fp_gen is not None:
424
- # Generate a fingerprint with any per-instance constraints
425
559
  fp = self._fp_gen.generate(**self.fingerprint_options)
426
560
 
427
- # Sync UA and viewport FROM the fingerprint for full consistency.
428
- # Explicit user_agent/viewport overrides take precedence.
429
- if not self.user_agent:
430
- context_opts["user_agent"] = fp.navigator.userAgent
431
561
  if not self.viewport:
432
562
  context_opts["viewport"] = {
433
563
  "width": fp.screen.width,
434
564
  "height": fp.screen.height,
435
565
  }
436
566
 
437
- logger.debug(
438
- f"[cdp] BrowserForge fingerprint: UA={fp.navigator.userAgent[:60]}..."
439
- )
440
-
441
- # AsyncNewContext injects all fingerprint JS overrides
442
- # (Canvas, WebGL, navigator, AudioContext, fonts, etc.)
443
- browser_context = await AsyncNewContext(
444
- self._browser,
445
- fingerprint=fp,
446
- **context_opts,
567
+ user_provided_ua = self.user_agent or (
568
+ headers.get("user-agent") if headers else None
447
569
  )
448
570
 
449
- # Apply matching headers so HTTP-layer headers are consistent
450
- # with the JS navigator (UA, sec-ch-ua*, Accept-Language, etc.)
451
- if fp.headers:
452
- # Merge: fingerprint headers are base; caller's explicit
453
- # headers=... arg will be applied later via set_extra_http_headers.
454
- # Here we only set the fingerprint-derived defaults.
455
- fp_headers = dict(fp.headers)
456
- # Remove headers that Playwright manages internally to avoid conflicts
457
- for managed in (
458
- "Host",
459
- "Content-Length",
460
- "Transfer-Encoding",
461
- ):
462
- fp_headers.pop(managed, None)
463
- await browser_context.set_extra_http_headers(fp_headers)
571
+ if user_provided_ua:
572
+ context_opts["user_agent"] = user_provided_ua
573
+ logger.debug(
574
+ f"[cdp] BrowserForge fingerprint (custom UA): "
575
+ f"fp_UA={fp.navigator.userAgent[:40]}... "
576
+ f"user_UA={user_provided_ua[:40]}..."
577
+ )
578
+ browser_context = await self._browser.new_context(
579
+ **context_opts,
580
+ )
581
+ await browser_context.set_extra_http_headers(
582
+ only_injectable_headers(
583
+ fp.headers, self._browser.browser_type.name
584
+ )
585
+ )
586
+ await browser_context.add_init_script(InjectFunction(fp))
587
+ else:
588
+ context_opts["user_agent"] = fp.navigator.userAgent
589
+ logger.debug(
590
+ f"[cdp] BrowserForge fingerprint: "
591
+ f"UA={fp.navigator.userAgent[:60]}..."
592
+ )
593
+ browser_context = await AsyncNewContext(
594
+ self._browser,
595
+ fingerprint=fp,
596
+ **context_opts,
597
+ )
598
+ if fp.headers:
599
+ fp_headers = dict(fp.headers)
600
+ for managed in (
601
+ "Host",
602
+ "Content-Length",
603
+ "Transfer-Encoding",
604
+ ):
605
+ fp_headers.pop(managed, None)
606
+ await browser_context.set_extra_http_headers(fp_headers)
464
607
  else:
465
608
  browser_context = await self._browser.new_context(**context_opts)
466
609
 
@@ -479,7 +622,8 @@ class CDPEngine:
479
622
  # Apply stealth (works for both new and existing contexts)
480
623
  # Note: For remote browsers, launch args like --disable-blink-features=AutomationControlled
481
624
  # must be set by the *server* at launch time. We cannot retroactively apply them here.
482
- if stealth and browser_context:
625
+ # CloakBrowser: skip stealth_async — binary handles fingerprinting at C++ level.
626
+ if stealth and browser_context and not self._cloak_browser_available:
483
627
  logger.debug("[cdp] Applying stealth_async")
484
628
  await stealth_async(browser_context)
485
629
 
@@ -558,8 +702,8 @@ class CDPEngine:
558
702
  if headers:
559
703
  await browser_context.set_extra_http_headers(headers)
560
704
 
561
- # Create page if not reusing existing
562
- if not using_existing:
705
+ # Create page if not reusing existing or persistent
706
+ if not using_existing and not using_persistent:
563
707
  # If storing state, we should have done it on context.
564
708
  page = await browser_context.new_page()
565
709
 
@@ -857,7 +1001,7 @@ class CDPEngine:
857
1001
 
858
1002
  except asyncio.CancelledError:
859
1003
  logger.warning("Scrape cancelled by orchestrator. Shielding teardown.")
860
- if not using_existing and browser_context:
1004
+ if not using_existing and not using_persistent and browser_context:
861
1005
  try:
862
1006
  await asyncio.shield(browser_context.close())
863
1007
  except Exception:
@@ -879,7 +1023,7 @@ class CDPEngine:
879
1023
  )
880
1024
 
881
1025
  finally:
882
- if not using_existing and browser_context:
1026
+ if not using_existing and not using_persistent and browser_context:
883
1027
  try:
884
1028
  await browser_context.close()
885
1029
  except Exception:
@@ -1,11 +1,11 @@
1
1
  import asyncio
2
2
  import json
3
3
  import os
4
- from typing import Any, Literal, cast
4
+ from typing import Any, Literal
5
5
 
6
6
  from loguru import logger
7
7
 
8
- from .cache import Cache, FileSystemCache
8
+ from .cache import Cache
9
9
  from .engines import CDPEngine, CurlEngine
10
10
  from .pool import ProxyPool
11
11
  from .telemetry import get_tracer
@@ -46,10 +46,19 @@ class Fetcher:
46
46
  cache: Cache | bool | None = None,
47
47
  # Advanced CDP
48
48
  cdp_use_existing_page: bool = True,
49
+ cdp_connection_type: str = "cdp",
50
+ backend: Literal["rebrowser", "playwright", "patchright"] = "rebrowser",
49
51
  # BrowserForge fingerprinting
50
52
  fingerprint: bool = True,
51
53
  fingerprint_options: dict[str, Any] | None = None,
52
54
  browser_type: str = "chromium",
55
+ # CloakBrowser
56
+ cloak_browser: bool = False,
57
+ cloak_browser_humanize: bool = False,
58
+ cloak_browser_human_preset: str = "default",
59
+ cloak_browser_geoip: bool = False,
60
+ cloak_binary_path: str | None = None,
61
+ persistent_context_dir: str | None = None,
53
62
  ):
54
63
  """
55
64
  Initialize the Fetcher.
@@ -68,12 +77,24 @@ class Fetcher:
68
77
  max_concurrent_browser: Max concurrent browser requests
69
78
  cache: Cache implementation (e.g. FileSystemCache)
70
79
  cdp_use_existing_page: Reuse existing page in remote CDP (default: True)
80
+ backend: Which Playwright variant to use: "rebrowser", "playwright", or "patchright".
81
+ CloakBrowser integration requires "playwright" or "patchright".
71
82
  fingerprint: Enable BrowserForge fingerprint injection in the browser engine
72
83
  (default: True). Injects realistic Canvas, WebGL, navigator
73
84
  and screen fingerprints into every new Playwright context.
74
85
  fingerprint_options: Constraints for BrowserForge FingerprintGenerator
75
86
  (e.g. {"browser": "chrome", "os": "windows"}).
76
87
  browser_type: Which Playwright browser engine to use: "chromium", "firefox", or "webkit"
88
+ cloak_browser: Use CloakBrowser's stealth Chromium binary instead of stock browser.
89
+ When True, fingerprint is disabled (binary handles it).
90
+ Requires: pip install phantomfetch[cloakbrowser]
91
+ cloak_browser_humanize: Enable human-like mouse, keyboard, and scroll behavior
92
+ when using CloakBrowser.
93
+ cloak_browser_human_preset: Humanize preset — "default" (normal) or "careful" (slower).
94
+ cloak_browser_geoip: Auto-detect timezone/locale from proxy IP using MaxMind GeoIP.
95
+ cloak_binary_path: Path to CloakBrowser binary. If None, downloads automatically.
96
+ persistent_context_dir: Directory for persistent browser profile (cookies/localStorage
97
+ persist across sessions).
77
98
  """
78
99
  # Cache
79
100
  self.cache: Cache | None = None
@@ -97,9 +118,11 @@ class Fetcher:
97
118
  timeout=timeout,
98
119
  max_retries=max_retries,
99
120
  )
100
-
121
+
101
122
  if browser_engine == "baas":
102
- raise ValueError("BaaSEngine has been removed. Please use 'cdp' engine or contact support.")
123
+ raise ValueError(
124
+ "BaaSEngine has been removed. Please use 'cdp' engine or contact support."
125
+ )
103
126
 
104
127
  # Browser engine
105
128
  # Always use CDPEngine now
@@ -109,9 +132,17 @@ class Fetcher:
109
132
  timeout=browser_timeout,
110
133
  cache=self.cache,
111
134
  use_existing_page=cdp_use_existing_page,
135
+ cdp_connection_type=cdp_connection_type,
136
+ backend=backend,
112
137
  fingerprint=fingerprint,
113
138
  fingerprint_options=fingerprint_options,
114
139
  browser_type=browser_type,
140
+ cloak_browser=cloak_browser,
141
+ cloak_browser_humanize=cloak_browser_humanize,
142
+ cloak_browser_human_preset=cloak_browser_human_preset,
143
+ cloak_browser_geoip=cloak_browser_geoip,
144
+ cloak_binary_path=cloak_binary_path,
145
+ persistent_context_dir=persistent_context_dir,
115
146
  )
116
147
  self._browser = self._cdp_engine
117
148
 
@@ -147,7 +178,7 @@ class Fetcher:
147
178
  Stop the browser engine.
148
179
  """
149
180
  if self._browser:
150
- await self._browser.stop()
181
+ await self._browser.disconnect()
151
182
 
152
183
  def save_session(self, path: str) -> None:
153
184
  """
@@ -268,7 +299,7 @@ class Fetcher:
268
299
  proxy: Specific proxy to use (overrides pool)
269
300
  location: Geo location for proxy selection
270
301
  actions: List of `Action` objects or dicts (implies engine="browser")
271
-
302
+
272
303
  ...
273
304
  """
274
305
  # Normalize actions - implies browser
@@ -319,21 +350,21 @@ class Fetcher:
319
350
  cache_key_parts = [engine, url]
320
351
  if location:
321
352
  cache_key_parts.append(f"loc={location}")
322
-
353
+
323
354
  # Use proxy URL for cache key if explicit proxy is used
324
355
  # Note: We rely on the proxy *argument* (manual override) for this separation.
325
- # If using pool, we typically don't split cache by specific pool proxy,
356
+ # If using pool, we typically don't split cache by specific pool proxy,
326
357
  # UNLESS location was requested (handled above).
327
- if proxy:
358
+ if proxy:
328
359
  # If manual proxy string/object provided, include it
329
360
  p_url = proxy if isinstance(proxy, str) else proxy.url
330
- # Sanitize sensitive info? user:pass might be sensitive,
331
- # but for cache key uniqueness it's needed.
361
+ # Sanitize sensitive info? user:pass might be sensitive,
362
+ # but for cache key uniqueness it's needed.
332
363
  # Since MD5 is used downstream, it's somewhat obscured.
333
364
  cache_key_parts.append(f"proxy={p_url}")
334
365
 
335
366
  cache_key = ":".join(cache_key_parts)
336
-
367
+
337
368
  cached_resp = await self.cache.get(cache_key)
338
369
  if cached_resp:
339
370
  cached_resp.from_cache = True
@@ -347,10 +378,12 @@ class Fetcher:
347
378
  selected_proxy: Proxy | None = None
348
379
  if proxy:
349
380
  if isinstance(proxy, str):
350
- selected_proxy = Proxy(url=proxy, metadata={"source": "manual_override"})
381
+ selected_proxy = Proxy(
382
+ url=proxy, metadata={"source": "manual_override"}
383
+ )
351
384
  else:
352
385
  selected_proxy = proxy
353
-
386
+
354
387
  # 2. From Pool (if no override)
355
388
  if not selected_proxy:
356
389
  selected_proxy = self.proxy_pool.get(url=url, location=location)
@@ -358,20 +391,27 @@ class Fetcher:
358
391
  if selected_proxy:
359
392
  span.set_attribute("phantomfetch.proxy", selected_proxy.url)
360
393
  if selected_proxy.vendor:
361
- span.set_attribute("phantomfetch.proxy.vendor", selected_proxy.vendor)
394
+ span.set_attribute(
395
+ "phantomfetch.proxy.vendor", selected_proxy.vendor
396
+ )
362
397
  if selected_proxy.proxy_type:
363
- span.set_attribute("phantomfetch.proxy.type", selected_proxy.proxy_type)
398
+ span.set_attribute(
399
+ "phantomfetch.proxy.type", selected_proxy.proxy_type
400
+ )
364
401
  if selected_proxy.location:
365
- span.set_attribute("phantomfetch.proxy.location", selected_proxy.location)
402
+ span.set_attribute(
403
+ "phantomfetch.proxy.location", selected_proxy.location
404
+ )
366
405
  if selected_proxy.provider:
367
- span.set_attribute("phantomfetch.proxy.provider", selected_proxy.provider)
406
+ span.set_attribute(
407
+ "phantomfetch.proxy.provider", selected_proxy.provider
408
+ )
368
409
 
369
410
  # Route to engine
370
411
  if engine == "browser":
371
412
  resp = await self._fetch_browser(
372
413
  url=url,
373
414
  proxy=selected_proxy,
374
-
375
415
  headers=headers,
376
416
  cookies=cookies,
377
417
  actions=normalized_actions,
@@ -406,7 +446,7 @@ class Fetcher:
406
446
  # For simplicity, if we have a pool, and this proxy matches one in the pool, we could update it.
407
447
  # But the 'selected_proxy' might be a new instance specific to this request (manual override).
408
448
  # The pool.mark_* methods take a Proxy object.
409
-
449
+
410
450
  # Logic: If 'proxy' argument was None, it came from pool -> Update stats.
411
451
  # If 'proxy' argument was set -> Do NOT update pool stats (it's a manual override).
412
452
  if not proxy and selected_proxy:
@@ -421,10 +461,10 @@ class Fetcher:
421
461
  cache_key_parts = [engine, url]
422
462
  if location:
423
463
  cache_key_parts.append(f"loc={location}")
424
- if proxy:
464
+ if proxy:
425
465
  p_url = proxy if isinstance(proxy, str) else proxy.url
426
466
  cache_key_parts.append(f"proxy={p_url}")
427
-
467
+
428
468
  cache_key = ":".join(cache_key_parts)
429
469
  await self.cache.set(cache_key, resp)
430
470
 
@@ -451,26 +491,26 @@ class Fetcher:
451
491
  async with self._browser_semaphore:
452
492
  # Direct CDP usage (BaaSEngine removed)
453
493
  if not self._cdp_engine:
454
- # Lazy init if not done (though currently init in __init__)
455
- # But wait, logic in __init__ was:
456
- # self._cdp_engine = CDPEngine(...) if browser_engine == "cdp"
457
- # Since we removed engine selection, we should ensure it's initialized.
458
- # In __init__ I see: self._cdp_engine: CDPEngine | None = None
459
- # And the constructor logic for it was tied to the "if browser_engine == 'cdp'" block which I might have messed up or need to fix.
460
- # Let's check __init__ again.
461
- pass
462
-
463
- # Actually, looking at the previous file content, I removed the 'if browser_engine ==' logic in __init__?
494
+ # Lazy init if not done (though currently init in __init__)
495
+ # But wait, logic in __init__ was:
496
+ # self._cdp_engine = CDPEngine(...) if browser_engine == "cdp"
497
+ # Since we removed engine selection, we should ensure it's initialized.
498
+ # In __init__ I see: self._cdp_engine: CDPEngine | None = None
499
+ # And the constructor logic for it was tied to the "if browser_engine == 'cdp'" block which I might have messed up or need to fix.
500
+ # Let's check __init__ again.
501
+ pass
502
+
503
+ # Actually, looking at the previous file content, I removed the 'if browser_engine ==' logic in __init__?
464
504
  # No, I only updated the arguments. I need to make sure __init__ initializes _cdp_engine unconditionally.
465
505
  # Let's assume I fix __init__ in a separate recursive step if needed, or I can fix it here if I see it.
466
- # In the previous `view_file` output (Step 356), lines 87-101 show the old init logic.
467
- # I need to clean that up too!
468
-
506
+ # In the previous `view_file` output (Step 356), lines 87-101 show the old init logic.
507
+ # I need to clean that up too!
508
+
469
509
  # Let's just assume _browser attribute is now _cdp_engine.
470
510
  # Wait, the __init__ logic was:
471
511
  # self._browser = CDPEngine(...)
472
512
  # So here I should just use self._browser (which is typed as CDPEngine in the new world).
473
-
513
+
474
514
  # Correct implementation for _fetch_browser:
475
515
  return await self._cdp_engine.fetch(
476
516
  url=url,
@@ -63,9 +63,6 @@ class Proxy(msgspec.Struct):
63
63
  metadata: dict[str, Any] = {}
64
64
 
65
65
 
66
-
67
-
68
-
69
66
  class Action(msgspec.Struct):
70
67
  """
71
68
  Browser interaction definition.
@@ -99,8 +96,6 @@ class Action(msgspec.Struct):
99
96
  state: str | None = None
100
97
  x: int | None = None
101
98
  y: int | None = None
102
- x: int | None = None
103
- y: int | None = None
104
99
  schema: dict[str, Any] | None = None
105
100
  actions: list["Action"] | None = None
106
101
  then_actions: list["Action"] | None = None
File without changes