phantomfetch 0.4.7__tar.gz → 0.4.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/PKG-INFO +9 -1
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/pyproject.toml +6 -1
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/src/phantomfetch/engines/browser/cdp.py +207 -63
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/src/phantomfetch/fetch.py +76 -36
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/src/phantomfetch/types.py +0 -5
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/README.md +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/src/phantomfetch/__init__.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/src/phantomfetch/cache.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/src/phantomfetch/captcha.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/src/phantomfetch/engines/__init__.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/src/phantomfetch/engines/base.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/src/phantomfetch/engines/browser/__init__.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/src/phantomfetch/engines/browser/actions.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/src/phantomfetch/engines/curl.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/src/phantomfetch/pool.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/src/phantomfetch/telemetry.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.9}/src/phantomfetch/tools/selector_builder.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: phantomfetch
|
|
3
|
-
Version: 0.4.7
|
|
3
|
+
Version: 0.4.9
|
|
4
4
|
Summary: High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities
|
|
5
5
|
Keywords: web-scraping,playwright,curl-cffi,async,browser-automation,http-client,agentic,anti-detection
|
|
6
6
|
Author: CosmicBull
|
|
@@ -25,12 +25,20 @@ Requires-Dist: opentelemetry-api>=1.38.0
|
|
|
25
25
|
Requires-Dist: opentelemetry-sdk>=1.38.0
|
|
26
26
|
Requires-Dist: loguru>=0.7.3
|
|
27
27
|
Requires-Dist: beautifulsoup4>=4.14.3
|
|
28
|
+
Requires-Dist: cloakbrowser>=0.3.0 ; extra == 'all'
|
|
29
|
+
Requires-Dist: maxminddb>=2.0.0 ; extra == 'all'
|
|
30
|
+
Requires-Dist: cloakbrowser>=0.3.0 ; extra == 'cloakbrowser'
|
|
31
|
+
Requires-Dist: cloakbrowser[geoip]>=0.3.0 ; extra == 'geoip'
|
|
32
|
+
Requires-Dist: maxminddb>=2.0.0 ; extra == 'geoip'
|
|
28
33
|
Requires-Python: >=3.13
|
|
29
34
|
Project-URL: Homepage, https://github.com/iristech-systems/PhantomFetch
|
|
30
35
|
Project-URL: Documentation, https://github.com/iristech-systems/PhantomFetch#readme
|
|
31
36
|
Project-URL: Repository, https://github.com/iristech-systems/PhantomFetch
|
|
32
37
|
Project-URL: Issues, https://github.com/iristech-systems/PhantomFetch/issues
|
|
33
38
|
Project-URL: Changelog, https://github.com/iristech-systems/PhantomFetch/blob/main/CHANGELOG.md
|
|
39
|
+
Provides-Extra: all
|
|
40
|
+
Provides-Extra: cloakbrowser
|
|
41
|
+
Provides-Extra: geoip
|
|
34
42
|
Description-Content-Type: text/markdown
|
|
35
43
|
|
|
36
44
|
# PhantomFetch
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "phantomfetch"
|
|
3
|
-
version = "0.4.7"
|
|
3
|
+
version = "0.4.9"
|
|
4
4
|
description = "High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.13"
|
|
@@ -42,6 +42,11 @@ dependencies = [
|
|
|
42
42
|
"beautifulsoup4>=4.14.3",
|
|
43
43
|
]
|
|
44
44
|
|
|
45
|
+
[project.optional-dependencies]
|
|
46
|
+
cloakbrowser = ["cloakbrowser>=0.3.0"]
|
|
47
|
+
geoip = ["cloakbrowser[geoip]>=0.3.0", "maxminddb>=2.0.0"]
|
|
48
|
+
all = ["cloakbrowser>=0.3.0", "maxminddb>=2.0.0"]
|
|
49
|
+
|
|
45
50
|
[project.urls]
|
|
46
51
|
Homepage = "https://github.com/iristech-systems/PhantomFetch"
|
|
47
52
|
Documentation = "https://github.com/iristech-systems/PhantomFetch#readme"
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import re
|
|
3
3
|
import time
|
|
4
|
-
from typing import TYPE_CHECKING, Any, Optional
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Literal, Optional
|
|
5
5
|
from urllib.parse import urlparse
|
|
6
6
|
|
|
7
7
|
from browserforge.fingerprints import FingerprintGenerator, Screen
|
|
8
8
|
from browserforge.injectors.playwright import AsyncNewContext
|
|
9
|
+
from browserforge.injectors.utils import InjectFunction, only_injectable_headers
|
|
9
10
|
from loguru import logger
|
|
10
11
|
from opentelemetry import context
|
|
11
12
|
from undetected_playwright import stealth_async
|
|
@@ -73,6 +74,13 @@ class CDPEngine:
|
|
|
73
74
|
launch_args: list[str] | None = None,
|
|
74
75
|
browser_type: str = "chromium",
|
|
75
76
|
cdp_connection_type: str = "cdp",
|
|
77
|
+
backend: Literal["rebrowser", "playwright", "patchright"] = "rebrowser",
|
|
78
|
+
cloak_browser: bool = False,
|
|
79
|
+
cloak_browser_humanize: bool = False,
|
|
80
|
+
cloak_browser_human_preset: str = "default",
|
|
81
|
+
cloak_browser_geoip: bool = False,
|
|
82
|
+
cloak_binary_path: str | None = None,
|
|
83
|
+
persistent_context_dir: str | None = None,
|
|
76
84
|
):
|
|
77
85
|
"""
|
|
78
86
|
Args:
|
|
@@ -87,6 +95,7 @@ class CDPEngine:
|
|
|
87
95
|
fingerprint: Enable BrowserForge fingerprint injection (default: True).
|
|
88
96
|
Injects a realistic Canvas, WebGL, navigator, and screen
|
|
89
97
|
fingerprint into every new browser context.
|
|
98
|
+
Automatically disabled when cloak_browser=True (binary handles it).
|
|
90
99
|
fingerprint_options: Optional constraints forwarded to FingerprintGenerator
|
|
91
100
|
(e.g. {"browser": "chrome", "os": "windows"}).
|
|
92
101
|
launch_args: Additional Chromium launch flags for local browser launches
|
|
@@ -98,6 +107,23 @@ class CDPEngine:
|
|
|
98
107
|
(connect_over_cdp) or "playwright" for Playwright wire protocol (connect).
|
|
99
108
|
Defaults to "cdp". Use "playwright" for services like
|
|
100
109
|
playwright run-server or Scrapeless recording.
|
|
110
|
+
backend: Which Playwright variant to use: "rebrowser" (rebrowser_playwright, default),
|
|
111
|
+
"playwright" (stock playwright), or "patchright" (patchright).
|
|
112
|
+
CloakBrowser integration requires "playwright" or "patchright".
|
|
113
|
+
cloak_browser: Use CloakBrowser's stealth Chromium binary instead of stock browser.
|
|
114
|
+
When True, fingerprint and stealth params are ignored (binary handles them).
|
|
115
|
+
Requires: pip install phantomfetch[cloakbrowser]
|
|
116
|
+
cloak_browser_humanize: Enable human-like mouse, keyboard, and scroll behavior
|
|
117
|
+
when using CloakBrowser. Adds Bezier curve mouse movement,
|
|
118
|
+
realistic typing delays, and scroll micro-steps.
|
|
119
|
+
cloak_browser_human_preset: Humanize preset — "default" (normal speed) or "careful" (slower).
|
|
120
|
+
cloak_browser_geoip: Auto-detect timezone and locale from proxy IP using MaxMind GeoIP.
|
|
121
|
+
Requires: pip install phantomfetch[cloakbrowser,geoip]
|
|
122
|
+
cloak_binary_path: Path to CloakBrowser binary. If None, uses CloakBrowser's
|
|
123
|
+
ensure_binary() to download/find it.
|
|
124
|
+
persistent_context_dir: Directory for persistent browser profile (cookies/localStorage
|
|
125
|
+
persist across sessions). When set, uses CloakBrowser's
|
|
126
|
+
persistent context API. Enables incognito bypass.
|
|
101
127
|
"""
|
|
102
128
|
self.cdp_endpoint = cdp_endpoint
|
|
103
129
|
self.headless = headless
|
|
@@ -111,6 +137,17 @@ class CDPEngine:
|
|
|
111
137
|
self.launch_args = launch_args or []
|
|
112
138
|
self.browser_type = browser_type
|
|
113
139
|
self.cdp_connection_type = cdp_connection_type
|
|
140
|
+
self.backend = backend
|
|
141
|
+
self.cloak_browser = cloak_browser
|
|
142
|
+
self.cloak_browser_humanize = cloak_browser_humanize
|
|
143
|
+
self.cloak_browser_human_preset = cloak_browser_human_preset
|
|
144
|
+
self.cloak_browser_geoip = cloak_browser_geoip
|
|
145
|
+
self.cloak_binary_path = cloak_binary_path
|
|
146
|
+
self.persistent_context_dir = persistent_context_dir
|
|
147
|
+
|
|
148
|
+
self._cloak_browser_available = False
|
|
149
|
+
self._cloak_context: Any = None
|
|
150
|
+
self._cloak_human_config: Any = None
|
|
114
151
|
|
|
115
152
|
# Map playwright browser type to BrowserForge constraint
|
|
116
153
|
if "browser" not in self.fingerprint_options and self.browser_type in (
|
|
@@ -126,7 +163,8 @@ class CDPEngine:
|
|
|
126
163
|
self.fingerprint_options["device"] = "desktop"
|
|
127
164
|
|
|
128
165
|
# BrowserForge fingerprint generator — instantiated once, generate() per request
|
|
129
|
-
|
|
166
|
+
# Disabled when using CloakBrowser (binary handles fingerprints natively)
|
|
167
|
+
if fingerprint and not cloak_browser:
|
|
130
168
|
screen = Screen(
|
|
131
169
|
min_width=1280, max_width=1920, min_height=720, max_height=1080
|
|
132
170
|
)
|
|
@@ -144,14 +182,83 @@ class CDPEngine:
|
|
|
144
182
|
|
|
145
183
|
async def connect(self) -> None:
|
|
146
184
|
"""Initialize Playwright and connect to browser."""
|
|
185
|
+
if self.cloak_browser:
|
|
186
|
+
await self._connect_cloakbrowser()
|
|
187
|
+
else:
|
|
188
|
+
await self._connect_playwright()
|
|
189
|
+
|
|
190
|
+
async def _connect_cloakbrowser(self) -> None:
|
|
191
|
+
"""Connect using CloakBrowser's stealth Chromium binary.
|
|
192
|
+
|
|
193
|
+
Uses cloakbrowser.launch_async() + browser.new_context() for non-persistent,
|
|
194
|
+
or launch_persistent_context_async() for persistent. These handle binary
|
|
195
|
+
management, stealth args, geoip, and humanize patches internally.
|
|
196
|
+
"""
|
|
197
|
+
try:
|
|
198
|
+
from cloakbrowser import launch_async, launch_persistent_context_async
|
|
199
|
+
except ImportError:
|
|
200
|
+
raise ImportError(
|
|
201
|
+
"cloakbrowser is not installed. "
|
|
202
|
+
"Install it with: pip install phantomfetch[cloakbrowser]"
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
logger.info(
|
|
206
|
+
f"[cdp] Connecting via CloakBrowser "
|
|
207
|
+
f"(humanize={self.cloak_browser_humanize}, "
|
|
208
|
+
f"persistent={bool(self.persistent_context_dir)})"
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
if self.persistent_context_dir:
|
|
212
|
+
logger.debug(
|
|
213
|
+
f"[cdp] CloakBrowser persistent context: {self.persistent_context_dir}"
|
|
214
|
+
)
|
|
215
|
+
launch_kwargs: dict[str, Any] = {
|
|
216
|
+
"headless": self.headless,
|
|
217
|
+
"user_agent": self.user_agent,
|
|
218
|
+
"viewport": self.viewport,
|
|
219
|
+
"stealth_args": False,
|
|
220
|
+
"humanize": self.cloak_browser_humanize,
|
|
221
|
+
"human_preset": self.cloak_browser_human_preset,
|
|
222
|
+
"backend": self.backend,
|
|
223
|
+
"geoip": self.cloak_browser_geoip,
|
|
224
|
+
}
|
|
225
|
+
self._cloak_context = await launch_persistent_context_async(
|
|
226
|
+
self.persistent_context_dir,
|
|
227
|
+
**launch_kwargs,
|
|
228
|
+
)
|
|
229
|
+
self._browser: Any = None
|
|
230
|
+
else:
|
|
231
|
+
logger.debug(
|
|
232
|
+
"[cdp] CloakBrowser non-persistent — browser created per fetch"
|
|
233
|
+
)
|
|
234
|
+
self._browser = await launch_async(
|
|
235
|
+
headless=self.headless,
|
|
236
|
+
stealth_args=False,
|
|
237
|
+
humanize=self.cloak_browser_humanize,
|
|
238
|
+
human_preset=self.cloak_browser_human_preset,
|
|
239
|
+
backend=self.backend,
|
|
240
|
+
geoip=self.cloak_browser_geoip,
|
|
241
|
+
)
|
|
242
|
+
self._cloak_context = None
|
|
147
243
|
|
|
148
|
-
|
|
244
|
+
self._cloak_browser_available = True
|
|
245
|
+
|
|
246
|
+
async def _connect_playwright(self) -> None:
|
|
247
|
+
"""Connect using standard Playwright (rebrowser, stock, or patchright)."""
|
|
248
|
+
backend = self.backend
|
|
249
|
+
if backend == "rebrowser":
|
|
250
|
+
from rebrowser_playwright.async_api import async_playwright
|
|
251
|
+
elif backend == "patchright":
|
|
252
|
+
from patchright.async_api import async_playwright
|
|
253
|
+
else:
|
|
254
|
+
from playwright.async_api import async_playwright
|
|
149
255
|
|
|
150
256
|
self._playwright = await async_playwright().start()
|
|
151
257
|
try:
|
|
152
258
|
if self.cdp_endpoint:
|
|
153
259
|
logger.info(
|
|
154
|
-
f"[cdp] Connecting to: {self.cdp_endpoint}
|
|
260
|
+
f"[cdp] Connecting to: {self.cdp_endpoint} "
|
|
261
|
+
f"(type={self.cdp_connection_type}, backend={backend})"
|
|
155
262
|
)
|
|
156
263
|
browser_obj = getattr(self._playwright, self.browser_type)
|
|
157
264
|
|
|
@@ -171,7 +278,8 @@ class CDPEngine:
|
|
|
171
278
|
)
|
|
172
279
|
else:
|
|
173
280
|
logger.info(
|
|
174
|
-
f"[cdp] Launching local browser
|
|
281
|
+
f"[cdp] Launching local browser "
|
|
282
|
+
f"(headless={self.headless}, type={self.browser_type}, backend={backend})"
|
|
175
283
|
)
|
|
176
284
|
base_args = []
|
|
177
285
|
if self.browser_type == "chromium":
|
|
@@ -191,26 +299,15 @@ class CDPEngine:
|
|
|
191
299
|
await self._playwright.stop()
|
|
192
300
|
self._playwright = None
|
|
193
301
|
raise e
|
|
194
|
-
base_args = []
|
|
195
|
-
if self.browser_type == "chromium":
|
|
196
|
-
# --no-sandbox is required on Linux environments that don't have
|
|
197
|
-
# user namespace support (VMs, containers, CI). Without it,
|
|
198
|
-
# Chromium hangs silently at launch.
|
|
199
|
-
base_args = [
|
|
200
|
-
"--disable-blink-features=AutomationControlled",
|
|
201
|
-
"--no-sandbox",
|
|
202
|
-
"--disable-setuid-sandbox",
|
|
203
|
-
]
|
|
204
|
-
extra = [a for a in self.launch_args if a not in base_args]
|
|
205
|
-
|
|
206
|
-
browser_obj = getattr(self._playwright, self.browser_type)
|
|
207
|
-
self._browser = await browser_obj.launch(
|
|
208
|
-
headless=self.headless,
|
|
209
|
-
args=base_args + extra,
|
|
210
|
-
)
|
|
211
302
|
|
|
212
303
|
async def disconnect(self) -> None:
|
|
213
304
|
"""Close browser and Playwright."""
|
|
305
|
+
if self._cloak_context:
|
|
306
|
+
try:
|
|
307
|
+
await self._cloak_context.close()
|
|
308
|
+
except Exception:
|
|
309
|
+
pass
|
|
310
|
+
self._cloak_context = None
|
|
214
311
|
if self._browser:
|
|
215
312
|
await self._browser.close()
|
|
216
313
|
self._browser = None
|
|
@@ -384,7 +481,7 @@ class CDPEngine:
|
|
|
384
481
|
# unless we specifically want to share state (TODO: session support)
|
|
385
482
|
# Update: storage_state passed in allows checking/setting state
|
|
386
483
|
|
|
387
|
-
context_opts = {}
|
|
484
|
+
context_opts: dict[str, Any] = {}
|
|
388
485
|
if self.user_agent:
|
|
389
486
|
context_opts["user_agent"] = self.user_agent
|
|
390
487
|
if self.viewport:
|
|
@@ -397,7 +494,7 @@ class CDPEngine:
|
|
|
397
494
|
if parsed.port
|
|
398
495
|
else f"{parsed.scheme}://{parsed.hostname}"
|
|
399
496
|
)
|
|
400
|
-
proxy_dict = {"server": proxy_server}
|
|
497
|
+
proxy_dict: dict[str, Any] = {"server": proxy_server}
|
|
401
498
|
if parsed.username:
|
|
402
499
|
proxy_dict["username"] = parsed.username
|
|
403
500
|
if parsed.password:
|
|
@@ -408,59 +505,105 @@ class CDPEngine:
|
|
|
408
505
|
# If we have basic cookies to set via context creation (simpler than add_cookies sometimes)
|
|
409
506
|
# But better to use add_cookies for consistency with 'cookies' arg
|
|
410
507
|
|
|
411
|
-
browser_context = None
|
|
412
|
-
page = None
|
|
508
|
+
browser_context: Any = None
|
|
509
|
+
page: Any = None
|
|
413
510
|
using_existing = False
|
|
511
|
+
using_persistent = False
|
|
414
512
|
|
|
415
513
|
try:
|
|
416
|
-
# Use existing page/context if available (for recording compatibility)
|
|
417
514
|
if self._existing_page and self._existing_context:
|
|
418
515
|
browser_context = self._existing_context
|
|
419
516
|
page = self._existing_page
|
|
420
517
|
using_existing = True
|
|
421
518
|
logger.debug(f"[cdp] Reusing existing page for {url}")
|
|
422
|
-
|
|
519
|
+
|
|
520
|
+
# CloakBrowser — binary-level stealth, no BrowserForge injection.
|
|
521
|
+
# CloakBrowser handles canvas/WebGL/GPU/UA/automation signals natively.
|
|
522
|
+
# Uses cloakbrowser.launch_async() + browser.new_context() for non-persistent,
|
|
523
|
+
# or launch_persistent_context_async() for persistent.
|
|
524
|
+
elif self._cloak_browser_available:
|
|
525
|
+
proxy_arg: dict[str, Any] | None = proxy_dict if proxy else None
|
|
526
|
+
|
|
527
|
+
if self.persistent_context_dir:
|
|
528
|
+
using_persistent = True
|
|
529
|
+
if self._cloak_context is None:
|
|
530
|
+
from cloakbrowser import launch_persistent_context_async
|
|
531
|
+
|
|
532
|
+
launch_kwargs: dict[str, Any] = {
|
|
533
|
+
"headless": self.headless,
|
|
534
|
+
"user_agent": self.user_agent,
|
|
535
|
+
"viewport": self.viewport,
|
|
536
|
+
"stealth_args": False,
|
|
537
|
+
"humanize": self.cloak_browser_humanize,
|
|
538
|
+
"human_preset": self.cloak_browser_human_preset,
|
|
539
|
+
"backend": self.backend,
|
|
540
|
+
"proxy": proxy_arg,
|
|
541
|
+
"geoip": self.cloak_browser_geoip,
|
|
542
|
+
}
|
|
543
|
+
self._cloak_context = await launch_persistent_context_async(
|
|
544
|
+
self.persistent_context_dir,
|
|
545
|
+
**launch_kwargs,
|
|
546
|
+
)
|
|
547
|
+
browser_context = self._cloak_context
|
|
548
|
+
page = await browser_context.new_page()
|
|
549
|
+
else:
|
|
550
|
+
browser_context = await self._browser.new_context(
|
|
551
|
+
user_agent=self.user_agent,
|
|
552
|
+
viewport=self.viewport,
|
|
553
|
+
proxy=proxy_arg,
|
|
554
|
+
)
|
|
555
|
+
page = await browser_context.new_page()
|
|
556
|
+
|
|
557
|
+
# BrowserForge fingerprint injection (rebrowser/stock playwright)
|
|
423
558
|
elif self._fp_gen is not None:
|
|
424
|
-
# Generate a fingerprint with any per-instance constraints
|
|
425
559
|
fp = self._fp_gen.generate(**self.fingerprint_options)
|
|
426
560
|
|
|
427
|
-
# Sync UA and viewport FROM the fingerprint for full consistency.
|
|
428
|
-
# Explicit user_agent/viewport overrides take precedence.
|
|
429
|
-
if not self.user_agent:
|
|
430
|
-
context_opts["user_agent"] = fp.navigator.userAgent
|
|
431
561
|
if not self.viewport:
|
|
432
562
|
context_opts["viewport"] = {
|
|
433
563
|
"width": fp.screen.width,
|
|
434
564
|
"height": fp.screen.height,
|
|
435
565
|
}
|
|
436
566
|
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
)
|
|
440
|
-
|
|
441
|
-
# AsyncNewContext injects all fingerprint JS overrides
|
|
442
|
-
# (Canvas, WebGL, navigator, AudioContext, fonts, etc.)
|
|
443
|
-
browser_context = await AsyncNewContext(
|
|
444
|
-
self._browser,
|
|
445
|
-
fingerprint=fp,
|
|
446
|
-
**context_opts,
|
|
567
|
+
user_provided_ua = self.user_agent or (
|
|
568
|
+
headers.get("user-agent") if headers else None
|
|
447
569
|
)
|
|
448
570
|
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
571
|
+
if user_provided_ua:
|
|
572
|
+
context_opts["user_agent"] = user_provided_ua
|
|
573
|
+
logger.debug(
|
|
574
|
+
f"[cdp] BrowserForge fingerprint (custom UA): "
|
|
575
|
+
f"fp_UA={fp.navigator.userAgent[:40]}... "
|
|
576
|
+
f"user_UA={user_provided_ua[:40]}..."
|
|
577
|
+
)
|
|
578
|
+
browser_context = await self._browser.new_context(
|
|
579
|
+
**context_opts,
|
|
580
|
+
)
|
|
581
|
+
await browser_context.set_extra_http_headers(
|
|
582
|
+
only_injectable_headers(
|
|
583
|
+
fp.headers, self._browser.browser_type.name
|
|
584
|
+
)
|
|
585
|
+
)
|
|
586
|
+
await browser_context.add_init_script(InjectFunction(fp))
|
|
587
|
+
else:
|
|
588
|
+
context_opts["user_agent"] = fp.navigator.userAgent
|
|
589
|
+
logger.debug(
|
|
590
|
+
f"[cdp] BrowserForge fingerprint: "
|
|
591
|
+
f"UA={fp.navigator.userAgent[:60]}..."
|
|
592
|
+
)
|
|
593
|
+
browser_context = await AsyncNewContext(
|
|
594
|
+
self._browser,
|
|
595
|
+
fingerprint=fp,
|
|
596
|
+
**context_opts,
|
|
597
|
+
)
|
|
598
|
+
if fp.headers:
|
|
599
|
+
fp_headers = dict(fp.headers)
|
|
600
|
+
for managed in (
|
|
601
|
+
"Host",
|
|
602
|
+
"Content-Length",
|
|
603
|
+
"Transfer-Encoding",
|
|
604
|
+
):
|
|
605
|
+
fp_headers.pop(managed, None)
|
|
606
|
+
await browser_context.set_extra_http_headers(fp_headers)
|
|
464
607
|
else:
|
|
465
608
|
browser_context = await self._browser.new_context(**context_opts)
|
|
466
609
|
|
|
@@ -479,7 +622,8 @@ class CDPEngine:
|
|
|
479
622
|
# Apply stealth (works for both new and existing contexts)
|
|
480
623
|
# Note: For remote browsers, launch args like --disable-blink-features=AutomationControlled
|
|
481
624
|
# must be set by the *server* at launch time. We cannot retroactively apply them here.
|
|
482
|
-
|
|
625
|
+
# CloakBrowser: skip stealth_async — binary handles fingerprinting at C++ level.
|
|
626
|
+
if stealth and browser_context and not self._cloak_browser_available:
|
|
483
627
|
logger.debug("[cdp] Applying stealth_async")
|
|
484
628
|
await stealth_async(browser_context)
|
|
485
629
|
|
|
@@ -558,8 +702,8 @@ class CDPEngine:
|
|
|
558
702
|
if headers:
|
|
559
703
|
await browser_context.set_extra_http_headers(headers)
|
|
560
704
|
|
|
561
|
-
# Create page if not reusing existing
|
|
562
|
-
if not using_existing:
|
|
705
|
+
# Create page if not reusing existing or persistent
|
|
706
|
+
if not using_existing and not using_persistent:
|
|
563
707
|
# If storing state, we should have done it on context.
|
|
564
708
|
page = await browser_context.new_page()
|
|
565
709
|
|
|
@@ -857,7 +1001,7 @@ class CDPEngine:
|
|
|
857
1001
|
|
|
858
1002
|
except asyncio.CancelledError:
|
|
859
1003
|
logger.warning("Scrape cancelled by orchestrator. Shielding teardown.")
|
|
860
|
-
if not using_existing and browser_context:
|
|
1004
|
+
if not using_existing and not using_persistent and browser_context:
|
|
861
1005
|
try:
|
|
862
1006
|
await asyncio.shield(browser_context.close())
|
|
863
1007
|
except Exception:
|
|
@@ -879,7 +1023,7 @@ class CDPEngine:
|
|
|
879
1023
|
)
|
|
880
1024
|
|
|
881
1025
|
finally:
|
|
882
|
-
if not using_existing and browser_context:
|
|
1026
|
+
if not using_existing and not using_persistent and browser_context:
|
|
883
1027
|
try:
|
|
884
1028
|
await browser_context.close()
|
|
885
1029
|
except Exception:
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import json
|
|
3
3
|
import os
|
|
4
|
-
from typing import Any, Literal
|
|
4
|
+
from typing import Any, Literal
|
|
5
5
|
|
|
6
6
|
from loguru import logger
|
|
7
7
|
|
|
8
|
-
from .cache import Cache
|
|
8
|
+
from .cache import Cache
|
|
9
9
|
from .engines import CDPEngine, CurlEngine
|
|
10
10
|
from .pool import ProxyPool
|
|
11
11
|
from .telemetry import get_tracer
|
|
@@ -46,10 +46,19 @@ class Fetcher:
|
|
|
46
46
|
cache: Cache | bool | None = None,
|
|
47
47
|
# Advanced CDP
|
|
48
48
|
cdp_use_existing_page: bool = True,
|
|
49
|
+
cdp_connection_type: str = "cdp",
|
|
50
|
+
backend: Literal["rebrowser", "playwright", "patchright"] = "rebrowser",
|
|
49
51
|
# BrowserForge fingerprinting
|
|
50
52
|
fingerprint: bool = True,
|
|
51
53
|
fingerprint_options: dict[str, Any] | None = None,
|
|
52
54
|
browser_type: str = "chromium",
|
|
55
|
+
# CloakBrowser
|
|
56
|
+
cloak_browser: bool = False,
|
|
57
|
+
cloak_browser_humanize: bool = False,
|
|
58
|
+
cloak_browser_human_preset: str = "default",
|
|
59
|
+
cloak_browser_geoip: bool = False,
|
|
60
|
+
cloak_binary_path: str | None = None,
|
|
61
|
+
persistent_context_dir: str | None = None,
|
|
53
62
|
):
|
|
54
63
|
"""
|
|
55
64
|
Initialize the Fetcher.
|
|
@@ -68,12 +77,24 @@ class Fetcher:
|
|
|
68
77
|
max_concurrent_browser: Max concurrent browser requests
|
|
69
78
|
cache: Cache implementation (e.g. FileSystemCache)
|
|
70
79
|
cdp_use_existing_page: Reuse existing page in remote CDP (default: True)
|
|
80
|
+
backend: Which Playwright variant to use: "rebrowser", "playwright", or "patchright".
|
|
81
|
+
CloakBrowser integration requires "playwright" or "patchright".
|
|
71
82
|
fingerprint: Enable BrowserForge fingerprint injection in the browser engine
|
|
72
83
|
(default: True). Injects realistic Canvas, WebGL, navigator
|
|
73
84
|
and screen fingerprints into every new Playwright context.
|
|
74
85
|
fingerprint_options: Constraints for BrowserForge FingerprintGenerator
|
|
75
86
|
(e.g. {"browser": "chrome", "os": "windows"}).
|
|
76
87
|
browser_type: Which Playwright browser engine to use: "chromium", "firefox", or "webkit"
|
|
88
|
+
cloak_browser: Use CloakBrowser's stealth Chromium binary instead of stock browser.
|
|
89
|
+
When True, fingerprint is disabled (binary handles it).
|
|
90
|
+
Requires: pip install phantomfetch[cloakbrowser]
|
|
91
|
+
cloak_browser_humanize: Enable human-like mouse, keyboard, and scroll behavior
|
|
92
|
+
when using CloakBrowser.
|
|
93
|
+
cloak_browser_human_preset: Humanize preset — "default" (normal) or "careful" (slower).
|
|
94
|
+
cloak_browser_geoip: Auto-detect timezone/locale from proxy IP using MaxMind GeoIP.
|
|
95
|
+
cloak_binary_path: Path to CloakBrowser binary. If None, downloads automatically.
|
|
96
|
+
persistent_context_dir: Directory for persistent browser profile (cookies/localStorage
|
|
97
|
+
persist across sessions).
|
|
77
98
|
"""
|
|
78
99
|
# Cache
|
|
79
100
|
self.cache: Cache | None = None
|
|
@@ -97,9 +118,11 @@ class Fetcher:
|
|
|
97
118
|
timeout=timeout,
|
|
98
119
|
max_retries=max_retries,
|
|
99
120
|
)
|
|
100
|
-
|
|
121
|
+
|
|
101
122
|
if browser_engine == "baas":
|
|
102
|
-
raise ValueError(
|
|
123
|
+
raise ValueError(
|
|
124
|
+
"BaaSEngine has been removed. Please use 'cdp' engine or contact support."
|
|
125
|
+
)
|
|
103
126
|
|
|
104
127
|
# Browser engine
|
|
105
128
|
# Always use CDPEngine now
|
|
@@ -109,9 +132,17 @@ class Fetcher:
|
|
|
109
132
|
timeout=browser_timeout,
|
|
110
133
|
cache=self.cache,
|
|
111
134
|
use_existing_page=cdp_use_existing_page,
|
|
135
|
+
cdp_connection_type=cdp_connection_type,
|
|
136
|
+
backend=backend,
|
|
112
137
|
fingerprint=fingerprint,
|
|
113
138
|
fingerprint_options=fingerprint_options,
|
|
114
139
|
browser_type=browser_type,
|
|
140
|
+
cloak_browser=cloak_browser,
|
|
141
|
+
cloak_browser_humanize=cloak_browser_humanize,
|
|
142
|
+
cloak_browser_human_preset=cloak_browser_human_preset,
|
|
143
|
+
cloak_browser_geoip=cloak_browser_geoip,
|
|
144
|
+
cloak_binary_path=cloak_binary_path,
|
|
145
|
+
persistent_context_dir=persistent_context_dir,
|
|
115
146
|
)
|
|
116
147
|
self._browser = self._cdp_engine
|
|
117
148
|
|
|
@@ -147,7 +178,7 @@ class Fetcher:
|
|
|
147
178
|
Stop the browser engine.
|
|
148
179
|
"""
|
|
149
180
|
if self._browser:
|
|
150
|
-
await self._browser.
|
|
181
|
+
await self._browser.disconnect()
|
|
151
182
|
|
|
152
183
|
def save_session(self, path: str) -> None:
|
|
153
184
|
"""
|
|
@@ -268,7 +299,7 @@ class Fetcher:
|
|
|
268
299
|
proxy: Specific proxy to use (overrides pool)
|
|
269
300
|
location: Geo location for proxy selection
|
|
270
301
|
actions: List of `Action` objects or dicts (implies engine="browser")
|
|
271
|
-
|
|
302
|
+
|
|
272
303
|
...
|
|
273
304
|
"""
|
|
274
305
|
# Normalize actions - implies browser
|
|
@@ -319,21 +350,21 @@ class Fetcher:
|
|
|
319
350
|
cache_key_parts = [engine, url]
|
|
320
351
|
if location:
|
|
321
352
|
cache_key_parts.append(f"loc={location}")
|
|
322
|
-
|
|
353
|
+
|
|
323
354
|
# Use proxy URL for cache key if explicit proxy is used
|
|
324
355
|
# Note: We rely on the proxy *argument* (manual override) for this separation.
|
|
325
|
-
# If using pool, we typically don't split cache by specific pool proxy,
|
|
356
|
+
# If using pool, we typically don't split cache by specific pool proxy,
|
|
326
357
|
# UNLESS location was requested (handled above).
|
|
327
|
-
if proxy:
|
|
358
|
+
if proxy:
|
|
328
359
|
# If manual proxy string/object provided, include it
|
|
329
360
|
p_url = proxy if isinstance(proxy, str) else proxy.url
|
|
330
|
-
# Sanitize sensitive info? user:pass might be sensitive,
|
|
331
|
-
# but for cache key uniqueness it's needed.
|
|
361
|
+
# Sanitize sensitive info? user:pass might be sensitive,
|
|
362
|
+
# but for cache key uniqueness it's needed.
|
|
332
363
|
# Since MD5 is used downstream, it's somewhat obscured.
|
|
333
364
|
cache_key_parts.append(f"proxy={p_url}")
|
|
334
365
|
|
|
335
366
|
cache_key = ":".join(cache_key_parts)
|
|
336
|
-
|
|
367
|
+
|
|
337
368
|
cached_resp = await self.cache.get(cache_key)
|
|
338
369
|
if cached_resp:
|
|
339
370
|
cached_resp.from_cache = True
|
|
@@ -347,10 +378,12 @@ class Fetcher:
|
|
|
347
378
|
selected_proxy: Proxy | None = None
|
|
348
379
|
if proxy:
|
|
349
380
|
if isinstance(proxy, str):
|
|
350
|
-
selected_proxy = Proxy(
|
|
381
|
+
selected_proxy = Proxy(
|
|
382
|
+
url=proxy, metadata={"source": "manual_override"}
|
|
383
|
+
)
|
|
351
384
|
else:
|
|
352
385
|
selected_proxy = proxy
|
|
353
|
-
|
|
386
|
+
|
|
354
387
|
# 2. From Pool (if no override)
|
|
355
388
|
if not selected_proxy:
|
|
356
389
|
selected_proxy = self.proxy_pool.get(url=url, location=location)
|
|
@@ -358,20 +391,27 @@ class Fetcher:
|
|
|
358
391
|
if selected_proxy:
|
|
359
392
|
span.set_attribute("phantomfetch.proxy", selected_proxy.url)
|
|
360
393
|
if selected_proxy.vendor:
|
|
361
|
-
span.set_attribute(
|
|
394
|
+
span.set_attribute(
|
|
395
|
+
"phantomfetch.proxy.vendor", selected_proxy.vendor
|
|
396
|
+
)
|
|
362
397
|
if selected_proxy.proxy_type:
|
|
363
|
-
span.set_attribute(
|
|
398
|
+
span.set_attribute(
|
|
399
|
+
"phantomfetch.proxy.type", selected_proxy.proxy_type
|
|
400
|
+
)
|
|
364
401
|
if selected_proxy.location:
|
|
365
|
-
span.set_attribute(
|
|
402
|
+
span.set_attribute(
|
|
403
|
+
"phantomfetch.proxy.location", selected_proxy.location
|
|
404
|
+
)
|
|
366
405
|
if selected_proxy.provider:
|
|
367
|
-
span.set_attribute(
|
|
406
|
+
span.set_attribute(
|
|
407
|
+
"phantomfetch.proxy.provider", selected_proxy.provider
|
|
408
|
+
)
|
|
368
409
|
|
|
369
410
|
# Route to engine
|
|
370
411
|
if engine == "browser":
|
|
371
412
|
resp = await self._fetch_browser(
|
|
372
413
|
url=url,
|
|
373
414
|
proxy=selected_proxy,
|
|
374
|
-
|
|
375
415
|
headers=headers,
|
|
376
416
|
cookies=cookies,
|
|
377
417
|
actions=normalized_actions,
|
|
@@ -406,7 +446,7 @@ class Fetcher:
|
|
|
406
446
|
# For simplicity, if we have a pool, and this proxy matches one in the pool, we could update it.
|
|
407
447
|
# But the 'selected_proxy' might be a new instance specific to this request (manual override).
|
|
408
448
|
# The pool.mark_* methods take a Proxy object.
|
|
409
|
-
|
|
449
|
+
|
|
410
450
|
# Logic: If 'proxy' argument was None, it came from pool -> Update stats.
|
|
411
451
|
# If 'proxy' argument was set -> Do NOT update pool stats (it's a manual override).
|
|
412
452
|
if not proxy and selected_proxy:
|
|
@@ -421,10 +461,10 @@ class Fetcher:
|
|
|
421
461
|
cache_key_parts = [engine, url]
|
|
422
462
|
if location:
|
|
423
463
|
cache_key_parts.append(f"loc={location}")
|
|
424
|
-
if proxy:
|
|
464
|
+
if proxy:
|
|
425
465
|
p_url = proxy if isinstance(proxy, str) else proxy.url
|
|
426
466
|
cache_key_parts.append(f"proxy={p_url}")
|
|
427
|
-
|
|
467
|
+
|
|
428
468
|
cache_key = ":".join(cache_key_parts)
|
|
429
469
|
await self.cache.set(cache_key, resp)
|
|
430
470
|
|
|
@@ -451,26 +491,26 @@ class Fetcher:
|
|
|
451
491
|
async with self._browser_semaphore:
|
|
452
492
|
# Direct CDP usage (BaaSEngine removed)
|
|
453
493
|
if not self._cdp_engine:
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
# Actually, looking at the previous file content, I removed the 'if browser_engine ==' logic in __init__?
|
|
494
|
+
# Lazy init if not done (though currently init in __init__)
|
|
495
|
+
# But wait, logic in __init__ was:
|
|
496
|
+
# self._cdp_engine = CDPEngine(...) if browser_engine == "cdp"
|
|
497
|
+
# Since we removed engine selection, we should ensure it's initialized.
|
|
498
|
+
# In __init__ I see: self._cdp_engine: CDPEngine | None = None
|
|
499
|
+
# And the constructor logic for it was tied to the "if browser_engine == 'cdp'" block which I might have messed up or need to fix.
|
|
500
|
+
# Let's check __init__ again.
|
|
501
|
+
pass
|
|
502
|
+
|
|
503
|
+
# Actually, looking at the previous file content, I removed the 'if browser_engine ==' logic in __init__?
|
|
464
504
|
# No, I only updated the arguments. I need to make sure __init__ initializes _cdp_engine unconditionally.
|
|
465
505
|
# Let's assume I fix __init__ in a separate recursive step if needed, or I can fix it here if I see it.
|
|
466
|
-
# In the previous `view_file` output (Step 356), lines 87-101 show the old init logic.
|
|
467
|
-
# I need to clean that up too!
|
|
468
|
-
|
|
506
|
+
# In the previous `view_file` output (Step 356), lines 87-101 show the old init logic.
|
|
507
|
+
# I need to clean that up too!
|
|
508
|
+
|
|
469
509
|
# Let's just assume _browser attribute is now _cdp_engine.
|
|
470
510
|
# Wait, the __init__ logic was:
|
|
471
511
|
# self._browser = CDPEngine(...)
|
|
472
512
|
# So here I should just use self._browser (which is typed as CDPEngine in the new world).
|
|
473
|
-
|
|
513
|
+
|
|
474
514
|
# Correct implementation for _fetch_browser:
|
|
475
515
|
return await self._cdp_engine.fetch(
|
|
476
516
|
url=url,
|
|
@@ -63,9 +63,6 @@ class Proxy(msgspec.Struct):
|
|
|
63
63
|
metadata: dict[str, Any] = {}
|
|
64
64
|
|
|
65
65
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
66
|
class Action(msgspec.Struct):
|
|
70
67
|
"""
|
|
71
68
|
Browser interaction definition.
|
|
@@ -99,8 +96,6 @@ class Action(msgspec.Struct):
|
|
|
99
96
|
state: str | None = None
|
|
100
97
|
x: int | None = None
|
|
101
98
|
y: int | None = None
|
|
102
|
-
x: int | None = None
|
|
103
|
-
y: int | None = None
|
|
104
99
|
schema: dict[str, Any] | None = None
|
|
105
100
|
actions: list["Action"] | None = None
|
|
106
101
|
then_actions: list["Action"] | None = None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|