phantomfetch 0.6.0__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/PKG-INFO +3 -3
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/pyproject.toml +3 -3
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/fetch.py +120 -126
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/README.md +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/__init__.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/cache.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/captcha.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/engines/__init__.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/engines/base.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/engines/browser/__init__.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/engines/browser/actions.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/engines/browser/cdp.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/engines/curl.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/pool.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/presets.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/registry.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/strategy_advisor.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/telemetry.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/tools/adaptive/__init__.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/tools/adaptive/fingerprint.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/tools/adaptive/matcher.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/tools/adaptive/store.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/tools/selector_builder.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: phantomfetch
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.6.1
|
|
4
4
|
Summary: High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities
|
|
5
5
|
Keywords: web-scraping,playwright,curl-cffi,async,browser-automation,http-client,agentic,anti-detection
|
|
6
6
|
Author: CosmicBull
|
|
@@ -26,14 +26,14 @@ Requires-Dist: opentelemetry-sdk>=1.38.0
|
|
|
26
26
|
Requires-Dist: loguru>=0.7.3
|
|
27
27
|
Requires-Dist: beautifulsoup4>=4.14.3
|
|
28
28
|
Requires-Dist: lmdb>=1.4.1
|
|
29
|
-
Requires-Dist: cloakbrowser
|
|
29
|
+
Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'all'
|
|
30
30
|
Requires-Dist: camoufox[geoip]>=0.4.11 ; extra == 'all'
|
|
31
31
|
Requires-Dist: rebrowser-playwright>=1.52.0 ; extra == 'all'
|
|
32
32
|
Requires-Dist: patchright>=1.60.1 ; extra == 'all'
|
|
33
33
|
Requires-Dist: maxminddb>=2.0.0 ; extra == 'all'
|
|
34
34
|
Requires-Dist: camoufox[geoip]>=0.4.11 ; extra == 'camoufox'
|
|
35
35
|
Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'cloakbrowser'
|
|
36
|
-
Requires-Dist: cloakbrowser
|
|
36
|
+
Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'geoip'
|
|
37
37
|
Requires-Dist: maxminddb>=2.0.0 ; extra == 'geoip'
|
|
38
38
|
Requires-Dist: patchright>=1.60.1 ; extra == 'patchright'
|
|
39
39
|
Requires-Dist: rebrowser-playwright>=1.52.0 ; extra == 'rebrowser'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "phantomfetch"
|
|
3
|
-
version = "0.6.0"
|
|
3
|
+
version = "0.6.1"
|
|
4
4
|
description = "High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.13"
|
|
@@ -45,11 +45,11 @@ dependencies = [
|
|
|
45
45
|
|
|
46
46
|
[project.optional-dependencies]
|
|
47
47
|
cloakbrowser = ["cloakbrowser>=0.3.31"]
|
|
48
|
-
geoip = ["cloakbrowser
|
|
48
|
+
geoip = ["cloakbrowser>=0.3.31", "maxminddb>=2.0.0"]
|
|
49
49
|
camoufox = ["camoufox[geoip]>=0.4.11"]
|
|
50
50
|
rebrowser = ["rebrowser-playwright>=1.52.0"]
|
|
51
51
|
patchright = ["patchright>=1.60.1"]
|
|
52
|
-
all = ["cloakbrowser
|
|
52
|
+
all = ["cloakbrowser>=0.3.31", "camoufox[geoip]>=0.4.11", "rebrowser-playwright>=1.52.0", "patchright>=1.60.1", "maxminddb>=2.0.0"]
|
|
53
53
|
|
|
54
54
|
[project.urls]
|
|
55
55
|
Homepage = "https://github.com/iristech-systems/PhantomFetch"
|
|
@@ -403,95 +403,125 @@ class Fetcher:
|
|
|
403
403
|
|
|
404
404
|
span.set_attribute("phantomfetch.cache.hit", False)
|
|
405
405
|
|
|
406
|
-
#
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
if
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
406
|
+
# Resolve retry configurations
|
|
407
|
+
actual_max_retries = max_retries if max_retries is not None else self.max_retries
|
|
408
|
+
attempts = max(1, actual_max_retries)
|
|
409
|
+
if retry_on is None:
|
|
410
|
+
retry_status_codes: set[int] = {0, 429, 500, 502, 503, 504}
|
|
411
|
+
elif isinstance(retry_on, int):
|
|
412
|
+
retry_status_codes = {retry_on}
|
|
413
|
+
else:
|
|
414
|
+
retry_status_codes = set(retry_on)
|
|
415
|
+
|
|
416
|
+
backoff_base = 2.0 if retry_backoff is None else retry_backoff
|
|
416
417
|
|
|
417
|
-
|
|
418
|
-
if not selected_proxy:
|
|
419
|
-
selected_proxy = self.proxy_pool.get(url=url, location=location)
|
|
418
|
+
last_resp: Response | None = None
|
|
420
419
|
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
)
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
420
|
+
for attempt in range(attempts):
|
|
421
|
+
# Get proxy
|
|
422
|
+
# 1. Manual override
|
|
423
|
+
selected_proxy: Proxy | None = None
|
|
424
|
+
if proxy:
|
|
425
|
+
if isinstance(proxy, str):
|
|
426
|
+
selected_proxy = Proxy(
|
|
427
|
+
url=proxy, metadata={"source": "manual_override"}
|
|
428
|
+
)
|
|
429
|
+
else:
|
|
430
|
+
selected_proxy = proxy
|
|
431
|
+
|
|
432
|
+
# 2. From Pool (if no override)
|
|
433
|
+
if not selected_proxy:
|
|
434
|
+
selected_proxy = self.proxy_pool.get(url=url, location=location)
|
|
435
|
+
|
|
436
|
+
if selected_proxy:
|
|
437
|
+
span.set_attribute("phantomfetch.proxy", selected_proxy.url)
|
|
438
|
+
if selected_proxy.vendor:
|
|
439
|
+
span.set_attribute(
|
|
440
|
+
"phantomfetch.proxy.vendor", selected_proxy.vendor
|
|
441
|
+
)
|
|
442
|
+
if selected_proxy.proxy_type:
|
|
443
|
+
span.set_attribute(
|
|
444
|
+
"phantomfetch.proxy.type", selected_proxy.proxy_type
|
|
445
|
+
)
|
|
446
|
+
if selected_proxy.location:
|
|
447
|
+
span.set_attribute(
|
|
448
|
+
"phantomfetch.proxy.location", selected_proxy.location
|
|
449
|
+
)
|
|
450
|
+
if selected_proxy.provider:
|
|
451
|
+
span.set_attribute(
|
|
452
|
+
"phantomfetch.proxy.provider", selected_proxy.provider
|
|
453
|
+
)
|
|
454
|
+
|
|
455
|
+
# Route to engine
|
|
456
|
+
if engine == "browser":
|
|
457
|
+
resp = await self._fetch_browser(
|
|
458
|
+
url=url,
|
|
459
|
+
proxy=selected_proxy,
|
|
460
|
+
headers=headers,
|
|
461
|
+
cookies=cookies,
|
|
462
|
+
actions=normalized_actions,
|
|
463
|
+
timeout=timeout or self.browser_timeout,
|
|
464
|
+
location=location,
|
|
465
|
+
wait_until=wait_until,
|
|
466
|
+
block_resources=block_resources,
|
|
467
|
+
wait_for_url=wait_for_url,
|
|
468
|
+
storage_state=self.session_data, # Pass current session
|
|
469
|
+
stealth=stealth,
|
|
434
470
|
)
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
471
|
+
else:
|
|
472
|
+
resp = await self._fetch_curl(
|
|
473
|
+
url=url,
|
|
474
|
+
proxy=selected_proxy,
|
|
475
|
+
headers=headers,
|
|
476
|
+
cookies=cookies,
|
|
477
|
+
timeout=timeout or self.timeout,
|
|
478
|
+
max_retries=1, # Prevent internal retries so we can rotate proxies
|
|
479
|
+
retry_on=retry_on,
|
|
480
|
+
retry_backoff=retry_backoff,
|
|
481
|
+
referer=referer,
|
|
482
|
+
allow_redirects=allow_redirects,
|
|
438
483
|
)
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
storage_state=self.session_data, # Pass current session
|
|
454
|
-
stealth=stealth,
|
|
455
|
-
max_retries=self.max_retries
|
|
456
|
-
if max_retries is None
|
|
457
|
-
else max_retries,
|
|
458
|
-
retry_on=retry_on,
|
|
459
|
-
retry_backoff=retry_backoff,
|
|
484
|
+
|
|
485
|
+
last_resp = resp
|
|
486
|
+
|
|
487
|
+
# Evaluate proxy status
|
|
488
|
+
if not proxy and selected_proxy:
|
|
489
|
+
if resp.ok:
|
|
490
|
+
self.proxy_pool.mark_success(selected_proxy)
|
|
491
|
+
elif resp.error or resp.status in retry_status_codes:
|
|
492
|
+
# Mark proxy as failed so a fresh one is used on retry
|
|
493
|
+
self.proxy_pool.mark_failed(selected_proxy)
|
|
494
|
+
|
|
495
|
+
# Should we retry?
|
|
496
|
+
should_retry = (
|
|
497
|
+
attempt < attempts - 1 and resp.status in retry_status_codes
|
|
460
498
|
)
|
|
461
|
-
|
|
462
|
-
|
|
499
|
+
if not should_retry:
|
|
500
|
+
break
|
|
501
|
+
|
|
502
|
+
# Backoff before getting a new proxy
|
|
503
|
+
wait = backoff_base**attempt * (0.5 + random.random())
|
|
504
|
+
logger.warning(
|
|
505
|
+
f"[{engine}] Retryable status {resp.status}, "
|
|
506
|
+
f"attempt {attempt + 1}/{attempts}, retrying in {wait:.2f}s"
|
|
507
|
+
)
|
|
508
|
+
await asyncio.sleep(wait)
|
|
509
|
+
|
|
510
|
+
if not last_resp:
|
|
511
|
+
last_resp = Response(
|
|
463
512
|
url=url,
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
max_retries=self.max_retries
|
|
469
|
-
if max_retries is None
|
|
470
|
-
else max_retries,
|
|
471
|
-
retry_on=retry_on,
|
|
472
|
-
retry_backoff=retry_backoff,
|
|
473
|
-
referer=referer,
|
|
474
|
-
allow_redirects=allow_redirects,
|
|
513
|
+
status=0,
|
|
514
|
+
body=b"",
|
|
515
|
+
engine=engine,
|
|
516
|
+
error="Max retries exhausted",
|
|
475
517
|
)
|
|
518
|
+
|
|
519
|
+
resp = last_resp
|
|
476
520
|
|
|
477
521
|
# Update session data from response if present
|
|
478
522
|
if resp.storage_state:
|
|
479
523
|
self.session_data = resp.storage_state
|
|
480
524
|
|
|
481
|
-
# Update proxy stats (ONLY if it came from the pool, or generally?)
|
|
482
|
-
# If manual override, we might NOT want to impact the pool stats unless the manual proxy IS in the pool?
|
|
483
|
-
# For simplicity, if we have a pool, and this proxy matches one in the pool, we could update it.
|
|
484
|
-
# But the 'selected_proxy' might be a new instance specific to this request (manual override).
|
|
485
|
-
# The pool.mark_* methods take a Proxy object.
|
|
486
|
-
|
|
487
|
-
# Logic: If 'proxy' argument was None, it came from pool -> Update stats.
|
|
488
|
-
# If 'proxy' argument was set -> Do NOT update pool stats (it's a manual override).
|
|
489
|
-
if not proxy and selected_proxy:
|
|
490
|
-
if resp.ok:
|
|
491
|
-
self.proxy_pool.mark_success(selected_proxy)
|
|
492
|
-
elif resp.error:
|
|
493
|
-
self.proxy_pool.mark_failed(selected_proxy)
|
|
494
|
-
|
|
495
525
|
# Cache response
|
|
496
526
|
if self.cache and resp.ok and self.cache.should_cache_request("document"):
|
|
497
527
|
# Re-generate key (same logic as above)
|
|
@@ -523,9 +553,6 @@ class Fetcher:
|
|
|
523
553
|
wait_for_url: str | None = None,
|
|
524
554
|
storage_state: dict[str, Any] | None = None,
|
|
525
555
|
stealth: bool = False,
|
|
526
|
-
max_retries: int = 1,
|
|
527
|
-
retry_on: Collection[int] | int | None = None,
|
|
528
|
-
retry_backoff: float | None = None,
|
|
529
556
|
) -> Response:
|
|
530
557
|
async with self._semaphore:
|
|
531
558
|
async with self._browser_semaphore:
|
|
@@ -538,54 +565,21 @@ class Fetcher:
|
|
|
538
565
|
error="CDP engine not initialized",
|
|
539
566
|
)
|
|
540
567
|
|
|
541
|
-
|
|
542
|
-
if retry_on is None:
|
|
543
|
-
retry_status_codes: set[int] = set()
|
|
544
|
-
elif isinstance(retry_on, int):
|
|
545
|
-
retry_status_codes = {retry_on}
|
|
546
|
-
else:
|
|
547
|
-
retry_status_codes = set(retry_on)
|
|
548
|
-
|
|
549
|
-
backoff_base = 2.0 if retry_backoff is None else retry_backoff
|
|
550
|
-
last_response: Response | None = None
|
|
551
|
-
|
|
552
|
-
for attempt in range(attempts):
|
|
553
|
-
response = await self._cdp_engine.fetch(
|
|
554
|
-
url=url,
|
|
555
|
-
proxy=proxy,
|
|
556
|
-
headers=headers,
|
|
557
|
-
cookies=cookies,
|
|
558
|
-
actions=actions,
|
|
559
|
-
timeout=timeout,
|
|
560
|
-
location=location,
|
|
561
|
-
wait_until=wait_until,
|
|
562
|
-
block_resources=block_resources,
|
|
563
|
-
wait_for_url=wait_for_url,
|
|
564
|
-
storage_state=storage_state,
|
|
565
|
-
stealth=stealth,
|
|
566
|
-
)
|
|
567
|
-
last_response = response
|
|
568
|
-
|
|
569
|
-
should_retry = (
|
|
570
|
-
attempt < attempts - 1 and response.status in retry_status_codes
|
|
571
|
-
)
|
|
572
|
-
if not should_retry:
|
|
573
|
-
return response
|
|
574
|
-
|
|
575
|
-
wait = backoff_base**attempt * (0.5 + random.random())
|
|
576
|
-
logger.warning(
|
|
577
|
-
f"[browser] Retryable status {response.status}, "
|
|
578
|
-
f"attempt {attempt + 1}/{attempts}, retrying in {wait:.2f}s"
|
|
579
|
-
)
|
|
580
|
-
await asyncio.sleep(wait)
|
|
581
|
-
|
|
582
|
-
return last_response or Response(
|
|
568
|
+
response = await self._cdp_engine.fetch(
|
|
583
569
|
url=url,
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
570
|
+
proxy=proxy,
|
|
571
|
+
headers=headers,
|
|
572
|
+
cookies=cookies,
|
|
573
|
+
actions=actions,
|
|
574
|
+
timeout=timeout,
|
|
575
|
+
location=location,
|
|
576
|
+
wait_until=wait_until,
|
|
577
|
+
block_resources=block_resources,
|
|
578
|
+
wait_for_url=wait_for_url,
|
|
579
|
+
storage_state=storage_state,
|
|
580
|
+
stealth=stealth,
|
|
588
581
|
)
|
|
582
|
+
return response
|
|
589
583
|
|
|
590
584
|
async def _fetch_curl(
|
|
591
585
|
self,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|