phantomfetch 0.6.0__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/PKG-INFO +3 -3
  2. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/pyproject.toml +3 -3
  3. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/fetch.py +120 -126
  4. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/README.md +0 -0
  5. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/__init__.py +0 -0
  6. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/cache.py +0 -0
  7. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/captcha.py +0 -0
  8. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/engines/__init__.py +0 -0
  9. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/engines/base.py +0 -0
  10. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/engines/browser/__init__.py +0 -0
  11. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/engines/browser/actions.py +0 -0
  12. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/engines/browser/cdp.py +0 -0
  13. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/engines/curl.py +0 -0
  14. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/pool.py +0 -0
  15. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/presets.py +0 -0
  16. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/registry.py +0 -0
  17. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/strategy_advisor.py +0 -0
  18. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/telemetry.py +0 -0
  19. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/tools/adaptive/__init__.py +0 -0
  20. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/tools/adaptive/fingerprint.py +0 -0
  21. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/tools/adaptive/matcher.py +0 -0
  22. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/tools/adaptive/store.py +0 -0
  23. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/tools/selector_builder.py +0 -0
  24. {phantomfetch-0.6.0 → phantomfetch-0.6.1}/src/phantomfetch/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: phantomfetch
3
- Version: 0.6.0
3
+ Version: 0.6.1
4
4
  Summary: High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities
5
5
  Keywords: web-scraping,playwright,curl-cffi,async,browser-automation,http-client,agentic,anti-detection
6
6
  Author: CosmicBull
@@ -26,14 +26,14 @@ Requires-Dist: opentelemetry-sdk>=1.38.0
26
26
  Requires-Dist: loguru>=0.7.3
27
27
  Requires-Dist: beautifulsoup4>=4.14.3
28
28
  Requires-Dist: lmdb>=1.4.1
29
- Requires-Dist: cloakbrowser[geoip]>=0.3.31 ; extra == 'all'
29
+ Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'all'
30
30
  Requires-Dist: camoufox[geoip]>=0.4.11 ; extra == 'all'
31
31
  Requires-Dist: rebrowser-playwright>=1.52.0 ; extra == 'all'
32
32
  Requires-Dist: patchright>=1.60.1 ; extra == 'all'
33
33
  Requires-Dist: maxminddb>=2.0.0 ; extra == 'all'
34
34
  Requires-Dist: camoufox[geoip]>=0.4.11 ; extra == 'camoufox'
35
35
  Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'cloakbrowser'
36
- Requires-Dist: cloakbrowser[geoip]>=0.3.31 ; extra == 'geoip'
36
+ Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'geoip'
37
37
  Requires-Dist: maxminddb>=2.0.0 ; extra == 'geoip'
38
38
  Requires-Dist: patchright>=1.60.1 ; extra == 'patchright'
39
39
  Requires-Dist: rebrowser-playwright>=1.52.0 ; extra == 'rebrowser'
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "phantomfetch"
3
- version = "0.6.0"
3
+ version = "0.6.1"
4
4
  description = "High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.13"
@@ -45,11 +45,11 @@ dependencies = [
45
45
 
46
46
  [project.optional-dependencies]
47
47
  cloakbrowser = ["cloakbrowser>=0.3.31"]
48
- geoip = ["cloakbrowser[geoip]>=0.3.31", "maxminddb>=2.0.0"]
48
+ geoip = ["cloakbrowser>=0.3.31", "maxminddb>=2.0.0"]
49
49
  camoufox = ["camoufox[geoip]>=0.4.11"]
50
50
  rebrowser = ["rebrowser-playwright>=1.52.0"]
51
51
  patchright = ["patchright>=1.60.1"]
52
- all = ["cloakbrowser[geoip]>=0.3.31", "camoufox[geoip]>=0.4.11", "rebrowser-playwright>=1.52.0", "patchright>=1.60.1", "maxminddb>=2.0.0"]
52
+ all = ["cloakbrowser>=0.3.31", "camoufox[geoip]>=0.4.11", "rebrowser-playwright>=1.52.0", "patchright>=1.60.1", "maxminddb>=2.0.0"]
53
53
 
54
54
  [project.urls]
55
55
  Homepage = "https://github.com/iristech-systems/PhantomFetch"
@@ -403,95 +403,125 @@ class Fetcher:
403
403
 
404
404
  span.set_attribute("phantomfetch.cache.hit", False)
405
405
 
406
- # Get proxy
407
- # 1. Manual override
408
- selected_proxy: Proxy | None = None
409
- if proxy:
410
- if isinstance(proxy, str):
411
- selected_proxy = Proxy(
412
- url=proxy, metadata={"source": "manual_override"}
413
- )
414
- else:
415
- selected_proxy = proxy
406
+ # Resolve retry configurations
407
+ actual_max_retries = max_retries if max_retries is not None else self.max_retries
408
+ attempts = max(1, actual_max_retries)
409
+ if retry_on is None:
410
+ retry_status_codes: set[int] = {0, 429, 500, 502, 503, 504}
411
+ elif isinstance(retry_on, int):
412
+ retry_status_codes = {retry_on}
413
+ else:
414
+ retry_status_codes = set(retry_on)
415
+
416
+ backoff_base = 2.0 if retry_backoff is None else retry_backoff
416
417
 
417
- # 2. From Pool (if no override)
418
- if not selected_proxy:
419
- selected_proxy = self.proxy_pool.get(url=url, location=location)
418
+ last_resp: Response | None = None
420
419
 
421
- if selected_proxy:
422
- span.set_attribute("phantomfetch.proxy", selected_proxy.url)
423
- if selected_proxy.vendor:
424
- span.set_attribute(
425
- "phantomfetch.proxy.vendor", selected_proxy.vendor
426
- )
427
- if selected_proxy.proxy_type:
428
- span.set_attribute(
429
- "phantomfetch.proxy.type", selected_proxy.proxy_type
430
- )
431
- if selected_proxy.location:
432
- span.set_attribute(
433
- "phantomfetch.proxy.location", selected_proxy.location
420
+ for attempt in range(attempts):
421
+ # Get proxy
422
+ # 1. Manual override
423
+ selected_proxy: Proxy | None = None
424
+ if proxy:
425
+ if isinstance(proxy, str):
426
+ selected_proxy = Proxy(
427
+ url=proxy, metadata={"source": "manual_override"}
428
+ )
429
+ else:
430
+ selected_proxy = proxy
431
+
432
+ # 2. From Pool (if no override)
433
+ if not selected_proxy:
434
+ selected_proxy = self.proxy_pool.get(url=url, location=location)
435
+
436
+ if selected_proxy:
437
+ span.set_attribute("phantomfetch.proxy", selected_proxy.url)
438
+ if selected_proxy.vendor:
439
+ span.set_attribute(
440
+ "phantomfetch.proxy.vendor", selected_proxy.vendor
441
+ )
442
+ if selected_proxy.proxy_type:
443
+ span.set_attribute(
444
+ "phantomfetch.proxy.type", selected_proxy.proxy_type
445
+ )
446
+ if selected_proxy.location:
447
+ span.set_attribute(
448
+ "phantomfetch.proxy.location", selected_proxy.location
449
+ )
450
+ if selected_proxy.provider:
451
+ span.set_attribute(
452
+ "phantomfetch.proxy.provider", selected_proxy.provider
453
+ )
454
+
455
+ # Route to engine
456
+ if engine == "browser":
457
+ resp = await self._fetch_browser(
458
+ url=url,
459
+ proxy=selected_proxy,
460
+ headers=headers,
461
+ cookies=cookies,
462
+ actions=normalized_actions,
463
+ timeout=timeout or self.browser_timeout,
464
+ location=location,
465
+ wait_until=wait_until,
466
+ block_resources=block_resources,
467
+ wait_for_url=wait_for_url,
468
+ storage_state=self.session_data, # Pass current session
469
+ stealth=stealth,
434
470
  )
435
- if selected_proxy.provider:
436
- span.set_attribute(
437
- "phantomfetch.proxy.provider", selected_proxy.provider
471
+ else:
472
+ resp = await self._fetch_curl(
473
+ url=url,
474
+ proxy=selected_proxy,
475
+ headers=headers,
476
+ cookies=cookies,
477
+ timeout=timeout or self.timeout,
478
+ max_retries=1, # Prevent internal retries so we can rotate proxies
479
+ retry_on=retry_on,
480
+ retry_backoff=retry_backoff,
481
+ referer=referer,
482
+ allow_redirects=allow_redirects,
438
483
  )
439
-
440
- # Route to engine
441
- if engine == "browser":
442
- resp = await self._fetch_browser(
443
- url=url,
444
- proxy=selected_proxy,
445
- headers=headers,
446
- cookies=cookies,
447
- actions=normalized_actions,
448
- timeout=timeout or self.browser_timeout,
449
- location=location,
450
- wait_until=wait_until,
451
- block_resources=block_resources,
452
- wait_for_url=wait_for_url,
453
- storage_state=self.session_data, # Pass current session
454
- stealth=stealth,
455
- max_retries=self.max_retries
456
- if max_retries is None
457
- else max_retries,
458
- retry_on=retry_on,
459
- retry_backoff=retry_backoff,
484
+
485
+ last_resp = resp
486
+
487
+ # Evaluate proxy status
488
+ if not proxy and selected_proxy:
489
+ if resp.ok:
490
+ self.proxy_pool.mark_success(selected_proxy)
491
+ elif resp.error or resp.status in retry_status_codes:
492
+ # Mark proxy as failed so a fresh one is used on retry
493
+ self.proxy_pool.mark_failed(selected_proxy)
494
+
495
+ # Should we retry?
496
+ should_retry = (
497
+ attempt < attempts - 1 and resp.status in retry_status_codes
460
498
  )
461
- else:
462
- resp = await self._fetch_curl(
499
+ if not should_retry:
500
+ break
501
+
502
+ # Backoff before getting a new proxy
503
+ wait = backoff_base**attempt * (0.5 + random.random())
504
+ logger.warning(
505
+ f"[{engine}] Retryable status {resp.status}, "
506
+ f"attempt {attempt + 1}/{attempts}, retrying in {wait:.2f}s"
507
+ )
508
+ await asyncio.sleep(wait)
509
+
510
+ if not last_resp:
511
+ last_resp = Response(
463
512
  url=url,
464
- proxy=selected_proxy,
465
- headers=headers,
466
- cookies=cookies,
467
- timeout=timeout or self.timeout,
468
- max_retries=self.max_retries
469
- if max_retries is None
470
- else max_retries,
471
- retry_on=retry_on,
472
- retry_backoff=retry_backoff,
473
- referer=referer,
474
- allow_redirects=allow_redirects,
513
+ status=0,
514
+ body=b"",
515
+ engine=engine,
516
+ error="Max retries exhausted",
475
517
  )
518
+
519
+ resp = last_resp
476
520
 
477
521
  # Update session data from response if present
478
522
  if resp.storage_state:
479
523
  self.session_data = resp.storage_state
480
524
 
481
- # Update proxy stats (ONLY if it came from the pool, or generally?)
482
- # If manual override, we might NOT want to impact the pool stats unless the manual proxy IS in the pool?
483
- # For simplicity, if we have a pool, and this proxy matches one in the pool, we could update it.
484
- # But the 'selected_proxy' might be a new instance specific to this request (manual override).
485
- # The pool.mark_* methods take a Proxy object.
486
-
487
- # Logic: If 'proxy' argument was None, it came from pool -> Update stats.
488
- # If 'proxy' argument was set -> Do NOT update pool stats (it's a manual override).
489
- if not proxy and selected_proxy:
490
- if resp.ok:
491
- self.proxy_pool.mark_success(selected_proxy)
492
- elif resp.error:
493
- self.proxy_pool.mark_failed(selected_proxy)
494
-
495
525
  # Cache response
496
526
  if self.cache and resp.ok and self.cache.should_cache_request("document"):
497
527
  # Re-generate key (same logic as above)
@@ -523,9 +553,6 @@ class Fetcher:
523
553
  wait_for_url: str | None = None,
524
554
  storage_state: dict[str, Any] | None = None,
525
555
  stealth: bool = False,
526
- max_retries: int = 1,
527
- retry_on: Collection[int] | int | None = None,
528
- retry_backoff: float | None = None,
529
556
  ) -> Response:
530
557
  async with self._semaphore:
531
558
  async with self._browser_semaphore:
@@ -538,54 +565,21 @@ class Fetcher:
538
565
  error="CDP engine not initialized",
539
566
  )
540
567
 
541
- attempts = max(1, max_retries)
542
- if retry_on is None:
543
- retry_status_codes: set[int] = set()
544
- elif isinstance(retry_on, int):
545
- retry_status_codes = {retry_on}
546
- else:
547
- retry_status_codes = set(retry_on)
548
-
549
- backoff_base = 2.0 if retry_backoff is None else retry_backoff
550
- last_response: Response | None = None
551
-
552
- for attempt in range(attempts):
553
- response = await self._cdp_engine.fetch(
554
- url=url,
555
- proxy=proxy,
556
- headers=headers,
557
- cookies=cookies,
558
- actions=actions,
559
- timeout=timeout,
560
- location=location,
561
- wait_until=wait_until,
562
- block_resources=block_resources,
563
- wait_for_url=wait_for_url,
564
- storage_state=storage_state,
565
- stealth=stealth,
566
- )
567
- last_response = response
568
-
569
- should_retry = (
570
- attempt < attempts - 1 and response.status in retry_status_codes
571
- )
572
- if not should_retry:
573
- return response
574
-
575
- wait = backoff_base**attempt * (0.5 + random.random())
576
- logger.warning(
577
- f"[browser] Retryable status {response.status}, "
578
- f"attempt {attempt + 1}/{attempts}, retrying in {wait:.2f}s"
579
- )
580
- await asyncio.sleep(wait)
581
-
582
- return last_response or Response(
568
+ response = await self._cdp_engine.fetch(
583
569
  url=url,
584
- status=0,
585
- body=b"",
586
- engine="browser",
587
- error="Max retries exhausted",
570
+ proxy=proxy,
571
+ headers=headers,
572
+ cookies=cookies,
573
+ actions=actions,
574
+ timeout=timeout,
575
+ location=location,
576
+ wait_until=wait_until,
577
+ block_resources=block_resources,
578
+ wait_for_url=wait_for_url,
579
+ storage_state=storage_state,
580
+ stealth=stealth,
588
581
  )
582
+ return response
589
583
 
590
584
  async def _fetch_curl(
591
585
  self,
File without changes