phantomfetch 0.6.0__tar.gz → 0.6.2__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
Files changed (24) hide show
  1. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/PKG-INFO +3 -3
  2. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/pyproject.toml +3 -3
  3. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/browser/actions.py +21 -17
  4. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/fetch.py +120 -126
  5. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/README.md +0 -0
  6. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/__init__.py +0 -0
  7. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/cache.py +0 -0
  8. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/captcha.py +0 -0
  9. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/__init__.py +0 -0
  10. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/base.py +0 -0
  11. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/browser/__init__.py +0 -0
  12. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/browser/cdp.py +0 -0
  13. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/curl.py +0 -0
  14. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/pool.py +0 -0
  15. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/presets.py +0 -0
  16. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/registry.py +0 -0
  17. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/strategy_advisor.py +0 -0
  18. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/telemetry.py +0 -0
  19. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/adaptive/__init__.py +0 -0
  20. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/adaptive/fingerprint.py +0 -0
  21. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/adaptive/matcher.py +0 -0
  22. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/adaptive/store.py +0 -0
  23. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/selector_builder.py +0 -0
  24. {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: phantomfetch
3
- Version: 0.6.0
3
+ Version: 0.6.2
4
4
  Summary: High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities
5
5
  Keywords: web-scraping,playwright,curl-cffi,async,browser-automation,http-client,agentic,anti-detection
6
6
  Author: CosmicBull
@@ -26,14 +26,14 @@ Requires-Dist: opentelemetry-sdk>=1.38.0
26
26
  Requires-Dist: loguru>=0.7.3
27
27
  Requires-Dist: beautifulsoup4>=4.14.3
28
28
  Requires-Dist: lmdb>=1.4.1
29
- Requires-Dist: cloakbrowser[geoip]>=0.3.31 ; extra == 'all'
29
+ Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'all'
30
30
  Requires-Dist: camoufox[geoip]>=0.4.11 ; extra == 'all'
31
31
  Requires-Dist: rebrowser-playwright>=1.52.0 ; extra == 'all'
32
32
  Requires-Dist: patchright>=1.60.1 ; extra == 'all'
33
33
  Requires-Dist: maxminddb>=2.0.0 ; extra == 'all'
34
34
  Requires-Dist: camoufox[geoip]>=0.4.11 ; extra == 'camoufox'
35
35
  Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'cloakbrowser'
36
- Requires-Dist: cloakbrowser[geoip]>=0.3.31 ; extra == 'geoip'
36
+ Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'geoip'
37
37
  Requires-Dist: maxminddb>=2.0.0 ; extra == 'geoip'
38
38
  Requires-Dist: patchright>=1.60.1 ; extra == 'patchright'
39
39
  Requires-Dist: rebrowser-playwright>=1.52.0 ; extra == 'rebrowser'
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "phantomfetch"
3
- version = "0.6.0"
3
+ version = "0.6.2"
4
4
  description = "High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.13"
@@ -45,11 +45,11 @@ dependencies = [
45
45
 
46
46
  [project.optional-dependencies]
47
47
  cloakbrowser = ["cloakbrowser>=0.3.31"]
48
- geoip = ["cloakbrowser[geoip]>=0.3.31", "maxminddb>=2.0.0"]
48
+ geoip = ["cloakbrowser>=0.3.31", "maxminddb>=2.0.0"]
49
49
  camoufox = ["camoufox[geoip]>=0.4.11"]
50
50
  rebrowser = ["rebrowser-playwright>=1.52.0"]
51
51
  patchright = ["patchright>=1.60.1"]
52
- all = ["cloakbrowser[geoip]>=0.3.31", "camoufox[geoip]>=0.4.11", "rebrowser-playwright>=1.52.0", "patchright>=1.60.1", "maxminddb>=2.0.0"]
52
+ all = ["cloakbrowser>=0.3.31", "camoufox[geoip]>=0.4.11", "rebrowser-playwright>=1.52.0", "patchright>=1.60.1", "maxminddb>=2.0.0"]
53
53
 
54
54
  [project.urls]
55
55
  Homepage = "https://github.com/iristech-systems/PhantomFetch"
@@ -58,6 +58,11 @@ async def _human_mouse_move(page: "Page", element_handle: Any):
58
58
  # We can add a slight overshoot/correction if we want to be fancy, but `steps` is 1st iterated humanization.
59
59
 
60
60
 
61
+
62
+ def _is_page(obj: Any) -> bool:
63
+ return obj.__class__.__name__ == "Page"
64
+
65
+
61
66
  async def execute_actions(
62
67
  page: "Page | Locator", actions: list[Action]
63
68
  ) -> list["ActionResult"]:
@@ -71,7 +76,6 @@ async def execute_actions(
71
76
  Returns:
72
77
  List of ActionResult objects
73
78
  """
74
- from playwright.async_api import Page
75
79
 
76
80
  from ...types import ActionResult
77
81
 
@@ -142,7 +146,7 @@ async def execute_actions(
142
146
 
143
147
  # If scope is explicitly 'page', force usage of root page
144
148
  if action.scope == "page":
145
- ctx = page if isinstance(page, Page) else page.page
149
+ ctx = page if _is_page(page) else page.page
146
150
 
147
151
  start_time = time.perf_counter()
148
152
  result = ActionResult(action=action, success=True)
@@ -162,7 +166,7 @@ async def execute_actions(
162
166
  case "wait":
163
167
  if action.selector:
164
168
  state = action.state or "visible"
165
- if isinstance(ctx, Page):
169
+ if _is_page(ctx):
166
170
  await ctx.wait_for_selector(
167
171
  action.selector,
168
172
  timeout=action.timeout,
@@ -174,7 +178,7 @@ async def execute_actions(
174
178
  timeout=action.timeout, state=state
175
179
  )
176
180
  elif action.timeout:
177
- target_page = ctx if isinstance(ctx, Page) else ctx.page
181
+ target_page = ctx if _is_page(ctx) else ctx.page
178
182
  await target_page.wait_for_timeout(action.timeout)
179
183
 
180
184
  case "loop":
@@ -216,7 +220,7 @@ async def execute_actions(
216
220
  if action.human_like:
217
221
  # Human-like click
218
222
  # Resolve handle first
219
- if isinstance(ctx, Page):
223
+ if _is_page(ctx):
220
224
  handle = await ctx.wait_for_selector(
221
225
  action.selector,
222
226
  timeout=action.timeout,
@@ -233,7 +237,7 @@ async def execute_actions(
233
237
  if handle:
234
238
  # Need page for mouse move
235
239
  target_page = (
236
- ctx if isinstance(ctx, Page) else ctx.page
240
+ ctx if _is_page(ctx) else ctx.page
237
241
  )
238
242
  await _human_mouse_move(target_page, handle)
239
243
  await handle.click(delay=random.randint(50, 150))
@@ -243,7 +247,7 @@ async def execute_actions(
243
247
  timeout=action.timeout,
244
248
  )
245
249
  # Context click (no selector)
246
- elif isinstance(ctx, Page):
250
+ elif _is_page(ctx):
247
251
  result.success = False
248
252
  result.error = "Click action on Page requires a selector"
249
253
  elif action.human_like:
@@ -262,7 +266,7 @@ async def execute_actions(
262
266
  val_str = str(action.value)
263
267
  if action.human_like:
264
268
  await ctx.click(action.selector, timeout=action.timeout)
265
- target_page = ctx if isinstance(ctx, Page) else ctx.page
269
+ target_page = ctx if _is_page(ctx) else ctx.page
266
270
  await _human_type(target_page, val_str)
267
271
  else:
268
272
  await ctx.fill(
@@ -271,7 +275,7 @@ async def execute_actions(
271
275
  timeout=action.timeout,
272
276
  )
273
277
  # Input into self (ctx is locator)
274
- elif isinstance(ctx, Page):
278
+ elif _is_page(ctx):
275
279
  result.success = False
276
280
  result.error = "Input action on Page requires a selector"
277
281
  else:
@@ -286,7 +290,7 @@ async def execute_actions(
286
290
  case "scroll":
287
291
  # Scroll usually implies page-level or element-level scroll
288
292
  # For now, keep page level logic mostly
289
- target_page = ctx if isinstance(ctx, Page) else ctx.page
293
+ target_page = ctx if _is_page(ctx) else ctx.page
290
294
 
291
295
  if action.selector == "top":
292
296
  await target_page.evaluate("window.scrollTo(0, 0)")
@@ -444,13 +448,13 @@ async def execute_actions(
444
448
  # locator.locator(selector).select_option(...) logic
445
449
  value=str(action.value),
446
450
  timeout=action.timeout,
447
- ) if isinstance(ctx, Page) else await ctx.locator(
451
+ ) if _is_page(ctx) else await ctx.locator(
448
452
  action.selector
449
453
  ).select_option(str(action.value), timeout=action.timeout)
450
454
 
451
455
  case "hover":
452
456
  if action.selector:
453
- if isinstance(ctx, Page):
457
+ if _is_page(ctx):
454
458
  await ctx.hover(action.selector, timeout=action.timeout)
455
459
  else:
456
460
  await ctx.locator(action.selector).hover(
@@ -465,7 +469,7 @@ async def execute_actions(
465
469
  kwargs = {}
466
470
  if action.full_page:
467
471
  kwargs["full_page"] = True
468
- if not isinstance(ctx, Page):
472
+ if not _is_page(ctx):
469
473
  # If we are in a Locator (e.g. inside loop loop), but want full page,
470
474
  # we must switch to the page context.
471
475
  screenshot_ctx = ctx.page
@@ -491,7 +495,7 @@ async def execute_actions(
491
495
  result.data = img_bytes
492
496
 
493
497
  case "wait_for_load":
494
- target_page = ctx if isinstance(ctx, Page) else ctx.page
498
+ target_page = ctx if _is_page(ctx) else ctx.page
495
499
  await target_page.wait_for_load_state(
496
500
  "networkidle", timeout=action.timeout
497
501
  )
@@ -504,7 +508,7 @@ async def execute_actions(
504
508
  case "validate":
505
509
  try:
506
510
  state = action.state or "attached"
507
- if isinstance(ctx, Page):
511
+ if _is_page(ctx):
508
512
  await ctx.wait_for_selector(
509
513
  action.selector,
510
514
  timeout=action.timeout or 5000,
@@ -523,7 +527,7 @@ async def execute_actions(
523
527
 
524
528
  case "solve_captcha":
525
529
  # Requires Page context for solver
526
- target_page = ctx if isinstance(ctx, Page) else ctx.page
530
+ target_page = ctx if _is_page(ctx) else ctx.page
527
531
 
528
532
  if action.provider in ("cdp", "scraping_browser"):
529
533
  from ...captcha import CDPSolver
@@ -552,7 +556,7 @@ async def execute_actions(
552
556
  if action.selector:
553
557
  # Check visibility/existence
554
558
  try:
555
- if isinstance(ctx, Page):
559
+ if _is_page(ctx):
556
560
  # Use strict=False, state=visible/attached?
557
561
  # Just check count > 0 or wait with short timeout?
558
562
  # Let's use is_visible or check count to avoid waiting if timeout=0
@@ -403,95 +403,125 @@ class Fetcher:
403
403
 
404
404
  span.set_attribute("phantomfetch.cache.hit", False)
405
405
 
406
- # Get proxy
407
- # 1. Manual override
408
- selected_proxy: Proxy | None = None
409
- if proxy:
410
- if isinstance(proxy, str):
411
- selected_proxy = Proxy(
412
- url=proxy, metadata={"source": "manual_override"}
413
- )
414
- else:
415
- selected_proxy = proxy
406
+ # Resolve retry configurations
407
+ actual_max_retries = max_retries if max_retries is not None else self.max_retries
408
+ attempts = max(1, actual_max_retries)
409
+ if retry_on is None:
410
+ retry_status_codes: set[int] = {0, 429, 500, 502, 503, 504}
411
+ elif isinstance(retry_on, int):
412
+ retry_status_codes = {retry_on}
413
+ else:
414
+ retry_status_codes = set(retry_on)
415
+
416
+ backoff_base = 2.0 if retry_backoff is None else retry_backoff
416
417
 
417
- # 2. From Pool (if no override)
418
- if not selected_proxy:
419
- selected_proxy = self.proxy_pool.get(url=url, location=location)
418
+ last_resp: Response | None = None
420
419
 
421
- if selected_proxy:
422
- span.set_attribute("phantomfetch.proxy", selected_proxy.url)
423
- if selected_proxy.vendor:
424
- span.set_attribute(
425
- "phantomfetch.proxy.vendor", selected_proxy.vendor
426
- )
427
- if selected_proxy.proxy_type:
428
- span.set_attribute(
429
- "phantomfetch.proxy.type", selected_proxy.proxy_type
430
- )
431
- if selected_proxy.location:
432
- span.set_attribute(
433
- "phantomfetch.proxy.location", selected_proxy.location
420
+ for attempt in range(attempts):
421
+ # Get proxy
422
+ # 1. Manual override
423
+ selected_proxy: Proxy | None = None
424
+ if proxy:
425
+ if isinstance(proxy, str):
426
+ selected_proxy = Proxy(
427
+ url=proxy, metadata={"source": "manual_override"}
428
+ )
429
+ else:
430
+ selected_proxy = proxy
431
+
432
+ # 2. From Pool (if no override)
433
+ if not selected_proxy:
434
+ selected_proxy = self.proxy_pool.get(url=url, location=location)
435
+
436
+ if selected_proxy:
437
+ span.set_attribute("phantomfetch.proxy", selected_proxy.url)
438
+ if selected_proxy.vendor:
439
+ span.set_attribute(
440
+ "phantomfetch.proxy.vendor", selected_proxy.vendor
441
+ )
442
+ if selected_proxy.proxy_type:
443
+ span.set_attribute(
444
+ "phantomfetch.proxy.type", selected_proxy.proxy_type
445
+ )
446
+ if selected_proxy.location:
447
+ span.set_attribute(
448
+ "phantomfetch.proxy.location", selected_proxy.location
449
+ )
450
+ if selected_proxy.provider:
451
+ span.set_attribute(
452
+ "phantomfetch.proxy.provider", selected_proxy.provider
453
+ )
454
+
455
+ # Route to engine
456
+ if engine == "browser":
457
+ resp = await self._fetch_browser(
458
+ url=url,
459
+ proxy=selected_proxy,
460
+ headers=headers,
461
+ cookies=cookies,
462
+ actions=normalized_actions,
463
+ timeout=timeout or self.browser_timeout,
464
+ location=location,
465
+ wait_until=wait_until,
466
+ block_resources=block_resources,
467
+ wait_for_url=wait_for_url,
468
+ storage_state=self.session_data, # Pass current session
469
+ stealth=stealth,
434
470
  )
435
- if selected_proxy.provider:
436
- span.set_attribute(
437
- "phantomfetch.proxy.provider", selected_proxy.provider
471
+ else:
472
+ resp = await self._fetch_curl(
473
+ url=url,
474
+ proxy=selected_proxy,
475
+ headers=headers,
476
+ cookies=cookies,
477
+ timeout=timeout or self.timeout,
478
+ max_retries=1, # Prevent internal retries so we can rotate proxies
479
+ retry_on=retry_on,
480
+ retry_backoff=retry_backoff,
481
+ referer=referer,
482
+ allow_redirects=allow_redirects,
438
483
  )
439
-
440
- # Route to engine
441
- if engine == "browser":
442
- resp = await self._fetch_browser(
443
- url=url,
444
- proxy=selected_proxy,
445
- headers=headers,
446
- cookies=cookies,
447
- actions=normalized_actions,
448
- timeout=timeout or self.browser_timeout,
449
- location=location,
450
- wait_until=wait_until,
451
- block_resources=block_resources,
452
- wait_for_url=wait_for_url,
453
- storage_state=self.session_data, # Pass current session
454
- stealth=stealth,
455
- max_retries=self.max_retries
456
- if max_retries is None
457
- else max_retries,
458
- retry_on=retry_on,
459
- retry_backoff=retry_backoff,
484
+
485
+ last_resp = resp
486
+
487
+ # Evaluate proxy status
488
+ if not proxy and selected_proxy:
489
+ if resp.ok:
490
+ self.proxy_pool.mark_success(selected_proxy)
491
+ elif resp.error or resp.status in retry_status_codes:
492
+ # Mark proxy as failed so a fresh one is used on retry
493
+ self.proxy_pool.mark_failed(selected_proxy)
494
+
495
+ # Should we retry?
496
+ should_retry = (
497
+ attempt < attempts - 1 and resp.status in retry_status_codes
460
498
  )
461
- else:
462
- resp = await self._fetch_curl(
499
+ if not should_retry:
500
+ break
501
+
502
+ # Backoff before getting a new proxy
503
+ wait = backoff_base**attempt * (0.5 + random.random())
504
+ logger.warning(
505
+ f"[{engine}] Retryable status {resp.status}, "
506
+ f"attempt {attempt + 1}/{attempts}, retrying in {wait:.2f}s"
507
+ )
508
+ await asyncio.sleep(wait)
509
+
510
+ if not last_resp:
511
+ last_resp = Response(
463
512
  url=url,
464
- proxy=selected_proxy,
465
- headers=headers,
466
- cookies=cookies,
467
- timeout=timeout or self.timeout,
468
- max_retries=self.max_retries
469
- if max_retries is None
470
- else max_retries,
471
- retry_on=retry_on,
472
- retry_backoff=retry_backoff,
473
- referer=referer,
474
- allow_redirects=allow_redirects,
513
+ status=0,
514
+ body=b"",
515
+ engine=engine,
516
+ error="Max retries exhausted",
475
517
  )
518
+
519
+ resp = last_resp
476
520
 
477
521
  # Update session data from response if present
478
522
  if resp.storage_state:
479
523
  self.session_data = resp.storage_state
480
524
 
481
- # Update proxy stats (ONLY if it came from the pool, or generally?)
482
- # If manual override, we might NOT want to impact the pool stats unless the manual proxy IS in the pool?
483
- # For simplicity, if we have a pool, and this proxy matches one in the pool, we could update it.
484
- # But the 'selected_proxy' might be a new instance specific to this request (manual override).
485
- # The pool.mark_* methods take a Proxy object.
486
-
487
- # Logic: If 'proxy' argument was None, it came from pool -> Update stats.
488
- # If 'proxy' argument was set -> Do NOT update pool stats (it's a manual override).
489
- if not proxy and selected_proxy:
490
- if resp.ok:
491
- self.proxy_pool.mark_success(selected_proxy)
492
- elif resp.error:
493
- self.proxy_pool.mark_failed(selected_proxy)
494
-
495
525
  # Cache response
496
526
  if self.cache and resp.ok and self.cache.should_cache_request("document"):
497
527
  # Re-generate key (same logic as above)
@@ -523,9 +553,6 @@ class Fetcher:
523
553
  wait_for_url: str | None = None,
524
554
  storage_state: dict[str, Any] | None = None,
525
555
  stealth: bool = False,
526
- max_retries: int = 1,
527
- retry_on: Collection[int] | int | None = None,
528
- retry_backoff: float | None = None,
529
556
  ) -> Response:
530
557
  async with self._semaphore:
531
558
  async with self._browser_semaphore:
@@ -538,54 +565,21 @@ class Fetcher:
538
565
  error="CDP engine not initialized",
539
566
  )
540
567
 
541
- attempts = max(1, max_retries)
542
- if retry_on is None:
543
- retry_status_codes: set[int] = set()
544
- elif isinstance(retry_on, int):
545
- retry_status_codes = {retry_on}
546
- else:
547
- retry_status_codes = set(retry_on)
548
-
549
- backoff_base = 2.0 if retry_backoff is None else retry_backoff
550
- last_response: Response | None = None
551
-
552
- for attempt in range(attempts):
553
- response = await self._cdp_engine.fetch(
554
- url=url,
555
- proxy=proxy,
556
- headers=headers,
557
- cookies=cookies,
558
- actions=actions,
559
- timeout=timeout,
560
- location=location,
561
- wait_until=wait_until,
562
- block_resources=block_resources,
563
- wait_for_url=wait_for_url,
564
- storage_state=storage_state,
565
- stealth=stealth,
566
- )
567
- last_response = response
568
-
569
- should_retry = (
570
- attempt < attempts - 1 and response.status in retry_status_codes
571
- )
572
- if not should_retry:
573
- return response
574
-
575
- wait = backoff_base**attempt * (0.5 + random.random())
576
- logger.warning(
577
- f"[browser] Retryable status {response.status}, "
578
- f"attempt {attempt + 1}/{attempts}, retrying in {wait:.2f}s"
579
- )
580
- await asyncio.sleep(wait)
581
-
582
- return last_response or Response(
568
+ response = await self._cdp_engine.fetch(
583
569
  url=url,
584
- status=0,
585
- body=b"",
586
- engine="browser",
587
- error="Max retries exhausted",
570
+ proxy=proxy,
571
+ headers=headers,
572
+ cookies=cookies,
573
+ actions=actions,
574
+ timeout=timeout,
575
+ location=location,
576
+ wait_until=wait_until,
577
+ block_resources=block_resources,
578
+ wait_for_url=wait_for_url,
579
+ storage_state=storage_state,
580
+ stealth=stealth,
588
581
  )
582
+ return response
589
583
 
590
584
  async def _fetch_curl(
591
585
  self,
File without changes