phantomfetch 0.4.7__tar.gz → 0.4.8__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: phantomfetch
3
- Version: 0.4.7
3
+ Version: 0.4.8
4
4
  Summary: High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities
5
5
  Keywords: web-scraping,playwright,curl-cffi,async,browser-automation,http-client,agentic,anti-detection
6
6
  Author: CosmicBull
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "phantomfetch"
3
- version = "0.4.7"
3
+ version = "0.4.8"
4
4
  description = "High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.13"
@@ -1,11 +1,11 @@
1
1
  import asyncio
2
2
  import json
3
3
  import os
4
- from typing import Any, Literal, cast
4
+ from typing import Any, Literal
5
5
 
6
6
  from loguru import logger
7
7
 
8
- from .cache import Cache, FileSystemCache
8
+ from .cache import Cache
9
9
  from .engines import CDPEngine, CurlEngine
10
10
  from .pool import ProxyPool
11
11
  from .telemetry import get_tracer
@@ -46,6 +46,7 @@ class Fetcher:
46
46
  cache: Cache | bool | None = None,
47
47
  # Advanced CDP
48
48
  cdp_use_existing_page: bool = True,
49
+ cdp_connection_type: str = "cdp",
49
50
  # BrowserForge fingerprinting
50
51
  fingerprint: bool = True,
51
52
  fingerprint_options: dict[str, Any] | None = None,
@@ -97,9 +98,11 @@ class Fetcher:
97
98
  timeout=timeout,
98
99
  max_retries=max_retries,
99
100
  )
100
-
101
+
101
102
  if browser_engine == "baas":
102
- raise ValueError("BaaSEngine has been removed. Please use 'cdp' engine or contact support.")
103
+ raise ValueError(
104
+ "BaaSEngine has been removed. Please use 'cdp' engine or contact support."
105
+ )
103
106
 
104
107
  # Browser engine
105
108
  # Always use CDPEngine now
@@ -109,6 +112,7 @@ class Fetcher:
109
112
  timeout=browser_timeout,
110
113
  cache=self.cache,
111
114
  use_existing_page=cdp_use_existing_page,
115
+ cdp_connection_type=cdp_connection_type,
112
116
  fingerprint=fingerprint,
113
117
  fingerprint_options=fingerprint_options,
114
118
  browser_type=browser_type,
@@ -268,7 +272,7 @@ class Fetcher:
268
272
  proxy: Specific proxy to use (overrides pool)
269
273
  location: Geo location for proxy selection
270
274
  actions: List of `Action` objects or dicts (implies engine="browser")
271
-
275
+
272
276
  ...
273
277
  """
274
278
  # Normalize actions - implies browser
@@ -319,21 +323,21 @@ class Fetcher:
319
323
  cache_key_parts = [engine, url]
320
324
  if location:
321
325
  cache_key_parts.append(f"loc={location}")
322
-
326
+
323
327
  # Use proxy URL for cache key if explicit proxy is used
324
328
  # Note: We rely on the proxy *argument* (manual override) for this separation.
325
- # If using pool, we typically don't split cache by specific pool proxy,
329
+ # If using pool, we typically don't split cache by specific pool proxy,
326
330
  # UNLESS location was requested (handled above).
327
- if proxy:
331
+ if proxy:
328
332
  # If manual proxy string/object provided, include it
329
333
  p_url = proxy if isinstance(proxy, str) else proxy.url
330
- # Sanitize sensitive info? user:pass might be sensitive,
331
- # but for cache key uniqueness it's needed.
334
+ # Sanitize sensitive info? user:pass might be sensitive,
335
+ # but for cache key uniqueness it's needed.
332
336
  # Since MD5 is used downstream, it's somewhat obscured.
333
337
  cache_key_parts.append(f"proxy={p_url}")
334
338
 
335
339
  cache_key = ":".join(cache_key_parts)
336
-
340
+
337
341
  cached_resp = await self.cache.get(cache_key)
338
342
  if cached_resp:
339
343
  cached_resp.from_cache = True
@@ -347,10 +351,12 @@ class Fetcher:
347
351
  selected_proxy: Proxy | None = None
348
352
  if proxy:
349
353
  if isinstance(proxy, str):
350
- selected_proxy = Proxy(url=proxy, metadata={"source": "manual_override"})
354
+ selected_proxy = Proxy(
355
+ url=proxy, metadata={"source": "manual_override"}
356
+ )
351
357
  else:
352
358
  selected_proxy = proxy
353
-
359
+
354
360
  # 2. From Pool (if no override)
355
361
  if not selected_proxy:
356
362
  selected_proxy = self.proxy_pool.get(url=url, location=location)
@@ -358,20 +364,27 @@ class Fetcher:
358
364
  if selected_proxy:
359
365
  span.set_attribute("phantomfetch.proxy", selected_proxy.url)
360
366
  if selected_proxy.vendor:
361
- span.set_attribute("phantomfetch.proxy.vendor", selected_proxy.vendor)
367
+ span.set_attribute(
368
+ "phantomfetch.proxy.vendor", selected_proxy.vendor
369
+ )
362
370
  if selected_proxy.proxy_type:
363
- span.set_attribute("phantomfetch.proxy.type", selected_proxy.proxy_type)
371
+ span.set_attribute(
372
+ "phantomfetch.proxy.type", selected_proxy.proxy_type
373
+ )
364
374
  if selected_proxy.location:
365
- span.set_attribute("phantomfetch.proxy.location", selected_proxy.location)
375
+ span.set_attribute(
376
+ "phantomfetch.proxy.location", selected_proxy.location
377
+ )
366
378
  if selected_proxy.provider:
367
- span.set_attribute("phantomfetch.proxy.provider", selected_proxy.provider)
379
+ span.set_attribute(
380
+ "phantomfetch.proxy.provider", selected_proxy.provider
381
+ )
368
382
 
369
383
  # Route to engine
370
384
  if engine == "browser":
371
385
  resp = await self._fetch_browser(
372
386
  url=url,
373
387
  proxy=selected_proxy,
374
-
375
388
  headers=headers,
376
389
  cookies=cookies,
377
390
  actions=normalized_actions,
@@ -406,7 +419,7 @@ class Fetcher:
406
419
  # For simplicity, if we have a pool, and this proxy matches one in the pool, we could update it.
407
420
  # But the 'selected_proxy' might be a new instance specific to this request (manual override).
408
421
  # The pool.mark_* methods take a Proxy object.
409
-
422
+
410
423
  # Logic: If 'proxy' argument was None, it came from pool -> Update stats.
411
424
  # If 'proxy' argument was set -> Do NOT update pool stats (it's a manual override).
412
425
  if not proxy and selected_proxy:
@@ -421,10 +434,10 @@ class Fetcher:
421
434
  cache_key_parts = [engine, url]
422
435
  if location:
423
436
  cache_key_parts.append(f"loc={location}")
424
- if proxy:
437
+ if proxy:
425
438
  p_url = proxy if isinstance(proxy, str) else proxy.url
426
439
  cache_key_parts.append(f"proxy={p_url}")
427
-
440
+
428
441
  cache_key = ":".join(cache_key_parts)
429
442
  await self.cache.set(cache_key, resp)
430
443
 
@@ -451,26 +464,26 @@ class Fetcher:
451
464
  async with self._browser_semaphore:
452
465
  # Direct CDP usage (BaaSEngine removed)
453
466
  if not self._cdp_engine:
454
- # Lazy init if not done (though currently init in __init__)
455
- # But wait, logic in __init__ was:
456
- # self._cdp_engine = CDPEngine(...) if browser_engine == "cdp"
457
- # Since we removed engine selection, we should ensure it's initialized.
458
- # In __init__ I see: self._cdp_engine: CDPEngine | None = None
459
- # And the constructor logic for it was tied to the "if browser_engine == 'cdp'" block which I might have messed up or need to fix.
460
- # Let's check __init__ again.
461
- pass
462
-
463
- # Actually, looking at the previous file content, I removed the 'if browser_engine ==' logic in __init__?
467
+ # Lazy init if not done (though currently init in __init__)
468
+ # But wait, logic in __init__ was:
469
+ # self._cdp_engine = CDPEngine(...) if browser_engine == "cdp"
470
+ # Since we removed engine selection, we should ensure it's initialized.
471
+ # In __init__ I see: self._cdp_engine: CDPEngine | None = None
472
+ # And the constructor logic for it was tied to the "if browser_engine == 'cdp'" block which I might have messed up or need to fix.
473
+ # Let's check __init__ again.
474
+ pass
475
+
476
+ # Actually, looking at the previous file content, I removed the 'if browser_engine ==' logic in __init__?
464
477
  # No, I only updated the arguments. I need to make sure __init__ initializes _cdp_engine unconditionally.
465
478
  # Let's assume I fix __init__ in a separate recursive step if needed, or I can fix it here if I see it.
466
- # In the previous `view_file` output (Step 356), lines 87-101 show the old init logic.
467
- # I need to clean that up too!
468
-
479
+ # In the previous `view_file` output (Step 356), lines 87-101 show the old init logic.
480
+ # I need to clean that up too!
481
+
469
482
  # Let's just assume _browser attribute is now _cdp_engine.
470
483
  # Wait, the __init__ logic was:
471
484
  # self._browser = CDPEngine(...)
472
485
  # So here I should just use self._browser (which is typed as CDPEngine in the new world).
473
-
486
+
474
487
  # Correct implementation for _fetch_browser:
475
488
  return await self._cdp_engine.fetch(
476
489
  url=url,
File without changes