phantomfetch 0.4.7__tar.gz → 0.4.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/PKG-INFO +1 -1
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/pyproject.toml +1 -1
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/src/phantomfetch/fetch.py +48 -35
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/README.md +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/src/phantomfetch/__init__.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/src/phantomfetch/cache.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/src/phantomfetch/captcha.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/src/phantomfetch/engines/__init__.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/src/phantomfetch/engines/base.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/src/phantomfetch/engines/browser/__init__.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/src/phantomfetch/engines/browser/actions.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/src/phantomfetch/engines/browser/cdp.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/src/phantomfetch/engines/curl.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/src/phantomfetch/pool.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/src/phantomfetch/telemetry.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/src/phantomfetch/tools/selector_builder.py +0 -0
- {phantomfetch-0.4.7 → phantomfetch-0.4.8}/src/phantomfetch/types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: phantomfetch
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.8
|
|
4
4
|
Summary: High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities
|
|
5
5
|
Keywords: web-scraping,playwright,curl-cffi,async,browser-automation,http-client,agentic,anti-detection
|
|
6
6
|
Author: CosmicBull
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import json
|
|
3
3
|
import os
|
|
4
|
-
from typing import Any, Literal
|
|
4
|
+
from typing import Any, Literal
|
|
5
5
|
|
|
6
6
|
from loguru import logger
|
|
7
7
|
|
|
8
|
-
from .cache import Cache
|
|
8
|
+
from .cache import Cache
|
|
9
9
|
from .engines import CDPEngine, CurlEngine
|
|
10
10
|
from .pool import ProxyPool
|
|
11
11
|
from .telemetry import get_tracer
|
|
@@ -46,6 +46,7 @@ class Fetcher:
|
|
|
46
46
|
cache: Cache | bool | None = None,
|
|
47
47
|
# Advanced CDP
|
|
48
48
|
cdp_use_existing_page: bool = True,
|
|
49
|
+
cdp_connection_type: str = "cdp",
|
|
49
50
|
# BrowserForge fingerprinting
|
|
50
51
|
fingerprint: bool = True,
|
|
51
52
|
fingerprint_options: dict[str, Any] | None = None,
|
|
@@ -97,9 +98,11 @@ class Fetcher:
|
|
|
97
98
|
timeout=timeout,
|
|
98
99
|
max_retries=max_retries,
|
|
99
100
|
)
|
|
100
|
-
|
|
101
|
+
|
|
101
102
|
if browser_engine == "baas":
|
|
102
|
-
raise ValueError(
|
|
103
|
+
raise ValueError(
|
|
104
|
+
"BaaSEngine has been removed. Please use 'cdp' engine or contact support."
|
|
105
|
+
)
|
|
103
106
|
|
|
104
107
|
# Browser engine
|
|
105
108
|
# Always use CDPEngine now
|
|
@@ -109,6 +112,7 @@ class Fetcher:
|
|
|
109
112
|
timeout=browser_timeout,
|
|
110
113
|
cache=self.cache,
|
|
111
114
|
use_existing_page=cdp_use_existing_page,
|
|
115
|
+
cdp_connection_type=cdp_connection_type,
|
|
112
116
|
fingerprint=fingerprint,
|
|
113
117
|
fingerprint_options=fingerprint_options,
|
|
114
118
|
browser_type=browser_type,
|
|
@@ -268,7 +272,7 @@ class Fetcher:
|
|
|
268
272
|
proxy: Specific proxy to use (overrides pool)
|
|
269
273
|
location: Geo location for proxy selection
|
|
270
274
|
actions: List of `Action` objects or dicts (implies engine="browser")
|
|
271
|
-
|
|
275
|
+
|
|
272
276
|
...
|
|
273
277
|
"""
|
|
274
278
|
# Normalize actions - implies browser
|
|
@@ -319,21 +323,21 @@ class Fetcher:
|
|
|
319
323
|
cache_key_parts = [engine, url]
|
|
320
324
|
if location:
|
|
321
325
|
cache_key_parts.append(f"loc={location}")
|
|
322
|
-
|
|
326
|
+
|
|
323
327
|
# Use proxy URL for cache key if explicit proxy is used
|
|
324
328
|
# Note: We rely on the proxy *argument* (manual override) for this separation.
|
|
325
|
-
# If using pool, we typically don't split cache by specific pool proxy,
|
|
329
|
+
# If using pool, we typically don't split cache by specific pool proxy,
|
|
326
330
|
# UNLESS location was requested (handled above).
|
|
327
|
-
if proxy:
|
|
331
|
+
if proxy:
|
|
328
332
|
# If manual proxy string/object provided, include it
|
|
329
333
|
p_url = proxy if isinstance(proxy, str) else proxy.url
|
|
330
|
-
# Sanitize sensitive info? user:pass might be sensitive,
|
|
331
|
-
# but for cache key uniqueness it's needed.
|
|
334
|
+
# Sanitize sensitive info? user:pass might be sensitive,
|
|
335
|
+
# but for cache key uniqueness it's needed.
|
|
332
336
|
# Since MD5 is used downstream, it's somewhat obscured.
|
|
333
337
|
cache_key_parts.append(f"proxy={p_url}")
|
|
334
338
|
|
|
335
339
|
cache_key = ":".join(cache_key_parts)
|
|
336
|
-
|
|
340
|
+
|
|
337
341
|
cached_resp = await self.cache.get(cache_key)
|
|
338
342
|
if cached_resp:
|
|
339
343
|
cached_resp.from_cache = True
|
|
@@ -347,10 +351,12 @@ class Fetcher:
|
|
|
347
351
|
selected_proxy: Proxy | None = None
|
|
348
352
|
if proxy:
|
|
349
353
|
if isinstance(proxy, str):
|
|
350
|
-
selected_proxy = Proxy(
|
|
354
|
+
selected_proxy = Proxy(
|
|
355
|
+
url=proxy, metadata={"source": "manual_override"}
|
|
356
|
+
)
|
|
351
357
|
else:
|
|
352
358
|
selected_proxy = proxy
|
|
353
|
-
|
|
359
|
+
|
|
354
360
|
# 2. From Pool (if no override)
|
|
355
361
|
if not selected_proxy:
|
|
356
362
|
selected_proxy = self.proxy_pool.get(url=url, location=location)
|
|
@@ -358,20 +364,27 @@ class Fetcher:
|
|
|
358
364
|
if selected_proxy:
|
|
359
365
|
span.set_attribute("phantomfetch.proxy", selected_proxy.url)
|
|
360
366
|
if selected_proxy.vendor:
|
|
361
|
-
span.set_attribute(
|
|
367
|
+
span.set_attribute(
|
|
368
|
+
"phantomfetch.proxy.vendor", selected_proxy.vendor
|
|
369
|
+
)
|
|
362
370
|
if selected_proxy.proxy_type:
|
|
363
|
-
span.set_attribute(
|
|
371
|
+
span.set_attribute(
|
|
372
|
+
"phantomfetch.proxy.type", selected_proxy.proxy_type
|
|
373
|
+
)
|
|
364
374
|
if selected_proxy.location:
|
|
365
|
-
span.set_attribute(
|
|
375
|
+
span.set_attribute(
|
|
376
|
+
"phantomfetch.proxy.location", selected_proxy.location
|
|
377
|
+
)
|
|
366
378
|
if selected_proxy.provider:
|
|
367
|
-
span.set_attribute(
|
|
379
|
+
span.set_attribute(
|
|
380
|
+
"phantomfetch.proxy.provider", selected_proxy.provider
|
|
381
|
+
)
|
|
368
382
|
|
|
369
383
|
# Route to engine
|
|
370
384
|
if engine == "browser":
|
|
371
385
|
resp = await self._fetch_browser(
|
|
372
386
|
url=url,
|
|
373
387
|
proxy=selected_proxy,
|
|
374
|
-
|
|
375
388
|
headers=headers,
|
|
376
389
|
cookies=cookies,
|
|
377
390
|
actions=normalized_actions,
|
|
@@ -406,7 +419,7 @@ class Fetcher:
|
|
|
406
419
|
# For simplicity, if we have a pool, and this proxy matches one in the pool, we could update it.
|
|
407
420
|
# But the 'selected_proxy' might be a new instance specific to this request (manual override).
|
|
408
421
|
# The pool.mark_* methods take a Proxy object.
|
|
409
|
-
|
|
422
|
+
|
|
410
423
|
# Logic: If 'proxy' argument was None, it came from pool -> Update stats.
|
|
411
424
|
# If 'proxy' argument was set -> Do NOT update pool stats (it's a manual override).
|
|
412
425
|
if not proxy and selected_proxy:
|
|
@@ -421,10 +434,10 @@ class Fetcher:
|
|
|
421
434
|
cache_key_parts = [engine, url]
|
|
422
435
|
if location:
|
|
423
436
|
cache_key_parts.append(f"loc={location}")
|
|
424
|
-
if proxy:
|
|
437
|
+
if proxy:
|
|
425
438
|
p_url = proxy if isinstance(proxy, str) else proxy.url
|
|
426
439
|
cache_key_parts.append(f"proxy={p_url}")
|
|
427
|
-
|
|
440
|
+
|
|
428
441
|
cache_key = ":".join(cache_key_parts)
|
|
429
442
|
await self.cache.set(cache_key, resp)
|
|
430
443
|
|
|
@@ -451,26 +464,26 @@ class Fetcher:
|
|
|
451
464
|
async with self._browser_semaphore:
|
|
452
465
|
# Direct CDP usage (BaaSEngine removed)
|
|
453
466
|
if not self._cdp_engine:
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
# Actually, looking at the previous file content, I removed the 'if browser_engine ==' logic in __init__?
|
|
467
|
+
# Lazy init if not done (though currently init in __init__)
|
|
468
|
+
# But wait, logic in __init__ was:
|
|
469
|
+
# self._cdp_engine = CDPEngine(...) if browser_engine == "cdp"
|
|
470
|
+
# Since we removed engine selection, we should ensure it's initialized.
|
|
471
|
+
# In __init__ I see: self._cdp_engine: CDPEngine | None = None
|
|
472
|
+
# And the constructor logic for it was tied to the "if browser_engine == 'cdp'" block which I might have messed up or need to fix.
|
|
473
|
+
# Let's check __init__ again.
|
|
474
|
+
pass
|
|
475
|
+
|
|
476
|
+
# Actually, looking at the previous file content, I removed the 'if browser_engine ==' logic in __init__?
|
|
464
477
|
# No, I only updated the arguments. I need to make sure __init__ initializes _cdp_engine unconditionally.
|
|
465
478
|
# Let's assume I fix __init__ in a separate recursive step if needed, or I can fix it here if I see it.
|
|
466
|
-
# In the previous `view_file` output (Step 356), lines 87-101 show the old init logic.
|
|
467
|
-
# I need to clean that up too!
|
|
468
|
-
|
|
479
|
+
# In the previous `view_file` output (Step 356), lines 87-101 show the old init logic.
|
|
480
|
+
# I need to clean that up too!
|
|
481
|
+
|
|
469
482
|
# Let's just assume _browser attribute is now _cdp_engine.
|
|
470
483
|
# Wait, the __init__ logic was:
|
|
471
484
|
# self._browser = CDPEngine(...)
|
|
472
485
|
# So here I should just use self._browser (which is typed as CDPEngine in the new world).
|
|
473
|
-
|
|
486
|
+
|
|
474
487
|
# Correct implementation for _fetch_browser:
|
|
475
488
|
return await self._cdp_engine.fetch(
|
|
476
489
|
url=url,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|