phantomfetch 0.6.0__tar.gz → 0.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/PKG-INFO +3 -3
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/pyproject.toml +3 -3
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/browser/actions.py +21 -17
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/fetch.py +120 -126
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/README.md +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/__init__.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/cache.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/captcha.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/__init__.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/base.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/browser/__init__.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/browser/cdp.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/curl.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/pool.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/presets.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/registry.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/strategy_advisor.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/telemetry.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/adaptive/__init__.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/adaptive/fingerprint.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/adaptive/matcher.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/adaptive/store.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/selector_builder.py +0 -0
- {phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: phantomfetch
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.6.2
|
|
4
4
|
Summary: High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities
|
|
5
5
|
Keywords: web-scraping,playwright,curl-cffi,async,browser-automation,http-client,agentic,anti-detection
|
|
6
6
|
Author: CosmicBull
|
|
@@ -26,14 +26,14 @@ Requires-Dist: opentelemetry-sdk>=1.38.0
|
|
|
26
26
|
Requires-Dist: loguru>=0.7.3
|
|
27
27
|
Requires-Dist: beautifulsoup4>=4.14.3
|
|
28
28
|
Requires-Dist: lmdb>=1.4.1
|
|
29
|
-
Requires-Dist: cloakbrowser
|
|
29
|
+
Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'all'
|
|
30
30
|
Requires-Dist: camoufox[geoip]>=0.4.11 ; extra == 'all'
|
|
31
31
|
Requires-Dist: rebrowser-playwright>=1.52.0 ; extra == 'all'
|
|
32
32
|
Requires-Dist: patchright>=1.60.1 ; extra == 'all'
|
|
33
33
|
Requires-Dist: maxminddb>=2.0.0 ; extra == 'all'
|
|
34
34
|
Requires-Dist: camoufox[geoip]>=0.4.11 ; extra == 'camoufox'
|
|
35
35
|
Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'cloakbrowser'
|
|
36
|
-
Requires-Dist: cloakbrowser
|
|
36
|
+
Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'geoip'
|
|
37
37
|
Requires-Dist: maxminddb>=2.0.0 ; extra == 'geoip'
|
|
38
38
|
Requires-Dist: patchright>=1.60.1 ; extra == 'patchright'
|
|
39
39
|
Requires-Dist: rebrowser-playwright>=1.52.0 ; extra == 'rebrowser'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "phantomfetch"
|
|
3
|
-
version = "0.6.0"
|
|
3
|
+
version = "0.6.2"
|
|
4
4
|
description = "High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.13"
|
|
@@ -45,11 +45,11 @@ dependencies = [
|
|
|
45
45
|
|
|
46
46
|
[project.optional-dependencies]
|
|
47
47
|
cloakbrowser = ["cloakbrowser>=0.3.31"]
|
|
48
|
-
geoip = ["cloakbrowser
|
|
48
|
+
geoip = ["cloakbrowser>=0.3.31", "maxminddb>=2.0.0"]
|
|
49
49
|
camoufox = ["camoufox[geoip]>=0.4.11"]
|
|
50
50
|
rebrowser = ["rebrowser-playwright>=1.52.0"]
|
|
51
51
|
patchright = ["patchright>=1.60.1"]
|
|
52
|
-
all = ["cloakbrowser
|
|
52
|
+
all = ["cloakbrowser>=0.3.31", "camoufox[geoip]>=0.4.11", "rebrowser-playwright>=1.52.0", "patchright>=1.60.1", "maxminddb>=2.0.0"]
|
|
53
53
|
|
|
54
54
|
[project.urls]
|
|
55
55
|
Homepage = "https://github.com/iristech-systems/PhantomFetch"
|
|
@@ -58,6 +58,11 @@ async def _human_mouse_move(page: "Page", element_handle: Any):
|
|
|
58
58
|
# We can add a slight overshoot/correction if we want to be fancy, but `steps` is 1st iterated humanization.
|
|
59
59
|
|
|
60
60
|
|
|
61
|
+
|
|
62
|
+
def _is_page(obj: Any) -> bool:
|
|
63
|
+
return obj.__class__.__name__ == "Page"
|
|
64
|
+
|
|
65
|
+
|
|
61
66
|
async def execute_actions(
|
|
62
67
|
page: "Page | Locator", actions: list[Action]
|
|
63
68
|
) -> list["ActionResult"]:
|
|
@@ -71,7 +76,6 @@ async def execute_actions(
|
|
|
71
76
|
Returns:
|
|
72
77
|
List of ActionResult objects
|
|
73
78
|
"""
|
|
74
|
-
from playwright.async_api import Page
|
|
75
79
|
|
|
76
80
|
from ...types import ActionResult
|
|
77
81
|
|
|
@@ -142,7 +146,7 @@ async def execute_actions(
|
|
|
142
146
|
|
|
143
147
|
# If scope is explicitly 'page', force usage of root page
|
|
144
148
|
if action.scope == "page":
|
|
145
|
-
ctx = page if
|
|
149
|
+
ctx = page if _is_page(page) else page.page
|
|
146
150
|
|
|
147
151
|
start_time = time.perf_counter()
|
|
148
152
|
result = ActionResult(action=action, success=True)
|
|
@@ -162,7 +166,7 @@ async def execute_actions(
|
|
|
162
166
|
case "wait":
|
|
163
167
|
if action.selector:
|
|
164
168
|
state = action.state or "visible"
|
|
165
|
-
if
|
|
169
|
+
if _is_page(ctx):
|
|
166
170
|
await ctx.wait_for_selector(
|
|
167
171
|
action.selector,
|
|
168
172
|
timeout=action.timeout,
|
|
@@ -174,7 +178,7 @@ async def execute_actions(
|
|
|
174
178
|
timeout=action.timeout, state=state
|
|
175
179
|
)
|
|
176
180
|
elif action.timeout:
|
|
177
|
-
target_page = ctx if
|
|
181
|
+
target_page = ctx if _is_page(ctx) else ctx.page
|
|
178
182
|
await target_page.wait_for_timeout(action.timeout)
|
|
179
183
|
|
|
180
184
|
case "loop":
|
|
@@ -216,7 +220,7 @@ async def execute_actions(
|
|
|
216
220
|
if action.human_like:
|
|
217
221
|
# Human-like click
|
|
218
222
|
# Resolve handle first
|
|
219
|
-
if
|
|
223
|
+
if _is_page(ctx):
|
|
220
224
|
handle = await ctx.wait_for_selector(
|
|
221
225
|
action.selector,
|
|
222
226
|
timeout=action.timeout,
|
|
@@ -233,7 +237,7 @@ async def execute_actions(
|
|
|
233
237
|
if handle:
|
|
234
238
|
# Need page for mouse move
|
|
235
239
|
target_page = (
|
|
236
|
-
ctx if
|
|
240
|
+
ctx if _is_page(ctx) else ctx.page
|
|
237
241
|
)
|
|
238
242
|
await _human_mouse_move(target_page, handle)
|
|
239
243
|
await handle.click(delay=random.randint(50, 150))
|
|
@@ -243,7 +247,7 @@ async def execute_actions(
|
|
|
243
247
|
timeout=action.timeout,
|
|
244
248
|
)
|
|
245
249
|
# Context click (no selector)
|
|
246
|
-
elif
|
|
250
|
+
elif _is_page(ctx):
|
|
247
251
|
result.success = False
|
|
248
252
|
result.error = "Click action on Page requires a selector"
|
|
249
253
|
elif action.human_like:
|
|
@@ -262,7 +266,7 @@ async def execute_actions(
|
|
|
262
266
|
val_str = str(action.value)
|
|
263
267
|
if action.human_like:
|
|
264
268
|
await ctx.click(action.selector, timeout=action.timeout)
|
|
265
|
-
target_page = ctx if
|
|
269
|
+
target_page = ctx if _is_page(ctx) else ctx.page
|
|
266
270
|
await _human_type(target_page, val_str)
|
|
267
271
|
else:
|
|
268
272
|
await ctx.fill(
|
|
@@ -271,7 +275,7 @@ async def execute_actions(
|
|
|
271
275
|
timeout=action.timeout,
|
|
272
276
|
)
|
|
273
277
|
# Input into self (ctx is locator)
|
|
274
|
-
elif
|
|
278
|
+
elif _is_page(ctx):
|
|
275
279
|
result.success = False
|
|
276
280
|
result.error = "Input action on Page requires a selector"
|
|
277
281
|
else:
|
|
@@ -286,7 +290,7 @@ async def execute_actions(
|
|
|
286
290
|
case "scroll":
|
|
287
291
|
# Scroll usually implies page-level or element-level scroll
|
|
288
292
|
# For now, keep page level logic mostly
|
|
289
|
-
target_page = ctx if
|
|
293
|
+
target_page = ctx if _is_page(ctx) else ctx.page
|
|
290
294
|
|
|
291
295
|
if action.selector == "top":
|
|
292
296
|
await target_page.evaluate("window.scrollTo(0, 0)")
|
|
@@ -444,13 +448,13 @@ async def execute_actions(
|
|
|
444
448
|
# locator.locator(selector).select_option(...) logic
|
|
445
449
|
value=str(action.value),
|
|
446
450
|
timeout=action.timeout,
|
|
447
|
-
) if
|
|
451
|
+
) if _is_page(ctx) else await ctx.locator(
|
|
448
452
|
action.selector
|
|
449
453
|
).select_option(str(action.value), timeout=action.timeout)
|
|
450
454
|
|
|
451
455
|
case "hover":
|
|
452
456
|
if action.selector:
|
|
453
|
-
if
|
|
457
|
+
if _is_page(ctx):
|
|
454
458
|
await ctx.hover(action.selector, timeout=action.timeout)
|
|
455
459
|
else:
|
|
456
460
|
await ctx.locator(action.selector).hover(
|
|
@@ -465,7 +469,7 @@ async def execute_actions(
|
|
|
465
469
|
kwargs = {}
|
|
466
470
|
if action.full_page:
|
|
467
471
|
kwargs["full_page"] = True
|
|
468
|
-
if not
|
|
472
|
+
if not _is_page(ctx):
|
|
469
473
|
# If we are in a Locator (e.g. inside loop loop), but want full page,
|
|
470
474
|
# we must switch to the page context.
|
|
471
475
|
screenshot_ctx = ctx.page
|
|
@@ -491,7 +495,7 @@ async def execute_actions(
|
|
|
491
495
|
result.data = img_bytes
|
|
492
496
|
|
|
493
497
|
case "wait_for_load":
|
|
494
|
-
target_page = ctx if
|
|
498
|
+
target_page = ctx if _is_page(ctx) else ctx.page
|
|
495
499
|
await target_page.wait_for_load_state(
|
|
496
500
|
"networkidle", timeout=action.timeout
|
|
497
501
|
)
|
|
@@ -504,7 +508,7 @@ async def execute_actions(
|
|
|
504
508
|
case "validate":
|
|
505
509
|
try:
|
|
506
510
|
state = action.state or "attached"
|
|
507
|
-
if
|
|
511
|
+
if _is_page(ctx):
|
|
508
512
|
await ctx.wait_for_selector(
|
|
509
513
|
action.selector,
|
|
510
514
|
timeout=action.timeout or 5000,
|
|
@@ -523,7 +527,7 @@ async def execute_actions(
|
|
|
523
527
|
|
|
524
528
|
case "solve_captcha":
|
|
525
529
|
# Requires Page context for solver
|
|
526
|
-
target_page = ctx if
|
|
530
|
+
target_page = ctx if _is_page(ctx) else ctx.page
|
|
527
531
|
|
|
528
532
|
if action.provider in ("cdp", "scraping_browser"):
|
|
529
533
|
from ...captcha import CDPSolver
|
|
@@ -552,7 +556,7 @@ async def execute_actions(
|
|
|
552
556
|
if action.selector:
|
|
553
557
|
# Check visibility/existence
|
|
554
558
|
try:
|
|
555
|
-
if
|
|
559
|
+
if _is_page(ctx):
|
|
556
560
|
# Use strict=False, state=visible/attached?
|
|
557
561
|
# Just check count > 0 or wait with short timeout?
|
|
558
562
|
# Let's use is_visible or check count to avoid waiting if timeout=0
|
|
@@ -403,95 +403,125 @@ class Fetcher:
|
|
|
403
403
|
|
|
404
404
|
span.set_attribute("phantomfetch.cache.hit", False)
|
|
405
405
|
|
|
406
|
-
#
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
if
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
406
|
+
# Resolve retry configurations
|
|
407
|
+
actual_max_retries = max_retries if max_retries is not None else self.max_retries
|
|
408
|
+
attempts = max(1, actual_max_retries)
|
|
409
|
+
if retry_on is None:
|
|
410
|
+
retry_status_codes: set[int] = {0, 429, 500, 502, 503, 504}
|
|
411
|
+
elif isinstance(retry_on, int):
|
|
412
|
+
retry_status_codes = {retry_on}
|
|
413
|
+
else:
|
|
414
|
+
retry_status_codes = set(retry_on)
|
|
415
|
+
|
|
416
|
+
backoff_base = 2.0 if retry_backoff is None else retry_backoff
|
|
416
417
|
|
|
417
|
-
|
|
418
|
-
if not selected_proxy:
|
|
419
|
-
selected_proxy = self.proxy_pool.get(url=url, location=location)
|
|
418
|
+
last_resp: Response | None = None
|
|
420
419
|
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
)
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
420
|
+
for attempt in range(attempts):
|
|
421
|
+
# Get proxy
|
|
422
|
+
# 1. Manual override
|
|
423
|
+
selected_proxy: Proxy | None = None
|
|
424
|
+
if proxy:
|
|
425
|
+
if isinstance(proxy, str):
|
|
426
|
+
selected_proxy = Proxy(
|
|
427
|
+
url=proxy, metadata={"source": "manual_override"}
|
|
428
|
+
)
|
|
429
|
+
else:
|
|
430
|
+
selected_proxy = proxy
|
|
431
|
+
|
|
432
|
+
# 2. From Pool (if no override)
|
|
433
|
+
if not selected_proxy:
|
|
434
|
+
selected_proxy = self.proxy_pool.get(url=url, location=location)
|
|
435
|
+
|
|
436
|
+
if selected_proxy:
|
|
437
|
+
span.set_attribute("phantomfetch.proxy", selected_proxy.url)
|
|
438
|
+
if selected_proxy.vendor:
|
|
439
|
+
span.set_attribute(
|
|
440
|
+
"phantomfetch.proxy.vendor", selected_proxy.vendor
|
|
441
|
+
)
|
|
442
|
+
if selected_proxy.proxy_type:
|
|
443
|
+
span.set_attribute(
|
|
444
|
+
"phantomfetch.proxy.type", selected_proxy.proxy_type
|
|
445
|
+
)
|
|
446
|
+
if selected_proxy.location:
|
|
447
|
+
span.set_attribute(
|
|
448
|
+
"phantomfetch.proxy.location", selected_proxy.location
|
|
449
|
+
)
|
|
450
|
+
if selected_proxy.provider:
|
|
451
|
+
span.set_attribute(
|
|
452
|
+
"phantomfetch.proxy.provider", selected_proxy.provider
|
|
453
|
+
)
|
|
454
|
+
|
|
455
|
+
# Route to engine
|
|
456
|
+
if engine == "browser":
|
|
457
|
+
resp = await self._fetch_browser(
|
|
458
|
+
url=url,
|
|
459
|
+
proxy=selected_proxy,
|
|
460
|
+
headers=headers,
|
|
461
|
+
cookies=cookies,
|
|
462
|
+
actions=normalized_actions,
|
|
463
|
+
timeout=timeout or self.browser_timeout,
|
|
464
|
+
location=location,
|
|
465
|
+
wait_until=wait_until,
|
|
466
|
+
block_resources=block_resources,
|
|
467
|
+
wait_for_url=wait_for_url,
|
|
468
|
+
storage_state=self.session_data, # Pass current session
|
|
469
|
+
stealth=stealth,
|
|
434
470
|
)
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
471
|
+
else:
|
|
472
|
+
resp = await self._fetch_curl(
|
|
473
|
+
url=url,
|
|
474
|
+
proxy=selected_proxy,
|
|
475
|
+
headers=headers,
|
|
476
|
+
cookies=cookies,
|
|
477
|
+
timeout=timeout or self.timeout,
|
|
478
|
+
max_retries=1, # Prevent internal retries so we can rotate proxies
|
|
479
|
+
retry_on=retry_on,
|
|
480
|
+
retry_backoff=retry_backoff,
|
|
481
|
+
referer=referer,
|
|
482
|
+
allow_redirects=allow_redirects,
|
|
438
483
|
)
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
storage_state=self.session_data, # Pass current session
|
|
454
|
-
stealth=stealth,
|
|
455
|
-
max_retries=self.max_retries
|
|
456
|
-
if max_retries is None
|
|
457
|
-
else max_retries,
|
|
458
|
-
retry_on=retry_on,
|
|
459
|
-
retry_backoff=retry_backoff,
|
|
484
|
+
|
|
485
|
+
last_resp = resp
|
|
486
|
+
|
|
487
|
+
# Evaluate proxy status
|
|
488
|
+
if not proxy and selected_proxy:
|
|
489
|
+
if resp.ok:
|
|
490
|
+
self.proxy_pool.mark_success(selected_proxy)
|
|
491
|
+
elif resp.error or resp.status in retry_status_codes:
|
|
492
|
+
# Mark proxy as failed so a fresh one is used on retry
|
|
493
|
+
self.proxy_pool.mark_failed(selected_proxy)
|
|
494
|
+
|
|
495
|
+
# Should we retry?
|
|
496
|
+
should_retry = (
|
|
497
|
+
attempt < attempts - 1 and resp.status in retry_status_codes
|
|
460
498
|
)
|
|
461
|
-
|
|
462
|
-
|
|
499
|
+
if not should_retry:
|
|
500
|
+
break
|
|
501
|
+
|
|
502
|
+
# Backoff before getting a new proxy
|
|
503
|
+
wait = backoff_base**attempt * (0.5 + random.random())
|
|
504
|
+
logger.warning(
|
|
505
|
+
f"[{engine}] Retryable status {resp.status}, "
|
|
506
|
+
f"attempt {attempt + 1}/{attempts}, retrying in {wait:.2f}s"
|
|
507
|
+
)
|
|
508
|
+
await asyncio.sleep(wait)
|
|
509
|
+
|
|
510
|
+
if not last_resp:
|
|
511
|
+
last_resp = Response(
|
|
463
512
|
url=url,
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
max_retries=self.max_retries
|
|
469
|
-
if max_retries is None
|
|
470
|
-
else max_retries,
|
|
471
|
-
retry_on=retry_on,
|
|
472
|
-
retry_backoff=retry_backoff,
|
|
473
|
-
referer=referer,
|
|
474
|
-
allow_redirects=allow_redirects,
|
|
513
|
+
status=0,
|
|
514
|
+
body=b"",
|
|
515
|
+
engine=engine,
|
|
516
|
+
error="Max retries exhausted",
|
|
475
517
|
)
|
|
518
|
+
|
|
519
|
+
resp = last_resp
|
|
476
520
|
|
|
477
521
|
# Update session data from response if present
|
|
478
522
|
if resp.storage_state:
|
|
479
523
|
self.session_data = resp.storage_state
|
|
480
524
|
|
|
481
|
-
# Update proxy stats (ONLY if it came from the pool, or generally?)
|
|
482
|
-
# If manual override, we might NOT want to impact the pool stats unless the manual proxy IS in the pool?
|
|
483
|
-
# For simplicity, if we have a pool, and this proxy matches one in the pool, we could update it.
|
|
484
|
-
# But the 'selected_proxy' might be a new instance specific to this request (manual override).
|
|
485
|
-
# The pool.mark_* methods take a Proxy object.
|
|
486
|
-
|
|
487
|
-
# Logic: If 'proxy' argument was None, it came from pool -> Update stats.
|
|
488
|
-
# If 'proxy' argument was set -> Do NOT update pool stats (it's a manual override).
|
|
489
|
-
if not proxy and selected_proxy:
|
|
490
|
-
if resp.ok:
|
|
491
|
-
self.proxy_pool.mark_success(selected_proxy)
|
|
492
|
-
elif resp.error:
|
|
493
|
-
self.proxy_pool.mark_failed(selected_proxy)
|
|
494
|
-
|
|
495
525
|
# Cache response
|
|
496
526
|
if self.cache and resp.ok and self.cache.should_cache_request("document"):
|
|
497
527
|
# Re-generate key (same logic as above)
|
|
@@ -523,9 +553,6 @@ class Fetcher:
|
|
|
523
553
|
wait_for_url: str | None = None,
|
|
524
554
|
storage_state: dict[str, Any] | None = None,
|
|
525
555
|
stealth: bool = False,
|
|
526
|
-
max_retries: int = 1,
|
|
527
|
-
retry_on: Collection[int] | int | None = None,
|
|
528
|
-
retry_backoff: float | None = None,
|
|
529
556
|
) -> Response:
|
|
530
557
|
async with self._semaphore:
|
|
531
558
|
async with self._browser_semaphore:
|
|
@@ -538,54 +565,21 @@ class Fetcher:
|
|
|
538
565
|
error="CDP engine not initialized",
|
|
539
566
|
)
|
|
540
567
|
|
|
541
|
-
|
|
542
|
-
if retry_on is None:
|
|
543
|
-
retry_status_codes: set[int] = set()
|
|
544
|
-
elif isinstance(retry_on, int):
|
|
545
|
-
retry_status_codes = {retry_on}
|
|
546
|
-
else:
|
|
547
|
-
retry_status_codes = set(retry_on)
|
|
548
|
-
|
|
549
|
-
backoff_base = 2.0 if retry_backoff is None else retry_backoff
|
|
550
|
-
last_response: Response | None = None
|
|
551
|
-
|
|
552
|
-
for attempt in range(attempts):
|
|
553
|
-
response = await self._cdp_engine.fetch(
|
|
554
|
-
url=url,
|
|
555
|
-
proxy=proxy,
|
|
556
|
-
headers=headers,
|
|
557
|
-
cookies=cookies,
|
|
558
|
-
actions=actions,
|
|
559
|
-
timeout=timeout,
|
|
560
|
-
location=location,
|
|
561
|
-
wait_until=wait_until,
|
|
562
|
-
block_resources=block_resources,
|
|
563
|
-
wait_for_url=wait_for_url,
|
|
564
|
-
storage_state=storage_state,
|
|
565
|
-
stealth=stealth,
|
|
566
|
-
)
|
|
567
|
-
last_response = response
|
|
568
|
-
|
|
569
|
-
should_retry = (
|
|
570
|
-
attempt < attempts - 1 and response.status in retry_status_codes
|
|
571
|
-
)
|
|
572
|
-
if not should_retry:
|
|
573
|
-
return response
|
|
574
|
-
|
|
575
|
-
wait = backoff_base**attempt * (0.5 + random.random())
|
|
576
|
-
logger.warning(
|
|
577
|
-
f"[browser] Retryable status {response.status}, "
|
|
578
|
-
f"attempt {attempt + 1}/{attempts}, retrying in {wait:.2f}s"
|
|
579
|
-
)
|
|
580
|
-
await asyncio.sleep(wait)
|
|
581
|
-
|
|
582
|
-
return last_response or Response(
|
|
568
|
+
response = await self._cdp_engine.fetch(
|
|
583
569
|
url=url,
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
570
|
+
proxy=proxy,
|
|
571
|
+
headers=headers,
|
|
572
|
+
cookies=cookies,
|
|
573
|
+
actions=actions,
|
|
574
|
+
timeout=timeout,
|
|
575
|
+
location=location,
|
|
576
|
+
wait_until=wait_until,
|
|
577
|
+
block_resources=block_resources,
|
|
578
|
+
wait_for_url=wait_for_url,
|
|
579
|
+
storage_state=storage_state,
|
|
580
|
+
stealth=stealth,
|
|
588
581
|
)
|
|
582
|
+
return response
|
|
589
583
|
|
|
590
584
|
async def _fetch_curl(
|
|
591
585
|
self,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|