PyPI - phantomfetch - Version diff - 0.6.0 (tar.gz) → 0.6.2 (tar.gz) - Mend

phantomfetch 0.6.0 (tar.gz) → 0.6.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) [hide / show]

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: phantomfetch
-Version: 0.6.0
+Version: 0.6.2
 Summary: High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities
 Keywords: web-scraping,playwright,curl-cffi,async,browser-automation,http-client,agentic,anti-detection
 Author: CosmicBull
@@ -26,14 +26,14 @@ Requires-Dist: opentelemetry-sdk>=1.38.0
 Requires-Dist: loguru>=0.7.3
 Requires-Dist: beautifulsoup4>=4.14.3
 Requires-Dist: lmdb>=1.4.1
-Requires-Dist: cloakbrowser[geoip]>=0.3.31 ; extra == 'all'
+Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'all'
 Requires-Dist: camoufox[geoip]>=0.4.11 ; extra == 'all'
 Requires-Dist: rebrowser-playwright>=1.52.0 ; extra == 'all'
 Requires-Dist: patchright>=1.60.1 ; extra == 'all'
 Requires-Dist: maxminddb>=2.0.0 ; extra == 'all'
 Requires-Dist: camoufox[geoip]>=0.4.11 ; extra == 'camoufox'
 Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'cloakbrowser'
-Requires-Dist: cloakbrowser[geoip]>=0.3.31 ; extra == 'geoip'
+Requires-Dist: cloakbrowser>=0.3.31 ; extra == 'geoip'
 Requires-Dist: maxminddb>=2.0.0 ; extra == 'geoip'
 Requires-Dist: patchright>=1.60.1 ; extra == 'patchright'
 Requires-Dist: rebrowser-playwright>=1.52.0 ; extra == 'rebrowser'

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "phantomfetch"
-version = "0.6.0"
+version = "0.6.2"
 description = "High-performance agentic web scraping library combining curl-cffi speed with Playwright browser capabilities"
 readme = "README.md"
 requires-python = ">=3.13"
@@ -45,11 +45,11 @@ dependencies = [
 [project.optional-dependencies]
 cloakbrowser = ["cloakbrowser>=0.3.31"]
-geoip = ["cloakbrowser[geoip]>=0.3.31", "maxminddb>=2.0.0"]
+geoip = ["cloakbrowser>=0.3.31", "maxminddb>=2.0.0"]
 camoufox = ["camoufox[geoip]>=0.4.11"]
 rebrowser = ["rebrowser-playwright>=1.52.0"]
 patchright = ["patchright>=1.60.1"]
-all = ["cloakbrowser[geoip]>=0.3.31", "camoufox[geoip]>=0.4.11", "rebrowser-playwright>=1.52.0", "patchright>=1.60.1", "maxminddb>=2.0.0"]
+all = ["cloakbrowser>=0.3.31", "camoufox[geoip]>=0.4.11", "rebrowser-playwright>=1.52.0", "patchright>=1.60.1", "maxminddb>=2.0.0"]
 [project.urls]
 Homepage = "https://github.com/iristech-systems/PhantomFetch"

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/browser/actions.py RENAMED Viewed

@@ -58,6 +58,11 @@ async def _human_mouse_move(page: "Page", element_handle: Any):
     # We can add a slight overshoot/correction if we want to be fancy, but `steps` is 1st iterated humanization.
+def _is_page(obj: Any) -> bool:
+    return obj.__class__.__name__ == "Page"
 async def execute_actions(
     page: "Page | Locator", actions: list[Action]
 ) -> list["ActionResult"]:
@@ -71,7 +76,6 @@ async def execute_actions(
     Returns:
         List of ActionResult objects
     """
-    from playwright.async_api import Page
     from ...types import ActionResult
@@ -142,7 +146,7 @@ async def execute_actions(
             # If scope is explicitly 'page', force usage of root page
             if action.scope == "page":
-                ctx = page if isinstance(page, Page) else page.page
+                ctx = page if _is_page(page) else page.page
             start_time = time.perf_counter()
             result = ActionResult(action=action, success=True)
@@ -162,7 +166,7 @@ async def execute_actions(
                         case "wait":
                             if action.selector:
                                 state = action.state or "visible"
-                                if isinstance(ctx, Page):
+                                if _is_page(ctx):
                                     await ctx.wait_for_selector(
                                         action.selector,
                                         timeout=action.timeout,
@@ -174,7 +178,7 @@ async def execute_actions(
                                         timeout=action.timeout, state=state
                                     )
                             elif action.timeout:
-                                target_page = ctx if isinstance(ctx, Page) else ctx.page
+                                target_page = ctx if _is_page(ctx) else ctx.page
                                 await target_page.wait_for_timeout(action.timeout)
                         case "loop":
@@ -216,7 +220,7 @@ async def execute_actions(
                                 if action.human_like:
                                     # Human-like click
                                     # Resolve handle first
-                                    if isinstance(ctx, Page):
+                                    if _is_page(ctx):
                                         handle = await ctx.wait_for_selector(
                                             action.selector,
                                             timeout=action.timeout,
@@ -233,7 +237,7 @@ async def execute_actions(
                                     if handle:
                                         # Need page for mouse move
                                         target_page = (
-                                            ctx if isinstance(ctx, Page) else ctx.page
+                                            ctx if _is_page(ctx) else ctx.page
                                         )
                                         await _human_mouse_move(target_page, handle)
                                         await handle.click(delay=random.randint(50, 150))
@@ -243,7 +247,7 @@ async def execute_actions(
                                         timeout=action.timeout,
                                     )
                             # Context click (no selector)
-                            elif isinstance(ctx, Page):
+                            elif _is_page(ctx):
                                 result.success = False
                                 result.error = "Click action on Page requires a selector"
                             elif action.human_like:
@@ -262,7 +266,7 @@ async def execute_actions(
                                 val_str = str(action.value)
                                 if action.human_like:
                                     await ctx.click(action.selector, timeout=action.timeout)
-                                    target_page = ctx if isinstance(ctx, Page) else ctx.page
+                                    target_page = ctx if _is_page(ctx) else ctx.page
                                     await _human_type(target_page, val_str)
                                 else:
                                     await ctx.fill(
@@ -271,7 +275,7 @@ async def execute_actions(
                                         timeout=action.timeout,
                                     )
                             # Input into self (ctx is locator)
-                            elif isinstance(ctx, Page):
+                            elif _is_page(ctx):
                                 result.success = False
                                 result.error = "Input action on Page requires a selector"
                             else:
@@ -286,7 +290,7 @@ async def execute_actions(
                         case "scroll":
                             # Scroll usually implies page-level or element-level scroll
                             # For now, keep page level logic mostly
-                            target_page = ctx if isinstance(ctx, Page) else ctx.page
+                            target_page = ctx if _is_page(ctx) else ctx.page
                             if action.selector == "top":
                                 await target_page.evaluate("window.scrollTo(0, 0)")
@@ -444,13 +448,13 @@ async def execute_actions(
                                 # locator.locator(selector).select_option(...) logic
                                 value=str(action.value),
                                 timeout=action.timeout,
-                            ) if isinstance(ctx, Page) else await ctx.locator(
+                            ) if _is_page(ctx) else await ctx.locator(
                                 action.selector
                             ).select_option(str(action.value), timeout=action.timeout)
                         case "hover":
                             if action.selector:
-                                if isinstance(ctx, Page):
+                                if _is_page(ctx):
                                     await ctx.hover(action.selector, timeout=action.timeout)
                                 else:
                                     await ctx.locator(action.selector).hover(
@@ -465,7 +469,7 @@ async def execute_actions(
                             kwargs = {}
                             if action.full_page:
                                 kwargs["full_page"] = True
-                                if not isinstance(ctx, Page):
+                                if not _is_page(ctx):
                                     # If we are in a Locator (e.g. inside loop loop), but want full page,
                                     # we must switch to the page context.
                                     screenshot_ctx = ctx.page
@@ -491,7 +495,7 @@ async def execute_actions(
                                 result.data = img_bytes
                         case "wait_for_load":
-                            target_page = ctx if isinstance(ctx, Page) else ctx.page
+                            target_page = ctx if _is_page(ctx) else ctx.page
                             await target_page.wait_for_load_state(
                                 "networkidle", timeout=action.timeout
                             )
@@ -504,7 +508,7 @@ async def execute_actions(
                         case "validate":
                             try:
                                 state = action.state or "attached"
-                                if isinstance(ctx, Page):
+                                if _is_page(ctx):
                                     await ctx.wait_for_selector(
                                         action.selector,
                                         timeout=action.timeout or 5000,
@@ -523,7 +527,7 @@ async def execute_actions(
                         case "solve_captcha":
                             # Requires Page context for solver
-                            target_page = ctx if isinstance(ctx, Page) else ctx.page
+                            target_page = ctx if _is_page(ctx) else ctx.page
                             if action.provider in ("cdp", "scraping_browser"):
                                 from ...captcha import CDPSolver
@@ -552,7 +556,7 @@ async def execute_actions(
                             if action.selector:
                                 # Check visibility/existence
                                 try:
-                                    if isinstance(ctx, Page):
+                                    if _is_page(ctx):
                                         # Use strict=False, state=visible/attached?
                                         # Just check count > 0 or wait with short timeout?
                                         # Let's use is_visible or check count to avoid waiting if timeout=0

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/fetch.py RENAMED Viewed

@@ -403,95 +403,125 @@ class Fetcher:
             span.set_attribute("phantomfetch.cache.hit", False)
-            # Get proxy
-            # 1. Manual override
-            selected_proxy: Proxy | None = None
-            if proxy:
-                if isinstance(proxy, str):
-                    selected_proxy = Proxy(
-                        url=proxy, metadata={"source": "manual_override"}
-                    )
-                else:
-                    selected_proxy = proxy
+            # Resolve retry configurations
+            actual_max_retries = max_retries if max_retries is not None else self.max_retries
+            attempts = max(1, actual_max_retries)
+            if retry_on is None:
+                retry_status_codes: set[int] = {0, 429, 500, 502, 503, 504}
+            elif isinstance(retry_on, int):
+                retry_status_codes = {retry_on}
+            else:
+                retry_status_codes = set(retry_on)
+            backoff_base = 2.0 if retry_backoff is None else retry_backoff
-            # 2. From Pool (if no override)
-            if not selected_proxy:
-                selected_proxy = self.proxy_pool.get(url=url, location=location)
+            last_resp: Response | None = None
-            if selected_proxy:
-                span.set_attribute("phantomfetch.proxy", selected_proxy.url)
-                if selected_proxy.vendor:
-                    span.set_attribute(
-                        "phantomfetch.proxy.vendor", selected_proxy.vendor
-                    )
-                if selected_proxy.proxy_type:
-                    span.set_attribute(
-                        "phantomfetch.proxy.type", selected_proxy.proxy_type
-                    )
-                if selected_proxy.location:
-                    span.set_attribute(
-                        "phantomfetch.proxy.location", selected_proxy.location
+            for attempt in range(attempts):
+                # Get proxy
+                # 1. Manual override
+                selected_proxy: Proxy | None = None
+                if proxy:
+                    if isinstance(proxy, str):
+                        selected_proxy = Proxy(
+                            url=proxy, metadata={"source": "manual_override"}
+                        )
+                    else:
+                        selected_proxy = proxy
+                # 2. From Pool (if no override)
+                if not selected_proxy:
+                    selected_proxy = self.proxy_pool.get(url=url, location=location)
+                if selected_proxy:
+                    span.set_attribute("phantomfetch.proxy", selected_proxy.url)
+                    if selected_proxy.vendor:
+                        span.set_attribute(
+                            "phantomfetch.proxy.vendor", selected_proxy.vendor
+                        )
+                    if selected_proxy.proxy_type:
+                        span.set_attribute(
+                            "phantomfetch.proxy.type", selected_proxy.proxy_type
+                        )
+                    if selected_proxy.location:
+                        span.set_attribute(
+                            "phantomfetch.proxy.location", selected_proxy.location
+                        )
+                    if selected_proxy.provider:
+                        span.set_attribute(
+                            "phantomfetch.proxy.provider", selected_proxy.provider
+                        )
+                # Route to engine
+                if engine == "browser":
+                    resp = await self._fetch_browser(
+                        url=url,
+                        proxy=selected_proxy,
+                        headers=headers,
+                        cookies=cookies,
+                        actions=normalized_actions,
+                        timeout=timeout or self.browser_timeout,
+                        location=location,
+                        wait_until=wait_until,
+                        block_resources=block_resources,
+                        wait_for_url=wait_for_url,
+                        storage_state=self.session_data,  # Pass current session
+                        stealth=stealth,
                     )
-                if selected_proxy.provider:
-                    span.set_attribute(
-                        "phantomfetch.proxy.provider", selected_proxy.provider
+                else:
+                    resp = await self._fetch_curl(
+                        url=url,
+                        proxy=selected_proxy,
+                        headers=headers,
+                        cookies=cookies,
+                        timeout=timeout or self.timeout,
+                        max_retries=1,  # Prevent internal retries so we can rotate proxies
+                        retry_on=retry_on,
+                        retry_backoff=retry_backoff,
+                        referer=referer,
+                        allow_redirects=allow_redirects,
                     )
-            # Route to engine
-            if engine == "browser":
-                resp = await self._fetch_browser(
-                    url=url,
-                    proxy=selected_proxy,
-                    headers=headers,
-                    cookies=cookies,
-                    actions=normalized_actions,
-                    timeout=timeout or self.browser_timeout,
-                    location=location,
-                    wait_until=wait_until,
-                    block_resources=block_resources,
-                    wait_for_url=wait_for_url,
-                    storage_state=self.session_data,  # Pass current session
-                    stealth=stealth,
-                    max_retries=self.max_retries
-                    if max_retries is None
-                    else max_retries,
-                    retry_on=retry_on,
-                    retry_backoff=retry_backoff,
+                last_resp = resp
+                # Evaluate proxy status
+                if not proxy and selected_proxy:
+                    if resp.ok:
+                        self.proxy_pool.mark_success(selected_proxy)
+                    elif resp.error or resp.status in retry_status_codes:
+                        # Mark proxy as failed so a fresh one is used on retry
+                        self.proxy_pool.mark_failed(selected_proxy)
+                # Should we retry?
+                should_retry = (
+                    attempt < attempts - 1 and resp.status in retry_status_codes
                 )
-            else:
-                resp = await self._fetch_curl(
+                if not should_retry:
+                    break
+                # Backoff before getting a new proxy
+                wait = backoff_base**attempt * (0.5 + random.random())
+                logger.warning(
+                    f"[{engine}] Retryable status {resp.status}, "
+                    f"attempt {attempt + 1}/{attempts}, retrying in {wait:.2f}s"
+                )
+                await asyncio.sleep(wait)
+            if not last_resp:
+                last_resp = Response(
                     url=url,
-                    proxy=selected_proxy,
-                    headers=headers,
-                    cookies=cookies,
-                    timeout=timeout or self.timeout,
-                    max_retries=self.max_retries
-                    if max_retries is None
-                    else max_retries,
-                    retry_on=retry_on,
-                    retry_backoff=retry_backoff,
-                    referer=referer,
-                    allow_redirects=allow_redirects,
+                    status=0,
+                    body=b"",
+                    engine=engine,
+                    error="Max retries exhausted",
                 )
+            resp = last_resp
             # Update session data from response if present
             if resp.storage_state:
                 self.session_data = resp.storage_state
-            # Update proxy stats (ONLY if it came from the pool, or generally?)
-            # If manual override, we might NOT want to impact the pool stats unless the manual proxy IS in the pool?
-            # For simplicity, if we have a pool, and this proxy matches one in the pool, we could update it.
-            # But the 'selected_proxy' might be a new instance specific to this request (manual override).
-            # The pool.mark_* methods take a Proxy object.
-            # Logic: If 'proxy' argument was None, it came from pool -> Update stats.
-            # If 'proxy' argument was set -> Do NOT update pool stats (it's a manual override).
-            if not proxy and selected_proxy:
-                if resp.ok:
-                    self.proxy_pool.mark_success(selected_proxy)
-                elif resp.error:
-                    self.proxy_pool.mark_failed(selected_proxy)
             # Cache response
             if self.cache and resp.ok and self.cache.should_cache_request("document"):
                 # Re-generate key (same logic as above)
@@ -523,9 +553,6 @@ class Fetcher:
         wait_for_url: str | None = None,
         storage_state: dict[str, Any] | None = None,
         stealth: bool = False,
-        max_retries: int = 1,
-        retry_on: Collection[int] | int | None = None,
-        retry_backoff: float | None = None,
     ) -> Response:
         async with self._semaphore:
             async with self._browser_semaphore:
@@ -538,54 +565,21 @@ class Fetcher:
                         error="CDP engine not initialized",
                     )
-                attempts = max(1, max_retries)
-                if retry_on is None:
-                    retry_status_codes: set[int] = set()
-                elif isinstance(retry_on, int):
-                    retry_status_codes = {retry_on}
-                else:
-                    retry_status_codes = set(retry_on)
-                backoff_base = 2.0 if retry_backoff is None else retry_backoff
-                last_response: Response | None = None
-                for attempt in range(attempts):
-                    response = await self._cdp_engine.fetch(
-                        url=url,
-                        proxy=proxy,
-                        headers=headers,
-                        cookies=cookies,
-                        actions=actions,
-                        timeout=timeout,
-                        location=location,
-                        wait_until=wait_until,
-                        block_resources=block_resources,
-                        wait_for_url=wait_for_url,
-                        storage_state=storage_state,
-                        stealth=stealth,
-                    )
-                    last_response = response
-                    should_retry = (
-                        attempt < attempts - 1 and response.status in retry_status_codes
-                    )
-                    if not should_retry:
-                        return response
-                    wait = backoff_base**attempt * (0.5 + random.random())
-                    logger.warning(
-                        f"[browser] Retryable status {response.status}, "
-                        f"attempt {attempt + 1}/{attempts}, retrying in {wait:.2f}s"
-                    )
-                    await asyncio.sleep(wait)
-                return last_response or Response(
+                response = await self._cdp_engine.fetch(
                     url=url,
-                    status=0,
-                    body=b"",
-                    engine="browser",
-                    error="Max retries exhausted",
+                    proxy=proxy,
+                    headers=headers,
+                    cookies=cookies,
+                    actions=actions,
+                    timeout=timeout,
+                    location=location,
+                    wait_until=wait_until,
+                    block_resources=block_resources,
+                    wait_for_url=wait_for_url,
+                    storage_state=storage_state,
+                    stealth=stealth,
                 )
+                return response
     async def _fetch_curl(
         self,

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/README.md RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/__init__.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/cache.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/captcha.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/__init__.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/base.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/browser/__init__.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/browser/cdp.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/engines/curl.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/pool.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/presets.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/registry.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/strategy_advisor.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/telemetry.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/adaptive/__init__.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/adaptive/fingerprint.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/adaptive/matcher.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/adaptive/store.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/tools/selector_builder.py RENAMED Viewed

File without changes

{phantomfetch-0.6.0 → phantomfetch-0.6.2}/src/phantomfetch/types.py RENAMED Viewed

File without changes

phantomfetch 0.6.0 (tar.gz) → 0.6.2 (tar.gz)

phantomfetch 0.6.0 (tar.gz) → 0.6.2 (tar.gz)