PyPI - fraudcrawler - Versions diffs - 0.5.6__tar.gz → 0.5.8__tar.gz - Mend

fraudcrawler 0.5.6__tar.gz → 0.5.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of fraudcrawler might be problematic. Click here for more details.

Files changed (21) hide show

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fraudcrawler
-Version: 0.5.6
+Version: 0.5.8
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/base/base.py RENAMED Viewed

@@ -9,7 +9,7 @@ from pydantic import (
 from pydantic_settings import BaseSettings
 from urllib.parse import urlparse
 import re
-from typing import Any, Dict, List
+from typing import Any, Dict, List, TYPE_CHECKING
 import httpx
@@ -23,6 +23,9 @@ from fraudcrawler.settings import (
     DEFAULT_HTTPX_REDIRECTS,
 )
+if TYPE_CHECKING:
+    from fraudcrawler.scraping.zyte import ZyteAPI
 logger = logging.getLogger(__name__)
 # Load google locations and languages
@@ -241,3 +244,35 @@ class DomainUtils:
         if hostname and hostname.startswith("www."):
             hostname = hostname[4:]
         return hostname.lower()
+    async def _unblock_url(self, url: str, zyte_api: "ZyteAPI") -> bytes | None:
+        """Attempts to unblock a URL using Zyte proxy mode when direct access fails.
+        This method is specifically designed to handle 403 Forbidden errors for domains
+        that may be blocking requests from certain IP ranges (like cloud providers).
+        Args:
+            url: The URL to fetch using Zyte proxy mode.
+            zyte_api: An instance of ZyteAPI to use for the request.
+        Returns:
+            The HTML content as bytes if successful, None if failed.
+        """
+        try:
+            logger.info(f"Attempting to unblock URL using Zyte proxy: {url}")
+            details = await zyte_api.details(url)
+            if details and "httpResponseBody" in details:
+                # Decode the base64 content
+                import base64
+                html_content = base64.b64decode(details["httpResponseBody"])
+                logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
+                return html_content
+            else:
+                logger.warning(f"Zyte proxy request failed for URL: {url}")
+                return None
+        except Exception as e:
+            logger.error(f"Error unblocking URL with Zyte proxy: {url}, error: {e}")
+            return None

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/base/orchestrator.py RENAMED Viewed

@@ -114,8 +114,13 @@ class Orchestrator(ABC):
             self._owns_http_client = True
         # Setup the clients
+        self._zyteapi = ZyteAPI(
+            http_client=self._http_client, api_key=self._zyteapi_key
+        )
         self._search = Search(
-            http_client=self._http_client, serpapi_key=self._serpapi_key
+            http_client=self._http_client,
+            serpapi_key=self._serpapi_key,
+            zyte_api=self._zyteapi,
         )
         self._enricher = Enricher(
             http_client=self._http_client,
@@ -123,9 +128,6 @@ class Orchestrator(ABC):
             pwd=self._dataforseo_pwd,
         )
         self._url_collector = URLCollector()
-        self._zyteapi = ZyteAPI(
-            http_client=self._http_client, api_key=self._zyteapi_key
-        )
         self._processor = Processor(
             http_client=self._http_client,
             api_key=self._openaiapi_key,

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/scraping/search.py RENAMED Viewed

@@ -376,13 +376,15 @@ class Toppreise(SearchEngine):
         "Upgrade-Insecure-Requests": "1",
     }
-    def __init__(self, http_client: httpx.AsyncClient):
+    def __init__(self, http_client: httpx.AsyncClient, zyte_api=None):
         """Initializes the Toppreise client.
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
+            zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
         """
         self._http_client = http_client
+        self._zyte_api = zyte_api
     @property
     def _search_engine_name(self) -> str:
@@ -448,16 +450,32 @@ class Toppreise(SearchEngine):
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
             search_string=search_string, retry_state=retry_state
         )
-        async for attempt in retry:
-            with attempt:
-                response = await self._http_client.get(
-                    url=url,
-                    headers=self._headers,
+        content = None
+        try:
+            async for attempt in retry:
+                with attempt:
+                    response = await self._http_client.get(
+                        url=url,
+                        headers=self._headers,
+                    )
+                    response.raise_for_status()
+                    content = response.content
+        except httpx.HTTPStatusError as e:
+            if e.response.status_code == 403 and self._zyte_api:
+                logger.warning(
+                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
                 )
-                response.raise_for_status()
+                content = await self._unblock_url(url, self._zyte_api)
+                if content is None:
+                    raise e  # Re-raise if zyte fallback also failed
+            else:
+                raise e
+        if content is None:
+            raise httpx.HTTPError("Failed to fetch content")
         # Get external product urls from the content
-        content = response.content
         urls = self._get_external_product_urls(content=content)
         urls = urls[:num_results]  # Limit to num_results if needed
@@ -491,18 +509,19 @@ class Toppreise(SearchEngine):
 class Search(DomainUtils):
     """Class to perform searches using different search engines."""
-    def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str):
+    def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str, zyte_api=None):
         """Initializes the Search class with the given SerpAPI key.
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
             serpapi_key: The API key for SERP API.
+            zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
         """
         self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
         self._google_shopping = SerpAPIGoogleShopping(
             http_client=http_client, api_key=serpapi_key
         )
-        self._toppreise = Toppreise(http_client=http_client)
+        self._toppreise = Toppreise(http_client=http_client, zyte_api=zyte_api)
     @staticmethod
     def _domain_in_host(domain: str, host: Host) -> bool:

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "fraudcrawler"
-version = "0.5.6"
+version = "0.5.8"
 description = "Intelligent Market Monitoring"
 authors = [
     "Domingo Bertus <hello@veanu.ch>",

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/LICENSE RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/README.md RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/__init__.py RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/base/__init__.py RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/base/client.py RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/base/google-languages.json RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/base/google-locations.json RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/base/retry.py RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/launch_demo_pipeline.py RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/processing/__init__.py RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/processing/processor.py RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/scraping/__init__.py RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/scraping/enrich.py RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/scraping/url.py RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/scraping/zyte.py RENAMED Viewed

File without changes

{fraudcrawler-0.5.6 → fraudcrawler-0.5.8}/fraudcrawler/settings.py RENAMED Viewed

File without changes

fraudcrawler 0.5.6__tar.gz → 0.5.8__tar.gz

Potentially problematic release.

fraudcrawler 0.5.6__tar.gz → 0.5.8__tar.gz