fraudcrawler 0.4.7__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fraudcrawler/__init__.py +7 -5
- fraudcrawler/base/base.py +64 -32
- fraudcrawler/base/client.py +27 -11
- fraudcrawler/base/orchestrator.py +103 -25
- fraudcrawler/base/retry.py +5 -2
- fraudcrawler/launch_demo_pipeline.py +9 -9
- fraudcrawler/processing/processor.py +9 -5
- fraudcrawler/scraping/enrich.py +38 -21
- fraudcrawler/scraping/search.py +664 -0
- fraudcrawler/scraping/zyte.py +37 -15
- fraudcrawler/settings.py +13 -2
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.0.dist-info}/METADATA +6 -2
- fraudcrawler-0.5.0.dist-info/RECORD +22 -0
- fraudcrawler/scraping/serp.py +0 -515
- fraudcrawler-0.4.7.dist-info/RECORD +0 -22
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.0.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.0.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.0.dist-info}/entry_points.txt +0 -0
fraudcrawler/__init__.py
CHANGED
@@ -1,7 +1,7 @@
-from fraudcrawler.scraping.
+from fraudcrawler.scraping.search import Search, SearchEngineName
 from fraudcrawler.scraping.enrich import Enricher
 from fraudcrawler.scraping.url import URLCollector
-from fraudcrawler.scraping.zyte import
+from fraudcrawler.scraping.zyte import ZyteAPI
 from fraudcrawler.processing.processor import Processor
 from fraudcrawler.base.orchestrator import Orchestrator
 from fraudcrawler.base.client import FraudCrawlerClient
@@ -13,14 +13,15 @@ from fraudcrawler.base.base import (
     Location,
     Prompt,
     ProductItem,
+    HttpxAsyncClient,
 )

 __all__ = [
-    "
-    "
+    "Search",
+    "SearchEngineName",
     "Enricher",
     "URLCollector",
-    "
+    "ZyteAPI",
     "Processor",
     "Orchestrator",
     "ProductItem",
@@ -31,4 +32,5 @@ __all__ = [
     "Deepness",
     "Enrichment",
     "Prompt",
+    "HttpxAsyncClient",
 ]
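The net effect of these hunks is a renamed scraping surface: the truncated old imports give way to Search, SearchEngineName, and ZyteAPI, and HttpxAsyncClient is newly re-exported. A minimal sketch of importing the 0.5.0 public API, using only names visible in this diff (the removed 0.4.7 names are cut off in this view and are not reconstructed here):

# Sketch: the 0.5.0 public surface as re-exported by fraudcrawler/__init__.py.
from fraudcrawler import (
    Search,
    SearchEngineName,
    Enricher,
    URLCollector,
    ZyteAPI,
    Processor,
    Orchestrator,
    ProductItem,
    HttpxAsyncClient,
)

# SearchEngineName behaves like an enum in 0.5.0 (FraudCrawlerClient.execute coerces
# strings via SearchEngineName(se)), so listing it shows the available engines.
print(list(SearchEngineName))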
fraudcrawler/base/base.py
CHANGED
@@ -7,15 +7,21 @@ from pydantic import (
     model_validator,
 )
 from pydantic_settings import BaseSettings
+from urllib.parse import urlparse
 import re
-from typing import
+from typing import Any, Dict, List

-import
+import httpx

 from fraudcrawler.settings import (
     GOOGLE_LANGUAGES_FILENAME,
     GOOGLE_LOCATIONS_FILENAME,
 )
+from fraudcrawler.settings import (
+    DEFAULT_HTTPX_TIMEOUT,
+    DEFAULT_HTTPX_LIMITS,
+    DEFAULT_HTTPX_REDIRECTS,
+)

 logger = logging.getLogger(__name__)

@@ -130,7 +136,8 @@ class ProductItem(BaseModel):
     search_term: str
     search_term_type: str
     url: str
-
+    url_resolved: str
+    search_engine_name: str
     domain: str

     # Zyte parameters
@@ -180,32 +187,57 @@ class Prompt(BaseModel):
         return val


-class AsyncClient:
-    """
-    … (27 further removed lines; their content is not rendered in this view)
+class HttpxAsyncClient(httpx.AsyncClient):
+    """Httpx async client that can be used to retain the default settings."""
+
+    def __init__(
+        self,
+        timeout: httpx.Timeout | Dict[str, Any] = DEFAULT_HTTPX_TIMEOUT,
+        limits: httpx.Limits | Dict[str, Any] = DEFAULT_HTTPX_LIMITS,
+        follow_redirects: bool = DEFAULT_HTTPX_REDIRECTS,
+        **kwargs: Any,
+    ) -> None:
+        if isinstance(timeout, dict):
+            timeout = httpx.Timeout(**timeout)
+        if isinstance(limits, dict):
+            limits = httpx.Limits(**limits)
+
+        kwargs.setdefault("timeout", timeout)
+        kwargs.setdefault("limits", limits)
+        kwargs.setdefault("follow_redirects", follow_redirects)
+        super().__init__(**kwargs)
+
+
+class DomainUtils:
+    """Utility class for domain extraction and normalization.
+
+    Handles domain parsing from URLs, removes common prefixes (www, http/https),
+    and provides consistent domain formatting for search and scraping operations.
+    """
+
+    _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
+
+    def _get_domain(self, url: str) -> str:
+        """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
+
+        Args:
+            url: The URL to be processed.
+        """
+        # Add scheme; urlparse requires it
+        if not url.startswith(("http://", "https://")):
+            url = "http://" + url
+
+        # Get the hostname
+        hostname = urlparse(url).hostname
+        if hostname is None and (match := re.search(self._hostname_pattern, url)):
+            hostname = match.group(1)
+        if hostname is None:
+            logger.warning(
+                f'Failed to extract domain from url="{url}"; full url is returned'
+            )
+            return url.lower()
+
+        # Remove www. prefix
+        if hostname and hostname.startswith("www."):
+            hostname = hostname[4:]
+        return hostname.lower()
fraudcrawler/base/client.py
CHANGED
@@ -4,7 +4,7 @@ from datetime import datetime
 import logging
 from pathlib import Path
 from pydantic import BaseModel
-from typing import List
+from typing import List, Self

 import pandas as pd

@@ -19,7 +19,7 @@ from fraudcrawler.base.base import (
     ProductItem,
 )
 from fraudcrawler.base.orchestrator import Orchestrator
-from fraudcrawler.scraping.
+from fraudcrawler.scraping.search import SearchEngineName

 logger = logging.getLogger(__name__)

@@ -53,6 +53,13 @@ class FraudCrawlerClient(Orchestrator):
         self._results_dir.mkdir(parents=True)
         self._results: List[Results] = []

+    async def __aenter__(self) -> Self:
+        await super().__aenter__()  # let base set itself up
+        return self  # so `async with FraudCrawlerClient()` gives you this instance
+
+    async def __aexit__(self, *args, **kwargs) -> None:
+        await super().__aexit__(*args, **kwargs)
+
     async def _collect_results(
         self, queue_in: asyncio.Queue[ProductItem | None]
     ) -> None:
@@ -93,7 +100,8 @@ class FraudCrawlerClient(Orchestrator):
         prompts: List[Prompt],
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
-        search_engines: List[
+        search_engines: List[SearchEngineName | str] | None = None,
+        previously_collected_urls: List[str] | None = None,
     ) -> None:
         """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.

@@ -103,8 +111,10 @@ class FraudCrawlerClient(Orchestrator):
             location: The location to use for the query.
             deepness: The search depth and enrichment details.
             prompts: The list of prompts to use for classification.
-            marketplaces: The marketplaces to include in the search.
-            excluded_urls: The URLs to exclude from the search.
+            marketplaces: The marketplaces to include in the search (optional).
+            excluded_urls: The URLs to exclude from the search (optional).
+            search_engines: The list of search engines to use for the search (optional).
+            previously_collected_urls: The urls that have been collected previously and are ignored (optional).
         """
         # Handle results files
         timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
@@ -116,24 +126,30 @@ class FraudCrawlerClient(Orchestrator):
         )
         self._results.append(Results(search_term=search_term, filename=filename))

-        # Normalize inputs
-
+        # Normalize inputs - convert strings to SearchEngineName enum values
+        nrm_search_engines = list(SearchEngineName)
         if search_engines:
-
-
+            nrm_search_engines = [
+                SearchEngineName(se) if isinstance(se, str) else se
+                for se in search_engines
             ]

         # Run the pipeline by calling the orchestrator's run method
+        async def _run(*args, **kwargs):
+            async with self:
+                return await super(FraudCrawlerClient, self).run(*args, **kwargs)
+
         asyncio.run(
-
+            _run(
                 search_term=search_term,
-                search_engines=
+                search_engines=nrm_search_engines,
                 language=language,
                 location=location,
                 deepness=deepness,
                 prompts=prompts,
                 marketplaces=marketplaces,
                 excluded_urls=excluded_urls,
+                previously_collected_urls=previously_collected_urls,
             )
         )

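The client now accepts search_engines as enum members or plain strings and normalizes them before handing off to the orchestrator, and execute() wraps the orchestrator's run() in `async with self` so the shared httpx client is opened and closed around the pipeline. A standalone sketch of the normalization step; the engine values below ("google", "bing") are illustrative stand-ins, the real members of SearchEngineName are defined in fraudcrawler/scraping/search.py, which is not shown here:

# Sketch of the search_engines normalization done in FraudCrawlerClient.execute.
from enum import Enum
from typing import List


class SearchEngineName(str, Enum):  # stand-in for the real enum
    GOOGLE = "google"
    BING = "bing"


def normalize(search_engines: List[SearchEngineName | str] | None) -> List[SearchEngineName]:
    # Default to every engine, mirroring `nrm_search_engines = list(SearchEngineName)`.
    if not search_engines:
        return list(SearchEngineName)
    # Strings are coerced via the enum's value lookup; enum members pass through.
    return [SearchEngineName(se) if isinstance(se, str) else se for se in search_engines]


print(normalize(None))                                   # all engines
print(normalize(["google", SearchEngineName.BING]))      # mixed input is fine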
fraudcrawler/base/orchestrator.py
CHANGED

@@ -1,9 +1,10 @@
 from abc import ABC, abstractmethod
 import asyncio
 import logging
-from typing import Dict, List,
+from typing import cast, Dict, List, Self

 from bs4 import BeautifulSoup
+import httpx

 from fraudcrawler.settings import (
     PROCESSOR_DEFAULT_MODEL,
@@ -20,13 +21,14 @@ from fraudcrawler.base.base import (
     Location,
     Prompt,
     ProductItem,
+    HttpxAsyncClient,
 )
 from fraudcrawler import (
-
-
+    Search,
+    SearchEngineName,
     Enricher,
     URLCollector,
-
+    ZyteAPI,
     Processor,
 )

@@ -60,9 +62,18 @@ class Orchestrator(ABC):
         n_serp_wkrs: int = DEFAULT_N_SERP_WKRS,
         n_zyte_wkrs: int = DEFAULT_N_ZYTE_WKRS,
         n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
+        # Configure a custom httpx client.
+        # We provide a `HttpxAsyncClient` class that you can pass
+        # to retain the default values we use for `limits`, `timeout` & `follow_redirects`.
+        http_client: httpx.AsyncClient | None = None,
     ):
         """Initializes the orchestrator with the given settings.

+        NOTE:
+            The class:`Orchestrator` must be used as context manager as follows:
+                async with Orchestrator(...) as orchestrator:
+                    await orchestrator.run()
+
         Args:
             serpapi_key: The API key for SERP API.
             dataforseo_user: The user for DataForSEO.
@@ -73,16 +84,16 @@ class Orchestrator(ABC):
             n_serp_wkrs: Number of async workers for serp (optional).
             n_zyte_wkrs: Number of async workers for zyte (optional).
             n_proc_wkrs: Number of async workers for the processor (optional).
+            http_client: An httpx.AsyncClient to use for the async requests (optional).
         """
-
-
-        self.
-        self.
-        self.
-        self.
-
-
-        )
+
+        # Store the variables for setting up the clients
+        self._serpapi_key = serpapi_key
+        self._dataforseo_user = dataforseo_user
+        self._dataforseo_pwd = dataforseo_pwd
+        self._zyteapi_key = zyteapi_key
+        self._openaiapi_key = openaiapi_key
+        self._openai_model = openai_model

         # Setup the async framework
         self._n_serp_wkrs = n_serp_wkrs
@@ -91,12 +102,50 @@ class Orchestrator(ABC):
         self._queues: Dict[str, asyncio.Queue] | None = None
         self._workers: Dict[str, List[asyncio.Task] | asyncio.Task] | None = None

+        # Setup the httpx client
+        self._http_client = http_client
+        self._owns_http_client = http_client is None
+
+    async def __aenter__(self) -> Self:
+        """Creates and starts an httpx.AsyncClient if not provided."""
+        if self._http_client is None:
+            logger.debug("Creating a new httpx.AsyncClient owned by the orchestrator")
+            self._http_client = HttpxAsyncClient()
+            self._owns_http_client = True
+
+        # Setup the clients
+        self._search = Search(
+            http_client=self._http_client, serpapi_key=self._serpapi_key
+        )
+        self._enricher = Enricher(
+            http_client=self._http_client,
+            user=self._dataforseo_user,
+            pwd=self._dataforseo_pwd,
+        )
+        self._url_collector = URLCollector()
+        self._zyteapi = ZyteAPI(
+            http_client=self._http_client, api_key=self._zyteapi_key
+        )
+        self._processor = Processor(
+            http_client=self._http_client,
+            api_key=self._openaiapi_key,
+            model=self._openai_model,
+        )
+        return self
+
+    async def __aexit__(self, *args, **kwargs) -> None:
+        """Closes the httpx.AsyncClient if it was created by this orchestrator."""
+        if self._owns_http_client and self._http_client is not None:
+            logger.debug("Closing the httpx.AsyncClient owned by the orchestrator")
+            await self._http_client.aclose()
+            self._http_client = None
+
     async def _serp_execute(
         self,
         queue_in: asyncio.Queue[dict | None],
         queue_out: asyncio.Queue[ProductItem | None],
     ) -> None:
-        """Collects the
+        """Collects the search setups from the queue_in, executes the search, filters the results and puts them into queue_out.

         Args:
             queue_in: The input queue containing the search parameters.
@@ -110,23 +159,30 @@ class Orchestrator(ABC):

             try:
                 search_term_type = item.pop("search_term_type")
-
+                # The search_engines are already SearchEngineName enum values
+                search_engines = item.pop("search_engines")
+
+                results = await self._search.apply(
+                    **item, search_engines=search_engines
+                )
+
                 logger.debug(
-                    f"
+                    f"Search for {item['search_term']} returned {len(results)} results"
                 )
                 for res in results:
                     product = ProductItem(
                         search_term=item["search_term"],
                         search_term_type=search_term_type,
                         url=res.url,
-
+                        url_resolved=res.url,  # Set initial value, will be updated by Zyte
+                        search_engine_name=res.search_engine_name,
                         domain=res.domain,
                         filtered=res.filtered,
                         filtered_at_stage=res.filtered_at_stage,
                     )
                     await queue_out.put(product)
             except Exception as e:
-                logger.error(f"Error executing
+                logger.error(f"Error executing search: {e}")
             queue_in.task_done()

     async def _collect_url(
@@ -191,10 +247,22 @@ class Orchestrator(ABC):
             if not product.filtered:
                 try:
                     # Fetch the product details from Zyte API
-                    details = await self._zyteapi.
+                    details = await self._zyteapi.details(url=product.url)
+                    url_resolved = self._zyteapi.extract_url_resolved(details=details)
+                    if url_resolved:
+                        product.url_resolved = url_resolved
                     product.product_name = self._zyteapi.extract_product_name(
                         details=details
                     )
+
+                    # If the resolved URL is different from the original URL, we also need to update the domain as
+                    # otherwise the unresolved domain will be shown, for example for unresolved domain toppreis.ch but resolved digitec.ch
+                    if url_resolved and url_resolved != product.url:
+                        logger.debug(
+                            f"URL resolved for {product.url} is {url_resolved}"
+                        )
+                        product.domain = self._search._get_domain(url_resolved)
+
                     product.product_price = self._zyteapi.extract_product_price(
                         details=details
                     )
@@ -362,7 +430,7 @@ class Orchestrator(ABC):
         queue: asyncio.Queue[dict | None],
         search_term: str,
         search_term_type: str,
-        search_engines: List[
+        search_engines: List[SearchEngineName],
         language: Language,
         location: Location,
         num_results: int,
@@ -387,7 +455,7 @@ class Orchestrator(ABC):
         self,
         queue: asyncio.Queue[dict | None],
         search_term: str,
-        search_engines: List[
+        search_engines: List[SearchEngineName],
         language: Language,
         location: Location,
         deepness: Deepness,
@@ -417,7 +485,7 @@ class Orchestrator(ABC):
         if enrichment:
             # Call DataForSEO to get additional terms
             n_terms = enrichment.additional_terms
-            terms = await self._enricher.
+            terms = await self._enricher.enrich(
                 search_term=search_term,
                 language=language,
                 location=location,
@@ -437,7 +505,7 @@ class Orchestrator(ABC):
     async def run(
         self,
         search_term: str,
-        search_engines: List[
+        search_engines: List[SearchEngineName],
         language: Language,
         location: Location,
         deepness: Deepness,
@@ -450,7 +518,7 @@ class Orchestrator(ABC):

         Args:
             search_term: The search term for the query.
-            search_engines: The list of search engines to use for the
+            search_engines: The list of search engines to use for the search query.
             language: The language to use for the query.
             location: The location to use for the query.
             deepness: The search depth and enrichment details.
@@ -459,10 +527,17 @@ class Orchestrator(ABC):
             excluded_urls: The URLs to exclude from the search.
             previously_collected_urls: The urls that have been collected previously and are ignored.
         """
-
         # ---------------------------
         # INITIAL SETUP
         # ---------------------------
+        # Ensure we have at least one search engine
+        if not search_engines:
+            logger.warning(
+                "No search engines specified, using all available search engines"
+            )
+            search_engines = list(SearchEngineName)
+
+        # Handle previously collected URLs
         if previously_collected_urls:
             self._url_collector.collected_previously = set(previously_collected_urls)

@@ -614,4 +689,7 @@ class Orchestrator(ABC):
         finally:
             await res_queue.join()

+        # ---------------------------
+        # CLOSING PIPELINE
+        # ---------------------------
         logger.info("Pipeline concluded; async framework is closed")
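The orchestrator is now an async context manager: __aenter__ builds (or adopts) the shared httpx.AsyncClient and wires it into Search, Enricher, ZyteAPI and Processor, while __aexit__ closes the client only when the orchestrator created it itself. A sketch of that ownership pattern in isolation; the ClientOwner class below is invented for illustration and is not part of the package:

import asyncio

import httpx


class ClientOwner:
    """Minimal stand-in for the orchestrator's httpx-client lifecycle."""

    def __init__(self, http_client: httpx.AsyncClient | None = None) -> None:
        self._http_client = http_client
        self._owns_http_client = http_client is None  # only close what we created

    async def __aenter__(self) -> "ClientOwner":
        if self._http_client is None:
            self._http_client = httpx.AsyncClient()
            self._owns_http_client = True
        return self

    async def __aexit__(self, *exc) -> None:
        if self._owns_http_client and self._http_client is not None:
            await self._http_client.aclose()
            self._http_client = None


async def main() -> None:
    # Caller-supplied client: left open after the block, reusable elsewhere.
    async with httpx.AsyncClient() as shared:
        async with ClientOwner(http_client=shared):
            pass
        assert not shared.is_closed
    # No client supplied: the owner creates and closes its own.
    async with ClientOwner():
        pass


asyncio.run(main())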
fraudcrawler/base/retry.py
CHANGED
@@ -1,4 +1,4 @@
-from
+from httpx import HTTPStatusError
 from tenacity import (
     AsyncRetrying,
     retry_if_exception,
@@ -17,7 +17,10 @@ from fraudcrawler.settings import (


 def _is_retryable_exception(err: BaseException) -> bool:
-    if
+    if (
+        isinstance(err, HTTPStatusError)
+        and err.response.status_code in RETRY_SKIP_IF_CODE
+    ):
         return False
     return True

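retry.py now keys its predicate on httpx's HTTPStatusError and skips retries for status codes listed in RETRY_SKIP_IF_CODE. A hedged sketch of how such a predicate plugs into tenacity's AsyncRetrying; the skip codes below are placeholders, not the package's actual setting:

import httpx
from tenacity import AsyncRetrying, retry_if_exception, stop_after_attempt, wait_fixed

ASSUMED_SKIP_CODES = {401, 403, 404}  # placeholder for RETRY_SKIP_IF_CODE


def _is_retryable_exception(err: BaseException) -> bool:
    # Don't retry responses whose status code marks a permanent failure.
    if isinstance(err, httpx.HTTPStatusError) and err.response.status_code in ASSUMED_SKIP_CODES:
        return False
    return True


async def fetch(url: str) -> httpx.Response:
    async with httpx.AsyncClient() as client:
        async for attempt in AsyncRetrying(
            retry=retry_if_exception(_is_retryable_exception),
            stop=stop_after_attempt(3),
            wait=wait_fixed(1),
            reraise=True,
        ):
            with attempt:
                response = await client.get(url)
                response.raise_for_status()  # raises HTTPStatusError on 4xx/5xx
                return response
    raise RuntimeError("unreachable")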
fraudcrawler/launch_demo_pipeline.py
CHANGED

@@ -54,17 +54,17 @@ def search(search_term: str):

     # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)

-    #
-
+    # Optional: Add MARKETPLACES and EXCLUDED_URLS
+    from fraudcrawler import Host

     # marketplaces = [
     #     Host(name="International", domains="zavamed.com,apomeds.com"),
-    #     Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
-    # ]
-    # excluded_urls = [
-    #     Host(name="Digitec", domains="digitec.ch"),
-    #     Host(name="Brack", domains="brack.ch"),
+    #     # Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
     # ]
+    excluded_urls = [
+        Host(name="Digitec", domains="digitec.ch"),
+        Host(name="Brack", domains="brack.ch"),
+    ]

     # Execute the pipeline
     client.execute(
@@ -74,7 +74,7 @@ def search(search_term: str):
         deepness=deepness,
         prompts=prompts,
         # marketplaces=marketplaces,
-
+        excluded_urls=excluded_urls,
     )

     # Show results
@@ -97,4 +97,4 @@ def search(search_term: str):


 if __name__ == "__main__":
-    search(search_term
+    search(search_term='Liebherr "TP1410"')
fraudcrawler/processing/processor.py
CHANGED

@@ -1,5 +1,6 @@
 import logging

+import httpx
 from openai import AsyncOpenAI
 from tenacity import RetryCallState

@@ -21,6 +22,7 @@ class Processor:

     def __init__(
         self,
+        http_client: httpx.AsyncClient,
         api_key: str,
         model: str,
         default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
@@ -29,12 +31,13 @@ class Processor:
         """Initializes the Processor.

         Args:
+            http_client: An httpx.AsyncClient to use for the async requests.
             api_key: The OpenAI API key.
             model: The OpenAI model to use.
             default_if_missing: The default classification to return if error occurs.
             empty_token_count: The default value to return as tokensif the classification is empty.
         """
-        self._client = AsyncOpenAI(api_key=api_key)
+        self._client = AsyncOpenAI(http_client=http_client, api_key=api_key)
         self._model = model
         self._error_response = ClassificationResult(
             result=default_if_missing,
@@ -59,7 +62,7 @@ class Processor:
                    )
                )
            else:
-                logger.
+                logger.warning(
                    f'Field "{field}" is missing in ProductItem with url="{product.url}"'
                )
        return "\n\n".join(details)
@@ -101,9 +104,10 @@ class Processor:
             ],
             **kwargs,
         )
-        content
-
-
+        if not response or not (content := response.choices[0].message.content):
+            raise ValueError(
+                f'Error calling OpenAI API or empty response="{response}".'
+            )

         # Convert the content to an integer
         try: