fraudcrawler 0.5.8-py3-none-any.whl → 0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of fraudcrawler might be problematic.
fraudcrawler/__init__.py CHANGED
@@ -1,4 +1,4 @@
- from fraudcrawler.scraping.search import Search, SearchEngineName
+ from fraudcrawler.scraping.search import Searcher, SearchEngineName
  from fraudcrawler.scraping.enrich import Enricher
  from fraudcrawler.scraping.url import URLCollector
  from fraudcrawler.scraping.zyte import ZyteAPI
@@ -17,7 +17,7 @@ from fraudcrawler.base.base import (
  )

  __all__ = [
- "Search",
+ "Searcher",
  "SearchEngineName",
  "Enricher",
  "URLCollector",
fraudcrawler/base/base.py CHANGED
@@ -9,7 +9,8 @@ from pydantic import (
  from pydantic_settings import BaseSettings
  from urllib.parse import urlparse
  import re
- from typing import Any, Dict, List, TYPE_CHECKING
+ from typing import Any, Dict, List
+

  import httpx

@@ -23,9 +24,6 @@ from fraudcrawler.settings import (
  DEFAULT_HTTPX_REDIRECTS,
  )

- if TYPE_CHECKING:
- from fraudcrawler.scraping.zyte import ZyteAPI
-
  logger = logging.getLogger(__name__)

  # Load google locations and languages
@@ -135,7 +133,7 @@ class Deepness(BaseModel):
  class ProductItem(BaseModel):
  """Model representing a product item."""

- # Serp/Enrich parameters
+ # Search parameters
  search_term: str
  search_term_type: str
  url: str
@@ -143,7 +141,7 @@ class ProductItem(BaseModel):
  search_engine_name: str
  domain: str

- # Zyte parameters
+ # Context parameters
  product_name: str | None = None
  product_price: str | None = None
  product_description: str | None = None
@@ -244,35 +242,3 @@ class DomainUtils:
  if hostname and hostname.startswith("www."):
  hostname = hostname[4:]
  return hostname.lower()
-
- async def _unblock_url(self, url: str, zyte_api: "ZyteAPI") -> bytes | None:
- """Attempts to unblock a URL using Zyte proxy mode when direct access fails.
-
- This method is specifically designed to handle 403 Forbidden errors for domains
- that may be blocking requests from certain IP ranges (like cloud providers).
-
- Args:
- url: The URL to fetch using Zyte proxy mode.
- zyte_api: An instance of ZyteAPI to use for the request.
-
- Returns:
- The HTML content as bytes if successful, None if failed.
- """
- try:
- logger.info(f"Attempting to unblock URL using Zyte proxy: {url}")
- details = await zyte_api.details(url)
-
- if details and "httpResponseBody" in details:
- # Decode the base64 content
- import base64
-
- html_content = base64.b64decode(details["httpResponseBody"])
- logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
- return html_content
- else:
- logger.warning(f"Zyte proxy request failed for URL: {url}")
- return None
-
- except Exception as e:
- logger.error(f"Error unblocking URL with Zyte proxy: {url}, error: {e}")
- return None
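
The removed helper's one non-obvious detail is that Zyte's `httpResponseBody` comes back base64-encoded. A standalone sketch of just that decoding step, reconstructed from the removed lines (the shape of the `details` payload is assumed to match what `ZyteAPI.details` returned):

```python
import base64


def decode_zyte_body(details: dict | None) -> bytes | None:
    """Decode Zyte's base64-encoded httpResponseBody (mirrors the removed helper)."""
    if not details or "httpResponseBody" not in details:
        return None
    return base64.b64decode(details["httpResponseBody"])
```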
@@ -103,7 +103,7 @@ class FraudCrawlerClient(Orchestrator):
  search_engines: List[SearchEngineName | str] | None = None,
  previously_collected_urls: List[str] | None = None,
  ) -> None:
- """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.
+ """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.

  Args:
  search_term: The search term for the query.
@@ -10,8 +10,8 @@ from fraudcrawler.settings import (
  PROCESSOR_DEFAULT_MODEL,
  )
  from fraudcrawler.settings import (
- DEFAULT_N_SERP_WKRS,
- DEFAULT_N_ZYTE_WKRS,
+ DEFAULT_N_SRCH_WKRS,
+ DEFAULT_N_CNTX_WKRS,
  DEFAULT_N_PROC_WKRS,
  )
  from fraudcrawler.base.base import (
@@ -24,7 +24,7 @@ from fraudcrawler.base.base import (
  HttpxAsyncClient,
  )
  from fraudcrawler import (
- Search,
+ Searcher,
  SearchEngineName,
  Enricher,
  URLCollector,
@@ -59,8 +59,8 @@ class Orchestrator(ABC):
  zyteapi_key: str,
  openaiapi_key: str,
  openai_model: str = PROCESSOR_DEFAULT_MODEL,
- n_serp_wkrs: int = DEFAULT_N_SERP_WKRS,
- n_zyte_wkrs: int = DEFAULT_N_ZYTE_WKRS,
+ n_srch_wkrs: int = DEFAULT_N_SRCH_WKRS,
+ n_cntx_wkrs: int = DEFAULT_N_CNTX_WKRS,
  n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
  # Configure a custom httpx client.
  # We provide a `HttpxAsyncClient` class that you can pass
@@ -81,8 +81,8 @@ class Orchestrator(ABC):
  zyteapi_key: The API key for Zyte API.
  openaiapi_key: The API key for OpenAI.
  openai_model: The model to use for the processing (optional).
- n_serp_wkrs: Number of async workers for serp (optional).
- n_zyte_wkrs: Number of async workers for zyte (optional).
+ n_srch_wkrs: Number of async workers for the search (optional).
+ n_cntx_wkrs: Number of async workers for context extraction (optional).
  n_proc_wkrs: Number of async workers for the processor (optional).
  http_client: An httpx.AsyncClient to use for the async requests (optional).
  """
@@ -96,8 +96,8 @@ class Orchestrator(ABC):
  self._openai_model = openai_model

  # Setup the async framework
- self._n_serp_wkrs = n_serp_wkrs
- self._n_zyte_wkrs = n_zyte_wkrs
+ self._n_srch_wkrs = n_srch_wkrs
+ self._n_cntx_wkrs = n_cntx_wkrs
  self._n_proc_wkrs = n_proc_wkrs
  self._queues: Dict[str, asyncio.Queue] | None = None
  self._workers: Dict[str, List[asyncio.Task] | asyncio.Task] | None = None
@@ -114,13 +114,10 @@ class Orchestrator(ABC):
  self._owns_http_client = True

  # Setup the clients
- self._zyteapi = ZyteAPI(
- http_client=self._http_client, api_key=self._zyteapi_key
- )
- self._search = Search(
+ self._searcher = Searcher(
  http_client=self._http_client,
  serpapi_key=self._serpapi_key,
- zyte_api=self._zyteapi,
+ zyteapi_key=self._zyteapi_key,
  )
  self._enricher = Enricher(
  http_client=self._http_client,
@@ -128,6 +125,10 @@ class Orchestrator(ABC):
  pwd=self._dataforseo_pwd,
  )
  self._url_collector = URLCollector()
+ self._zyteapi = ZyteAPI(
+ http_client=self._http_client,
+ api_key=self._zyteapi_key,
+ )
  self._processor = Processor(
  http_client=self._http_client,
  api_key=self._openaiapi_key,
@@ -142,7 +143,7 @@ class Orchestrator(ABC):
  await self._http_client.aclose()
  self._http_client = None

- async def _serp_execute(
+ async def _srch_execute(
  self,
  queue_in: asyncio.Queue[dict | None],
  queue_out: asyncio.Queue[ProductItem | None],
@@ -160,17 +161,14 @@ class Orchestrator(ABC):
  break

  try:
+ # Execute the search
  search_term_type = item.pop("search_term_type")
- # The search_engines are already SearchEngineName enum values
- search_engines = item.pop("search_engines")
-
- results = await self._search.apply(
- **item, search_engines=search_engines
- )
-
+ results = await self._searcher.apply(**item)
  logger.debug(
  f"Search for {item['search_term']} returned {len(results)} results"
  )
+
+ # Create ProductItems for each result
  for res in results:
  product = ProductItem(
  search_term=item["search_term"],
@@ -205,31 +203,12 @@ class Orchestrator(ABC):
  break

  if not product.filtered:
- # Clean the URL by removing tracking parameters
- url = self._url_collector.remove_tracking_parameters(product.url)
- product.url = url
-
- if url in self._url_collector.collected_currently:
- # deduplicate on current run
- product.filtered = True
- product.filtered_at_stage = (
- "URL collection (current run deduplication)"
- )
- logger.debug(f"URL {url} already collected in current run")
- elif url in self._url_collector.collected_previously:
- # deduplicate on previous runs coming from a db
- product.filtered = True
- product.filtered_at_stage = (
- "URL collection (previous run deduplication)"
- )
- logger.debug(f"URL {url} as already collected in previous run")
- else:
- self._url_collector.collected_currently.add(url)
+ product = await self._url_collector.apply(product=product)

  await queue_out.put(product)
  queue_in.task_done()

- async def _zyte_execute(
+ async def _cntx_execute(
  self,
  queue_in: asyncio.Queue[ProductItem | None],
  queue_out: asyncio.Queue[ProductItem | None],
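
The inline logic removed above now sits behind `URLCollector.apply`; a sketch of what that method plausibly does, reconstructed from the deleted lines (attribute and method names are taken from the removed code, not from the new `url` module itself):

```python
# Reconstruction of the removed inline deduplication as it would look inside
# URLCollector.apply; a sketch, not the shipped implementation.
async def apply(self, product: ProductItem) -> ProductItem:
    # Strip tracking parameters before comparing URLs.
    url = self.remove_tracking_parameters(product.url)
    product.url = url

    if url in self.collected_currently:
        product.filtered = True
        product.filtered_at_stage = "URL collection (current run deduplication)"
    elif url in self.collected_previously:
        product.filtered = True
        product.filtered_at_stage = "URL collection (previous run deduplication)"
    else:
        self.collected_currently.add(url)
    return product
```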
@@ -248,7 +227,7 @@ class Orchestrator(ABC):

  if not product.filtered:
  try:
- # Fetch the product details from Zyte API
+ # Fetch the product context from Zyte API
  details = await self._zyteapi.details(url=product.url)
  url_resolved = self._zyteapi.extract_url_resolved(details=details)
  if url_resolved:
@@ -258,12 +237,13 @@
  )

  # If the resolved URL is different from the original URL, we also need to update the domain as
- # otherwise the unresolved domain will be shown, for example for unresolved domain toppreis.ch but resolved digitec.ch
+ # otherwise the unresolved domain will be shown.
+ # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
  if url_resolved and url_resolved != product.url:
  logger.debug(
  f"URL resolved for {product.url} is {url_resolved}"
  )
- product.domain = self._search._get_domain(url_resolved)
+ product.domain = self._searcher._get_domain(url_resolved)

  product.product_price = self._zyteapi.extract_product_price(
  details=details
@@ -348,52 +328,52 @@

  def _setup_async_framework(
  self,
- n_serp_wkrs: int,
- n_zyte_wkrs: int,
+ n_srch_wkrs: int,
+ n_cntx_wkrs: int,
  n_proc_wkrs: int,
  prompts: List[Prompt],
  ) -> None:
  """Sets up the necessary queues and workers for the async framework.

  Args:
- n_serp_wkrs: Number of async workers for serp.
- n_zyte_wkrs: Number of async workers for zyte.
- n_proc_wkrs: Number of async workers for processor.
+ n_srch_wkrs: Number of async workers for search.
+ n_cntx_wkrs: Number of async workers for context extraction.
+ n_proc_wkrs: Number of async workers for processing.
  prompts: The list of prompts used for the classification by func:`Processor.classify`.
  """

  # Setup the input/output queues for the workers
- serp_queue: asyncio.Queue[dict | None] = asyncio.Queue()
+ srch_queue: asyncio.Queue[dict | None] = asyncio.Queue()
  url_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
- zyte_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
+ cntx_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
  proc_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
  res_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()

- # Setup the Serp workers
- serp_wkrs = [
+ # Setup the Search workers
+ srch_wkrs = [
  asyncio.create_task(
- self._serp_execute(
- queue_in=serp_queue,
+ self._srch_execute(
+ queue_in=srch_queue,
  queue_out=url_queue,
  )
  )
- for _ in range(n_serp_wkrs)
+ for _ in range(n_srch_wkrs)
  ]

  # Setup the URL collector
  url_col = asyncio.create_task(
- self._collect_url(queue_in=url_queue, queue_out=zyte_queue)
+ self._collect_url(queue_in=url_queue, queue_out=cntx_queue)
  )

- # Setup the Zyte workers
- zyte_wkrs = [
+ # Setup the context extraction workers
+ cntx_wkrs = [
  asyncio.create_task(
- self._zyte_execute(
- queue_in=zyte_queue,
+ self._cntx_execute(
+ queue_in=cntx_queue,
  queue_out=proc_queue,
  )
  )
- for _ in range(n_zyte_wkrs)
+ for _ in range(n_cntx_wkrs)
  ]

  # Setup the processing workers
@@ -413,26 +393,26 @@

  # Add the setup to the instance variables
  self._queues = {
- "serp": serp_queue,
+ "srch": srch_queue,
  "url": url_queue,
- "zyte": zyte_queue,
+ "cntx": cntx_queue,
  "proc": proc_queue,
  "res": res_queue,
  }
  self._workers = {
- "serp": serp_wkrs,
+ "srch": srch_wkrs,
  "url": url_col,
- "zyte": zyte_wkrs,
+ "cntx": cntx_wkrs,
  "proc": proc_wkrs,
  "res": res_col,
  }

  @staticmethod
- async def _add_serp_items_for_search_term(
+ async def _add_search_items_for_search_term(
  queue: asyncio.Queue[dict | None],
  search_term: str,
  search_term_type: str,
- search_engines: List[SearchEngineName],
+ search_engine: SearchEngineName,
  language: Language,
  location: Location,
  num_results: int,
@@ -443,17 +423,17 @@
  item = {
  "search_term": search_term,
  "search_term_type": search_term_type,
- "search_engines": search_engines,
+ "search_engine": search_engine,
  "language": language,
  "location": location,
  "num_results": num_results,
  "marketplaces": marketplaces,
  "excluded_urls": excluded_urls,
  }
- logger.debug(f'Adding item="{item}" to serp_queue')
+ logger.debug(f'Adding item="{item}" to srch_queue')
  await queue.put(item)

- async def _add_serp_items(
+ async def _add_srch_items(
  self,
  queue: asyncio.Queue[dict | None],
  search_term: str,
@@ -464,7 +444,23 @@
  marketplaces: List[Host] | None,
  excluded_urls: List[Host] | None,
  ) -> None:
- """Adds all the (enriched) search_term (as serp items) to the queue."""
+ """Adds all the (enriched) search_term (as srch items) to the queue.
+
+ One item consists of the following parameters:
+ - search_term: The search term for the query.
+ - search_term_type: The type of the search term (initial or enriched).
+ - search_engines: The search engines to use for the query.
+ - language: The language to use for the query.
+ - location: The location to use for the query.
+ - num_results: The number of results to return.
+ - marketplaces: The marketplaces to include in the search.
+ - excluded_urls: The URLs to exclude from the search.
+
+ For constructing such items we essentially have two loops:
+ for each search_term (initial + enriched)
+ for each search_engine
+ add item to queue
+ """
  common_kwargs = {
  "queue": queue,
  "language": language,
@@ -473,14 +469,15 @@
  "excluded_urls": excluded_urls,
  }

- # Add initial items to the serp_queue
- await self._add_serp_items_for_search_term(
- search_term=search_term,
- search_term_type="initial",
- search_engines=search_engines,
- num_results=deepness.num_results,
- **common_kwargs,  # type: ignore[arg-type]
- )
+ # Add initial items to the queue
+ for se in search_engines:
+ await self._add_search_items_for_search_term(
+ search_term=search_term,
+ search_term_type="initial",
+ search_engine=se,
+ num_results=deepness.num_results,
+ **common_kwargs,  # type: ignore[arg-type]
+ )

  # Enrich the search_terms
  enrichment = deepness.enrichment
@@ -494,15 +491,16 @@
  n_terms=n_terms,
  )

- # Add the enriched search terms to the serp_queue
+ # Add the enriched search terms to the queue
  for trm in terms:
- await self._add_serp_items_for_search_term(
- search_term=trm,
- search_term_type="enriched",
- search_engines=search_engines,
- num_results=enrichment.additional_urls_per_term,
- **common_kwargs,  # type: ignore[arg-type]
- )
+ for se in search_engines:
+ await self._add_search_items_for_search_term(
+ search_term=trm,
+ search_term_type="enriched",
+ search_engine=se,
+ num_results=enrichment.additional_urls_per_term,
+ **common_kwargs,  # type: ignore[arg-type]
+ )

  async def run(
  self,
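
Taken together, the two rewritten loops fan the work out to one queue item per (search_term, search_engine) pair instead of one item carrying the whole engine list; a toy illustration of the resulting item count (terms and engine names are made up):

```python
from itertools import product

search_terms = ["term-initial", "term-enriched-1", "term-enriched-2"]  # illustrative
search_engines = ["engine-a", "engine-b"]                              # illustrative

items = [
    {"search_term": term, "search_engine": engine}
    for term, engine in product(search_terms, search_engines)
]
print(len(items))  # 6 -> one srch_queue item per combination
```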
@@ -516,7 +514,7 @@
  excluded_urls: List[Host] | None = None,
  previously_collected_urls: List[str] | None = None,
  ) -> None:
- """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.
+ """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.

  Args:
  search_term: The search term for the query.
@@ -541,22 +539,24 @@

  # Handle previously collected URLs
  if previously_collected_urls:
- self._url_collector.collected_previously = set(previously_collected_urls)
+ self._url_collector.add_previously_collected_urls(
+ urls=previously_collected_urls
+ )

  # Setup the async framework
  n_terms_max = 1 + (
  deepness.enrichment.additional_terms if deepness.enrichment else 0
  )
- n_serp_wkrs = min(self._n_serp_wkrs, n_terms_max)
- n_zyte_wkrs = min(self._n_zyte_wkrs, deepness.num_results)
+ n_srch_wkrs = min(self._n_srch_wkrs, n_terms_max)
+ n_cntx_wkrs = min(self._n_cntx_wkrs, deepness.num_results)
  n_proc_wkrs = min(self._n_proc_wkrs, deepness.num_results)

  logger.debug(
- f"setting up async framework (#workers: serp={n_serp_wkrs}, zyte={n_zyte_wkrs}, proc={n_proc_wkrs})"
+ f"setting up async framework (#workers: srch={n_srch_wkrs}, cntx={n_cntx_wkrs}, proc={n_proc_wkrs})"
  )
  self._setup_async_framework(
- n_serp_wkrs=n_serp_wkrs,
- n_zyte_wkrs=n_zyte_wkrs,
+ n_srch_wkrs=n_srch_wkrs,
+ n_cntx_wkrs=n_cntx_wkrs,
  n_proc_wkrs=n_proc_wkrs,
  prompts=prompts,
  )
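
The sizing above caps each worker pool by the amount of work it can actually receive; a worked sketch of that arithmetic with illustrative numbers:

```python
# Illustrative values only.
configured_srch_wkrs = 10          # self._n_srch_wkrs
configured_cntx_wkrs = 10          # self._n_cntx_wkrs
additional_terms = 3               # deepness.enrichment.additional_terms
num_results = 5                    # deepness.num_results

n_terms_max = 1 + additional_terms                       # initial + enriched terms
n_srch_wkrs = min(configured_srch_wkrs, n_terms_max)     # -> 4
n_cntx_wkrs = min(configured_cntx_wkrs, num_results)     # -> 5
print(n_srch_wkrs, n_cntx_wkrs)
```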
@@ -566,21 +566,21 @@
  raise ValueError(
  "Async framework is not setup. Please call _setup_async_framework() first."
  )
- if not all([k in self._queues for k in ["serp", "url", "zyte", "proc", "res"]]):
+ if not all([k in self._queues for k in ["srch", "url", "cntx", "proc", "res"]]):
  raise ValueError(
  "The queues of the async framework are not setup correctly."
  )
  if not all(
- [k in self._workers for k in ["serp", "url", "zyte", "proc", "res"]]
+ [k in self._workers for k in ["srch", "url", "cntx", "proc", "res"]]
  ):
  raise ValueError(
  "The workers of the async framework are not setup correctly."
  )

- # Add the search items to the serp_queue
- serp_queue = self._queues["serp"]
- await self._add_serp_items(
- queue=serp_queue,
+ # Add the search items to the srch_queue
+ srch_queue = self._queues["srch"]
+ await self._add_srch_items(
+ queue=srch_queue,
  search_term=search_term,
  search_engines=search_engines,
  language=language,
@@ -590,26 +590,26 @@
  excluded_urls=excluded_urls,
  )

- # ---------------------------
- # ORCHESTRATE SERP WORKERS
- # ---------------------------
- # Add the sentinels to the serp_queue
- for _ in range(n_serp_wkrs):
- await serp_queue.put(None)
+ # -----------------------------
+ # ORCHESTRATE SEARCH WORKERS
+ # -----------------------------
+ # Add the sentinels to the srch_queue
+ for _ in range(n_srch_wkrs):
+ await srch_queue.put(None)

- # Wait for the serp workers to be concluded before adding the sentinels to the url_queue
- serp_workers = self._workers["serp"]
+ # Wait for the srch workers to be concluded before adding the sentinels to the url_queue
+ srch_workers = self._workers["srch"]
  try:
- logger.debug("Waiting for serp_workers to conclude their tasks...")
- serp_res = await asyncio.gather(*serp_workers, return_exceptions=True)
- for i, res in enumerate(serp_res):
+ logger.debug("Waiting for srch_workers to conclude their tasks...")
+ srch_res = await asyncio.gather(*srch_workers, return_exceptions=True)
+ for i, res in enumerate(srch_res):
  if isinstance(res, Exception):
- logger.error(f"Error in serp_worker {i}: {res}")
- logger.debug("...serp_workers concluded their tasks")
+ logger.error(f"Error in srch_worker {i}: {res}")
+ logger.debug("...srch_workers concluded their tasks")
  except Exception as e:
- logger.error(f"Gathering serp_workers failed: {e}")
+ logger.error(f"Gathering srch_workers failed: {e}")
  finally:
- await serp_queue.join()
+ await srch_queue.join()

  # ---------------------------
  # ORCHESTRATE URL COLLECTOR
@@ -618,7 +618,7 @@
  url_queue = self._queues["url"]
  await url_queue.put(None)

- # Wait for the url_collector to be concluded before adding the sentinels to the zyte_queue
+ # Wait for the url_collector to be concluded before adding the sentinels to the cntx_queue
  url_collector = cast(asyncio.Task, self._workers["url"])
  try:
  logger.debug("Waiting for url_collector to conclude its tasks...")
@@ -629,27 +629,27 @@
  finally:
  await url_queue.join()

- # ---------------------------
- # ORCHESTRATE ZYTE WORKERS
- # ---------------------------
- # Add the sentinels to the zyte_queue
- zyte_queue = self._queues["zyte"]
- for _ in range(n_zyte_wkrs):
- await zyte_queue.put(None)
+ # -----------------------------
+ # ORCHESTRATE CONTEXT WORKERS
+ # -----------------------------
+ # Add the sentinels to the cntx_queue
+ cntx_queue = self._queues["cntx"]
+ for _ in range(n_cntx_wkrs):
+ await cntx_queue.put(None)

- # Wait for the zyte_workers to be concluded before adding the sentinels to the proc_queue
- zyte_workers = self._workers["zyte"]
+ # Wait for the cntx_workers to be concluded before adding the sentinels to the proc_queue
+ cntx_workers = self._workers["cntx"]
  try:
- logger.debug("Waiting for zyte_workers to conclude their tasks...")
- zyte_res = await asyncio.gather(*zyte_workers, return_exceptions=True)
- for i, res in enumerate(zyte_res):
+ logger.debug("Waiting for cntx_workers to conclude their tasks...")
+ cntx_res = await asyncio.gather(*cntx_workers, return_exceptions=True)
+ for i, res in enumerate(cntx_res):
  if isinstance(res, Exception):
- logger.error(f"Error in zyte_worker {i}: {res}")
- logger.debug("...zyte_workers concluded their tasks")
+ logger.error(f"Error in cntx_worker {i}: {res}")
+ logger.debug("...cntx_workers concluded their tasks")
  except Exception as e:
- logger.error(f"Gathering zyte_workers failed: {e}")
+ logger.error(f"Gathering cntx_workers failed: {e}")
  finally:
- await zyte_queue.join()
+ await cntx_queue.join()

  # ---------------------------
  # ORCHESTRATE PROC WORKERS
@@ -25,16 +25,22 @@ def _is_retryable_exception(err: BaseException) -> bool:
  return True


- def get_async_retry() -> AsyncRetrying:
+ def get_async_retry(
+ stop_after: int = RETRY_STOP_AFTER_ATTEMPT,
+ initial_delay: int = RETRY_INITIAL_DELAY,
+ max_delay: int = RETRY_MAX_DELAY,
+ exp_base: int = RETRY_EXP_BASE,
+ jitter: int = RETRY_JITTER,
+ ) -> AsyncRetrying:
  """returns the retry configuration for async operations."""
  return AsyncRetrying(
  retry=retry_if_exception(_is_retryable_exception),
- stop=stop_after_attempt(RETRY_STOP_AFTER_ATTEMPT),
+ stop=stop_after_attempt(stop_after),
  wait=wait_exponential_jitter(
- initial=RETRY_INITIAL_DELAY,
- max=RETRY_MAX_DELAY,
- exp_base=RETRY_EXP_BASE,
- jitter=RETRY_JITTER,
+ initial=initial_delay,
+ max=max_delay,
+ exp_base=exp_base,
+ jitter=jitter,
  ),
  reraise=True,
  )
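
Because every new parameter defaults to the existing settings constant, call sites that use `get_async_retry()` bare are unaffected; a usage sketch with a tighter policy, following tenacity's `AsyncRetrying` iteration pattern (the HTTP call inside the loop is illustrative, not part of the package):

```python
import httpx


async def fetch_with_retry(url: str) -> bytes:
    # Override the defaults: 3 attempts, exponential backoff capped at 10 s.
    async for attempt in get_async_retry(stop_after=3, initial_delay=1, max_delay=10):
        with attempt:
            async with httpx.AsyncClient() as client:
                resp = await client.get(url)
                resp.raise_for_status()
                return resp.content
    raise RuntimeError("unreachable: reraise=True re-raises the final error")
```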
@@ -97,4 +97,4 @@ def search(search_term: str):


  if __name__ == "__main__":
- search(search_term='Liebherr "TP1410"')
+ search(search_term="electric cigarettes")