fraudcrawler 0.5.0__py3-none-any.whl → 0.7.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,34 +1,32 @@
  from abc import ABC, abstractmethod
  import asyncio
  import logging
- from typing import cast, Dict, List, Self
+ from typing import cast, Dict, List

- from bs4 import BeautifulSoup
- import httpx
+ import re

  from fraudcrawler.settings import (
- PROCESSOR_DEFAULT_MODEL,
+ EXACT_MATCH_PRODUCT_FIELDS,
+ EXACT_MATCH_FIELD_SEPARATOR,
  )
  from fraudcrawler.settings import (
- DEFAULT_N_SERP_WKRS,
- DEFAULT_N_ZYTE_WKRS,
+ DEFAULT_N_SRCH_WKRS,
+ DEFAULT_N_CNTX_WKRS,
  DEFAULT_N_PROC_WKRS,
  )
  from fraudcrawler.base.base import (
- Deepness,
  Host,
  Language,
  Location,
- Prompt,
+ Deepness,
  ProductItem,
- HttpxAsyncClient,
  )
  from fraudcrawler import (
- Search,
+ Searcher,
  SearchEngineName,
  Enricher,
- URLCollector,
  ZyteAPI,
+ URLCollector,
  Processor,
  )

@@ -36,16 +34,17 @@ logger = logging.getLogger(__name__)


  class Orchestrator(ABC):
- """Abstract base class for orchestrating the different actors (crawling, processing).
+ """Abstract base class for orchestrating the different actors (scraping, processing).
+
+ Any subclass of :class:`Orchestrator` orchestrates the complete pipeline: search,
+ deduplication, context extraction, processing (classification), and result collection.

  Abstract methods:
  _collect_results: Collects the results from the given queue_in.
+ This function is responsible for collecting and handling the results from the given queue_in. It might
+ save the results to a file, a database, or any other storage.

- Each subclass of class:`Orchestrator` must implement the abstract method func:`_collect_results`.
- This function is responsible for collecting and handling the results from the given queue_in. It might
- save the results to a file, a database, or any other storage.
-
- For each pipeline step class:`Orchestrator` will deploy a number of async workers to handle the tasks.
+ For each pipeline step :class:`Orchestrator` will deploy a number of async workers to handle the tasks.
  In addition it makes sure to orchestrate the canceling of the workers only after the relevant workload is done.

  For more information on the orchestrating pattern see README.md.
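
Note: the docstring above describes the contract a concrete subclass has to fulfil. Below is a minimal sketch of such a subclass; it is not part of the package, and the exact signature of `_collect_results` is an assumption based on the worker pattern visible in this diff (read from `queue_in` until a `None` sentinel, call `task_done()` for every item). `Orchestrator` and `ProductItem` are assumed to be importable as in the module above.

import asyncio
from typing import List


class InMemoryOrchestrator(Orchestrator):
    """Hypothetical subclass that simply gathers all results into a list."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.results: List[ProductItem] = []

    async def _collect_results(self, queue_in: asyncio.Queue) -> None:
        # Drain the result queue until the sentinel arrives.
        while True:
            product = await queue_in.get()
            if product is None:  # sentinel: the pipeline is done
                queue_in.task_done()
                break
            self.results.append(product)
            queue_in.task_done()
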
@@ -53,94 +52,43 @@ class Orchestrator(ABC):

  def __init__(
  self,
- serpapi_key: str,
- dataforseo_user: str,
- dataforseo_pwd: str,
- zyteapi_key: str,
- openaiapi_key: str,
- openai_model: str = PROCESSOR_DEFAULT_MODEL,
- n_serp_wkrs: int = DEFAULT_N_SERP_WKRS,
- n_zyte_wkrs: int = DEFAULT_N_ZYTE_WKRS,
+ searcher: Searcher,
+ enricher: Enricher,
+ url_collector: URLCollector,
+ zyteapi: ZyteAPI,
+ processor: Processor,
+ n_srch_wkrs: int = DEFAULT_N_SRCH_WKRS,
+ n_cntx_wkrs: int = DEFAULT_N_CNTX_WKRS,
  n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
- # Configure a custom httpx client.
- # We provide a `HttpxAsyncClient` class that you can pass
- # to retain the default values we use for `limits`, `timeout` & `follow_redirects`.
- http_client: httpx.AsyncClient | None = None,
  ):
  """Initializes the orchestrator with the given settings.

- NOTE:
- The class:`Orchestrator` must be used as context manager as follows:
- async with Orchestrator(...) as orchestrator:
- await orchestrator.run()
-
  Args:
- serpapi_key: The API key for SERP API.
- dataforseo_user: The user for DataForSEO.
- dataforseo_pwd: The password for DataForSEO.
- zyteapi_key: The API key for Zyte API.
- openaiapi_key: The API key for OpenAI.
- openai_model: The model to use for the processing (optional).
- n_serp_wkrs: Number of async workers for serp (optional).
- n_zyte_wkrs: Number of async workers for zyte (optional).
+ searcher: Client for searching step.
+ enricher: Client for enrichment step.
+ url_collector: Client for deduplication.
+ zyteapi: Client for metadata extraction.
+ processor: Client for product classification.
+ n_srch_wkrs: Number of async workers for the search (optional).
+ n_cntx_wkrs: Number of async workers for context extraction (optional).
  n_proc_wkrs: Number of async workers for the processor (optional).
- http_client: An httpx.AsyncClient to use for the async requests (optional).
  """

- # Store the variables for setting up the clients
- self._serpapi_key = serpapi_key
- self._dataforseo_user = dataforseo_user
- self._dataforseo_pwd = dataforseo_pwd
- self._zyteapi_key = zyteapi_key
- self._openaiapi_key = openaiapi_key
- self._openai_model = openai_model
+ # Pipeline clients
+ self._searcher = searcher
+ self._enricher = enricher
+ self._url_collector = url_collector
+ self._zyteapi = zyteapi
+ self._processor = processor

  # Setup the async framework
- self._n_serp_wkrs = n_serp_wkrs
- self._n_zyte_wkrs = n_zyte_wkrs
+ self._n_srch_wkrs = n_srch_wkrs
+ self._n_cntx_wkrs = n_cntx_wkrs
  self._n_proc_wkrs = n_proc_wkrs
  self._queues: Dict[str, asyncio.Queue] | None = None
  self._workers: Dict[str, List[asyncio.Task] | asyncio.Task] | None = None

- # Setup the httpx client
- self._http_client = http_client
- self._owns_http_client = http_client is None
-
- async def __aenter__(self) -> Self:
- """Creates and starts an httpx.AsyncClient if not provided."""
- if self._http_client is None:
- logger.debug("Creating a new httpx.AsyncClient owned by the orchestrator")
- self._http_client = HttpxAsyncClient()
- self._owns_http_client = True
-
- # Setup the clients
- self._search = Search(
- http_client=self._http_client, serpapi_key=self._serpapi_key
- )
- self._enricher = Enricher(
- http_client=self._http_client,
- user=self._dataforseo_user,
- pwd=self._dataforseo_pwd,
- )
- self._url_collector = URLCollector()
- self._zyteapi = ZyteAPI(
- http_client=self._http_client, api_key=self._zyteapi_key
- )
- self._processor = Processor(
- http_client=self._http_client,
- api_key=self._openaiapi_key,
- model=self._openai_model,
- )
- return self
-
- async def __aexit__(self, *args, **kwargs) -> None:
- """Closes the httpx.AsyncClient if it was created by this orchestrator."""
- if self._owns_http_client and self._http_client is not None:
- logger.debug("Closing the httpx.AsyncClient owned by the orchestrator")
- await self._http_client.aclose()
- self._http_client = None
-
- async def _serp_execute(
+ async def _srch_execute(
  self,
  queue_in: asyncio.Queue[dict | None],
  queue_out: asyncio.Queue[ProductItem | None],
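
Note: with the 0.7.x constructor the orchestrator receives ready-made clients instead of API keys, and the `async with` usage from 0.5.0 is gone. A rough wiring sketch follows; how the individual clients are constructed in 0.7.x is not shown in this diff, so their setup is only indicated, and `MyOrchestrator` stands for any concrete subclass.

searcher = ...       # a configured Searcher instance (constructor not shown in this diff)
enricher = ...       # a configured Enricher instance
url_collector = ...  # a URLCollector instance
zyteapi = ...        # a configured ZyteAPI instance
processor = ...      # a configured Processor instance

orchestrator = MyOrchestrator(
    searcher=searcher,
    enricher=enricher,
    url_collector=url_collector,
    zyteapi=zyteapi,
    processor=processor,
)
# No `async with` block is needed any more; the __aenter__/__aexit__ hooks were removed.
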
@@ -158,17 +106,14 @@ class Orchestrator(ABC):
  break

  try:
+ # Execute the search
  search_term_type = item.pop("search_term_type")
- # The search_engines are already SearchEngineName enum values
- search_engines = item.pop("search_engines")
-
- results = await self._search.apply(
- **item, search_engines=search_engines
- )
-
+ results = await self._searcher.apply(**item)
  logger.debug(
  f"Search for {item['search_term']} returned {len(results)} results"
  )
+
+ # Create ProductItems for each result
  for res in results:
  product = ProductItem(
  search_term=item["search_term"],
@@ -181,8 +126,11 @@ class Orchestrator(ABC):
  filtered_at_stage=res.filtered_at_stage,
  )
  await queue_out.put(product)
- except Exception as e:
- logger.error(f"Error executing search: {e}")
+ except Exception:
+ logger.error(
+ f"Running search failed with item={item}",
+ exc_info=True,
+ )
  queue_in.task_done()

  async def _collect_url(
@@ -203,31 +151,12 @@ class Orchestrator(ABC):
  break

  if not product.filtered:
- # Clean the URL by removing tracking parameters
- url = self._url_collector.remove_tracking_parameters(product.url)
- product.url = url
-
- if url in self._url_collector.collected_currently:
- # deduplicate on current run
- product.filtered = True
- product.filtered_at_stage = (
- "URL collection (current run deduplication)"
- )
- logger.debug(f"URL {url} already collected in current run")
- elif url in self._url_collector.collected_previously:
- # deduplicate on previous runs coming from a db
- product.filtered = True
- product.filtered_at_stage = (
- "URL collection (previous run deduplication)"
- )
- logger.debug(f"URL {url} as already collected in previous run")
- else:
- self._url_collector.collected_currently.add(url)
+ product = await self._url_collector.apply(product=product)

  await queue_out.put(product)
  queue_in.task_done()

- async def _zyte_execute(
+ async def _cntx_execute(
  self,
  queue_in: asyncio.Queue[ProductItem | None],
  queue_out: asyncio.Queue[ProductItem | None],
@@ -246,45 +175,34 @@ class Orchestrator(ABC):

  if not product.filtered:
  try:
- # Fetch the product details from Zyte API
+ # Fetch and enrich the product context from Zyte API
  details = await self._zyteapi.details(url=product.url)
- url_resolved = self._zyteapi.extract_url_resolved(details=details)
- if url_resolved:
- product.url_resolved = url_resolved
- product.product_name = self._zyteapi.extract_product_name(
- details=details
+ product = self._zyteapi.enrich_context(
+ product=product, details=details
  )

- # If the resolved URL is different from the original URL, we also need to update the domain as
- # otherwise the unresolved domain will be shown, for example for unresolved domain toppreis.ch but resolved digitec.ch
- if url_resolved and url_resolved != product.url:
- logger.debug(
- f"URL resolved for {product.url} is {url_resolved}"
- )
- product.domain = self._search._get_domain(url_resolved)
-
- product.product_price = self._zyteapi.extract_product_price(
- details=details
- )
- product.product_description = (
- self._zyteapi.extract_product_description(details=details)
- )
- product.product_images = self._zyteapi.extract_image_urls(
- details=details
- )
- product.probability = self._zyteapi.extract_probability(
- details=details
- )
- product.html = self._zyteapi.extract_html(details=details)
- if product.html:
- soup = BeautifulSoup(product.html, "html.parser")
- product.html_clean = soup.get_text(separator=" ", strip=True)
  # Filter the product based on the probability threshold
  if not self._zyteapi.keep_product(details=details):
  product.filtered = True
- product.filtered_at_stage = "Zyte probability threshold"
- except Exception as e:
- logger.warning(f"Error executing Zyte API search: {e}.")
+ product.filtered_at_stage = (
+ "Context (Zyte probability threshold)"
+ )
+
+ # Check for exact match inside the full product context
+ product = self._check_exact_search(product=product)
+ if (
+ not product.filtered
+ and product.exact_search
+ and not product.exact_search_match
+ ):
+ product.filtered = True
+ product.filtered_at_stage = "Context (exact search)"
+
+ except Exception:
+ logger.error(
+ f"Running Zyte API search failed for product with url={product.url_resolved}",
+ exc_info=True,
+ )
  await queue_out.put(product)
  queue_in.task_done()

@@ -292,14 +210,12 @@ class Orchestrator(ABC):
  self,
  queue_in: asyncio.Queue[ProductItem | None],
  queue_out: asyncio.Queue[ProductItem | None],
- prompts: List[Prompt],
  ) -> None:
  """Collects the product details from the queue_in, processes them (filtering, relevance, etc.) and puts the results into queue_out.

  Args:
  queue_in: The input queue containing the product details.
  queue_out: The output queue to put the processed product details.
- prompts: The list of prompts to use for classification.
  """

  # Process the products
@@ -312,22 +228,12 @@ class Orchestrator(ABC):

  if not product.filtered:
  try:
- # Run all the configured prompts
- for prompt in prompts:
- classification = await self._processor.classify(
- product=product,
- prompt=prompt,
- )
- product.classifications[prompt.name] = int(
- classification.result
- )
- product.usage[prompt.name] = {
- "input_tokens": classification.input_tokens,
- "output_tokens": classification.output_tokens,
- }
- except Exception as e:
- logger.warning(
- f"Error processing product with url={product.url}: {e}."
+ # Run the configured workflows
+ product = await self._processor.run(product=product)
+ except Exception:
+ logger.error(
+ f"Processing product with url={product.url_resolved} failed",
+ exc_info=True,
  )

  await queue_out.put(product)
@@ -346,52 +252,50 @@ class Orchestrator(ABC):

  def _setup_async_framework(
  self,
- n_serp_wkrs: int,
- n_zyte_wkrs: int,
+ n_srch_wkrs: int,
+ n_cntx_wkrs: int,
  n_proc_wkrs: int,
- prompts: List[Prompt],
  ) -> None:
  """Sets up the necessary queues and workers for the async framework.

  Args:
- n_serp_wkrs: Number of async workers for serp.
- n_zyte_wkrs: Number of async workers for zyte.
- n_proc_wkrs: Number of async workers for processor.
- prompts: The list of prompts used for the classification by func:`Processor.classify`.
+ n_srch_wkrs: Number of async workers for search.
+ n_cntx_wkrs: Number of async workers for context extraction.
+ n_proc_wkrs: Number of async workers for processing.
  """

  # Setup the input/output queues for the workers
- serp_queue: asyncio.Queue[dict | None] = asyncio.Queue()
+ srch_queue: asyncio.Queue[dict | None] = asyncio.Queue()
  url_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
- zyte_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
+ cntx_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
  proc_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
  res_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()

- # Setup the Serp workers
- serp_wkrs = [
+ # Setup the Search workers
+ srch_wkrs = [
  asyncio.create_task(
- self._serp_execute(
- queue_in=serp_queue,
+ self._srch_execute(
+ queue_in=srch_queue,
  queue_out=url_queue,
  )
  )
- for _ in range(n_serp_wkrs)
+ for _ in range(n_srch_wkrs)
  ]

  # Setup the URL collector
  url_col = asyncio.create_task(
- self._collect_url(queue_in=url_queue, queue_out=zyte_queue)
+ self._collect_url(queue_in=url_queue, queue_out=cntx_queue)
  )

- # Setup the Zyte workers
- zyte_wkrs = [
+ # Setup the context extraction workers
+ cntx_wkrs = [
  asyncio.create_task(
- self._zyte_execute(
- queue_in=zyte_queue,
+ self._cntx_execute(
+ queue_in=cntx_queue,
  queue_out=proc_queue,
  )
  )
- for _ in range(n_zyte_wkrs)
+ for _ in range(n_cntx_wkrs)
  ]

  # Setup the processing workers
@@ -400,7 +304,6 @@ class Orchestrator(ABC):
  self._proc_execute(
  queue_in=proc_queue,
  queue_out=res_queue,
- prompts=prompts,
  )
  )
  for _ in range(n_proc_wkrs)
@@ -411,26 +314,26 @@ class Orchestrator(ABC):

  # Add the setup to the instance variables
  self._queues = {
- "serp": serp_queue,
+ "srch": srch_queue,
  "url": url_queue,
- "zyte": zyte_queue,
+ "cntx": cntx_queue,
  "proc": proc_queue,
  "res": res_queue,
  }
  self._workers = {
- "serp": serp_wkrs,
+ "srch": srch_wkrs,
  "url": url_col,
- "zyte": zyte_wkrs,
+ "cntx": cntx_wkrs,
  "proc": proc_wkrs,
  "res": res_col,
  }

  @staticmethod
- async def _add_serp_items_for_search_term(
+ async def _add_search_items_for_search_term(
  queue: asyncio.Queue[dict | None],
  search_term: str,
  search_term_type: str,
- search_engines: List[SearchEngineName],
+ search_engine: SearchEngineName,
  language: Language,
  location: Location,
  num_results: int,
@@ -441,17 +344,17 @@ class Orchestrator(ABC):
  item = {
  "search_term": search_term,
  "search_term_type": search_term_type,
- "search_engines": search_engines,
+ "search_engine": search_engine,
  "language": language,
  "location": location,
  "num_results": num_results,
  "marketplaces": marketplaces,
  "excluded_urls": excluded_urls,
  }
- logger.debug(f'Adding item="{item}" to serp_queue')
+ logger.debug(f'Adding item="{item}" to srch_queue')
  await queue.put(item)

- async def _add_serp_items(
+ async def _add_srch_items(
  self,
  queue: asyncio.Queue[dict | None],
  search_term: str,
@@ -462,7 +365,23 @@ class Orchestrator(ABC):
  marketplaces: List[Host] | None,
  excluded_urls: List[Host] | None,
  ) -> None:
- """Adds all the (enriched) search_term (as serp items) to the queue."""
+ """Adds all the (enriched) search_term (as srch items) to the queue.
+
+ One item consists of the following parameters:
+ - search_term: The search term for the query.
+ - search_term_type: The type of the search term (initial or enriched).
+ - search_engines: The search engines to use for the query.
+ - language: The language to use for the query.
+ - location: The location to use for the query.
+ - num_results: The number of results to return.
+ - marketplaces: The marketplaces to include in the search.
+ - excluded_urls: The URLs to exclude from the search.
+
+ For constructing such items we essentially have two loops:
+ for each search_term (initial + enriched)
+ for each search_engine
+ add item to queue
+ """
  common_kwargs = {
  "queue": queue,
  "language": language,
@@ -471,14 +390,15 @@ class Orchestrator(ABC):
  "excluded_urls": excluded_urls,
  }

- # Add initial items to the serp_queue
- await self._add_serp_items_for_search_term(
- search_term=search_term,
- search_term_type="initial",
- search_engines=search_engines,
- num_results=deepness.num_results,
- **common_kwargs, # type: ignore[arg-type]
- )
+ # Add initial items to the queue
+ for se in search_engines:
+ await self._add_search_items_for_search_term(
+ search_term=search_term,
+ search_term_type="initial",
+ search_engine=se,
+ num_results=deepness.num_results,
+ **common_kwargs, # type: ignore[arg-type]
+ )

  # Enrich the search_terms
  enrichment = deepness.enrichment
@@ -492,15 +412,84 @@ class Orchestrator(ABC):
  n_terms=n_terms,
  )

- # Add the enriched search terms to the serp_queue
+ # Add the enriched search terms to the queue
  for trm in terms:
- await self._add_serp_items_for_search_term(
- search_term=trm,
- search_term_type="enriched",
- search_engines=search_engines,
- num_results=enrichment.additional_urls_per_term,
- **common_kwargs, # type: ignore[arg-type]
+ for se in search_engines:
+ await self._add_search_items_for_search_term(
+ search_term=trm,
+ search_term_type="enriched",
+ search_engine=se,
+ num_results=enrichment.additional_urls_per_term,
+ **common_kwargs, # type: ignore[arg-type]
+ )
+
+ @staticmethod
+ def _is_exact_search(search_term: str) -> bool:
+ """Check if the search term is an exact search (contains double quotation marks).
+
+ Args:
+ search_term: The search term to check.
+ """
+ return '"' in search_term
+
+ @staticmethod
+ def _extract_exact_search_terms(search_term: str) -> list[str]:
+ """Extract all exact search terms from within double quotation marks (empty if no quotes found).
+
+ Args:
+ search_term: The search term that may contain double quotation marks.
+ """
+ # Find all double-quoted strings
+ double_quote_matches = re.findall(r'"([^"]*)"', search_term)
+ return double_quote_matches
+
+ @staticmethod
+ def _check_exact_search_terms_match(
+ product: ProductItem,
+ exact_search_terms: list[str],
+ ) -> bool:
+ """Check if the product, represented by a string of selected attributes, matches ALL of the exact search terms.
+
+ Args:
+ product: The product item.
+ exact_search_terms: List of exact search terms to match against.
+ """
+ field_values = [
+ str(val)
+ for fld in EXACT_MATCH_PRODUCT_FIELDS
+ if (val := getattr(product, fld, None)) is not None
+ ]
+ product_str_lower = EXACT_MATCH_FIELD_SEPARATOR.join(field_values).lower()
+
+ return all(
+ re.search(re.escape(est.lower()), product_str_lower)
+ for est in exact_search_terms
+ )
+
+ def _check_exact_search(self, product: ProductItem) -> ProductItem:
+ """Checks if the search term requests an exact search and if yes, checks for conformity."""
+ # Check for exact search and apply regex matching
+ exact_search = self._is_exact_search(product.search_term)
+ product.exact_search = exact_search
+
+ # Only set exact_search_match if this was an exact search (contains quotes)
+ if exact_search:
+ exact_search_terms = self._extract_exact_search_terms(product.search_term)
+ if exact_search_terms:
+ product.exact_search_match = self._check_exact_search_terms_match(
+ product=product, exact_search_terms=exact_search_terms
+ )
+ logger.debug(
+ f"Exact search terms {exact_search_terms} matched: {product.exact_search_match} "
+ f"for offer with url={product.url}"
+ )
+ else:
+ logger.warning(
+ f"is_exact_search=True but no exact search terms found in search_term='{product.search_term}' "
+ f"for offer with url={product.url}"
  )
+ # If exact_search is False, product.exact_search_match remains False (default value)
+ return product

  async def run(
  self,
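
Note: the exact-search helpers added above can be illustrated with a standalone re-statement of the same regex logic (the example data below is hypothetical): quoted phrases are extracted from the search term and ALL of them must occur, case-insensitively, in the concatenated product fields.

import re

search_term = 'nike "air max" "size 42"'
exact_terms = re.findall(r'"([^"]*)"', search_term)  # ['air max', 'size 42']

product_text = "Nike Air Max 90 | running shoe | size 42 | CHF 129".lower()
is_match = all(re.search(re.escape(t.lower()), product_text) for t in exact_terms)
print(exact_terms, is_match)  # ['air max', 'size 42'] True
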
@@ -509,12 +498,11 @@ class Orchestrator(ABC):
  language: Language,
  location: Location,
  deepness: Deepness,
- prompts: List[Prompt],
  marketplaces: List[Host] | None = None,
  excluded_urls: List[Host] | None = None,
  previously_collected_urls: List[str] | None = None,
  ) -> None:
- """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.
+ """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.

  Args:
  search_term: The search term for the query.
@@ -522,7 +510,6 @@ class Orchestrator(ABC):
  language: The language to use for the query.
  location: The location to use for the query.
  deepness: The search depth and enrichment details.
- prompts: The list of prompt to use for classification.
  marketplaces: The marketplaces to include in the search.
  excluded_urls: The URLs to exclude from the search.
  previously_collected_urls: The urls that have been collected previously and are ignored.
@@ -530,7 +517,7 @@ class Orchestrator(ABC):
  # ---------------------------
  # INITIAL SETUP
  # ---------------------------
- # Ensure we have at least one search engine
+ # Ensure we have at least one search engine (the list might be empty)
  if not search_engines:
  logger.warning(
  "No search engines specified, using all available search engines"
@@ -538,25 +525,24 @@ class Orchestrator(ABC):
  search_engines = list(SearchEngineName)

  # Handle previously collected URLs
- if previously_collected_urls:
- self._url_collector.collected_previously = set(previously_collected_urls)
+ if pcurls := previously_collected_urls:
+ self._url_collector.add_previously_collected_urls(urls=pcurls)

  # Setup the async framework
  n_terms_max = 1 + (
  deepness.enrichment.additional_terms if deepness.enrichment else 0
  )
- n_serp_wkrs = min(self._n_serp_wkrs, n_terms_max)
- n_zyte_wkrs = min(self._n_zyte_wkrs, deepness.num_results)
+ n_srch_wkrs = min(self._n_srch_wkrs, n_terms_max)
+ n_cntx_wkrs = min(self._n_cntx_wkrs, deepness.num_results)
  n_proc_wkrs = min(self._n_proc_wkrs, deepness.num_results)

  logger.debug(
- f"setting up async framework (#workers: serp={n_serp_wkrs}, zyte={n_zyte_wkrs}, proc={n_proc_wkrs})"
+ f"setting up async framework (#workers: srch={n_srch_wkrs}, cntx={n_cntx_wkrs}, proc={n_proc_wkrs})"
  )
  self._setup_async_framework(
- n_serp_wkrs=n_serp_wkrs,
- n_zyte_wkrs=n_zyte_wkrs,
+ n_srch_wkrs=n_srch_wkrs,
+ n_cntx_wkrs=n_cntx_wkrs,
  n_proc_wkrs=n_proc_wkrs,
- prompts=prompts,
  )

  # Check setup of async framework
@@ -564,21 +550,21 @@ class Orchestrator(ABC):
  raise ValueError(
  "Async framework is not setup. Please call _setup_async_framework() first."
  )
- if not all([k in self._queues for k in ["serp", "url", "zyte", "proc", "res"]]):
+ if not all([k in self._queues for k in ["srch", "url", "cntx", "proc", "res"]]):
  raise ValueError(
  "The queues of the async framework are not setup correctly."
  )
  if not all(
- [k in self._workers for k in ["serp", "url", "zyte", "proc", "res"]]
+ [k in self._workers for k in ["srch", "url", "cntx", "proc", "res"]]
  ):
  raise ValueError(
  "The workers of the async framework are not setup correctly."
  )

- # Add the search items to the serp_queue
- serp_queue = self._queues["serp"]
- await self._add_serp_items(
- queue=serp_queue,
+ # Add the search items to the srch_queue
+ srch_queue = self._queues["srch"]
+ await self._add_srch_items(
+ queue=srch_queue,
  search_term=search_term,
  search_engines=search_engines,
  language=language,
@@ -588,26 +574,29 @@ class Orchestrator(ABC):
  excluded_urls=excluded_urls,
  )

- # ---------------------------
- # ORCHESTRATE SERP WORKERS
- # ---------------------------
- # Add the sentinels to the serp_queue
- for _ in range(n_serp_wkrs):
- await serp_queue.put(None)
+ # -----------------------------
+ # ORCHESTRATE SEARCH WORKERS
+ # -----------------------------
+ # Add the sentinels to the srch_queue
+ for _ in range(n_srch_wkrs):
+ await srch_queue.put(None)

- # Wait for the serp workers to be concluded before adding the sentinels to the url_queue
- serp_workers = self._workers["serp"]
+ # Wait for the srch workers to be concluded before adding the sentinels to the url_queue
+ srch_workers = self._workers["srch"]
  try:
- logger.debug("Waiting for serp_workers to conclude their tasks...")
- serp_res = await asyncio.gather(*serp_workers, return_exceptions=True)
- for i, res in enumerate(serp_res):
+ logger.debug("Waiting for srch_workers to conclude their tasks...")
+ srch_res = await asyncio.gather(*srch_workers, return_exceptions=True)
+ for i, res in enumerate(srch_res):
  if isinstance(res, Exception):
- logger.error(f"Error in serp_worker {i}: {res}")
- logger.debug("...serp_workers concluded their tasks")
- except Exception as e:
- logger.error(f"Gathering serp_workers failed: {e}")
+ logger.error(f"Error in srch_worker {i}: {res}")
+ logger.debug("...srch_workers concluded their tasks")
+ except Exception:
+ logger.error(
+ "Gathering srch_workers failed",
+ exc_info=True,
+ )
  finally:
- await serp_queue.join()
+ await srch_queue.join()

  # ---------------------------
  # ORCHESTRATE URL COLLECTOR
@@ -616,38 +605,44 @@ class Orchestrator(ABC):
  url_queue = self._queues["url"]
  await url_queue.put(None)

- # Wait for the url_collector to be concluded before adding the sentinels to the zyte_queue
+ # Wait for the url_collector to be concluded before adding the sentinels to the cntx_queue
  url_collector = cast(asyncio.Task, self._workers["url"])
  try:
  logger.debug("Waiting for url_collector to conclude its tasks...")
  await url_collector
  logger.debug("...url_collector concluded its tasks")
- except Exception as e:
- logger.error(f"Gathering url_collector failed: {e}")
+ except Exception:
+ logger.error(
+ "Gathering url_collector failed",
+ exc_info=True,
+ )
  finally:
  await url_queue.join()

- # ---------------------------
- # ORCHESTRATE ZYTE WORKERS
- # ---------------------------
- # Add the sentinels to the zyte_queue
- zyte_queue = self._queues["zyte"]
- for _ in range(n_zyte_wkrs):
- await zyte_queue.put(None)
+ # -----------------------------
+ # ORCHESTRATE CONTEXT WORKERS
+ # -----------------------------
+ # Add the sentinels to the cntx_queue
+ cntx_queue = self._queues["cntx"]
+ for _ in range(n_cntx_wkrs):
+ await cntx_queue.put(None)

- # Wait for the zyte_workers to be concluded before adding the sentinels to the proc_queue
- zyte_workers = self._workers["zyte"]
+ # Wait for the cntx_workers to be concluded before adding the sentinels to the proc_queue
+ cntx_workers = self._workers["cntx"]
  try:
- logger.debug("Waiting for zyte_workers to conclude their tasks...")
- zyte_res = await asyncio.gather(*zyte_workers, return_exceptions=True)
- for i, res in enumerate(zyte_res):
+ logger.debug("Waiting for cntx_workers to conclude their tasks...")
+ cntx_res = await asyncio.gather(*cntx_workers, return_exceptions=True)
+ for i, res in enumerate(cntx_res):
  if isinstance(res, Exception):
- logger.error(f"Error in zyte_worker {i}: {res}")
- logger.debug("...zyte_workers concluded their tasks")
- except Exception as e:
- logger.error(f"Gathering zyte_workers failed: {e}")
+ logger.error(f"Error in cntx_worker {i}: {res}")
+ logger.debug("...cntx_workers concluded their tasks")
+ except Exception:
+ logger.error(
+ "Gathering cntx_workers failed",
+ exc_info=True,
+ )
  finally:
- await zyte_queue.join()
+ await cntx_queue.join()

  # ---------------------------
  # ORCHESTRATE PROC WORKERS
@@ -666,8 +661,11 @@ class Orchestrator(ABC):
  if isinstance(res, Exception):
  logger.error(f"Error in proc_worker {i}: {res}")
  logger.debug("...proc_workers concluded their tasks")
- except Exception as e:
- logger.error(f"Gathering proc_workers failed: {e}")
+ except Exception:
+ logger.error(
+ "Gathering proc_workers failed",
+ exc_info=True,
+ )
  finally:
  await proc_queue.join()

@@ -684,8 +682,11 @@ class Orchestrator(ABC):
  logger.debug("Waiting for res_collector to conclude its tasks...")
  await res_collector
  logger.debug("...res_collector concluded its tasks")
- except Exception as e:
- logger.error(f"Gathering res_collector failed: {e}")
+ except Exception:
+ logger.error(
+ "Gathering res_collector failed",
+ exc_info=True,
+ )
  finally:
  await res_queue.join()