fraudcrawler 0.6.2.tar.gz → 0.6.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/PKG-INFO +1 -1
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/__init__.py +4 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/base.py +1 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/client.py +15 -9
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/orchestrator.py +29 -57
- fraudcrawler-0.6.3/fraudcrawler/processing/config.py +12 -0
- fraudcrawler-0.6.3/fraudcrawler/scraping/config.py +32 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/pyproject.toml +1 -1
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/LICENSE +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/README.md +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/retry.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/launch_demo_pipeline.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/processing/processor.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/enrich.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/search.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/url.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/zyte.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/settings.py +0 -0
```diff
--- a/fraudcrawler/__init__.py
+++ b/fraudcrawler/__init__.py
@@ -2,7 +2,9 @@ from fraudcrawler.scraping.search import Searcher, SearchEngineName
 from fraudcrawler.scraping.enrich import Enricher
 from fraudcrawler.scraping.url import URLCollector
 from fraudcrawler.scraping.zyte import ZyteAPI
+from fraudcrawler.scraping.config import ScrapingConfig
 from fraudcrawler.processing.processor import Processor
+from fraudcrawler.processing.config import ProcessingConfig
 from fraudcrawler.base.orchestrator import Orchestrator
 from fraudcrawler.base.client import FraudCrawlerClient
 from fraudcrawler.base.base import (
@@ -22,7 +24,9 @@ __all__ = [
     "Enricher",
     "URLCollector",
     "ZyteAPI",
+    "ScrapingConfig",
     "Processor",
+    "ProcessingConfig",
     "Orchestrator",
     "ProductItem",
     "FraudCrawlerClient",
```
```diff
--- a/fraudcrawler/base/client.py
+++ b/fraudcrawler/base/client.py
@@ -19,7 +19,9 @@ from fraudcrawler.base.base import (
     ProductItem,
 )
 from fraudcrawler.base.orchestrator import Orchestrator
+from fraudcrawler.scraping.config import ScrapingConfig
 from fraudcrawler.scraping.search import SearchEngineName
+from fraudcrawler.processing.config import ProcessingConfig

 logger = logging.getLogger(__name__)

@@ -141,15 +143,19 @@ class FraudCrawlerClient(Orchestrator):

         asyncio.run(
             _run(
-
-
-
-
-
-
-
-
-
+                scraping_config=ScrapingConfig(
+                    search_term=search_term,
+                    search_engines=nrm_search_engines,
+                    language=language,
+                    location=location,
+                    deepness=deepness,
+                    marketplaces=marketplaces,
+                    excluded_urls=excluded_urls,
+                    previously_collected_urls=previously_collected_urls,
+                ),
+                processing_config=ProcessingConfig(
+                    prompts=prompts,
+                ),
             )
         )

```
```diff
--- a/fraudcrawler/base/orchestrator.py
+++ b/fraudcrawler/base/orchestrator.py
@@ -17,11 +17,9 @@ from fraudcrawler.settings import (
     DEFAULT_N_PROC_WKRS,
 )
 from fraudcrawler.base.base import (
-    Deepness,
     Host,
     Language,
     Location,
-    Prompt,
     ProductItem,
     HttpxAsyncClient,
 )
@@ -31,7 +29,9 @@ from fraudcrawler import (
     Enricher,
     ZyteAPI,
     URLCollector,
+    ScrapingConfig,
     Processor,
+    ProcessingConfig,
 )

 logger = logging.getLogger(__name__)
@@ -261,14 +261,14 @@ class Orchestrator(ABC):
         self,
         queue_in: asyncio.Queue[ProductItem | None],
         queue_out: asyncio.Queue[ProductItem | None],
-
+        processing_config: ProcessingConfig,
     ) -> None:
         """Collects the product details from the queue_in, processes them (filtering, relevance, etc.) and puts the results into queue_out.

         Args:
             queue_in: The input queue containing the product details.
             queue_out: The output queue to put the processed product details.
-
+            processing_config: Sets up the processing pipeline step.
         """

         # Process the products
@@ -282,7 +282,7 @@ class Orchestrator(ABC):
             if not product.filtered:
                 try:
                     # Run all the configured prompts
-                    for prompt in prompts:
+                    for prompt in processing_config.prompts:
                         classification = await self._processor.classify(
                             product=product,
                             prompt=prompt,
@@ -318,7 +318,7 @@ class Orchestrator(ABC):
         n_srch_wkrs: int,
         n_cntx_wkrs: int,
         n_proc_wkrs: int,
-
+        processing_config: ProcessingConfig,
     ) -> None:
         """Sets up the necessary queues and workers for the async framework.

@@ -326,7 +326,7 @@ class Orchestrator(ABC):
             n_srch_wkrs: Number of async workers for search.
             n_cntx_wkrs: Number of async workers for context extraction.
             n_proc_wkrs: Number of async workers for processing.
-
+            processing_config: Sets up the processing pipeline step.
         """

         # Setup the input/output queues for the workers
@@ -369,7 +369,7 @@ class Orchestrator(ABC):
                 self._proc_execute(
                     queue_in=proc_queue,
                     queue_out=res_queue,
-
+                    processing_config=processing_config,
                 )
             )
             for _ in range(n_proc_wkrs)
@@ -423,13 +423,7 @@ class Orchestrator(ABC):
     async def _add_srch_items(
         self,
         queue: asyncio.Queue[dict | None],
-
-        search_engines: List[SearchEngineName],
-        language: Language,
-        location: Location,
-        deepness: Deepness,
-        marketplaces: List[Host] | None,
-        excluded_urls: List[Host] | None,
+        scraping_config: ScrapingConfig,
     ) -> None:
         """Adds all the (enriched) search_term (as srch items) to the queue.

@@ -448,12 +442,17 @@ class Orchestrator(ABC):
             for each search_engine
                 add item to queue
         """
+        search_term = scraping_config.search_term
+        search_engines = scraping_config.search_engines
+        language = scraping_config.language
+        location = scraping_config.location
+        deepness = scraping_config.deepness
         common_kwargs = {
             "queue": queue,
             "language": language,
             "location": location,
-            "marketplaces": marketplaces,
-            "excluded_urls": excluded_urls,
+            "marketplaces": scraping_config.marketplaces,
+            "excluded_urls": scraping_config.excluded_urls,
         }

         # Add initial items to the queue
@@ -495,21 +494,15 @@ class Orchestrator(ABC):

         Args:
             search_term: The search term to check.
-
-        Returns:
-            True if the search term contains double quotation marks, False otherwise.
         """
         return '"' in search_term

     @staticmethod
     def _extract_exact_search_terms(search_term: str) -> list[str]:
-        """Extract all exact search terms from within double quotation marks.
+        """Extract all exact search terms from within double quotation marks (empty if no quotes found).

         Args:
             search_term: The search term that may contain double quotation marks.
-
-        Returns:
-            A list of extracted search terms without quotes, or empty list if no quotes found.
         """
         # Find all double-quoted strings
         double_quote_matches = re.findall(r'"([^"]*)"', search_term)
@@ -565,46 +558,31 @@ class Orchestrator(ABC):

     async def run(
         self,
-
-
-        language: Language,
-        location: Location,
-        deepness: Deepness,
-        prompts: List[Prompt],
-        marketplaces: List[Host] | None = None,
-        excluded_urls: List[Host] | None = None,
-        previously_collected_urls: List[str] | None = None,
+        scraping_config: ScrapingConfig,
+        processing_config: ProcessingConfig,
     ) -> None:
         """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.

         Args:
-
-
-            language: The language to use for the query.
-            location: The location to use for the query.
-            deepness: The search depth and enrichment details.
-            prompts: The list of prompt to use for classification.
-            marketplaces: The marketplaces to include in the search.
-            excluded_urls: The URLs to exclude from the search.
-            previously_collected_urls: The urls that have been collected previously and are ignored.
+            scraping_config: Sets up the scraping pipeline step.
+            processing_config: Sets up the processing pipeline step.
         """
         # ---------------------------
         # INITIAL SETUP
         # ---------------------------
-        # Ensure we have at least one search engine
-        if not search_engines:
+        # Ensure we have at least one search engine (the list might be empty)
+        if not scraping_config.search_engines:
             logger.warning(
                 "No search engines specified, using all available search engines"
             )
-            search_engines = list(SearchEngineName)
+            scraping_config.search_engines = list(SearchEngineName)

         # Handle previously collected URLs
-        if previously_collected_urls:
-            self._url_collector.add_previously_collected_urls(
-                urls=previously_collected_urls
-            )
+        if pcurls := scraping_config.previously_collected_urls:
+            self._url_collector.add_previously_collected_urls(urls=pcurls)

         # Setup the async framework
+        deepness = scraping_config.deepness
         n_terms_max = 1 + (
             deepness.enrichment.additional_terms if deepness.enrichment else 0
         )
@@ -619,7 +597,7 @@ class Orchestrator(ABC):
             n_srch_wkrs=n_srch_wkrs,
             n_cntx_wkrs=n_cntx_wkrs,
             n_proc_wkrs=n_proc_wkrs,
-
+            processing_config=processing_config,
         )

         # Check setup of async framework
@@ -642,13 +620,7 @@ class Orchestrator(ABC):
         srch_queue = self._queues["srch"]
         await self._add_srch_items(
             queue=srch_queue,
-
-            search_engines=search_engines,
-            language=language,
-            location=location,
-            deepness=deepness,
-            marketplaces=marketplaces,
-            excluded_urls=excluded_urls,
+            scraping_config=scraping_config,
         )

         # -----------------------------
```
```diff
--- /dev/null
+++ b/fraudcrawler/processing/config.py
@@ -0,0 +1,12 @@
+from pydantic import BaseModel, Field
+from typing import List
+
+from fraudcrawler.base.base import Prompt
+
+
+class ProcessingConfig(BaseModel):
+    """Sets up the processing pipeline step."""
+
+    prompts: List[Prompt] = Field(
+        description="The list of prompts to use for classification."
+    )
```
```diff
--- /dev/null
+++ b/fraudcrawler/scraping/config.py
@@ -0,0 +1,32 @@
+from pydantic import BaseModel, Field
+from typing import List
+
+from fraudcrawler.scraping.search import SearchEngineName
+from fraudcrawler.base.base import (
+    Language,
+    Location,
+    Deepness,
+    Host,
+)
+
+
+class ScrapingConfig(BaseModel):
+    """Sets up the scraping pipeline step."""
+
+    search_term: str = Field(description="The search term for the query.")
+    search_engines: List[SearchEngineName] = Field(
+        description="The list of search engines to use for the search query."
+    )
+    language: Language = Field(description="The language to use for the query.")
+    location: Location = Field(description="The location to use for the query.")
+    deepness: Deepness = Field(description="The search depth and enrichment details.")
+    marketplaces: List[Host] | None = Field(
+        default=None, description="The marketplaces to include in the search."
+    )
+    excluded_urls: List[Host] | None = Field(
+        default=None, description="The URLs to exclude from the search."
+    )
+    previously_collected_urls: List[str] | None = Field(
+        default=None,
+        description="The URLs that have been collected previously and are ignored.",
+    )
```
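Taken together, these hunks replace the flat keyword arguments of `Orchestrator.run()` and `_add_srch_items()` with two pydantic models, `ScrapingConfig` and `ProcessingConfig`, both re-exported from the package root. The sketch below is only illustrative of that new surface: the construction of `Language`, `Location`, `Deepness`, and `Prompt` is unchanged by this release and not shown in the diff (the `...` placeholders stand in for those objects), and the `FraudCrawlerClient()` constructor arguments are likewise assumed.

```python
import asyncio

from fraudcrawler import FraudCrawlerClient, ProcessingConfig, ScrapingConfig

# Assumed to exist already, e.g. built exactly as in a 0.6.2 call site;
# their construction is not part of this diff.
language = ...   # fraudcrawler Language
location = ...   # fraudcrawler Location
deepness = ...   # fraudcrawler Deepness
prompts = [...]  # list of fraudcrawler Prompt objects

scraping_config = ScrapingConfig(
    search_term='"vape pen"',  # double quotes mark exact search terms
    search_engines=[],         # per run(), an empty list falls back to all available engines
    language=language,
    location=location,
    deepness=deepness,
    marketplaces=None,
    excluded_urls=None,
    previously_collected_urls=None,
)
processing_config = ProcessingConfig(prompts=prompts)


async def main() -> None:
    client = FraudCrawlerClient()  # constructor arguments assumed, not shown in this diff
    # Orchestrator.run() (inherited by FraudCrawlerClient) now takes the two
    # config objects instead of the former flat keyword arguments.
    await client.run(
        scraping_config=scraping_config,
        processing_config=processing_config,
    )


asyncio.run(main())
```

Note that the synchronous wrapper in `FraudCrawlerClient` (the `asyncio.run(_run(...))` call shown in the client.py hunk) still accepts the old flat keyword arguments and assembles the two config objects internally, so existing callers of the client are unaffected; only the async `Orchestrator.run()` signature changes.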