fraudcrawler 0.5.0__py3-none-any.whl → 0.7.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fraudcrawler/__init__.py +21 -5
- fraudcrawler/base/base.py +18 -38
- fraudcrawler/base/client.py +57 -60
- fraudcrawler/base/orchestrator.py +277 -276
- fraudcrawler/base/retry.py +25 -11
- fraudcrawler/launch_demo_pipeline.py +103 -41
- fraudcrawler/processing/base.py +151 -0
- fraudcrawler/processing/openai.py +521 -0
- fraudcrawler/scraping/enrich.py +6 -4
- fraudcrawler/scraping/search.py +370 -110
- fraudcrawler/scraping/url.py +42 -3
- fraudcrawler/scraping/zyte.py +146 -80
- fraudcrawler/settings.py +22 -10
- fraudcrawler-0.7.26.dist-info/METADATA +173 -0
- fraudcrawler-0.7.26.dist-info/RECORD +23 -0
- fraudcrawler/processing/processor.py +0 -199
- fraudcrawler-0.5.0.dist-info/METADATA +0 -167
- fraudcrawler-0.5.0.dist-info/RECORD +0 -22
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/entry_points.txt +0 -0
fraudcrawler/__init__.py
CHANGED
@@ -1,8 +1,19 @@
-from fraudcrawler.scraping.search import
+from fraudcrawler.scraping.search import Searcher, SearchEngineName
 from fraudcrawler.scraping.enrich import Enricher
 from fraudcrawler.scraping.url import URLCollector
 from fraudcrawler.scraping.zyte import ZyteAPI
-from fraudcrawler.processing.
+from fraudcrawler.processing.base import (
+    UserInputs,
+    Workflow,
+    ClassificationResult,
+    TmpResult,
+    Processor,
+)
+from fraudcrawler.processing.openai import (
+    OpenAIWorkflow,
+    OpenAIClassification,
+    OpenAIClassificationUserInputs,
+)
 from fraudcrawler.base.orchestrator import Orchestrator
 from fraudcrawler.base.client import FraudCrawlerClient
 from fraudcrawler.base.base import (
@@ -11,17 +22,23 @@ from fraudcrawler.base.base import (
     Host,
     Language,
     Location,
-    Prompt,
     ProductItem,
     HttpxAsyncClient,
 )

 __all__ = [
-    "
+    "Searcher",
     "SearchEngineName",
     "Enricher",
     "URLCollector",
     "ZyteAPI",
+    "UserInputs",
+    "Workflow",
+    "ClassificationResult",
+    "TmpResult",
+    "OpenAIWorkflow",
+    "OpenAIClassification",
+    "OpenAIClassificationUserInputs",
     "Processor",
     "Orchestrator",
     "ProductItem",
@@ -31,6 +48,5 @@ __all__ = [
     "Host",
     "Deepness",
     "Enrichment",
-    "Prompt",
     "HttpxAsyncClient",
 ]
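The upshot of this change: the `Prompt` model disappears from the public API, and the new processing abstractions (`UserInputs`, `Workflow`, `ClassificationResult`, `TmpResult`, `Processor`, plus their OpenAI-backed implementations) are re-exported from the package root. A minimal sketch of the new import surface; every name below is imported or listed in `__all__` in the `__init__.py` diff above, but the snippet itself is illustrative rather than taken from the project docs:

```python
# Names importable from the fraudcrawler package root as of 0.7.26
# (all appear in the __init__.py diff above).
from fraudcrawler import (
    Searcher,
    SearchEngineName,
    Enricher,
    URLCollector,
    ZyteAPI,
    Processor,
    OpenAIWorkflow,
    OpenAIClassification,
    OpenAIClassificationUserInputs,
    FraudCrawlerClient,
)

# Code written against 0.5.0 that did `from fraudcrawler import Prompt`
# will now raise ImportError and needs porting to the Workflow/UserInputs models.
```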
fraudcrawler/base/base.py
CHANGED
@@ -11,6 +11,7 @@ from urllib.parse import urlparse
 import re
 from typing import Any, Dict, List

+
 import httpx

 from fraudcrawler.settings import (
@@ -44,6 +45,7 @@ class Setup(BaseSettings):
     dataforseo_pwd: str
     zyteapi_key: str
     openaiapi_key: str
+    pypy_token: str

     class Config:
         env_file = ".env"
@@ -69,14 +71,6 @@ class Host(BaseModel):
         return [cls._normalize_domain(dom.strip()) for dom in val]


-class ClassificationResult(BaseModel):
-    """Model for classification results."""
-
-    result: int
-    input_tokens: int
-    output_tokens: int
-
-
 class Location(BaseModel):
     """Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""

@@ -132,25 +126,30 @@ class Deepness(BaseModel):
 class ProductItem(BaseModel):
     """Model representing a product item."""

-    #
+    # Search parameters
     search_term: str
     search_term_type: str
     url: str
     url_resolved: str
     search_engine_name: str
     domain: str
+    exact_search: bool = False
+    exact_search_match: bool = False

-    #
+    # Context parameters
     product_name: str | None = None
     product_price: str | None = None
     product_description: str | None = None
     product_images: List[str] | None = None
+    product_gtin: str | None = None
     probability: float | None = None
     html: str | None = None
     html_clean: str | None = None

-    # Processor parameters
+    # Processor parameters (set dynamically)
     classifications: Dict[str, int] = Field(default_factory=dict)
+    tmp: Dict[str, Any] = Field(default_factory=dict)
+    insights: Dict[str, Any] | None = Field(default=None)

     # Usage parameters
     usage: Dict[str, Dict[str, int]] = Field(default_factory=dict)
@@ -160,33 +159,6 @@
     filtered_at_stage: str | None = None


-class Prompt(BaseModel):
-    """Model for prompts."""
-
-    name: str
-    system_prompt: str
-    product_item_fields: List[str]
-    allowed_classes: List[int]
-
-    @field_validator("allowed_classes", mode="before")
-    def check_for_positive_value(cls, val):
-        """Check if all values are positive."""
-        if not all(isinstance(i, int) and i >= 0 for i in val):
-            raise ValueError("all values in allowed_classes must be positive integers.")
-        return val
-
-    @field_validator("product_item_fields", mode="before")
-    def validate_product_item_fields(cls, val):
-        """Ensure all product_item_fields are valid ProductItem attributes."""
-        valid_fields = set(ProductItem.model_fields.keys())
-        for field in val:
-            if field not in valid_fields:
-                raise ValueError(
-                    f"Invalid product_item_field: '{field}'. Must be one of: {sorted(valid_fields)}"
-                )
-        return val
-
-
 class HttpxAsyncClient(httpx.AsyncClient):
     """Httpx async client that can be used to retain the default settings."""

@@ -216,6 +188,14 @@ class DomainUtils:
     """

     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
+    _headers = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Accept-Encoding": "gzip, deflate",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+    }

     def _get_domain(self, url: str) -> str:
         """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
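To make the `ProductItem` changes concrete, here is a hedged construction sketch. The field names, types, and defaults come directly from the diff above; the example values are invented, and standard pydantic keyword construction is assumed:

```python
from fraudcrawler import ProductItem

# Required "search parameter" fields plus the fields added in 0.7.26
# (exact_search, exact_search_match, product_gtin). Values are made up for the example.
item = ProductItem(
    search_term="sibutramine",
    search_term_type="keyword",
    url="https://example-shop.com/product/123",
    url_resolved="https://example-shop.com/product/123",
    search_engine_name="google",
    domain="example-shop.com",
    exact_search=True,                 # new in 0.7.26, defaults to False
    product_gtin="07612345678900",     # new optional context field
)

# Processor results now go into dynamically populated containers; the old Prompt model is gone.
item.classifications["suspiciousness"] = 1
item.tmp["raw_llm_output"] = {"label": 1}   # new tmp dict, default_factory=dict
print(item.exact_search_match)              # new flag, defaults to False
```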
fraudcrawler/base/client.py
CHANGED
@@ -4,22 +4,25 @@ from datetime import datetime
 import logging
 from pathlib import Path
 from pydantic import BaseModel
-from typing import List
+from typing import List

 import pandas as pd

 from fraudcrawler.settings import ROOT_DIR
 from fraudcrawler.base.base import (
-    Setup,
     Language,
     Location,
     Deepness,
     Host,
-    Prompt,
     ProductItem,
 )
 from fraudcrawler.base.orchestrator import Orchestrator
-from fraudcrawler.scraping.search import SearchEngineName
+from fraudcrawler.scraping.search import Searcher, SearchEngineName
+from fraudcrawler.scraping.enrich import Enricher
+from fraudcrawler.scraping.url import URLCollector
+from fraudcrawler.scraping.zyte import ZyteAPI
+from fraudcrawler.processing.base import Processor
+

 logger = logging.getLogger(__name__)

@@ -34,18 +37,38 @@ class Results(BaseModel):


 class FraudCrawlerClient(Orchestrator):
-    """The main client for FraudCrawler.
+    """The main client for FraudCrawler product search and analysis.
+
+    This client orchestrates the complete pipeline: search, deduplication, context extraction,
+    processing (classification), and result collection. It inherits from Orchestrator and adds
+    result management and persistence functionality.
+    """
+
+    _FILENAME_TEMPLATE = "{search_term}_{language}_{location}_{timestamp}.csv"

-
+    def __init__(
+        self,
+        searcher: Searcher,
+        enricher: Enricher,
+        url_collector: URLCollector,
+        zyteapi: ZyteAPI,
+        processor: Processor,
+    ):
+        """Initializes FraudCrawlerClient.

-
-
+        Args:
+            searcher: Client for searching step.
+            enricher: Client for enrichment step.
+            url_collector: Client for deduplication.
+            zyteapi: Client for metadata extraction.
+            processor: Client for product classification.
+        """
         super().__init__(
-
-
-
-
-
+            searcher=searcher,
+            enricher=enricher,
+            url_collector=url_collector,
+            zyteapi=zyteapi,
+            processor=processor,
         )

         self._results_dir = _RESULTS_DIR
@@ -53,13 +76,6 @@ class FraudCrawlerClient(Orchestrator):
         self._results_dir.mkdir(parents=True)
         self._results: List[Results] = []

-    async def __aenter__(self) -> Self:
-        await super().__aenter__()  # let base set itself up
-        return self  # so `async with FraudCrawlerClient()` gives you this instance
-
-    async def __aexit__(self, *args, **kwargs) -> None:
-        await super().__aexit__(*args, **kwargs)
-
     async def _collect_results(
         self, queue_in: asyncio.Queue[ProductItem | None]
     ) -> None:
@@ -80,45 +96,38 @@ class FraudCrawlerClient(Orchestrator):

         # Convert the list of products to a DataFrame
         df = pd.json_normalize(products)
-        cols = [c.split(".")[-1] for c in df.columns]
-        if len(cols) != len(set(cols)):
-            logger.error("Duplicate columns after json_normalize.")
-        else:
-            df.columns = cols

         # Save the DataFrame to a CSV file
         filename = self._results[-1].filename
         df.to_csv(filename, index=False, quoting=csv.QUOTE_ALL)
         logger.info(f"Results saved to {filename}")

-    def
+    async def run(
         self,
         search_term: str,
+        search_engines: List[SearchEngineName],
         language: Language,
         location: Location,
         deepness: Deepness,
-        prompts: List[Prompt],
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
-        search_engines: List[SearchEngineName | str] | None = None,
         previously_collected_urls: List[str] | None = None,
     ) -> None:
-        """Runs the pipeline steps:
+        """Runs the pipeline steps: search, deduplication, context extraction, processing, and collect the results.

         Args:
             search_term: The search term for the query.
+            search_engines: The list of search engines to use for the search query.
             language: The language to use for the query.
             location: The location to use for the query.
             deepness: The search depth and enrichment details.
-
-
-
-            search_engines: The list of search engines to use for the search (optional).
-            previously_collected_urls: The urls that have been collected previously and are ignored (optional).
+            marketplaces: The marketplaces to include in the search.
+            excluded_urls: The URLs to exclude from the search.
+            previously_collected_urls: The urls that have been collected previously and are ignored.
         """
         # Handle results files
         timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
-        filename = self._results_dir / self.
+        filename = self._results_dir / self._FILENAME_TEMPLATE.format(
             search_term=search_term,
             language=language.code,
             location=location.code,
@@ -126,31 +135,16 @@ class FraudCrawlerClient(Orchestrator):
         )
         self._results.append(Results(search_term=search_term, filename=filename))

-        # Normalize inputs - convert strings to SearchEngineName enum values
-        nrm_search_engines = list(SearchEngineName)
-        if search_engines:
-            nrm_search_engines = [
-                SearchEngineName(se) if isinstance(se, str) else se
-                for se in search_engines
-            ]
-
         # Run the pipeline by calling the orchestrator's run method
-
-
-
-
-
-
-
-
-
-            location=location,
-            deepness=deepness,
-            prompts=prompts,
-            marketplaces=marketplaces,
-            excluded_urls=excluded_urls,
-            previously_collected_urls=previously_collected_urls,
-        )
+        await super().run(
+            search_term=search_term,
+            search_engines=search_engines,
+            language=language,
+            location=location,
+            deepness=deepness,
+            marketplaces=marketplaces,
+            excluded_urls=excluded_urls,
+            previously_collected_urls=previously_collected_urls,
         )

     def load_results(self, index: int = -1) -> pd.DataFrame:
@@ -161,7 +155,10 @@ class FraudCrawlerClient(Orchestrator):
         """

         results = self._results[index]
-
+        if (filename := results.filename) is None:
+            raise ValueError("filename not found (is None)")
+
+        return pd.read_csv(filename)

     def print_available_results(self) -> None:
         """Prints the available results."""