fraudcrawler 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of fraudcrawler might be problematic.

fraudcrawler/__init__.py CHANGED
@@ -2,7 +2,9 @@ from fraudcrawler.scraping.search import Searcher, SearchEngineName
  from fraudcrawler.scraping.enrich import Enricher
  from fraudcrawler.scraping.url import URLCollector
  from fraudcrawler.scraping.zyte import ZyteAPI
+ from fraudcrawler.scraping.config import ScrapingConfig
  from fraudcrawler.processing.processor import Processor
+ from fraudcrawler.processing.config import ProcessingConfig
  from fraudcrawler.base.orchestrator import Orchestrator
  from fraudcrawler.base.client import FraudCrawlerClient
  from fraudcrawler.base.base import (
@@ -22,7 +24,9 @@ __all__ = [
  "Enricher",
  "URLCollector",
  "ZyteAPI",
+ "ScrapingConfig",
  "Processor",
+ "ProcessingConfig",
  "Orchestrator",
  "ProductItem",
  "FraudCrawlerClient",
fraudcrawler/base/base.py CHANGED
@@ -45,6 +45,7 @@ class Setup(BaseSettings):
  dataforseo_pwd: str
  zyteapi_key: str
  openaiapi_key: str
+ pypy_token: str

  class Config:
  env_file = ".env"
@@ -140,6 +141,8 @@ class ProductItem(BaseModel):
  url_resolved: str
  search_engine_name: str
  domain: str
+ exact_search: bool = False
+ exact_search_match: bool = False

  # Context parameters
  product_name: str | None = None
fraudcrawler/base/client.py CHANGED
@@ -19,7 +19,9 @@ from fraudcrawler.base.base import (
  ProductItem,
  )
  from fraudcrawler.base.orchestrator import Orchestrator
+ from fraudcrawler.scraping.config import ScrapingConfig
  from fraudcrawler.scraping.search import SearchEngineName
+ from fraudcrawler.processing.config import ProcessingConfig

  logger = logging.getLogger(__name__)

@@ -141,15 +143,19 @@ class FraudCrawlerClient(Orchestrator):

  asyncio.run(
  _run(
- search_term=search_term,
- search_engines=nrm_search_engines,
- language=language,
- location=location,
- deepness=deepness,
- prompts=prompts,
- marketplaces=marketplaces,
- excluded_urls=excluded_urls,
- previously_collected_urls=previously_collected_urls,
+ scraping_config=ScrapingConfig(
+ search_term=search_term,
+ search_engines=nrm_search_engines,
+ language=language,
+ location=location,
+ deepness=deepness,
+ marketplaces=marketplaces,
+ excluded_urls=excluded_urls,
+ previously_collected_urls=previously_collected_urls,
+ ),
+ processing_config=ProcessingConfig(
+ prompts=prompts,
+ ),
  )
  )

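Downstream code that calls the orchestrator directly has to migrate to the new signature; the client shown above keeps its flat keyword arguments and performs the wrapping itself. A rough migration sketch (the `orchestrator`, `language`, `location`, `deepness` and `prompts` objects are placeholders, not defined in this diff):

    from fraudcrawler import ScrapingConfig, ProcessingConfig, SearchEngineName

    # 0.6.1 (old): await orchestrator.run(search_term=..., search_engines=..., language=...,
    #                                      location=..., deepness=..., prompts=..., ...)

    # 0.6.3 (new): the same values travel inside the two config models
    await orchestrator.run(  # inside an async function
        scraping_config=ScrapingConfig(
            search_term="Kaffeebohnen",
            search_engines=list(SearchEngineName),
            language=language,
            location=location,
            deepness=deepness,
        ),
        processing_config=ProcessingConfig(prompts=prompts),
    )
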
fraudcrawler/base/orchestrator.py CHANGED
@@ -3,10 +3,12 @@ import asyncio
  import logging
  from typing import cast, Dict, List, Self

- from bs4 import BeautifulSoup
  import httpx
+ import re

  from fraudcrawler.settings import (
+ EXACT_MATCH_PRODUCT_FIELDS,
+ EXACT_MATCH_FIELD_SEPARATOR,
  PROCESSOR_DEFAULT_MODEL,
  )
  from fraudcrawler.settings import (
@@ -15,11 +17,9 @@ from fraudcrawler.settings import (
  DEFAULT_N_PROC_WKRS,
  )
  from fraudcrawler.base.base import (
- Deepness,
  Host,
  Language,
  Location,
- Prompt,
  ProductItem,
  HttpxAsyncClient,
  )
@@ -27,9 +27,11 @@ from fraudcrawler import (
  Searcher,
  SearchEngineName,
  Enricher,
- URLCollector,
  ZyteAPI,
+ URLCollector,
+ ScrapingConfig,
  Processor,
+ ProcessingConfig,
  )

  logger = logging.getLogger(__name__)
@@ -227,44 +229,29 @@ class Orchestrator(ABC):

  if not product.filtered:
  try:
- # Fetch the product context from Zyte API
+ # Fetch and enrich the product context from Zyte API
  details = await self._zyteapi.details(url=product.url)
- url_resolved = self._zyteapi.extract_url_resolved(details=details)
- if url_resolved:
- product.url_resolved = url_resolved
- product.product_name = self._zyteapi.extract_product_name(
- details=details
+ product = self._zyteapi.enrich_context(
+ product=product, details=details
  )

- # If the resolved URL is different from the original URL, we also need to update the domain as
- # otherwise the unresolved domain will be shown.
- # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
- if url_resolved and url_resolved != product.url:
- logger.debug(
- f"URL resolved for {product.url} is {url_resolved}"
- )
- product.domain = self._searcher._get_domain(url_resolved)
-
- product.product_price = self._zyteapi.extract_product_price(
- details=details
- )
- product.product_description = (
- self._zyteapi.extract_product_description(details=details)
- )
- product.product_images = self._zyteapi.extract_image_urls(
- details=details
- )
- product.probability = self._zyteapi.extract_probability(
- details=details
- )
- product.html = self._zyteapi.extract_html(details=details)
- if product.html:
- soup = BeautifulSoup(product.html, "html.parser")
- product.html_clean = soup.get_text(separator=" ", strip=True)
  # Filter the product based on the probability threshold
  if not self._zyteapi.keep_product(details=details):
  product.filtered = True
- product.filtered_at_stage = "Zyte probability threshold"
+ product.filtered_at_stage = (
+ "Context (Zyte probability threshold)"
+ )
+
+ # Check for exact match inside the full product context
+ product = self._check_exact_search(product=product)
+ if (
+ not product.filtered
+ and product.exact_search
+ and not product.exact_search_match
+ ):
+ product.filtered = True
+ product.filtered_at_stage = "Context (exact search)"
+
  except Exception as e:
  logger.warning(f"Error executing Zyte API search: {e}.")
  await queue_out.put(product)
@@ -274,14 +261,14 @@
  self,
  queue_in: asyncio.Queue[ProductItem | None],
  queue_out: asyncio.Queue[ProductItem | None],
- prompts: List[Prompt],
+ processing_config: ProcessingConfig,
  ) -> None:
  """Collects the product details from the queue_in, processes them (filtering, relevance, etc.) and puts the results into queue_out.

  Args:
  queue_in: The input queue containing the product details.
  queue_out: The output queue to put the processed product details.
- prompts: The list of prompts to use for classification.
+ processing_config: Sets up the processing pipeline step.
  """

  # Process the products
@@ -295,7 +282,7 @@
  if not product.filtered:
  try:
  # Run all the configured prompts
- for prompt in prompts:
+ for prompt in processing_config.prompts:
  classification = await self._processor.classify(
  product=product,
  prompt=prompt,
@@ -331,7 +318,7 @@
  n_srch_wkrs: int,
  n_cntx_wkrs: int,
  n_proc_wkrs: int,
- prompts: List[Prompt],
+ processing_config: ProcessingConfig,
  ) -> None:
  """Sets up the necessary queues and workers for the async framework.

@@ -339,7 +326,7 @@
  n_srch_wkrs: Number of async workers for search.
  n_cntx_wkrs: Number of async workers for context extraction.
  n_proc_wkrs: Number of async workers for processing.
- prompts: The list of prompts used for the classification by func:`Processor.classify`.
+ processing_config: Sets up the processing pipeline step.
  """

  # Setup the input/output queues for the workers
@@ -382,7 +369,7 @@
  self._proc_execute(
  queue_in=proc_queue,
  queue_out=res_queue,
- prompts=prompts,
+ processing_config=processing_config,
  )
  )
  for _ in range(n_proc_wkrs)
@@ -436,13 +423,7 @@
  async def _add_srch_items(
  self,
  queue: asyncio.Queue[dict | None],
- search_term: str,
- search_engines: List[SearchEngineName],
- language: Language,
- location: Location,
- deepness: Deepness,
- marketplaces: List[Host] | None,
- excluded_urls: List[Host] | None,
+ scraping_config: ScrapingConfig,
  ) -> None:
  """Adds all the (enriched) search_term (as srch items) to the queue.

@@ -461,12 +442,17 @@
  for each search_engine
  add item to queue
  """
+ search_term = scraping_config.search_term
+ search_engines = scraping_config.search_engines
+ language = scraping_config.language
+ location = scraping_config.location
+ deepness = scraping_config.deepness
  common_kwargs = {
  "queue": queue,
  "language": language,
  "location": location,
- "marketplaces": marketplaces,
- "excluded_urls": excluded_urls,
+ "marketplaces": scraping_config.marketplaces,
+ "excluded_urls": scraping_config.excluded_urls,
  }

  # Add initial items to the queue
@@ -502,48 +488,101 @@
  **common_kwargs, # type: ignore[arg-type]
  )

+ @staticmethod
+ def _is_exact_search(search_term: str) -> bool:
+ """Check if the search term is an exact search (contains double quotation marks).
+
+ Args:
+ search_term: The search term to check.
+ """
+ return '"' in search_term
+
+ @staticmethod
+ def _extract_exact_search_terms(search_term: str) -> list[str]:
+ """Extract all exact search terms from within double quotation marks (empty if no quotes found).
+
+ Args:
+ search_term: The search term that may contain double quotation marks.
+ """
+ # Find all double-quoted strings
+ double_quote_matches = re.findall(r'"([^"]*)"', search_term)
+ return double_quote_matches
+
+ @staticmethod
+ def _check_exact_search_terms_match(
+ product: ProductItem,
+ exact_search_terms: list[str],
+ ) -> bool:
+ """Check if the product, represented by a string of selected attributes, matches ALL of the exact search terms.
+
+ Args:
+ product: The product item.
+ exact_search_terms: List of exact search terms to match against.
+ """
+ field_values = [
+ str(val)
+ for fld in EXACT_MATCH_PRODUCT_FIELDS
+ if (val := getattr(product, fld, None)) is not None
+ ]
+ product_str_lower = EXACT_MATCH_FIELD_SEPARATOR.join(field_values).lower()
+
+ return all(
+ re.search(re.escape(est.lower()), product_str_lower)
+ for est in exact_search_terms
+ )
+
+ def _check_exact_search(self, product: ProductItem) -> ProductItem:
+ """Checks if the search term requests an exact search and if yes, checks for conformity."""
+ # Check for exact search and apply regex matching
+ exact_search = self._is_exact_search(product.search_term)
+ product.exact_search = exact_search
+
+ # Only set exact_search_match if this was an exact search (contains quotes)
+ if exact_search:
+ exact_search_terms = self._extract_exact_search_terms(product.search_term)
+ if exact_search_terms:
+ product.exact_search_match = self._check_exact_search_terms_match(
+ product=product, exact_search_terms=exact_search_terms
+ )
+ logger.debug(
+ f"Exact search terms {exact_search_terms} matched: {product.exact_search_match} "
+ f"for offer with url={product.url}"
+ )
+ else:
+ logger.warning(
+ f"is_exact_search=True but no exact search terms found in search_term='{product.search_term}' "
+ f"for offer with url={product.url}"
+ )
+ # If exact_search is False, product.exact_search_match remains False (default value)
+ return product
+
  async def run(
  self,
- search_term: str,
- search_engines: List[SearchEngineName],
- language: Language,
- location: Location,
- deepness: Deepness,
- prompts: List[Prompt],
- marketplaces: List[Host] | None = None,
- excluded_urls: List[Host] | None = None,
- previously_collected_urls: List[str] | None = None,
+ scraping_config: ScrapingConfig,
+ processing_config: ProcessingConfig,
  ) -> None:
  """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.

  Args:
- search_term: The search term for the query.
- search_engines: The list of search engines to use for the search query.
- language: The language to use for the query.
- location: The location to use for the query.
- deepness: The search depth and enrichment details.
- prompts: The list of prompt to use for classification.
- marketplaces: The marketplaces to include in the search.
- excluded_urls: The URLs to exclude from the search.
- previously_collected_urls: The urls that have been collected previously and are ignored.
+ scraping_config: Sets up the scraping pipeline step.
+ processing_config: Sets up the processing pipeline step.
  """
  # ---------------------------
  # INITIAL SETUP
  # ---------------------------
- # Ensure we have at least one search engine
- if not search_engines:
+ # Ensure we have at least one search engine (the list might be empty)
+ if not scraping_config.search_engines:
  logger.warning(
  "No search engines specified, using all available search engines"
  )
- search_engines = list(SearchEngineName)
+ scraping_config.search_engines = list(SearchEngineName)

  # Handle previously collected URLs
- if previously_collected_urls:
- self._url_collector.add_previously_collected_urls(
- urls=previously_collected_urls
- )
+ if pcurls := scraping_config.previously_collected_urls:
+ self._url_collector.add_previously_collected_urls(urls=pcurls)

  # Setup the async framework
+ deepness = scraping_config.deepness
  n_terms_max = 1 + (
  deepness.enrichment.additional_terms if deepness.enrichment else 0
  )
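The convention introduced by these helpers: any double-quoted fragment of the search term becomes a mandatory, case-insensitive substring of the product context assembled from EXACT_MATCH_PRODUCT_FIELDS. A standalone illustration of the same two regex steps (the sample strings are invented):

    import re

    search_term = 'Gorenje "R619FEW5"'
    exact_terms = re.findall(r'"([^"]*)"', search_term)
    # exact_terms == ['R619FEW5']

    product_text = "Gorenje R619FEW5 fridge freezer, white".lower()
    matched = all(re.search(re.escape(t.lower()), product_text) for t in exact_terms)
    # matched is True, so this offer would pass the "Context (exact search)" filter
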
@@ -558,7 +597,7 @@
  n_srch_wkrs=n_srch_wkrs,
  n_cntx_wkrs=n_cntx_wkrs,
  n_proc_wkrs=n_proc_wkrs,
- prompts=prompts,
+ processing_config=processing_config,
  )

  # Check setup of async framework
@@ -581,13 +620,7 @@
  srch_queue = self._queues["srch"]
  await self._add_srch_items(
  queue=srch_queue,
- search_term=search_term,
- search_engines=search_engines,
- language=language,
- location=location,
- deepness=deepness,
- marketplaces=marketplaces,
- excluded_urls=excluded_urls,
+ scraping_config=scraping_config,
  )

  # -----------------------------
fraudcrawler/launch_demo_pipeline.py CHANGED
@@ -97,4 +97,4 @@ def search(search_term: str):


  if __name__ == "__main__":
- search(search_term='Gorenje "R619FEW5"')
+ search(search_term="Kaffeebohnen")
fraudcrawler/processing/config.py ADDED
@@ -0,0 +1,12 @@
+ from pydantic import BaseModel, Field
+ from typing import List
+
+ from fraudcrawler.base.base import Prompt
+
+
+ class ProcessingConfig(BaseModel):
+ """Sets up the processing pipeline step."""
+
+ prompts: List[Prompt] = Field(
+ description="The list of prompts to use for classification."
+ )
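Because `prompts` has no default, building the model without it fails validation. A small sketch of that behaviour (the empty call is for illustration only):

    from pydantic import ValidationError
    from fraudcrawler import ProcessingConfig

    try:
        ProcessingConfig()  # no prompts supplied
    except ValidationError as exc:
        print(exc)  # reports the missing "prompts" field
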
fraudcrawler/scraping/config.py ADDED
@@ -0,0 +1,32 @@
+ from pydantic import BaseModel, Field
+ from typing import List
+
+ from fraudcrawler.scraping.search import SearchEngineName
+ from fraudcrawler.base.base import (
+ Language,
+ Location,
+ Deepness,
+ Host,
+ )
+
+
+ class ScrapingConfig(BaseModel):
+ """Sets up the scraping pipeline step."""
+
+ search_term: str = Field(description="The search term for the query.")
+ search_engines: List[SearchEngineName] = Field(
+ description="The list of search engines to use for the search query."
+ )
+ language: Language = Field(description="The language to use for the query.")
+ location: Location = Field(description="The location to use for the query.")
+ deepness: Deepness = Field(description="The search depth and enrichment details.")
+ marketplaces: List[Host] | None = Field(
+ default=None, description="The marketplaces to include in the search."
+ )
+ excluded_urls: List[Host] | None = Field(
+ default=None, description="The URLs to exclude from the search."
+ )
+ previously_collected_urls: List[str] | None = Field(
+ default=None,
+ description="The URLs that have been collected previously and are ignored.",
+ )
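The three URL-related fields are the only optional ones and default to None. A hedged construction sketch (the `language`, `location` and `deepness` values are placeholders for instances of the package's own Language, Location and Deepness types, whose constructors are not part of this diff):

    from fraudcrawler import ScrapingConfig, SearchEngineName

    cfg = ScrapingConfig(
        search_term='Gorenje "R619FEW5"',      # the quoted part triggers the exact-search check
        search_engines=list(SearchEngineName),
        language=language,
        location=location,
        deepness=deepness,
    )
    assert cfg.marketplaces is None
    assert cfg.excluded_urls is None
    assert cfg.previously_collected_urls is None
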
fraudcrawler/scraping/zyte.py CHANGED
@@ -2,11 +2,12 @@ from base64 import b64decode
  import logging
  from typing import List

+ from bs4 import BeautifulSoup
  import httpx
  from tenacity import RetryCallState

  from fraudcrawler.settings import ZYTE_DEFALUT_PROBABILITY_THRESHOLD
- from fraudcrawler.base.base import DomainUtils
+ from fraudcrawler.base.base import DomainUtils, ProductItem
  from fraudcrawler.base.retry import get_async_retry

  logger = logging.getLogger(__name__)
@@ -61,77 +62,8 @@ class ZyteAPI(DomainUtils):
  else:
  logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")

- async def details(self, url: str) -> dict:
- """Fetches product details for a single URL.
-
- Args:
- url: The URL to fetch product details from.
-
- Returns:
- A dictionary containing the product details, fields include:
- (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
- {
- "url": str,
- "statusCode": str,
- "product": {
- "name": str,
- "price": str,
- "mainImage": {"url": str},
- "images": [{"url": str}],
- "description": str,
- "metadata": {
- "probability": float,
- },
- },
- "httpResponseBody": base64
- }
- """
- logger.info(f"Fetching product details by Zyte for URL {url}.")
-
- # Perform the request and retry if necessary. There is some context aware logging:
- # - `before`: before the request is made (and before retrying)
- # - `before_sleep`: if the request fails before sleeping
- retry = get_async_retry()
- retry.before = lambda retry_state: self._log_before(
- url=url, retry_state=retry_state
- )
- retry.before_sleep = lambda retry_state: self._log_before_sleep(
- url=url, retry_state=retry_state
- )
- async for attempt in retry:
- with attempt:
- response = await self._http_client.post(
- url=self._endpoint,
- json={"url": url, **self._config},
- auth=(self._api_key, ""), # API key as username, empty password
- )
- response.raise_for_status()
-
- details = response.json()
- return details
-
- @staticmethod
- def keep_product(
- details: dict,
- threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
- ) -> bool:
- """Determines whether to keep the product based on the probability threshold.
-
- Args:
- details: A product details data dictionary.
- threshold: The probability threshold used to filter the products.
- """
- try:
- prob = float(details["product"]["metadata"]["probability"])
- except KeyError:
- logger.warning(
- f"Product with url={details.get('url')} has no probability value - product is ignored"
- )
- return False
- return prob > threshold
-
  @staticmethod
- def extract_product_name(details: dict) -> str | None:
+ def _extract_product_name(details: dict) -> str | None:
  """Extracts the product name from the product data.

  The input argument is a dictionary of the following structure:
@@ -144,7 +76,7 @@
  return details.get("product", {}).get("name")

  @staticmethod
- def extract_url_resolved(details: dict) -> str | None:
+ def _extract_url_resolved(details: dict) -> str | None:
  """Extracts the resolved URL from the product data - this is automatically resolved by Zyte.

  The input argument is a dictionary of the following structure:
@@ -157,7 +89,7 @@
  return details.get("product", {}).get("url")

  @staticmethod
- def extract_product_price(details: dict) -> str | None:
+ def _extract_product_price(details: dict) -> str | None:
  """Extracts the product price from the product data.

  The input argument is a dictionary of the following structure:
@@ -170,7 +102,7 @@
  return details.get("product", {}).get("price")

  @staticmethod
- def extract_product_description(details: dict) -> str | None:
+ def _extract_product_description(details: dict) -> str | None:
  """Extracts the product description from the product data.

  The input argument is a dictionary of the following structure:
@@ -183,7 +115,7 @@
  return details.get("product", {}).get("description")

  @staticmethod
- def extract_image_urls(details: dict) -> List[str]:
+ def _extract_image_urls(details: dict) -> List[str]:
  """Extracts the images from the product data.

  The input argument is a dictionary of the following structure:
@@ -206,7 +138,7 @@
  return images

  @staticmethod
- def extract_probability(details: dict) -> float:
+ def _extract_probability(details: dict) -> float:
  """Extracts the probability from the product data.

  The input argument is a dictionary of the following structure:
@@ -223,7 +155,7 @@
  )

  @staticmethod
- def extract_html(details: dict) -> str | None:
+ def _extract_html(details: dict) -> str | None:
  """Extracts the HTML from the Zyte API response.

  The input argument is a dictionary of the following structure:
@@ -243,6 +175,51 @@
  return decoded_string
  return None

+ def enrich_context(self, product: ProductItem, details: dict) -> ProductItem:
+ product.product_name = self._extract_product_name(details=details)
+
+ url_resolved = self._extract_url_resolved(details=details)
+ if url_resolved:
+ product.url_resolved = url_resolved
+
+ # If the resolved URL is different from the original URL, we also need to update the domain as
+ # otherwise the unresolved domain will be shown.
+ # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
+ if url_resolved and url_resolved != product.url:
+ logger.debug(f"URL resolved for {product.url} is {url_resolved}")
+ product.domain = self._get_domain(url=url_resolved)
+
+ product.product_price = self._extract_product_price(details=details)
+ product.product_description = self._extract_product_description(details=details)
+ product.product_images = self._extract_image_urls(details=details)
+ product.probability = self._extract_probability(details=details)
+ product.html = self._extract_html(details=details)
+ if product.html:
+ soup = BeautifulSoup(product.html, "html.parser")
+ product.html_clean = soup.get_text(separator=" ", strip=True)
+
+ return product
+
+ @staticmethod
+ def keep_product(
+ details: dict,
+ threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
+ ) -> bool:
+ """Determines whether to keep the product based on the probability threshold.
+
+ Args:
+ details: A product details data dictionary.
+ threshold: The probability threshold used to filter the products.
+ """
+ try:
+ prob = float(details["product"]["metadata"]["probability"])
+ except KeyError:
+ logger.warning(
+ f"Product with url={details.get('url')} has no probability value - product is ignored"
+ )
+ return False
+ return prob > threshold
+
  async def unblock_url_content(self, url: str) -> bytes:
  """Unblock the content of an URL using Zyte proxy mode.

@@ -256,3 +233,52 @@
  raise httpx.HTTPError("No httpResponseBody in Zyte response")

  return b64decode(details["httpResponseBody"])
+
+ async def details(self, url: str) -> dict:
+ """Fetches product details for a single URL.
+
+ Args:
+ url: The URL to fetch product details from.
+
+ Returns:
+ A dictionary containing the product details, fields include:
+ (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
+ {
+ "url": str,
+ "statusCode": str,
+ "product": {
+ "name": str,
+ "price": str,
+ "mainImage": {"url": str},
+ "images": [{"url": str}],
+ "description": str,
+ "metadata": {
+ "probability": float,
+ },
+ },
+ "httpResponseBody": base64
+ }
+ """
+ logger.info(f"Fetching product details by Zyte for URL {url}.")
+
+ # Perform the request and retry if necessary. There is some context aware logging:
+ # - `before`: before the request is made (and before retrying)
+ # - `before_sleep`: if the request fails before sleeping
+ retry = get_async_retry()
+ retry.before = lambda retry_state: self._log_before(
+ url=url, retry_state=retry_state
+ )
+ retry.before_sleep = lambda retry_state: self._log_before_sleep(
+ url=url, retry_state=retry_state
+ )
+ async for attempt in retry:
+ with attempt:
+ response = await self._http_client.post(
+ url=self._endpoint,
+ json={"url": url, **self._config},
+ auth=(self._api_key, ""), # API key as username, empty password
+ )
+ response.raise_for_status()
+
+ details = response.json()
+ return details
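keep_product keeps its behaviour and only moves below enrich_context: the extracted probability must exceed the threshold, which defaults to ZYTE_DEFALUT_PROBABILITY_THRESHOLD (0.1). A minimal sketch with an invented payload (keep_product is a staticmethod, so no API key is needed to call it):

    from fraudcrawler import ZyteAPI

    details = {
        "url": "https://example.com/item",
        "product": {"name": "Example item", "metadata": {"probability": 0.42}},
    }
    ZyteAPI.keep_product(details=details)                 # True  (0.42 > 0.1)
    ZyteAPI.keep_product(details=details, threshold=0.5)  # False (0.42 <= 0.5)
    ZyteAPI.keep_product(details={"url": "https://example.com/x"})  # False, logs a warning
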
fraudcrawler/settings.py CHANGED
@@ -78,6 +78,14 @@ ENRICHMENT_DEFAULT_LIMIT = 10
  # Zyte settings
  ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1

+ # Exact match settings
+ EXACT_MATCH_PRODUCT_FIELDS = {
+ "url_resolvedproduct_name",
+ "product_description",
+ "html",
+ }
+ EXACT_MATCH_FIELD_SEPARATOR = "\n"
+
  # Processor settings
  PROCESSOR_DEFAULT_MODEL = "gpt-4o"
  PROCESSOR_DEFAULT_IF_MISSING = -1
fraudcrawler-0.6.3.dist-info/METADATA CHANGED
@@ -1,9 +1,9 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.4
  Name: fraudcrawler
- Version: 0.6.1
+ Version: 0.6.3
  Summary: Intelligent Market Monitoring
- Home-page: https://github.com/open-veanu/fraudcrawler
  License: MIT
+ License-File: LICENSE
  Author: Domingo Bertus
  Author-email: hello@veanu.ch
  Requires-Python: >=3.11,<4.0
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
  Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
  Requires-Dist: httpx (>=0.28.1,<0.29.0)
  Requires-Dist: openai (>=1.68.2,<2.0.0)
fraudcrawler-0.6.3.dist-info/RECORD ADDED
@@ -0,0 +1,24 @@
+ fraudcrawler/__init__.py,sha256=YEbaofjs8pKkwqz4T-kGk7vHIQ_3XtDlF6D63wfuXjE,1008
+ fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/base/base.py,sha256=l3rsXKYKmN_I0GkAXDjpjh_s07cE4siFAwMq0byroQM,7815
+ fraudcrawler/base/client.py,sha256=6xAhQ7hdWYa7CQ84Ps1XUompCTXiSk0e3PywmTMUGng,6146
+ fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
+ fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
+ fraudcrawler/base/orchestrator.py,sha256=UxjJMLm2kpxG76m2TXShuLkqhUxSuYvZyPP2yy708JA,28082
+ fraudcrawler/base/retry.py,sha256=1Ox7RsnnF62dP53rkidRHetA5mr2HS1R-7FskCVbwug,1178
+ fraudcrawler/launch_demo_pipeline.py,sha256=_aDqaPdxE_DMwQY5_vpqF2YjwLkWIZq5Z9Tz3sqLKdg,4629
+ fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/processing/config.py,sha256=xqqTXK7zFA-7zwk76eZwrF97NtMzMOipUY6imeBIjQ8,301
+ fraudcrawler/processing/processor.py,sha256=zetp_G5g4z8sBUq-5qOxVRF2W2h9FIwolVxvMqhTmXs,7619
+ fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/scraping/config.py,sha256=8gyfB0VLi_FZr4J7a-HCTSYt8bRgSKXHo-Y9tlsD2MQ,1179
+ fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
+ fraudcrawler/scraping/search.py,sha256=Anm8ymjCH3BVttogHY-_03YRc64yJswJ8OP8DW56O48,34546
+ fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
+ fraudcrawler/scraping/zyte.py,sha256=sYpfwMuGE9MYpKvma_8x5Th2VBFn25Mqb4Wd7UChL_g,10215
+ fraudcrawler/settings.py,sha256=9ukAkxEzDtvy3xA-jSF3asr9uLIAATNQ-FqrsgCEDUk,4038
+ fraudcrawler-0.6.3.dist-info/METADATA,sha256=pP2S9-MFvCkNwWO7YB9Q9oYmZhsHlIxyljFXvTIfgus,6723
+ fraudcrawler-0.6.3.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ fraudcrawler-0.6.3.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+ fraudcrawler-0.6.3.dist-info/licenses/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+ fraudcrawler-0.6.3.dist-info/RECORD,,
fraudcrawler-0.6.3.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.0.0
+ Generator: poetry-core 2.2.1
  Root-Is-Purelib: true
  Tag: py3-none-any
fraudcrawler-0.6.1.dist-info/RECORD REMOVED
@@ -1,22 +0,0 @@
- fraudcrawler/__init__.py,sha256=oSwuiyVBBk_HZfeZxXJR0ELtA4mc-upsBMVHSwuokEo,846
- fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/base/base.py,sha256=74qwevU8sZBvXAladam0rmjcdn3AiT39MScpxZtD95I,7727
- fraudcrawler/base/client.py,sha256=obxrd65pYja--XQbgpIMsMO6erMNdRG68SzNUs_YvLM,5856
- fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
- fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
- fraudcrawler/base/orchestrator.py,sha256=n0xrMJ9a3g3cRAMmhKEgyrwwrbgsaMno9DeyE93jB5U,27006
- fraudcrawler/base/retry.py,sha256=1Ox7RsnnF62dP53rkidRHetA5mr2HS1R-7FskCVbwug,1178
- fraudcrawler/launch_demo_pipeline.py,sha256=TqlQrs8raT9jIJ3TJK3BOQMLm2qNn2dKaMGL-MyhC70,4635
- fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/processing/processor.py,sha256=zetp_G5g4z8sBUq-5qOxVRF2W2h9FIwolVxvMqhTmXs,7619
- fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
- fraudcrawler/scraping/search.py,sha256=Anm8ymjCH3BVttogHY-_03YRc64yJswJ8OP8DW56O48,34546
- fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
- fraudcrawler/scraping/zyte.py,sha256=SxucVH_wtVhPNImIXvijM528IwL6zl6I3ndf0OdVXY0,8860
- fraudcrawler/settings.py,sha256=Bp9_9w_RRr_-PtZXcy30EKbT9YiOc8OLjEMaNZh06vc,3875
- fraudcrawler-0.6.1.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
- fraudcrawler-0.6.1.dist-info/METADATA,sha256=_LcfOKayMQjAXoCxlJfqYtiSfitegUuQgFUD5XEGFog,6704
- fraudcrawler-0.6.1.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
- fraudcrawler-0.6.1.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
- fraudcrawler-0.6.1.dist-info/RECORD,,