fraudcrawler-0.6.0-py3-none-any.whl → fraudcrawler-0.6.2-py3-none-any.whl

This diff represents the content of publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.

Potentially problematic release: this version of fraudcrawler might be problematic.

fraudcrawler/base/base.py CHANGED
@@ -140,6 +140,8 @@ class ProductItem(BaseModel):
  url_resolved: str
  search_engine_name: str
  domain: str
+ exact_search: bool = False
+ exact_search_match: bool = False
 
  # Context parameters
  product_name: str | None = None
@@ -217,6 +219,14 @@ class DomainUtils:
  """
 
  _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
+ _headers = {
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+ "Accept-Language": "en-US,en;q=0.5",
+ "Accept-Encoding": "gzip, deflate",
+ "Connection": "keep-alive",
+ "Upgrade-Insecure-Requests": "1",
+ }
 
  def _get_domain(self, url: str) -> str:
  """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
fraudcrawler/base/orchestrator.py CHANGED
@@ -3,10 +3,12 @@ import asyncio
  import logging
  from typing import cast, Dict, List, Self
 
- from bs4 import BeautifulSoup
  import httpx
+ import re
 
  from fraudcrawler.settings import (
+ EXACT_MATCH_PRODUCT_FIELDS,
+ EXACT_MATCH_FIELD_SEPARATOR,
  PROCESSOR_DEFAULT_MODEL,
  )
  from fraudcrawler.settings import (
@@ -27,8 +29,8 @@ from fraudcrawler import (
  Searcher,
  SearchEngineName,
  Enricher,
- URLCollector,
  ZyteAPI,
+ URLCollector,
  Processor,
  )
 
@@ -227,44 +229,29 @@ class Orchestrator(ABC):
 
  if not product.filtered:
  try:
- # Fetch the product context from Zyte API
+ # Fetch and enrich the product context from Zyte API
  details = await self._zyteapi.details(url=product.url)
- url_resolved = self._zyteapi.extract_url_resolved(details=details)
- if url_resolved:
- product.url_resolved = url_resolved
- product.product_name = self._zyteapi.extract_product_name(
- details=details
+ product = self._zyteapi.enrich_context(
+ product=product, details=details
  )
 
- # If the resolved URL is different from the original URL, we also need to update the domain as
- # otherwise the unresolved domain will be shown.
- # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
- if url_resolved and url_resolved != product.url:
- logger.debug(
- f"URL resolved for {product.url} is {url_resolved}"
- )
- product.domain = self._searcher._get_domain(url_resolved)
-
- product.product_price = self._zyteapi.extract_product_price(
- details=details
- )
- product.product_description = (
- self._zyteapi.extract_product_description(details=details)
- )
- product.product_images = self._zyteapi.extract_image_urls(
- details=details
- )
- product.probability = self._zyteapi.extract_probability(
- details=details
- )
- product.html = self._zyteapi.extract_html(details=details)
- if product.html:
- soup = BeautifulSoup(product.html, "html.parser")
- product.html_clean = soup.get_text(separator=" ", strip=True)
  # Filter the product based on the probability threshold
  if not self._zyteapi.keep_product(details=details):
  product.filtered = True
- product.filtered_at_stage = "Zyte probability threshold"
+ product.filtered_at_stage = (
+ "Context (Zyte probability threshold)"
+ )
+
+ # Check for exact match inside the full product context
+ product = self._check_exact_search(product=product)
+ if (
+ not product.filtered
+ and product.exact_search
+ and not product.exact_search_match
+ ):
+ product.filtered = True
+ product.filtered_at_stage = "Context (exact search)"
+
  except Exception as e:
  logger.warning(f"Error executing Zyte API search: {e}.")
  await queue_out.put(product)
@@ -502,6 +489,80 @@ class Orchestrator(ABC):
  **common_kwargs, # type: ignore[arg-type]
  )
 
+ @staticmethod
+ def _is_exact_search(search_term: str) -> bool:
+ """Check if the search term is an exact search (contains double quotation marks).
+
+ Args:
+ search_term: The search term to check.
+
+ Returns:
+ True if the search term contains double quotation marks, False otherwise.
+ """
+ return '"' in search_term
+
+ @staticmethod
+ def _extract_exact_search_terms(search_term: str) -> list[str]:
+ """Extract all exact search terms from within double quotation marks.
+
+ Args:
+ search_term: The search term that may contain double quotation marks.
+
+ Returns:
+ A list of extracted search terms without quotes, or empty list if no quotes found.
+ """
+ # Find all double-quoted strings
+ double_quote_matches = re.findall(r'"([^"]*)"', search_term)
+ return double_quote_matches
+
+ @staticmethod
+ def _check_exact_search_terms_match(
+ product: ProductItem,
+ exact_search_terms: list[str],
+ ) -> bool:
+ """Check if the product, represented by a string of selected attributes, matches ALL of the exact search terms.
+
+ Args:
+ product: The product item.
+ exact_search_terms: List of exact search terms to match against.
+ """
+ field_values = [
+ str(val)
+ for fld in EXACT_MATCH_PRODUCT_FIELDS
+ if (val := getattr(product, fld, None)) is not None
+ ]
+ product_str_lower = EXACT_MATCH_FIELD_SEPARATOR.join(field_values).lower()
+
+ return all(
+ re.search(re.escape(est.lower()), product_str_lower)
+ for est in exact_search_terms
+ )
+
+ def _check_exact_search(self, product: ProductItem) -> ProductItem:
+ """Checks if the search term requests an exact search and if yes, checks for conformity."""
+ # Check for exact search and apply regex matching
+ exact_search = self._is_exact_search(product.search_term)
+ product.exact_search = exact_search
+
+ # Only set exact_search_match if this was an exact search (contains quotes)
+ if exact_search:
+ exact_search_terms = self._extract_exact_search_terms(product.search_term)
+ if exact_search_terms:
+ product.exact_search_match = self._check_exact_search_terms_match(
+ product=product, exact_search_terms=exact_search_terms
+ )
+ logger.debug(
+ f"Exact search terms {exact_search_terms} matched: {product.exact_search_match} "
+ f"for offer with url={product.url}"
+ )
+ else:
+ logger.warning(
+ f"is_exact_search=True but no exact search terms found in search_term='{product.search_term}' "
+ f"for offer with url={product.url}"
+ )
+ # If exact_search is False, product.exact_search_match remains False (default value)
+ return product
+
  async def run(
  self,
  search_term: str,
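To make the new exact-search filter concrete, here is a small self-contained sketch of the matching logic added above: quoted phrases are pulled out of the search term, the selected product fields are joined with the configured separator, and every quoted phrase must appear (case-insensitively) in that joined string. The dict-based product stand-in and the field set below are simplified assumptions, not the library's ProductItem:

    import re

    # Simplified stand-ins for EXACT_MATCH_PRODUCT_FIELDS / EXACT_MATCH_FIELD_SEPARATOR.
    FIELDS = {"product_name", "product_description", "html"}
    SEPARATOR = "\n"

    product = {
        "product_name": "Arabica Kaffeebohnen 1kg",
        "product_description": "Ganze Bohnen, mittlere Roestung",
        "html": None,
    }
    search_term = 'beste "Kaffeebohnen" "1kg" kaufen'

    exact_terms = re.findall(r'"([^"]*)"', search_term)  # ['Kaffeebohnen', '1kg']
    joined = SEPARATOR.join(
        str(v) for f in FIELDS if (v := product.get(f)) is not None
    ).lower()
    keep = all(re.search(re.escape(t.lower()), joined) for t in exact_terms)
    print(bool(exact_terms), keep)  # True True -> the offer passes the exact-search stage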
fraudcrawler/launch_demo_pipeline.py CHANGED
@@ -97,4 +97,4 @@ def search(search_term: str):
 
 
  if __name__ == "__main__":
- search(search_term="electric cigarettes")
+ search(search_term="Kaffeebohnen")
fraudcrawler/scraping/search.py CHANGED
@@ -8,7 +8,7 @@ from urllib.parse import quote_plus
  from bs4 import BeautifulSoup
  from bs4.element import Tag
  import httpx
- from tenacity import RetryCallState, AsyncRetrying
+ from tenacity import RetryCallState
 
  from fraudcrawler.settings import (
  SEARCH_DEFAULT_COUNTRY_CODES,
@@ -45,6 +45,14 @@ class SearchEngine(ABC, DomainUtils):
 
  _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
 
+ def __init__(self, http_client: httpx.AsyncClient):
+ """Initializes the SearchEngine with the given HTTP client.
+
+ Args:
+ http_client: An httpx.AsyncClient to use for the async requests.
+ """
+ self._http_client = http_client
+
  @property
  @abstractmethod
  def _search_engine_name(self) -> str:
@@ -56,45 +64,81 @@ class SearchEngine(ABC, DomainUtils):
  """Apply the search with the given parameters and return results."""
  pass
 
+ def _create_search_result(self, url: str) -> SearchResult:
+ """From a given url it creates the class:`SearchResult` instance."""
+ # Get marketplace name
+ domain = self._get_domain(url=url)
+
+ # Create and return the SearchResult object
+ result = SearchResult(
+ url=url,
+ domain=domain,
+ search_engine_name=self._search_engine_name,
+ )
+ return result
+
  @classmethod
  def _log_before(
- cls, search_string: str, retry_state: RetryCallState | None
+ cls, url: str, params: dict | None, retry_state: RetryCallState | None
  ) -> None:
- """Context aware logging before the request is made."""
+ """Context aware logging before HTTP request is made."""
  if retry_state:
  logger.debug(
- f'Performing search in {cls.__name__} with q="{search_string}" '
- f"(attempt {retry_state.attempt_number})."
+ f'Performing HTTP request in {cls.__name__} to url="{url}" '
+ f"with params={params} (attempt {retry_state.attempt_number})."
  )
  else:
  logger.debug(f"retry_state is {retry_state}; not logging before.")
 
  @classmethod
  def _log_before_sleep(
- cls, search_string: str, retry_state: RetryCallState | None
+ cls, url: str, params: dict | None, retry_state: RetryCallState | None
  ) -> None:
- """Context aware logging before sleeping after a failed request."""
+ """Context aware logging before sleeping after a failed HTTP request."""
  if retry_state and retry_state.outcome:
  logger.warning(
- f'Attempt {retry_state.attempt_number} of {cls.__name__} search with q="{search_string}" '
+ f"Attempt {retry_state.attempt_number} of {cls.__name__} HTTP request "
+ f'to url="{url}" with params="{params}" '
  f"failed with error: {retry_state.outcome.exception()}. "
  f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
  )
  else:
  logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
 
- def _create_search_result(self, url: str) -> SearchResult:
- """From a given url it creates the class:`SearchResult` instance."""
- # Get marketplace name
- domain = self._get_domain(url=url)
+ async def http_client_get(
+ self, url: str, params: dict | None = None, headers: dict | None = None
+ ) -> httpx.Response:
+ """Performs a GET request with retries.
 
- # Create and return the SearchResult object
- result = SearchResult(
- url=url,
- domain=domain,
- search_engine_name=self._search_engine_name,
+ Args:
+ retry: The retry strategy to use.
+ url: The URL to request.
+ params: Query parameters for the request.
+ headers: HTTP headers to use for the request.
+ """
+ # Perform the request and retry if necessary. There is some context aware logging:
+ # - `before`: before the request is made (and before retrying)
+ # - `before_sleep`: if the request fails before sleeping
+ retry = get_async_retry()
+ retry.before = lambda retry_state: self._log_before(
+ url=url, params=params, retry_state=retry_state
  )
- return result
+ retry.before_sleep = lambda retry_state: self._log_before_sleep(
+ url=url, params=params, retry_state=retry_state
+ )
+
+ async for attempt in retry:
+ with attempt:
+ response = await self._http_client.get(
+ url=url,
+ params=params,
+ headers=headers,
+ )
+ response.raise_for_status()
+ return response
+
+ # In case of not entering the for loop (for some strange reason)
+ raise RuntimeError("Retry exhausted without success")
 
 
  class SerpAPI(SearchEngine):
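All engines now route their GET requests through http_client_get, which drives an async retry object from get_async_retry() (defined in fraudcrawler/base/retry.py, which is unchanged and not shown in this diff). As an assumption of roughly what such a factory could look like, built on tenacity's AsyncRetrying so the retry.before / retry.before_sleep hooks above can be attached afterwards:

    import httpx
    from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential

    def get_async_retry(stop_after: int = 3) -> AsyncRetrying:
        # Illustrative only: retry transient HTTP failures with exponential backoff.
        return AsyncRetrying(
            retry=retry_if_exception_type(httpx.HTTPError),
            stop=stop_after_attempt(stop_after),
            wait=wait_exponential(multiplier=1, max=30),
            reraise=True,
        )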
@@ -109,7 +153,7 @@ class SerpAPI(SearchEngine):
  http_client: An httpx.AsyncClient to use for the async requests.
  api_key: The API key for SerpAPI.
  """
- self._http_client = http_client
+ super().__init__(http_client=http_client)
  self._api_key = api_key
 
  @property
@@ -205,22 +249,10 @@ class SerpAPI(SearchEngine):
  }
  logger.debug(f"SerpAPI search with params: {params}")
 
- # Perform the request and retry if necessary. There is some context aware logging:
- # - `before`: before the request is made (and before retrying)
- # - `before_sleep`: if the request fails before sleeping
- retry = get_async_retry()
- retry.before = lambda retry_state: self._log_before(
- search_string=search_string, retry_state=retry_state
- )
- retry.before_sleep = lambda retry_state: self._log_before_sleep(
- search_string=search_string, retry_state=retry_state
+ # Perform the search request
+ response: httpx.Response = await self.http_client_get(
+ url=self._endpoint, params=params
  )
- async for attempt in retry:
- with attempt:
- response = await self._http_client.get(
- url=self._endpoint, params=params
- )
- response.raise_for_status()
 
  # Extract the URLs from the response
  data = response.json()
@@ -336,7 +368,21 @@ class SerpAPIGoogleShopping(SerpAPI):
  """
  results = data.get("shopping_results")
  if results is not None:
- return [url for res in results if (url := res.get("product_link"))]
+ # return [url for res in results if (url := res.get("product_link"))] # c.f. https://github.com/serpapi/public-roadmap/issues/3045
+ return [
+ url
+ for res in results
+ if (url := res.get("serpapi_immersive_product_api"))
+ ]
+ return []
+
+ @staticmethod
+ def _extract_product_urls_from_immersive_product_api(data: dict) -> List[str]:
+ """Extracts product urls from the serpapi immersive product API data."""
+ if results := data.get("product_results"):
+ stores = results.get("stores", [])
+ urls = [url for sre in stores if (url := sre.get("link"))]
+ return list(set(urls))
  return []
 
  async def search(
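Because SerpAPI's product_link field is affected by the linked roadmap issue, the search now collects serpapi_immersive_product_api links and later resolves them to merchant URLs via the immersive product payload. A rough sketch of the payload shape the extractor above expects; the field names come from the code, the values are invented:

    data = {
        "product_results": {
            "stores": [
                {"link": "https://shop-a.example/product/123"},
                {"link": "https://shop-b.example/p/456"},
            ]
        }
    }

    # Mirrors _extract_product_urls_from_immersive_product_api: collect store links, de-duplicate.
    stores = data.get("product_results", {}).get("stores", [])
    urls = list({url for store in stores if (url := store.get("link"))})
    print(urls)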
@@ -349,6 +395,9 @@ class SerpAPIGoogleShopping(SerpAPI):
  ) -> List[SearchResult]:
  """Performs a google shopping search using SerpApi and returns SearchResults.
 
+ Similar to Toppreise, this method extracts merchant URLs from Google Shopping product pages
+ and creates multiple SearchResult objects for each merchant URL found.
+
  Args:
  search_term: The search term to use for the query.
  language: The language to use for the query ('hl' parameter).
@@ -362,7 +411,7 @@ class SerpAPIGoogleShopping(SerpAPI):
  marketplaces=marketplaces,
  )
 
- # Perform the search
+ # Perform the search to get Google Shopping URLs
  urls = await self._search(
  search_string=search_string,
  language=language,
@@ -375,10 +424,10 @@ class SerpAPIGoogleShopping(SerpAPI):
  # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
  urls = urls[:num_results]
 
- # Create and return SearchResult objects from the URLs
+ # Create SearchResult objects from merchant URLs (similar to Toppreise pattern)
  results = [self._create_search_result(url=url) for url in urls]
  logger.debug(
- f'Produced {len(results)} results from SerpAPI with engine="{self._engine}" and q="{search_string}".'
+ f'Produced {len(results)} results from Google Shopping search with q="{search_string}".'
  )
  return results
 
@@ -387,14 +436,6 @@ class Toppreise(SearchEngine):
  """Search engine for toppreise.ch."""
 
  _endpoint = "https://www.toppreise.ch/"
- _headers = {
- "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
- "Accept-Language": "en-US,en;q=0.5",
- "Accept-Encoding": "gzip, deflate",
- "Connection": "keep-alive",
- "Upgrade-Insecure-Requests": "1",
- }
 
  def __init__(self, http_client: httpx.AsyncClient, zyteapi_key: str):
  """Initializes the Toppreise client.
@@ -403,9 +444,42 @@ class Toppreise(SearchEngine):
  http_client: An httpx.AsyncClient to use for the async requests.
  zyteapi_key: ZyteAPI key for fallback when direct access fails.
  """
- self._http_client = http_client
+ super().__init__(http_client=http_client)
  self._zyteapi = ZyteAPI(http_client=http_client, api_key=zyteapi_key)
 
+ async def http_client_get_with_fallback(self, url: str) -> bytes:
+ """Performs a GET request with retries.
+
+ If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+ content using Zyte proxy mode.
+
+ Args:
+ url: The URL to request.
+ """
+ # Try to access the URL directly
+ try:
+ response: httpx.Response = await self.http_client_get(
+ url=url, headers=self._headers
+ )
+ content = response.content
+
+ # If we get a 403 Error (can happen depending on IP/location of deployment),
+ # we try to unblock the URL using Zyte proxy mode
+ except httpx.HTTPStatusError as err_direct:
+ if err_direct.response.status_code == 403:
+ logger.warning(
+ f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
+ )
+ try:
+ content = await self._zyteapi.unblock_url_content(url)
+ except Exception as err_resolve:
+ msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
+ logger.error(msg)
+ raise httpx.HTTPError(msg) from err_resolve
+ else:
+ raise err_direct
+ return content
+
  @classmethod
  def _get_search_endpoint(cls, language: Language) -> str:
  """Get the search endpoint based on the language."""
@@ -502,46 +576,6 @@ class Toppreise(SearchEngine):
  """The name of the search engine."""
  return SearchEngineName.TOPPREISE.value
 
- async def http_client_get_with_fallback(
- self, url: str, retry: AsyncRetrying
- ) -> bytes:
- """Performs a GET request with retries.
-
- If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
- content using Zyte proxy mode.
-
- Args:
- url: The URL to request.
- retry: The retry strategy to use.
- """
- # Try to access the URL directly
- try:
- async for attempt in retry:
- with attempt:
- response = await self._http_client.get(
- url=url,
- headers=self._headers,
- )
- response.raise_for_status()
- content = response.content
-
- # If we get a 403 Error (can happen depending on IP/location of deployment),
- # we try to unblock the URL using Zyte proxy mode
- except httpx.HTTPStatusError as err_direct:
- if err_direct.response.status_code == 403:
- logger.warning(
- f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
- )
- try:
- content = await self._zyteapi.unblock_url_content(url)
- except Exception as err_resolve:
- msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
- logger.error(msg)
- raise httpx.HTTPError(msg) from err_resolve
- else:
- raise err_direct
- return content
-
  async def _search(
  self, search_string: str, language: Language, num_results: int
  ) -> List[str]:
@@ -561,17 +595,8 @@ class Toppreise(SearchEngine):
  url = f"{endpoint}?q={encoded_search}"
  logger.debug(f"Toppreise search URL: {url}")
 
- # Perform the request and retry if necessary. There is some context aware logging:
- # - `before`: before the request is made (and before retrying)
- # - `before_sleep`: if the request fails before sleeping
- retry = get_async_retry()
- retry.before = lambda retry_state: self._log_before(
- search_string=search_string, retry_state=retry_state
- )
- retry.before_sleep = lambda retry_state: self._log_before_sleep(
- search_string=search_string, retry_state=retry_state
- )
- content = await self.http_client_get_with_fallback(url=url, retry=retry)
+ # Perform the request with fallback if necessary
+ content = await self.http_client_get_with_fallback(url=url)
 
  # Get external product urls from the content
  urls = self._extract_product_urls_from_search_page(content=content)
@@ -633,61 +658,44 @@ class Searcher(DomainUtils):
  zyteapi_key=zyteapi_key,
  )
 
- @staticmethod
- def _post_search_log_before(url: str, retry_state: RetryCallState | None) -> None:
- """Context aware logging before the request is made."""
- if retry_state:
- logger.debug(
- f'Performing post search for url="{url}" '
- f"(attempt {retry_state.attempt_number})."
- )
- else:
- logger.debug(f"retry_state is {retry_state}; not logging before.")
+ async def _post_search_google_shopping_immersive(self, url: str) -> List[str]:
+ """Post-search for product URLs from a Google Shopping immersive product page.
 
- @staticmethod
- def _post_search_log_before_sleep(
- url: str, retry_state: RetryCallState | None
- ) -> None:
- """Context aware logging before sleeping after a failed request."""
- if retry_state and retry_state.outcome:
- logger.warning(
- f'Attempt {retry_state.attempt_number} of post search for url="{url}" '
- f"failed with error: {retry_state.outcome.exception()}. "
- f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
- )
- else:
- logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
+ Args:
+ url: The URL of the Google Shopping product page.
+ """
+ # Add SerpAPI key to the url
+ sep = "&" if "?" in url else "?"
+ url = f"{url}{sep}api_key={self._google_shopping._api_key}"
+
+ # Fetch the content of the Google Shopping product page
+ response = await self._google_shopping.http_client_get(url=url)
+
+ # Get external product urls from the data
+ data = response.json()
+ urls = self._google_shopping._extract_product_urls_from_immersive_product_api(
+ data=data
+ )
+ return urls
 
  async def _post_search_toppreise_comparison(self, url: str) -> List[str]:
  """Post-search for product URLs from a Toppreise product comparison page.
 
  Note:
  In comparison to the function Toppreise._search, here we extract the urls from
- product comparison pages (f.e. https://www.toppreise.ch/preisvergleich/). They can
- also be found in the results of a google search.
+ product comparison pages (f.e. https://www.toppreise.ch/preisvergleich/). These
+ pages can also be found in the results of a google search.
 
  Args:
  url: The URL of the Toppreise product listing page.
  """
- # Perform the request and retry if necessary. There is some context aware logging:
- # - `before`: before the request is made (and before retrying)
- # - `before_sleep`: if the request fails before sleeping
- retry = get_async_retry(stop_after=self._post_search_retry_stop_after)
- retry.before = lambda retry_state: self._post_search_log_before(
- url=url, retry_state=retry_state
- )
- retry.before_sleep = lambda retry_state: self._post_search_log_before_sleep(
- url=url, retry_state=retry_state
- )
- content = await self._toppreise.http_client_get_with_fallback(
- url=url, retry=retry
- )
+ # Perform the request with fallback if necessary
+ content = await self._toppreise.http_client_get_with_fallback(url=url)
 
  # Get external product urls from the content
  urls = self._toppreise._extract_product_urls_from_comparison_page(
  content=content
  )
-
  return urls
 
  async def _post_search(self, results: List[SearchResult]) -> List[SearchResult]:
@@ -703,9 +711,22 @@ class Searcher(DomainUtils):
  post_search_results: List[SearchResult] = []
  for res in results:
  url = res.url
+ post_search_urls: List[str] = []
+
+ # Extract embedded product URLs from the Google Shopping immersive product page
+ if "engine=google_immersive_product" in url:
+ logger.debug(
+ f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+ )
+ post_search_urls = await self._post_search_google_shopping_immersive(
+ url=url
+ )
+ logger.debug(
+ f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+ )
 
  # Extract embedded product URLs from the Toppreise product listing page
- if any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
+ elif any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
  logger.debug(
  f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
  )
@@ -714,15 +735,16 @@ class Searcher(DomainUtils):
  f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
  )
 
- psr = [
- SearchResult(
- url=psu,
- domain=self._get_domain(url=psu),
- search_engine_name=res.search_engine_name,
- )
- for psu in post_search_urls
- ]
- post_search_results.extend(psr)
+ # Add the extracted product URLs as SearchResult objects
+ psr = [
+ SearchResult(
+ url=psu,
+ domain=self._get_domain(url=psu),
+ search_engine_name=res.search_engine_name,
+ )
+ for psu in post_search_urls
+ ]
+ post_search_results.extend(psr)
 
  return post_search_results
 
fraudcrawler/scraping/zyte.py CHANGED
@@ -2,11 +2,12 @@ from base64 import b64decode
  import logging
  from typing import List
 
+ from bs4 import BeautifulSoup
  import httpx
  from tenacity import RetryCallState
 
  from fraudcrawler.settings import ZYTE_DEFALUT_PROBABILITY_THRESHOLD
- from fraudcrawler.base.base import DomainUtils
+ from fraudcrawler.base.base import DomainUtils, ProductItem
  from fraudcrawler.base.retry import get_async_retry
 
  logger = logging.getLogger(__name__)
@@ -61,77 +62,8 @@ class ZyteAPI(DomainUtils):
  else:
  logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
 
- async def details(self, url: str) -> dict:
- """Fetches product details for a single URL.
-
- Args:
- url: The URL to fetch product details from.
-
- Returns:
- A dictionary containing the product details, fields include:
- (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
- {
- "url": str,
- "statusCode": str,
- "product": {
- "name": str,
- "price": str,
- "mainImage": {"url": str},
- "images": [{"url": str}],
- "description": str,
- "metadata": {
- "probability": float,
- },
- },
- "httpResponseBody": base64
- }
- """
- logger.info(f"Fetching product details by Zyte for URL {url}.")
-
- # Perform the request and retry if necessary. There is some context aware logging:
- # - `before`: before the request is made (and before retrying)
- # - `before_sleep`: if the request fails before sleeping
- retry = get_async_retry()
- retry.before = lambda retry_state: self._log_before(
- url=url, retry_state=retry_state
- )
- retry.before_sleep = lambda retry_state: self._log_before_sleep(
- url=url, retry_state=retry_state
- )
- async for attempt in retry:
- with attempt:
- response = await self._http_client.post(
- url=self._endpoint,
- json={"url": url, **self._config},
- auth=(self._api_key, ""), # API key as username, empty password
- )
- response.raise_for_status()
-
- details = response.json()
- return details
-
- @staticmethod
- def keep_product(
- details: dict,
- threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
- ) -> bool:
- """Determines whether to keep the product based on the probability threshold.
-
- Args:
- details: A product details data dictionary.
- threshold: The probability threshold used to filter the products.
- """
- try:
- prob = float(details["product"]["metadata"]["probability"])
- except KeyError:
- logger.warning(
- f"Product with url={details.get('url')} has no probability value - product is ignored"
- )
- return False
- return prob > threshold
-
  @staticmethod
- def extract_product_name(details: dict) -> str | None:
+ def _extract_product_name(details: dict) -> str | None:
  """Extracts the product name from the product data.
 
  The input argument is a dictionary of the following structure:
@@ -144,7 +76,7 @@ class ZyteAPI(DomainUtils):
  return details.get("product", {}).get("name")
 
  @staticmethod
- def extract_url_resolved(details: dict) -> str | None:
+ def _extract_url_resolved(details: dict) -> str | None:
  """Extracts the resolved URL from the product data - this is automatically resolved by Zyte.
 
  The input argument is a dictionary of the following structure:
@@ -157,7 +89,7 @@ class ZyteAPI(DomainUtils):
  return details.get("product", {}).get("url")
 
  @staticmethod
- def extract_product_price(details: dict) -> str | None:
+ def _extract_product_price(details: dict) -> str | None:
  """Extracts the product price from the product data.
 
  The input argument is a dictionary of the following structure:
@@ -170,7 +102,7 @@ class ZyteAPI(DomainUtils):
  return details.get("product", {}).get("price")
 
  @staticmethod
- def extract_product_description(details: dict) -> str | None:
+ def _extract_product_description(details: dict) -> str | None:
  """Extracts the product description from the product data.
 
  The input argument is a dictionary of the following structure:
@@ -183,7 +115,7 @@ class ZyteAPI(DomainUtils):
  return details.get("product", {}).get("description")
 
  @staticmethod
- def extract_image_urls(details: dict) -> List[str]:
+ def _extract_image_urls(details: dict) -> List[str]:
  """Extracts the images from the product data.
 
  The input argument is a dictionary of the following structure:
@@ -206,7 +138,7 @@ class ZyteAPI(DomainUtils):
  return images
 
  @staticmethod
- def extract_probability(details: dict) -> float:
+ def _extract_probability(details: dict) -> float:
  """Extracts the probability from the product data.
 
  The input argument is a dictionary of the following structure:
@@ -223,7 +155,7 @@ class ZyteAPI(DomainUtils):
  )
 
  @staticmethod
- def extract_html(details: dict) -> str | None:
+ def _extract_html(details: dict) -> str | None:
  """Extracts the HTML from the Zyte API response.
 
  The input argument is a dictionary of the following structure:
@@ -243,6 +175,51 @@ class ZyteAPI(DomainUtils):
  return decoded_string
  return None
 
+ def enrich_context(self, product: ProductItem, details: dict) -> ProductItem:
+ product.product_name = self._extract_product_name(details=details)
+
+ url_resolved = self._extract_url_resolved(details=details)
+ if url_resolved:
+ product.url_resolved = url_resolved
+
+ # If the resolved URL is different from the original URL, we also need to update the domain as
+ # otherwise the unresolved domain will be shown.
+ # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
+ if url_resolved and url_resolved != product.url:
+ logger.debug(f"URL resolved for {product.url} is {url_resolved}")
+ product.domain = self._get_domain(url=url_resolved)
+
+ product.product_price = self._extract_product_price(details=details)
+ product.product_description = self._extract_product_description(details=details)
+ product.product_images = self._extract_image_urls(details=details)
+ product.probability = self._extract_probability(details=details)
+ product.html = self._extract_html(details=details)
+ if product.html:
+ soup = BeautifulSoup(product.html, "html.parser")
+ product.html_clean = soup.get_text(separator=" ", strip=True)
+
+ return product
+
+ @staticmethod
+ def keep_product(
+ details: dict,
+ threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
+ ) -> bool:
+ """Determines whether to keep the product based on the probability threshold.
+
+ Args:
+ details: A product details data dictionary.
+ threshold: The probability threshold used to filter the products.
+ """
+ try:
+ prob = float(details["product"]["metadata"]["probability"])
+ except KeyError:
+ logger.warning(
+ f"Product with url={details.get('url')} has no probability value - product is ignored"
+ )
+ return False
+ return prob > threshold
+
  async def unblock_url_content(self, url: str) -> bytes:
  """Unblock the content of an URL using Zyte proxy mode.
 
@@ -256,3 +233,52 @@ class ZyteAPI(DomainUtils):
  raise httpx.HTTPError("No httpResponseBody in Zyte response")
 
  return b64decode(details["httpResponseBody"])
+
+ async def details(self, url: str) -> dict:
+ """Fetches product details for a single URL.
+
+ Args:
+ url: The URL to fetch product details from.
+
+ Returns:
+ A dictionary containing the product details, fields include:
+ (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
+ {
+ "url": str,
+ "statusCode": str,
+ "product": {
+ "name": str,
+ "price": str,
+ "mainImage": {"url": str},
+ "images": [{"url": str}],
+ "description": str,
+ "metadata": {
+ "probability": float,
+ },
+ },
+ "httpResponseBody": base64
+ }
+ """
+ logger.info(f"Fetching product details by Zyte for URL {url}.")
+
+ # Perform the request and retry if necessary. There is some context aware logging:
+ # - `before`: before the request is made (and before retrying)
+ # - `before_sleep`: if the request fails before sleeping
+ retry = get_async_retry()
+ retry.before = lambda retry_state: self._log_before(
+ url=url, retry_state=retry_state
+ )
+ retry.before_sleep = lambda retry_state: self._log_before_sleep(
+ url=url, retry_state=retry_state
+ )
+ async for attempt in retry:
+ with attempt:
+ response = await self._http_client.post(
+ url=self._endpoint,
+ json={"url": url, **self._config},
+ auth=(self._api_key, ""), # API key as username, empty password
+ )
+ response.raise_for_status()
+
+ details = response.json()
+ return details
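For reference, a minimal Zyte-style details payload of the shape documented in the details() docstring above, and the kind of values enrich_context pulls out of it; the concrete values here are invented:

    details = {
        "url": "https://www.toppreise.ch/redirect/123",
        "product": {
            "name": "Kaffeebohnen 1kg",
            "url": "https://www.digitec.ch/de/product/123",  # resolved URL
            "price": "19.90",
            "description": "Ganze Bohnen",
            "images": [{"url": "https://example.ch/img/1.jpg"}],
            "metadata": {"probability": 0.87},
        },
    }

    # Simplified view of what enrich_context extracts (no ProductItem involved here):
    name = details.get("product", {}).get("name")
    url_resolved = details.get("product", {}).get("url")
    probability = details.get("product", {}).get("metadata", {}).get("probability", 0.0)
    print(name, url_resolved, probability > 0.1)  # 0.1 is ZYTE_DEFALUT_PROBABILITY_THRESHOLD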
fraudcrawler/settings.py CHANGED
@@ -78,6 +78,14 @@ ENRICHMENT_DEFAULT_LIMIT = 10
  # Zyte settings
  ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
 
+ # Exact match settings
+ EXACT_MATCH_PRODUCT_FIELDS = {
+ "url_resolvedproduct_name",
+ "product_description",
+ "html",
+ }
+ EXACT_MATCH_FIELD_SEPARATOR = "\n"
+
  # Processor settings
  PROCESSOR_DEFAULT_MODEL = "gpt-4o"
  PROCESSOR_DEFAULT_IF_MISSING = -1
fraudcrawler-0.6.0.dist-info/METADATA → fraudcrawler-0.6.2.dist-info/METADATA
@@ -1,9 +1,9 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.4
  Name: fraudcrawler
- Version: 0.6.0
+ Version: 0.6.2
  Summary: Intelligent Market Monitoring
- Home-page: https://github.com/open-veanu/fraudcrawler
  License: MIT
+ License-File: LICENSE
  Author: Domingo Bertus
  Author-email: hello@veanu.ch
  Requires-Python: >=3.11,<4.0
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
  Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
  Requires-Dist: httpx (>=0.28.1,<0.29.0)
  Requires-Dist: openai (>=1.68.2,<2.0.0)
fraudcrawler-0.6.0.dist-info/RECORD → fraudcrawler-0.6.2.dist-info/RECORD
@@ -1,22 +1,22 @@
  fraudcrawler/__init__.py,sha256=oSwuiyVBBk_HZfeZxXJR0ELtA4mc-upsBMVHSwuokEo,846
  fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/base/base.py,sha256=IbkPookAAkqDCztzAvVRnhh8rCsYGlY69eI6cw-Kiw0,7294
+ fraudcrawler/base/base.py,sha256=mTmojNyVrPEB69-aI-43dl0Jct174G4ziBiOudDFfTY,7795
  fraudcrawler/base/client.py,sha256=obxrd65pYja--XQbgpIMsMO6erMNdRG68SzNUs_YvLM,5856
  fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
  fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
- fraudcrawler/base/orchestrator.py,sha256=n0xrMJ9a3g3cRAMmhKEgyrwwrbgsaMno9DeyE93jB5U,27006
+ fraudcrawler/base/orchestrator.py,sha256=TiLKAJTBIPf0dxJuyZnCGIMWReC9gNvmEXqWwE0Ykbs,29002
  fraudcrawler/base/retry.py,sha256=1Ox7RsnnF62dP53rkidRHetA5mr2HS1R-7FskCVbwug,1178
- fraudcrawler/launch_demo_pipeline.py,sha256=hTzGFQDEwchDSwUx0HgG_TW5h9J7BXM7jn_iB8iI838,4636
+ fraudcrawler/launch_demo_pipeline.py,sha256=_aDqaPdxE_DMwQY5_vpqF2YjwLkWIZq5Z9Tz3sqLKdg,4629
  fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  fraudcrawler/processing/processor.py,sha256=zetp_G5g4z8sBUq-5qOxVRF2W2h9FIwolVxvMqhTmXs,7619
  fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
- fraudcrawler/scraping/search.py,sha256=pMjTQEewa-jP6l2ndhHy8CNIcO4svhZOm6N_LNuv3gs,33925
+ fraudcrawler/scraping/search.py,sha256=Anm8ymjCH3BVttogHY-_03YRc64yJswJ8OP8DW56O48,34546
  fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
- fraudcrawler/scraping/zyte.py,sha256=SxucVH_wtVhPNImIXvijM528IwL6zl6I3ndf0OdVXY0,8860
- fraudcrawler/settings.py,sha256=Bp9_9w_RRr_-PtZXcy30EKbT9YiOc8OLjEMaNZh06vc,3875
- fraudcrawler-0.6.0.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
- fraudcrawler-0.6.0.dist-info/METADATA,sha256=adpYLe_ToSth-YOZE3eh-KNUsNmcwcM_SE7pqKikNmU,6704
- fraudcrawler-0.6.0.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
- fraudcrawler-0.6.0.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
- fraudcrawler-0.6.0.dist-info/RECORD,,
+ fraudcrawler/scraping/zyte.py,sha256=sYpfwMuGE9MYpKvma_8x5Th2VBFn25Mqb4Wd7UChL_g,10215
+ fraudcrawler/settings.py,sha256=9ukAkxEzDtvy3xA-jSF3asr9uLIAATNQ-FqrsgCEDUk,4038
+ fraudcrawler-0.6.2.dist-info/METADATA,sha256=5hzWjCm1eQJ19Pm3vxUsS_EciUmbuppEpECi8ye2Wyw,6723
+ fraudcrawler-0.6.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ fraudcrawler-0.6.2.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+ fraudcrawler-0.6.2.dist-info/licenses/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+ fraudcrawler-0.6.2.dist-info/RECORD,,
fraudcrawler-0.6.0.dist-info/WHEEL → fraudcrawler-0.6.2.dist-info/WHEEL
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.0.0
+ Generator: poetry-core 2.2.1
  Root-Is-Purelib: true
  Tag: py3-none-any