fraudcrawler 0.4.3__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



fraudcrawler/base/base.py CHANGED
@@ -63,6 +63,14 @@ class Host(BaseModel):
         return [cls._normalize_domain(dom.strip()) for dom in val]
 
 
+class ClassificationResult(BaseModel):
+    """Model for classification results."""
+
+    result: int
+    input_tokens: int
+    output_tokens: int
+
+
 class Location(BaseModel):
     """Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""
 
@@ -137,6 +145,9 @@ class ProductItem(BaseModel):
     # Processor parameters are set dynamic so we must allow extra fields
     classifications: Dict[str, int] = Field(default_factory=dict)
 
+    # Usage parameters
+    usage: Dict[str, Dict[str, int]] = Field(default_factory=dict)
+
     # Filtering parameters
     filtered: bool = False
     filtered_at_stage: str | None = None
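Taken together with the orchestrator change below, each configured prompt now records both its class label and its token usage on the product. A minimal sketch of the resulting shape, assuming a prompt named "relevance" (the name and numbers are illustrative, not taken from the package):

# Illustrative only: state of a ProductItem after one prompt named "relevance" has run.
product.classifications  # {"relevance": 1}
product.usage            # {"relevance": {"input_tokens": 812, "output_tokens": 1}}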
fraudcrawler/base/orchestrator.py CHANGED
@@ -7,10 +7,6 @@ from bs4 import BeautifulSoup
 
 from fraudcrawler.settings import (
     PROCESSOR_DEFAULT_MODEL,
-    PROCESSOR_DEFAULT_IF_MISSING,
-    PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
-    MAX_RETRIES,
-    RETRY_DELAY,
 )
 from fraudcrawler.settings import (
     DEFAULT_N_SERP_WKRS,
@@ -61,9 +57,6 @@ class Orchestrator(ABC):
         zyteapi_key: str,
         openaiapi_key: str,
         openai_model: str = PROCESSOR_DEFAULT_MODEL,
-        max_retries: int = MAX_RETRIES,
-        retry_delay: int = RETRY_DELAY,
-        default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
         n_serp_wkrs: int = DEFAULT_N_SERP_WKRS,
         n_zyte_wkrs: int = DEFAULT_N_ZYTE_WKRS,
         n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
@@ -77,25 +70,18 @@ class Orchestrator(ABC):
             zyteapi_key: The API key for Zyte API.
             openaiapi_key: The API key for OpenAI.
             openai_model: The model to use for the processing (optional).
-            max_retries: Maximum number of retries for API calls (optional).
-            retry_delay: Delay between retries in seconds (optional).
             n_serp_wkrs: Number of async workers for serp (optional).
             n_zyte_wkrs: Number of async workers for zyte (optional).
             n_proc_wkrs: Number of async workers for the processor (optional).
         """
         # Setup the clients
-        self._serpapi = SerpApi(
-            api_key=serpapi_key, max_retries=max_retries, retry_delay=retry_delay
-        )
+        self._serpapi = SerpApi(api_key=serpapi_key)
         self._enricher = Enricher(user=dataforseo_user, pwd=dataforseo_pwd)
         self._url_collector = URLCollector()
-        self._zyteapi = ZyteApi(
-            api_key=zyteapi_key, max_retries=max_retries, retry_delay=retry_delay
-        )
+        self._zyteapi = ZyteApi(api_key=zyteapi_key)
         self._processor = Processor(
             api_key=openaiapi_key,
             model=openai_model,
-            default_if_missing=default_if_missing,
         )
 
         # Setup the async framework
@@ -249,7 +235,6 @@ class Orchestrator(ABC):
         """
 
         # Process the products
-
         while True:
             product = await queue_in.get()
             if product is None:
@@ -259,31 +244,23 @@ class Orchestrator(ABC):
 
             if not product.filtered:
                 try:
-                    url = product.url
                     # Run all the configured prompts
                     for prompt in prompts:
-                        # Dynamically build product_details string
-                        details = []
-                        for field in prompt.product_item_fields:
-                            value = getattr(product, field, None)
-                            if value is not None:
-                                details.append(
-                                    PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
-                                        field_name=field, field_value=value
-                                    )
-                                )
-                        product_details = "\n\n".join(details)
-                        logger.debug(
-                            f"Classify product at {url} with prompt {prompt.name} and details: {product_details}"
-                        )
                         classification = await self._processor.classify(
+                            product=product,
                             prompt=prompt,
-                            url=url,
-                            product_details=product_details,
                         )
-                        product.classifications[prompt.name] = classification
+                        product.classifications[prompt.name] = int(
+                            classification.result
+                        )
+                        product.usage[prompt.name] = {
+                            "input_tokens": classification.input_tokens,
+                            "output_tokens": classification.output_tokens,
+                        }
                 except Exception as e:
-                    logger.warning(f"Error processing product: {e}.")
+                    logger.warning(
+                        f"Error processing product with url={product.url}: {e}."
+                    )
 
             await queue_out.put(product)
             queue_in.task_done()
fraudcrawler/base/retry.py ADDED
@@ -0,0 +1,37 @@
+from aiohttp.web_exceptions import HTTPException
+from tenacity import (
+    AsyncRetrying,
+    retry_if_exception,
+    stop_after_attempt,
+    wait_exponential_jitter,
+)
+
+from fraudcrawler.settings import (
+    RETRY_STOP_AFTER_ATTEMPT,
+    RETRY_INITIAL_DELAY,
+    RETRY_MAX_DELAY,
+    RETRY_EXP_BASE,
+    RETRY_JITTER,
+    RETRY_SKIP_IF_CODE,
+)
+
+
+def _is_retryable_exception(err: BaseException) -> bool:
+    if isinstance(err, HTTPException) and err.status_code in RETRY_SKIP_IF_CODE:
+        return False
+    return True
+
+
+def get_async_retry() -> AsyncRetrying:
+    """returns the retry configuration for async operations."""
+    return AsyncRetrying(
+        retry=retry_if_exception(_is_retryable_exception),
+        stop=stop_after_attempt(RETRY_STOP_AFTER_ATTEMPT),
+        wait=wait_exponential_jitter(
+            initial=RETRY_INITIAL_DELAY,
+            max=RETRY_MAX_DELAY,
+            exp_base=RETRY_EXP_BASE,
+            jitter=RETRY_JITTER,
+        ),
+        reraise=True,
+    )
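The helper above only builds the AsyncRetrying object; callers attach their own logging callbacks and drive the retries themselves. A minimal consumption sketch mirroring the pattern used in serp.py and zyte.py below (`do_request` is a placeholder, not a function from the package):

# Sketch, not package code: wrap an arbitrary awaitable in the shared retry policy.
retry = get_async_retry()
retry.before = lambda retry_state: None        # optional context-aware logging hook
retry.before_sleep = lambda retry_state: None  # optional hook run before backing off
async for attempt in retry:
    with attempt:
        response = await do_request()          # re-attempted unless the error is non-retryable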
fraudcrawler/processing/processor.py CHANGED
@@ -1,11 +1,15 @@
 import logging
 
 from openai import AsyncOpenAI
+from tenacity import RetryCallState
 
-from fraudcrawler.base.base import Prompt
+from fraudcrawler.base.base import ProductItem, Prompt, ClassificationResult
+from fraudcrawler.base.retry import get_async_retry
 from fraudcrawler.settings import (
+    PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
     PROCESSOR_USER_PROMPT_TEMPLATE,
     PROCESSOR_DEFAULT_IF_MISSING,
+    PROCESSOR_EMPTY_TOKEN_COUNT,
 )
 
 
@@ -20,6 +24,7 @@ class Processor:
         api_key: str,
         model: str,
         default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
+        empty_token_count: int = PROCESSOR_EMPTY_TOKEN_COUNT,
     ):
         """Initializes the Processor.
 
@@ -27,17 +32,66 @@ class Processor:
             api_key: The OpenAI API key.
             model: The OpenAI model to use.
             default_if_missing: The default classification to return if error occurs.
+            empty_token_count: The default value to return as tokensif the classification is empty.
         """
         self._client = AsyncOpenAI(api_key=api_key)
         self._model = model
-        self._default_if_missing = default_if_missing
+        self._error_response = ClassificationResult(
+            result=default_if_missing,
+            input_tokens=empty_token_count,
+            output_tokens=empty_token_count,
+        )
+
+    @staticmethod
+    def _get_product_details(product: ProductItem, prompt: Prompt) -> str:
+        """Extracts product details based on the prompt configuration.
+
+        Args:
+            product: The product item to extract details from.
+            prompt: The prompt configuration containing field names.
+        """
+        details = []
+        for field in prompt.product_item_fields:
+            if value := getattr(product, field, None):
+                details.append(
+                    PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
+                        field_name=field, field_value=value
+                    )
+                )
+            else:
+                logger.error(
+                    f'Field "{field}" is missing in ProductItem with url="{product.url}"'
+                )
+        return "\n\n".join(details)
+
+    @staticmethod
+    def _log_before(url: str, prompt: Prompt, retry_state: RetryCallState) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f"Classifying product with url={url} using prompt={prompt} (Attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+    @staticmethod
+    def _log_before_sleep(
+        url: str, prompt: Prompt, retry_state: RetryCallState
+    ) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f"Attempt {retry_state.attempt_number} of classifying product with url={url} using prompt={prompt} "
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
 
     async def _call_openai_api(
         self,
         system_prompt: str,
         user_prompt: str,
         **kwargs,
-    ) -> str:
+    ) -> ClassificationResult:
         """Calls the OpenAI API with the given user prompt."""
         response = await self._client.chat.completions.create(
             model=self._model,
@@ -50,15 +104,35 @@ class Processor:
         content = response.choices[0].message.content
         if not content:
             raise ValueError("Empty response from OpenAI API")
-        return content
 
-    async def classify(self, prompt: Prompt, url: str, product_details: str) -> int:
-        """A generic classification method that classifies a product based on a prompt object.
+        # Convert the content to an integer
+        try:
+            content = int(content.strip())
+        except Exception as e:
+            msg = f"Failed to convert OpenAI response '{content}' to integer: {e}"
+            logger.error(msg)
+            raise ValueError(msg)
+
+        # For tracking consumption we alre return the tokens used
+        classification = ClassificationResult(
+            result=content,
+            input_tokens=response.usage.prompt_tokens,
+            output_tokens=response.usage.completion_tokens,
+        )
+
+        return classification
+
+    async def classify(
+        self,
+        product: ProductItem,
+        prompt: Prompt,
+    ) -> ClassificationResult:
+        """A generic classification method that classifies a product based on a prompt object and returns
+        the classification, input tokens, and output tokens.
 
         Args:
-            prompt: A dictionary with keys "system_prompt", etc.
-            url: Product URL (often used in the user_prompt).
-            product_details: String with product details, formatted per prompt.product_item_fields.
+            product: The product item to classify.
+            prompt: The prompt to use for classification.
 
         Note:
             This method returns `PROCESSOR_DEFAULT_IF_MISSING` if:
@@ -66,12 +140,15 @@ class Processor:
             - an error occurs during the API call
             - if the response isn't in allowed_classes.
         """
-        # If required fields are missing, return the prompt's default fallback if provided.
+        url = product.url
+
+        # Form the product details from the ProductItem
+        product_details = self._get_product_details(product=product, prompt=prompt)
         if not product_details:
             logger.warning("Missing required product_details for classification.")
-            return self._default_if_missing
+            return self._error_response
 
-        # Substitute placeholders in user_prompt with the relevant arguments
+        # Prepare the user prompt
        user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
             product_details=product_details,
         )
@@ -79,24 +156,35 @@ class Processor:
         # Call the OpenAI API
         try:
             logger.debug(
-                f'Calling OpenAI API for classification (url="{url}", prompt="{prompt.name}")'
+                f"Classifying product with url={url} using prompt={prompt.name} and user_prompt={user_prompt}."
+            )
+            # Perform the request and retry if necessary. There is some context aware logging
+            # - `before`: before the request is made (or before retrying)
+            # - `before_sleep`: if the request fails before sleeping
+            retry = get_async_retry()
+            retry.before = lambda retry_state: self._log_before(
+                url=url, prompt=prompt, retry_state=retry_state
             )
-            content = await self._call_openai_api(
-                system_prompt=prompt.system_prompt,
-                user_prompt=user_prompt,
-                max_tokens=1,
+            retry.before_sleep = lambda retry_state: self._log_before_sleep(
+                url=url, prompt=prompt, retry_state=retry_state
             )
-            classification = int(content.strip())
+            async for attempt in retry:
+                with attempt:
+                    classification = await self._call_openai_api(
+                        system_prompt=prompt.system_prompt,
+                        user_prompt=user_prompt,
+                        max_tokens=1,
+                    )
 
             # Enforce that the classification is in the allowed classes
-            if classification not in prompt.allowed_classes:
+            if classification.result not in prompt.allowed_classes:
                 logger.warning(
-                    f"Classification '{classification}' not in allowed classes {prompt.allowed_classes}"
+                    f"Classification '{classification.result}' not in allowed classes {prompt.allowed_classes}"
                 )
-                return self._default_if_missing
+                return self._error_response
 
             logger.info(
-                f'Classification for url="{url}" (prompt={prompt.name}): {classification}'
+                f'Classification for url="{url}" (prompt={prompt.name}): {classification.result} and total tokens used: {classification.input_tokens + classification.output_tokens}'
             )
             return classification
 
@@ -104,4 +192,4 @@ class Processor:
             logger.error(
                 f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
             )
-            return self._default_if_missing
+            return self._error_response
fraudcrawler/scraping/enrich.py CHANGED
@@ -4,8 +4,11 @@ import logging
 from pydantic import BaseModel
 from typing import Dict, List, Iterator
 
+from tenacity import RetryCallState
+
 from fraudcrawler.settings import ENRICHMENT_DEFAULT_LIMIT
 from fraudcrawler.base.base import Location, Language, AsyncClient
+from fraudcrawler.base.retry import get_async_retry
 
 
 logger = logging.getLogger(__name__)
@@ -22,8 +25,6 @@ class Enricher(AsyncClient):
     """A client to interact with the DataForSEO API for enhancing searches (producing alternative search_terms)."""
 
     _auth_encoding = "ascii"
-    _max_retries = 3
-    _retry_delay = 2
     _base_endpoint = "https://api.dataforseo.com"
     _suggestions_endpoint = "/v3/dataforseo_labs/google/keyword_suggestions/live"
     _keywords_endpoint = "/v3/dataforseo_labs/google/related_keywords/live"
@@ -44,6 +45,28 @@ class Enricher(AsyncClient):
         "Content-Encoding": "gzip",
     }
 
+    @staticmethod
+    def _log_before(search_term: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f'DataForSEO suggested search with search="{search_term}" (attempt {retry_state.attempt_number}).'
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}, not logging before.")
+
+    @staticmethod
+    def _log_before_sleep(search_term: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f'Attempt {retry_state.attempt_number} DataForSEO suggested search with search_term="{search_term}" '
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}, not logging before_sleep.")
+
     @staticmethod
     def _extract_items_from_data(data: dict) -> Iterator[dict]:
         """Extracts the items from the DataForSEO response.
@@ -126,7 +149,7 @@ class Enricher(AsyncClient):
             limit: The upper limit of suggestions to get.
         """
 
-        # Data must be a list of dictionaries setting a number of search tasks; here we only have one task.
+        # Data must be a list of dictionaries, setting a number of search tasks; here we only have one task.
         data = [
             {
                 "keyword": search_term,
@@ -137,23 +160,25 @@ class Enricher(AsyncClient):
                 "include_seed_keyword": True,
             }
         ]
-        logger.debug(
-            f'DataForSEO search for suggested keywords with search_term="{search_term}".'
+        url = f"{self._base_endpoint}{self._suggestions_endpoint}"
+        logger.debug(f'DataForSEO url="{url}" with data="{data}".')
+
+        # Perform the request and retry if necessary. There is some context aware logging
+        # - `before`: before the request is made (or before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            search_term=search_term, retry_state=retry_state
         )
-        try:
-            url = f"{self._base_endpoint}{self._suggestions_endpoint}"
-            logger.debug(f'DataForSEO url="{url}" with data="{data}".')
-            sugg_data = await self.post(url=url, headers=self._headers, data=data)
-        except Exception as e:
-            logger.error(f"DataForSEO suggested search failed with error: {e}.")
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            search_term=search_term, retry_state=retry_state
+        )
+        async for attempt in retry:
+            with attempt:
+                sugg_data = await self.post(url=url, headers=self._headers, data=data)
 
         # Extract the keywords from the response
-        try:
-            keywords = self._extract_suggested_keywords(data=sugg_data)
-        except Exception as e:
-            logger.error(
-                f"Failed to extract suggested keywords from DataForSEO response with error: {e}."
-            )
+        keywords = self._extract_suggested_keywords(data=sugg_data)
 
         logger.debug(f"Found {len(keywords)} suggestions from DataForSEO search.")
         return keywords
@@ -271,22 +296,36 @@ class Enricher(AsyncClient):
             language: The language to use for the search.
             n_terms: The number of additional terms
         """
-        # Get the additional keywords
         logger.info(
             f'Applying enrichment for search_term="{search_term}" and n_terms="{n_terms}".'
         )
-        suggested = await self._get_suggested_keywords(
-            search_term=search_term,
-            location=location,
-            language=language,
-            limit=n_terms,
-        )
-        related = await self._get_related_keywords(
-            search_term=search_term,
-            location=location,
-            language=language,
-            limit=n_terms,
-        )
+        # Get the additional suggested keywords
+        try:
+            suggested = await self._get_suggested_keywords(
+                search_term=search_term,
+                location=location,
+                language=language,
+                limit=n_terms,
+            )
+        except Exception as e:
+            logger.error(
+                f"Error fetching suggested keywords for search_term='{search_term}': {e}"
+            )
+            suggested = []
+
+        # Get the additional related keywords
+        try:
+            related = await self._get_related_keywords(
+                search_term=search_term,
+                location=location,
+                language=language,
+                limit=n_terms,
+            )
+        except Exception as e:
+            logger.error(
+                f"Error fetching related keywords for search_term='{search_term}': {e}"
+            )
+            related = []
 
         # Remove original keyword and aggregate them by volume
         keywords = [kw for kw in suggested + related if kw.text != search_term]
fraudcrawler/scraping/serp.py CHANGED
@@ -1,4 +1,3 @@
-import asyncio
 from enum import Enum
 import logging
 from pydantic import BaseModel
@@ -6,8 +5,11 @@ from typing import List
 from urllib.parse import urlparse
 import re
 
-from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
+from tenacity import RetryCallState
+
+from fraudcrawler.settings import SERP_DEFAULT_COUNTRY_CODES
 from fraudcrawler.base.base import Host, Language, Location, AsyncClient
+from fraudcrawler.base.retry import get_async_retry
 
 logger = logging.getLogger(__name__)
 
@@ -42,20 +44,14 @@ class SerpApi(AsyncClient):
     def __init__(
         self,
         api_key: str,
-        max_retries: int = MAX_RETRIES,
-        retry_delay: int = RETRY_DELAY,
     ):
         """Initializes the SerpApiClient with the given API key.
 
         Args:
             api_key: The API key for SerpApi.
-            max_retries: Maximum number of retries for API calls.
-            retry_delay: Delay between retries in seconds.
         """
         super().__init__()
         self._api_key = api_key
-        self._max_retries = max_retries
-        self._retry_delay = retry_delay
 
     def _get_domain(self, url: str) -> str:
         """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
@@ -116,6 +112,31 @@ class SerpApi(AsyncClient):
 
         return urls
 
+    @staticmethod
+    def _log_before(search_string: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f'Performing SerpAPI search with q="{search_string}" '
+                f"(attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}, not logging before.")
+
+    @staticmethod
+    def _log_before_sleep(
+        search_string: str, retry_state: RetryCallState | None
+    ) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f'Attempt {retry_state.attempt_number} of SerpAPI search with q="{search_string}" '
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
+
     async def _search(
         self,
         engine: str,
@@ -172,25 +193,21 @@ class SerpApi(AsyncClient):
             "num": num_results,
             "api_key": self._api_key,
         }
-
-        # Perform the request
-        attempts = 0
-        err = None
-        while attempts < self._max_retries:
-            try:
-                logger.debug(
-                    f'Performing SerpAPI search with q="{search_string}" (Attempt {attempts + 1}).'
-                )
+        logger.debug(f"SerpAPI search with params: {params}")
+
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            search_string=search_string, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            search_string=search_string, retry_state=retry_state
+        )
+        async for attempt in retry:
+            with attempt:
                 response = await self.get(url=self._endpoint, params=params)
-                break
-            except Exception as e:
-                logger.error(f"SerpAPI search failed with error: {e}.")
-                err = e
-                attempts += 1
-                if attempts < self._max_retries:
-                    await asyncio.sleep(self._retry_delay)
-        if err is not None:
-            raise err
 
         # Extract the URLs from the response
         urls = self._extract_search_results(response=response, engine=engine)
fraudcrawler/scraping/zyte.py CHANGED
@@ -1,16 +1,13 @@
-import asyncio
 import logging
 from typing import List
 from base64 import b64decode
 
 import aiohttp
+from tenacity import RetryCallState
 
-from fraudcrawler.settings import (
-    MAX_RETRIES,
-    RETRY_DELAY,
-    ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
-)
+from fraudcrawler.settings import ZYTE_DEFALUT_PROBABILITY_THRESHOLD
 from fraudcrawler.base.base import AsyncClient
+from fraudcrawler.base.retry import get_async_retry
 
 logger = logging.getLogger(__name__)
 
@@ -34,19 +31,32 @@ class ZyteApi(AsyncClient):
     def __init__(
         self,
         api_key: str,
-        max_retries: int = MAX_RETRIES,
-        retry_delay: int = RETRY_DELAY,
     ):
         """Initializes the ZyteApiClient with the given API key and retry configurations.
 
         Args:
             api_key: The API key for Zyte API.
-            max_retries: Maximum number of retries for API calls.
-            retry_delay: Delay between retries in seconds.
         """
         self._aiohttp_basic_auth = aiohttp.BasicAuth(api_key)
-        self._max_retries = max_retries
-        self._retry_delay = retry_delay
+
+    def _log_before(self, url: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f"Zyte fetching product details for URL {url} (Attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+    def _log_before_sleep(self, url: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f'Attempt {retry_state.attempt_number} of Zyte fetching product details for URL "{url}" '
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
 
     async def get_details(self, url: str) -> dict:
         """Fetches product details for a single URL.
@@ -74,30 +84,25 @@ class ZyteApi(AsyncClient):
             }
         """
         logger.info(f"Fetching product details by Zyte for URL {url}.")
-        attempts = 0
-        err = None
-        while attempts < self._max_retries:
-            try:
-                logger.debug(
-                    f"Fetch product details for URL {url} (Attempt {attempts + 1})."
-                )
+
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            url=url, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            url=url, retry_state=retry_state
+        )
+        async for attempt in retry:
+            with attempt:
                 product = await self.post(
                     url=self._endpoint,
                     data={"url": url, **self._config},
                     auth=self._aiohttp_basic_auth,
                 )
-                return product
-            except Exception as e:
-                logger.debug(
-                    f"Exception occurred while fetching product details for URL {url} (Attempt {attempts + 1})."
-                )
-                err = e
-                attempts += 1
-                if attempts < self._max_retries:
-                    await asyncio.sleep(self._retry_delay)
-        if err is not None:
-            raise err
-        return {}
+        return product
 
     @staticmethod
     def keep_product(
fraudcrawler/settings.py CHANGED
@@ -2,10 +2,18 @@ from pathlib import Path
 from typing import List
 
 # Generic settings
-MAX_RETRIES = 3
-RETRY_DELAY = 2
 ROOT_DIR = Path(__file__).parents[1]
 
+# Service retry settings
+# With the following setup (neglecting the jitter) we have 6 attempts with delays:
+# 0s, 1s, 4s, 16s, 64s, 64s (because of the max delay)
+RETRY_STOP_AFTER_ATTEMPT = 6
+RETRY_INITIAL_DELAY = 1
+RETRY_MAX_DELAY = 64
+RETRY_EXP_BASE = 4
+RETRY_JITTER = 1
+RETRY_SKIP_IF_CODE = [400, 401, 403]  # Skip retrying on these HTTP status codes
+
 # Serp settings
 GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
 GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
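Assuming tenacity's wait_exponential_jitter waits roughly initial * exp_base ** (attempt - 1) seconds, capped at max, the schedule quoted in the comment above can be reproduced as follows (jitter ignored; this snippet is illustrative, not part of the package):

# Waits after attempts 1-5 (no wait follows the 6th and final attempt).
delays = [
    min(RETRY_INITIAL_DELAY * RETRY_EXP_BASE**n, RETRY_MAX_DELAY)
    for n in range(RETRY_STOP_AFTER_ATTEMPT - 1)
]
# -> [1, 4, 16, 64, 64]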
@@ -15,14 +23,43 @@ SERP_DEFAULT_COUNTRY_CODES: List[str] = [
 
 # URL De-duplication settings
 KNOWN_TRACKERS = [
-    "srsltid",
-    "utm_source",
-    "utm_medium",
-    "utm_campaign",
-    "utm_term",
-    "utm_content",
-    "ar",
-    "ps",
+    "srsltid",  # Search result click ID (used by some search engines)
+    "utm_source",  # UTM: Source of the traffic (e.g., Google, Newsletter)
+    "utm_medium",  # UTM: Medium such as CPC, email, social
+    "utm_campaign",  # UTM: Campaign name (e.g., summer_sale)
+    "utm_term",  # UTM: Keyword term (used in paid search)
+    "utm_content",  # UTM: Used to differentiate similar links or ads
+    "ar",  # Often used for ad region or targeting info
+    "ps",  # Could refer to promotion source or partner segment
+    "gclid",  # Google Ads click ID (auto-tagging)
+    "gclsrc",  # Source of the GCLID (e.g., ads, search)
+    "sku",  # Product SKU identifier, often used in ecommerce links
+    "ref",  # Referrer username or source (e.g., GitHub ref links)
+    "referral",  # Alternate form of referrer, often human-readable
+    "aff_id",  # Affiliate identifier (ID-based)
+    "aff",  # Short form for affiliate tag
+    "affiliate",  # Affiliate tracking parameter (human-readable)
+    "partner",  # Indicates marketing or distribution partner
+    "fbclid",  # Facebook Click Identifier
+    "msclkid",  # Microsoft/Bing Ads click identifier
+    "twclid",  # Twitter Ads click identifier
+    "variant",  # A/B test variant (used to test versions of pages)
+    "session_id",  # Session tracking ID, should not persist across URLs
+    "track",  # Generic flag used to enable/disable tracking
+    "cid",  # Campaign ID (used in ads or emails)
+    "campaignid",  # Alternate or long-form campaign ID
+    "adgroup",  # Ad group identifier for campaigns
+    "bannerid",  # Specific banner ad ID (for display ad tracking)
+    "token",  # Often used to identify users or temporary sessions
+    "tag",  # Affiliate or marketing tag (used for tracking)
+    "hash",  # Generic hash identifier, often for state or cache
+    "user",  # User ID or identifier passed in URL (should be avoided)
+    "src",  # Generic source indicator, less formal than `utm_source`
+    "selsort",  # Sorting parameter for search results
+    "shid",  # Shop ID (used in ecommerce)
+    "shoparea",  # Shop area (used in ecommerce)
+    "shopid",  # Shop ID (used in ecommerce)
+    "shoparea",  # Shop area (used in ecommerce)
 ]
 
 # Enrichment settings
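The expanded KNOWN_TRACKERS list feeds the URL de-duplication step (url.py, unchanged in this release). An illustrative sketch of how such a parameter list is typically applied before comparing URLs; this is not the package's actual implementation:

# Hypothetical helper: strip known tracking parameters from a URL before de-duplication.
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse

def strip_trackers(url: str, trackers: list[str]) -> str:
    parts = urlparse(url)
    query = [(k, v) for k, v in parse_qsl(parts.query) if k not in trackers]
    return urlunparse(parts._replace(query=urlencode(query)))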
@@ -34,6 +71,7 @@ ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
 # Processor settings
 PROCESSOR_DEFAULT_MODEL = "gpt-4o"
 PROCESSOR_DEFAULT_IF_MISSING = -1
+PROCESSOR_EMPTY_TOKEN_COUNT = -1
 PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
 PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.4.3
+Version: 0.4.6
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
@@ -18,6 +18,7 @@ Requires-Dist: openai (>=1.68.2,<2.0.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
 Requires-Dist: requests (>=2.32.3,<3.0.0)
+Requires-Dist: tenacity (>=9.1.2,<10.0.0)
 Project-URL: Repository, https://github.com/open-veanu/fraudcrawler
 Description-Content-Type: text/markdown
 
@@ -0,0 +1,22 @@
+fraudcrawler/__init__.py,sha256=zAqnJ9Mewq0qzSfOjyaICyqDRQZE_Z3FmyF2IPdOhXo,788
+fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/base/base.py,sha256=pYGdRV_Ssw5fA6tLVhlZwAO0OLQl6qn6LgJPCzOCrpc,6258
+fraudcrawler/base/client.py,sha256=FibiYycjUys-c4sv66Y2JqJu5y15be2MYd2_9yB3wG8,4936
+fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
+fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
+fraudcrawler/base/orchestrator.py,sha256=UzqEtC7Szw1-Ic31lex04Mgpf2f7MM-odwhC0gTxN2Q,23566
+fraudcrawler/base/retry.py,sha256=OKdOed7mP2VLYJLi1zo0MC8ISMm7k3gZgtNuqn50NhI,995
+fraudcrawler/launch_demo_pipeline.py,sha256=CX4A-E63ER7Ip9RNI_IyTAXerYXcQ-NoSvhvLDLdP-s,4640
+fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/processing/processor.py,sha256=-QdLiAhdPLdYWcMvbKmuPQ_WlvFEDpmEXNps1QGChvQ,7421
+fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/scraping/enrich.py,sha256=1vRGUtF9F8aw46qjKSUiVqGXLdRPaUmI8e5Bu-ZYt8Y,12398
+fraudcrawler/scraping/serp.py,sha256=aTsrH9R9yOpEH_ga-h1BylAtVl4sf9eHIaCv798GLEE,18782
+fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
+fraudcrawler/scraping/zyte.py,sha256=Pv0i2Ni6oamIo_aFdG9c-Kon0PM6oTmMgVYdT3KwvYo,7602
+fraudcrawler/settings.py,sha256=zoNd4LCBL1JNfICiYlLkggw8rGr_tkFc7rrE1morLKI,3442
+fraudcrawler-0.4.6.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+fraudcrawler-0.4.6.dist-info/METADATA,sha256=z1dneOJNzGU4cIEEOs0kTAdibcdjYBQnrUKb8N5rOSg,5973
+fraudcrawler-0.4.6.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
+fraudcrawler-0.4.6.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+fraudcrawler-0.4.6.dist-info/RECORD,,
@@ -1,21 +0,0 @@
-fraudcrawler/__init__.py,sha256=zAqnJ9Mewq0qzSfOjyaICyqDRQZE_Z3FmyF2IPdOhXo,788
-fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fraudcrawler/base/base.py,sha256=JWjZ3mpX4caQAsWKYqtHrUqHfHr6GXlAaEjxxHV9ODQ,6020
-fraudcrawler/base/client.py,sha256=FibiYycjUys-c4sv66Y2JqJu5y15be2MYd2_9yB3wG8,4936
-fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
-fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
-fraudcrawler/base/orchestrator.py,sha256=xOMxA0zPUXSF8AGY5AUqzsOO9LfRIjxI2HuZf__Z_sI,24689
-fraudcrawler/launch_demo_pipeline.py,sha256=CX4A-E63ER7Ip9RNI_IyTAXerYXcQ-NoSvhvLDLdP-s,4640
-fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fraudcrawler/processing/processor.py,sha256=An2orst0YRIav7bFuoDMgjwWz2Z9dyjVUbkNAMXNTTo,3748
-fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fraudcrawler/scraping/enrich.py,sha256=X1BBZshdZqPmbduzhGwH0ULSzq03L_7bf7_UL8yOQ9E,10608
-fraudcrawler/scraping/serp.py,sha256=divEp1UBUsws24PWZABhWIxOmaLqLwdeGn4KNrqWkYA,17865
-fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
-fraudcrawler/scraping/zyte.py,sha256=DUF5pIwpZyQw30qURnFxtp8KYpUgBkrXjM7RaVGH92Q,7005
-fraudcrawler/settings.py,sha256=31jvRFfB-gsVbeidLLl4iQgrFL7GH-824lerIniPI08,1017
-fraudcrawler-0.4.3.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
-fraudcrawler-0.4.3.dist-info/METADATA,sha256=jlk2WdtXEK0-s6QRQdI96EBpQiyHWKgJiYeW93yiU24,5931
-fraudcrawler-0.4.3.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
-fraudcrawler-0.4.3.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
-fraudcrawler-0.4.3.dist-info/RECORD,,