fraudcrawler 0.7.22__py3-none-any.whl → 0.7.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

fraudcrawler/processing/base.py

@@ -3,12 +3,15 @@ import logging
 from pydantic import BaseModel
 from typing import Any, Dict, List, Sequence, TypeAlias
 
+from tenacity import RetryCallState
+
 from fraudcrawler.base.base import ProductItem
 
 logger = logging.getLogger(__name__)
 
 
-UserInputs: TypeAlias = Dict[str, List[str]]
+Context: TypeAlias = Dict[str, str]
+UserInputs: TypeAlias = Dict[str, str | List[str]]
 
 
 class ClassificationResult(BaseModel):
@@ -44,6 +47,25 @@ class Workflow(ABC):
         """
         self.name = name
 
+    def _log_before(self, context: Context, retry_state: RetryCallState) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f"Workflow={self.name} retry-call within context={context} (Attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+    def _log_before_sleep(self, context: Context, retry_state: RetryCallState) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f"Attempt {retry_state.attempt_number} of workflow={self.name} "
+                f"retry-call within context={context} "
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+
     @abstractmethod
     async def run(self, product: ProductItem) -> WorkflowResult:
         """Runs the workflow."""
fraudcrawler/processing/openai.py

@@ -1,6 +1,7 @@
+from copy import deepcopy
 import logging
 from pydantic import BaseModel
-from typing import Dict, List, Literal, TypeAlias
+from typing import List, Literal
 
 import httpx
 from openai import AsyncOpenAI
@@ -11,7 +12,6 @@ from openai.types.responses import (
     ResponseInputImageParam,
     ResponseInputParam,
 )
-from tenacity import RetryCallState
 
 from fraudcrawler.base.base import ProductItem
 from fraudcrawler.base.retry import get_async_retry
@@ -19,16 +19,19 @@ from fraudcrawler.processing.base import (
     ClassificationResult,
     UserInputs,
     Workflow,
+    Context,
 )
 
 logger = logging.getLogger(__name__)
 
-Context: TypeAlias = Dict[str, str]
-
 
 class OpenAIWorkflow(Workflow):
     """(Abstract) Workflow using OpenAI API calls."""
 
+    _product_prompt_template = "Product Details:\n{product_details}\n\nRelevance:"
+    _product_details_template = "{field_name}:\n{field_value}"
+    _user_inputs_template = "{key}: {val}"
+
     def __init__(
         self,
         http_client: httpx.AsyncClient,
@@ -49,29 +52,6 @@ class OpenAIWorkflow(Workflow):
         self._client = AsyncOpenAI(http_client=http_client, api_key=api_key)
         self._model = model
 
-    def _log_before(
-        self, endpoint: str, context: Context, retry_state: RetryCallState
-    ) -> None:
-        """Context aware logging before the request is made."""
-        if retry_state:
-            logger.debug(
-                f"Workflow={self.name} calls endpoint={endpoint} within context={context} (Attempt {retry_state.attempt_number})."
-            )
-        else:
-            logger.debug(f"retry_state is {retry_state}; not logging before.")
-
-    def _log_before_sleep(
-        self, endpoint: str, context: Context, retry_state: RetryCallState
-    ) -> None:
-        """Context aware logging before sleeping after a failed request."""
-        if retry_state and retry_state.outcome:
-            logger.warning(
-                f"Attempt {retry_state.attempt_number} of workflow={self.name} "
-                f"calling endpoint={endpoint} within context={context} "
-                f"failed with error: {retry_state.outcome.exception()}. "
-                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
-            )
-
     async def _chat_completions_create(
         self,
         system_prompt: str,
@@ -86,17 +66,20 @@ class OpenAIWorkflow(Workflow):
             system_prompt: System prompt for the AI model.
             user_prompt: User prompt for the AI model.
         """
-        endpoint = "chat.completions.create"
+        cntx = deepcopy(context)
+        cntx["endpoint"] = "chat.completions.create"
 
         # Perform the request and retry if necessary. There is some context aware logging
         # - `before`: before the request is made (or before retrying)
         # - `before_sleep`: if the request fails before sleeping
         retry = get_async_retry()
         retry.before = lambda retry_state: self._log_before(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx,
+            retry_state=retry_state,
         )
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx,
+            retry_state=retry_state,
         )
         async for attempt in retry:
             with attempt:
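
Each endpoint helper now copies the caller's context and tags the copy with the endpoint name, so the retry logs identify the failing endpoint without mutating the dict the caller passed in. A tiny standalone illustration of that pattern (the context values below are made up):

    # Sketch of the per-endpoint context tagging used above; values are illustrative.
    from copy import deepcopy
    from typing import Dict

    Context = Dict[str, str]


    def tag_endpoint(context: Context, endpoint: str) -> Context:
        # Copy first so the caller's dict stays untouched across multiple endpoint calls.
        cntx = deepcopy(context)
        cntx["endpoint"] = endpoint
        return cntx


    caller_context: Context = {"workflow": "classification", "url": "https://example.com/item"}
    cntx = tag_endpoint(caller_context, "chat.completions.create")
    print(cntx["endpoint"])               # chat.completions.create
    print("endpoint" in caller_context)   # False -> original context unchanged
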
@@ -126,17 +109,18 @@ class OpenAIWorkflow(Workflow):
             response_format: The model into which the response should be parsed.
             context: Logging context for retry logs.
         """
-        endpoint = "chat.completions.parse"
+        cntx = deepcopy(context)
+        cntx["endpoint"] = "chat.completions.parse"
 
         # Perform the request and retry if necessary. There is some context aware logging
         # - `before`: before the request is made (or before retrying)
         # - `before_sleep`: if the request fails before sleeping
         retry = get_async_retry()
         retry.before = lambda retry_state: self._log_before(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx, retry_state=retry_state
         )
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx, retry_state=retry_state
         )
         async for attempt in retry:
             with attempt:
@@ -213,7 +197,9 @@ class OpenAIWorkflow(Workflow):
        The extracted text can be obtained by `response.output_text`
         """
         # Prepare variables
-        endpoint = "response.create"
+        cntx = deepcopy(context)
+        cntx["endpoint"] = "response.create"
+
         detail: Literal["low", "high", "auto"] = "high"
         input_param = self._get_input_param(
             image_url=image_url,
@@ -228,10 +214,10 @@ class OpenAIWorkflow(Workflow):
         # - `before_sleep`: if the request fails before sleeping
         retry = get_async_retry()
         retry.before = lambda retry_state: self._log_before(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx, retry_state=retry_state
         )
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx, retry_state=retry_state
         )
         async for attempt in retry:
             with attempt:
@@ -264,7 +250,8 @@ class OpenAIWorkflow(Workflow):
        (c.f. :func:`_responses_create`)
         """
         # Prepare variables
-        endpoint = "response.parse"
+        cntx = deepcopy(context)
+        cntx["endpoint"] = "response.parse"
         detail: Literal["low", "high", "auto"] = "high"
         input_param = self._get_input_param(
             image_url=image_url,
@@ -279,10 +266,10 @@ class OpenAIWorkflow(Workflow):
         # - `before_sleep`: if the request fails before sleeping
         retry = get_async_retry()
         retry.before = lambda retry_state: self._log_before(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx, retry_state=retry_state
         )
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx, retry_state=retry_state
        )
         async for attempt in retry:
             with attempt:
@@ -294,6 +281,71 @@ class OpenAIWorkflow(Workflow):
                )
         return response
 
+    @staticmethod
+    def _product_item_fields_are_valid(product_item_fields: List[str]) -> bool:
+        """Ensure all product_item_fields are valid ProductItem attributes."""
+        return set(product_item_fields).issubset(ProductItem.model_fields.keys())
+
+    def _get_product_details(
+        self, product: ProductItem, product_item_fields: List[str]
+    ) -> str:
+        """Extracts product details based on the configuration.
+
+        Args:
+            product: The product item to extract details from.
+            product_item_fields: The product item fields to use.
+        """
+        if not self._product_item_fields_are_valid(
+            product_item_fields=product_item_fields
+        ):
+            not_valid_fields = set(product_item_fields) - set(
+                ProductItem.model_fields.keys()
+            )
+            raise ValueError(f"Invalid product_item_fields: {not_valid_fields}.")
+
+        details = []
+        for name in product_item_fields:
+            if value := getattr(product, name, None):
+                details.append(
+                    self._product_details_template.format(
+                        field_name=name, field_value=value
+                    )
+                )
+            else:
+                logger.warning(
+                    f'Field "{name}" is missing in ProductItem with url="{product.url}"'
+                )
+        return "\n\n".join(details)
+
+    async def _get_prompt_from_product_details(
+        self, product: ProductItem, product_item_fields: List[str]
+    ) -> str:
+        """Forms and returns the product related part for the user_prompt."""
+
+        # Form the product details from the ProductItem
+        product_details = self._get_product_details(
+            product=product, product_item_fields=product_item_fields
+        )
+        if not product_details:
+            raise ValueError(
+                f"Missing product_details for product_item_fields={product_item_fields}."
+            )
+
+        # Create user prompt
+        product_prompt = self._product_prompt_template.format(
+            product_details=product_details,
+        )
+        return product_prompt
+
+    async def _get_prompt_from_user_inputs(self, user_inputs: UserInputs) -> str:
+        """Forms and returns the user_inputs part for the user_prompt."""
+        user_inputs_strings = [
+            self._user_inputs_template.format(key=k, val=v)
+            for k, v in user_inputs.items()
+        ]
+        user_inputs_joined = "\n".join(user_inputs_strings)
+        return f"User Inputs:\n{user_inputs_joined}"
+
 
 class OpenAIClassification(OpenAIWorkflow):
     """Open AI classification workflow with single API call using specific product_item fields for setting up the context.
@@ -304,8 +356,6 @@ class OpenAIClassification(OpenAIWorkflow):
    which the classification should happen.
    """

-    _product_prompt_template = "Product Details:\n{product_details}\n\nRelevance:"
-    _product_details_template = "{field_name}:\n{field_value}"
     _max_tokens: int = 1
 
     def __init__(
@@ -335,16 +385,6 @@ class OpenAIClassification(OpenAIWorkflow):
             api_key=api_key,
             model=model,
         )
-
-        if not self._product_item_fields_are_valid(
-            product_item_fields=product_item_fields
-        ):
-            not_valid_fields = set(product_item_fields) - set(
-                ProductItem.model_fields.keys()
-            )
-            raise ValueError(
-                f"Invalid product_item_fields are given: {not_valid_fields}."
-            )
         self._product_item_fields = product_item_fields
         self._system_prompt = system_prompt
 
@@ -352,50 +392,12 @@ class OpenAIClassification(OpenAIWorkflow):
            raise ValueError("Values of allowed_classes must be >= 0")
         self._allowed_classes = allowed_classes
 
-    @staticmethod
-    def _product_item_fields_are_valid(product_item_fields: List[str]) -> bool:
-        """Ensure all product_item_fields are valid ProductItem attributes."""
-        return set(product_item_fields).issubset(ProductItem.model_fields.keys())
-
-    def _get_product_details(self, product: ProductItem) -> str:
-        """Extracts product details based on the configuration.
-
-        Args:
-            product: The product item to extract details from.
-        """
-        details = []
-        for name in self._product_item_fields:
-            if value := getattr(product, name, None):
-                details.append(
-                    self._product_details_template.format(
-                        field_name=name, field_value=value
-                    )
-                )
-            else:
-                logger.warning(
-                    f'Field "{name}" is missing in ProductItem with url="{product.url}"'
-                )
-        return "\n\n".join(details)
-
-    async def _get_product_prompt(self, product: ProductItem) -> str:
-        """Forms and returns the product related part for the user_prompt."""
-
-        # Form the product details from the ProductItem
-        product_details = self._get_product_details(product=product)
-        if not product_details:
-            raise ValueError(
-                f"Missing product_details for product_item_fields={self._product_item_fields}."
-            )
-
-        # Create user prompt
-        product_prompt = self._product_prompt_template.format(
-            product_details=product_details,
-        )
-        return product_prompt
-
     async def _get_user_prompt(self, product: ProductItem) -> str:
         """Forms and returns the user_prompt."""
-        product_prompt = await self._get_product_prompt(product=product)
+        product_prompt = await self._get_prompt_from_product_details(
+            product=product,
+            product_item_fields=self._product_item_fields,
+        )
         return product_prompt
 
     async def _chat_classification(
@@ -472,8 +474,6 @@ class OpenAIClassificationUserInputs(OpenAIClassification):
    creating a user prompt from which the classification should happen.
    """

-    _user_inputs_template = "{key}: {val}"
-
     def __init__(
         self,
         http_client: httpx.AsyncClient,
@@ -506,15 +506,16 @@ class OpenAIClassificationUserInputs(OpenAIClassification):
            system_prompt=system_prompt,
            allowed_classes=allowed_classes,
         )
-        user_inputs_strings = [
-            self._user_inputs_template.format(key=k, val=v)
-            for k, v in user_inputs.items()
-        ]
-        user_inputs_joined = "\n".join(user_inputs_strings)
-        self._user_inputs_prompt = f"User Inputs:\n{user_inputs_joined}"
+        self._user_inputs = user_inputs
 
     async def _get_user_prompt(self, product: ProductItem) -> str:
         """Forms the user_prompt from the product details plus user_inputs."""
-        product_prompt = await super()._get_product_prompt(product=product)
-        user_prompt = f"{self._user_inputs_prompt}\n\n{product_prompt}"
+        product_prompt = await self._get_prompt_from_product_details(
+            product=product,
+            product_item_fields=self._product_item_fields,
+        )
+        user_inputs_prompt = await self._get_prompt_from_user_inputs(
+            user_inputs=self._user_inputs,
+        )
+        user_prompt = f"{user_inputs_prompt}\n\n{product_prompt}"
         return user_prompt
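
After the refactor, both classification workflows assemble their user prompt from the same two helpers: one renders the selected product fields, the other renders the constructor-supplied user_inputs. A standalone sketch of the resulting prompt text, reusing the templates from the diff with made-up product data and user inputs:

    # Illustrative prompt assembly; the templates are copied from the diff,
    # the product and user_inputs values are invented.
    _product_prompt_template = "Product Details:\n{product_details}\n\nRelevance:"
    _product_details_template = "{field_name}:\n{field_value}"
    _user_inputs_template = "{key}: {val}"

    product = {"title": "Vitamin X 500mg", "description": "Dietary supplement."}
    user_inputs = {"keywords": ["vitamin", "supplement"], "language": "en"}

    product_details = "\n\n".join(
        _product_details_template.format(field_name=k, field_value=v)
        for k, v in product.items()
    )
    product_prompt = _product_prompt_template.format(product_details=product_details)
    user_inputs_prompt = "User Inputs:\n" + "\n".join(
        _user_inputs_template.format(key=k, val=v) for k, v in user_inputs.items()
    )

    # Same composition order as _get_user_prompt: user inputs first, then product details.
    user_prompt = f"{user_inputs_prompt}\n\n{product_prompt}"
    print(user_prompt)
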
fraudcrawler/scraping/zyte.py

@@ -150,8 +150,8 @@ class ZyteAPI(DomainUtils):
         """
         product = details.get("product", {})
         gtin_list = product.get("gtin", [])
-
-        if gtin_list and len(gtin_list) > 0:
+
+        if len(gtin_list) > 0:
             # Extract the first GTIN value
             gtin_value = gtin_list[0].get("value")
             if gtin_value:
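
The simplified guard behaves like the old one as long as the Zyte payload either omits "gtin" or supplies a list. A small illustration with a made-up payload (the GTIN value is an arbitrary example):

    # Sketch of the GTIN extraction shown above; the payload shape and value are illustrative.
    details = {"product": {"gtin": [{"type": "ean13", "value": "4006381333931"}]}}

    product = details.get("product", {})
    gtin_list = product.get("gtin", [])  # [] when the "gtin" key is missing

    if len(gtin_list) > 0:
        gtin_value = gtin_list[0].get("value")
        print(gtin_value)  # 4006381333931
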
fraudcrawler-0.7.22.dist-info/METADATA → fraudcrawler-0.7.26.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fraudcrawler
-Version: 0.7.22
+Version: 0.7.26
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
fraudcrawler-0.7.22.dist-info/RECORD → fraudcrawler-0.7.26.dist-info/RECORD

@@ -8,16 +8,16 @@ fraudcrawler/base/orchestrator.py,sha256=BklS4DNzxbp7yvE2NvBWrDDqnvT4YO7Xh_WXstY
 fraudcrawler/base/retry.py,sha256=bCDd44XO2-lHO8MGvPblD5152-lHt1dOfMAQSmymLO4,1462
 fraudcrawler/launch_demo_pipeline.py,sha256=oZWodtNzA5mhmLNYMS6lglry88NutvH4IxnEWOUtL8M,6179
 fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fraudcrawler/processing/base.py,sha256=UkoYxFNZ3BQkXmgJnTtruz8-eIFCtWiquRN_IoEXfM4,4091
-fraudcrawler/processing/openai.py,sha256=7sbFg2NPsn627VDzsfIkKantE2KahGmVkSZ1R10OrzQ,19050
+fraudcrawler/processing/base.py,sha256=vNwbwdaN2WANuo73ZFNqo-FMdN7OMCQ08K5bVUtedtc,5129
+fraudcrawler/processing/openai.py,sha256=iwzJEpbMVluhSABEoA_RCMuCC81_2ujonQExJpR6d_o,18627
 fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fraudcrawler/scraping/enrich.py,sha256=dGWi9p0JStQYSGscCnsQPHNlAeqjoL2rXZnHFNmPhaQ,13158
 fraudcrawler/scraping/search.py,sha256=qHeUpzv1IpRhdFvaycGtL3FLOwT8rOiF0PfiOH6BmUA,34561
 fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
-fraudcrawler/scraping/zyte.py,sha256=xSHGKo09sX2dgQBrPI7oeoHsVL4qZ8voQLBXRU1XBqM,11102
+fraudcrawler/scraping/zyte.py,sha256=RbZxmRWaDk3AgcB2EPFVzbqwo_RvzAm5TWT9OTpX2gs,11080
 fraudcrawler/settings.py,sha256=q3je0r_jd30x2dzlgfm8GyKcigFdgteOLa8HX188bho,3768
-fraudcrawler-0.7.22.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
-fraudcrawler-0.7.22.dist-info/METADATA,sha256=D749e0ZWDZSn8pjxvHj7RUf5m0D1_qHzRlZPRFqTE9A,5303
-fraudcrawler-0.7.22.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-fraudcrawler-0.7.22.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
-fraudcrawler-0.7.22.dist-info/RECORD,,
+fraudcrawler-0.7.26.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+fraudcrawler-0.7.26.dist-info/METADATA,sha256=xkNvjmbS9MVld2gSMJDMGzLf9yJWzlA83cTGgvUuTZg,5303
+fraudcrawler-0.7.26.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+fraudcrawler-0.7.26.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+fraudcrawler-0.7.26.dist-info/RECORD,,