PyPI - fraudcrawler - Versions diffs - 0.4.3__tar.gz → 0.4.5__tar.gz - Mend

fraudcrawler 0.4.3__tar.gz → 0.4.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of fraudcrawler might be problematic. Click here for more details.

Files changed (21) hide show

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.3
+Metadata-Version: 2.1
 Name: fraudcrawler
-Version: 0.4.3
+Version: 0.4.5
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
@@ -11,7 +11,6 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
 Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
 Requires-Dist: openai (>=1.68.2,<2.0.0)

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/base/base.py RENAMED Viewed

@@ -63,6 +63,14 @@ class Host(BaseModel):
         return [cls._normalize_domain(dom.strip()) for dom in val]
+class ClassificationResult(BaseModel):
+    """Model for classification results."""
+    result: int
+    input_tokens: int
+    output_tokens: int
 class Location(BaseModel):
     """Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""
@@ -137,6 +145,9 @@ class ProductItem(BaseModel):
     # Processor parameters are set dynamic so we must allow extra fields
     classifications: Dict[str, int] = Field(default_factory=dict)
+    # Usage parameters
+    usage: Dict[str, Dict[str, int]] = Field(default_factory=dict)
     # Filtering parameters
     filtered: bool = False
     filtered_at_stage: str | None = None

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/base/orchestrator.py RENAMED Viewed

@@ -281,7 +281,13 @@ class Orchestrator(ABC):
                             url=url,
                             product_details=product_details,
                         )
-                        product.classifications[prompt.name] = classification
+                        product.classifications[prompt.name] = int(
+                            classification.result
+                        )
+                        product.usage[prompt.name] = {
+                            "input_tokens": classification.input_tokens,
+                            "output_tokens": classification.output_tokens,
+                        }
                 except Exception as e:
                     logger.warning(f"Error processing product: {e}.")

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/processing/processor.py RENAMED Viewed

@@ -2,10 +2,11 @@ import logging
 from openai import AsyncOpenAI
-from fraudcrawler.base.base import Prompt
+from fraudcrawler.base.base import Prompt, ClassificationResult
 from fraudcrawler.settings import (
     PROCESSOR_USER_PROMPT_TEMPLATE,
     PROCESSOR_DEFAULT_IF_MISSING,
+    PROCESSOR_EMPTY_TOKEN_COUNT,
 )
@@ -20,6 +21,7 @@ class Processor:
         api_key: str,
         model: str,
         default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
+        empty_token_count: int = PROCESSOR_EMPTY_TOKEN_COUNT,
     ):
         """Initializes the Processor.
@@ -27,17 +29,22 @@ class Processor:
             api_key: The OpenAI API key.
             model: The OpenAI model to use.
             default_if_missing: The default classification to return if error occurs.
+            empty_token_count: The default value to return as tokens if the classification is empty.
         """
         self._client = AsyncOpenAI(api_key=api_key)
         self._model = model
-        self._default_if_missing = default_if_missing
+        self._error_response = ClassificationResult(
+            result=default_if_missing,
+            input_tokens=empty_token_count,
+            output_tokens=empty_token_count,
+        )
     async def _call_openai_api(
         self,
         system_prompt: str,
         user_prompt: str,
         **kwargs,
-    ) -> str:
+    ) -> ClassificationResult:
         """Calls the OpenAI API with the given user prompt."""
         response = await self._client.chat.completions.create(
             model=self._model,
@@ -50,10 +57,24 @@ class Processor:
         content = response.choices[0].message.content
         if not content:
             raise ValueError("Empty response from OpenAI API")
-        return content
-    async def classify(self, prompt: Prompt, url: str, product_details: str) -> int:
-        """A generic classification method that classifies a product based on a prompt object.
+        # Convert the content to an integer
+        content = int(content.strip())
+        # For tracking consumption we also return the tokens used
+        classification = ClassificationResult(
+            result=content,
+            input_tokens=response.usage.prompt_tokens,
+            output_tokens=response.usage.completion_tokens,
+        )
+        return classification
+    async def classify(
+        self, prompt: Prompt, url: str, product_details: str
+    ) -> ClassificationResult:
+        """A generic classification method that classifies a product based on a prompt object and returns
+          the classification, input tokens, and output tokens.
         Args:
             prompt: A dictionary with keys "system_prompt", etc.
@@ -69,7 +90,7 @@ class Processor:
         # If required fields are missing, return the prompt's default fallback if provided.
         if not product_details:
             logger.warning("Missing required product_details for classification.")
-            return self._default_if_missing
+            return self._error_response
         # Substitute placeholders in user_prompt with the relevant arguments
         user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
@@ -81,22 +102,21 @@ class Processor:
             logger.debug(
                 f'Calling OpenAI API for classification (url="{url}", prompt="{prompt.name}")'
             )
-            content = await self._call_openai_api(
+            classification = await self._call_openai_api(
                 system_prompt=prompt.system_prompt,
                 user_prompt=user_prompt,
                 max_tokens=1,
             )
-            classification = int(content.strip())
             # Enforce that the classification is in the allowed classes
-            if classification not in prompt.allowed_classes:
+            if classification.result not in prompt.allowed_classes:
                 logger.warning(
-                    f"Classification '{classification}' not in allowed classes {prompt.allowed_classes}"
+                    f"Classification '{classification.result}' not in allowed classes {prompt.allowed_classes}"
                 )
-                return self._default_if_missing
+                return self._error_response
             logger.info(
-                f'Classification for url="{url}" (prompt={prompt.name}): {classification}'
+                f'Classification for url="{url}" (prompt={prompt.name}): {classification.result} and total tokens used: {classification.input_tokens + classification.output_tokens}'
             )
             return classification
@@ -104,4 +124,4 @@ class Processor:
             logger.error(
                 f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
             )
-            return self._default_if_missing
+            return self._error_response

fraudcrawler-0.4.5/fraudcrawler/settings.py ADDED Viewed

@@ -0,0 +1,73 @@
+from pathlib import Path
+from typing import List
+# Generic settings
+MAX_RETRIES = 3
+RETRY_DELAY = 2
+ROOT_DIR = Path(__file__).parents[1]
+# Serp settings
+GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
+GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
+SERP_DEFAULT_COUNTRY_CODES: List[str] = [
+    # ".com",
+]
+# URL De-duplication settings
+KNOWN_TRACKERS = [
+    "srsltid",        # Search result click ID (used by some search engines)
+    "utm_source",     # UTM: Source of the traffic (e.g., Google, Newsletter)
+    "utm_medium",     # UTM: Medium such as CPC, email, social
+    "utm_campaign",   # UTM: Campaign name (e.g., summer_sale)
+    "utm_term",       # UTM: Keyword term (used in paid search)
+    "utm_content",    # UTM: Used to differentiate similar links or ads
+    "ar",             # Often used for ad region or targeting info
+    "ps",             # Could refer to promotion source or partner segment
+    "gclid",          # Google Ads click ID (auto-tagging)
+    "gclsrc",         # Source of the GCLID (e.g., ads, search)
+    "sku",            # Product SKU identifier, often used in ecommerce links
+    "ref",            # Referrer username or source (e.g., GitHub ref links)
+    "referral",       # Alternate form of referrer, often human-readable
+    "aff_id",         # Affiliate identifier (ID-based)
+    "aff",            # Short form for affiliate tag
+    "affiliate",      # Affiliate tracking parameter (human-readable)
+    "partner",        # Indicates marketing or distribution partner
+    "fbclid",         # Facebook Click Identifier
+    "msclkid",        # Microsoft/Bing Ads click identifier
+    "twclid",         # Twitter Ads click identifier
+    "variant",        # A/B test variant (used to test versions of pages)
+    "session_id",     # Session tracking ID, should not persist across URLs
+    "track",          # Generic flag used to enable/disable tracking
+    "cid",            # Campaign ID (used in ads or emails)
+    "campaignid",     # Alternate or long-form campaign ID
+    "adgroup",        # Ad group identifier for campaigns
+    "bannerid",       # Specific banner ad ID (for display ad tracking)
+    "token",          # Often used to identify users or temporary sessions
+    "tag",            # Affiliate or marketing tag (used for tracking)
+    "hash",           # Generic hash identifier, often for state or cache
+    "user",           # User ID or identifier passed in URL (should be avoided)
+    "src",            # Generic source indicator, less formal than `utm_source`
+    "selsort",        # Sorting parameter for search results
+    "shid",           # Shop ID (used in ecommerce)
+    "shoparea",       # Shop area (used in ecommerce)
+    "shopid",         # Shop ID (used in ecommerce)
+    "shoparea",       # Shop area (used in ecommerce)
+]
+# Enrichment settings
+ENRICHMENT_DEFAULT_LIMIT = 10
+# Zyte settings
+ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
+# Processor settings
+PROCESSOR_DEFAULT_MODEL = "gpt-4o"
+PROCESSOR_DEFAULT_IF_MISSING = -1
+PROCESSOR_EMPTY_TOKEN_COUNT = -1
+PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
+PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
+# Async settings
+DEFAULT_N_SERP_WKRS = 10
+DEFAULT_N_ZYTE_WKRS = 10
+DEFAULT_N_PROC_WKRS = 10

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "fraudcrawler"
-version = "0.4.3"
+version = "0.4.5"
 description = "Intelligent Market Monitoring"
 authors = [
     "Domingo Bertus <hello@veanu.ch>",

fraudcrawler-0.4.3/fraudcrawler/settings.py DELETED Viewed

@@ -1,43 +0,0 @@
-from pathlib import Path
-from typing import List
-# Generic settings
-MAX_RETRIES = 3
-RETRY_DELAY = 2
-ROOT_DIR = Path(__file__).parents[1]
-# Serp settings
-GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
-GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
-SERP_DEFAULT_COUNTRY_CODES: List[str] = [
-    # ".com",
-]
-# URL De-duplication settings
-KNOWN_TRACKERS = [
-    "srsltid",
-    "utm_source",
-    "utm_medium",
-    "utm_campaign",
-    "utm_term",
-    "utm_content",
-    "ar",
-    "ps",
-]
-# Enrichment settings
-ENRICHMENT_DEFAULT_LIMIT = 10
-# Zyte settings
-ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
-# Processor settings
-PROCESSOR_DEFAULT_MODEL = "gpt-4o"
-PROCESSOR_DEFAULT_IF_MISSING = -1
-PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
-PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
-# Async settings
-DEFAULT_N_SERP_WKRS = 10
-DEFAULT_N_ZYTE_WKRS = 10
-DEFAULT_N_PROC_WKRS = 10

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/LICENSE RENAMED Viewed

File without changes

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/README.md RENAMED Viewed

File without changes

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/__init__.py RENAMED Viewed

File without changes

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/base/__init__.py RENAMED Viewed

File without changes

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/base/client.py RENAMED Viewed

File without changes

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/base/google-languages.json RENAMED Viewed

File without changes

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/base/google-locations.json RENAMED Viewed

File without changes

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/launch_demo_pipeline.py RENAMED Viewed

File without changes

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/processing/__init__.py RENAMED Viewed

File without changes

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/scraping/__init__.py RENAMED Viewed

File without changes

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/scraping/enrich.py RENAMED Viewed

File without changes

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/scraping/serp.py RENAMED Viewed

File without changes

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/scraping/url.py RENAMED Viewed

File without changes

{fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/scraping/zyte.py RENAMED Viewed

File without changes

fraudcrawler 0.4.3__tar.gz → 0.4.5__tar.gz

Potentially problematic release.

fraudcrawler 0.4.3__tar.gz → 0.4.5__tar.gz