fraudcrawler 0.4.2__tar.gz → 0.4.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of fraudcrawler might be problematic.

Files changed (21)
  1. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/PKG-INFO +2 -3
  2. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/__init__.py +2 -0
  3. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/base.py +11 -0
  4. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/orchestrator.py +25 -12
  5. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/launch_demo_pipeline.py +1 -1
  6. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/processing/processor.py +34 -14
  7. fraudcrawler-0.4.5/fraudcrawler/scraping/url.py +57 -0
  8. fraudcrawler-0.4.5/fraudcrawler/settings.py +73 -0
  9. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/pyproject.toml +1 -1
  10. fraudcrawler-0.4.2/fraudcrawler/settings.py +0 -31
  11. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/LICENSE +0 -0
  12. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/README.md +0 -0
  13. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/__init__.py +0 -0
  14. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/client.py +0 -0
  15. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/google-languages.json +0 -0
  16. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/google-locations.json +0 -0
  17. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/processing/__init__.py +0 -0
  18. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/__init__.py +0 -0
  19. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/enrich.py +0 -0
  20. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/serp.py +1 -1
  21. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/zyte.py +0 -0

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/PKG-INFO

@@ -1,6 +1,6 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.1
  Name: fraudcrawler
- Version: 0.4.2
+ Version: 0.4.5
  Summary: Intelligent Market Monitoring
  Home-page: https://github.com/open-veanu/fraudcrawler
  License: MIT
@@ -11,7 +11,6 @@ Classifier: License :: OSI Approved :: MIT License
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
  Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
  Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
  Requires-Dist: openai (>=1.68.2,<2.0.0)

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/__init__.py

@@ -1,5 +1,6 @@
  from fraudcrawler.scraping.serp import SerpApi, SearchEngine
  from fraudcrawler.scraping.enrich import Enricher
+ from fraudcrawler.scraping.url import URLCollector
  from fraudcrawler.scraping.zyte import ZyteApi
  from fraudcrawler.processing.processor import Processor
  from fraudcrawler.base.orchestrator import Orchestrator
@@ -18,6 +19,7 @@ __all__ = [
  "SerpApi",
  "SearchEngine",
  "Enricher",
+ "URLCollector",
  "ZyteApi",
  "Processor",
  "Orchestrator",

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/base.py

@@ -63,6 +63,14 @@ class Host(BaseModel):
  return [cls._normalize_domain(dom.strip()) for dom in val]


+ class ClassificationResult(BaseModel):
+ """Model for classification results."""
+
+ result: int
+ input_tokens: int
+ output_tokens: int
+
+
  class Location(BaseModel):
  """Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""

@@ -137,6 +145,9 @@ class ProductItem(BaseModel):
  # Processor parameters are set dynamic so we must allow extra fields
  classifications: Dict[str, int] = Field(default_factory=dict)

+ # Usage parameters
+ usage: Dict[str, Dict[str, int]] = Field(default_factory=dict)
+
  # Filtering parameters
  filtered: bool = False
  filtered_at_stage: str | None = None

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/orchestrator.py

@@ -1,7 +1,8 @@
  from abc import ABC, abstractmethod
  import asyncio
  import logging
- from typing import Dict, List, Set, cast
+ from typing import Dict, List, cast
+
  from bs4 import BeautifulSoup

  from fraudcrawler.settings import (
@@ -24,7 +25,14 @@ from fraudcrawler.base.base import (
  Prompt,
  ProductItem,
  )
- from fraudcrawler import SerpApi, SearchEngine, Enricher, ZyteApi, Processor
+ from fraudcrawler import (
+ SerpApi,
+ SearchEngine,
+ Enricher,
+ URLCollector,
+ ZyteApi,
+ Processor,
+ )

  logger = logging.getLogger(__name__)

@@ -75,15 +83,12 @@ class Orchestrator(ABC):
  n_zyte_wkrs: Number of async workers for zyte (optional).
  n_proc_wkrs: Number of async workers for the processor (optional).
  """
- # Setup the variables
- self._collected_urls_current_run: Set[str] = set()
- self._collected_urls_previous_runs: Set[str] = set()
-
  # Setup the clients
  self._serpapi = SerpApi(
  api_key=serpapi_key, max_retries=max_retries, retry_delay=retry_delay
  )
  self._enricher = Enricher(user=dataforseo_user, pwd=dataforseo_pwd)
+ self._url_collector = URLCollector()
  self._zyteapi = ZyteApi(
  api_key=zyteapi_key, max_retries=max_retries, retry_delay=retry_delay
  )
@@ -156,16 +161,18 @@
  break

  if not product.filtered:
- url = product.url
+ # Clean the URL by removing tracking parameters
+ url = self._url_collector.remove_tracking_parameters(product.url)
+ product.url = url

- if url in self._collected_urls_current_run:
+ if url in self._url_collector.collected_currently:
  # deduplicate on current run
  product.filtered = True
  product.filtered_at_stage = (
  "URL collection (current run deduplication)"
  )
  logger.debug(f"URL {url} already collected in current run")
- elif url in self._collected_urls_previous_runs:
+ elif url in self._url_collector.collected_previously:
  # deduplicate on previous runs coming from a db
  product.filtered = True
  product.filtered_at_stage = (
@@ -173,7 +180,7 @@
  )
  logger.debug(f"URL {url} as already collected in previous run")
  else:
- self._collected_urls_current_run.add(url)
+ self._url_collector.collected_currently.add(url)

  await queue_out.put(product)
  queue_in.task_done()
@@ -274,7 +281,13 @@
  url=url,
  product_details=product_details,
  )
- product.classifications[prompt.name] = classification
+ product.classifications[prompt.name] = int(
+ classification.result
+ )
+ product.usage[prompt.name] = {
+ "input_tokens": classification.input_tokens,
+ "output_tokens": classification.output_tokens,
+ }
  except Exception as e:
  logger.warning(f"Error processing product: {e}.")

@@ -480,7 +493,7 @@
  # INITIAL SETUP
  # ---------------------------
  if previously_collected_urls:
- self._collected_urls_previous_runs = set(self._collected_urls_current_run)
+ self._url_collector.collected_previously = set(previously_collected_urls)

  # Setup the async framework
  n_terms_max = 1 + (

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/launch_demo_pipeline.py

@@ -13,7 +13,7 @@ def main():
  client = FraudCrawlerClient()

  # Setup the search
- search_term = "Kühlschrank"
+ search_term = "Medion Kühlbox MD 37454"
  language = Language(name="German")
  location = Location(name="Switzerland")
  deepness = Deepness(num_results=10)

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/processing/processor.py

@@ -2,10 +2,11 @@ import logging

  from openai import AsyncOpenAI

- from fraudcrawler.base.base import Prompt
+ from fraudcrawler.base.base import Prompt, ClassificationResult
  from fraudcrawler.settings import (
  PROCESSOR_USER_PROMPT_TEMPLATE,
  PROCESSOR_DEFAULT_IF_MISSING,
+ PROCESSOR_EMPTY_TOKEN_COUNT,
  )


@@ -20,6 +21,7 @@ class Processor:
  api_key: str,
  model: str,
  default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
+ empty_token_count: int = PROCESSOR_EMPTY_TOKEN_COUNT,
  ):
  """Initializes the Processor.

@@ -27,17 +29,22 @@
  api_key: The OpenAI API key.
  model: The OpenAI model to use.
  default_if_missing: The default classification to return if error occurs.
+ empty_token_count: The default value to return as tokensif the classification is empty.
  """
  self._client = AsyncOpenAI(api_key=api_key)
  self._model = model
- self._default_if_missing = default_if_missing
+ self._error_response = ClassificationResult(
+ result=default_if_missing,
+ input_tokens=empty_token_count,
+ output_tokens=empty_token_count,
+ )

  async def _call_openai_api(
  self,
  system_prompt: str,
  user_prompt: str,
  **kwargs,
- ) -> str:
+ ) -> ClassificationResult:
  """Calls the OpenAI API with the given user prompt."""
  response = await self._client.chat.completions.create(
  model=self._model,
@@ -50,10 +57,24 @@
  content = response.choices[0].message.content
  if not content:
  raise ValueError("Empty response from OpenAI API")
- return content

- async def classify(self, prompt: Prompt, url: str, product_details: str) -> int:
- """A generic classification method that classifies a product based on a prompt object.
+ # Convert the content to an integer
+ content = int(content.strip())
+
+ # For tracking consumption we alre return the tokens used
+ classification = ClassificationResult(
+ result=content,
+ input_tokens=response.usage.prompt_tokens,
+ output_tokens=response.usage.completion_tokens,
+ )
+
+ return classification
+
+ async def classify(
+ self, prompt: Prompt, url: str, product_details: str
+ ) -> ClassificationResult:
+ """A generic classification method that classifies a product based on a prompt object and returns
+ the classification, input tokens, and output tokens.

  Args:
  prompt: A dictionary with keys "system_prompt", etc.
@@ -69,7 +90,7 @@
  # If required fields are missing, return the prompt's default fallback if provided.
  if not product_details:
  logger.warning("Missing required product_details for classification.")
- return self._default_if_missing
+ return self._error_response

  # Substitute placeholders in user_prompt with the relevant arguments
  user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
@@ -81,22 +102,21 @@
  logger.debug(
  f'Calling OpenAI API for classification (url="{url}", prompt="{prompt.name}")'
  )
- content = await self._call_openai_api(
+ classification = await self._call_openai_api(
  system_prompt=prompt.system_prompt,
  user_prompt=user_prompt,
  max_tokens=1,
  )
- classification = int(content.strip())

  # Enforce that the classification is in the allowed classes
- if classification not in prompt.allowed_classes:
+ if classification.result not in prompt.allowed_classes:
  logger.warning(
- f"Classification '{classification}' not in allowed classes {prompt.allowed_classes}"
+ f"Classification '{classification.result}' not in allowed classes {prompt.allowed_classes}"
  )
- return self._default_if_missing
+ return self._error_response

  logger.info(
- f'Classification for url="{url}" (prompt={prompt.name}): {classification}'
+ f'Classification for url="{url}" (prompt={prompt.name}): {classification.result} and total tokens used: {classification.input_tokens + classification.output_tokens}'
  )
  return classification

@@ -104,4 +124,4 @@
  logger.error(
  f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
  )
- return self._default_if_missing
+ return self._error_response
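
Taken together, these processor changes mean `classify` now returns a `ClassificationResult` (label plus token usage) instead of a bare int. A minimal consumption sketch based only on the names visible in this diff; the exact `Prompt` constructor arguments are assumptions and may differ in the released code:

    import asyncio

    from fraudcrawler import Processor
    from fraudcrawler.base.base import Prompt

    async def main():
        processor = Processor(api_key="sk-...", model="gpt-4o")

        # Hypothetical prompt definition; only name, system_prompt and
        # allowed_classes are referenced in the diff above.
        prompt = Prompt(
            name="relevance",
            system_prompt="Reply with 1 if the product is relevant, otherwise 0.",
            allowed_classes=[0, 1],
        )

        classification = await processor.classify(
            prompt=prompt,
            url="https://example.com/product",
            product_details="Title:\nMedion Kühlbox MD 37454",
        )

        # The result object carries the label plus token consumption.
        print(classification.result, classification.input_tokens, classification.output_tokens)

    asyncio.run(main())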

fraudcrawler-0.4.5/fraudcrawler/scraping/url.py (new file)

@@ -0,0 +1,57 @@
+ import logging
+ from typing import List, Set, Tuple
+ from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
+
+ from fraudcrawler.settings import KNOWN_TRACKERS
+
+ logger = logging.getLogger(__name__)
+
+
+ class URLCollector:
+ """A class to collect and de-duplicate URLs."""
+
+ def __init__(self):
+ self.collected_currently: Set[str] = set()
+ self.collected_previously: Set[str] = set()
+
+ @staticmethod
+ def remove_tracking_parameters(url: str) -> str:
+ """Remove tracking parameters from URLs.
+
+ Args:
+ url: The URL to clean.
+
+ Returns:
+ The cleaned URL without tracking parameters.
+ """
+ logging.debug(f"Removing tracking parameters from URL: {url}")
+
+ # Parse the url
+ parsed_url = urlparse(url)
+
+ # Parse query parameters
+ queries: List[Tuple[str, str]] = parse_qsl(
+ parsed_url.query, keep_blank_values=True
+ )
+ remove_all = url.startswith(
+ "https://www.ebay"
+ ) # eBay URLs have all query parameters as tracking parameters
+ if remove_all:
+ filtered_queries = []
+ else:
+ filtered_queries = [
+ q
+ for q in queries
+ if not any(q[0].startswith(tracker) for tracker in KNOWN_TRACKERS)
+ ]
+
+ # Rebuild the URL without tracking parameters
+ clean_url = ParseResult(
+ scheme=parsed_url.scheme,
+ netloc=parsed_url.netloc,
+ path=parsed_url.path,
+ params=parsed_url.params,
+ query=urlencode(filtered_queries, quote_via=quote),
+ fragment=parsed_url.fragment,
+ )
+ return urlunparse(clean_url)
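
For illustration, a short usage sketch of the new URLCollector; the example URL is invented, and the expected output follows from the remove_tracking_parameters logic above and the KNOWN_TRACKERS list added in settings.py below:

    from fraudcrawler import URLCollector

    collector = URLCollector()

    # utm_* and gclid parameters are stripped; the functional "id" parameter is kept.
    dirty = "https://example.com/product?id=42&utm_source=newsletter&gclid=abc123"
    clean = collector.remove_tracking_parameters(dirty)
    print(clean)  # https://example.com/product?id=42

    # These two sets back the current-run / previous-run de-duplication in the orchestrator.
    collector.collected_currently.add(clean)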

fraudcrawler-0.4.5/fraudcrawler/settings.py (new file)

@@ -0,0 +1,73 @@
+ from pathlib import Path
+ from typing import List
+
+ # Generic settings
+ MAX_RETRIES = 3
+ RETRY_DELAY = 2
+ ROOT_DIR = Path(__file__).parents[1]
+
+ # Serp settings
+ GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
+ GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
+ SERP_DEFAULT_COUNTRY_CODES: List[str] = [
+ # ".com",
+ ]
+
+ # URL De-duplication settings
+ KNOWN_TRACKERS = [
+ "srsltid", # Search result click ID (used by some search engines)
+ "utm_source", # UTM: Source of the traffic (e.g., Google, Newsletter)
+ "utm_medium", # UTM: Medium such as CPC, email, social
+ "utm_campaign", # UTM: Campaign name (e.g., summer_sale)
+ "utm_term", # UTM: Keyword term (used in paid search)
+ "utm_content", # UTM: Used to differentiate similar links or ads
+ "ar", # Often used for ad region or targeting info
+ "ps", # Could refer to promotion source or partner segment
+ "gclid", # Google Ads click ID (auto-tagging)
+ "gclsrc", # Source of the GCLID (e.g., ads, search)
+ "sku", # Product SKU identifier, often used in ecommerce links
+ "ref", # Referrer username or source (e.g., GitHub ref links)
+ "referral", # Alternate form of referrer, often human-readable
+ "aff_id", # Affiliate identifier (ID-based)
+ "aff", # Short form for affiliate tag
+ "affiliate", # Affiliate tracking parameter (human-readable)
+ "partner", # Indicates marketing or distribution partner
+ "fbclid", # Facebook Click Identifier
+ "msclkid", # Microsoft/Bing Ads click identifier
+ "twclid", # Twitter Ads click identifier
+ "variant", # A/B test variant (used to test versions of pages)
+ "session_id", # Session tracking ID, should not persist across URLs
+ "track", # Generic flag used to enable/disable tracking
+ "cid", # Campaign ID (used in ads or emails)
+ "campaignid", # Alternate or long-form campaign ID
+ "adgroup", # Ad group identifier for campaigns
+ "bannerid", # Specific banner ad ID (for display ad tracking)
+ "token", # Often used to identify users or temporary sessions
+ "tag", # Affiliate or marketing tag (used for tracking)
+ "hash", # Generic hash identifier, often for state or cache
+ "user", # User ID or identifier passed in URL (should be avoided)
+ "src", # Generic source indicator, less formal than `utm_source`
+ "selsort", # Sorting parameter for search results
+ "shid", # Shop ID (used in ecommerce)
+ "shoparea", # Shop area (used in ecommerce)
+ "shopid", # Shop ID (used in ecommerce)
+ "shoparea", # Shop area (used in ecommerce)
+ ]
+
+ # Enrichment settings
+ ENRICHMENT_DEFAULT_LIMIT = 10
+
+ # Zyte settings
+ ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
+
+ # Processor settings
+ PROCESSOR_DEFAULT_MODEL = "gpt-4o"
+ PROCESSOR_DEFAULT_IF_MISSING = -1
+ PROCESSOR_EMPTY_TOKEN_COUNT = -1
+ PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
+ PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
+
+ # Async settings
+ DEFAULT_N_SERP_WKRS = 10
+ DEFAULT_N_ZYTE_WKRS = 10
+ DEFAULT_N_PROC_WKRS = 10

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "fraudcrawler"
- version = "0.4.2"
+ version = "0.4.5"
  description = "Intelligent Market Monitoring"
  authors = [
  "Domingo Bertus <hello@veanu.ch>",

fraudcrawler-0.4.2/fraudcrawler/settings.py (removed)

@@ -1,31 +0,0 @@
- from pathlib import Path
- from typing import List
-
- # Generic settings
- MAX_RETRIES = 3
- RETRY_DELAY = 2
- ROOT_DIR = Path(__file__).parents[1]
-
- # Serp settings
- GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
- GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
- SERP_DEFAULT_COUNTRY_CODES: List[str] = [
- # ".com",
- ]
-
- # Enrichment settings
- ENRICHMENT_DEFAULT_LIMIT = 10
-
- # Zyte settings
- ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
-
- # Processor settings
- PROCESSOR_DEFAULT_MODEL = "gpt-4o"
- PROCESSOR_DEFAULT_IF_MISSING = -1
- PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
- PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
-
- # Async settings
- DEFAULT_N_SERP_WKRS = 10
- DEFAULT_N_ZYTE_WKRS = 10
- DEFAULT_N_PROC_WKRS = 10

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/serp.py

@@ -4,10 +4,10 @@ import logging
  from pydantic import BaseModel
  from typing import List
  from urllib.parse import urlparse
+ import re

  from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
  from fraudcrawler.base.base import Host, Language, Location, AsyncClient
- import re

  logger = logging.getLogger(__name__)
