fraudcrawler 0.4.2.tar.gz → 0.4.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (20)
  1. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/PKG-INFO +1 -1
  2. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/__init__.py +2 -0
  3. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/base/orchestrator.py +18 -11
  4. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/launch_demo_pipeline.py +1 -1
  5. fraudcrawler-0.4.3/fraudcrawler/scraping/url.py +57 -0
  6. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/settings.py +12 -0
  7. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/pyproject.toml +1 -1
  8. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/LICENSE +0 -0
  9. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/README.md +0 -0
  10. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/base/__init__.py +0 -0
  11. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/base/base.py +0 -0
  12. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/base/client.py +0 -0
  13. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/base/google-languages.json +0 -0
  14. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/base/google-locations.json +0 -0
  15. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/processing/__init__.py +0 -0
  16. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/processing/processor.py +0 -0
  17. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/scraping/__init__.py +0 -0
  18. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/scraping/enrich.py +0 -0
  19. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/scraping/serp.py +1 -1
  20. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/scraping/zyte.py +0 -0
{fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.4.2
+Version: 0.4.3
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
{fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/__init__.py
@@ -1,5 +1,6 @@
 from fraudcrawler.scraping.serp import SerpApi, SearchEngine
 from fraudcrawler.scraping.enrich import Enricher
+from fraudcrawler.scraping.url import URLCollector
 from fraudcrawler.scraping.zyte import ZyteApi
 from fraudcrawler.processing.processor import Processor
 from fraudcrawler.base.orchestrator import Orchestrator
@@ -18,6 +19,7 @@ __all__ = [
     "SerpApi",
     "SearchEngine",
     "Enricher",
+    "URLCollector",
     "ZyteApi",
     "Processor",
     "Orchestrator",
{fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/base/orchestrator.py
@@ -1,7 +1,8 @@
 from abc import ABC, abstractmethod
 import asyncio
 import logging
-from typing import Dict, List, Set, cast
+from typing import Dict, List, cast
+
 from bs4 import BeautifulSoup
 
 from fraudcrawler.settings import (
@@ -24,7 +25,14 @@ from fraudcrawler.base.base import (
     Prompt,
     ProductItem,
 )
-from fraudcrawler import SerpApi, SearchEngine, Enricher, ZyteApi, Processor
+from fraudcrawler import (
+    SerpApi,
+    SearchEngine,
+    Enricher,
+    URLCollector,
+    ZyteApi,
+    Processor,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -75,15 +83,12 @@ class Orchestrator(ABC):
            n_zyte_wkrs: Number of async workers for zyte (optional).
            n_proc_wkrs: Number of async workers for the processor (optional).
        """
-        # Setup the variables
-        self._collected_urls_current_run: Set[str] = set()
-        self._collected_urls_previous_runs: Set[str] = set()
-
         # Setup the clients
         self._serpapi = SerpApi(
             api_key=serpapi_key, max_retries=max_retries, retry_delay=retry_delay
         )
         self._enricher = Enricher(user=dataforseo_user, pwd=dataforseo_pwd)
+        self._url_collector = URLCollector()
         self._zyteapi = ZyteApi(
             api_key=zyteapi_key, max_retries=max_retries, retry_delay=retry_delay
         )
@@ -156,16 +161,18 @@
                    break
 
                if not product.filtered:
-                    url = product.url
+                    # Clean the URL by removing tracking parameters
+                    url = self._url_collector.remove_tracking_parameters(product.url)
+                    product.url = url
 
-                    if url in self._collected_urls_current_run:
+                    if url in self._url_collector.collected_currently:
                        # deduplicate on current run
                        product.filtered = True
                        product.filtered_at_stage = (
                            "URL collection (current run deduplication)"
                        )
                        logger.debug(f"URL {url} already collected in current run")
-                    elif url in self._collected_urls_previous_runs:
+                    elif url in self._url_collector.collected_previously:
                        # deduplicate on previous runs coming from a db
                        product.filtered = True
                        product.filtered_at_stage = (
@@ -173,7 +180,7 @@
                        )
                        logger.debug(f"URL {url} as already collected in previous run")
                    else:
-                        self._collected_urls_current_run.add(url)
+                        self._url_collector.collected_currently.add(url)
 
                    await queue_out.put(product)
                queue_in.task_done()
@@ -480,7 +487,7 @@
        # INITIAL SETUP
        # ---------------------------
        if previously_collected_urls:
-            self._collected_urls_previous_runs = set(self._collected_urls_current_run)
+            self._url_collector.collected_previously = set(previously_collected_urls)
 
        # Setup the async framework
        n_terms_max = 1 + (
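Taken together, the orchestrator now delegates URL bookkeeping to the URLCollector introduced in the new fraudcrawler/scraping/url.py (shown below), and de-duplication runs on the cleaned URL rather than the raw one. A rough standalone sketch of that decision logic, with made-up URLs and a hypothetical filter-reason string for the previous-run branch (the real code sets product.filtered / product.filtered_at_stage on ProductItem objects inside an async worker):

from typing import Optional

from fraudcrawler import URLCollector

collector = URLCollector()
# URLs already stored from earlier runs, e.g. loaded from a db.
collector.collected_previously = {"https://shop.example/item/1"}

def dedup_reason(raw_url: str) -> Optional[str]:
    """Return why a URL would be filtered, or None if it passes through."""
    url = URLCollector.remove_tracking_parameters(raw_url)
    if url in collector.collected_currently:
        return "URL collection (current run deduplication)"
    if url in collector.collected_previously:
        # Hypothetical label; the diff truncates the actual string.
        return "URL collection (previous runs deduplication)"
    collector.collected_currently.add(url)
    return None

print(dedup_reason("https://shop.example/item/2?utm_source=news"))  # None: first sighting
print(dedup_reason("https://shop.example/item/2"))                   # current-run duplicate
print(dedup_reason("https://shop.example/item/1?utm_campaign=x"))    # previous-run duplicate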
{fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/launch_demo_pipeline.py
@@ -13,7 +13,7 @@ def main():
     client = FraudCrawlerClient()
 
     # Setup the search
-    search_term = "Kühlschrank"
+    search_term = "Medion Kühlbox MD 37454"
     language = Language(name="German")
     location = Location(name="Switzerland")
     deepness = Deepness(num_results=10)
fraudcrawler-0.4.3/fraudcrawler/scraping/url.py (new file)
@@ -0,0 +1,57 @@
+import logging
+from typing import List, Set, Tuple
+from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
+
+from fraudcrawler.settings import KNOWN_TRACKERS
+
+logger = logging.getLogger(__name__)
+
+
+class URLCollector:
+    """A class to collect and de-duplicate URLs."""
+
+    def __init__(self):
+        self.collected_currently: Set[str] = set()
+        self.collected_previously: Set[str] = set()
+
+    @staticmethod
+    def remove_tracking_parameters(url: str) -> str:
+        """Remove tracking parameters from URLs.
+
+        Args:
+            url: The URL to clean.
+
+        Returns:
+            The cleaned URL without tracking parameters.
+        """
+        logging.debug(f"Removing tracking parameters from URL: {url}")
+
+        # Parse the url
+        parsed_url = urlparse(url)
+
+        # Parse query parameters
+        queries: List[Tuple[str, str]] = parse_qsl(
+            parsed_url.query, keep_blank_values=True
+        )
+        remove_all = url.startswith(
+            "https://www.ebay"
+        )  # eBay URLs have all query parameters as tracking parameters
+        if remove_all:
+            filtered_queries = []
+        else:
+            filtered_queries = [
+                q
+                for q in queries
+                if not any(q[0].startswith(tracker) for tracker in KNOWN_TRACKERS)
+            ]
+
+        # Rebuild the URL without tracking parameters
+        clean_url = ParseResult(
+            scheme=parsed_url.scheme,
+            netloc=parsed_url.netloc,
+            path=parsed_url.path,
+            params=parsed_url.params,
+            query=urlencode(filtered_queries, quote_via=quote),
+            fragment=parsed_url.fragment,
+        )
+        return urlunparse(clean_url)
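For orientation, a rough usage sketch of the new helper follows; the URLs are made up, and KNOWN_TRACKERS is the list added to settings.py in the next hunk:

from fraudcrawler import URLCollector

# Known tracking parameters (utm_*, srsltid, ...) are stripped;
# other query parameters survive.
print(URLCollector.remove_tracking_parameters(
    "https://example-shop.ch/p/123?color=red&utm_source=google&srsltid=abc"
))
# -> https://example-shop.ch/p/123?color=red

# For eBay URLs the whole query string is treated as tracking and dropped.
print(URLCollector.remove_tracking_parameters(
    "https://www.ebay.ch/itm/9876?hash=item123&var=0"
))
# -> https://www.ebay.ch/itm/9876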
{fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/settings.py
@@ -13,6 +13,18 @@ SERP_DEFAULT_COUNTRY_CODES: List[str] = [
     # ".com",
 ]
 
+# URL De-duplication settings
+KNOWN_TRACKERS = [
+    "srsltid",
+    "utm_source",
+    "utm_medium",
+    "utm_campaign",
+    "utm_term",
+    "utm_content",
+    "ar",
+    "ps",
+]
+
 # Enrichment settings
 ENRICHMENT_DEFAULT_LIMIT = 10
 
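Note that remove_tracking_parameters (url.py above) matches these names as prefixes via str.startswith, so any query parameter whose name merely begins with an entry is dropped as well, for example "article" because of the "ar" entry. A small illustrative check of that behaviour:

from fraudcrawler.settings import KNOWN_TRACKERS

def is_tracker(param_name: str) -> bool:
    # Mirrors the prefix check inside URLCollector.remove_tracking_parameters.
    return any(param_name.startswith(t) for t in KNOWN_TRACKERS)

print(is_tracker("utm_source"))  # True: exact entry
print(is_tracker("article"))     # True: starts with the "ar" entry
print(is_tracker("page"))        # False: no entry is a prefix of it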
{fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "fraudcrawler"
-version = "0.4.2"
+version = "0.4.3"
 description = "Intelligent Market Monitoring"
 authors = [
     "Domingo Bertus <hello@veanu.ch>",
{fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/scraping/serp.py
@@ -4,10 +4,10 @@ import logging
 from pydantic import BaseModel
 from typing import List
 from urllib.parse import urlparse
+import re
 
 from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
 from fraudcrawler.base.base import Host, Language, Location, AsyncClient
-import re
 
 logger = logging.getLogger(__name__)