fraudcrawler 0.4.2.tar.gz → 0.4.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (20)
  1. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/PKG-INFO +1 -1
  2. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/__init__.py +2 -0
  3. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/base/orchestrator.py +18 -11
  4. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/launch_demo_pipeline.py +1 -1
  5. fraudcrawler-0.4.3/fraudcrawler/scraping/url.py +57 -0
  6. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/settings.py +12 -0
  7. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/pyproject.toml +1 -1
  8. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/LICENSE +0 -0
  9. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/README.md +0 -0
  10. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/base/__init__.py +0 -0
  11. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/base/base.py +0 -0
  12. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/base/client.py +0 -0
  13. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/base/google-languages.json +0 -0
  14. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/base/google-locations.json +0 -0
  15. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/processing/__init__.py +0 -0
  16. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/processing/processor.py +0 -0
  17. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/scraping/__init__.py +0 -0
  18. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/scraping/enrich.py +0 -0
  19. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/scraping/serp.py +1 -1
  20. {fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/scraping/zyte.py +0 -0
{fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.4.2
+Version: 0.4.3
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
{fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/__init__.py
@@ -1,5 +1,6 @@
 from fraudcrawler.scraping.serp import SerpApi, SearchEngine
 from fraudcrawler.scraping.enrich import Enricher
+from fraudcrawler.scraping.url import URLCollector
 from fraudcrawler.scraping.zyte import ZyteApi
 from fraudcrawler.processing.processor import Processor
 from fraudcrawler.base.orchestrator import Orchestrator
@@ -18,6 +19,7 @@ __all__ = [
     "SerpApi",
     "SearchEngine",
     "Enricher",
+    "URLCollector",
     "ZyteApi",
     "Processor",
     "Orchestrator",
{fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/base/orchestrator.py
@@ -1,7 +1,8 @@
 from abc import ABC, abstractmethod
 import asyncio
 import logging
-from typing import Dict, List, Set, cast
+from typing import Dict, List, cast
+
 from bs4 import BeautifulSoup
 
 from fraudcrawler.settings import (
@@ -24,7 +25,14 @@ from fraudcrawler.base.base import (
     Prompt,
     ProductItem,
 )
-from fraudcrawler import SerpApi, SearchEngine, Enricher, ZyteApi, Processor
+from fraudcrawler import (
+    SerpApi,
+    SearchEngine,
+    Enricher,
+    URLCollector,
+    ZyteApi,
+    Processor,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -75,15 +83,12 @@ class Orchestrator(ABC):
            n_zyte_wkrs: Number of async workers for zyte (optional).
            n_proc_wkrs: Number of async workers for the processor (optional).
        """
-        # Setup the variables
-        self._collected_urls_current_run: Set[str] = set()
-        self._collected_urls_previous_runs: Set[str] = set()
-
         # Setup the clients
         self._serpapi = SerpApi(
             api_key=serpapi_key, max_retries=max_retries, retry_delay=retry_delay
         )
         self._enricher = Enricher(user=dataforseo_user, pwd=dataforseo_pwd)
+        self._url_collector = URLCollector()
         self._zyteapi = ZyteApi(
             api_key=zyteapi_key, max_retries=max_retries, retry_delay=retry_delay
         )
@@ -156,16 +161,18 @@
                    break
 
                if not product.filtered:
-                    url = product.url
+                    # Clean the URL by removing tracking parameters
+                    url = self._url_collector.remove_tracking_parameters(product.url)
+                    product.url = url
 
-                    if url in self._collected_urls_current_run:
+                    if url in self._url_collector.collected_currently:
                        # deduplicate on current run
                        product.filtered = True
                        product.filtered_at_stage = (
                            "URL collection (current run deduplication)"
                        )
                        logger.debug(f"URL {url} already collected in current run")
-                    elif url in self._collected_urls_previous_runs:
+                    elif url in self._url_collector.collected_previously:
                        # deduplicate on previous runs coming from a db
                        product.filtered = True
                        product.filtered_at_stage = (
@@ -173,7 +180,7 @@
                        )
                        logger.debug(f"URL {url} as already collected in previous run")
                    else:
-                        self._collected_urls_current_run.add(url)
+                        self._url_collector.collected_currently.add(url)
 
                    await queue_out.put(product)
                queue_in.task_done()
@@ -480,7 +487,7 @@
        # INITIAL SETUP
        # ---------------------------
        if previously_collected_urls:
-            self._collected_urls_previous_runs = set(self._collected_urls_current_run)
+            self._url_collector.collected_previously = set(previously_collected_urls)
 
        # Setup the async framework
        n_terms_max = 1 + (
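Taken together, the orchestrator now delegates URL bookkeeping to the URLCollector introduced in the new fraudcrawler/scraping/url.py (shown below), and de-duplication runs on the cleaned URL rather than the raw one. A rough standalone sketch of that decision logic, with made-up URLs and a hypothetical filter-reason string for the previous-run branch (the real code sets product.filtered / product.filtered_at_stage on ProductItem objects inside an async worker):

from typing import Optional

from fraudcrawler import URLCollector

collector = URLCollector()
# URLs already stored from earlier runs, e.g. loaded from a db.
collector.collected_previously = {"https://shop.example/item/1"}

def dedup_reason(raw_url: str) -> Optional[str]:
    """Return why a URL would be filtered, or None if it passes through."""
    url = URLCollector.remove_tracking_parameters(raw_url)
    if url in collector.collected_currently:
        return "URL collection (current run deduplication)"
    if url in collector.collected_previously:
        # Hypothetical label; the diff truncates the actual string.
        return "URL collection (previous runs deduplication)"
    collector.collected_currently.add(url)
    return None

print(dedup_reason("https://shop.example/item/2?utm_source=news"))  # None: first sighting
print(dedup_reason("https://shop.example/item/2"))                   # current-run duplicate
print(dedup_reason("https://shop.example/item/1?utm_campaign=x"))    # previous-run duplicate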
{fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/launch_demo_pipeline.py
@@ -13,7 +13,7 @@ def main():
     client = FraudCrawlerClient()
 
     # Setup the search
-    search_term = "Kühlschrank"
+    search_term = "Medion Kühlbox MD 37454"
     language = Language(name="German")
     location = Location(name="Switzerland")
     deepness = Deepness(num_results=10)
fraudcrawler-0.4.3/fraudcrawler/scraping/url.py (new file)
@@ -0,0 +1,57 @@
+import logging
+from typing import List, Set, Tuple
+from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
+
+from fraudcrawler.settings import KNOWN_TRACKERS
+
+logger = logging.getLogger(__name__)
+
+
+class URLCollector:
+    """A class to collect and de-duplicate URLs."""
+
+    def __init__(self):
+        self.collected_currently: Set[str] = set()
+        self.collected_previously: Set[str] = set()
+
+    @staticmethod
+    def remove_tracking_parameters(url: str) -> str:
+        """Remove tracking parameters from URLs.
+
+        Args:
+            url: The URL to clean.
+
+        Returns:
+            The cleaned URL without tracking parameters.
+        """
+        logging.debug(f"Removing tracking parameters from URL: {url}")
+
+        # Parse the url
+        parsed_url = urlparse(url)
+
+        # Parse query parameters
+        queries: List[Tuple[str, str]] = parse_qsl(
+            parsed_url.query, keep_blank_values=True
+        )
+        remove_all = url.startswith(
+            "https://www.ebay"
+        )  # eBay URLs have all query parameters as tracking parameters
+        if remove_all:
+            filtered_queries = []
+        else:
+            filtered_queries = [
+                q
+                for q in queries
+                if not any(q[0].startswith(tracker) for tracker in KNOWN_TRACKERS)
+            ]
+
+        # Rebuild the URL without tracking parameters
+        clean_url = ParseResult(
+            scheme=parsed_url.scheme,
+            netloc=parsed_url.netloc,
+            path=parsed_url.path,
+            params=parsed_url.params,
+            query=urlencode(filtered_queries, quote_via=quote),
+            fragment=parsed_url.fragment,
+        )
+        return urlunparse(clean_url)
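For orientation, a rough usage sketch of the new helper follows; the URLs are made up, and KNOWN_TRACKERS is the list added to settings.py in the next hunk:

from fraudcrawler import URLCollector

# Known tracking parameters (utm_*, srsltid, ...) are stripped;
# other query parameters survive.
print(URLCollector.remove_tracking_parameters(
    "https://example-shop.ch/p/123?color=red&utm_source=google&srsltid=abc"
))
# -> https://example-shop.ch/p/123?color=red

# For eBay URLs the whole query string is treated as tracking and dropped.
print(URLCollector.remove_tracking_parameters(
    "https://www.ebay.ch/itm/9876?hash=item123&var=0"
))
# -> https://www.ebay.ch/itm/9876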
{fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/settings.py
@@ -13,6 +13,18 @@ SERP_DEFAULT_COUNTRY_CODES: List[str] = [
     # ".com",
 ]
 
+# URL De-duplication settings
+KNOWN_TRACKERS = [
+    "srsltid",
+    "utm_source",
+    "utm_medium",
+    "utm_campaign",
+    "utm_term",
+    "utm_content",
+    "ar",
+    "ps",
+]
+
 # Enrichment settings
 ENRICHMENT_DEFAULT_LIMIT = 10
 
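Note that remove_tracking_parameters (url.py above) matches these names as prefixes via str.startswith, so any query parameter whose name merely begins with an entry is dropped as well, for example "article" because of the "ar" entry. A small illustrative check of that behaviour:

from fraudcrawler.settings import KNOWN_TRACKERS

def is_tracker(param_name: str) -> bool:
    # Mirrors the prefix check inside URLCollector.remove_tracking_parameters.
    return any(param_name.startswith(t) for t in KNOWN_TRACKERS)

print(is_tracker("utm_source"))  # True: exact entry
print(is_tracker("article"))     # True: starts with the "ar" entry
print(is_tracker("page"))        # False: no entry is a prefix of it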
{fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "fraudcrawler"
-version = "0.4.2"
+version = "0.4.3"
 description = "Intelligent Market Monitoring"
 authors = [
     "Domingo Bertus <hello@veanu.ch>",
{fraudcrawler-0.4.2 → fraudcrawler-0.4.3}/fraudcrawler/scraping/serp.py
@@ -4,10 +4,10 @@ import logging
 from pydantic import BaseModel
 from typing import List
 from urllib.parse import urlparse
+import re
 
 from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
 from fraudcrawler.base.base import Host, Language, Location, AsyncClient
-import re
 
 logger = logging.getLogger(__name__)