fraudcrawler 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic. Click here for more details.
- fraudcrawler/__init__.py +2 -0
- fraudcrawler/base/orchestrator.py +18 -11
- fraudcrawler/launch_demo_pipeline.py +1 -1
- fraudcrawler/scraping/serp.py +1 -1
- fraudcrawler/scraping/url.py +57 -0
- fraudcrawler/settings.py +12 -0
- {fraudcrawler-0.4.2.dist-info → fraudcrawler-0.4.3.dist-info}/METADATA +1 -1
- {fraudcrawler-0.4.2.dist-info → fraudcrawler-0.4.3.dist-info}/RECORD +11 -10
- {fraudcrawler-0.4.2.dist-info → fraudcrawler-0.4.3.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.4.2.dist-info → fraudcrawler-0.4.3.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.4.2.dist-info → fraudcrawler-0.4.3.dist-info}/entry_points.txt +0 -0
fraudcrawler/__init__.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from fraudcrawler.scraping.serp import SerpApi, SearchEngine
|
|
2
2
|
from fraudcrawler.scraping.enrich import Enricher
|
|
3
|
+
from fraudcrawler.scraping.url import URLCollector
|
|
3
4
|
from fraudcrawler.scraping.zyte import ZyteApi
|
|
4
5
|
from fraudcrawler.processing.processor import Processor
|
|
5
6
|
from fraudcrawler.base.orchestrator import Orchestrator
|
|
@@ -18,6 +19,7 @@ __all__ = [
|
|
|
18
19
|
"SerpApi",
|
|
19
20
|
"SearchEngine",
|
|
20
21
|
"Enricher",
|
|
22
|
+
"URLCollector",
|
|
21
23
|
"ZyteApi",
|
|
22
24
|
"Processor",
|
|
23
25
|
"Orchestrator",
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
import asyncio
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Dict, List,
|
|
4
|
+
from typing import Dict, List, cast
|
|
5
|
+
|
|
5
6
|
from bs4 import BeautifulSoup
|
|
6
7
|
|
|
7
8
|
from fraudcrawler.settings import (
|
|
@@ -24,7 +25,14 @@ from fraudcrawler.base.base import (
|
|
|
24
25
|
Prompt,
|
|
25
26
|
ProductItem,
|
|
26
27
|
)
|
|
27
|
-
from fraudcrawler import
|
|
28
|
+
from fraudcrawler import (
|
|
29
|
+
SerpApi,
|
|
30
|
+
SearchEngine,
|
|
31
|
+
Enricher,
|
|
32
|
+
URLCollector,
|
|
33
|
+
ZyteApi,
|
|
34
|
+
Processor,
|
|
35
|
+
)
|
|
28
36
|
|
|
29
37
|
logger = logging.getLogger(__name__)
|
|
30
38
|
|
|
@@ -75,15 +83,12 @@ class Orchestrator(ABC):
|
|
|
75
83
|
n_zyte_wkrs: Number of async workers for zyte (optional).
|
|
76
84
|
n_proc_wkrs: Number of async workers for the processor (optional).
|
|
77
85
|
"""
|
|
78
|
-
# Setup the variables
|
|
79
|
-
self._collected_urls_current_run: Set[str] = set()
|
|
80
|
-
self._collected_urls_previous_runs: Set[str] = set()
|
|
81
|
-
|
|
82
86
|
# Setup the clients
|
|
83
87
|
self._serpapi = SerpApi(
|
|
84
88
|
api_key=serpapi_key, max_retries=max_retries, retry_delay=retry_delay
|
|
85
89
|
)
|
|
86
90
|
self._enricher = Enricher(user=dataforseo_user, pwd=dataforseo_pwd)
|
|
91
|
+
self._url_collector = URLCollector()
|
|
87
92
|
self._zyteapi = ZyteApi(
|
|
88
93
|
api_key=zyteapi_key, max_retries=max_retries, retry_delay=retry_delay
|
|
89
94
|
)
|
|
@@ -156,16 +161,18 @@ class Orchestrator(ABC):
|
|
|
156
161
|
break
|
|
157
162
|
|
|
158
163
|
if not product.filtered:
|
|
159
|
-
|
|
164
|
+
# Clean the URL by removing tracking parameters
|
|
165
|
+
url = self._url_collector.remove_tracking_parameters(product.url)
|
|
166
|
+
product.url = url
|
|
160
167
|
|
|
161
|
-
if url in self.
|
|
168
|
+
if url in self._url_collector.collected_currently:
|
|
162
169
|
# deduplicate on current run
|
|
163
170
|
product.filtered = True
|
|
164
171
|
product.filtered_at_stage = (
|
|
165
172
|
"URL collection (current run deduplication)"
|
|
166
173
|
)
|
|
167
174
|
logger.debug(f"URL {url} already collected in current run")
|
|
168
|
-
elif url in self.
|
|
175
|
+
elif url in self._url_collector.collected_previously:
|
|
169
176
|
# deduplicate on previous runs coming from a db
|
|
170
177
|
product.filtered = True
|
|
171
178
|
product.filtered_at_stage = (
|
|
@@ -173,7 +180,7 @@ class Orchestrator(ABC):
|
|
|
173
180
|
)
|
|
174
181
|
logger.debug(f"URL {url} as already collected in previous run")
|
|
175
182
|
else:
|
|
176
|
-
self.
|
|
183
|
+
self._url_collector.collected_currently.add(url)
|
|
177
184
|
|
|
178
185
|
await queue_out.put(product)
|
|
179
186
|
queue_in.task_done()
|
|
@@ -480,7 +487,7 @@ class Orchestrator(ABC):
|
|
|
480
487
|
# INITIAL SETUP
|
|
481
488
|
# ---------------------------
|
|
482
489
|
if previously_collected_urls:
|
|
483
|
-
self.
|
|
490
|
+
self._url_collector.collected_previously = set(previously_collected_urls)
|
|
484
491
|
|
|
485
492
|
# Setup the async framework
|
|
486
493
|
n_terms_max = 1 + (
|
|
@@ -13,7 +13,7 @@ def main():
|
|
|
13
13
|
client = FraudCrawlerClient()
|
|
14
14
|
|
|
15
15
|
# Setup the search
|
|
16
|
-
search_term = "
|
|
16
|
+
search_term = "Medion Kühlbox MD 37454"
|
|
17
17
|
language = Language(name="German")
|
|
18
18
|
location = Location(name="Switzerland")
|
|
19
19
|
deepness = Deepness(num_results=10)
|
fraudcrawler/scraping/serp.py
CHANGED
|
@@ -4,10 +4,10 @@ import logging
|
|
|
4
4
|
from pydantic import BaseModel
|
|
5
5
|
from typing import List
|
|
6
6
|
from urllib.parse import urlparse
|
|
7
|
+
import re
|
|
7
8
|
|
|
8
9
|
from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
|
|
9
10
|
from fraudcrawler.base.base import Host, Language, Location, AsyncClient
|
|
10
|
-
import re
|
|
11
11
|
|
|
12
12
|
logger = logging.getLogger(__name__)
|
|
13
13
|
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import List, Set, Tuple
|
|
3
|
+
from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
|
|
4
|
+
|
|
5
|
+
from fraudcrawler.settings import KNOWN_TRACKERS
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class URLCollector:
    """Collect and de-duplicate URLs.

    Attributes are plain sets that callers read and mutate directly:
      * ``collected_currently``  - URLs already seen during the current run.
      * ``collected_previously`` - URLs known from earlier runs.
    """

    def __init__(self):
        # Both sets start empty; callers populate them.
        self.collected_currently: Set[str] = set()
        self.collected_previously: Set[str] = set()

    @staticmethod
    def remove_tracking_parameters(url: str) -> str:
        """Remove tracking parameters from URLs.

        Args:
            url: The URL to clean.

        Returns:
            The cleaned URL without tracking parameters. Scheme, netloc,
            path, params and fragment are carried over unchanged; the query
            string is rebuilt from the remaining key/value pairs.
        """
        # Log through the module's named logger instead of the root logger so
        # that per-module level/handler configuration is respected.
        logging.getLogger(__name__).debug(
            "Removing tracking parameters from URL: %s", url
        )

        # Parse the url
        parsed_url = urlparse(url)

        # Parse query parameters (blank values are kept so `?a=` survives)
        queries: List[Tuple[str, str]] = parse_qsl(
            parsed_url.query, keep_blank_values=True
        )

        # eBay URLs have all query parameters as tracking parameters
        # NOTE(review): only matches `https://www.ebay...`; `http://` or
        # non-`www` hosts are not covered - confirm whether that is intended.
        remove_all = url.startswith("https://www.ebay")

        filtered_queries: List[Tuple[str, str]]
        if remove_all:
            filtered_queries = []
        else:
            # NOTE(review): this is a *prefix* match, so short trackers in
            # KNOWN_TRACKERS also drop unrelated keys that merely start with
            # them - verify this is the desired behaviour.
            filtered_queries = [
                (key, value)
                for key, value in queries
                if not any(key.startswith(tracker) for tracker in KNOWN_TRACKERS)
            ]

        # Rebuild the URL without tracking parameters
        clean_url = ParseResult(
            scheme=parsed_url.scheme,
            netloc=parsed_url.netloc,
            path=parsed_url.path,
            params=parsed_url.params,
            query=urlencode(filtered_queries, quote_via=quote),
            fragment=parsed_url.fragment,
        )
        return urlunparse(clean_url)
|
fraudcrawler/settings.py
CHANGED
|
@@ -13,6 +13,18 @@ SERP_DEFAULT_COUNTRY_CODES: List[str] = [
|
|
|
13
13
|
# ".com",
|
|
14
14
|
]
|
|
15
15
|
|
|
16
|
+
# URL De-duplication settings
|
|
17
|
+
KNOWN_TRACKERS = [
|
|
18
|
+
"srsltid",
|
|
19
|
+
"utm_source",
|
|
20
|
+
"utm_medium",
|
|
21
|
+
"utm_campaign",
|
|
22
|
+
"utm_term",
|
|
23
|
+
"utm_content",
|
|
24
|
+
"ar",
|
|
25
|
+
"ps",
|
|
26
|
+
]
|
|
27
|
+
|
|
16
28
|
# Enrichment settings
|
|
17
29
|
ENRICHMENT_DEFAULT_LIMIT = 10
|
|
18
30
|
|
|
@@ -1,20 +1,21 @@
|
|
|
1
|
-
fraudcrawler/__init__.py,sha256=
|
|
1
|
+
fraudcrawler/__init__.py,sha256=zAqnJ9Mewq0qzSfOjyaICyqDRQZE_Z3FmyF2IPdOhXo,788
|
|
2
2
|
fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
fraudcrawler/base/base.py,sha256=JWjZ3mpX4caQAsWKYqtHrUqHfHr6GXlAaEjxxHV9ODQ,6020
|
|
4
4
|
fraudcrawler/base/client.py,sha256=FibiYycjUys-c4sv66Y2JqJu5y15be2MYd2_9yB3wG8,4936
|
|
5
5
|
fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
|
|
6
6
|
fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
|
|
7
|
-
fraudcrawler/base/orchestrator.py,sha256=
|
|
8
|
-
fraudcrawler/launch_demo_pipeline.py,sha256=
|
|
7
|
+
fraudcrawler/base/orchestrator.py,sha256=xOMxA0zPUXSF8AGY5AUqzsOO9LfRIjxI2HuZf__Z_sI,24689
|
|
8
|
+
fraudcrawler/launch_demo_pipeline.py,sha256=CX4A-E63ER7Ip9RNI_IyTAXerYXcQ-NoSvhvLDLdP-s,4640
|
|
9
9
|
fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
fraudcrawler/processing/processor.py,sha256=An2orst0YRIav7bFuoDMgjwWz2Z9dyjVUbkNAMXNTTo,3748
|
|
11
11
|
fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
12
|
fraudcrawler/scraping/enrich.py,sha256=X1BBZshdZqPmbduzhGwH0ULSzq03L_7bf7_UL8yOQ9E,10608
|
|
13
|
-
fraudcrawler/scraping/serp.py,sha256=
|
|
13
|
+
fraudcrawler/scraping/serp.py,sha256=divEp1UBUsws24PWZABhWIxOmaLqLwdeGn4KNrqWkYA,17865
|
|
14
|
+
fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
|
|
14
15
|
fraudcrawler/scraping/zyte.py,sha256=DUF5pIwpZyQw30qURnFxtp8KYpUgBkrXjM7RaVGH92Q,7005
|
|
15
|
-
fraudcrawler/settings.py,sha256=
|
|
16
|
-
fraudcrawler-0.4.
|
|
17
|
-
fraudcrawler-0.4.
|
|
18
|
-
fraudcrawler-0.4.
|
|
19
|
-
fraudcrawler-0.4.
|
|
20
|
-
fraudcrawler-0.4.
|
|
16
|
+
fraudcrawler/settings.py,sha256=31jvRFfB-gsVbeidLLl4iQgrFL7GH-824lerIniPI08,1017
|
|
17
|
+
fraudcrawler-0.4.3.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
|
|
18
|
+
fraudcrawler-0.4.3.dist-info/METADATA,sha256=jlk2WdtXEK0-s6QRQdI96EBpQiyHWKgJiYeW93yiU24,5931
|
|
19
|
+
fraudcrawler-0.4.3.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
|
|
20
|
+
fraudcrawler-0.4.3.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
|
|
21
|
+
fraudcrawler-0.4.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|