fraudcrawler 0.5.0__py3-none-any.whl → 0.7.22__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to its public registry. It is provided for informational purposes only.
- fraudcrawler/__init__.py +21 -5
- fraudcrawler/base/base.py +18 -38
- fraudcrawler/base/client.py +57 -60
- fraudcrawler/base/orchestrator.py +277 -276
- fraudcrawler/base/retry.py +25 -11
- fraudcrawler/launch_demo_pipeline.py +103 -41
- fraudcrawler/processing/base.py +129 -0
- fraudcrawler/processing/openai.py +520 -0
- fraudcrawler/scraping/enrich.py +6 -4
- fraudcrawler/scraping/search.py +370 -110
- fraudcrawler/scraping/url.py +42 -3
- fraudcrawler/scraping/zyte.py +146 -80
- fraudcrawler/settings.py +22 -10
- fraudcrawler-0.7.22.dist-info/METADATA +173 -0
- fraudcrawler-0.7.22.dist-info/RECORD +23 -0
- fraudcrawler/processing/processor.py +0 -199
- fraudcrawler-0.5.0.dist-info/METADATA +0 -167
- fraudcrawler-0.5.0.dist-info/RECORD +0 -22
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.22.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.22.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.22.dist-info}/entry_points.txt +0 -0
fraudcrawler/scraping/url.py
CHANGED

@@ -3,6 +3,7 @@ from typing import List, Set, Tuple
 from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult

 from fraudcrawler.settings import KNOWN_TRACKERS
+from fraudcrawler.base.base import ProductItem

 logger = logging.getLogger(__name__)

@@ -11,11 +12,19 @@ class URLCollector:
     """A class to collect and de-duplicate URLs."""

     def __init__(self):
-        self.
-        self.
+        self._collected_currently: Set[str] = set()
+        self._collected_previously: Set[str] = set()
+
+    def add_previously_collected_urls(self, urls: List[str]) -> None:
+        """Add a set of previously collected URLs to the internal state.
+
+        Args:
+            urls: A set of URLs that have been collected in previous runs.
+        """
+        self._collected_previously.update(urls)

     @staticmethod
-    def
+    def _remove_tracking_parameters(url: str) -> str:
         """Remove tracking parameters from URLs.

         Args:
@@ -55,3 +64,33 @@
             fragment=parsed_url.fragment,
         )
         return urlunparse(clean_url)
+
+    async def apply(self, product: ProductItem) -> ProductItem:
+        """Manages the collection and deduplication of ProductItems.
+
+        Args:
+            product: The product item to process.
+        """
+        logger.debug(f'Processing product with url="{product.url}"')
+
+        # Remove tracking parameters from the URL
+        url = self._remove_tracking_parameters(product.url)
+        product.url = url
+
+        # deduplicate on current run
+        if url in self._collected_currently:
+            product.filtered = True
+            product.filtered_at_stage = "URL collection (current run deduplication)"
+            logger.debug(f"URL {url} already collected in current run")
+
+        # deduplicate on previous runs coming from a db
+        elif url in self._collected_previously:
+            product.filtered = True
+            product.filtered_at_stage = "URL collection (previous run deduplication)"
+            logger.debug(f"URL {url} as already collected in previous run")
+
+        # Add to currently collected URLs
+        else:
+            self._collected_currently.add(url)
+
+        return product
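The new `URLCollector.apply` stage marks duplicates instead of dropping them: it first strips known tracking parameters, then flags the product if the cleaned URL was already seen in the current run or in a previous run. Below is a minimal usage sketch based only on the methods visible in this diff; the `ProductItem` constructor arguments and the exact parameters stripped via `KNOWN_TRACKERS` are assumptions, not taken from the package.

```python
# Hypothetical usage of the 0.7.22 URLCollector de-duplication API (sketch only).
import asyncio

from fraudcrawler.base.base import ProductItem
from fraudcrawler.scraping.url import URLCollector


async def main() -> None:
    collector = URLCollector()
    # Seed the collector with URLs stored from earlier runs (e.g. loaded from a database).
    collector.add_previously_collected_urls(["https://example.com/product1"])

    # Assumed ProductItem signature; the diff only shows the fields apply() touches.
    product = ProductItem(url="https://example.com/product1?utm_source=newsletter")
    product = await collector.apply(product)

    # If "utm_source" is listed in KNOWN_TRACKERS, the cleaned URL collides with the
    # previously collected one, so the item is flagged rather than silently dropped.
    print(product.filtered, product.filtered_at_stage)


asyncio.run(main())
```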
fraudcrawler/scraping/zyte.py
CHANGED

@@ -1,12 +1,13 @@
+from base64 import b64decode
 import logging
 from typing import List
-from base64 import b64decode

+from bs4 import BeautifulSoup
 import httpx
 from tenacity import RetryCallState

 from fraudcrawler.settings import ZYTE_DEFALUT_PROBABILITY_THRESHOLD
-from fraudcrawler.base.base import DomainUtils
+from fraudcrawler.base.base import DomainUtils, ProductItem
 from fraudcrawler.base.retry import get_async_retry

 logger = logging.getLogger(__name__)
@@ -61,77 +62,8 @@ class ZyteAPI(DomainUtils):
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")

-    async def details(self, url: str) -> dict:
-        """Fetches product details for a single URL.
-
-        Args:
-            url: The URL to fetch product details from.
-
-        Returns:
-            A dictionary containing the product details, fields include:
-            (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
-            {
-                "url": str,
-                "statusCode": str,
-                "product": {
-                    "name": str,
-                    "price": str,
-                    "mainImage": {"url": str},
-                    "images": [{"url": str}],
-                    "description": str,
-                    "metadata": {
-                        "probability": float,
-                    },
-                },
-                "httpResponseBody": base64
-            }
-        """
-        logger.info(f"Fetching product details by Zyte for URL {url}.")
-
-        # Perform the request and retry if necessary. There is some context aware logging:
-        # - `before`: before the request is made (and before retrying)
-        # - `before_sleep`: if the request fails before sleeping
-        retry = get_async_retry()
-        retry.before = lambda retry_state: self._log_before(
-            url=url, retry_state=retry_state
-        )
-        retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            url=url, retry_state=retry_state
-        )
-        async for attempt in retry:
-            with attempt:
-                response = await self._http_client.post(
-                    url=self._endpoint,
-                    json={"url": url, **self._config},
-                    auth=(self._api_key, ""),  # API key as username, empty password
-                )
-                response.raise_for_status()
-
-                details = response.json()
-        return details
-
-    @staticmethod
-    def keep_product(
-        details: dict,
-        threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
-    ) -> bool:
-        """Determines whether to keep the product based on the probability threshold.
-
-        Args:
-            details: A product details data dictionary.
-            threshold: The probability threshold used to filter the products.
-        """
-        try:
-            prob = float(details["product"]["metadata"]["probability"])
-        except KeyError:
-            logger.warning(
-                f"Product with url={details.get('url')} has no probability value - product is ignored"
-            )
-            return False
-        return prob > threshold
-
     @staticmethod
-    def
+    def _extract_product_name(details: dict) -> str | None:
         """Extracts the product name from the product data.

         The input argument is a dictionary of the following structure:
@@ -144,7 +76,7 @@ class ZyteAPI(DomainUtils):
         return details.get("product", {}).get("name")

     @staticmethod
-    def
+    def _extract_url_resolved(details: dict) -> str | None:
         """Extracts the resolved URL from the product data - this is automatically resolved by Zyte.

         The input argument is a dictionary of the following structure:
@@ -157,7 +89,7 @@ class ZyteAPI(DomainUtils):
         return details.get("product", {}).get("url")

     @staticmethod
-    def
+    def _extract_product_price(details: dict) -> str | None:
         """Extracts the product price from the product data.

         The input argument is a dictionary of the following structure:
@@ -170,7 +102,7 @@ class ZyteAPI(DomainUtils):
         return details.get("product", {}).get("price")

     @staticmethod
-    def
+    def _extract_product_description(details: dict) -> str | None:
         """Extracts the product description from the product data.

         The input argument is a dictionary of the following structure:
@@ -183,7 +115,7 @@ class ZyteAPI(DomainUtils):
         return details.get("product", {}).get("description")

     @staticmethod
-    def
+    def _extract_image_urls(details: dict) -> List[str]:
         """Extracts the images from the product data.

         The input argument is a dictionary of the following structure:
@@ -206,7 +138,28 @@ class ZyteAPI(DomainUtils):
         return images

     @staticmethod
-    def
+    def _extract_gtin(details: dict) -> str | None:
+        """Extracts the GTIN from the product data.
+
+        The input argument is a dictionary of the following structure:
+        {
+            "product": {
+                "gtin": [{"type": str, "value": str}],
+            }
+        }
+        """
+        product = details.get("product", {})
+        gtin_list = product.get("gtin", [])
+
+        if gtin_list and len(gtin_list) > 0:
+            # Extract the first GTIN value
+            gtin_value = gtin_list[0].get("value")
+            if gtin_value:
+                return gtin_value
+        return None
+
+    @staticmethod
+    def _extract_probability(details: dict) -> float:
         """Extracts the probability from the product data.

         The input argument is a dictionary of the following structure:
@@ -223,7 +176,7 @@ class ZyteAPI(DomainUtils):
         )

     @staticmethod
-    def
+    def _extract_html(details: dict) -> str | None:
         """Extracts the HTML from the Zyte API response.

         The input argument is a dictionary of the following structure:
@@ -238,7 +191,120 @@
         if isinstance(encoded, str):
             decoded_bytes = b64decode(encoded)

-            # Convert bytes to string
-
+            # Convert bytes to string
+            try:
+                decoded_string = decoded_bytes.decode("utf-8")
+            except UnicodeDecodeError:
+                decoded_string = decoded_bytes.decode("iso-8859-1")
             return decoded_string
         return None
+
+    def enrich_context(self, product: ProductItem, details: dict) -> ProductItem:
+        product.product_name = self._extract_product_name(details=details)
+
+        url_resolved = self._extract_url_resolved(details=details)
+        if url_resolved:
+            product.url_resolved = url_resolved
+
+        # If the resolved URL is different from the original URL, we also need to update the domain as
+        # otherwise the unresolved domain will be shown.
+        # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
+        if url_resolved and url_resolved != product.url:
+            logger.debug(f"URL resolved for {product.url} is {url_resolved}")
+            product.domain = self._get_domain(url=url_resolved)
+
+        product.product_price = self._extract_product_price(details=details)
+        product.product_description = self._extract_product_description(details=details)
+        product.product_images = self._extract_image_urls(details=details)
+        product.product_gtin = self._extract_gtin(details=details)
+        product.probability = self._extract_probability(details=details)
+        product.html = self._extract_html(details=details)
+        if product.html:
+            soup = BeautifulSoup(product.html, "html.parser")
+            product.html_clean = soup.get_text(separator=" ", strip=True)
+
+        return product
+
+    @staticmethod
+    def keep_product(
+        details: dict,
+        threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
+    ) -> bool:
+        """Determines whether to keep the product based on the probability threshold.
+
+        Args:
+            details: A product details data dictionary.
+            threshold: The probability threshold used to filter the products.
+        """
+        try:
+            prob = float(details["product"]["metadata"]["probability"])
+        except KeyError:
+            logger.warning(
+                f"Product with url={details.get('url')} has no probability value - product is ignored"
+            )
+            return False
+        return prob > threshold
+
+    async def unblock_url_content(self, url: str) -> bytes:
+        """Unblock the content of an URL using Zyte proxy mode.
+
+        Args:
+            url: The URL to fetch using Zyte proxy mode.
+        """
+        logger.debug(f'Unblock URL content using Zyte proxy for url="{url}"')
+        details = await self.details(url)
+
+        if not details or "httpResponseBody" not in details:
+            raise httpx.HTTPError("No httpResponseBody in Zyte response")
+
+        return b64decode(details["httpResponseBody"])
+
+    async def details(self, url: str) -> dict:
+        """Fetches product details for a single URL.
+
+        Args:
+            url: The URL to fetch product details from.
+
+        Returns:
+            A dictionary containing the product details, fields include:
+            (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
+            {
+                "url": str,
+                "statusCode": str,
+                "product": {
+                    "name": str,
+                    "price": str,
+                    "mainImage": {"url": str},
+                    "images": [{"url": str}],
+                    "description": str,
+                    "gtin": [{"type": str, "value": str}],
+                    "metadata": {
+                        "probability": float,
+                    },
+                },
+                "httpResponseBody": base64
+            }
+        """
+        logger.info(f"Fetching product details by Zyte for URL {url}.")
+
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            url=url, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            url=url, retry_state=retry_state
+        )
+        async for attempt in retry:
+            with attempt:
+                response = await self._http_client.post(
+                    url=self._endpoint,
+                    json={"url": url, **self._config},
+                    auth=(self._api_key, ""),  # API key as username, empty password
+                )
+                response.raise_for_status()
+
+                details = response.json()
+        return details
fraudcrawler/settings.py
CHANGED

@@ -14,12 +14,22 @@ RETRY_EXP_BASE = 4
 RETRY_JITTER = 1
 RETRY_SKIP_IF_CODE = [400, 401, 403]  # Skip retrying on these HTTP status codes

-#
+# Search settings
 GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
 GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
 SEARCH_DEFAULT_COUNTRY_CODES: List[str] = [
     # ".com",
 ]
+TOPPREISE_SEARCH_PATHS = {
+    "de": "produktsuche",
+    "fr": "chercher",
+    "default": "browse",
+}
+TOPPREISE_COMPARISON_PATHS = [
+    "preisvergleich",
+    "comparison-prix",
+    "price-comparison",
+]

 # URL De-duplication settings
 KNOWN_TRACKERS = [
@@ -68,17 +78,19 @@ ENRICHMENT_DEFAULT_LIMIT = 10
 # Zyte settings
 ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1

-#
-
-
-
-
-
+# Exact match settings
+EXACT_MATCH_PRODUCT_FIELDS = {
+    "url_resolved",
+    "product_name",
+    "product_description",
+    "html",
+}
+EXACT_MATCH_FIELD_SEPARATOR = "\n"

 # Async workers settings
-
-
-DEFAULT_N_PROC_WKRS =
+DEFAULT_N_SRCH_WKRS = 2
+DEFAULT_N_CNTX_WKRS = 23
+DEFAULT_N_PROC_WKRS = 5

 # HTTPX client settings
 DEFAULT_HTTPX_TIMEOUT = {

fraudcrawler-0.7.22.dist-info/METADATA
ADDED

@@ -0,0 +1,173 @@
+Metadata-Version: 2.1
+Name: fraudcrawler
+Version: 0.7.22
+Summary: Intelligent Market Monitoring
+Home-page: https://github.com/open-veanu/fraudcrawler
+License: MIT
+Author: Domingo Bertus
+Author-email: hello@veanu.ch
+Requires-Python: >=3.11,<4.0
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
+Requires-Dist: httpx (>=0.28.1,<0.29.0)
+Requires-Dist: openai (>=1.68.2,<2.0.0)
+Requires-Dist: pandas (>=2.2.3,<3.0.0)
+Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
+Requires-Dist: requests (>=2.32.3,<3.0.0)
+Requires-Dist: tenacity (>=9.1.2,<10.0.0)
+Project-URL: Repository, https://github.com/open-veanu/fraudcrawler
+Description-Content-Type: text/markdown
+
+# fraudcrawler
+
+
+
+
+
+
+Fraudcrawler is an intelligent **market monitoring** tool that searches the web for products, extracts product details, and classifies them using LLMs. It combines search APIs, web scraping, and AI to automate product discovery and relevance assessment.
+
+## Features
+
+- **Asynchronous pipeline** - Products move through search, extraction, and classification stages independently
+- **Multiple search engines** - Google Search, Google Shopping, and more...
+- **Search term enrichment** - Automatically find related terms and expand your search
+- **Product extraction** - Get structured product data via Zyte API
+- **LLM classification** - Assess product relevance using OpenAI API with custom prompts
+- **Marketplace filtering** - Focus searches on specific domains
+- **Deduplication** - Avoid reprocessing previously collected URLs
+- **CSV export** - Results saved with timestamps for easy tracking
+
+## Prerequisites
+
+- Python 3.11 or higher
+- API keys for:
+  - **SerpAPI** - Google search results
+  - **Zyte API** - Product data extraction
+  - **OpenAI API** - Product classification
+  - **DataForSEO** (optional) - Search term enrichment
+
+## Installation
+
+```bash
+python3.11 -m venv .venv
+source .venv/bin/activate
+pip install fraudcrawler
+```
+
+**Using Poetry:**
+```bash
+poetry install
+```
+
+## Configuration
+
+Create a `.env` file with your API credentials (see `.env.example` for template):
+
+```bash
+SERPAPI_KEY=your_serpapi_key
+ZYTEAPI_KEY=your_zyte_key
+OPENAIAPI_KEY=your_openai_key
+DATAFORSEO_USER=your_user  # optional
+DATAFORSEO_PWD=your_pwd  # optional
+```
+
+## Usage
+
+### Basic Configuration
+For a complete working example, see `fraudcrawler/launch_demo_pipeline.py`. After setting up the necessary parameters you can launch and analyse the results with:
+```python
+# Run pipeline
+await client.run(
+    search_term=search_term,
+    search_engines=search_engines,
+    language=language,
+    location=location,
+    deepness=deepness,
+    excluded_urls=excluded_urls,
+)
+
+# Load results
+df = client.load_results()
+print(df.head())
+```
+
+### Advanced Configuration
+
+**Search term enrichment** - Find and search related terms:
+```python
+from fraudcrawler import Enrichment
+
+deepness.enrichment = Enrichment(
+    additional_terms=5,
+    additional_urls_per_term=10
+)
+```
+
+**Marketplace filtering** - Focus on specific domains:
+```python
+from fraudcrawler import Host
+
+marketplaces = [
+    Host(name="International", domains="zavamed.com,apomeds.com"),
+    Host(name="National", domains="netdoktor.ch,nobelpharma.ch"),
+]
+
+await client.run(..., marketplaces=marketplaces)
+```
+
+**Exclude domains** - Exclude specific domains from your results:
+```python
+excluded_urls = [
+    Host(name="Compendium", domains="compendium.ch"),
+]
+
+await client.run(..., excluded_urls=excluded_urls)
+```
+
+**Skip previously collected URLs**:
+```python
+previously_collected_urls = [
+    "https://example.com/product1",
+    "https://example.com/product2",
+]
+
+await client.run(..., previously_collected_urls=previously_collected_urls)
+```
+
+**View all results** from a client instance:
+```python
+client.print_available_results()
+```
+
+## Output
+
+Results are saved as CSV files in `data/results/` with the naming pattern:
+
+```
+<search_term>_<language_code>_<location_code>_<timestamp>.csv
+```
+
+Example: `sildenafil_de_ch_20250115143022.csv`
+
+The CSV includes product details, URLs, and classification scores from your workflows.
+
+## Development
+
+For detailed contribution guidelines, see [CONTRIBUTING.md](CONTRIBUTING.md).
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## Architecture
+
+Fraudcrawler uses an asynchronous pipeline where products can be at different processing stages simultaneously. Product A might be in classification while Product B is still being scraped. This is enabled by async workers for each stage (Search, Context Extraction, Processing) using `httpx.AsyncClient`.
+
+
+
+For more details on the async design, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
+

fraudcrawler-0.7.22.dist-info/RECORD
ADDED

@@ -0,0 +1,23 @@
+fraudcrawler/__init__.py,sha256=4Xzhj6aS9zmjs8KZS9nhFg9YAWOpCX-TAtJ4s32A5Jk,1191
+fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/base/base.py,sha256=QchdTXgrabQKnjP28n_QKjpxHChVpOGQA2cfKFaGLAc,6821
+fraudcrawler/base/client.py,sha256=uPaC1uMpHlMR1ThcTkzsoliwmGuTGH3jyTHLyVNYSHk,5608
+fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
+fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
+fraudcrawler/base/orchestrator.py,sha256=BklS4DNzxbp7yvE2NvBWrDDqnvT4YO7Xh_WXstYNWYA,26050
+fraudcrawler/base/retry.py,sha256=bCDd44XO2-lHO8MGvPblD5152-lHt1dOfMAQSmymLO4,1462
+fraudcrawler/launch_demo_pipeline.py,sha256=oZWodtNzA5mhmLNYMS6lglry88NutvH4IxnEWOUtL8M,6179
+fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/processing/base.py,sha256=UkoYxFNZ3BQkXmgJnTtruz8-eIFCtWiquRN_IoEXfM4,4091
+fraudcrawler/processing/openai.py,sha256=7sbFg2NPsn627VDzsfIkKantE2KahGmVkSZ1R10OrzQ,19050
+fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/scraping/enrich.py,sha256=dGWi9p0JStQYSGscCnsQPHNlAeqjoL2rXZnHFNmPhaQ,13158
+fraudcrawler/scraping/search.py,sha256=qHeUpzv1IpRhdFvaycGtL3FLOwT8rOiF0PfiOH6BmUA,34561
+fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
+fraudcrawler/scraping/zyte.py,sha256=xSHGKo09sX2dgQBrPI7oeoHsVL4qZ8voQLBXRU1XBqM,11102
+fraudcrawler/settings.py,sha256=q3je0r_jd30x2dzlgfm8GyKcigFdgteOLa8HX188bho,3768
+fraudcrawler-0.7.22.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+fraudcrawler-0.7.22.dist-info/METADATA,sha256=D749e0ZWDZSn8pjxvHj7RUf5m0D1_qHzRlZPRFqTE9A,5303
+fraudcrawler-0.7.22.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+fraudcrawler-0.7.22.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+fraudcrawler-0.7.22.dist-info/RECORD,,