fraudcrawler 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of fraudcrawler might be problematic.
- fraudcrawler/base/base.py +10 -0
- fraudcrawler/base/orchestrator.py +95 -34
- fraudcrawler/launch_demo_pipeline.py +1 -1
- fraudcrawler/scraping/search.py +169 -147
- fraudcrawler/scraping/zyte.py +103 -77
- fraudcrawler/settings.py +8 -0
- {fraudcrawler-0.6.0.dist-info → fraudcrawler-0.6.2.dist-info}/METADATA +4 -3
- {fraudcrawler-0.6.0.dist-info → fraudcrawler-0.6.2.dist-info}/RECORD +11 -11
- {fraudcrawler-0.6.0.dist-info → fraudcrawler-0.6.2.dist-info}/WHEEL +1 -1
- {fraudcrawler-0.6.0.dist-info → fraudcrawler-0.6.2.dist-info}/entry_points.txt +0 -0
- {fraudcrawler-0.6.0.dist-info → fraudcrawler-0.6.2.dist-info/licenses}/LICENSE +0 -0
fraudcrawler/base/base.py
CHANGED

@@ -140,6 +140,8 @@ class ProductItem(BaseModel):
     url_resolved: str
     search_engine_name: str
     domain: str
+    exact_search: bool = False
+    exact_search_match: bool = False

     # Context parameters
     product_name: str | None = None
@@ -217,6 +219,14 @@ class DomainUtils:
     """

     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
+    _headers = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Accept-Encoding": "gzip, deflate",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+    }

     def _get_domain(self, url: str) -> str:
         """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
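The two new `ProductItem` booleans back the exact-search feature introduced in orchestrator.py below, and the `_headers` block moves the browser-identifying headers up from `Toppreise` into the shared `DomainUtils` base (the matching removal appears in search.py). For orientation, this is how the `_hostname_pattern` these headers sit next to behaves; a minimal sketch, not package code:

```python
import re

# The hostname pattern declared on DomainUtils, applied to two sample URLs.
_hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"

for url in ("https://www.digitec.ch/de/s1/product?id=1", "toppreise.ch/preisvergleich/phones"):
    match = re.match(_hostname_pattern, url)
    print(match.group(1) if match else None)
# -> www.digitec.ch
# -> toppreise.ch
```

`_get_domain` then reduces the captured hostname to the registered domain (e.g. `digitec.ch`), per its docstring.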
fraudcrawler/base/orchestrator.py
CHANGED

@@ -3,10 +3,12 @@ import asyncio
 import logging
 from typing import cast, Dict, List, Self

-from bs4 import BeautifulSoup
 import httpx
+import re

 from fraudcrawler.settings import (
+    EXACT_MATCH_PRODUCT_FIELDS,
+    EXACT_MATCH_FIELD_SEPARATOR,
     PROCESSOR_DEFAULT_MODEL,
 )
 from fraudcrawler.settings import (
@@ -27,8 +29,8 @@ from fraudcrawler import (
     Searcher,
     SearchEngineName,
     Enricher,
-    URLCollector,
     ZyteAPI,
+    URLCollector,
     Processor,
 )

@@ -227,44 +229,29 @@ class Orchestrator(ABC):

         if not product.filtered:
             try:
-                # Fetch the product context from Zyte API
+                # Fetch and enrich the product context from Zyte API
                 details = await self._zyteapi.details(url=product.url)
-
-
-                product.url_resolved = url_resolved
-                product.product_name = self._zyteapi.extract_product_name(
-                    details=details
+                product = self._zyteapi.enrich_context(
+                    product=product, details=details
                 )

-                # If the resolved URL is different from the original URL, we also need to update the domain as
-                # otherwise the unresolved domain will be shown.
-                # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
-                if url_resolved and url_resolved != product.url:
-                    logger.debug(
-                        f"URL resolved for {product.url} is {url_resolved}"
-                    )
-                    product.domain = self._searcher._get_domain(url_resolved)
-
-                product.product_price = self._zyteapi.extract_product_price(
-                    details=details
-                )
-                product.product_description = (
-                    self._zyteapi.extract_product_description(details=details)
-                )
-                product.product_images = self._zyteapi.extract_image_urls(
-                    details=details
-                )
-                product.probability = self._zyteapi.extract_probability(
-                    details=details
-                )
-                product.html = self._zyteapi.extract_html(details=details)
-                if product.html:
-                    soup = BeautifulSoup(product.html, "html.parser")
-                    product.html_clean = soup.get_text(separator=" ", strip=True)
                 # Filter the product based on the probability threshold
                 if not self._zyteapi.keep_product(details=details):
                     product.filtered = True
-                    product.filtered_at_stage =
+                    product.filtered_at_stage = (
+                        "Context (Zyte probability threshold)"
+                    )
+
+                # Check for exact match inside the full product context
+                product = self._check_exact_search(product=product)
+                if (
+                    not product.filtered
+                    and product.exact_search
+                    and not product.exact_search_match
+                ):
+                    product.filtered = True
+                    product.filtered_at_stage = "Context (exact search)"
+
             except Exception as e:
                 logger.warning(f"Error executing Zyte API search: {e}.")
             await queue_out.put(product)
@@ -502,6 +489,80 @@ class Orchestrator(ABC):
             **common_kwargs,  # type: ignore[arg-type]
         )

+    @staticmethod
+    def _is_exact_search(search_term: str) -> bool:
+        """Check if the search term is an exact search (contains double quotation marks).
+
+        Args:
+            search_term: The search term to check.
+
+        Returns:
+            True if the search term contains double quotation marks, False otherwise.
+        """
+        return '"' in search_term
+
+    @staticmethod
+    def _extract_exact_search_terms(search_term: str) -> list[str]:
+        """Extract all exact search terms from within double quotation marks.
+
+        Args:
+            search_term: The search term that may contain double quotation marks.
+
+        Returns:
+            A list of extracted search terms without quotes, or empty list if no quotes found.
+        """
+        # Find all double-quoted strings
+        double_quote_matches = re.findall(r'"([^"]*)"', search_term)
+        return double_quote_matches
+
+    @staticmethod
+    def _check_exact_search_terms_match(
+        product: ProductItem,
+        exact_search_terms: list[str],
+    ) -> bool:
+        """Check if the product, represented by a string of selected attributes, matches ALL of the exact search terms.
+
+        Args:
+            product: The product item.
+            exact_search_terms: List of exact search terms to match against.
+        """
+        field_values = [
+            str(val)
+            for fld in EXACT_MATCH_PRODUCT_FIELDS
+            if (val := getattr(product, fld, None)) is not None
+        ]
+        product_str_lower = EXACT_MATCH_FIELD_SEPARATOR.join(field_values).lower()
+
+        return all(
+            re.search(re.escape(est.lower()), product_str_lower)
+            for est in exact_search_terms
+        )
+
+    def _check_exact_search(self, product: ProductItem) -> ProductItem:
+        """Checks if the search term requests an exact search and if yes, checks for conformity."""
+        # Check for exact search and apply regex matching
+        exact_search = self._is_exact_search(product.search_term)
+        product.exact_search = exact_search
+
+        # Only set exact_search_match if this was an exact search (contains quotes)
+        if exact_search:
+            exact_search_terms = self._extract_exact_search_terms(product.search_term)
+            if exact_search_terms:
+                product.exact_search_match = self._check_exact_search_terms_match(
+                    product=product, exact_search_terms=exact_search_terms
+                )
+                logger.debug(
+                    f"Exact search terms {exact_search_terms} matched: {product.exact_search_match} "
+                    f"for offer with url={product.url}"
+                )
+            else:
+                logger.warning(
+                    f"is_exact_search=True but no exact search terms found in search_term='{product.search_term}' "
+                    f"for offer with url={product.url}"
+                )
+        # If exact_search is False, product.exact_search_match remains False (default value)
+        return product
+
     async def run(
         self,
         search_term: str,
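Taken together, these helpers implement quote-driven exact search: a term containing double quotes marks the search as exact, every quoted substring must then occur (case-insensitively) in the product's concatenated context fields, and non-matching products are filtered at the "Context (exact search)" stage. A minimal, self-contained sketch of that matching logic, using stub data rather than the real `ProductItem`:

```python
import re

EXACT_MATCH_FIELD_SEPARATOR = "\n"  # value added in settings.py below

def extract_exact_search_terms(search_term: str) -> list[str]:
    # Every double-quoted substring is an exact term (see _extract_exact_search_terms).
    return re.findall(r'"([^"]*)"', search_term)

def matches_all(terms: list[str], field_values: list[str]) -> bool:
    # Case-insensitive literal match of every term against the joined fields,
    # mirroring _check_exact_search_terms_match.
    haystack = EXACT_MATCH_FIELD_SEPARATOR.join(field_values).lower()
    return all(re.search(re.escape(t.lower()), haystack) for t in terms)

terms = extract_exact_search_terms('"mx master 3s" wireless "logitech"')
print(terms)  # ['mx master 3s', 'logitech']
print(matches_all(terms, ["Logitech MX Master 3S", "Wireless mouse, graphite"]))  # True
```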
fraudcrawler/scraping/search.py
CHANGED

@@ -8,7 +8,7 @@ from urllib.parse import quote_plus
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 import httpx
-from tenacity import AsyncRetrying, RetryCallState
+from tenacity import RetryCallState

 from fraudcrawler.settings import (
     SEARCH_DEFAULT_COUNTRY_CODES,
@@ -45,6 +45,14 @@ class SearchEngine(ABC, DomainUtils):

     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"

+    def __init__(self, http_client: httpx.AsyncClient):
+        """Initializes the SearchEngine with the given HTTP client.
+
+        Args:
+            http_client: An httpx.AsyncClient to use for the async requests.
+        """
+        self._http_client = http_client
+
     @property
     @abstractmethod
     def _search_engine_name(self) -> str:
@@ -56,45 +64,81 @@ class SearchEngine(ABC, DomainUtils):
         """Apply the search with the given parameters and return results."""
         pass

+    def _create_search_result(self, url: str) -> SearchResult:
+        """From a given url it creates the class:`SearchResult` instance."""
+        # Get marketplace name
+        domain = self._get_domain(url=url)
+
+        # Create and return the SearchResult object
+        result = SearchResult(
+            url=url,
+            domain=domain,
+            search_engine_name=self._search_engine_name,
+        )
+        return result
+
     @classmethod
     def _log_before(
-        cls,
+        cls, url: str, params: dict | None, retry_state: RetryCallState | None
     ) -> None:
-        """Context aware logging before
+        """Context aware logging before HTTP request is made."""
         if retry_state:
             logger.debug(
-                f'Performing
-                f"(attempt {retry_state.attempt_number})."
+                f'Performing HTTP request in {cls.__name__} to url="{url}" '
+                f"with params={params} (attempt {retry_state.attempt_number})."
             )
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before.")

     @classmethod
     def _log_before_sleep(
-        cls,
+        cls, url: str, params: dict | None, retry_state: RetryCallState | None
     ) -> None:
-        """Context aware logging before sleeping after a failed request."""
+        """Context aware logging before sleeping after a failed HTTP request."""
         if retry_state and retry_state.outcome:
             logger.warning(
-                f
+                f"Attempt {retry_state.attempt_number} of {cls.__name__} HTTP request "
+                f'to url="{url}" with params="{params}" '
                 f"failed with error: {retry_state.outcome.exception()}. "
                 f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
             )
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")

-    def
-
-
-
+    async def http_client_get(
+        self, url: str, params: dict | None = None, headers: dict | None = None
+    ) -> httpx.Response:
+        """Performs a GET request with retries.

-
-
-        url
-
-
+        Args:
+            retry: The retry strategy to use.
+            url: The URL to request.
+            params: Query parameters for the request.
+            headers: HTTP headers to use for the request.
+        """
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            url=url, params=params, retry_state=retry_state
         )
-
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            url=url, params=params, retry_state=retry_state
+        )
+
+        async for attempt in retry:
+            with attempt:
+                response = await self._http_client.get(
+                    url=url,
+                    params=params,
+                    headers=headers,
+                )
+                response.raise_for_status()
+                return response
+
+        # In case of not entering the for loop (for some strange reason)
+        raise RuntimeError("Retry exhausted without success")


 class SerpAPI(SearchEngine):
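The `async for attempt in retry:` / `with attempt:` shape is tenacity's documented pattern for async retrying; `get_async_retry()` (from `fraudcrawler.base.retry`, not shown in this diff) returns the `AsyncRetrying` object whose `before`/`before_sleep` hooks get rebound for context-aware logging. A standalone sketch of the same pattern, with an assumed retry policy since the package's actual one is not shown here:

```python
import asyncio

import httpx
from tenacity import AsyncRetrying, stop_after_attempt, wait_exponential

async def fetch(url: str) -> httpx.Response:
    # Assumed policy: 3 attempts, exponential backoff, re-raise the last error.
    retry = AsyncRetrying(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10), reraise=True)
    retry.before = lambda rs: print(f"attempt {rs.attempt_number} for {url}")
    async with httpx.AsyncClient() as client:
        async for attempt in retry:
            with attempt:
                response = await client.get(url)
                response.raise_for_status()  # a 4xx/5xx response triggers a retry
                return response
    raise RuntimeError("Retry exhausted without success")

# asyncio.run(fetch("https://example.com"))
```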
@@ -109,7 +153,7 @@ class SerpAPI(SearchEngine):
             http_client: An httpx.AsyncClient to use for the async requests.
             api_key: The API key for SerpAPI.
         """
-
+        super().__init__(http_client=http_client)
         self._api_key = api_key

     @property
@@ -205,22 +249,10 @@ class SerpAPI(SearchEngine):
         }
         logger.debug(f"SerpAPI search with params: {params}")

-        # Perform the request
-
-
-        retry = get_async_retry()
-        retry.before = lambda retry_state: self._log_before(
-            search_string=search_string, retry_state=retry_state
-        )
-        retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            search_string=search_string, retry_state=retry_state
+        # Perform the search request
+        response: httpx.Response = await self.http_client_get(
+            url=self._endpoint, params=params
         )
-        async for attempt in retry:
-            with attempt:
-                response = await self._http_client.get(
-                    url=self._endpoint, params=params
-                )
-                response.raise_for_status()

         # Extract the URLs from the response
         data = response.json()
@@ -336,7 +368,21 @@ class SerpAPIGoogleShopping(SerpAPI):
         """
         results = data.get("shopping_results")
         if results is not None:
-            return [url for res in results if (url := res.get("product_link"))]
+            # return [url for res in results if (url := res.get("product_link"))]  # c.f. https://github.com/serpapi/public-roadmap/issues/3045
+            return [
+                url
+                for res in results
+                if (url := res.get("serpapi_immersive_product_api"))
+            ]
+        return []
+
+    @staticmethod
+    def _extract_product_urls_from_immersive_product_api(data: dict) -> List[str]:
+        """Extracts product urls from the serpapi immersive product API data."""
+        if results := data.get("product_results"):
+            stores = results.get("stores", [])
+            urls = [url for sre in stores if (url := sre.get("link"))]
+            return list(set(urls))
         return []

     async def search(
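Per the linked SerpAPI roadmap issue, Google Shopping results no longer carry a direct `product_link`, so the searcher now collects `serpapi_immersive_product_api` URLs and resolves merchant links in a second request. The extractor above reads a payload of roughly this shape (reduced to the keys actually used; the real response carries many more fields):

```python
# Assumed, reduced payload shape for the immersive product API response.
data = {
    "product_results": {
        "stores": [
            {"name": "Shop A", "link": "https://shop-a.example/p/1"},
            {"name": "Shop B", "link": "https://shop-b.example/p/1"},
            {"name": "No-link store"},  # skipped: no "link" key
        ]
    }
}

stores = data.get("product_results", {}).get("stores", [])
urls = list({url for store in stores if (url := store.get("link"))})
print(urls)  # two deduplicated merchant links, one SearchResult each downstream
```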
@@ -349,6 +395,9 @@ class SerpAPIGoogleShopping(SerpAPI):
     ) -> List[SearchResult]:
         """Performs a google shopping search using SerpApi and returns SearchResults.

+        Similar to Toppreise, this method extracts merchant URLs from Google Shopping product pages
+        and creates multiple SearchResult objects for each merchant URL found.
+
         Args:
             search_term: The search term to use for the query.
             language: The language to use for the query ('hl' parameter).
@@ -362,7 +411,7 @@
             marketplaces=marketplaces,
         )

-        # Perform the search
+        # Perform the search to get Google Shopping URLs
         urls = await self._search(
             search_string=search_string,
             language=language,
@@ -375,10 +424,10 @@
         # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
         urls = urls[:num_results]

-        # Create
+        # Create SearchResult objects from merchant URLs (similar to Toppreise pattern)
         results = [self._create_search_result(url=url) for url in urls]
         logger.debug(
-            f'Produced {len(results)} results from
+            f'Produced {len(results)} results from Google Shopping search with q="{search_string}".'
         )
         return results

@@ -387,14 +436,6 @@ class Toppreise(SearchEngine):
     """Search engine for toppreise.ch."""

     _endpoint = "https://www.toppreise.ch/"
-    _headers = {
-        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-        "Accept-Language": "en-US,en;q=0.5",
-        "Accept-Encoding": "gzip, deflate",
-        "Connection": "keep-alive",
-        "Upgrade-Insecure-Requests": "1",
-    }

     def __init__(self, http_client: httpx.AsyncClient, zyteapi_key: str):
         """Initializes the Toppreise client.
@@ -403,9 +444,42 @@
             http_client: An httpx.AsyncClient to use for the async requests.
             zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
-
+        super().__init__(http_client=http_client)
         self._zyteapi = ZyteAPI(http_client=http_client, api_key=zyteapi_key)

+    async def http_client_get_with_fallback(self, url: str) -> bytes:
+        """Performs a GET request with retries.
+
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
+        Args:
+            url: The URL to request.
+        """
+        # Try to access the URL directly
+        try:
+            response: httpx.Response = await self.http_client_get(
+                url=url, headers=self._headers
+            )
+            content = response.content
+
+        # If we get a 403 Error (can happen depending on IP/location of deployment),
+        # we try to unblock the URL using Zyte proxy mode
+        except httpx.HTTPStatusError as err_direct:
+            if err_direct.response.status_code == 403:
+                logger.warning(
+                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
+                )
+                try:
+                    content = await self._zyteapi.unblock_url_content(url)
+                except Exception as err_resolve:
+                    msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
+                    logger.error(msg)
+                    raise httpx.HTTPError(msg) from err_resolve
+            else:
+                raise err_direct
+        return content
+
     @classmethod
     def _get_search_endpoint(cls, language: Language) -> str:
         """Get the search endpoint based on the language."""
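The fallback logic itself is unchanged from 0.6.0 (the old copy is removed further down); what changed is that the retry strategy is no longer threaded through as a parameter. The control flow, as a minimal standalone sketch (`fetch_via_proxy` stands in for `ZyteAPI.unblock_url_content`):

```python
import httpx

async def get_with_fallback(client: httpx.AsyncClient, url: str, fetch_via_proxy) -> bytes:
    try:
        response = await client.get(url)
        response.raise_for_status()  # raises httpx.HTTPStatusError on 4xx/5xx
        return response.content
    except httpx.HTTPStatusError as err:
        if err.response.status_code == 403:
            # Blocked by the site: route the same URL through the unblocking proxy.
            return await fetch_via_proxy(url)
        raise  # any other status is not handled here
```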
@@ -502,46 +576,6 @@
         """The name of the search engine."""
         return SearchEngineName.TOPPREISE.value

-    async def http_client_get_with_fallback(
-        self, url: str, retry: AsyncRetrying
-    ) -> bytes:
-        """Performs a GET request with retries.
-
-        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
-        content using Zyte proxy mode.
-
-        Args:
-            url: The URL to request.
-            retry: The retry strategy to use.
-        """
-        # Try to access the URL directly
-        try:
-            async for attempt in retry:
-                with attempt:
-                    response = await self._http_client.get(
-                        url=url,
-                        headers=self._headers,
-                    )
-                    response.raise_for_status()
-                    content = response.content
-
-        # If we get a 403 Error (can happen depending on IP/location of deployment),
-        # we try to unblock the URL using Zyte proxy mode
-        except httpx.HTTPStatusError as err_direct:
-            if err_direct.response.status_code == 403:
-                logger.warning(
-                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
-                )
-                try:
-                    content = await self._zyteapi.unblock_url_content(url)
-                except Exception as err_resolve:
-                    msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
-                    logger.error(msg)
-                    raise httpx.HTTPError(msg) from err_resolve
-            else:
-                raise err_direct
-        return content
-
     async def _search(
         self, search_string: str, language: Language, num_results: int
     ) -> List[str]:
@@ -561,17 +595,8 @@
         url = f"{endpoint}?q={encoded_search}"
         logger.debug(f"Toppreise search URL: {url}")

-        # Perform the request
-
-        # - `before_sleep`: if the request fails before sleeping
-        retry = get_async_retry()
-        retry.before = lambda retry_state: self._log_before(
-            search_string=search_string, retry_state=retry_state
-        )
-        retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            search_string=search_string, retry_state=retry_state
-        )
-        content = await self.http_client_get_with_fallback(url=url, retry=retry)
+        # Perform the request with fallback if necessary
+        content = await self.http_client_get_with_fallback(url=url)

         # Get external product urls from the content
         urls = self._extract_product_urls_from_search_page(content=content)
@@ -633,61 +658,44 @@ class Searcher(DomainUtils):
             zyteapi_key=zyteapi_key,
         )

-
-
-        """Context aware logging before the request is made."""
-        if retry_state:
-            logger.debug(
-                f'Performing post search for url="{url}" '
-                f"(attempt {retry_state.attempt_number})."
-            )
-        else:
-            logger.debug(f"retry_state is {retry_state}; not logging before.")
+    async def _post_search_google_shopping_immersive(self, url: str) -> List[str]:
+        """Post-search for product URLs from a Google Shopping immersive product page.

-
-
-
-
-        ""
-
-
-
-
-
-
-
-
+        Args:
+            url: The URL of the Google Shopping product page.
+        """
+        # Add SerpAPI key to the url
+        sep = "&" if "?" in url else "?"
+        url = f"{url}{sep}api_key={self._google_shopping._api_key}"
+
+        # Fetch the content of the Google Shopping product page
+        response = await self._google_shopping.http_client_get(url=url)
+
+        # Get external product urls from the data
+        data = response.json()
+        urls = self._google_shopping._extract_product_urls_from_immersive_product_api(
+            data=data
+        )
+        return urls

     async def _post_search_toppreise_comparison(self, url: str) -> List[str]:
         """Post-search for product URLs from a Toppreise product comparison page.

         Note:
             In comparison to the function Toppreise._search, here we extract the urls from
-            product comparison pages (f.e. https://www.toppreise.ch/preisvergleich/).
-            also be found in the results of a google search.
+            product comparison pages (f.e. https://www.toppreise.ch/preisvergleich/). These
+            pages can also be found in the results of a google search.

         Args:
             url: The URL of the Toppreise product listing page.
         """
-        # Perform the request
-
-        # - `before_sleep`: if the request fails before sleeping
-        retry = get_async_retry(stop_after=self._post_search_retry_stop_after)
-        retry.before = lambda retry_state: self._post_search_log_before(
-            url=url, retry_state=retry_state
-        )
-        retry.before_sleep = lambda retry_state: self._post_search_log_before_sleep(
-            url=url, retry_state=retry_state
-        )
-        content = await self._toppreise.http_client_get_with_fallback(
-            url=url, retry=retry
-        )
+        # Perform the request with fallback if necessary
+        content = await self._toppreise.http_client_get_with_fallback(url=url)

         # Get external product urls from the content
         urls = self._toppreise._extract_product_urls_from_comparison_page(
             content=content
         )
-
         return urls

     async def _post_search(self, results: List[SearchResult]) -> List[SearchResult]:
@@ -703,9 +711,22 @@ class Searcher(DomainUtils):
         post_search_results: List[SearchResult] = []
         for res in results:
             url = res.url
+            post_search_urls: List[str] = []
+
+            # Extract embedded product URLs from the Google Shopping immersive product page
+            if "engine=google_immersive_product" in url:
+                logger.debug(
+                    f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                )
+                post_search_urls = await self._post_search_google_shopping_immersive(
+                    url=url
+                )
+                logger.debug(
+                    f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                )

             # Extract embedded product URLs from the Toppreise product listing page
-            if any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
+            elif any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
                 logger.debug(
                     f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
                 )
@@ -714,15 +735,16 @@
                 f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
             )

-
-
-
-
-
-
-
-
-
+            # Add the extracted product URLs as SearchResult objects
+            psr = [
+                SearchResult(
+                    url=psu,
+                    domain=self._get_domain(url=psu),
+                    search_engine_name=res.search_engine_name,
+                )
+                for psu in post_search_urls
+            ]
+            post_search_results.extend(psr)

         return post_search_results
fraudcrawler/scraping/zyte.py
CHANGED

@@ -2,11 +2,12 @@ from base64 import b64decode
 import logging
 from typing import List

+from bs4 import BeautifulSoup
 import httpx
 from tenacity import RetryCallState

 from fraudcrawler.settings import ZYTE_DEFALUT_PROBABILITY_THRESHOLD
-from fraudcrawler.base.base import DomainUtils
+from fraudcrawler.base.base import DomainUtils, ProductItem
 from fraudcrawler.base.retry import get_async_retry

 logger = logging.getLogger(__name__)
@@ -61,77 +62,8 @@ class ZyteAPI(DomainUtils):
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")

-    async def details(self, url: str) -> dict:
-        """Fetches product details for a single URL.
-
-        Args:
-            url: The URL to fetch product details from.
-
-        Returns:
-            A dictionary containing the product details, fields include:
-            (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
-            {
-                "url": str,
-                "statusCode": str,
-                "product": {
-                    "name": str,
-                    "price": str,
-                    "mainImage": {"url": str},
-                    "images": [{"url": str}],
-                    "description": str,
-                    "metadata": {
-                        "probability": float,
-                    },
-                },
-                "httpResponseBody": base64
-            }
-        """
-        logger.info(f"Fetching product details by Zyte for URL {url}.")
-
-        # Perform the request and retry if necessary. There is some context aware logging:
-        # - `before`: before the request is made (and before retrying)
-        # - `before_sleep`: if the request fails before sleeping
-        retry = get_async_retry()
-        retry.before = lambda retry_state: self._log_before(
-            url=url, retry_state=retry_state
-        )
-        retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            url=url, retry_state=retry_state
-        )
-        async for attempt in retry:
-            with attempt:
-                response = await self._http_client.post(
-                    url=self._endpoint,
-                    json={"url": url, **self._config},
-                    auth=(self._api_key, ""),  # API key as username, empty password
-                )
-                response.raise_for_status()
-
-        details = response.json()
-        return details
-
-    @staticmethod
-    def keep_product(
-        details: dict,
-        threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
-    ) -> bool:
-        """Determines whether to keep the product based on the probability threshold.
-
-        Args:
-            details: A product details data dictionary.
-            threshold: The probability threshold used to filter the products.
-        """
-        try:
-            prob = float(details["product"]["metadata"]["probability"])
-        except KeyError:
-            logger.warning(
-                f"Product with url={details.get('url')} has no probability value - product is ignored"
-            )
-            return False
-        return prob > threshold
-
     @staticmethod
-    def extract_product_name(details: dict) -> str | None:
+    def _extract_product_name(details: dict) -> str | None:
         """Extracts the product name from the product data.

         The input argument is a dictionary of the following structure:
@@ -144,7 +76,7 @@ class ZyteAPI(DomainUtils):
         return details.get("product", {}).get("name")

     @staticmethod
-    def extract_url_resolved(details: dict) -> str | None:
+    def _extract_url_resolved(details: dict) -> str | None:
         """Extracts the resolved URL from the product data - this is automatically resolved by Zyte.

         The input argument is a dictionary of the following structure:
@@ -157,7 +89,7 @@ class ZyteAPI(DomainUtils):
         return details.get("product", {}).get("url")

     @staticmethod
-    def extract_product_price(details: dict) -> str | None:
+    def _extract_product_price(details: dict) -> str | None:
         """Extracts the product price from the product data.

         The input argument is a dictionary of the following structure:
@@ -170,7 +102,7 @@ class ZyteAPI(DomainUtils):
         return details.get("product", {}).get("price")

     @staticmethod
-    def extract_product_description(details: dict) -> str | None:
+    def _extract_product_description(details: dict) -> str | None:
         """Extracts the product description from the product data.

         The input argument is a dictionary of the following structure:
@@ -183,7 +115,7 @@ class ZyteAPI(DomainUtils):
         return details.get("product", {}).get("description")

     @staticmethod
-    def extract_image_urls(details: dict) -> List[str]:
+    def _extract_image_urls(details: dict) -> List[str]:
         """Extracts the images from the product data.

         The input argument is a dictionary of the following structure:
@@ -206,7 +138,7 @@ class ZyteAPI(DomainUtils):
         return images

     @staticmethod
-    def extract_probability(details: dict) -> float:
+    def _extract_probability(details: dict) -> float:
         """Extracts the probability from the product data.

         The input argument is a dictionary of the following structure:
@@ -223,7 +155,7 @@ class ZyteAPI(DomainUtils):
         )

     @staticmethod
-    def extract_html(details: dict) -> str | None:
+    def _extract_html(details: dict) -> str | None:
         """Extracts the HTML from the Zyte API response.

         The input argument is a dictionary of the following structure:
@@ -243,6 +175,51 @@ class ZyteAPI(DomainUtils):
             return decoded_string
         return None

+    def enrich_context(self, product: ProductItem, details: dict) -> ProductItem:
+        product.product_name = self._extract_product_name(details=details)
+
+        url_resolved = self._extract_url_resolved(details=details)
+        if url_resolved:
+            product.url_resolved = url_resolved
+
+        # If the resolved URL is different from the original URL, we also need to update the domain as
+        # otherwise the unresolved domain will be shown.
+        # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
+        if url_resolved and url_resolved != product.url:
+            logger.debug(f"URL resolved for {product.url} is {url_resolved}")
+            product.domain = self._get_domain(url=url_resolved)
+
+        product.product_price = self._extract_product_price(details=details)
+        product.product_description = self._extract_product_description(details=details)
+        product.product_images = self._extract_image_urls(details=details)
+        product.probability = self._extract_probability(details=details)
+        product.html = self._extract_html(details=details)
+        if product.html:
+            soup = BeautifulSoup(product.html, "html.parser")
+            product.html_clean = soup.get_text(separator=" ", strip=True)
+
+        return product
+
+    @staticmethod
+    def keep_product(
+        details: dict,
+        threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
+    ) -> bool:
+        """Determines whether to keep the product based on the probability threshold.
+
+        Args:
+            details: A product details data dictionary.
+            threshold: The probability threshold used to filter the products.
+        """
+        try:
+            prob = float(details["product"]["metadata"]["probability"])
+        except KeyError:
+            logger.warning(
+                f"Product with url={details.get('url')} has no probability value - product is ignored"
+            )
+            return False
+        return prob > threshold
+
     async def unblock_url_content(self, url: str) -> bytes:
         """Unblock the content of an URL using Zyte proxy mode.

@@ -256,3 +233,52 @@ class ZyteAPI(DomainUtils):
             raise httpx.HTTPError("No httpResponseBody in Zyte response")

         return b64decode(details["httpResponseBody"])
+
+    async def details(self, url: str) -> dict:
+        """Fetches product details for a single URL.
+
+        Args:
+            url: The URL to fetch product details from.
+
+        Returns:
+            A dictionary containing the product details, fields include:
+            (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
+            {
+                "url": str,
+                "statusCode": str,
+                "product": {
+                    "name": str,
+                    "price": str,
+                    "mainImage": {"url": str},
+                    "images": [{"url": str}],
+                    "description": str,
+                    "metadata": {
+                        "probability": float,
+                    },
+                },
+                "httpResponseBody": base64
+            }
+        """
+        logger.info(f"Fetching product details by Zyte for URL {url}.")
+
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            url=url, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            url=url, retry_state=retry_state
+        )
+        async for attempt in retry:
+            with attempt:
+                response = await self._http_client.post(
+                    url=self._endpoint,
+                    json={"url": url, **self._config},
+                    auth=(self._api_key, ""),  # API key as username, empty password
+                )
+                response.raise_for_status()
+
+        details = response.json()
+        return details
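`enrich_context` now centralizes in `ZyteAPI` what the orchestrator previously assembled field by field, including the `html_clean` derivation. That last step is plain BeautifulSoup text extraction:

```python
from bs4 import BeautifulSoup

# How enrich_context derives html_clean from the raw page HTML (sample markup).
html = "<html><body><h1>Acme Widget</h1><p>CHF 19.90 <b>in stock</b></p></body></html>"
soup = BeautifulSoup(html, "html.parser")
print(soup.get_text(separator=" ", strip=True))
# -> Acme Widget CHF 19.90 in stock
```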
fraudcrawler/settings.py
CHANGED

@@ -78,6 +78,14 @@ ENRICHMENT_DEFAULT_LIMIT = 10
 # Zyte settings
 ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1

+# Exact match settings
+EXACT_MATCH_PRODUCT_FIELDS = {
+    "url_resolvedproduct_name",
+    "product_description",
+    "html",
+}
+EXACT_MATCH_FIELD_SEPARATOR = "\n"
+
 # Processor settings
 PROCESSOR_DEFAULT_MODEL = "gpt-4o"
 PROCESSOR_DEFAULT_IF_MISSING = -1
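These two constants drive the exact-search matching in orchestrator.py above: the `ProductItem` fields listed in `EXACT_MATCH_PRODUCT_FIELDS` are joined with `EXACT_MATCH_FIELD_SEPARATOR` before the quoted terms are searched (via `getattr` with a `None` default, so a field name that does not exist on the model is silently skipped). Joining with a newline keeps a quoted term from accidentally matching across a field boundary; a small sketch:

```python
import re

EXACT_MATCH_FIELD_SEPARATOR = "\n"

fields = ["Logitech MX Master", "3S wireless mouse"]
haystack = EXACT_MATCH_FIELD_SEPARATOR.join(fields).lower()

print(bool(re.search(re.escape("master 3s"), haystack)))       # False: would straddle two fields
print(bool(re.search(re.escape("wireless mouse"), haystack)))  # True: contained in one field
```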
{fraudcrawler-0.6.0.dist-info → fraudcrawler-0.6.2.dist-info}/METADATA
CHANGED

@@ -1,9 +1,9 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: fraudcrawler
-Version: 0.6.0
+Version: 0.6.2
 Summary: Intelligent Market Monitoring
-Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
+License-File: LICENSE
 Author: Domingo Bertus
 Author-email: hello@veanu.ch
 Requires-Python: >=3.11,<4.0
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
 Requires-Dist: httpx (>=0.28.1,<0.29.0)
 Requires-Dist: openai (>=1.68.2,<2.0.0)
{fraudcrawler-0.6.0.dist-info → fraudcrawler-0.6.2.dist-info}/RECORD
CHANGED

@@ -1,22 +1,22 @@
 fraudcrawler/__init__.py,sha256=oSwuiyVBBk_HZfeZxXJR0ELtA4mc-upsBMVHSwuokEo,846
 fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fraudcrawler/base/base.py,sha256=
+fraudcrawler/base/base.py,sha256=mTmojNyVrPEB69-aI-43dl0Jct174G4ziBiOudDFfTY,7795
 fraudcrawler/base/client.py,sha256=obxrd65pYja--XQbgpIMsMO6erMNdRG68SzNUs_YvLM,5856
 fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
 fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
-fraudcrawler/base/orchestrator.py,sha256=
+fraudcrawler/base/orchestrator.py,sha256=TiLKAJTBIPf0dxJuyZnCGIMWReC9gNvmEXqWwE0Ykbs,29002
 fraudcrawler/base/retry.py,sha256=1Ox7RsnnF62dP53rkidRHetA5mr2HS1R-7FskCVbwug,1178
-fraudcrawler/launch_demo_pipeline.py,sha256=
+fraudcrawler/launch_demo_pipeline.py,sha256=_aDqaPdxE_DMwQY5_vpqF2YjwLkWIZq5Z9Tz3sqLKdg,4629
 fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fraudcrawler/processing/processor.py,sha256=zetp_G5g4z8sBUq-5qOxVRF2W2h9FIwolVxvMqhTmXs,7619
 fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
-fraudcrawler/scraping/search.py,sha256=
+fraudcrawler/scraping/search.py,sha256=Anm8ymjCH3BVttogHY-_03YRc64yJswJ8OP8DW56O48,34546
 fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
-fraudcrawler/scraping/zyte.py,sha256=
-fraudcrawler/settings.py,sha256=
-fraudcrawler-0.6.
-fraudcrawler-0.6.
-fraudcrawler-0.6.
-fraudcrawler-0.6.
-fraudcrawler-0.6.
+fraudcrawler/scraping/zyte.py,sha256=sYpfwMuGE9MYpKvma_8x5Th2VBFn25Mqb4Wd7UChL_g,10215
+fraudcrawler/settings.py,sha256=9ukAkxEzDtvy3xA-jSF3asr9uLIAATNQ-FqrsgCEDUk,4038
+fraudcrawler-0.6.2.dist-info/METADATA,sha256=5hzWjCm1eQJ19Pm3vxUsS_EciUmbuppEpECi8ye2Wyw,6723
+fraudcrawler-0.6.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+fraudcrawler-0.6.2.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+fraudcrawler-0.6.2.dist-info/licenses/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+fraudcrawler-0.6.2.dist-info/RECORD,,

{fraudcrawler-0.6.0.dist-info → fraudcrawler-0.6.2.dist-info}/entry_points.txt
File without changes

{fraudcrawler-0.6.0.dist-info → fraudcrawler-0.6.2.dist-info/licenses}/LICENSE
File without changes