fraudcrawler 0.5.0__py3-none-any.whl → 0.7.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fraudcrawler/__init__.py +21 -5
- fraudcrawler/base/base.py +18 -38
- fraudcrawler/base/client.py +57 -60
- fraudcrawler/base/orchestrator.py +277 -276
- fraudcrawler/base/retry.py +25 -11
- fraudcrawler/launch_demo_pipeline.py +103 -41
- fraudcrawler/processing/base.py +151 -0
- fraudcrawler/processing/openai.py +521 -0
- fraudcrawler/scraping/enrich.py +6 -4
- fraudcrawler/scraping/search.py +370 -110
- fraudcrawler/scraping/url.py +42 -3
- fraudcrawler/scraping/zyte.py +146 -80
- fraudcrawler/settings.py +22 -10
- fraudcrawler-0.7.26.dist-info/METADATA +173 -0
- fraudcrawler-0.7.26.dist-info/RECORD +23 -0
- fraudcrawler/processing/processor.py +0 -199
- fraudcrawler-0.5.0.dist-info/METADATA +0 -167
- fraudcrawler-0.5.0.dist-info/RECORD +0 -22
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/entry_points.txt +0 -0
fraudcrawler/scraping/search.py
CHANGED
@@ -6,12 +6,18 @@ from typing import Dict, List
 from urllib.parse import quote_plus
 
 from bs4 import BeautifulSoup
+from bs4.element import Tag
 import httpx
 from tenacity import RetryCallState
 
-from fraudcrawler.settings import
+from fraudcrawler.settings import (
+    SEARCH_DEFAULT_COUNTRY_CODES,
+    TOPPREISE_SEARCH_PATHS,
+    TOPPREISE_COMPARISON_PATHS,
+)
 from fraudcrawler.base.base import Host, Language, Location, DomainUtils
 from fraudcrawler.base.retry import get_async_retry
+from fraudcrawler.scraping.zyte import ZyteAPI
 
 logger = logging.getLogger(__name__)
 
@@ -39,6 +45,14 @@ class SearchEngine(ABC, DomainUtils):
 
     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
 
+    def __init__(self, http_client: httpx.AsyncClient):
+        """Initializes the SearchEngine with the given HTTP client.
+
+        Args:
+            http_client: An httpx.AsyncClient to use for the async requests.
+        """
+        self._http_client = http_client
+
     @property
     @abstractmethod
     def _search_engine_name(self) -> str:
@@ -50,45 +64,81 @@ class SearchEngine(ABC, DomainUtils):
         """Apply the search with the given parameters and return results."""
         pass
 
+    def _create_search_result(self, url: str) -> SearchResult:
+        """From a given url it creates the class:`SearchResult` instance."""
+        # Get marketplace name
+        domain = self._get_domain(url=url)
+
+        # Create and return the SearchResult object
+        result = SearchResult(
+            url=url,
+            domain=domain,
+            search_engine_name=self._search_engine_name,
+        )
+        return result
+
     @classmethod
     def _log_before(
-        cls,
+        cls, url: str, params: dict | None, retry_state: RetryCallState | None
     ) -> None:
-        """Context aware logging before
+        """Context aware logging before HTTP request is made."""
         if retry_state:
             logger.debug(
-                f'Performing
-                f"(attempt {retry_state.attempt_number})."
+                f'Performing HTTP request in {cls.__name__} to url="{url}" '
+                f"with params={params} (attempt {retry_state.attempt_number})."
             )
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before.")
 
     @classmethod
     def _log_before_sleep(
-        cls,
+        cls, url: str, params: dict | None, retry_state: RetryCallState | None
     ) -> None:
-        """Context aware logging before sleeping after a failed request."""
+        """Context aware logging before sleeping after a failed HTTP request."""
         if retry_state and retry_state.outcome:
             logger.warning(
-                f
+                f"Attempt {retry_state.attempt_number} of {cls.__name__} HTTP request "
+                f'to url="{url}" with params="{params}" '
                 f"failed with error: {retry_state.outcome.exception()}. "
                 f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
             )
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
 
-    def
-
-
-
+    async def http_client_get(
+        self, url: str, params: dict | None = None, headers: dict | None = None
+    ) -> httpx.Response:
+        """Performs a GET request with retries.
 
-
-
-            url
-
-
+        Args:
+            retry: The retry strategy to use.
+            url: The URL to request.
+            params: Query parameters for the request.
+            headers: HTTP headers to use for the request.
+        """
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            url=url, params=params, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            url=url, params=params, retry_state=retry_state
         )
-
+
+        async for attempt in retry:
+            with attempt:
+                response = await self._http_client.get(
+                    url=url,
+                    params=params,
+                    headers=headers,
+                )
+                response.raise_for_status()
+                return response
+
+        # In case of not entering the for loop (for some strange reason)
+        raise RuntimeError("Retry exhausted without success")
 
 
 class SerpAPI(SearchEngine):
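The new `http_client_get` centralizes the retry loop that was previously duplicated in each engine's `_search`. Below is a minimal, runnable sketch of the same tenacity pattern, assuming `get_async_retry` (from `fraudcrawler.base.retry`, not shown in this diff) returns a `tenacity.AsyncRetrying`; the hook assignment after construction works because `before` and `before_sleep` are plain attributes on the retry controller.

import asyncio
import logging

import httpx
from tenacity import AsyncRetrying, stop_after_attempt, wait_exponential

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def get_async_retry() -> AsyncRetrying:
    # Hypothetical stand-in for the package's factory: 3 attempts, exponential backoff.
    return AsyncRetrying(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))


async def fetch(url: str) -> httpx.Response:
    retry = get_async_retry()
    # Swap in context-aware logging hooks; each receives a RetryCallState.
    retry.before = lambda rs: logger.debug(f"attempt {rs.attempt_number} for {url}")
    retry.before_sleep = lambda rs: logger.warning(
        f"attempt {rs.attempt_number} failed: {rs.outcome.exception()}"
    )
    async with httpx.AsyncClient() as client:
        async for attempt in retry:
            with attempt:
                response = await client.get(url)
                response.raise_for_status()  # HTTP errors are captured and retried
                return response
    # Normally unreachable: tenacity raises RetryError once attempts are exhausted.
    raise RuntimeError("Retry exhausted without success")


# asyncio.run(fetch("https://example.com"))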
@@ -103,7 +153,7 @@ class SerpAPI(SearchEngine):
             http_client: An httpx.AsyncClient to use for the async requests.
             api_key: The API key for SerpAPI.
         """
-
+        super().__init__(http_client=http_client)
         self._api_key = api_key
 
     @property
@@ -131,6 +181,17 @@ class SerpAPI(SearchEngine):
             search_string += " site:" + " OR site:".join(s for s in sites)
         return search_string
 
+    @staticmethod
+    def _get_google_domain(location: Location) -> str:
+        """Gets the Google domain for the given location if they do not use the default pattern google.tld"""
+        if location.name == "Brazil":
+            return "google.com.br"
+        elif location.name == "United Kingdom":
+            return "google.co.uk"
+        elif location.name == "Argentina":
+            return "google.com.ar"
+        return f"google.{location.code}"
+
     async def _search(
         self,
         search_string: str,
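The three special cases cover countries whose Google domain does not follow the `google.<country code>` pattern. A dict-based restatement of the same mapping, assuming `Location` is a simple name/code pair (the real class comes from `fraudcrawler.base.base`):

from dataclasses import dataclass


@dataclass
class Location:
    name: str
    code: str


def get_google_domain(location: Location) -> str:
    # Countries whose Google domain deviates from google.<ccTLD>.
    special = {
        "Brazil": "google.com.br",
        "United Kingdom": "google.co.uk",
        "Argentina": "google.com.ar",
    }
    return special.get(location.name, f"google.{location.code}")


assert get_google_domain(Location("Switzerland", "ch")) == "google.ch"
assert get_google_domain(Location("United Kingdom", "gb")) == "google.co.uk"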
@@ -169,38 +230,29 @@ class SerpAPI(SearchEngine):
             f"num_results={num_results}."
         )
 
-        #
+        # Get Google domain and country code
+        google_domain = self._get_google_domain(location)
+        country_code = location.code
+
         params: Dict[str, str | int] = {
             "engine": engine,
             "q": search_string,
-            "google_domain":
+            "google_domain": google_domain,
             "location_requested": location.name,
             "location_used": location.name,
-            "tbs": f"ctr:{
-            "cr": f"country{
-            "gl":
+            "tbs": f"ctr:{country_code.upper()}",
+            "cr": f"country{country_code.upper()}",
+            "gl": country_code,
             "hl": language.code,
             "num": num_results,
             "api_key": self._api_key,
         }
         logger.debug(f"SerpAPI search with params: {params}")
 
-        # Perform the request
-
-
-        retry = get_async_retry()
-        retry.before = lambda retry_state: self._log_before(
-            search_string=search_string, retry_state=retry_state
+        # Perform the search request
+        response: httpx.Response = await self.http_client_get(
+            url=self._endpoint, params=params
         )
-        retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            search_string=search_string, retry_state=retry_state
-        )
-        async for attempt in retry:
-            with attempt:
-                response = await self._http_client.get(
-                    url=self._endpoint, params=params
-                )
-                response.raise_for_status()
 
         # Extract the URLs from the response
         data = response.json()
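The `tbs=ctr:XX`, `cr=countryXX`, and `gl` parameters all restrict results to the requested country, while `hl` sets the interface language. A sketch of the final query these params produce, assuming the class's `_endpoint` is SerpAPI's public `https://serpapi.com/search` (the attribute itself is outside this hunk) and using sample values:

import httpx

params = {
    "engine": "google",
    "q": "wireless earbuds",
    "google_domain": "google.ch",
    "location_requested": "Switzerland",
    "location_used": "Switzerland",
    "tbs": "ctr:CH",       # country restriction via the tbs parameter
    "cr": "countryCH",     # country restrict parameter
    "gl": "ch",            # geolocation
    "hl": "de",            # interface language
    "num": 10,
    "api_key": "YOUR_SERPAPI_KEY",  # placeholder
}

# httpx merges the params into the query string exactly as the client would send them.
request = httpx.Request("GET", "https://serpapi.com/search", params=params)
print(request.url)  # https://serpapi.com/search?engine=google&q=wireless+earbuds&...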
@@ -316,7 +368,21 @@ class SerpAPIGoogleShopping(SerpAPI):
         """
         results = data.get("shopping_results")
         if results is not None:
-            return [url for res in results if (url := res.get("product_link"))]
+            # return [url for res in results if (url := res.get("product_link"))] # c.f. https://github.com/serpapi/public-roadmap/issues/3045
+            return [
+                url
+                for res in results
+                if (url := res.get("serpapi_immersive_product_api"))
+            ]
+        return []
+
+    @staticmethod
+    def _extract_product_urls_from_immersive_product_api(data: dict) -> List[str]:
+        """Extracts product urls from the serpapi immersive product API data."""
+        if results := data.get("product_results"):
+            stores = results.get("stores", [])
+            urls = [url for sre in stores if (url := sre.get("link"))]
+            return list(set(urls))
         return []
 
     async def search(
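The replacement for `product_link` follows the linked SerpAPI roadmap issue: each shopping result now carries a `serpapi_immersive_product_api` URL, and merchant links are resolved from that endpoint in a second step. A fixture sketch of the payload shape the new helper consumes, limited to the keys it actually reads (the real immersive product API response carries many more fields):

# Fixture with only the keys the helper reads: product_results.stores[*].link
data = {
    "product_results": {
        "stores": [
            {"name": "Shop A", "link": "https://shop-a.example/item/1"},
            {"name": "Shop B", "link": "https://shop-b.example/item/1"},
            {"name": "No link store"},  # skipped: no "link" key
            {"name": "Shop A again", "link": "https://shop-a.example/item/1"},  # deduplicated
        ]
    }
}

stores = data.get("product_results", {}).get("stores", [])
urls = list({url for store in stores if (url := store.get("link"))})
print(urls)  # two unique URLs; order is not guaranteed because of the set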
@@ -329,6 +395,9 @@ class SerpAPIGoogleShopping(SerpAPI):
     ) -> List[SearchResult]:
         """Performs a google shopping search using SerpApi and returns SearchResults.
 
+        Similar to Toppreise, this method extracts merchant URLs from Google Shopping product pages
+        and creates multiple SearchResult objects for each merchant URL found.
+
         Args:
             search_term: The search term to use for the query.
             language: The language to use for the query ('hl' parameter).
@@ -342,7 +411,7 @@ class SerpAPIGoogleShopping(SerpAPI):
             marketplaces=marketplaces,
         )
 
-        # Perform the search
+        # Perform the search to get Google Shopping URLs
        urls = await self._search(
             search_string=search_string,
             language=language,
|
|
|
355
424
|
# and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
|
|
356
425
|
urls = urls[:num_results]
|
|
357
426
|
|
|
358
|
-
# Create
|
|
427
|
+
# Create SearchResult objects from merchant URLs (similar to Toppreise pattern)
|
|
359
428
|
results = [self._create_search_result(url=url) for url in urls]
|
|
360
429
|
logger.debug(
|
|
361
|
-
f'Produced {len(results)} results from
|
|
430
|
+
f'Produced {len(results)} results from Google Shopping search with q="{search_string}".'
|
|
362
431
|
)
|
|
363
432
|
return results
|
|
364
433
|
|
|
@@ -366,36 +435,77 @@
 class Toppreise(SearchEngine):
     """Search engine for toppreise.ch."""
 
-    _endpoint = "https://www.toppreise.ch/
-    _headers = {
-        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-        "Accept-Language": "en-US,en;q=0.5",
-        "Accept-Encoding": "gzip, deflate",
-        "Connection": "keep-alive",
-        "Upgrade-Insecure-Requests": "1",
-    }
+    _endpoint = "https://www.toppreise.ch/"
 
-    def __init__(self, http_client: httpx.AsyncClient):
+    def __init__(self, http_client: httpx.AsyncClient, zyteapi_key: str):
         """Initializes the Toppreise client.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
-
+        super().__init__(http_client=http_client)
+        self._zyteapi = ZyteAPI(http_client=http_client, api_key=zyteapi_key)
 
-
-
-
-
+    async def http_client_get_with_fallback(self, url: str) -> bytes:
+        """Performs a GET request with retries.
+
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
+        Args:
+            url: The URL to request.
+        """
+        # Try to access the URL directly
+        try:
+            response: httpx.Response = await self.http_client_get(
+                url=url, headers=self._headers
+            )
+            content = response.content
+
+        # If we get a 403 Error (can happen depending on IP/location of deployment),
+        # we try to unblock the URL using Zyte proxy mode
+        except httpx.HTTPStatusError as err_direct:
+            if err_direct.response.status_code == 403:
+                logger.warning(
+                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
+                )
+                try:
+                    content = await self._zyteapi.unblock_url_content(url)
+                except Exception as err_resolve:
+                    msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
+                    logger.error(msg, exc_info=True)
+                    raise httpx.HTTPError(msg) from err_resolve
+            else:
+                raise err_direct
+        return content
+
+    @classmethod
+    def _get_search_endpoint(cls, language: Language) -> str:
+        """Get the search endpoint based on the language."""
+        search_path = TOPPREISE_SEARCH_PATHS.get(
+            language.code, TOPPREISE_SEARCH_PATHS["default"]
+        )
+        return f"{cls._endpoint}{search_path}"
+
 
     @staticmethod
-    def
-
+    def _extract_links(
+        element: Tag, ext_products: bool = True, comp_products: bool = True
+    ) -> List[str]:
+        """Extracts all relevant product URLs from a BeautifulSoup object of a Toppreise page.
 
-
-
-
+        Note:
+            Depending on the arguments, it extracts:
+            - product comparison URLs (i.e. https://www.toppreise.ch/preisvergleich/...)
+            - external product URLs (i.e. https://www.example.com/ext_...).
+
+        Args:
+            tag: BeautifulSoup Tag object containing the HTML to parse.
+            ext_products: Whether to extract external product URLs.
+            comp_products: Whether to extract product comparison URLs.
+        """
+        # Find all links in the page
+        links = element.find_all("a", href=True)
 
         # Filter links to only include external product links
         hrefs = [
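The fallback only fires on a 403; every other status error propagates unchanged. A condensed sketch of the control flow, with a stand-in for `ZyteAPI.unblock_url_content` (defined in `fraudcrawler/scraping/zyte.py`, which also changed in this release but is not shown here):

import httpx


async def unblock_url_content(url: str) -> bytes:
    raise NotImplementedError  # placeholder for the Zyte proxy-mode fetch


async def get_with_fallback(client: httpx.AsyncClient, url: str) -> bytes:
    try:
        response = await client.get(url)
        response.raise_for_status()  # raises httpx.HTTPStatusError on 4xx/5xx
        return response.content
    except httpx.HTTPStatusError as err:
        # Only a 403 triggers the proxy fallback; anything else propagates.
        if err.response.status_code == 403:
            return await unblock_url_content(url)
        raise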
@@ -406,7 +516,15 @@ class Toppreise(SearchEngine):
             and (href := link.get("href"))  # Ensure href is not None
             and not href.startswith("javascript:")  # Skip javascript links
             and isinstance(href, str)  # Ensure href is a string
-
+            # Make sure the link is either an external product link (href contains 'ext_')
+            # or is a search result link (href contains 'preisvergleich', 'comparison-prix', or 'price-comparison')
+            and (
+                ("ext_" in href and ext_products)
+                or (
+                    any(pth in href for pth in TOPPREISE_COMPARISON_PATHS)
+                    and comp_products
+                )
+            )
         )
     ]
 
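A self-contained illustration of this link filter on a toy page; the values in `TOPPREISE_COMPARISON_PATHS` are assumed from the comment above (`preisvergleich`, `comparison-prix`, `price-comparison`), while the real constant lives in `fraudcrawler/settings.py`:

from bs4 import BeautifulSoup

TOPPREISE_COMPARISON_PATHS = ["preisvergleich", "comparison-prix", "price-comparison"]

html = """
<div id="Page_Browsing">
  <a href="https://www.toppreise.ch/preisvergleich/Widget-p12345">comparison</a>
  <a href="https://shop.example/ext_offer?id=9">external offer</a>
  <a href="javascript:void(0)">ignored</a>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
links = soup.find_all("a", href=True)
hrefs = [
    href
    for link in links
    if (href := link.get("href"))
    and not href.startswith("javascript:")
    # Keep external product links and comparison-page links, drop everything else.
    and ("ext_" in href or any(p in href for p in TOPPREISE_COMPARISON_PATHS))
]
print(hrefs)  # both the comparison URL and the external offer URL survive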
@@ -421,44 +539,67 @@ class Toppreise(SearchEngine):
 
         # Return deduplicated urls
         urls = list(set(urls))
+        return urls
+
+    def _extract_product_urls_from_search_page(self, content: bytes) -> List[str]:
+        """Extracts product urls from a Toppreise search page (i.e. https://www.toppreise.ch/produktsuche)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+        main = soup.find("div", id="Page_Browsing")
+        if not isinstance(main, Tag):
+            logger.warning("No main content found in Toppreise search page.")
+            return []
+
+        # Extract links (external product links and comparison links)
+        urls = self._extract_links(element=main)
+
+        logger.debug(f"Found {len(urls)} product URLs from Toppreise search results.")
+        return urls
+
+    def _extract_product_urls_from_comparison_page(self, content: bytes) -> List[str]:
+        """Extracts product urls from a Toppreise product comparison page (i.e. https://www.toppreise.ch/preisvergleich/...)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+
+        # Extract links (external product links only)
+        urls = self._extract_links(element=soup, comp_products=False)
+
         logger.debug(
-            f"Found {len(urls)} external product URLs from Toppreise
+            f"Found {len(urls)} external product URLs from Toppreise comparison page."
         )
         return urls
 
-
+    @property
+    def _search_engine_name(self) -> str:
+        """The name of the search engine."""
+        return SearchEngineName.TOPPREISE.value
+
+    async def _search(
+        self, search_string: str, language: Language, num_results: int
+    ) -> List[str]:
         """Performs a search on Toppreise and returns the URLs of the results.
 
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
         Args:
             search_string: The search string to use for the query.
+            language: The language to use for the query.
             num_results: Max number of results to return.
         """
         # Build the search URL for Toppreise
+        endpoint = self._get_search_endpoint(language=language)
         encoded_search = quote_plus(search_string)
-        url = f"{
+        url = f"{endpoint}?q={encoded_search}"
         logger.debug(f"Toppreise search URL: {url}")
 
-        # Perform the request
-
-        # - `before_sleep`: if the request fails before sleeping
-        retry = get_async_retry()
-        retry.before = lambda retry_state: self._log_before(
-            search_string=search_string, retry_state=retry_state
-        )
-        retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            search_string=search_string, retry_state=retry_state
-        )
-        async for attempt in retry:
-            with attempt:
-                response = await self._http_client.get(
-                    url=url,
-                    headers=self._headers,
-                )
-                response.raise_for_status()
+        # Perform the request with fallback if necessary
+        content = await self.http_client_get_with_fallback(url=url)
 
         # Get external product urls from the content
-
-        urls = self._get_external_product_urls(content=content)
+        urls = self._extract_product_urls_from_search_page(content=content)
         urls = urls[:num_results]  # Limit to num_results if needed
 
         return urls
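The search endpoint is now language-dependent. A sketch of the resulting URL construction; only the German `produktsuche` path is confirmed by the docstring above, the other entries in `TOPPREISE_SEARCH_PATHS` here are placeholders (the real mapping lives in `fraudcrawler/settings.py`):

from urllib.parse import quote_plus

# Assumed shape of the settings constant; only "produktsuche" is attested above.
TOPPREISE_SEARCH_PATHS = {
    "de": "produktsuche",
    "fr": "recherche",        # placeholder
    "default": "produktsuche",  # placeholder
}
ENDPOINT = "https://www.toppreise.ch/"


def build_search_url(search_string: str, language_code: str) -> str:
    path = TOPPREISE_SEARCH_PATHS.get(language_code, TOPPREISE_SEARCH_PATHS["default"])
    return f"{ENDPOINT}{path}?q={quote_plus(search_string)}"


print(build_search_url("kaffeemaschine", "de"))
# https://www.toppreise.ch/produktsuche?q=kaffeemaschine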
@@ -466,17 +607,20 @@ class Toppreise(SearchEngine):
     async def search(
         self,
         search_term: str,
+        language: Language,
         num_results: int,
     ) -> List[SearchResult]:
         """Performs a Toppreise search and returns SearchResults.
 
         Args:
             search_term: The search term to use for the query.
+            language: The language to use for the search.
             num_results: Max number of results to return.
         """
         # Perform the search
         urls = await self._search(
             search_string=search_term,
+            language=language,
             num_results=num_results,
         )
 
@@ -488,21 +632,121 @@
         return results
 
 
-class
+class Searcher(DomainUtils):
     """Class to perform searches using different search engines."""
 
-
+    _post_search_retry_stop_after = 3
+
+    def __init__(
+        self, http_client: httpx.AsyncClient, serpapi_key: str, zyteapi_key: str
+    ):
         """Initializes the Search class with the given SerpAPI key.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
             serpapi_key: The API key for SERP API.
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
+        self._http_client = http_client
         self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
         self._google_shopping = SerpAPIGoogleShopping(
-            http_client=http_client,
+            http_client=http_client,
+            api_key=serpapi_key,
+        )
+        self._toppreise = Toppreise(
+            http_client=http_client,
+            zyteapi_key=zyteapi_key,
+        )
+
+    async def _post_search_google_shopping_immersive(self, url: str) -> List[str]:
+        """Post-search for product URLs from a Google Shopping immersive product page.
+
+        Args:
+            url: The URL of the Google Shopping product page.
+        """
+        # Add SerpAPI key to the url
+        sep = "&" if "?" in url else "?"
+        url = f"{url}{sep}api_key={self._google_shopping._api_key}"
+
+        # Fetch the content of the Google Shopping product page
+        response = await self._google_shopping.http_client_get(url=url)
+
+        # Get external product urls from the data
+        data = response.json()
+        urls = self._google_shopping._extract_product_urls_from_immersive_product_api(
+            data=data
+        )
+        return urls
+
+    async def _post_search_toppreise_comparison(self, url: str) -> List[str]:
+        """Post-search for product URLs from a Toppreise product comparison page.
+
+        Note:
+            In comparison to the function Toppreise._search, here we extract the urls from
+            product comparison pages (f.e. https://www.toppreise.ch/preisvergleich/). These
+            pages can also be found in the results of a google search.
+
+        Args:
+            url: The URL of the Toppreise product listing page.
+        """
+        # Perform the request with fallback if necessary
+        content = await self._toppreise.http_client_get_with_fallback(url=url)
+
+        # Get external product urls from the content
+        urls = self._toppreise._extract_product_urls_from_comparison_page(
+            content=content
         )
-
+        return urls
+
+    async def _post_search(self, results: List[SearchResult]) -> List[SearchResult]:
+        """Post-search for additional embedded product URLs from the obtained results.
+
+        Note:
+            This function is used to extract embedded product URLs from
+            product listing pages (e.g. Toppreise, Google Shopping) if needed.
+
+        Args:
+            results: The list of SearchResult objects obtained from the search.
+        """
+        post_search_results: List[SearchResult] = []
+        for res in results:
+            url = res.url
+            post_search_urls: List[str] = []
+
+            # Extract embedded product URLs from the Google Shopping immersive product page
+            if "engine=google_immersive_product" in url:
+                logger.debug(
+                    f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                )
+                post_search_urls = await self._post_search_google_shopping_immersive(
+                    url=url
+                )
+                logger.debug(
+                    f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                )
+
+            # Extract embedded product URLs from the Toppreise product listing page
+            elif any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
+                logger.debug(
+                    f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                )
+                post_search_urls = await self._post_search_toppreise_comparison(url=url)
+                logger.debug(
+                    f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                )
+
+            # Add the extracted product URLs as SearchResult objects
+            psr = [
+                SearchResult(
+                    url=psu,
+                    domain=self._get_domain(url=psu),
+                    search_engine_name=res.search_engine_name,
+                )
+                for psu in post_search_urls
+            ]
+            post_search_results.extend(psr)
+
+        return post_search_results
 
     @staticmethod
     def _domain_in_host(domain: str, host: Host) -> bool:
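The post-search pass routes each result URL by substring: immersive Google Shopping links go back through SerpAPI, Toppreise comparison links are re-fetched and parsed. A toy version of the dispatch predicate, using the same assumed path constants as in the earlier sketch:

TOPPREISE_COMPARISON_PATHS = ["preisvergleich", "comparison-prix", "price-comparison"]


def needs_expansion(url: str) -> "str | None":
    # Mirrors the if/elif routing in _post_search above.
    if "engine=google_immersive_product" in url:
        return "google_shopping_immersive"
    if any(p in url for p in TOPPREISE_COMPARISON_PATHS):
        return "toppreise_comparison"
    return None


print(needs_expansion("https://serpapi.com/search?engine=google_immersive_product"))
print(needs_expansion("https://www.toppreise.ch/preisvergleich/Widget-p12345"))
print(needs_expansion("https://shop.example/product/1"))  # None: kept as-is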
@@ -592,63 +836,77 @@ class Search(DomainUtils):
     async def apply(
         self,
         search_term: str,
+        search_engine: SearchEngineName | str,
         language: Language,
         location: Location,
         num_results: int,
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
-        search_engines: List[SearchEngineName | str] | None = None,
     ) -> List[SearchResult]:
         """Performs a search and returns SearchResults.
 
         Args:
             search_term: The search term to use for the query.
+            search_engine: The search engine to use for the search.
             language: The language to use for the query ('hl' parameter).
             location: The location to use for the query ('gl' parameter).
             num_results: Max number of results per search engine.
             marketplaces: The marketplaces to include in the search.
             excluded_urls: The URLs to exclude from the search.
-            search_engines: The list of search engines to use for the search.
         """
-
-
-
-
-
-
-
-
+        logger.info(
+            f'Performing search for term="{search_term}" using engine="{search_engine}".'
+        )
+
+        # -------------------------------
+        # SEARCH
+        # -------------------------------
+        # Map string to SearchEngineName if needed
+        if isinstance(search_engine, str):
+            search_engine = SearchEngineName(search_engine)
 
         # Make SerpAPI google search
-        if SearchEngineName.GOOGLE
-
+        if search_engine == SearchEngineName.GOOGLE:
+            results = await self._google.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)
 
         # Make SerpAPI google shopping search
-
-
+        elif search_engine == SearchEngineName.GOOGLE_SHOPPING:
+            results = await self._google_shopping.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
            )
-            results.extend(res)
 
         # Make Toppreise search
-
-
+        elif search_engine == SearchEngineName.TOPPREISE:
+            results = await self._toppreise.search(
                 search_term=search_term,
+                language=language,
                 num_results=num_results,
             )
-            results.extend(res)
 
+        # Other search engines can be added here (raise unknown engine error otherwise)
+        else:
+            raise ValueError(f"Unknown search engine: {search_engine}")
+
+        # -------------------------------
+        # POST-SEARCH URL EXTRACTION
+        # -------------------------------
+        post_search_results = await self._post_search(results=results)
+        post_search_results = post_search_results[:num_results]
+        results.extend(post_search_results)
+
+        # -------------------------------
+        # FILTERS
+        # -------------------------------
         # Apply filters
         results = [
             self._apply_filters(
|
|
|
660
918
|
for res in results
|
|
661
919
|
]
|
|
662
920
|
|
|
663
|
-
logger.
|
|
921
|
+
logger.info(
|
|
922
|
+
f'Search for term="{search_term}" using engine="{search_engine}" produced {len(results)} results.'
|
|
923
|
+
)
|
|
664
924
|
return results
|