fraudcrawler-0.5.9-py3-none-any.whl → fraudcrawler-0.6.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic.
- fraudcrawler/__init__.py +2 -2
- fraudcrawler/base/base.py +11 -32
- fraudcrawler/base/client.py +1 -1
- fraudcrawler/base/orchestrator.py +135 -135
- fraudcrawler/base/retry.py +12 -6
- fraudcrawler/launch_demo_pipeline.py +1 -1
- fraudcrawler/processing/processor.py +3 -3
- fraudcrawler/scraping/search.py +352 -125
- fraudcrawler/scraping/url.py +42 -3
- fraudcrawler/scraping/zyte.py +15 -1
- fraudcrawler/settings.py +13 -3
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.1.dist-info}/METADATA +4 -3
- fraudcrawler-0.6.1.dist-info/RECORD +22 -0
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.1.dist-info}/WHEEL +1 -1
- fraudcrawler-0.5.9.dist-info/RECORD +0 -22
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.1.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.1.dist-info}/entry_points.txt +0 -0
fraudcrawler/scraping/search.py
CHANGED
@@ -6,12 +6,18 @@ from typing import Dict, List
 from urllib.parse import quote_plus

 from bs4 import BeautifulSoup
+from bs4.element import Tag
 import httpx
 from tenacity import RetryCallState

-from fraudcrawler.settings import
+from fraudcrawler.settings import (
+    SEARCH_DEFAULT_COUNTRY_CODES,
+    TOPPREISE_SEARCH_PATHS,
+    TOPPREISE_COMPARISON_PATHS,
+)
 from fraudcrawler.base.base import Host, Language, Location, DomainUtils
 from fraudcrawler.base.retry import get_async_retry
+from fraudcrawler.scraping.zyte import ZyteAPI

 logger = logging.getLogger(__name__)

@@ -39,6 +45,14 @@ class SearchEngine(ABC, DomainUtils):

     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"

+    def __init__(self, http_client: httpx.AsyncClient):
+        """Initializes the SearchEngine with the given HTTP client.
+
+        Args:
+            http_client: An httpx.AsyncClient to use for the async requests.
+        """
+        self._http_client = http_client
+
     @property
     @abstractmethod
     def _search_engine_name(self) -> str:
@@ -50,45 +64,81 @@ class SearchEngine(ABC, DomainUtils):
         """Apply the search with the given parameters and return results."""
         pass

+    def _create_search_result(self, url: str) -> SearchResult:
+        """From a given url it creates the class:`SearchResult` instance."""
+        # Get marketplace name
+        domain = self._get_domain(url=url)
+
+        # Create and return the SearchResult object
+        result = SearchResult(
+            url=url,
+            domain=domain,
+            search_engine_name=self._search_engine_name,
+        )
+        return result
+
     @classmethod
     def _log_before(
-        cls,
+        cls, url: str, params: dict | None, retry_state: RetryCallState | None
     ) -> None:
-        """Context aware logging before
+        """Context aware logging before HTTP request is made."""
         if retry_state:
             logger.debug(
-                f'Performing
-                f"(attempt {retry_state.attempt_number})."
+                f'Performing HTTP request in {cls.__name__} to url="{url}" '
+                f"with params={params} (attempt {retry_state.attempt_number})."
             )
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before.")

     @classmethod
     def _log_before_sleep(
-        cls,
+        cls, url: str, params: dict | None, retry_state: RetryCallState | None
     ) -> None:
-        """Context aware logging before sleeping after a failed request."""
+        """Context aware logging before sleeping after a failed HTTP request."""
         if retry_state and retry_state.outcome:
             logger.warning(
-                f
+                f"Attempt {retry_state.attempt_number} of {cls.__name__} HTTP request "
+                f'to url="{url}" with params="{params}" '
                 f"failed with error: {retry_state.outcome.exception()}. "
                 f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
             )
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")

-    def
-
-
-
+    async def http_client_get(
+        self, url: str, params: dict | None = None, headers: dict | None = None
+    ) -> httpx.Response:
+        """Performs a GET request with retries.

-
-
-            url
-
-
+        Args:
+            retry: The retry strategy to use.
+            url: The URL to request.
+            params: Query parameters for the request.
+            headers: HTTP headers to use for the request.
+        """
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            url=url, params=params, retry_state=retry_state
        )
-
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            url=url, params=params, retry_state=retry_state
+        )
+
+        async for attempt in retry:
+            with attempt:
+                response = await self._http_client.get(
+                    url=url,
+                    params=params,
+                    headers=headers,
+                )
+                response.raise_for_status()
+                return response
+
+        # In case of not entering the for loop (for some strange reason)
+        raise RuntimeError("Retry exhausted without success")


 class SerpAPI(SearchEngine):
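The new SearchEngine.http_client_get above centralizes the tenacity retry loop that each engine previously duplicated. Below is a minimal sketch of the pattern, assuming get_async_retry wraps tenacity.AsyncRetrying; the real configuration lives in fraudcrawler/base/retry.py and is not visible in this diff, so the stop/wait values here are placeholders.

# Sketch only: the actual get_async_retry() configuration is not shown in this diff.
import httpx
from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential

def get_async_retry() -> AsyncRetrying:
    return AsyncRetrying(
        stop=stop_after_attempt(3),  # assumed attempt budget
        wait=wait_exponential(multiplier=1, min=1, max=10),
        retry=retry_if_exception_type(httpx.HTTPError),
        reraise=True,
    )

async def fetch(client: httpx.AsyncClient, url: str) -> httpx.Response:
    retry = get_async_retry()
    # Hooks can be reassigned after construction, as the diff does with
    # retry.before and retry.before_sleep to get context-aware logging.
    async for attempt in retry:
        with attempt:  # a failure inside re-enters the loop until the stop condition
            response = await client.get(url)
            response.raise_for_status()
            return response
    raise RuntimeError("Retry exhausted without success")

Raising RuntimeError after the loop mirrors the diff's guard for the unlikely case where the retry iterator yields no attempts.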
@@ -103,7 +153,7 @@ class SerpAPI(SearchEngine):
             http_client: An httpx.AsyncClient to use for the async requests.
             api_key: The API key for SerpAPI.
         """
-
+        super().__init__(http_client=http_client)
         self._api_key = api_key

     @property
@@ -199,22 +249,10 @@ class SerpAPI(SearchEngine):
         }
         logger.debug(f"SerpAPI search with params: {params}")

-        # Perform the request
-
-
-        retry = get_async_retry()
-        retry.before = lambda retry_state: self._log_before(
-            search_string=search_string, retry_state=retry_state
+        # Perform the search request
+        response: httpx.Response = await self.http_client_get(
+            url=self._endpoint, params=params
         )
-        retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            search_string=search_string, retry_state=retry_state
-        )
-        async for attempt in retry:
-            with attempt:
-                response = await self._http_client.get(
-                    url=self._endpoint, params=params
-                )
-                response.raise_for_status()

         # Extract the URLs from the response
         data = response.json()
@@ -330,7 +368,21 @@ class SerpAPIGoogleShopping(SerpAPI):
         """
         results = data.get("shopping_results")
         if results is not None:
-            return [url for res in results if (url := res.get("product_link"))]
+            # return [url for res in results if (url := res.get("product_link"))]  # c.f. https://github.com/serpapi/public-roadmap/issues/3045
+            return [
+                url
+                for res in results
+                if (url := res.get("serpapi_immersive_product_api"))
+            ]
+        return []
+
+    @staticmethod
+    def _extract_product_urls_from_immersive_product_api(data: dict) -> List[str]:
+        """Extracts product urls from the serpapi immersive product API data."""
+        if results := data.get("product_results"):
+            stores = results.get("stores", [])
+            urls = [url for sre in stores if (url := sre.get("link"))]
+            return list(set(urls))
         return []

     async def search(
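The new _extract_product_urls_from_immersive_product_api reads a product_results.stores[].link layout. A small illustration with an invented payload; only the key structure follows the code above, the values are made up:

# Invented payload: only the keys accessed by the helper matter here.
from fraudcrawler.scraping.search import SerpAPIGoogleShopping

data = {
    "product_results": {
        "stores": [
            {"name": "Store A", "link": "https://store-a.example/item"},
            {"name": "Store B", "link": "https://store-b.example/item"},
            {"name": "Store C"},  # no "link" -> dropped by the walrus filter
        ]
    }
}

# Static method, so it can be exercised without a SerpAPI key.
urls = SerpAPIGoogleShopping._extract_product_urls_from_immersive_product_api(data)
assert sorted(urls) == [
    "https://store-a.example/item",
    "https://store-b.example/item",
]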
@@ -343,6 +395,9 @@ class SerpAPIGoogleShopping(SerpAPI):
     ) -> List[SearchResult]:
         """Performs a google shopping search using SerpApi and returns SearchResults.

+        Similar to Toppreise, this method extracts merchant URLs from Google Shopping product pages
+        and creates multiple SearchResult objects for each merchant URL found.
+
         Args:
             search_term: The search term to use for the query.
             language: The language to use for the query ('hl' parameter).
@@ -356,7 +411,7 @@ class SerpAPIGoogleShopping(SerpAPI):
             marketplaces=marketplaces,
         )

-        # Perform the search
+        # Perform the search to get Google Shopping URLs
         urls = await self._search(
             search_string=search_string,
             language=language,
@@ -369,10 +424,10 @@ class SerpAPIGoogleShopping(SerpAPI):
         # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
         urls = urls[:num_results]

-        # Create
+        # Create SearchResult objects from merchant URLs (similar to Toppreise pattern)
         results = [self._create_search_result(url=url) for url in urls]
         logger.debug(
-            f'Produced {len(results)} results from
+            f'Produced {len(results)} results from Google Shopping search with q="{search_string}".'
         )
         return results

@@ -380,38 +435,77 @@ class SerpAPIGoogleShopping(SerpAPI):
 class Toppreise(SearchEngine):
     """Search engine for toppreise.ch."""

-    _endpoint = "https://www.toppreise.ch/
-
-
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-        "Accept-Language": "en-US,en;q=0.5",
-        "Accept-Encoding": "gzip, deflate",
-        "Connection": "keep-alive",
-        "Upgrade-Insecure-Requests": "1",
-    }
-
-    def __init__(self, http_client: httpx.AsyncClient, zyte_api=None):
+    _endpoint = "https://www.toppreise.ch/"
+
+    def __init__(self, http_client: httpx.AsyncClient, zyteapi_key: str):
         """Initializes the Toppreise client.

         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
-
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
-
-        self.
+        super().__init__(http_client=http_client)
+        self._zyteapi = ZyteAPI(http_client=http_client, api_key=zyteapi_key)

-
-
-
-
+    async def http_client_get_with_fallback(self, url: str) -> bytes:
+        """Performs a GET request with retries.
+
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
+        Args:
+            url: The URL to request.
+        """
+        # Try to access the URL directly
+        try:
+            response: httpx.Response = await self.http_client_get(
+                url=url, headers=self._headers
+            )
+            content = response.content
+
+        # If we get a 403 Error (can happen depending on IP/location of deployment),
+        # we try to unblock the URL using Zyte proxy mode
+        except httpx.HTTPStatusError as err_direct:
+            if err_direct.response.status_code == 403:
+                logger.warning(
+                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
+                )
+                try:
+                    content = await self._zyteapi.unblock_url_content(url)
+                except Exception as err_resolve:
+                    msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
+                    logger.error(msg)
+                    raise httpx.HTTPError(msg) from err_resolve
+            else:
+                raise err_direct
+        return content
+
+    @classmethod
+    def _get_search_endpoint(cls, language: Language) -> str:
+        """Get the search endpoint based on the language."""
+        search_path = TOPPREISE_SEARCH_PATHS.get(
+            language.code, TOPPREISE_SEARCH_PATHS["default"]
+        )
+        return f"{cls._endpoint}{search_path}"

     @staticmethod
-    def
-
+    def _extract_links(
+        element: Tag, ext_products: bool = True, comp_products: bool = True
+    ) -> List[str]:
+        """Extracts all relevant product URLs from a BeautifulSoup object of a Toppreise page.

-
-
-
+        Note:
+            Depending on the arguments, it extracts:
+            - product comparison URLs (i.e. https://www.toppreise.ch/preisvergleich/...)
+            - external product URLs (i.e. https://www.example.com/ext_...).
+
+        Args:
+            tag: BeautifulSoup Tag object containing the HTML to parse.
+            ext_products: Whether to extract external product URLs.
+            comp_products: Whether to extract product comparison URLs.
+        """
+        # Find all links in the page
+        links = element.find_all("a", href=True)

         # Filter links to only include external product links
         hrefs = [
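The new _get_search_endpoint resolves a language-specific search path from TOPPREISE_SEARCH_PATHS with a "default" fallback. The diff only shows that the mapping is keyed by language.code and must contain a "default" entry; the values below are placeholders for illustration (the docstrings later in this diff suggest "produktsuche" is the German search path):

# Placeholder values: the real mapping lives in fraudcrawler/settings.py.
TOPPREISE_SEARCH_PATHS = {
    "de": "produktsuche",
    "default": "produktsuche",
}

def get_search_endpoint(language_code: str) -> str:
    endpoint = "https://www.toppreise.ch/"
    search_path = TOPPREISE_SEARCH_PATHS.get(language_code, TOPPREISE_SEARCH_PATHS["default"])
    return f"{endpoint}{search_path}"

assert get_search_endpoint("de") == "https://www.toppreise.ch/produktsuche"
assert get_search_endpoint("xx") == "https://www.toppreise.ch/produktsuche"  # fallback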
@@ -422,7 +516,15 @@ class Toppreise(SearchEngine):
                 and (href := link.get("href"))  # Ensure href is not None
                 and not href.startswith("javascript:")  # Skip javascript links
                 and isinstance(href, str)  # Ensure href is a string
-
+                # Make sure the link is either an external product link (href contains 'ext_')
+                # or is a search result link (href contains 'preisvergleich', 'comparison-prix', or 'price-comparison')
+                and (
+                    ("ext_" in href and ext_products)
+                    or (
+                        any(pth in href for pth in TOPPREISE_COMPARISON_PATHS)
+                        and comp_products
+                    )
+                )
             )
         ]

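The extended href filter admits a link either as an external product link ("ext_" in the href) or as a comparison link; per the inline comment above, TOPPREISE_COMPARISON_PATHS covers 'preisvergleich', 'comparison-prix', and 'price-comparison'. The predicate in isolation, with the constant's value inferred from that comment rather than from settings.py:

# Inferred from the inline comment; the authoritative tuple is in fraudcrawler/settings.py.
TOPPREISE_COMPARISON_PATHS = ("preisvergleich", "comparison-prix", "price-comparison")

def keep(href: str, ext_products: bool = True, comp_products: bool = True) -> bool:
    return ("ext_" in href and ext_products) or (
        any(pth in href for pth in TOPPREISE_COMPARISON_PATHS) and comp_products
    )

assert keep("https://www.toppreise.ch/preisvergleich/some-product-p123")
assert keep("https://www.toppreise.ch/ext_offer?id=42")
# Comparison links are suppressed when extracting from a comparison page itself:
assert not keep("https://www.toppreise.ch/preisvergleich/some-product-p123", comp_products=False)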
@@ -437,60 +539,67 @@ class Toppreise(SearchEngine):

         # Return deduplicated urls
         urls = list(set(urls))
+        return urls
+
+    def _extract_product_urls_from_search_page(self, content: bytes) -> List[str]:
+        """Extracts product urls from a Toppreise search page (i.e. https://www.toppreise.ch/produktsuche)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+        main = soup.find("div", id="Page_Browsing")
+        if not isinstance(main, Tag):
+            logger.warning("No main content found in Toppreise search page.")
+            return []
+
+        # Extract links (external product links and comparison links)
+        urls = self._extract_links(element=main)
+
+        logger.debug(f"Found {len(urls)} product URLs from Toppreise search results.")
+        return urls
+
+    def _extract_product_urls_from_comparison_page(self, content: bytes) -> List[str]:
+        """Extracts product urls from a Toppreise product comparison page (i.e. https://www.toppreise.ch/preisvergleich/...)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+
+        # Extract links (external product links only)
+        urls = self._extract_links(element=soup, comp_products=False)
+
         logger.debug(
-            f"Found {len(urls)} external product URLs from Toppreise
+            f"Found {len(urls)} external product URLs from Toppreise comparison page."
         )
         return urls

-
+    @property
+    def _search_engine_name(self) -> str:
+        """The name of the search engine."""
+        return SearchEngineName.TOPPREISE.value
+
+    async def _search(
+        self, search_string: str, language: Language, num_results: int
+    ) -> List[str]:
         """Performs a search on Toppreise and returns the URLs of the results.

+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
         Args:
             search_string: The search string to use for the query.
+            language: The language to use for the query.
             num_results: Max number of results to return.
         """
         # Build the search URL for Toppreise
+        endpoint = self._get_search_endpoint(language=language)
         encoded_search = quote_plus(search_string)
-        url = f"{
+        url = f"{endpoint}?q={encoded_search}"
         logger.debug(f"Toppreise search URL: {url}")

-        # Perform the request
-
-        # - `before_sleep`: if the request fails before sleeping
-        retry = get_async_retry()
-        retry.before = lambda retry_state: self._log_before(
-            search_string=search_string, retry_state=retry_state
-        )
-        retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            search_string=search_string, retry_state=retry_state
-        )
-
-        content = None
-        try:
-            async for attempt in retry:
-                with attempt:
-                    response = await self._http_client.get(
-                        url=url,
-                        headers=self._headers,
-                    )
-                    response.raise_for_status()
-                    content = response.content
-        except httpx.HTTPStatusError as e:
-            if e.response.status_code == 403 and self._zyte_api:
-                logger.warning(
-                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
-                )
-                content = await self._unblock_url(url, self._zyte_api)
-                if content is None:
-                    raise e  # Re-raise if zyte fallback also failed
-            else:
-                raise e
-
-        if content is None:
-            raise httpx.HTTPError("Failed to fetch content")
+        # Perform the request with fallback if necessary
+        content = await self.http_client_get_with_fallback(url=url)

         # Get external product urls from the content
-        urls = self.
+        urls = self._extract_product_urls_from_search_page(content=content)
         urls = urls[:num_results]  # Limit to num_results if needed

         return urls
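_extract_product_urls_from_search_page narrows parsing to the div with id "Page_Browsing" before collecting anchors, which keeps navigation chrome out of the results. A self-contained run on toy markup; only the container id and the href shapes mirror the diff:

from bs4 import BeautifulSoup
from bs4.element import Tag

# Toy markup: invented for illustration.
html = b"""
<div id="Page_Browsing">
  <a href="https://www.toppreise.ch/preisvergleich/some-product-p123">compare</a>
  <a href="https://www.toppreise.ch/ext_offer?id=42">to shop</a>
  <a href="javascript:void(0)">ignored</a>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
main = soup.find("div", id="Page_Browsing")
assert isinstance(main, Tag)  # the diff warns and returns [] otherwise

links = main.find_all("a", href=True)
hrefs = [link["href"] for link in links if not link["href"].startswith("javascript:")]
assert len(hrefs) == 2  # the javascript: link is filtered out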
@@ -498,17 +607,20 @@ class Toppreise(SearchEngine):
     async def search(
         self,
         search_term: str,
+        language: Language,
         num_results: int,
     ) -> List[SearchResult]:
         """Performs a Toppreise search and returns SearchResults.

         Args:
             search_term: The search term to use for the query.
+            language: The language to use for the search.
             num_results: Max number of results to return.
         """
         # Perform the search
         urls = await self._search(
             search_string=search_term,
+            language=language,
             num_results=num_results,
         )

@@ -520,22 +632,121 @@ class Toppreise(SearchEngine):
         return results


-class
+class Searcher(DomainUtils):
     """Class to perform searches using different search engines."""

-
+    _post_search_retry_stop_after = 3
+
+    def __init__(
+        self, http_client: httpx.AsyncClient, serpapi_key: str, zyteapi_key: str
+    ):
         """Initializes the Search class with the given SerpAPI key.

         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
             serpapi_key: The API key for SERP API.
-
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
+        self._http_client = http_client
         self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
         self._google_shopping = SerpAPIGoogleShopping(
-            http_client=http_client,
+            http_client=http_client,
+            api_key=serpapi_key,
+        )
+        self._toppreise = Toppreise(
+            http_client=http_client,
+            zyteapi_key=zyteapi_key,
+        )
+
+    async def _post_search_google_shopping_immersive(self, url: str) -> List[str]:
+        """Post-search for product URLs from a Google Shopping immersive product page.
+
+        Args:
+            url: The URL of the Google Shopping product page.
+        """
+        # Add SerpAPI key to the url
+        sep = "&" if "?" in url else "?"
+        url = f"{url}{sep}api_key={self._google_shopping._api_key}"
+
+        # Fetch the content of the Google Shopping product page
+        response = await self._google_shopping.http_client_get(url=url)
+
+        # Get external product urls from the data
+        data = response.json()
+        urls = self._google_shopping._extract_product_urls_from_immersive_product_api(
+            data=data
         )
-
+        return urls
+
+    async def _post_search_toppreise_comparison(self, url: str) -> List[str]:
+        """Post-search for product URLs from a Toppreise product comparison page.
+
+        Note:
+            In comparison to the function Toppreise._search, here we extract the urls from
+            product comparison pages (f.e. https://www.toppreise.ch/preisvergleich/). These
+            pages can also be found in the results of a google search.
+
+        Args:
+            url: The URL of the Toppreise product listing page.
+        """
+        # Perform the request with fallback if necessary
+        content = await self._toppreise.http_client_get_with_fallback(url=url)
+
+        # Get external product urls from the content
+        urls = self._toppreise._extract_product_urls_from_comparison_page(
+            content=content
+        )
+        return urls
+
+    async def _post_search(self, results: List[SearchResult]) -> List[SearchResult]:
+        """Post-search for additional embedded product URLs from the obtained results.
+
+        Note:
+            This function is used to extract embedded product URLs from
+            product listing pages (e.g. Toppreise, Google Shopping) if needed.
+
+        Args:
+            results: The list of SearchResult objects obtained from the search.
+        """
+        post_search_results: List[SearchResult] = []
+        for res in results:
+            url = res.url
+            post_search_urls: List[str] = []
+
+            # Extract embedded product URLs from the Google Shopping immersive product page
+            if "engine=google_immersive_product" in url:
+                logger.debug(
+                    f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                )
+                post_search_urls = await self._post_search_google_shopping_immersive(
+                    url=url
+                )
+                logger.debug(
+                    f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                )
+
+            # Extract embedded product URLs from the Toppreise product listing page
+            elif any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
+                logger.debug(
+                    f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                )
+                post_search_urls = await self._post_search_toppreise_comparison(url=url)
+                logger.debug(
+                    f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                )
+
+            # Add the extracted product URLs as SearchResult objects
+            psr = [
+                SearchResult(
+                    url=psu,
+                    domain=self._get_domain(url=psu),
+                    search_engine_name=res.search_engine_name,
+                )
+                for psu in post_search_urls
+            ]
+            post_search_results.extend(psr)
+
+        return post_search_results

     @staticmethod
     def _domain_in_host(domain: str, host: Host) -> bool:
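_post_search fans each listing-page hit (a Google Shopping immersive URL or a Toppreise comparison page) out into one SearchResult per embedded merchant URL, keeping the originating engine name. A rough shape of that fan-out, with a stand-in SearchResult model (the real model may carry more fields) and a naive domain parser in place of DomainUtils._get_domain:

from dataclasses import dataclass

# Stand-in model and domain parser; the real SearchResult and
# DomainUtils._get_domain are defined elsewhere in the package.
@dataclass
class SearchResult:
    url: str
    domain: str
    search_engine_name: str

def naive_domain(url: str) -> str:
    return url.split("/")[2]

def fan_out(res: SearchResult, embedded_urls: list[str]) -> list[SearchResult]:
    # One result per embedded merchant URL, tagged with the engine
    # that produced the original listing page.
    return [
        SearchResult(url=u, domain=naive_domain(u), search_engine_name=res.search_engine_name)
        for u in embedded_urls
    ]

listing = SearchResult(
    url="https://www.toppreise.ch/preisvergleich/some-product-p123",
    domain="www.toppreise.ch",
    search_engine_name="toppreise",
)
merchants = fan_out(listing, ["https://shop-a.example/offer", "https://shop-b.example/offer"])
assert all(m.search_engine_name == "toppreise" for m in merchants)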
@@ -625,63 +836,77 @@ class Search(DomainUtils):
     async def apply(
         self,
         search_term: str,
+        search_engine: SearchEngineName | str,
         language: Language,
         location: Location,
         num_results: int,
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
-        search_engines: List[SearchEngineName | str] | None = None,
     ) -> List[SearchResult]:
         """Performs a search and returns SearchResults.

         Args:
             search_term: The search term to use for the query.
+            search_engine: The search engine to use for the search.
             language: The language to use for the query ('hl' parameter).
             location: The location to use for the query ('gl' parameter).
             num_results: Max number of results per search engine.
             marketplaces: The marketplaces to include in the search.
             excluded_urls: The URLs to exclude from the search.
-            search_engines: The list of search engines to use for the search.
         """
-
-
-
-
-
-
-
-
+        logger.info(
+            f'Performing search for term="{search_term}" using engine="{search_engine}".'
+        )
+
+        # -------------------------------
+        # SEARCH
+        # -------------------------------
+        # Map string to SearchEngineName if needed
+        if isinstance(search_engine, str):
+            search_engine = SearchEngineName(search_engine)

         # Make SerpAPI google search
-        if SearchEngineName.GOOGLE
-
+        if search_engine == SearchEngineName.GOOGLE:
+            results = await self._google.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)

         # Make SerpAPI google shopping search
-
-
+        elif search_engine == SearchEngineName.GOOGLE_SHOPPING:
+            results = await self._google_shopping.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)

         # Make Toppreise search
-
-
+        elif search_engine == SearchEngineName.TOPPREISE:
+            results = await self._toppreise.search(
                 search_term=search_term,
+                language=language,
                 num_results=num_results,
             )
-            results.extend(res)

+        # Other search engines can be added here (raise unknown engine error otherwise)
+        else:
+            raise ValueError(f"Unknown search engine: {search_engine}")
+
+        # -------------------------------
+        # POST-SEARCH URL EXTRACTION
+        # -------------------------------
+        post_search_results = await self._post_search(results=results)
+        post_search_results = post_search_results[:num_results]
+        results.extend(post_search_results)
+
+        # -------------------------------
+        # FILTERS
+        # -------------------------------
         # Apply filters
         results = [
             self._apply_filters(
@@ -693,5 +918,7 @@ class Search(DomainUtils):
             for res in results
         ]

-        logger.
+        logger.info(
+            f'Search for term="{search_term}" using engine="{search_engine}" produced {len(results)} results.'
+        )
         return results
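With apply now taking a single search_engine per call (strings are coerced via SearchEngineName, unknown engines raise ValueError; the method is shown under the old Search header in the hunk context but the class is renamed Searcher above), a call looks roughly like this. The Searcher constructor matches the diff, while the Language/Location constructors and the enum's string value are assumptions:

import asyncio
import httpx

from fraudcrawler.base.base import Language, Location
from fraudcrawler.scraping.search import Searcher

async def main() -> None:
    async with httpx.AsyncClient() as http_client:
        searcher = Searcher(
            http_client=http_client,
            serpapi_key="<SERPAPI_KEY>",
            zyteapi_key="<ZYTEAPI_KEY>",
        )
        results = await searcher.apply(
            search_term="example product",
            search_engine="toppreise",     # assumed enum string value
            language=Language(code="de"),  # assumed constructor
            location=Location(code="ch"),  # assumed constructor
            num_results=10,
        )
        for res in results:
            print(res.search_engine_name, res.domain, res.url)

asyncio.run(main())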