fraudcrawler-0.7.21-py3-none-any.whl

@@ -0,0 +1,924 @@
+ from abc import ABC, abstractmethod
+ from enum import Enum
+ import logging
+ from pydantic import BaseModel
+ from typing import Dict, List
+ from urllib.parse import quote_plus
+
+ from bs4 import BeautifulSoup
+ from bs4.element import Tag
+ import httpx
+ from tenacity import RetryCallState
+
+ from fraudcrawler.settings import (
+     SEARCH_DEFAULT_COUNTRY_CODES,
+     TOPPREISE_SEARCH_PATHS,
+     TOPPREISE_COMPARISON_PATHS,
+ )
+ from fraudcrawler.base.base import Host, Language, Location, DomainUtils
+ from fraudcrawler.base.retry import get_async_retry
+ from fraudcrawler.scraping.zyte import ZyteAPI
+
+ logger = logging.getLogger(__name__)
+
+
+ class SearchResult(BaseModel):
+     """Model for a single search result."""
+
+     url: str
+     domain: str
+     search_engine_name: str
+     filtered: bool = False
+     filtered_at_stage: str | None = None
+
+
+ class SearchEngineName(Enum):
+     """Enum for search engine names."""
+
+     GOOGLE = "google"
+     GOOGLE_SHOPPING = "google_shopping"
+     TOPPREISE = "toppreise"
+
+
+ class SearchEngine(ABC, DomainUtils):
+     """Abstract base class for search engines."""
+
+     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
+
+     def __init__(self, http_client: httpx.AsyncClient):
+         """Initializes the SearchEngine with the given HTTP client.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+         """
+         self._http_client = http_client
+
+     @property
+     @abstractmethod
+     def _search_engine_name(self) -> str:
+         """The name of the search engine."""
+         pass
+
+     @abstractmethod
+     async def search(self, *args, **kwargs) -> List[SearchResult]:
+         """Apply the search with the given parameters and return results."""
+         pass
+
+     def _create_search_result(self, url: str) -> SearchResult:
+         """Creates the :class:`SearchResult` instance from a given url."""
+         # Get marketplace name
+         domain = self._get_domain(url=url)
+
+         # Create and return the SearchResult object
+         result = SearchResult(
+             url=url,
+             domain=domain,
+             search_engine_name=self._search_engine_name,
+         )
+         return result
+
+     @classmethod
+     def _log_before(
+         cls, url: str, params: dict | None, retry_state: RetryCallState | None
+     ) -> None:
+         """Context-aware logging before an HTTP request is made."""
+         if retry_state:
+             logger.debug(
+                 f'Performing HTTP request in {cls.__name__} to url="{url}" '
+                 f"with params={params} (attempt {retry_state.attempt_number})."
+             )
+         else:
+             logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+     @classmethod
+     def _log_before_sleep(
+         cls, url: str, params: dict | None, retry_state: RetryCallState | None
+     ) -> None:
+         """Context-aware logging before sleeping after a failed HTTP request."""
+         if retry_state and retry_state.outcome:
+             logger.warning(
+                 f"Attempt {retry_state.attempt_number} of {cls.__name__} HTTP request "
+                 f'to url="{url}" with params="{params}" '
+                 f"failed with error: {retry_state.outcome.exception()}. "
+                 f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+             )
+         else:
+             logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
+
+     async def http_client_get(
+         self, url: str, params: dict | None = None, headers: dict | None = None
+     ) -> httpx.Response:
+         """Performs a GET request with retries.
+
+         Args:
+             url: The URL to request.
+             params: Query parameters for the request.
+             headers: HTTP headers to use for the request.
+         """
+         # Perform the request and retry if necessary. There is some context-aware logging:
+         # - `before`: before the request is made (and before retrying)
+         # - `before_sleep`: if the request fails, before sleeping
+         retry = get_async_retry()
+         retry.before = lambda retry_state: self._log_before(
+             url=url, params=params, retry_state=retry_state
+         )
+         retry.before_sleep = lambda retry_state: self._log_before_sleep(
+             url=url, params=params, retry_state=retry_state
+         )
+
+         async for attempt in retry:
+             with attempt:
+                 response = await self._http_client.get(
+                     url=url,
+                     params=params,
+                     headers=headers,
+                 )
+                 response.raise_for_status()
+                 return response
+
+         # In case of not entering the for loop (for some strange reason)
+         raise RuntimeError("Retry exhausted without success")
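+     # Note: `get_async_retry` lives in fraudcrawler.base.retry and is not part
+     # of this file. Given how it is used above (`async for attempt in retry`,
+     # assignable `before`/`before_sleep` hooks), it presumably returns a
+     # tenacity `AsyncRetrying` instance; a minimal sketch of such a helper:
+     #
+     #     from tenacity import AsyncRetrying, stop_after_attempt, wait_exponential
+     #
+     #     def get_async_retry() -> AsyncRetrying:
+     #         return AsyncRetrying(
+     #             stop=stop_after_attempt(3),
+     #             wait=wait_exponential(),
+     #             reraise=True,
+     #         )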
+
+
+ class SerpAPI(SearchEngine):
+     """Base class for SerpAPI search engines."""
+
+     _endpoint = "https://serpapi.com/search"
+
+     def __init__(self, http_client: httpx.AsyncClient, api_key: str):
+         """Initializes the SerpAPI client with the given API key.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+             api_key: The API key for SerpAPI.
+         """
+         super().__init__(http_client=http_client)
+         self._api_key = api_key
+
+     @property
+     @abstractmethod
+     def _engine(self) -> str:
+         """The search engine name used in the SerpAPI request."""
+         pass
+
+     @staticmethod
+     @abstractmethod
+     def _extract_search_results_urls(data: dict) -> List[str]:
+         """Extracts search results urls from the response.
+
+         Args:
+             data: The json from the SerpAPI search response.
+         """
+         pass
+
+     @staticmethod
+     def _get_search_string(search_term: str, marketplaces: List[Host] | None) -> str:
+         """Constructs the search string with site: parameters for marketplaces."""
+         search_string = search_term
+         if marketplaces:
+             sites = [dom for host in marketplaces for dom in host.domains]
+             search_string += " site:" + " OR site:".join(s for s in sites)
+         return search_string
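+     # For example (hypothetical values): search_term="sildenafil" with two
+     # marketplaces whose domains are ["galaxus.ch"] and ["digitec.ch"] yields:
+     #     "sildenafil site:galaxus.ch OR site:digitec.ch"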
+
+     @staticmethod
+     def _get_google_domain(location: Location) -> str:
+         """Gets the Google domain for the given location, handling countries that do not follow the default google.<country code> pattern."""
+         if location.name == "Brazil":
+             return "google.com.br"
+         elif location.name == "United Kingdom":
+             return "google.co.uk"
+         elif location.name == "Argentina":
+             return "google.com.ar"
+         return f"google.{location.code}"
+
+     async def _search(
+         self,
+         search_string: str,
+         language: Language,
+         location: Location,
+         num_results: int,
+     ) -> List[str]:
+         """Performs a search using SerpAPI and returns the URLs of the results.
+
+         Args:
+             search_string: The search string to use (with potentially added site: parameters).
+             language: The language to use for the query ('hl' parameter).
+             location: The location to use for the query ('gl' parameter).
+             num_results: Max number of results to return.
+
+         The SerpAPI parameters are:
+             engine: The search engine to use ('google', 'google_shopping', etc.).
+             q: The search string (with potentially added site: parameters).
+             google_domain: The Google domain to use for the search (e.g. google.[com]).
+             location_[requested|used]: The location to use for the search.
+             tbs: The to-be-searched parameters (e.g. 'ctr:CH').
+             cr: The country code to limit the search to (e.g. 'countryCH').
+             gl: The country code to use for the search.
+             hl: The language code to use for the search.
+             num: The number of results to return.
+             api_key: The API key to use for the search.
+         """
+         engine = self._engine
+
+         # Log the search parameters
+         logger.debug(
+             f'Performing SerpAPI search with engine="{engine}", '
+             f'q="{search_string}", '
+             f'location="{location.name}", '
+             f'language="{language.code}", '
+             f"num_results={num_results}."
+         )
+
+         # Get Google domain and country code
+         google_domain = self._get_google_domain(location)
+         country_code = location.code
+
+         params: Dict[str, str | int] = {
+             "engine": engine,
+             "q": search_string,
+             "google_domain": google_domain,
+             "location_requested": location.name,
+             "location_used": location.name,
+             "tbs": f"ctr:{country_code.upper()}",
+             "cr": f"country{country_code.upper()}",
+             "gl": country_code,
+             "hl": language.code,
+             "num": num_results,
+             "api_key": self._api_key,
+         }
+         logger.debug(f"SerpAPI search with params: {params}")
+
+         # Perform the search request
+         response: httpx.Response = await self.http_client_get(
+             url=self._endpoint, params=params
+         )
+
+         # Extract the URLs from the response
+         data = response.json()
+         urls = self._extract_search_results_urls(data=data)
+
+         logger.debug(
+             f'Found a total of {len(urls)} URLs from SerpAPI search for q="{search_string}" and engine="{engine}".'
+         )
+         return urls
+
+
+ class SerpAPIGoogle(SerpAPI):
+     """Search engine for Google in SerpAPI."""
+
+     def __init__(self, http_client: httpx.AsyncClient, api_key: str):
+         """Initializes the SerpAPIGoogle client with the given API key.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+             api_key: The API key for SerpAPI.
+         """
+         super().__init__(http_client=http_client, api_key=api_key)
+
+     @property
+     def _search_engine_name(self) -> str:
+         """The name of the search engine."""
+         return SearchEngineName.GOOGLE.value
+
+     @property
+     def _engine(self) -> str:
+         """The search engine name used in the SerpAPI request."""
+         return "google"
+
+     @staticmethod
+     def _extract_search_results_urls(data: dict) -> List[str]:
+         """Extracts search results urls from the response data.
+
+         Args:
+             data: The json data from the SerpAPI search response.
+         """
+         results = data.get("organic_results")
+         if results is not None:
+             return [url for res in results if (url := res.get("link"))]
+         return []
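+     # The relevant response shape (abridged) looks like:
+     #     {"organic_results": [{"link": "https://...", ...}, ...]}
+     # so entries without a "link" key are silently skipped.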
+
+     async def search(
+         self,
+         search_term: str,
+         language: Language,
+         location: Location,
+         num_results: int,
+         marketplaces: List[Host] | None = None,
+     ) -> List[SearchResult]:
+         """Performs a Google search using SerpAPI and returns SearchResults.
+
+         Args:
+             search_term: The search term to use for the query.
+             language: The language to use for the query ('hl' parameter).
+             location: The location to use for the query ('gl' parameter).
+             num_results: Max number of results to return.
+             marketplaces: The marketplaces to include in the search.
+         """
+         # Construct the search string
+         search_string = self._get_search_string(
+             search_term=search_term,
+             marketplaces=marketplaces,
+         )
+
+         # Perform the search
+         urls = await self._search(
+             search_string=search_string,
+             language=language,
+             location=location,
+             num_results=num_results,
+         )
+
+         # Create and return SearchResult objects from the URLs
+         results = [self._create_search_result(url=url) for url in urls]
+         logger.debug(
+             f'Produced {len(results)} results from SerpAPI with engine="{self._engine}" and q="{search_string}".'
+         )
+         return results
+
+
+ class SerpAPIGoogleShopping(SerpAPI):
+     """Search engine for Google Shopping in SerpAPI."""
+
+     def __init__(self, http_client: httpx.AsyncClient, api_key: str):
+         """Initializes the SerpAPIGoogleShopping client with the given API key.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+             api_key: The API key for SerpAPI.
+         """
+         super().__init__(http_client=http_client, api_key=api_key)
+
+     @property
+     def _search_engine_name(self) -> str:
+         """The name of the search engine."""
+         return SearchEngineName.GOOGLE_SHOPPING.value
+
+     @property
+     def _engine(self) -> str:
+         """The search engine name used in the SerpAPI request."""
+         return "google_shopping"
+
+     @staticmethod
+     def _extract_search_results_urls(data: dict) -> List[str]:
+         """Extracts search results urls from the response data.
+
+         Args:
+             data: The json data from the SerpAPI search response.
+         """
+         results = data.get("shopping_results")
+         if results is not None:
+             # return [url for res in results if (url := res.get("product_link"))]  # c.f. https://github.com/serpapi/public-roadmap/issues/3045
+             return [
+                 url
+                 for res in results
+                 if (url := res.get("serpapi_immersive_product_api"))
+             ]
+         return []
+
+     @staticmethod
+     def _extract_product_urls_from_immersive_product_api(data: dict) -> List[str]:
+         """Extracts product urls from the SerpAPI immersive product API data."""
+         if results := data.get("product_results"):
+             stores = results.get("stores", [])
+             urls = [url for sre in stores if (url := sre.get("link"))]
+             return list(set(urls))
+         return []
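+     # The relevant immersive product API response shape (abridged) looks like:
+     #     {"product_results": {"stores": [{"link": "https://..."}, ...]}}
+     # and the set() deduplicates stores listing the same URL.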
+
+     async def search(
+         self,
+         search_term: str,
+         language: Language,
+         location: Location,
+         num_results: int,
+         marketplaces: List[Host] | None = None,
+     ) -> List[SearchResult]:
+         """Performs a Google Shopping search using SerpAPI and returns SearchResults.
+
+         Similar to Toppreise, this method extracts merchant URLs from Google Shopping product pages
+         and creates multiple SearchResult objects for each merchant URL found.
+
+         Args:
+             search_term: The search term to use for the query.
+             language: The language to use for the query ('hl' parameter).
+             location: The location to use for the query ('gl' parameter).
+             num_results: Max number of results to return.
+             marketplaces: The marketplaces to include in the search.
+         """
+         # Construct the search string
+         search_string = self._get_search_string(
+             search_term=search_term,
+             marketplaces=marketplaces,
+         )
+
+         # Perform the search to get Google Shopping URLs
+         urls = await self._search(
+             search_string=search_string,
+             language=language,
+             location=location,
+             num_results=num_results,
+         )
+
+         # !!! NOTE !!!: Google Shopping results do not properly support the 'num' parameter,
+         # so we might get more results than requested. This is a known issue with SerpAPI
+         # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
+         urls = urls[:num_results]
+
+         # Create SearchResult objects from merchant URLs (similar to the Toppreise pattern)
+         results = [self._create_search_result(url=url) for url in urls]
+         logger.debug(
+             f'Produced {len(results)} results from Google Shopping search with q="{search_string}".'
+         )
+         return results
+
+
+ class Toppreise(SearchEngine):
+     """Search engine for toppreise.ch."""
+
+     _endpoint = "https://www.toppreise.ch/"
+
+     def __init__(self, http_client: httpx.AsyncClient, zyteapi_key: str):
+         """Initializes the Toppreise client.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+             zyteapi_key: ZyteAPI key for fallback when direct access fails.
+         """
+         super().__init__(http_client=http_client)
+         self._zyteapi = ZyteAPI(http_client=http_client, api_key=zyteapi_key)
+
+     async def http_client_get_with_fallback(self, url: str) -> bytes:
+         """Performs a GET request with retries.
+
+         If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+         content using Zyte proxy mode.
+
+         Args:
+             url: The URL to request.
+         """
+         # Try to access the URL directly
+         try:
+             response: httpx.Response = await self.http_client_get(
+                 url=url, headers=self._headers
+             )
+             content = response.content
+
+         # If we get a 403 error (can happen depending on the IP/location of the deployment),
+         # we try to unblock the URL using Zyte proxy mode
+         except httpx.HTTPStatusError as err_direct:
+             if err_direct.response.status_code == 403:
+                 logger.warning(
+                     f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
+                 )
+                 try:
+                     content = await self._zyteapi.unblock_url_content(url)
+                 except Exception as err_resolve:
+                     msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
+                     logger.error(msg, exc_info=True)
+                     raise httpx.HTTPError(msg) from err_resolve
+             else:
+                 raise err_direct
+         return content
+
+     @classmethod
+     def _get_search_endpoint(cls, language: Language) -> str:
+         """Get the search endpoint based on the language."""
+         search_path = TOPPREISE_SEARCH_PATHS.get(
+             language.code, TOPPREISE_SEARCH_PATHS["default"]
+         )
+         return f"{cls._endpoint}{search_path}"
+
+     @staticmethod
+     def _extract_links(
+         element: Tag, ext_products: bool = True, comp_products: bool = True
+     ) -> List[str]:
+         """Extracts all relevant product URLs from a BeautifulSoup object of a Toppreise page.
+
+         Note:
+             Depending on the arguments, it extracts:
+             - product comparison URLs (i.e. https://www.toppreise.ch/preisvergleich/...)
+             - external product URLs (i.e. https://www.example.com/ext_...).
+
+         Args:
+             element: BeautifulSoup Tag object containing the HTML to parse.
+             ext_products: Whether to extract external product URLs.
+             comp_products: Whether to extract product comparison URLs.
+         """
+         # Find all links in the page
+         links = element.find_all("a", href=True)
+
+         # Filter links to only include external product links
+         hrefs = [
+             href
+             for link in links
+             if (
+                 hasattr(link, "get")  # Ensure we have a Tag object with href attribute
+                 and (href := link.get("href"))  # Ensure href is not None
+                 and not href.startswith("javascript:")  # Skip javascript links
+                 and isinstance(href, str)  # Ensure href is a string
+                 # Make sure the link is either an external product link (href contains 'ext_')
+                 # or is a search result link (href contains 'preisvergleich', 'comparison-prix', or 'price-comparison')
+                 and (
+                     ("ext_" in href and ext_products)
+                     or (
+                         any(pth in href for pth in TOPPREISE_COMPARISON_PATHS)
+                         and comp_products
+                     )
+                 )
+             )
+         ]
+
+         # Make relative URLs absolute
+         urls = []
+         for href in hrefs:
+             if href.startswith("/"):
+                 href = f"https://www.toppreise.ch{href}"
+             elif not href.startswith("http"):
+                 href = f"https://www.toppreise.ch/{href}"
+             urls.append(href)
+
+         # Return deduplicated urls
+         urls = list(set(urls))
+         return urls
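+     # For example (illustrative hrefs, assuming TOPPREISE_COMPARISON_PATHS
+     # contains "preisvergleich"):
+     #     "/preisvergleich/Apple-AirPods-p123"  -> kept (comparison link, made absolute)
+     #     "https://shop.example.ch/ext_abc123"  -> kept (external product link)
+     #     "javascript:void(0)"                  -> skipped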
+
+     def _extract_product_urls_from_search_page(self, content: bytes) -> List[str]:
+         """Extracts product urls from a Toppreise search page (i.e. https://www.toppreise.ch/produktsuche)."""
+
+         # Parse the HTML
+         soup = BeautifulSoup(content, "html.parser")
+         main = soup.find("div", id="Page_Browsing")
+         if not isinstance(main, Tag):
+             logger.warning("No main content found in Toppreise search page.")
+             return []
+
+         # Extract links (external product links and comparison links)
+         urls = self._extract_links(element=main)
+
+         logger.debug(f"Found {len(urls)} product URLs from Toppreise search results.")
+         return urls
+
+     def _extract_product_urls_from_comparison_page(self, content: bytes) -> List[str]:
+         """Extracts product urls from a Toppreise product comparison page (i.e. https://www.toppreise.ch/preisvergleich/...)."""
+
+         # Parse the HTML
+         soup = BeautifulSoup(content, "html.parser")
+
+         # Extract links (external product links only)
+         urls = self._extract_links(element=soup, comp_products=False)
+
+         logger.debug(
+             f"Found {len(urls)} external product URLs from Toppreise comparison page."
+         )
+         return urls
+
+     @property
+     def _search_engine_name(self) -> str:
+         """The name of the search engine."""
+         return SearchEngineName.TOPPREISE.value
+
+     async def _search(
+         self, search_string: str, language: Language, num_results: int
+     ) -> List[str]:
+         """Performs a search on Toppreise and returns the URLs of the results.
+
+         If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+         content using Zyte proxy mode.
+
+         Args:
+             search_string: The search string to use for the query.
+             language: The language to use for the query.
+             num_results: Max number of results to return.
+         """
+         # Build the search URL for Toppreise
+         endpoint = self._get_search_endpoint(language=language)
+         encoded_search = quote_plus(search_string)
+         url = f"{endpoint}?q={encoded_search}"
+         logger.debug(f"Toppreise search URL: {url}")
+
+         # Perform the request with fallback if necessary
+         content = await self.http_client_get_with_fallback(url=url)
+
+         # Get external product urls from the content
+         urls = self._extract_product_urls_from_search_page(content=content)
+         urls = urls[:num_results]  # Limit to num_results if needed
+
+         return urls
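+     # For example (assuming TOPPREISE_SEARCH_PATHS maps the language to
+     # "produktsuche"), search_string="apple airpods" produces the request URL:
+     #     https://www.toppreise.ch/produktsuche?q=apple+airpods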
+
+     async def search(
+         self,
+         search_term: str,
+         language: Language,
+         num_results: int,
+     ) -> List[SearchResult]:
+         """Performs a Toppreise search and returns SearchResults.
+
+         Args:
+             search_term: The search term to use for the query.
+             language: The language to use for the search.
+             num_results: Max number of results to return.
+         """
+         # Perform the search
+         urls = await self._search(
+             search_string=search_term,
+             language=language,
+             num_results=num_results,
+         )
+
+         # Create and return SearchResult objects from the URLs
+         results = [self._create_search_result(url=url) for url in urls]
+         logger.debug(
+             f'Produced {len(results)} results from Toppreise search with q="{search_term}".'
+         )
+         return results
+
+
+ class Searcher(DomainUtils):
+     """Class to perform searches using different search engines."""
+
+     _post_search_retry_stop_after = 3
+
+     def __init__(
+         self, http_client: httpx.AsyncClient, serpapi_key: str, zyteapi_key: str
+     ):
+         """Initializes the Searcher with the given API keys.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+             serpapi_key: The API key for SerpAPI.
+             zyteapi_key: ZyteAPI key for fallback when direct access fails.
+         """
+         self._http_client = http_client
+         self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
+         self._google_shopping = SerpAPIGoogleShopping(
+             http_client=http_client,
+             api_key=serpapi_key,
+         )
+         self._toppreise = Toppreise(
+             http_client=http_client,
+             zyteapi_key=zyteapi_key,
+         )
+
+     async def _post_search_google_shopping_immersive(self, url: str) -> List[str]:
+         """Post-search for product URLs from a Google Shopping immersive product page.
+
+         Args:
+             url: The URL of the Google Shopping product page.
+         """
+         # Add the SerpAPI key to the url
+         sep = "&" if "?" in url else "?"
+         url = f"{url}{sep}api_key={self._google_shopping._api_key}"
+
+         # Fetch the content of the Google Shopping product page
+         response = await self._google_shopping.http_client_get(url=url)
+
+         # Get external product urls from the data
+         data = response.json()
+         urls = self._google_shopping._extract_product_urls_from_immersive_product_api(
+             data=data
+         )
+         return urls
+
+     async def _post_search_toppreise_comparison(self, url: str) -> List[str]:
+         """Post-search for product URLs from a Toppreise product comparison page.
+
+         Note:
+             In contrast to Toppreise._search, here we extract the urls from
+             product comparison pages (e.g. https://www.toppreise.ch/preisvergleich/). These
+             pages can also be found in the results of a Google search.
+
+         Args:
+             url: The URL of the Toppreise product listing page.
+         """
+         # Perform the request with fallback if necessary
+         content = await self._toppreise.http_client_get_with_fallback(url=url)
+
+         # Get external product urls from the content
+         urls = self._toppreise._extract_product_urls_from_comparison_page(
+             content=content
+         )
+         return urls
+
+     async def _post_search(self, results: List[SearchResult]) -> List[SearchResult]:
+         """Post-search for additional embedded product URLs from the obtained results.
+
+         Note:
+             This function is used to extract embedded product URLs from
+             product listing pages (e.g. Toppreise, Google Shopping) if needed.
+
+         Args:
+             results: The list of SearchResult objects obtained from the search.
+         """
+         post_search_results: List[SearchResult] = []
+         for res in results:
+             url = res.url
+             post_search_urls: List[str] = []
+
+             # Extract embedded product URLs from the Google Shopping immersive product page
+             if "engine=google_immersive_product" in url:
+                 logger.debug(
+                     f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                 )
+                 post_search_urls = await self._post_search_google_shopping_immersive(
+                     url=url
+                 )
+                 logger.debug(
+                     f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                 )
+
+             # Extract embedded product URLs from the Toppreise product listing page
+             elif any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
+                 logger.debug(
+                     f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                 )
+                 post_search_urls = await self._post_search_toppreise_comparison(url=url)
+                 logger.debug(
+                     f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                 )
+
+             # Add the extracted product URLs as SearchResult objects
+             psr = [
+                 SearchResult(
+                     url=psu,
+                     domain=self._get_domain(url=psu),
+                     search_engine_name=res.search_engine_name,
+                 )
+                 for psu in post_search_urls
+             ]
+             post_search_results.extend(psr)
+
+         return post_search_results
+
+     @staticmethod
+     def _domain_in_host(domain: str, host: Host) -> bool:
+         """Checks if the domain is present in the host.
+
+         Note:
+             By checking `if domain == hst_dom or domain.endswith(f".{hst_dom}")`
+             it also checks for subdomains. For example, if the domain is
+             `link.springer.com` and the host domain is `springer.com`,
+             it will be detected as being present in the hosts.
+
+         Args:
+             domain: The domain to check.
+             host: The host to check against.
+         """
+         return any(
+             domain == hst_dom or domain.endswith(f".{hst_dom}")
+             for hst_dom in host.domains
+         )
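+     # For example, _domain_in_host("link.springer.com", host) is True for a
+     # host whose domains include "springer.com", since the subdomain check
+     # domain.endswith(".springer.com") matches.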
+
+     def _domain_in_hosts(self, domain: str, hosts: List[Host]) -> bool:
+         """Checks if the domain is present in the list of hosts.
+
+         Args:
+             domain: The domain to check.
+             hosts: The list of hosts to check against.
+         """
+         return any(self._domain_in_host(domain=domain, host=hst) for hst in hosts)
+
+     @staticmethod
+     def _relevant_country_code(url: str, country_code: str) -> bool:
+         """Determines whether the url shows relevant country codes.
+
+         Args:
+             url: The URL to investigate.
+             country_code: The country code used to filter the products.
+         """
+         url = url.lower()
+         country_code_relevance = f".{country_code}" in url
+         default_relevance = any(cc in url for cc in SEARCH_DEFAULT_COUNTRY_CODES)
+         return country_code_relevance or default_relevance
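+     # For example, url="https://shop.example.ch/product" with country_code="ch"
+     # is relevant because ".ch" appears in the URL; URLs matching one of the
+     # SEARCH_DEFAULT_COUNTRY_CODES (e.g. ".com", assuming that is among the
+     # defaults) are kept as well.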
791
+
792
+ def _is_excluded_url(self, domain: str, excluded_urls: List[Host]) -> bool:
793
+ """Checks if the domain is in the excluded URLs.
794
+
795
+ Args:
796
+ domain: The domain to check.
797
+ excluded_urls: The list of excluded URLs.
798
+ """
799
+ return self._domain_in_hosts(domain=domain, hosts=excluded_urls)
800
+
801
+ def _apply_filters(
802
+ self,
803
+ result: SearchResult,
804
+ location: Location,
805
+ marketplaces: List[Host] | None = None,
806
+ excluded_urls: List[Host] | None = None,
807
+ ) -> SearchResult:
808
+ """Checks for filters and updates the SearchResult accordingly.
809
+
810
+ Args:
811
+ result: The SearchResult object to check.
812
+ location: The location to use for the query.
813
+ marketplaces: The list of marketplaces to compare the URL against.
814
+ excluded_urls: The list of excluded URLs.
815
+ """
816
+ domain = result.domain
817
+ # Check if the URL is in the marketplaces (if yes, keep the result un-touched)
818
+ if marketplaces:
819
+ if self._domain_in_hosts(domain=domain, hosts=marketplaces):
820
+ return result
821
+
822
+ # Check if the URL has a relevant country_code
823
+ if not self._relevant_country_code(url=result.url, country_code=location.code):
824
+ result.filtered = True
825
+ result.filtered_at_stage = "Search (country code filtering)"
826
+ return result
827
+
828
+ # Check if the URL is in the excluded URLs
829
+ if excluded_urls and self._is_excluded_url(result.domain, excluded_urls):
830
+ result.filtered = True
831
+ result.filtered_at_stage = "Search (excluded URLs filtering)"
832
+ return result
833
+
834
+ return result
835
+
836
+ async def apply(
837
+ self,
838
+ search_term: str,
839
+ search_engine: SearchEngineName | str,
840
+ language: Language,
841
+ location: Location,
842
+ num_results: int,
843
+ marketplaces: List[Host] | None = None,
844
+ excluded_urls: List[Host] | None = None,
845
+ ) -> List[SearchResult]:
846
+ """Performs a search and returns SearchResults.
847
+
848
+ Args:
849
+ search_term: The search term to use for the query.
850
+ search_engine: The search engine to use for the search.
851
+ language: The language to use for the query ('hl' parameter).
852
+ location: The location to use for the query ('gl' parameter).
853
+ num_results: Max number of results per search engine.
854
+ marketplaces: The marketplaces to include in the search.
855
+ excluded_urls: The URLs to exclude from the search.
856
+ """
857
+ logger.info(
858
+ f'Performing search for term="{search_term}" using engine="{search_engine}".'
859
+ )
860
+
861
+ # -------------------------------
862
+ # SEARCH
863
+ # -------------------------------
864
+ # Map string to SearchEngineName if needed
865
+ if isinstance(search_engine, str):
866
+ search_engine = SearchEngineName(search_engine)
867
+
868
+ # Make SerpAPI google search
869
+ if search_engine == SearchEngineName.GOOGLE:
870
+ results = await self._google.search(
871
+ search_term=search_term,
872
+ language=language,
873
+ location=location,
874
+ num_results=num_results,
875
+ marketplaces=marketplaces,
876
+ )
877
+
878
+ # Make SerpAPI google shopping search
879
+ elif search_engine == SearchEngineName.GOOGLE_SHOPPING:
880
+ results = await self._google_shopping.search(
881
+ search_term=search_term,
882
+ language=language,
883
+ location=location,
884
+ num_results=num_results,
885
+ marketplaces=marketplaces,
886
+ )
887
+
888
+ # Make Toppreise search
889
+ elif search_engine == SearchEngineName.TOPPREISE:
890
+ results = await self._toppreise.search(
891
+ search_term=search_term,
892
+ language=language,
893
+ num_results=num_results,
894
+ )
895
+
896
+ # Other search engines can be added here (raise unknown engine error otherwise)
897
+ else:
898
+ raise ValueError(f"Unknown search engine: {search_engine}")
899
+
900
+ # -------------------------------
901
+ # POST-SEARCH URL EXTRACTION
902
+ # -------------------------------
903
+ post_search_results = await self._post_search(results=results)
904
+ post_search_results = post_search_results[:num_results]
905
+ results.extend(post_search_results)
906
+
907
+ # -------------------------------
908
+ # FILTERS
909
+ # -------------------------------
910
+ # Apply filters
911
+ results = [
912
+ self._apply_filters(
913
+ result=res,
914
+ location=location,
915
+ marketplaces=marketplaces,
916
+ excluded_urls=excluded_urls,
917
+ )
918
+ for res in results
919
+ ]
920
+
921
+ logger.info(
922
+ f'Search for term="{search_term}" using engine="{search_engine}" produced {len(results)} results.'
923
+ )
924
+ return results
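+
+ # Usage sketch (illustrative only, not part of the released module). It assumes
+ # a configured httpx.AsyncClient, valid API keys, and Language/Location objects
+ # with the `code`/`name` attributes used above:
+ #
+ #     async with httpx.AsyncClient() as client:
+ #         searcher = Searcher(
+ #             http_client=client, serpapi_key="...", zyteapi_key="..."
+ #         )
+ #         results = await searcher.apply(
+ #             search_term="sildenafil",
+ #             search_engine=SearchEngineName.GOOGLE,
+ #             language=language,
+ #             location=location,
+ #             num_results=10,
+ #         )
+ #         urls = [res.url for res in results if not res.filtered]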