fraudcrawler 0.5.0__py3-none-any.whl → 0.7.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,12 +6,18 @@ from typing import Dict, List
 from urllib.parse import quote_plus
 
 from bs4 import BeautifulSoup
+from bs4.element import Tag
 import httpx
 from tenacity import RetryCallState
 
-from fraudcrawler.settings import SEARCH_DEFAULT_COUNTRY_CODES
+from fraudcrawler.settings import (
+    SEARCH_DEFAULT_COUNTRY_CODES,
+    TOPPREISE_SEARCH_PATHS,
+    TOPPREISE_COMPARISON_PATHS,
+)
 from fraudcrawler.base.base import Host, Language, Location, DomainUtils
 from fraudcrawler.base.retry import get_async_retry
+from fraudcrawler.scraping.zyte import ZyteAPI
 
 logger = logging.getLogger(__name__)
 
@@ -39,6 +45,14 @@ class SearchEngine(ABC, DomainUtils):
 
     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
 
+    def __init__(self, http_client: httpx.AsyncClient):
+        """Initializes the SearchEngine with the given HTTP client.
+
+        Args:
+            http_client: An httpx.AsyncClient to use for the async requests.
+        """
+        self._http_client = http_client
+
     @property
     @abstractmethod
     def _search_engine_name(self) -> str:
@@ -50,45 +64,81 @@ class SearchEngine(ABC, DomainUtils):
         """Apply the search with the given parameters and return results."""
         pass
 
+    def _create_search_result(self, url: str) -> SearchResult:
+        """Creates a :class:`SearchResult` instance from the given URL."""
+        # Get marketplace name
+        domain = self._get_domain(url=url)
+
+        # Create and return the SearchResult object
+        result = SearchResult(
+            url=url,
+            domain=domain,
+            search_engine_name=self._search_engine_name,
+        )
+        return result
+
     @classmethod
     def _log_before(
-        cls, search_string: str, retry_state: RetryCallState | None
+        cls, url: str, params: dict | None, retry_state: RetryCallState | None
     ) -> None:
-        """Context aware logging before the request is made."""
+        """Context-aware logging before the HTTP request is made."""
         if retry_state:
             logger.debug(
-                f'Performing search in {cls.__name__} with q="{search_string}" '
-                f"(attempt {retry_state.attempt_number})."
+                f'Performing HTTP request in {cls.__name__} to url="{url}" '
+                f"with params={params} (attempt {retry_state.attempt_number})."
             )
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before.")
 
     @classmethod
     def _log_before_sleep(
-        cls, search_string: str, retry_state: RetryCallState | None
+        cls, url: str, params: dict | None, retry_state: RetryCallState | None
     ) -> None:
-        """Context aware logging before sleeping after a failed request."""
+        """Context-aware logging before sleeping after a failed HTTP request."""
         if retry_state and retry_state.outcome:
             logger.warning(
-                f'Attempt {retry_state.attempt_number} of {cls.__name__} search with q="{search_string}" '
+                f"Attempt {retry_state.attempt_number} of {cls.__name__} HTTP request "
+                f'to url="{url}" with params="{params}" '
                 f"failed with error: {retry_state.outcome.exception()}. "
                 f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
             )
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
 
-    def _create_search_result(self, url: str) -> SearchResult:
-        """From a given url it creates the class:`SearchResult` instance."""
-        # Get marketplace name
-        domain = self._get_domain(url=url)
+    async def http_client_get(
+        self, url: str, params: dict | None = None, headers: dict | None = None
+    ) -> httpx.Response:
+        """Performs a GET request with retries.
 
-        # Create and return the SearchResult object
-        result = SearchResult(
-            url=url,
-            domain=domain,
-            search_engine_name=self._search_engine_name,
+        Args:
+            url: The URL to request.
+            params: Query parameters for the request.
+            headers: HTTP headers to use for the request.
+
+        """
+        # Perform the request and retry if necessary. There is some context-aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails, before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            url=url, params=params, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            url=url, params=params, retry_state=retry_state
         )
-        return result
+
+        async for attempt in retry:
+            with attempt:
+                response = await self._http_client.get(
+                    url=url,
+                    params=params,
+                    headers=headers,
+                )
+                response.raise_for_status()
+                return response
+
+        # Safety net in case the retry loop is never entered
+        raise RuntimeError("Retry exhausted without success")
 
 
 class SerpAPI(SearchEngine):
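
The retry logic that 0.5.0 duplicated inside each engine's `_search` is now centralized in `SearchEngine.http_client_get` above. A minimal, self-contained sketch of the same pattern, with tenacity's `AsyncRetrying` standing in for the package's `get_async_retry()` (whose exact stop/wait policy is not visible in this diff):

import asyncio
import logging

import httpx
from tenacity import AsyncRetrying, stop_after_attempt, wait_exponential

logger = logging.getLogger(__name__)


async def get_with_retries(client: httpx.AsyncClient, url: str) -> httpx.Response:
    # Stand-in policy; fraudcrawler's actual strategy comes from get_async_retry()
    retry = AsyncRetrying(
        stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10), reraise=True
    )
    # Context-aware logging hook, as in the diff above
    retry.before = lambda rs: logger.debug(f'GET url="{url}" (attempt {rs.attempt_number}).')
    async for attempt in retry:
        with attempt:
            response = await client.get(url)
            response.raise_for_status()  # 4xx/5xx raise httpx.HTTPStatusError -> retried
            return response
    raise RuntimeError("Retry exhausted without success")


async def main() -> None:
    async with httpx.AsyncClient(timeout=10.0) as client:
        response = await get_with_retries(client, "https://example.com")
        print(response.status_code)


if __name__ == "__main__":
    asyncio.run(main())

Returning from inside `with attempt:` ends the loop on the first success, so the trailing `RuntimeError` is only a safety net, exactly as in the new helper.
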
@@ -103,7 +153,7 @@ class SerpAPI(SearchEngine):
             http_client: An httpx.AsyncClient to use for the async requests.
             api_key: The API key for SerpAPI.
         """
-        self._http_client = http_client
+        super().__init__(http_client=http_client)
         self._api_key = api_key
 
     @property
@@ -131,6 +181,17 @@ class SerpAPI(SearchEngine):
             search_string += " site:" + " OR site:".join(s for s in sites)
         return search_string
 
+    @staticmethod
+    def _get_google_domain(location: Location) -> str:
+        """Gets the Google domain for locations that do not follow the default google.<tld> pattern."""
+        if location.name == "Brazil":
+            return "google.com.br"
+        elif location.name == "United Kingdom":
+            return "google.co.uk"
+        elif location.name == "Argentina":
+            return "google.com.ar"
+        return f"google.{location.code}"
+
     async def _search(
         self,
         search_string: str,
@@ -169,38 +230,29 @@ class SerpAPI(SearchEngine):
             f"num_results={num_results}."
         )
 
-        # Setup the parameters
+        # Get Google domain and country code
+        google_domain = self._get_google_domain(location)
+        country_code = location.code
+
         params: Dict[str, str | int] = {
             "engine": engine,
             "q": search_string,
-            "google_domain": f"google.{location.code}",
+            "google_domain": google_domain,
             "location_requested": location.name,
             "location_used": location.name,
-            "tbs": f"ctr:{location.code.upper()}",
-            "cr": f"country{location.code.upper()}",
-            "gl": location.code,
+            "tbs": f"ctr:{country_code.upper()}",
+            "cr": f"country{country_code.upper()}",
+            "gl": country_code,
             "hl": language.code,
             "num": num_results,
             "api_key": self._api_key,
         }
         logger.debug(f"SerpAPI search with params: {params}")
 
-        # Perform the request and retry if necessary. There is some context aware logging:
-        # - `before`: before the request is made (and before retrying)
-        # - `before_sleep`: if the request fails before sleeping
-        retry = get_async_retry()
-        retry.before = lambda retry_state: self._log_before(
-            search_string=search_string, retry_state=retry_state
+        # Perform the search request
+        response: httpx.Response = await self.http_client_get(
+            url=self._endpoint, params=params
         )
-        retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            search_string=search_string, retry_state=retry_state
-        )
-        async for attempt in retry:
-            with attempt:
-                response = await self._http_client.get(
-                    url=self._endpoint, params=params
-                )
-                response.raise_for_status()
 
         # Extract the URLs from the response
         data = response.json()
@@ -316,7 +368,21 @@ class SerpAPIGoogleShopping(SerpAPI):
         """
         results = data.get("shopping_results")
         if results is not None:
-            return [url for res in results if (url := res.get("product_link"))]
+            # return [url for res in results if (url := res.get("product_link"))]  # cf. https://github.com/serpapi/public-roadmap/issues/3045
+            return [
+                url
+                for res in results
+                if (url := res.get("serpapi_immersive_product_api"))
+            ]
+        return []
+
+    @staticmethod
+    def _extract_product_urls_from_immersive_product_api(data: dict) -> List[str]:
+        """Extracts product URLs from the SerpAPI immersive product API data."""
+        if results := data.get("product_results"):
+            stores = results.get("stores", [])
+            urls = [url for sre in stores if (url := sre.get("link"))]
+            return list(set(urls))
         return []
 
     async def search(
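
For reference, a trimmed, illustrative shape of the immersive-product payload that the new `_extract_product_urls_from_immersive_product_api` helper walks. Only the keys the helper actually reads are shown; all values here are made up, and the rest of SerpAPI's response is omitted:

# Hypothetical, trimmed payload (illustrative values only)
data = {
    "product_results": {
        "stores": [
            {"name": "Shop A", "link": "https://shop-a.example/product/123"},
            {"name": "Shop B", "link": "https://shop-b.example/item/456"},
        ],
    },
}
# _extract_product_urls_from_immersive_product_api(data) would return the
# deduplicated store links, i.e. both example URLs above (order not guaranteed,
# since the helper passes them through set()).
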
@@ -329,6 +395,9 @@ class SerpAPIGoogleShopping(SerpAPI):
     ) -> List[SearchResult]:
         """Performs a google shopping search using SerpApi and returns SearchResults.
 
+        Similar to Toppreise, this method extracts merchant URLs from Google Shopping
+        product pages and creates a SearchResult object for each merchant URL found.
+
         Args:
             search_term: The search term to use for the query.
             language: The language to use for the query ('hl' parameter).
@@ -342,7 +411,7 @@ class SerpAPIGoogleShopping(SerpAPI):
             marketplaces=marketplaces,
         )
 
-        # Perform the search
+        # Perform the search to get Google Shopping URLs
         urls = await self._search(
             search_string=search_string,
             language=language,
@@ -355,10 +424,10 @@ class SerpAPIGoogleShopping(SerpAPI):
         # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
         urls = urls[:num_results]
 
-        # Create and return SearchResult objects from the URLs
+        # Create SearchResult objects from merchant URLs (similar to the Toppreise pattern)
         results = [self._create_search_result(url=url) for url in urls]
         logger.debug(
-            f'Produced {len(results)} results from SerpAPI with engine="{self._engine}" and q="{search_string}".'
+            f'Produced {len(results)} results from Google Shopping search with q="{search_string}".'
         )
         return results
 
@@ -366,36 +435,77 @@ class SerpAPIGoogleShopping(SerpAPI):
 class Toppreise(SearchEngine):
     """Search engine for toppreise.ch."""
 
-    _endpoint = "https://www.toppreise.ch/produktsuche"
-    _headers = {
-        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-        "Accept-Language": "en-US,en;q=0.5",
-        "Accept-Encoding": "gzip, deflate",
-        "Connection": "keep-alive",
-        "Upgrade-Insecure-Requests": "1",
-    }
+    _endpoint = "https://www.toppreise.ch/"
 
-    def __init__(self, http_client: httpx.AsyncClient):
+    def __init__(self, http_client: httpx.AsyncClient, zyteapi_key: str):
         """Initializes the Toppreise client.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
-        self._http_client = http_client
+        super().__init__(http_client=http_client)
+        self._zyteapi = ZyteAPI(http_client=http_client, api_key=zyteapi_key)
 
-    @property
-    def _search_engine_name(self) -> str:
-        """The name of the search engine."""
-        return SearchEngineName.TOPPREISE.value
+    async def http_client_get_with_fallback(self, url: str) -> bytes:
+        """Performs a GET request with retries.
+
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
+        Args:
+            url: The URL to request.
+        """
+        # Try to access the URL directly
+        try:
+            response: httpx.Response = await self.http_client_get(
+                url=url, headers=self._headers
+            )
+            content = response.content
+
+        # If we get a 403 error (this can happen depending on the IP/location of the
+        # deployment), we try to unblock the URL using Zyte proxy mode
+        except httpx.HTTPStatusError as err_direct:
+            if err_direct.response.status_code == 403:
+                logger.warning(
+                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
+                )
+                try:
+                    content = await self._zyteapi.unblock_url_content(url)
+                except Exception as err_resolve:
+                    msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
+                    logger.error(msg, exc_info=True)
+                    raise httpx.HTTPError(msg) from err_resolve
+            else:
+                raise err_direct
+        return content
+
+    @classmethod
+    def _get_search_endpoint(cls, language: Language) -> str:
+        """Gets the search endpoint based on the language."""
+        search_path = TOPPREISE_SEARCH_PATHS.get(
+            language.code, TOPPREISE_SEARCH_PATHS["default"]
+        )
+        return f"{cls._endpoint}{search_path}"
 
     @staticmethod
-    def _get_external_product_urls(content: bytes) -> List[str]:
-        """Extracts external product URLs from the Toppreise search results page."""
+    def _extract_links(
+        element: Tag, ext_products: bool = True, comp_products: bool = True
+    ) -> List[str]:
+        """Extracts all relevant product URLs from a BeautifulSoup object of a Toppreise page.
 
-        # Parse the HTML
-        soup = BeautifulSoup(content, "html.parser")
-        links = soup.find_all("a", href=True)
+        Note:
+            Depending on the arguments, it extracts:
+            - product comparison URLs (i.e. https://www.toppreise.ch/preisvergleich/...)
+            - external product URLs (i.e. https://www.example.com/ext_...).
+
+        Args:
+            element: BeautifulSoup Tag object containing the HTML to parse.
+            ext_products: Whether to extract external product URLs.
+            comp_products: Whether to extract product comparison URLs.
+        """
+        # Find all links in the page
+        links = element.find_all("a", href=True)
 
         # Filter links to only include external product links
         hrefs = [
@@ -406,7 +516,15 @@ class Toppreise(SearchEngine):
             and (href := link.get("href"))  # Ensure href is not None
             and not href.startswith("javascript:")  # Skip javascript links
             and isinstance(href, str)  # Ensure href is a string
-            and "ext_" in href  # Skip links that are not external product link
+            # Make sure the link is either an external product link (href contains 'ext_')
+            # or a product comparison link (href contains 'preisvergleich', 'comparison-prix', or 'price-comparison')
+            and (
+                ("ext_" in href and ext_products)
+                or (
+                    any(pth in href for pth in TOPPREISE_COMPARISON_PATHS)
+                    and comp_products
+                )
+            )
         )
     ]
 
@@ -421,44 +539,67 @@ class Toppreise(SearchEngine):
 
         # Return deduplicated urls
         urls = list(set(urls))
+        return urls
+
+    def _extract_product_urls_from_search_page(self, content: bytes) -> List[str]:
+        """Extracts product URLs from a Toppreise search page (i.e. https://www.toppreise.ch/produktsuche)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+        main = soup.find("div", id="Page_Browsing")
+        if not isinstance(main, Tag):
+            logger.warning("No main content found in Toppreise search page.")
+            return []
+
+        # Extract links (external product links and comparison links)
+        urls = self._extract_links(element=main)
+
+        logger.debug(f"Found {len(urls)} product URLs from Toppreise search results.")
+        return urls
+
+    def _extract_product_urls_from_comparison_page(self, content: bytes) -> List[str]:
+        """Extracts product URLs from a Toppreise product comparison page (i.e. https://www.toppreise.ch/preisvergleich/...)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+
+        # Extract links (external product links only)
+        urls = self._extract_links(element=soup, comp_products=False)
+
         logger.debug(
-            f"Found {len(urls)} external product URLs from Toppreise search results."
+            f"Found {len(urls)} external product URLs from Toppreise comparison page."
         )
         return urls
 
-    async def _search(self, search_string: str, num_results: int) -> List[str]:
+    @property
+    def _search_engine_name(self) -> str:
+        """The name of the search engine."""
+        return SearchEngineName.TOPPREISE.value
+
+    async def _search(
+        self, search_string: str, language: Language, num_results: int
+    ) -> List[str]:
         """Performs a search on Toppreise and returns the URLs of the results.
 
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
         Args:
             search_string: The search string to use for the query.
+            language: The language to use for the query.
             num_results: Max number of results to return.
         """
         # Build the search URL for Toppreise
+        endpoint = self._get_search_endpoint(language=language)
         encoded_search = quote_plus(search_string)
-        url = f"{self._endpoint}?q={encoded_search}"
+        url = f"{endpoint}?q={encoded_search}"
         logger.debug(f"Toppreise search URL: {url}")
 
-        # Perform the request and retry if necessary. There is some context aware logging:
-        # - `before`: before the request is made (and before retrying)
-        # - `before_sleep`: if the request fails before sleeping
-        retry = get_async_retry()
-        retry.before = lambda retry_state: self._log_before(
-            search_string=search_string, retry_state=retry_state
-        )
-        retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            search_string=search_string, retry_state=retry_state
-        )
-        async for attempt in retry:
-            with attempt:
-                response = await self._http_client.get(
-                    url=url,
-                    headers=self._headers,
-                )
-                response.raise_for_status()
+        # Perform the request with fallback if necessary
+        content = await self.http_client_get_with_fallback(url=url)
 
         # Get external product urls from the content
-        content = response.content
-        urls = self._get_external_product_urls(content=content)
+        urls = self._extract_product_urls_from_search_page(content=content)
         urls = urls[:num_results]  # Limit to num_results if needed
 
         return urls
@@ -466,17 +607,20 @@ class Toppreise(SearchEngine):
     async def search(
         self,
         search_term: str,
+        language: Language,
         num_results: int,
     ) -> List[SearchResult]:
         """Performs a Toppreise search and returns SearchResults.
 
         Args:
             search_term: The search term to use for the query.
+            language: The language to use for the search.
             num_results: Max number of results to return.
         """
         # Perform the search
         urls = await self._search(
             search_string=search_term,
+            language=language,
             num_results=num_results,
         )
 
@@ -488,21 +632,121 @@ class Toppreise(SearchEngine):
         return results
 
 
-class Search(DomainUtils):
+class Searcher(DomainUtils):
     """Class to perform searches using different search engines."""
 
-    def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str):
+    _post_search_retry_stop_after = 3
+
+    def __init__(
+        self, http_client: httpx.AsyncClient, serpapi_key: str, zyteapi_key: str
+    ):
         """Initializes the Search class with the given SerpAPI key.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
             serpapi_key: The API key for SERP API.
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
+        self._http_client = http_client
         self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
         self._google_shopping = SerpAPIGoogleShopping(
-            http_client=http_client, api_key=serpapi_key
+            http_client=http_client,
+            api_key=serpapi_key,
+        )
+        self._toppreise = Toppreise(
+            http_client=http_client,
+            zyteapi_key=zyteapi_key,
+        )
+
+    async def _post_search_google_shopping_immersive(self, url: str) -> List[str]:
+        """Post-search for product URLs from a Google Shopping immersive product page.
+
+        Args:
+            url: The URL of the Google Shopping product page.
+        """
+        # Add the SerpAPI key to the URL
+        sep = "&" if "?" in url else "?"
+        url = f"{url}{sep}api_key={self._google_shopping._api_key}"
+
+        # Fetch the content of the Google Shopping product page
+        response = await self._google_shopping.http_client_get(url=url)
+
+        # Get external product urls from the data
+        data = response.json()
+        urls = self._google_shopping._extract_product_urls_from_immersive_product_api(
+            data=data
+        )
+        return urls
+
+    async def _post_search_toppreise_comparison(self, url: str) -> List[str]:
+        """Post-search for product URLs from a Toppreise product comparison page.
+
+        Note:
+            In contrast to Toppreise._search, this extracts the URLs from
+            product comparison pages (e.g. https://www.toppreise.ch/preisvergleich/...),
+            which can also show up in the results of a Google search.
+
+        Args:
+            url: The URL of the Toppreise product listing page.
+        """
+        # Perform the request with fallback if necessary
+        content = await self._toppreise.http_client_get_with_fallback(url=url)
+
+        # Get external product urls from the content
+        urls = self._toppreise._extract_product_urls_from_comparison_page(
+            content=content
         )
-        self._toppreise = Toppreise(http_client=http_client)
+        return urls
+
+    async def _post_search(self, results: List[SearchResult]) -> List[SearchResult]:
+        """Post-search for additional embedded product URLs in the obtained results.
+
+        Note:
+            This function is used to extract embedded product URLs from
+            product listing pages (e.g. Toppreise, Google Shopping) if needed.
+
+        Args:
+            results: The list of SearchResult objects obtained from the search.
+        """
+        post_search_results: List[SearchResult] = []
+        for res in results:
+            url = res.url
+            post_search_urls: List[str] = []
+
+            # Extract embedded product URLs from the Google Shopping immersive product page
+            if "engine=google_immersive_product" in url:
+                logger.debug(
+                    f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                )
+                post_search_urls = await self._post_search_google_shopping_immersive(
+                    url=url
+                )
+                logger.debug(
+                    f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                )
+
+            # Extract embedded product URLs from the Toppreise product comparison page
+            elif any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
+                logger.debug(
+                    f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                )
+                post_search_urls = await self._post_search_toppreise_comparison(url=url)
+                logger.debug(
+                    f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                )
+
+            # Add the extracted product URLs as SearchResult objects
+            psr = [
+                SearchResult(
+                    url=psu,
+                    domain=self._get_domain(url=psu),
+                    search_engine_name=res.search_engine_name,
+                )
+                for psu in post_search_urls
+            ]
+            post_search_results.extend(psr)
+
+        return post_search_results
 
     @staticmethod
     def _domain_in_host(domain: str, host: Host) -> bool:
@@ -592,63 +836,77 @@ class Search(DomainUtils):
     async def apply(
         self,
         search_term: str,
+        search_engine: SearchEngineName | str,
         language: Language,
         location: Location,
         num_results: int,
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
-        search_engines: List[SearchEngineName | str] | None = None,
     ) -> List[SearchResult]:
         """Performs a search and returns SearchResults.
 
         Args:
             search_term: The search term to use for the query.
+            search_engine: The search engine to use for the search.
             language: The language to use for the query ('hl' parameter).
             location: The location to use for the query ('gl' parameter).
             num_results: Max number of results per search engine.
             marketplaces: The marketplaces to include in the search.
             excluded_urls: The URLs to exclude from the search.
-            search_engines: The list of search engines to use for the search.
         """
-        if search_engines is None:
-            search_engines = list(SearchEngineName)
-        else:
-            search_engines = [
-                SearchEngineName(sen) if isinstance(sen, str) else sen
-                for sen in search_engines
-            ]
-        results: List[SearchResult] = []
+        logger.info(
+            f'Performing search for term="{search_term}" using engine="{search_engine}".'
+        )
+
+        # -------------------------------
+        # SEARCH
+        # -------------------------------
+        # Map string to SearchEngineName if needed
+        if isinstance(search_engine, str):
+            search_engine = SearchEngineName(search_engine)
 
         # Make SerpAPI google search
-        if SearchEngineName.GOOGLE in search_engines:
-            res = await self._google.search(
+        if search_engine == SearchEngineName.GOOGLE:
+            results = await self._google.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)
 
         # Make SerpAPI google shopping search
-        if SearchEngineName.GOOGLE_SHOPPING in search_engines:
-            res = await self._google_shopping.search(
+        elif search_engine == SearchEngineName.GOOGLE_SHOPPING:
+            results = await self._google_shopping.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)
 
         # Make Toppreise search
-        if SearchEngineName.TOPPREISE in search_engines:
-            res = await self._toppreise.search(
+        elif search_engine == SearchEngineName.TOPPREISE:
+            results = await self._toppreise.search(
                 search_term=search_term,
+                language=language,
                 num_results=num_results,
             )
-            results.extend(res)
 
+        # Other search engines can be added here (otherwise raise an unknown-engine error)
+        else:
+            raise ValueError(f"Unknown search engine: {search_engine}")
+
+        # -------------------------------
+        # POST-SEARCH URL EXTRACTION
+        # -------------------------------
+        post_search_results = await self._post_search(results=results)
+        post_search_results = post_search_results[:num_results]
+        results.extend(post_search_results)
+
+        # -------------------------------
+        # FILTERS
+        # -------------------------------
         # Apply filters
         results = [
             self._apply_filters(
@@ -660,5 +918,7 @@ class Search(DomainUtils):
             for res in results
         ]
 
-        logger.debug(f"Search produced a total of {len(results)} results.")
+        logger.info(
+            f'Search for term="{search_term}" using engine="{search_engine}" produced {len(results)} results.'
+        )
         return results
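
Taken together, 0.7.26 replaces the multi-engine `Search.apply(search_engines=[...])` with a single-engine `Searcher.apply(search_engine=...)` plus post-search URL expansion. A minimal usage sketch under assumptions: the import path and the `Language`/`Location` constructor signatures are inferred from their use in the diff, not confirmed by it:

import asyncio

import httpx

from fraudcrawler.base.base import Language, Location
from fraudcrawler.search import Searcher, SearchEngineName  # import path assumed


async def main() -> None:
    async with httpx.AsyncClient() as http_client:
        searcher = Searcher(
            http_client=http_client,
            serpapi_key="<serpapi-key>",
            zyteapi_key="<zyte-key>",  # new required key for the Zyte fallback
        )
        # One engine per call in 0.7.26; loop to mimic the old multi-engine apply()
        for engine in (SearchEngineName.GOOGLE, SearchEngineName.TOPPREISE):
            results = await searcher.apply(
                search_term="smartwatch",
                search_engine=engine,
                language=Language(code="de"),                      # signature assumed
                location=Location(name="Switzerland", code="ch"),  # signature assumed
                num_results=10,
            )
            print(engine, [res.url for res in results])


if __name__ == "__main__":
    asyncio.run(main())
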