fraudcrawler 0.5.9__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of fraudcrawler might be problematic.

@@ -6,12 +6,18 @@ from typing import Dict, List
 from urllib.parse import quote_plus
 
 from bs4 import BeautifulSoup
+from bs4.element import Tag
 import httpx
 from tenacity import RetryCallState
 
-from fraudcrawler.settings import SEARCH_DEFAULT_COUNTRY_CODES
+from fraudcrawler.settings import (
+    SEARCH_DEFAULT_COUNTRY_CODES,
+    TOPPREISE_SEARCH_PATHS,
+    TOPPREISE_COMPARISON_PATHS,
+)
 from fraudcrawler.base.base import Host, Language, Location, DomainUtils
 from fraudcrawler.base.retry import get_async_retry
+from fraudcrawler.scraping.zyte import ZyteAPI
 
 logger = logging.getLogger(__name__)
 
@@ -39,6 +45,14 @@ class SearchEngine(ABC, DomainUtils):
 
     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
 
+    def __init__(self, http_client: httpx.AsyncClient):
+        """Initializes the SearchEngine with the given HTTP client.
+
+        Args:
+            http_client: An httpx.AsyncClient to use for the async requests.
+        """
+        self._http_client = http_client
+
     @property
     @abstractmethod
     def _search_engine_name(self) -> str:
@@ -50,45 +64,81 @@ class SearchEngine(ABC, DomainUtils):
         """Apply the search with the given parameters and return results."""
         pass
 
+    def _create_search_result(self, url: str) -> SearchResult:
+        """From a given url it creates the class:`SearchResult` instance."""
+        # Get marketplace name
+        domain = self._get_domain(url=url)
+
+        # Create and return the SearchResult object
+        result = SearchResult(
+            url=url,
+            domain=domain,
+            search_engine_name=self._search_engine_name,
+        )
+        return result
+
     @classmethod
     def _log_before(
-        cls, search_string: str, retry_state: RetryCallState | None
+        cls, url: str, params: dict | None, retry_state: RetryCallState | None
     ) -> None:
-        """Context aware logging before the request is made."""
+        """Context aware logging before HTTP request is made."""
         if retry_state:
             logger.debug(
-                f'Performing search in {cls.__name__} with q="{search_string}" '
-                f"(attempt {retry_state.attempt_number})."
+                f'Performing HTTP request in {cls.__name__} to url="{url}" '
+                f"with params={params} (attempt {retry_state.attempt_number})."
             )
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before.")
 
     @classmethod
     def _log_before_sleep(
-        cls, search_string: str, retry_state: RetryCallState | None
+        cls, url: str, params: dict | None, retry_state: RetryCallState | None
    ) -> None:
-        """Context aware logging before sleeping after a failed request."""
+        """Context aware logging before sleeping after a failed HTTP request."""
         if retry_state and retry_state.outcome:
             logger.warning(
-                f'Attempt {retry_state.attempt_number} of {cls.__name__} search with q="{search_string}" '
+                f"Attempt {retry_state.attempt_number} of {cls.__name__} HTTP request "
+                f'to url="{url}" with params="{params}" '
                 f"failed with error: {retry_state.outcome.exception()}. "
                 f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
             )
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
 
-    def _create_search_result(self, url: str) -> SearchResult:
-        """From a given url it creates the class:`SearchResult` instance."""
-        # Get marketplace name
-        domain = self._get_domain(url=url)
+    async def http_client_get(
+        self, url: str, params: dict | None = None, headers: dict | None = None
+    ) -> httpx.Response:
+        """Performs a GET request with retries.
 
-        # Create and return the SearchResult object
-        result = SearchResult(
-            url=url,
-            domain=domain,
-            search_engine_name=self._search_engine_name,
+        Args:
+            retry: The retry strategy to use.
+            url: The URL to request.
+            params: Query parameters for the request.
+            headers: HTTP headers to use for the request.
+        """
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            url=url, params=params, retry_state=retry_state
         )
-        return result
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            url=url, params=params, retry_state=retry_state
+        )
+
+        async for attempt in retry:
+            with attempt:
+                response = await self._http_client.get(
+                    url=url,
+                    params=params,
+                    headers=headers,
+                )
+                response.raise_for_status()
+                return response
+
+        # In case of not entering the for loop (for some strange reason)
+        raise RuntimeError("Retry exhausted without success")
 
 
 class SerpAPI(SearchEngine):
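
Note on the new retry helper: http_client_get centralizes the tenacity retry loop that each engine previously built itself. A minimal standalone sketch of the same pattern, assuming get_async_retry() returns a tenacity.AsyncRetrying (its actual configuration lives in fraudcrawler.base.retry and is not part of this diff; the stop/wait settings below are illustrative only):

import httpx
from tenacity import AsyncRetrying, stop_after_attempt, wait_exponential

async def fetch(client: httpx.AsyncClient, url: str) -> httpx.Response:
    # Illustrative policy; fraudcrawler's real policy comes from get_async_retry().
    retry = AsyncRetrying(stop=stop_after_attempt(3), wait=wait_exponential(max=10))
    async for attempt in retry:
        with attempt:  # an exception raised here marks the attempt as failed
            response = await client.get(url)
            response.raise_for_status()  # 4xx/5xx -> httpx.HTTPStatusError -> retry
            return response
    # tenacity raises RetryError once attempts are exhausted, so this line is
    # unreachable in practice, mirroring the RuntimeError guard in the diff above.
    raise RuntimeError("Retry exhausted without success")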
@@ -103,7 +153,7 @@ class SerpAPI(SearchEngine):
             http_client: An httpx.AsyncClient to use for the async requests.
             api_key: The API key for SerpAPI.
         """
-        self._http_client = http_client
+        super().__init__(http_client=http_client)
         self._api_key = api_key
 
     @property
@@ -199,22 +249,10 @@ class SerpAPI(SearchEngine):
         }
         logger.debug(f"SerpAPI search with params: {params}")
 
-        # Perform the request and retry if necessary. There is some context aware logging:
-        # - `before`: before the request is made (and before retrying)
-        # - `before_sleep`: if the request fails before sleeping
-        retry = get_async_retry()
-        retry.before = lambda retry_state: self._log_before(
-            search_string=search_string, retry_state=retry_state
+        # Perform the search request
+        response: httpx.Response = await self.http_client_get(
+            url=self._endpoint, params=params
         )
-        retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            search_string=search_string, retry_state=retry_state
-        )
-        async for attempt in retry:
-            with attempt:
-                response = await self._http_client.get(
-                    url=self._endpoint, params=params
-                )
-                response.raise_for_status()
 
         # Extract the URLs from the response
         data = response.json()
@@ -330,7 +368,21 @@ class SerpAPIGoogleShopping(SerpAPI):
         """
         results = data.get("shopping_results")
         if results is not None:
-            return [url for res in results if (url := res.get("product_link"))]
+            # return [url for res in results if (url := res.get("product_link"))]  # c.f. https://github.com/serpapi/public-roadmap/issues/3045
+            return [
+                url
+                for res in results
+                if (url := res.get("serpapi_immersive_product_api"))
+            ]
+        return []
+
+    @staticmethod
+    def _extract_product_urls_from_immersive_product_api(data: dict) -> List[str]:
+        """Extracts product urls from the serpapi immersive product API data."""
+        if results := data.get("product_results"):
+            stores = results.get("stores", [])
+            urls = [url for sre in stores if (url := sre.get("link"))]
+            return list(set(urls))
         return []
 
     async def search(
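
With product_link commented out (per the linked SerpAPI roadmap issue), the first pass now yields serpapi_immersive_product_api URLs instead of merchant links; the new _extract_product_urls_from_immersive_product_api pulls the store links out of that API's payload during post-search. A sketch of the payload shape the extractor assumes (illustrative values; the exact fields are defined by SerpAPI):

data = {
    "product_results": {
        "stores": [
            {"name": "Shop A", "link": "https://shop-a.example/p/1"},
            {"name": "Shop B", "link": "https://shop-b.example/p/2"},
            {"name": "No link"},  # entries without "link" are dropped
        ]
    }
}
stores = data["product_results"]["stores"]
urls = list({url for store in stores if (url := store.get("link"))})  # deduplicated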
@@ -343,6 +395,9 @@ class SerpAPIGoogleShopping(SerpAPI):
     ) -> List[SearchResult]:
         """Performs a google shopping search using SerpApi and returns SearchResults.
 
+        Similar to Toppreise, this method extracts merchant URLs from Google Shopping product pages
+        and creates multiple SearchResult objects for each merchant URL found.
+
         Args:
             search_term: The search term to use for the query.
             language: The language to use for the query ('hl' parameter).
@@ -356,7 +411,7 @@ class SerpAPIGoogleShopping(SerpAPI):
             marketplaces=marketplaces,
         )
 
-        # Perform the search
+        # Perform the search to get Google Shopping URLs
         urls = await self._search(
             search_string=search_string,
             language=language,
@@ -369,10 +424,10 @@ class SerpAPIGoogleShopping(SerpAPI):
         # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
         urls = urls[:num_results]
 
-        # Create and return SearchResult objects from the URLs
+        # Create SearchResult objects from merchant URLs (similar to Toppreise pattern)
         results = [self._create_search_result(url=url) for url in urls]
         logger.debug(
-            f'Produced {len(results)} results from SerpAPI with engine="{self._engine}" and q="{search_string}".'
+            f'Produced {len(results)} results from Google Shopping search with q="{search_string}".'
         )
         return results
 
@@ -380,38 +435,77 @@ class SerpAPIGoogleShopping(SerpAPI):
 class Toppreise(SearchEngine):
     """Search engine for toppreise.ch."""
 
-    _endpoint = "https://www.toppreise.ch/produktsuche"
-    _headers = {
-        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-        "Accept-Language": "en-US,en;q=0.5",
-        "Accept-Encoding": "gzip, deflate",
-        "Connection": "keep-alive",
-        "Upgrade-Insecure-Requests": "1",
-    }
-
-    def __init__(self, http_client: httpx.AsyncClient, zyte_api=None):
+    _endpoint = "https://www.toppreise.ch/"
+
+    def __init__(self, http_client: httpx.AsyncClient, zyteapi_key: str):
         """Initializes the Toppreise client.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
-            zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
-        self._http_client = http_client
-        self._zyte_api = zyte_api
+        super().__init__(http_client=http_client)
+        self._zyteapi = ZyteAPI(http_client=http_client, api_key=zyteapi_key)
 
-    @property
-    def _search_engine_name(self) -> str:
-        """The name of the search engine."""
-        return SearchEngineName.TOPPREISE.value
+    async def http_client_get_with_fallback(self, url: str) -> bytes:
+        """Performs a GET request with retries.
+
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
+        Args:
+            url: The URL to request.
+        """
+        # Try to access the URL directly
+        try:
+            response: httpx.Response = await self.http_client_get(
+                url=url, headers=self._headers
+            )
+            content = response.content
+
+        # If we get a 403 Error (can happen depending on IP/location of deployment),
+        # we try to unblock the URL using Zyte proxy mode
+        except httpx.HTTPStatusError as err_direct:
+            if err_direct.response.status_code == 403:
+                logger.warning(
+                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
+                )
+                try:
+                    content = await self._zyteapi.unblock_url_content(url)
+                except Exception as err_resolve:
+                    msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
+                    logger.error(msg)
+                    raise httpx.HTTPError(msg) from err_resolve
+            else:
+                raise err_direct
+        return content
+
+    @classmethod
+    def _get_search_endpoint(cls, language: Language) -> str:
+        """Get the search endpoint based on the language."""
+        search_path = TOPPREISE_SEARCH_PATHS.get(
+            language.code, TOPPREISE_SEARCH_PATHS["default"]
+        )
+        return f"{cls._endpoint}{search_path}"
 
     @staticmethod
-    def _get_external_product_urls(content: bytes) -> List[str]:
-        """Extracts external product URLs from the Toppreise search results page."""
+    def _extract_links(
+        element: Tag, ext_products: bool = True, comp_products: bool = True
+    ) -> List[str]:
+        """Extracts all relevant product URLs from a BeautifulSoup object of a Toppreise page.
 
-        # Parse the HTML
-        soup = BeautifulSoup(content, "html.parser")
-        links = soup.find_all("a", href=True)
+        Note:
+            Depending on the arguments, it extracts:
+            - product comparison URLs (i.e. https://www.toppreise.ch/preisvergleich/...)
+            - external product URLs (i.e. https://www.example.com/ext_...).
+
+        Args:
+            tag: BeautifulSoup Tag object containing the HTML to parse.
+            ext_products: Whether to extract external product URLs.
+            comp_products: Whether to extract product comparison URLs.
+        """
+        # Find all links in the page
+        links = element.find_all("a", href=True)
 
         # Filter links to only include external product links
         hrefs = [
@@ -422,7 +516,15 @@ class Toppreise(SearchEngine):
                 and (href := link.get("href"))  # Ensure href is not None
                 and not href.startswith("javascript:")  # Skip javascript links
                 and isinstance(href, str)  # Ensure href is a string
-                and "ext_" in href  # Skip links that are not external product link
+                # Make sure the link is either an external product link (href contains 'ext_')
+                # or is a search result link (href contains 'preisvergleich', 'comparison-prix', or 'price-comparison')
+                and (
+                    ("ext_" in href and ext_products)
+                    or (
+                        any(pth in href for pth in TOPPREISE_COMPARISON_PATHS)
+                        and comp_products
+                    )
+                )
             )
         ]
 
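
The reworked filter keeps a link when it is an external product link ("ext_" in the href, controlled by ext_products) or a comparison-page link (href contains one of TOPPREISE_COMPARISON_PATHS, controlled by comp_products). The predicate in isolation, with the comparison paths taken from the comment above (the real constant lives in fraudcrawler.settings and is not shown in this diff):

TOPPREISE_COMPARISON_PATHS = ["preisvergleich", "comparison-prix", "price-comparison"]

hrefs = [
    "https://www.toppreise.ch/preisvergleich/Smartphones/p123",  # comparison link: kept
    "https://shop.example.ch/ext_4567",                          # external product link: kept
    "javascript:void(0)",                                        # skipped
    "https://www.toppreise.ch/about-us",                         # matches neither pattern: skipped
]
kept = [
    href
    for href in hrefs
    if not href.startswith("javascript:")
    and ("ext_" in href or any(pth in href for pth in TOPPREISE_COMPARISON_PATHS))
]
assert kept == hrefs[:2]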
@@ -437,60 +539,67 @@ class Toppreise(SearchEngine):
 
         # Return deduplicated urls
         urls = list(set(urls))
+        return urls
+
+    def _extract_product_urls_from_search_page(self, content: bytes) -> List[str]:
+        """Extracts product urls from a Toppreise search page (i.e. https://www.toppreise.ch/produktsuche)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+        main = soup.find("div", id="Page_Browsing")
+        if not isinstance(main, Tag):
+            logger.warning("No main content found in Toppreise search page.")
+            return []
+
+        # Extract links (external product links and comparison links)
+        urls = self._extract_links(element=main)
+
+        logger.debug(f"Found {len(urls)} product URLs from Toppreise search results.")
+        return urls
+
+    def _extract_product_urls_from_comparison_page(self, content: bytes) -> List[str]:
+        """Extracts product urls from a Toppreise product comparison page (i.e. https://www.toppreise.ch/preisvergleich/...)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+
+        # Extract links (external product links only)
+        urls = self._extract_links(element=soup, comp_products=False)
+
         logger.debug(
-            f"Found {len(urls)} external product URLs from Toppreise search results."
+            f"Found {len(urls)} external product URLs from Toppreise comparison page."
         )
         return urls
 
-    async def _search(self, search_string: str, num_results: int) -> List[str]:
+    @property
+    def _search_engine_name(self) -> str:
+        """The name of the search engine."""
+        return SearchEngineName.TOPPREISE.value
+
+    async def _search(
+        self, search_string: str, language: Language, num_results: int
+    ) -> List[str]:
         """Performs a search on Toppreise and returns the URLs of the results.
 
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
         Args:
             search_string: The search string to use for the query.
+            language: The language to use for the query.
             num_results: Max number of results to return.
         """
         # Build the search URL for Toppreise
+        endpoint = self._get_search_endpoint(language=language)
         encoded_search = quote_plus(search_string)
-        url = f"{self._endpoint}?q={encoded_search}"
+        url = f"{endpoint}?q={encoded_search}"
         logger.debug(f"Toppreise search URL: {url}")
 
-        # Perform the request and retry if necessary. There is some context aware logging:
-        # - `before`: before the request is made (and before retrying)
-        # - `before_sleep`: if the request fails before sleeping
-        retry = get_async_retry()
-        retry.before = lambda retry_state: self._log_before(
-            search_string=search_string, retry_state=retry_state
-        )
-        retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            search_string=search_string, retry_state=retry_state
-        )
-
-        content = None
-        try:
-            async for attempt in retry:
-                with attempt:
-                    response = await self._http_client.get(
-                        url=url,
-                        headers=self._headers,
-                    )
-                    response.raise_for_status()
-                    content = response.content
-        except httpx.HTTPStatusError as e:
-            if e.response.status_code == 403 and self._zyte_api:
-                logger.warning(
-                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
-                )
-                content = await self._unblock_url(url, self._zyte_api)
-                if content is None:
-                    raise e  # Re-raise if zyte fallback also failed
-            else:
-                raise e
-
-        if content is None:
-            raise httpx.HTTPError("Failed to fetch content")
+        # Perform the request with fallback if necessary
+        content = await self.http_client_get_with_fallback(url=url)
 
         # Get external product urls from the content
-        urls = self._get_external_product_urls(content=content)
+        urls = self._extract_product_urls_from_search_page(content=content)
         urls = urls[:num_results]  # Limit to num_results if needed
 
         return urls
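
_search now routes through _get_search_endpoint, which picks a language-specific search path from the new TOPPREISE_SEARCH_PATHS setting (defined in fraudcrawler.settings, not shown in this diff). A sketch of the URL construction with hypothetical path values:

from urllib.parse import quote_plus

TOPPREISE_SEARCH_PATHS = {  # hypothetical values, for illustration only
    "de": "produktsuche",
    "fr": "recherche-produits",
    "default": "product-search",
}

endpoint = "https://www.toppreise.ch/"
search_path = TOPPREISE_SEARCH_PATHS.get("fr", TOPPREISE_SEARCH_PATHS["default"])
url = f"{endpoint}{search_path}?q={quote_plus('nike air max')}"
# -> "https://www.toppreise.ch/recherche-produits?q=nike+air+max"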
@@ -498,17 +607,20 @@ class Toppreise(SearchEngine):
     async def search(
         self,
         search_term: str,
+        language: Language,
         num_results: int,
     ) -> List[SearchResult]:
         """Performs a Toppreise search and returns SearchResults.
 
         Args:
             search_term: The search term to use for the query.
+            language: The language to use for the search.
             num_results: Max number of results to return.
         """
         # Perform the search
         urls = await self._search(
             search_string=search_term,
+            language=language,
             num_results=num_results,
         )
 
@@ -520,22 +632,121 @@ class Toppreise(SearchEngine):
         return results
 
 
-class Search(DomainUtils):
+class Searcher(DomainUtils):
     """Class to perform searches using different search engines."""
 
-    def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str, zyte_api=None):
+    _post_search_retry_stop_after = 3
+
+    def __init__(
+        self, http_client: httpx.AsyncClient, serpapi_key: str, zyteapi_key: str
+    ):
         """Initializes the Search class with the given SerpAPI key.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
             serpapi_key: The API key for SERP API.
-            zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
+        self._http_client = http_client
         self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
         self._google_shopping = SerpAPIGoogleShopping(
-            http_client=http_client, api_key=serpapi_key
+            http_client=http_client,
+            api_key=serpapi_key,
+        )
+        self._toppreise = Toppreise(
+            http_client=http_client,
+            zyteapi_key=zyteapi_key,
+        )
+
+    async def _post_search_google_shopping_immersive(self, url: str) -> List[str]:
+        """Post-search for product URLs from a Google Shopping immersive product page.
+
+        Args:
+            url: The URL of the Google Shopping product page.
+        """
+        # Add SerpAPI key to the url
+        sep = "&" if "?" in url else "?"
+        url = f"{url}{sep}api_key={self._google_shopping._api_key}"
+
+        # Fetch the content of the Google Shopping product page
+        response = await self._google_shopping.http_client_get(url=url)
+
+        # Get external product urls from the data
+        data = response.json()
+        urls = self._google_shopping._extract_product_urls_from_immersive_product_api(
+            data=data
         )
-        self._toppreise = Toppreise(http_client=http_client, zyte_api=zyte_api)
+        return urls
+
+    async def _post_search_toppreise_comparison(self, url: str) -> List[str]:
+        """Post-search for product URLs from a Toppreise product comparison page.
+
+        Note:
+            In comparison to the function Toppreise._search, here we extract the urls from
+            product comparison pages (f.e. https://www.toppreise.ch/preisvergleich/). These
+            pages can also be found in the results of a google search.
+
+        Args:
+            url: The URL of the Toppreise product listing page.
+        """
+        # Perform the request with fallback if necessary
+        content = await self._toppreise.http_client_get_with_fallback(url=url)
+
+        # Get external product urls from the content
+        urls = self._toppreise._extract_product_urls_from_comparison_page(
+            content=content
+        )
+        return urls
+
+    async def _post_search(self, results: List[SearchResult]) -> List[SearchResult]:
+        """Post-search for additional embedded product URLs from the obtained results.
+
+        Note:
+            This function is used to extract embedded product URLs from
+            product listing pages (e.g. Toppreise, Google Shopping) if needed.
+
+        Args:
+            results: The list of SearchResult objects obtained from the search.
+        """
+        post_search_results: List[SearchResult] = []
+        for res in results:
+            url = res.url
+            post_search_urls: List[str] = []
+
+            # Extract embedded product URLs from the Google Shopping immersive product page
+            if "engine=google_immersive_product" in url:
+                logger.debug(
+                    f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                )
+                post_search_urls = await self._post_search_google_shopping_immersive(
+                    url=url
+                )
+                logger.debug(
+                    f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                )
+
+            # Extract embedded product URLs from the Toppreise product listing page
+            elif any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
+                logger.debug(
+                    f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                )
+                post_search_urls = await self._post_search_toppreise_comparison(url=url)
+                logger.debug(
+                    f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                )
+
+            # Add the extracted product URLs as SearchResult objects
+            psr = [
+                SearchResult(
+                    url=psu,
+                    domain=self._get_domain(url=psu),
+                    search_engine_name=res.search_engine_name,
+                )
+                for psu in post_search_urls
+            ]
+            post_search_results.extend(psr)
+
+        return post_search_results
 
     @staticmethod
     def _domain_in_host(domain: str, host: Host) -> bool:
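
_post_search dispatches on the shape of each first-pass URL: SerpAPI immersive-product URLs carry engine=google_immersive_product in their query string, while Toppreise comparison pages match one of TOPPREISE_COMPARISON_PATHS. The dispatch condition in isolation (comparison path values assumed, as above):

TOPPREISE_COMPARISON_PATHS = ["preisvergleich", "comparison-prix", "price-comparison"]  # assumed values

urls = [
    "https://serpapi.com/search.json?engine=google_immersive_product&page_token=abc",
    "https://www.toppreise.ch/preisvergleich/Notebooks/p456",
    "https://shop.example.ch/product/789",
]
for url in urls:
    if "engine=google_immersive_product" in url:
        print("immersive:", url)    # fetched via SerpAPI with api_key appended
    elif any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
        print("comparison:", url)   # fetched (with Zyte fallback), ext_ links extracted
    else:
        print("passthrough:", url)  # contributes no extra post-search results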
@@ -625,63 +836,77 @@ class Search(DomainUtils):
     async def apply(
         self,
         search_term: str,
+        search_engine: SearchEngineName | str,
         language: Language,
         location: Location,
         num_results: int,
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
-        search_engines: List[SearchEngineName | str] | None = None,
     ) -> List[SearchResult]:
         """Performs a search and returns SearchResults.
 
         Args:
             search_term: The search term to use for the query.
+            search_engine: The search engine to use for the search.
             language: The language to use for the query ('hl' parameter).
             location: The location to use for the query ('gl' parameter).
             num_results: Max number of results per search engine.
             marketplaces: The marketplaces to include in the search.
             excluded_urls: The URLs to exclude from the search.
-            search_engines: The list of search engines to use for the search.
         """
-        if search_engines is None:
-            search_engines = list(SearchEngineName)
-        else:
-            search_engines = [
-                SearchEngineName(sen) if isinstance(sen, str) else sen
-                for sen in search_engines
-            ]
-        results: List[SearchResult] = []
+        logger.info(
+            f'Performing search for term="{search_term}" using engine="{search_engine}".'
+        )
+
+        # -------------------------------
+        # SEARCH
+        # -------------------------------
+        # Map string to SearchEngineName if needed
+        if isinstance(search_engine, str):
+            search_engine = SearchEngineName(search_engine)
 
         # Make SerpAPI google search
-        if SearchEngineName.GOOGLE in search_engines:
-            res = await self._google.search(
+        if search_engine == SearchEngineName.GOOGLE:
+            results = await self._google.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)
 
         # Make SerpAPI google shopping search
-        if SearchEngineName.GOOGLE_SHOPPING in search_engines:
-            res = await self._google_shopping.search(
+        elif search_engine == SearchEngineName.GOOGLE_SHOPPING:
+            results = await self._google_shopping.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)
 
         # Make Toppreise search
-        if SearchEngineName.TOPPREISE in search_engines:
-            res = await self._toppreise.search(
+        elif search_engine == SearchEngineName.TOPPREISE:
+            results = await self._toppreise.search(
                 search_term=search_term,
+                language=language,
                 num_results=num_results,
             )
-            results.extend(res)
 
+        # Other search engines can be added here (raise unknown engine error otherwise)
+        else:
+            raise ValueError(f"Unknown search engine: {search_engine}")
+
+        # -------------------------------
+        # POST-SEARCH URL EXTRACTION
+        # -------------------------------
+        post_search_results = await self._post_search(results=results)
+        post_search_results = post_search_results[:num_results]
+        results.extend(post_search_results)
+
+        # -------------------------------
+        # FILTERS
+        # -------------------------------
         # Apply filters
         results = [
             self._apply_filters(
@@ -693,5 +918,7 @@ class Search(DomainUtils):
             for res in results
         ]
 
-        logger.debug(f"Search produced a total of {len(results)} results.")
+        logger.info(
+            f'Search for term="{search_term}" using engine="{search_engine}" produced {len(results)} results.'
+        )
         return results
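
Taken together, this release replaces the multi-engine Search.apply(search_engines=[...]) with a single-engine Searcher.apply(search_engine=...) followed by a post-search expansion of embedded product URLs. A hypothetical end-to-end sketch; the module that exports Searcher, the Language/Location constructors, and the "toppreise" engine string are assumptions not confirmed by this diff:

import asyncio

import httpx

from fraudcrawler import Searcher  # assumed export path
from fraudcrawler.base.base import Language, Location

async def main() -> None:
    async with httpx.AsyncClient() as client:
        searcher = Searcher(
            http_client=client,
            serpapi_key="SERPAPI_KEY",  # placeholder
            zyteapi_key="ZYTEAPI_KEY",  # placeholder
        )
        results = await searcher.apply(
            search_term="nike air max",
            search_engine="toppreise",     # assumed SearchEngineName value
            language=Language(code="de"),  # assumed constructor
            location=Location(code="ch"),  # assumed constructor
            num_results=10,
        )
        for res in results:
            print(res.search_engine_name, res.domain, res.url)

asyncio.run(main())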