fraudcrawler 0.5.9__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -72,7 +72,7 @@ class Processor:
         """Context aware logging before the request is made."""
         if retry_state:
             logger.debug(
-                f"Classifying product with url={url} using prompt={prompt} (Attempt {retry_state.attempt_number})."
+                f"Classifying product with url={url} using prompt={prompt.name} (Attempt {retry_state.attempt_number})."
             )
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before.")
@@ -84,7 +84,7 @@ class Processor:
         """Context aware logging before sleeping after a failed request."""
         if retry_state and retry_state.outcome:
             logger.warning(
-                f"Attempt {retry_state.attempt_number} of classifying product with url={url} using prompt={prompt} "
+                f"Attempt {retry_state.attempt_number} of classifying product with url={url} using prompt={prompt.name} "
                 f"failed with error: {retry_state.outcome.exception()}. "
                 f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
             )
@@ -160,7 +160,7 @@ class Processor:
         # Call the OpenAI API
         try:
             logger.debug(
-                f"Classifying product with url={url} using prompt={prompt.name} and user_prompt={user_prompt}."
+                f"Classifying product with url={url}, using prompt={prompt.name}."
             )
             # Perform the request and retry if necessary. There is some context aware logging
             # - `before`: before the request is made (or before retrying)
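The hunks above only change what gets logged (`prompt` vs. `prompt.name`), but they all live inside the package's context-aware retry pattern: a tenacity retry object whose `before` and `before_sleep` hooks close over the current request's context. A minimal sketch of that pattern under assumptions (the `classify_once` coroutine and the stop/wait values are illustrative, not the package's actual defaults):

```python
import asyncio
import logging

from tenacity import AsyncRetrying, RetryCallState, stop_after_attempt, wait_exponential

logger = logging.getLogger(__name__)


async def classify_once(url: str, prompt_name: str) -> str:
    """Placeholder for the actual classification request made by the Processor."""
    await asyncio.sleep(0)
    return "relevant"


async def classify_with_retry(url: str, prompt_name: str) -> str:
    # Retry object; stop/wait values here are illustrative only.
    retry = AsyncRetrying(stop=stop_after_attempt(3), wait=wait_exponential(exp_base=4))

    # Context-aware hooks: they close over url/prompt_name so every log line carries them.
    def log_before(retry_state: RetryCallState) -> None:
        logger.debug(
            f"Classifying product with url={url} using prompt={prompt_name} "
            f"(Attempt {retry_state.attempt_number})."
        )

    def log_before_sleep(retry_state: RetryCallState) -> None:
        if retry_state.outcome:
            logger.warning(
                f"Attempt {retry_state.attempt_number} failed with "
                f"{retry_state.outcome.exception()}; retrying in "
                f"{retry_state.upcoming_sleep:.0f} seconds."
            )

    retry.before = log_before
    retry.before_sleep = log_before_sleep

    result = ""
    async for attempt in retry:
        with attempt:
            result = await classify_once(url, prompt_name)
    return result
```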
@@ -6,12 +6,18 @@ from typing import Dict, List
 from urllib.parse import quote_plus
 
 from bs4 import BeautifulSoup
+from bs4.element import Tag
 import httpx
-from tenacity import RetryCallState
+from tenacity import RetryCallState, AsyncRetrying
 
-from fraudcrawler.settings import SEARCH_DEFAULT_COUNTRY_CODES
+from fraudcrawler.settings import (
+    SEARCH_DEFAULT_COUNTRY_CODES,
+    TOPPREISE_SEARCH_PATHS,
+    TOPPREISE_COMPARISON_PATHS,
+)
 from fraudcrawler.base.base import Host, Language, Location, DomainUtils
 from fraudcrawler.base.retry import get_async_retry
+from fraudcrawler.scraping.zyte import ZyteAPI
 
 logger = logging.getLogger(__name__)
 
@@ -380,7 +386,7 @@ class SerpAPIGoogleShopping(SerpAPI):
 class Toppreise(SearchEngine):
     """Search engine for toppreise.ch."""
 
-    _endpoint = "https://www.toppreise.ch/produktsuche"
+    _endpoint = "https://www.toppreise.ch/"
     _headers = {
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
@@ -390,28 +396,42 @@ class Toppreise(SearchEngine):
         "Upgrade-Insecure-Requests": "1",
     }
 
-    def __init__(self, http_client: httpx.AsyncClient, zyte_api=None):
+    def __init__(self, http_client: httpx.AsyncClient, zyteapi_key: str):
         """Initializes the Toppreise client.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
-            zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
         self._http_client = http_client
-        self._zyte_api = zyte_api
+        self._zyteapi = ZyteAPI(http_client=http_client, api_key=zyteapi_key)
 
-    @property
-    def _search_engine_name(self) -> str:
-        """The name of the search engine."""
-        return SearchEngineName.TOPPREISE.value
+    @classmethod
+    def _get_search_endpoint(cls, language: Language) -> str:
+        """Get the search endpoint based on the language."""
+        search_path = TOPPREISE_SEARCH_PATHS.get(
+            language.code, TOPPREISE_SEARCH_PATHS["default"]
+        )
+        return f"{cls._endpoint}{search_path}"
 
     @staticmethod
-    def _get_external_product_urls(content: bytes) -> List[str]:
-        """Extracts external product URLs from the Toppreise search results page."""
+    def _extract_links(
+        element: Tag, ext_products: bool = True, comp_products: bool = True
+    ) -> List[str]:
+        """Extracts all relevant product URLs from a BeautifulSoup object of a Toppreise page.
 
-        # Parse the HTML
-        soup = BeautifulSoup(content, "html.parser")
-        links = soup.find_all("a", href=True)
+        Note:
+            Depending on the arguments, it extracts:
+            - product comparison URLs (i.e. https://www.toppreise.ch/preisvergleich/...)
+            - external product URLs (i.e. https://www.example.com/ext_...).
+
+        Args:
+            tag: BeautifulSoup Tag object containing the HTML to parse.
+            ext_products: Whether to extract external product URLs.
+            comp_products: Whether to extract product comparison URLs.
+        """
+        # Find all links in the page
+        links = element.find_all("a", href=True)
 
         # Filter links to only include external product links
         hrefs = [
@@ -422,7 +442,15 @@ class Toppreise(SearchEngine):
                 and (href := link.get("href"))  # Ensure href is not None
                 and not href.startswith("javascript:")  # Skip javascript links
                 and isinstance(href, str)  # Ensure href is a string
-                and "ext_" in href  # Skip links that are not external product link
+                # Make sure the link is either an external product link (href contains 'ext_')
+                # or is a search result link (href contains 'preisvergleich', 'comparison-prix', or 'price-comparison')
+                and (
+                    ("ext_" in href and ext_products)
+                    or (
+                        any(pth in href for pth in TOPPREISE_COMPARISON_PATHS)
+                        and comp_products
+                    )
+                )
             )
         ]
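The widened filter above accepts two kinds of anchors instead of only `ext_` offers. A standalone sketch of the same filtering idea on a toy HTML snippet (not the package's `_extract_links`, just the pattern it implements):

```python
from bs4 import BeautifulSoup
from bs4.element import Tag

TOPPREISE_COMPARISON_PATHS = ["preisvergleich", "comparison-prix", "price-comparison"]

html = """
<div>
  <a href="https://shop.example.com/ext_12345">external offer</a>
  <a href="https://www.toppreise.ch/preisvergleich/produkt-67890">comparison page</a>
  <a href="javascript:void(0)">ignored</a>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
hrefs = [
    href
    for link in soup.find_all("a", href=True)
    if isinstance(link, Tag)
    and (href := link.get("href"))           # bind href and ensure it is not None
    and isinstance(href, str)
    and not href.startswith("javascript:")   # skip javascript links
    # keep external product links ("ext_") and price-comparison pages
    and ("ext_" in href or any(p in href for p in TOPPREISE_COMPARISON_PATHS))
]
print(hrefs)
# ['https://shop.example.com/ext_12345', 'https://www.toppreise.ch/preisvergleich/produkt-67890']
```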
@@ -437,21 +465,100 @@ class Toppreise(SearchEngine):
 
         # Return deduplicated urls
         urls = list(set(urls))
+        return urls
+
+    def _extract_product_urls_from_search_page(self, content: bytes) -> List[str]:
+        """Extracts product urls from a Toppreise search page (i.e. https://www.toppreise.ch/produktsuche)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+        main = soup.find("div", id="Page_Browsing")
+        if not isinstance(main, Tag):
+            logger.warning("No main content found in Toppreise search page.")
+            return []
+
+        # Extract links (external product links and comparison links)
+        urls = self._extract_links(element=main)
+
+        logger.debug(f"Found {len(urls)} product URLs from Toppreise search results.")
+        return urls
+
+    def _extract_product_urls_from_comparison_page(self, content: bytes) -> List[str]:
+        """Extracts product urls from a Toppreise product comparison page (i.e. https://www.toppreise.ch/preisvergleich/...)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+
+        # Extract links (external product links only)
+        urls = self._extract_links(element=soup, comp_products=False)
+
         logger.debug(
-            f"Found {len(urls)} external product URLs from Toppreise search results."
+            f"Found {len(urls)} external product URLs from Toppreise comparison page."
         )
         return urls
 
-    async def _search(self, search_string: str, num_results: int) -> List[str]:
+    @property
+    def _search_engine_name(self) -> str:
+        """The name of the search engine."""
+        return SearchEngineName.TOPPREISE.value
+
+    async def http_client_get_with_fallback(
+        self, url: str, retry: AsyncRetrying
+    ) -> bytes:
+        """Performs a GET request with retries.
+
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
+        Args:
+            url: The URL to request.
+            retry: The retry strategy to use.
+        """
+        # Try to access the URL directly
+        try:
+            async for attempt in retry:
+                with attempt:
+                    response = await self._http_client.get(
+                        url=url,
+                        headers=self._headers,
+                    )
+                    response.raise_for_status()
+                    content = response.content
+
+        # If we get a 403 Error (can happen depending on IP/location of deployment),
+        # we try to unblock the URL using Zyte proxy mode
+        except httpx.HTTPStatusError as err_direct:
+            if err_direct.response.status_code == 403:
+                logger.warning(
+                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
+                )
+                try:
+                    content = await self._zyteapi.unblock_url_content(url)
+                except Exception as err_resolve:
+                    msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
+                    logger.error(msg)
+                    raise httpx.HTTPError(msg) from err_resolve
+            else:
+                raise err_direct
+        return content
+
+    async def _search(
+        self, search_string: str, language: Language, num_results: int
+    ) -> List[str]:
         """Performs a search on Toppreise and returns the URLs of the results.
 
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
         Args:
             search_string: The search string to use for the query.
+            language: The language to use for the query.
             num_results: Max number of results to return.
         """
         # Build the search URL for Toppreise
+        endpoint = self._get_search_endpoint(language=language)
         encoded_search = quote_plus(search_string)
-        url = f"{self._endpoint}?q={encoded_search}"
+        url = f"{endpoint}?q={encoded_search}"
         logger.debug(f"Toppreise search URL: {url}")
 
         # Perform the request and retry if necessary. There is some context aware logging:
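The new `http_client_get_with_fallback` wraps the direct fetch in the retry iterator and only falls back to Zyte when the site answers 403. The core control flow, reduced to a hedged sketch (retries omitted for brevity; `unblock` stands in for `ZyteAPI.unblock_url_content`):

```python
from typing import Awaitable, Callable

import httpx


async def get_with_403_fallback(
    client: httpx.AsyncClient,
    url: str,
    unblock: Callable[[str], Awaitable[bytes]],
) -> bytes:
    """Fetch url directly; if the site rejects us with 403, fetch it via the unblock coroutine."""
    try:
        response = await client.get(url)
        response.raise_for_status()
        return response.content
    except httpx.HTTPStatusError as err:
        if err.response.status_code != 403:
            raise
        # Direct access blocked: let the proxy-based fallback retrieve the body instead.
        return await unblock(url)
```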
@@ -464,33 +571,10 @@ class Toppreise(SearchEngine):
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
             search_string=search_string, retry_state=retry_state
         )
-
-        content = None
-        try:
-            async for attempt in retry:
-                with attempt:
-                    response = await self._http_client.get(
-                        url=url,
-                        headers=self._headers,
-                    )
-                    response.raise_for_status()
-                    content = response.content
-        except httpx.HTTPStatusError as e:
-            if e.response.status_code == 403 and self._zyte_api:
-                logger.warning(
-                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
-                )
-                content = await self._unblock_url(url, self._zyte_api)
-                if content is None:
-                    raise e  # Re-raise if zyte fallback also failed
-            else:
-                raise e
-
-        if content is None:
-            raise httpx.HTTPError("Failed to fetch content")
+        content = await self.http_client_get_with_fallback(url=url, retry=retry)
 
         # Get external product urls from the content
-        urls = self._get_external_product_urls(content=content)
+        urls = self._extract_product_urls_from_search_page(content=content)
         urls = urls[:num_results]  # Limit to num_results if needed
 
         return urls
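The rewritten `_search` now builds its URL from a language-dependent path rather than the old hard-coded `/produktsuche` endpoint. A self-contained sketch of that resolution, using the path table added to `fraudcrawler/settings.py` in this release (only the `.code` attribute of the package's `Language` object is assumed):

```python
# Sketch only: mirrors Toppreise._get_search_endpoint with the new settings values.
TOPPREISE_SEARCH_PATHS = {"de": "produktsuche", "fr": "chercher", "default": "browse"}
ENDPOINT = "https://www.toppreise.ch/"


def search_endpoint(language_code: str) -> str:
    path = TOPPREISE_SEARCH_PATHS.get(language_code, TOPPREISE_SEARCH_PATHS["default"])
    return f"{ENDPOINT}{path}"


assert search_endpoint("de") == "https://www.toppreise.ch/produktsuche"
assert search_endpoint("fr") == "https://www.toppreise.ch/chercher"
assert search_endpoint("it") == "https://www.toppreise.ch/browse"  # falls back to the default path
```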
@@ -498,17 +582,20 @@ class Toppreise(SearchEngine):
     async def search(
         self,
         search_term: str,
+        language: Language,
         num_results: int,
     ) -> List[SearchResult]:
         """Performs a Toppreise search and returns SearchResults.
 
         Args:
             search_term: The search term to use for the query.
+            language: The language to use for the search.
             num_results: Max number of results to return.
         """
         # Perform the search
         urls = await self._search(
             search_string=search_term,
+            language=language,
             num_results=num_results,
         )
 
@@ -520,22 +607,124 @@ class Toppreise(SearchEngine):
         return results
 
 
-class Search(DomainUtils):
+class Searcher(DomainUtils):
     """Class to perform searches using different search engines."""
 
-    def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str, zyte_api=None):
+    _post_search_retry_stop_after = 3
+
+    def __init__(
+        self, http_client: httpx.AsyncClient, serpapi_key: str, zyteapi_key: str
+    ):
         """Initializes the Search class with the given SerpAPI key.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
             serpapi_key: The API key for SERP API.
-            zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
+        self._http_client = http_client
         self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
         self._google_shopping = SerpAPIGoogleShopping(
-            http_client=http_client, api_key=serpapi_key
+            http_client=http_client,
+            api_key=serpapi_key,
+        )
+        self._toppreise = Toppreise(
+            http_client=http_client,
+            zyteapi_key=zyteapi_key,
         )
-        self._toppreise = Toppreise(http_client=http_client, zyte_api=zyte_api)
+
+    @staticmethod
+    def _post_search_log_before(url: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f'Performing post search for url="{url}" '
+                f"(attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+    @staticmethod
+    def _post_search_log_before_sleep(
+        url: str, retry_state: RetryCallState | None
+    ) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f'Attempt {retry_state.attempt_number} of post search for url="{url}" '
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
+
+    async def _post_search_toppreise_comparison(self, url: str) -> List[str]:
+        """Post-search for product URLs from a Toppreise product comparison page.
+
+        Note:
+            In comparison to the function Toppreise._search, here we extract the urls from
+            product comparison pages (f.e. https://www.toppreise.ch/preisvergleich/). They can
+            also be found in the results of a google search.
+
+        Args:
+            url: The URL of the Toppreise product listing page.
+        """
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry(stop_after=self._post_search_retry_stop_after)
+        retry.before = lambda retry_state: self._post_search_log_before(
+            url=url, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._post_search_log_before_sleep(
+            url=url, retry_state=retry_state
+        )
+        content = await self._toppreise.http_client_get_with_fallback(
+            url=url, retry=retry
+        )
+
+        # Get external product urls from the content
+        urls = self._toppreise._extract_product_urls_from_comparison_page(
+            content=content
+        )
+
+        return urls
+
+    async def _post_search(self, results: List[SearchResult]) -> List[SearchResult]:
+        """Post-search for additional embedded product URLs from the obtained results.
+
+        Note:
+            This function is used to extract embedded product URLs from
+            product listing pages (e.g. Toppreise, Google Shopping) if needed.
+
+        Args:
+            results: The list of SearchResult objects obtained from the search.
+        """
+        post_search_results: List[SearchResult] = []
+        for res in results:
+            url = res.url
+
+            # Extract embedded product URLs from the Toppreise product listing page
+            if any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
+                logger.debug(
+                    f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                )
+                post_search_urls = await self._post_search_toppreise_comparison(url=url)
+                logger.debug(
+                    f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                )
+
+                psr = [
+                    SearchResult(
+                        url=psu,
+                        domain=self._get_domain(url=psu),
+                        search_engine_name=res.search_engine_name,
+                    )
+                    for psu in post_search_urls
+                ]
+                post_search_results.extend(psr)
+
+        return post_search_results
 
     @staticmethod
     def _domain_in_host(domain: str, host: Host) -> bool:
@@ -625,63 +814,77 @@ class Search(DomainUtils):
     async def apply(
         self,
         search_term: str,
+        search_engine: SearchEngineName | str,
         language: Language,
         location: Location,
         num_results: int,
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
-        search_engines: List[SearchEngineName | str] | None = None,
     ) -> List[SearchResult]:
         """Performs a search and returns SearchResults.
 
         Args:
             search_term: The search term to use for the query.
+            search_engine: The search engine to use for the search.
             language: The language to use for the query ('hl' parameter).
             location: The location to use for the query ('gl' parameter).
             num_results: Max number of results per search engine.
             marketplaces: The marketplaces to include in the search.
             excluded_urls: The URLs to exclude from the search.
-            search_engines: The list of search engines to use for the search.
         """
-        if search_engines is None:
-            search_engines = list(SearchEngineName)
-        else:
-            search_engines = [
-                SearchEngineName(sen) if isinstance(sen, str) else sen
-                for sen in search_engines
-            ]
-        results: List[SearchResult] = []
+        logger.info(
+            f'Performing search for term="{search_term}" using engine="{search_engine}".'
+        )
+
+        # -------------------------------
+        # SEARCH
+        # -------------------------------
+        # Map string to SearchEngineName if needed
+        if isinstance(search_engine, str):
+            search_engine = SearchEngineName(search_engine)
 
         # Make SerpAPI google search
-        if SearchEngineName.GOOGLE in search_engines:
-            res = await self._google.search(
+        if search_engine == SearchEngineName.GOOGLE:
+            results = await self._google.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)
 
         # Make SerpAPI google shopping search
-        if SearchEngineName.GOOGLE_SHOPPING in search_engines:
-            res = await self._google_shopping.search(
+        elif search_engine == SearchEngineName.GOOGLE_SHOPPING:
+            results = await self._google_shopping.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)
 
         # Make Toppreise search
-        if SearchEngineName.TOPPREISE in search_engines:
-            res = await self._toppreise.search(
+        elif search_engine == SearchEngineName.TOPPREISE:
+            results = await self._toppreise.search(
                 search_term=search_term,
+                language=language,
                 num_results=num_results,
             )
-            results.extend(res)
 
+        # Other search engines can be added here (raise unknown engine error otherwise)
+        else:
+            raise ValueError(f"Unknown search engine: {search_engine}")
+
+        # -------------------------------
+        # POST-SEARCH URL EXTRACTION
+        # -------------------------------
+        post_search_results = await self._post_search(results=results)
+        post_search_results = post_search_results[:num_results]
+        results.extend(post_search_results)
+
+        # -------------------------------
+        # FILTERS
+        # -------------------------------
         # Apply filters
         results = [
             self._apply_filters(
@@ -693,5 +896,7 @@ class Search(DomainUtils):
             for res in results
         ]
 
-        logger.debug(f"Search produced a total of {len(results)} results.")
+        logger.info(
+            f'Search for term="{search_term}" using engine="{search_engine}" produced {len(results)} results.'
+        )
         return results
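Compared with 0.5.9, `apply` now takes exactly one `search_engine` per call, expands Toppreise comparison pages through the post-search step, and only then applies the filters. A hypothetical call under the new signature (the `Language`/`Location` objects and the concrete engine string are assumptions, not taken from the package):

```python
from typing import List


async def run_search(
    searcher: "Searcher", language: "Language", location: "Location"
) -> List["SearchResult"]:
    # Hypothetical usage of the new one-engine-per-call signature; the string
    # value "toppreise" is an assumption (SearchEngineName members are also accepted).
    return await searcher.apply(
        search_term="melatonin gummies",
        search_engine="toppreise",
        language=language,
        location=location,
        num_results=20,
    )
```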
@@ -3,6 +3,7 @@ from typing import List, Set, Tuple
 from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
 
 from fraudcrawler.settings import KNOWN_TRACKERS
+from fraudcrawler.base.base import ProductItem
 
 logger = logging.getLogger(__name__)
 
@@ -11,11 +12,19 @@ class URLCollector:
     """A class to collect and de-duplicate URLs."""
 
     def __init__(self):
-        self.collected_currently: Set[str] = set()
-        self.collected_previously: Set[str] = set()
+        self._collected_currently: Set[str] = set()
+        self._collected_previously: Set[str] = set()
+
+    def add_previously_collected_urls(self, urls: List[str]) -> None:
+        """Add a set of previously collected URLs to the internal state.
+
+        Args:
+            urls: A set of URLs that have been collected in previous runs.
+        """
+        self._collected_previously.update(urls)
 
     @staticmethod
-    def remove_tracking_parameters(url: str) -> str:
+    def _remove_tracking_parameters(url: str) -> str:
         """Remove tracking parameters from URLs.
 
         Args:
@@ -55,3 +64,33 @@ class URLCollector:
             fragment=parsed_url.fragment,
         )
         return urlunparse(clean_url)
+
+    async def apply(self, product: ProductItem) -> ProductItem:
+        """Manages the collection and deduplication of ProductItems.
+
+        Args:
+            product: The product item to process.
+        """
+        logger.debug(f'Processing product with url="{product.url}"')
+
+        # Remove tracking parameters from the URL
+        url = self._remove_tracking_parameters(product.url)
+        product.url = url
+
+        # deduplicate on current run
+        if url in self._collected_currently:
+            product.filtered = True
+            product.filtered_at_stage = "URL collection (current run deduplication)"
+            logger.debug(f"URL {url} already collected in current run")
+
+        # deduplicate on previous runs coming from a db
+        elif url in self._collected_previously:
+            product.filtered = True
+            product.filtered_at_stage = "URL collection (previous run deduplication)"
+            logger.debug(f"URL {url} as already collected in previous run")
+
+        # Add to currently collected URLs
+        else:
+            self._collected_currently.add(url)
+
+        return product
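The collector thereby becomes a pipeline stage of its own: it normalizes the URL, then marks the product as filtered if that URL was already seen in this run or in an earlier one. A small usage sketch (the driver function and seed URL are hypothetical; `ProductItem` fields are the ones used in the hunk above):

```python
async def collect(collector: "URLCollector", product: "ProductItem") -> "ProductItem":
    # Seed deduplication state with URLs stored from earlier runs (e.g. from a database),
    # then let the collector normalize and deduplicate the incoming product.
    collector.add_previously_collected_urls(["https://shop.example.com/item?id=1"])
    product = await collector.apply(product)
    if product.filtered:
        print(f"dropped at stage: {product.filtered_at_stage}")
    return product
```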
@@ -1,6 +1,6 @@
+from base64 import b64decode
 import logging
 from typing import List
-from base64 import b64decode
 
 import httpx
 from tenacity import RetryCallState
@@ -242,3 +242,17 @@ class ZyteAPI(DomainUtils):
             decoded_string = decoded_bytes.decode("utf-8")
             return decoded_string
         return None
+
+    async def unblock_url_content(self, url: str) -> bytes:
+        """Unblock the content of an URL using Zyte proxy mode.
+
+        Args:
+            url: The URL to fetch using Zyte proxy mode.
+        """
+        logger.debug(f'Unblock URL content using Zyte proxy for url="{url}"')
+        details = await self.details(url)
+
+        if not details or "httpResponseBody" not in details:
+            raise httpx.HTTPError("No httpResponseBody in Zyte response")
+
+        return b64decode(details["httpResponseBody"])
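`unblock_url_content` reuses the existing `details()` call and base64-decodes the `httpResponseBody` field of the Zyte response, so callers get raw page bytes back. A hedged usage sketch (the helper function and timeout are assumptions; the constructor arguments match how `Toppreise.__init__` builds its `ZyteAPI` above):

```python
import httpx

from fraudcrawler.scraping.zyte import ZyteAPI


async def fetch_blocked_page(api_key: str, url: str) -> bytes:
    async with httpx.AsyncClient(timeout=30.0) as client:
        zyte = ZyteAPI(http_client=client, api_key=api_key)
        # Raises httpx.HTTPError if the Zyte response carries no httpResponseBody.
        return await zyte.unblock_url_content(url)
```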
fraudcrawler/settings.py CHANGED
@@ -14,12 +14,22 @@ RETRY_EXP_BASE = 4
 RETRY_JITTER = 1
 RETRY_SKIP_IF_CODE = [400, 401, 403]  # Skip retrying on these HTTP status codes
 
-# Serp settings
+# Search settings
 GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
 GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
 SEARCH_DEFAULT_COUNTRY_CODES: List[str] = [
     # ".com",
 ]
+TOPPREISE_SEARCH_PATHS = {
+    "de": "produktsuche",
+    "fr": "chercher",
+    "default": "browse",
+}
+TOPPREISE_COMPARISON_PATHS = [
+    "preisvergleich",
+    "comparison-prix",
+    "price-comparison",
+]
 
 # URL De-duplication settings
 KNOWN_TRACKERS = [
@@ -76,8 +86,8 @@ PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevan
 PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
 
 # Async workers settings
-DEFAULT_N_SERP_WKRS = 10
-DEFAULT_N_ZYTE_WKRS = 10
+DEFAULT_N_SRCH_WKRS = 5
+DEFAULT_N_CNTX_WKRS = 15
 DEFAULT_N_PROC_WKRS = 10
 
 # HTTPX client settings
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.5.9
+Version: 0.6.0
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
@@ -11,6 +11,7 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
 Requires-Dist: httpx (>=0.28.1,<0.29.0)
 Requires-Dist: openai (>=1.68.2,<2.0.0)
@@ -160,7 +161,7 @@ see `CONTRIBUTING.md`
 ### Async Setup
 The `Orchestrator` class in `src/base/orchestrator.py` is designed to coordinate multiple services that may have interdependencies, allowing them to run in a semi-iterative manner. This means, for example, that product A can be at stage III of the pipeline while product B is still at stage I.
 
-This behavior is enabled through an asynchronous pipeline setup. The three main steps, `SerpAPI`, `ZyteAPI`, and `Processor`, all utilize `httpx.AsyncClient`. It is both possible and highly recommended to manage a single AsyncClient instance per application for efficiency. We provide a `HttpxAsyncClient` class that you can pass For more details, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
+This behavior is enabled through an asynchronous pipeline setup. The three main steps, `Search`, `Context Extraction`, and `Processing`, all utilize `httpx.AsyncClient`. It is both possible and highly recommended to manage a single AsyncClient instance per application for efficiency. We provide a `HttpxAsyncClient` class that you can pass For more details, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
 
 The following image provides a schematic representation of the package's async setup.
 ![Async Setup](https://github.com/open-veanu/fraudcrawler/raw/master/docs/assets/images/Fraudcrawler_Async_Setup.svg)
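For illustration, sharing one `AsyncClient` across the pipeline could look roughly like this (a sketch only, not the package's documented API; the `HttpxAsyncClient` wrapper mentioned in the README may differ, and the timeout value is an assumption):

```python
import asyncio

import httpx

from fraudcrawler.scraping.zyte import ZyteAPI  # import path as added in this release


async def main() -> None:
    # One AsyncClient per application: every stage shares the same connection pool
    # instead of opening its own client.
    async with httpx.AsyncClient(timeout=30.0) as client:
        zyte = ZyteAPI(http_client=client, api_key="...")
        # The Searcher and Processor shown in the diff above would receive the
        # same `client` instance; their import paths are not shown in this diff.
        _ = zyte


asyncio.run(main())
```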