fraudcrawler 0.5.8__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -72,7 +72,7 @@ class Processor:
         """Context aware logging before the request is made."""
         if retry_state:
             logger.debug(
-                f"Classifying product with url={url} using prompt={prompt} (Attempt {retry_state.attempt_number})."
+                f"Classifying product with url={url} using prompt={prompt.name} (Attempt {retry_state.attempt_number})."
             )
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before.")
@@ -84,7 +84,7 @@ class Processor:
         """Context aware logging before sleeping after a failed request."""
         if retry_state and retry_state.outcome:
             logger.warning(
-                f"Attempt {retry_state.attempt_number} of classifying product with url={url} using prompt={prompt} "
+                f"Attempt {retry_state.attempt_number} of classifying product with url={url} using prompt={prompt.name} "
                 f"failed with error: {retry_state.outcome.exception()}. "
                 f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
             )
@@ -160,7 +160,7 @@ class Processor:
         # Call the OpenAI API
         try:
             logger.debug(
-                f"Classifying product with url={url} using prompt={prompt.name} and user_prompt={user_prompt}."
+                f"Classifying product with url={url}, using prompt={prompt.name}."
             )
             # Perform the request and retry if necessary. There is some context aware logging
             # - `before`: before the request is made (or before retrying)
@@ -6,12 +6,18 @@ from typing import Dict, List
 from urllib.parse import quote_plus
 
 from bs4 import BeautifulSoup
+from bs4.element import Tag
 import httpx
-from tenacity import RetryCallState
+from tenacity import RetryCallState, AsyncRetrying
 
-from fraudcrawler.settings import SEARCH_DEFAULT_COUNTRY_CODES
+from fraudcrawler.settings import (
+    SEARCH_DEFAULT_COUNTRY_CODES,
+    TOPPREISE_SEARCH_PATHS,
+    TOPPREISE_COMPARISON_PATHS,
+)
 from fraudcrawler.base.base import Host, Language, Location, DomainUtils
 from fraudcrawler.base.retry import get_async_retry
+from fraudcrawler.scraping.zyte import ZyteAPI
 
 logger = logging.getLogger(__name__)
 
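The new settings constants pulled in by these imports are not part of this diff. As a rough, hedged illustration only, they plausibly look like the sketch below: a language-keyed mapping of Toppreise search paths with a "default" fallback (used by `Toppreise._get_search_endpoint` further down) and a list of comparison-path fragments (the link-filter comment later names 'preisvergleich', 'comparison-prix', and 'price-comparison'). The actual values in fraudcrawler/settings.py may differ.

```python
# Hypothetical sketch of the settings imported above; the real values live in
# fraudcrawler/settings.py and are not shown in this diff.
TOPPREISE_SEARCH_PATHS = {
    "de": "produktsuche",        # German search path, as seen in the old _endpoint
    "default": "produktsuche",   # fallback key used by Toppreise._get_search_endpoint
}

TOPPREISE_COMPARISON_PATHS = [
    "preisvergleich",            # fragments named in the link-filter comment below
    "comparison-prix",
    "price-comparison",
]
```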
@@ -131,6 +137,17 @@ class SerpAPI(SearchEngine):
         search_string += " site:" + " OR site:".join(s for s in sites)
         return search_string
 
+    @staticmethod
+    def _get_google_domain(location: Location) -> str:
+        """Gets the Google domain for the given location if they do not use the default pattern google.tld"""
+        if location.name == "Brazil":
+            return "google.com.br"
+        elif location.name == "United Kingdom":
+            return "google.co.uk"
+        elif location.name == "Argentina":
+            return "google.com.ar"
+        return f"google.{location.code}"
+
     async def _search(
         self,
         search_string: str,
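A minimal, self-contained sketch of the behaviour the new `_get_google_domain` helper encodes, assuming `Location` exposes the `name` and `code` attributes used above (the real class lives in `fraudcrawler.base.base`). The returned value feeds the `google_domain` SerpAPI parameter in the next hunk, while `gl`, `cr`, and `tbs` keep using the plain country code.

```python
from dataclasses import dataclass


@dataclass
class Location:
    """Minimal stand-in for fraudcrawler.base.base.Location (assumed fields only)."""
    name: str
    code: str


def get_google_domain(location: Location) -> str:
    # Special-case countries whose Google domain does not follow the google.<code> pattern
    overrides = {
        "Brazil": "google.com.br",
        "United Kingdom": "google.co.uk",
        "Argentina": "google.com.ar",
    }
    return overrides.get(location.name, f"google.{location.code}")


print(get_google_domain(Location("Switzerland", "ch")))  # google.ch (default pattern)
print(get_google_domain(Location("Brazil", "br")))       # google.com.br (override)
```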
@@ -169,16 +186,19 @@ class SerpAPI(SearchEngine):
             f"num_results={num_results}."
         )
 
-        # Setup the parameters
+        # Get Google domain and country code
+        google_domain = self._get_google_domain(location)
+        country_code = location.code
+
         params: Dict[str, str | int] = {
             "engine": engine,
             "q": search_string,
-            "google_domain": f"google.{location.code}",
+            "google_domain": google_domain,
             "location_requested": location.name,
             "location_used": location.name,
-            "tbs": f"ctr:{location.code.upper()}",
-            "cr": f"country{location.code.upper()}",
-            "gl": location.code,
+            "tbs": f"ctr:{country_code.upper()}",
+            "cr": f"country{country_code.upper()}",
+            "gl": country_code,
             "hl": language.code,
             "num": num_results,
             "api_key": self._api_key,
@@ -366,7 +386,7 @@ class SerpAPIGoogleShopping(SerpAPI):
 class Toppreise(SearchEngine):
     """Search engine for toppreise.ch."""
 
-    _endpoint = "https://www.toppreise.ch/produktsuche"
+    _endpoint = "https://www.toppreise.ch/"
     _headers = {
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
@@ -376,28 +396,42 @@ class Toppreise(SearchEngine):
         "Upgrade-Insecure-Requests": "1",
     }
 
-    def __init__(self, http_client: httpx.AsyncClient, zyte_api=None):
+    def __init__(self, http_client: httpx.AsyncClient, zyteapi_key: str):
         """Initializes the Toppreise client.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
-            zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
         self._http_client = http_client
-        self._zyte_api = zyte_api
+        self._zyteapi = ZyteAPI(http_client=http_client, api_key=zyteapi_key)
 
-    @property
-    def _search_engine_name(self) -> str:
-        """The name of the search engine."""
-        return SearchEngineName.TOPPREISE.value
+    @classmethod
+    def _get_search_endpoint(cls, language: Language) -> str:
+        """Get the search endpoint based on the language."""
+        search_path = TOPPREISE_SEARCH_PATHS.get(
+            language.code, TOPPREISE_SEARCH_PATHS["default"]
+        )
+        return f"{cls._endpoint}{search_path}"
 
     @staticmethod
-    def _get_external_product_urls(content: bytes) -> List[str]:
-        """Extracts external product URLs from the Toppreise search results page."""
+    def _extract_links(
+        element: Tag, ext_products: bool = True, comp_products: bool = True
+    ) -> List[str]:
+        """Extracts all relevant product URLs from a BeautifulSoup object of a Toppreise page.
 
-        # Parse the HTML
-        soup = BeautifulSoup(content, "html.parser")
-        links = soup.find_all("a", href=True)
+        Note:
+            Depending on the arguments, it extracts:
+            - product comparison URLs (i.e. https://www.toppreise.ch/preisvergleich/...)
+            - external product URLs (i.e. https://www.example.com/ext_...).
+
+        Args:
+            tag: BeautifulSoup Tag object containing the HTML to parse.
+            ext_products: Whether to extract external product URLs.
+            comp_products: Whether to extract product comparison URLs.
+        """
+        # Find all links in the page
+        links = element.find_all("a", href=True)
 
         # Filter links to only include external product links
         hrefs = [
@@ -408,7 +442,15 @@ class Toppreise(SearchEngine):
                 and (href := link.get("href"))  # Ensure href is not None
                 and not href.startswith("javascript:")  # Skip javascript links
                 and isinstance(href, str)  # Ensure href is a string
-                and "ext_" in href  # Skip links that are not external product link
+                # Make sure the link is either an external product link (href contains 'ext_')
+                # or is a search result link (href contains 'preisvergleich', 'comparison-prix', or 'price-comparison')
+                and (
+                    ("ext_" in href and ext_products)
+                    or (
+                        any(pth in href for pth in TOPPREISE_COMPARISON_PATHS)
+                        and comp_products
+                    )
+                )
             )
         ]
 
@@ -423,21 +465,100 @@ class Toppreise(SearchEngine):
 
         # Return deduplicated urls
         urls = list(set(urls))
+        return urls
+
+    def _extract_product_urls_from_search_page(self, content: bytes) -> List[str]:
+        """Extracts product urls from a Toppreise search page (i.e. https://www.toppreise.ch/produktsuche)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+        main = soup.find("div", id="Page_Browsing")
+        if not isinstance(main, Tag):
+            logger.warning("No main content found in Toppreise search page.")
+            return []
+
+        # Extract links (external product links and comparison links)
+        urls = self._extract_links(element=main)
+
+        logger.debug(f"Found {len(urls)} product URLs from Toppreise search results.")
+        return urls
+
+    def _extract_product_urls_from_comparison_page(self, content: bytes) -> List[str]:
+        """Extracts product urls from a Toppreise product comparison page (i.e. https://www.toppreise.ch/preisvergleich/...)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+
+        # Extract links (external product links only)
+        urls = self._extract_links(element=soup, comp_products=False)
+
         logger.debug(
-            f"Found {len(urls)} external product URLs from Toppreise search results."
+            f"Found {len(urls)} external product URLs from Toppreise comparison page."
         )
         return urls
 
-    async def _search(self, search_string: str, num_results: int) -> List[str]:
+    @property
+    def _search_engine_name(self) -> str:
+        """The name of the search engine."""
+        return SearchEngineName.TOPPREISE.value
+
+    async def http_client_get_with_fallback(
+        self, url: str, retry: AsyncRetrying
+    ) -> bytes:
+        """Performs a GET request with retries.
+
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
+        Args:
+            url: The URL to request.
+            retry: The retry strategy to use.
+        """
+        # Try to access the URL directly
+        try:
+            async for attempt in retry:
+                with attempt:
+                    response = await self._http_client.get(
+                        url=url,
+                        headers=self._headers,
+                    )
+                    response.raise_for_status()
+                    content = response.content
+
+        # If we get a 403 Error (can happen depending on IP/location of deployment),
+        # we try to unblock the URL using Zyte proxy mode
+        except httpx.HTTPStatusError as err_direct:
+            if err_direct.response.status_code == 403:
+                logger.warning(
+                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
+                )
+                try:
+                    content = await self._zyteapi.unblock_url_content(url)
+                except Exception as err_resolve:
+                    msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
+                    logger.error(msg)
+                    raise httpx.HTTPError(msg) from err_resolve
+            else:
+                raise err_direct
+        return content
+
+    async def _search(
+        self, search_string: str, language: Language, num_results: int
+    ) -> List[str]:
         """Performs a search on Toppreise and returns the URLs of the results.
 
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
         Args:
             search_string: The search string to use for the query.
+            language: The language to use for the query.
             num_results: Max number of results to return.
         """
         # Build the search URL for Toppreise
+        endpoint = self._get_search_endpoint(language=language)
         encoded_search = quote_plus(search_string)
-        url = f"{self._endpoint}?q={encoded_search}"
+        url = f"{endpoint}?q={encoded_search}"
         logger.debug(f"Toppreise search URL: {url}")
 
         # Perform the request and retry if necessary. There is some context aware logging:
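A hedged usage sketch of the new fallback helper. The import path of `Toppreise` and the key value are assumptions; `get_async_retry(stop_after=...)` matches how the Searcher code further down builds its retry object.

```python
import asyncio

import httpx

from fraudcrawler.base.retry import get_async_retry
from fraudcrawler.search import Toppreise  # assumed import path


async def main() -> None:
    async with httpx.AsyncClient() as client:
        toppreise = Toppreise(http_client=client, zyteapi_key="YOUR_ZYTE_API_KEY")
        retry = get_async_retry(stop_after=3)
        # Direct GET first; on a 403 the helper retries the fetch through Zyte proxy mode.
        content = await toppreise.http_client_get_with_fallback(
            url="https://www.toppreise.ch/produktsuche?q=ibuprofen",
            retry=retry,
        )
        print(f"fetched {len(content)} bytes")


asyncio.run(main())
```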
@@ -450,33 +571,10 @@ class Toppreise(SearchEngine):
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
             search_string=search_string, retry_state=retry_state
         )
-
-        content = None
-        try:
-            async for attempt in retry:
-                with attempt:
-                    response = await self._http_client.get(
-                        url=url,
-                        headers=self._headers,
-                    )
-                    response.raise_for_status()
-                    content = response.content
-        except httpx.HTTPStatusError as e:
-            if e.response.status_code == 403 and self._zyte_api:
-                logger.warning(
-                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
-                )
-                content = await self._unblock_url(url, self._zyte_api)
-                if content is None:
-                    raise e  # Re-raise if zyte fallback also failed
-            else:
-                raise e
-
-        if content is None:
-            raise httpx.HTTPError("Failed to fetch content")
+        content = await self.http_client_get_with_fallback(url=url, retry=retry)
 
         # Get external product urls from the content
-        urls = self._get_external_product_urls(content=content)
+        urls = self._extract_product_urls_from_search_page(content=content)
         urls = urls[:num_results]  # Limit to num_results if needed
 
         return urls
@@ -484,17 +582,20 @@ class Toppreise(SearchEngine):
     async def search(
         self,
         search_term: str,
+        language: Language,
         num_results: int,
     ) -> List[SearchResult]:
         """Performs a Toppreise search and returns SearchResults.
 
         Args:
             search_term: The search term to use for the query.
+            language: The language to use for the search.
             num_results: Max number of results to return.
         """
         # Perform the search
         urls = await self._search(
             search_string=search_term,
+            language=language,
             num_results=num_results,
         )
 
@@ -506,22 +607,124 @@ class Toppreise(SearchEngine):
         return results
 
 
-class Search(DomainUtils):
+class Searcher(DomainUtils):
     """Class to perform searches using different search engines."""
 
-    def __init__(self, http_client: httpx.AsyncClient, serpapi_key: str, zyte_api=None):
+    _post_search_retry_stop_after = 3
+
+    def __init__(
+        self, http_client: httpx.AsyncClient, serpapi_key: str, zyteapi_key: str
+    ):
         """Initializes the Search class with the given SerpAPI key.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
             serpapi_key: The API key for SERP API.
-            zyte_api: Optional ZyteAPI instance for fallback when direct access fails.
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
+        self._http_client = http_client
         self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
         self._google_shopping = SerpAPIGoogleShopping(
-            http_client=http_client, api_key=serpapi_key
+            http_client=http_client,
+            api_key=serpapi_key,
+        )
+        self._toppreise = Toppreise(
+            http_client=http_client,
+            zyteapi_key=zyteapi_key,
+        )
+
+    @staticmethod
+    def _post_search_log_before(url: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f'Performing post search for url="{url}" '
+                f"(attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+    @staticmethod
+    def _post_search_log_before_sleep(
+        url: str, retry_state: RetryCallState | None
+    ) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f'Attempt {retry_state.attempt_number} of post search for url="{url}" '
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
+
+    async def _post_search_toppreise_comparison(self, url: str) -> List[str]:
+        """Post-search for product URLs from a Toppreise product comparison page.
+
+        Note:
+            In comparison to the function Toppreise._search, here we extract the urls from
+            product comparison pages (f.e. https://www.toppreise.ch/preisvergleich/). They can
+            also be found in the results of a google search.
+
+        Args:
+            url: The URL of the Toppreise product listing page.
+        """
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry(stop_after=self._post_search_retry_stop_after)
+        retry.before = lambda retry_state: self._post_search_log_before(
+            url=url, retry_state=retry_state
         )
-        self._toppreise = Toppreise(http_client=http_client, zyte_api=zyte_api)
+        retry.before_sleep = lambda retry_state: self._post_search_log_before_sleep(
+            url=url, retry_state=retry_state
+        )
+        content = await self._toppreise.http_client_get_with_fallback(
+            url=url, retry=retry
+        )
+
+        # Get external product urls from the content
+        urls = self._toppreise._extract_product_urls_from_comparison_page(
+            content=content
+        )
+
+        return urls
+
+    async def _post_search(self, results: List[SearchResult]) -> List[SearchResult]:
+        """Post-search for additional embedded product URLs from the obtained results.
+
+        Note:
+            This function is used to extract embedded product URLs from
+            product listing pages (e.g. Toppreise, Google Shopping) if needed.
+
+        Args:
+            results: The list of SearchResult objects obtained from the search.
+        """
+        post_search_results: List[SearchResult] = []
+        for res in results:
+            url = res.url
+
+            # Extract embedded product URLs from the Toppreise product listing page
+            if any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
+                logger.debug(
+                    f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                )
+                post_search_urls = await self._post_search_toppreise_comparison(url=url)
+                logger.debug(
+                    f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                )
+
+                psr = [
+                    SearchResult(
+                        url=psu,
+                        domain=self._get_domain(url=psu),
+                        search_engine_name=res.search_engine_name,
+                    )
+                    for psu in post_search_urls
+                ]
+                post_search_results.extend(psr)
+
+        return post_search_results
 
     @staticmethod
     def _domain_in_host(domain: str, host: Host) -> bool:
@@ -611,63 +814,77 @@ class Search(DomainUtils):
     async def apply(
         self,
         search_term: str,
+        search_engine: SearchEngineName | str,
         language: Language,
         location: Location,
         num_results: int,
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
-        search_engines: List[SearchEngineName | str] | None = None,
     ) -> List[SearchResult]:
         """Performs a search and returns SearchResults.
 
         Args:
             search_term: The search term to use for the query.
+            search_engine: The search engine to use for the search.
             language: The language to use for the query ('hl' parameter).
             location: The location to use for the query ('gl' parameter).
             num_results: Max number of results per search engine.
             marketplaces: The marketplaces to include in the search.
             excluded_urls: The URLs to exclude from the search.
-            search_engines: The list of search engines to use for the search.
         """
-        if search_engines is None:
-            search_engines = list(SearchEngineName)
-        else:
-            search_engines = [
-                SearchEngineName(sen) if isinstance(sen, str) else sen
-                for sen in search_engines
-            ]
-        results: List[SearchResult] = []
+        logger.info(
+            f'Performing search for term="{search_term}" using engine="{search_engine}".'
+        )
+
+        # -------------------------------
+        # SEARCH
+        # -------------------------------
+        # Map string to SearchEngineName if needed
+        if isinstance(search_engine, str):
+            search_engine = SearchEngineName(search_engine)
 
         # Make SerpAPI google search
-        if SearchEngineName.GOOGLE in search_engines:
-            res = await self._google.search(
+        if search_engine == SearchEngineName.GOOGLE:
+            results = await self._google.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)
 
         # Make SerpAPI google shopping search
-        if SearchEngineName.GOOGLE_SHOPPING in search_engines:
-            res = await self._google_shopping.search(
+        elif search_engine == SearchEngineName.GOOGLE_SHOPPING:
+            results = await self._google_shopping.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)
 
         # Make Toppreise search
-        if SearchEngineName.TOPPREISE in search_engines:
-            res = await self._toppreise.search(
+        elif search_engine == SearchEngineName.TOPPREISE:
+            results = await self._toppreise.search(
                 search_term=search_term,
+                language=language,
                 num_results=num_results,
             )
-            results.extend(res)
 
+        # Other search engines can be added here (raise unknown engine error otherwise)
+        else:
+            raise ValueError(f"Unknown search engine: {search_engine}")
+
+        # -------------------------------
+        # POST-SEARCH URL EXTRACTION
+        # -------------------------------
+        post_search_results = await self._post_search(results=results)
+        post_search_results = post_search_results[:num_results]
+        results.extend(post_search_results)
+
+        # -------------------------------
+        # FILTERS
+        # -------------------------------
         # Apply filters
         results = [
             self._apply_filters(
@@ -679,5 +896,7 @@ class Search(DomainUtils):
             for res in results
         ]
 
-        logger.debug(f"Search produced a total of {len(results)} results.")
+        logger.info(
+            f'Search for term="{search_term}" using engine="{search_engine}" produced {len(results)} results.'
+        )
         return results
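Because `apply` now takes a single `search_engine` instead of an optional list, callers that want the old all-engines behaviour can fan out one call per engine themselves, for example with `asyncio.gather`. A sketch under assumptions: the import path shown below is a guess, and `language`/`location` objects are built elsewhere.

```python
import asyncio
from typing import List

from fraudcrawler.search import Searcher, SearchEngineName, SearchResult  # assumed import path


async def search_all_engines(
    searcher: Searcher, search_term: str, language, location, num_results: int
) -> List[SearchResult]:
    """Run one Searcher.apply call per engine and merge the results (old multi-engine behaviour)."""
    per_engine = await asyncio.gather(
        *(
            searcher.apply(
                search_term=search_term,
                search_engine=engine,
                language=language,
                location=location,
                num_results=num_results,
            )
            for engine in SearchEngineName
        )
    )
    # Flatten the per-engine lists into a single list of SearchResult objects
    return [res for results in per_engine for res in results]
```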
@@ -3,6 +3,7 @@ from typing import List, Set, Tuple
 from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
 
 from fraudcrawler.settings import KNOWN_TRACKERS
+from fraudcrawler.base.base import ProductItem
 
 logger = logging.getLogger(__name__)
 
@@ -11,11 +12,19 @@ class URLCollector:
     """A class to collect and de-duplicate URLs."""
 
     def __init__(self):
-        self.collected_currently: Set[str] = set()
-        self.collected_previously: Set[str] = set()
+        self._collected_currently: Set[str] = set()
+        self._collected_previously: Set[str] = set()
+
+    def add_previously_collected_urls(self, urls: List[str]) -> None:
+        """Add a set of previously collected URLs to the internal state.
+
+        Args:
+            urls: A set of URLs that have been collected in previous runs.
+        """
+        self._collected_previously.update(urls)
 
     @staticmethod
-    def remove_tracking_parameters(url: str) -> str:
+    def _remove_tracking_parameters(url: str) -> str:
         """Remove tracking parameters from URLs.
 
         Args:
55
64
  fragment=parsed_url.fragment,
56
65
  )
57
66
  return urlunparse(clean_url)
67
+
68
+ async def apply(self, product: ProductItem) -> ProductItem:
69
+ """Manages the collection and deduplication of ProductItems.
70
+
71
+ Args:
72
+ product: The product item to process.
73
+ """
74
+ logger.debug(f'Processing product with url="{product.url}"')
75
+
76
+ # Remove tracking parameters from the URL
77
+ url = self._remove_tracking_parameters(product.url)
78
+ product.url = url
79
+
80
+ # deduplicate on current run
81
+ if url in self._collected_currently:
82
+ product.filtered = True
83
+ product.filtered_at_stage = "URL collection (current run deduplication)"
84
+ logger.debug(f"URL {url} already collected in current run")
85
+
86
+ # deduplicate on previous runs coming from a db
87
+ elif url in self._collected_previously:
88
+ product.filtered = True
89
+ product.filtered_at_stage = "URL collection (previous run deduplication)"
90
+ logger.debug(f"URL {url} as already collected in previous run")
91
+
92
+ # Add to currently collected URLs
93
+ else:
94
+ self._collected_currently.add(url)
95
+
96
+ return product
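A sketch of the intended call pattern for the reworked collector, using a hypothetical minimal `ProductItem` stand-in; only the `url`, `filtered`, and `filtered_at_stage` fields touched above are assumed, and the collector would be a real `URLCollector` instance.

```python
from dataclasses import dataclass


@dataclass
class ProductItem:
    """Minimal stand-in for fraudcrawler.base.base.ProductItem (assumed fields only)."""
    url: str
    filtered: bool = False
    filtered_at_stage: str | None = None


async def demo(collector) -> None:  # collector: a URLCollector instance
    # Seed the collector with URLs already stored from earlier runs (e.g. from a database)
    collector.add_previously_collected_urls(["https://shop.example.com/item/1"])

    fresh = await collector.apply(ProductItem(url="https://shop.example.com/item/2"))
    dupe = await collector.apply(ProductItem(url="https://shop.example.com/item/2"))
    seen = await collector.apply(ProductItem(url="https://shop.example.com/item/1"))

    print(fresh.filtered)  # False -- first time this URL shows up
    print(dupe.filtered)   # True  -- deduplicated within the current run
    print(seen.filtered)   # True  -- deduplicated against previous runs
```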
@@ -1,6 +1,6 @@
+from base64 import b64decode
 import logging
 from typing import List
-from base64 import b64decode
 
 import httpx
 from tenacity import RetryCallState
@@ -242,3 +242,17 @@ class ZyteAPI(DomainUtils):
             decoded_string = decoded_bytes.decode("utf-8")
             return decoded_string
         return None
+
+    async def unblock_url_content(self, url: str) -> bytes:
+        """Unblock the content of an URL using Zyte proxy mode.
+
+        Args:
+            url: The URL to fetch using Zyte proxy mode.
+        """
+        logger.debug(f'Unblock URL content using Zyte proxy for url="{url}"')
+        details = await self.details(url)
+
+        if not details or "httpResponseBody" not in details:
+            raise httpx.HTTPError("No httpResponseBody in Zyte response")
+
+        return b64decode(details["httpResponseBody"])
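Finally, a short usage sketch of the new helper, mirroring the fallback path in `Toppreise.http_client_get_with_fallback`; the API key is a placeholder and the constructor arguments follow the call shown earlier in this diff.

```python
import asyncio

import httpx

from fraudcrawler.scraping.zyte import ZyteAPI


async def main() -> None:
    async with httpx.AsyncClient() as client:
        zyte = ZyteAPI(http_client=client, api_key="YOUR_ZYTE_API_KEY")
        # Fetch the raw page body through Zyte proxy mode; the helper base64-decodes
        # the httpResponseBody field of the Zyte response.
        body = await zyte.unblock_url_content("https://www.toppreise.ch/produktsuche?q=ibuprofen")
        print(f"fetched {len(body)} bytes")


asyncio.run(main())
```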