fraudcrawler-0.6.0-py3-none-any.whl → fraudcrawler-0.6.2-py3-none-any.whl

This diff represents the content of publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.

Potentially problematic release: this version of fraudcrawler might be problematic.

fraudcrawler/base/base.py CHANGED
@@ -140,6 +140,8 @@ class ProductItem(BaseModel):
  url_resolved: str
  search_engine_name: str
  domain: str
+ exact_search: bool = False
+ exact_search_match: bool = False
 
  # Context parameters
  product_name: str | None = None
@@ -217,6 +219,14 @@ class DomainUtils:
  """
 
  _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
+ _headers = {
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+ "Accept-Language": "en-US,en;q=0.5",
+ "Accept-Encoding": "gzip, deflate",
+ "Connection": "keep-alive",
+ "Upgrade-Insecure-Requests": "1",
+ }
 
  def _get_domain(self, url: str) -> str:
  """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
fraudcrawler/base/orchestrator.py CHANGED
@@ -3,10 +3,12 @@ import asyncio
  import logging
  from typing import cast, Dict, List, Self
 
- from bs4 import BeautifulSoup
  import httpx
+ import re
 
  from fraudcrawler.settings import (
+ EXACT_MATCH_PRODUCT_FIELDS,
+ EXACT_MATCH_FIELD_SEPARATOR,
  PROCESSOR_DEFAULT_MODEL,
  )
  from fraudcrawler.settings import (
@@ -27,8 +29,8 @@ from fraudcrawler import (
  Searcher,
  SearchEngineName,
  Enricher,
- URLCollector,
  ZyteAPI,
+ URLCollector,
  Processor,
  )
 
@@ -227,44 +229,29 @@ class Orchestrator(ABC):
 
  if not product.filtered:
  try:
- # Fetch the product context from Zyte API
+ # Fetch and enrich the product context from Zyte API
  details = await self._zyteapi.details(url=product.url)
- url_resolved = self._zyteapi.extract_url_resolved(details=details)
- if url_resolved:
- product.url_resolved = url_resolved
- product.product_name = self._zyteapi.extract_product_name(
- details=details
+ product = self._zyteapi.enrich_context(
+ product=product, details=details
  )
 
- # If the resolved URL is different from the original URL, we also need to update the domain as
- # otherwise the unresolved domain will be shown.
- # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
- if url_resolved and url_resolved != product.url:
- logger.debug(
- f"URL resolved for {product.url} is {url_resolved}"
- )
- product.domain = self._searcher._get_domain(url_resolved)
-
- product.product_price = self._zyteapi.extract_product_price(
- details=details
- )
- product.product_description = (
- self._zyteapi.extract_product_description(details=details)
- )
- product.product_images = self._zyteapi.extract_image_urls(
- details=details
- )
- product.probability = self._zyteapi.extract_probability(
- details=details
- )
- product.html = self._zyteapi.extract_html(details=details)
- if product.html:
- soup = BeautifulSoup(product.html, "html.parser")
- product.html_clean = soup.get_text(separator=" ", strip=True)
  # Filter the product based on the probability threshold
  if not self._zyteapi.keep_product(details=details):
  product.filtered = True
- product.filtered_at_stage = "Zyte probability threshold"
+ product.filtered_at_stage = (
+ "Context (Zyte probability threshold)"
+ )
+
+ # Check for exact match inside the full product context
+ product = self._check_exact_search(product=product)
+ if (
+ not product.filtered
+ and product.exact_search
+ and not product.exact_search_match
+ ):
+ product.filtered = True
+ product.filtered_at_stage = "Context (exact search)"
+
  except Exception as e:
  logger.warning(f"Error executing Zyte API search: {e}.")
  await queue_out.put(product)
@@ -502,6 +489,80 @@ class Orchestrator(ABC):
  **common_kwargs, # type: ignore[arg-type]
  )
 
+ @staticmethod
+ def _is_exact_search(search_term: str) -> bool:
+ """Check if the search term is an exact search (contains double quotation marks).
+
+ Args:
+ search_term: The search term to check.
+
+ Returns:
+ True if the search term contains double quotation marks, False otherwise.
+ """
+ return '"' in search_term
+
+ @staticmethod
+ def _extract_exact_search_terms(search_term: str) -> list[str]:
+ """Extract all exact search terms from within double quotation marks.
+
+ Args:
+ search_term: The search term that may contain double quotation marks.
+
+ Returns:
+ A list of extracted search terms without quotes, or empty list if no quotes found.
+ """
+ # Find all double-quoted strings
+ double_quote_matches = re.findall(r'"([^"]*)"', search_term)
+ return double_quote_matches
+
+ @staticmethod
+ def _check_exact_search_terms_match(
+ product: ProductItem,
+ exact_search_terms: list[str],
+ ) -> bool:
+ """Check if the product, represented by a string of selected attributes, matches ALL of the exact search terms.
+
+ Args:
+ product: The product item.
+ exact_search_terms: List of exact search terms to match against.
+ """
+ field_values = [
+ str(val)
+ for fld in EXACT_MATCH_PRODUCT_FIELDS
+ if (val := getattr(product, fld, None)) is not None
+ ]
+ product_str_lower = EXACT_MATCH_FIELD_SEPARATOR.join(field_values).lower()
+
+ return all(
+ re.search(re.escape(est.lower()), product_str_lower)
+ for est in exact_search_terms
+ )
+
+ def _check_exact_search(self, product: ProductItem) -> ProductItem:
+ """Checks if the search term requests an exact search and if yes, checks for conformity."""
+ # Check for exact search and apply regex matching
+ exact_search = self._is_exact_search(product.search_term)
+ product.exact_search = exact_search
+
+ # Only set exact_search_match if this was an exact search (contains quotes)
+ if exact_search:
+ exact_search_terms = self._extract_exact_search_terms(product.search_term)
+ if exact_search_terms:
+ product.exact_search_match = self._check_exact_search_terms_match(
+ product=product, exact_search_terms=exact_search_terms
+ )
+ logger.debug(
+ f"Exact search terms {exact_search_terms} matched: {product.exact_search_match} "
+ f"for offer with url={product.url}"
+ )
+ else:
+ logger.warning(
+ f"is_exact_search=True but no exact search terms found in search_term='{product.search_term}' "
+ f"for offer with url={product.url}"
+ )
+ # If exact_search is False, product.exact_search_match remains False (default value)
+ return product
+
  async def run(
  self,
  search_term: str,
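To make the new exact-search filter concrete, here is a small self-contained sketch of the matching logic added above: quoted phrases are pulled out of the search term, the selected product fields are joined with the configured separator, and every quoted phrase must appear (case-insensitively) in that joined string. The dict-based product stand-in and the field set below are simplified assumptions, not the library's ProductItem:

    import re

    # Simplified stand-ins for EXACT_MATCH_PRODUCT_FIELDS / EXACT_MATCH_FIELD_SEPARATOR.
    FIELDS = {"product_name", "product_description", "html"}
    SEPARATOR = "\n"

    product = {
        "product_name": "Arabica Kaffeebohnen 1kg",
        "product_description": "Ganze Bohnen, mittlere Roestung",
        "html": None,
    }
    search_term = 'beste "Kaffeebohnen" "1kg" kaufen'

    exact_terms = re.findall(r'"([^"]*)"', search_term)  # ['Kaffeebohnen', '1kg']
    joined = SEPARATOR.join(
        str(v) for f in FIELDS if (v := product.get(f)) is not None
    ).lower()
    keep = all(re.search(re.escape(t.lower()), joined) for t in exact_terms)
    print(bool(exact_terms), keep)  # True True -> the offer passes the exact-search stage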
fraudcrawler/launch_demo_pipeline.py CHANGED
@@ -97,4 +97,4 @@ def search(search_term: str):
 
 
  if __name__ == "__main__":
- search(search_term="electric cigarettes")
+ search(search_term="Kaffeebohnen")
fraudcrawler/scraping/search.py CHANGED
@@ -8,7 +8,7 @@ from urllib.parse import quote_plus
  from bs4 import BeautifulSoup
  from bs4.element import Tag
  import httpx
- from tenacity import RetryCallState, AsyncRetrying
+ from tenacity import RetryCallState
 
  from fraudcrawler.settings import (
  SEARCH_DEFAULT_COUNTRY_CODES,
@@ -45,6 +45,14 @@ class SearchEngine(ABC, DomainUtils):
 
  _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
 
+ def __init__(self, http_client: httpx.AsyncClient):
+ """Initializes the SearchEngine with the given HTTP client.
+
+ Args:
+ http_client: An httpx.AsyncClient to use for the async requests.
+ """
+ self._http_client = http_client
+
  @property
  @abstractmethod
  def _search_engine_name(self) -> str:
@@ -56,45 +64,81 @@ class SearchEngine(ABC, DomainUtils):
  """Apply the search with the given parameters and return results."""
  pass
 
+ def _create_search_result(self, url: str) -> SearchResult:
+ """From a given url it creates the class:`SearchResult` instance."""
+ # Get marketplace name
+ domain = self._get_domain(url=url)
+
+ # Create and return the SearchResult object
+ result = SearchResult(
+ url=url,
+ domain=domain,
+ search_engine_name=self._search_engine_name,
+ )
+ return result
+
  @classmethod
  def _log_before(
- cls, search_string: str, retry_state: RetryCallState | None
+ cls, url: str, params: dict | None, retry_state: RetryCallState | None
  ) -> None:
- """Context aware logging before the request is made."""
+ """Context aware logging before HTTP request is made."""
  if retry_state:
  logger.debug(
- f'Performing search in {cls.__name__} with q="{search_string}" '
- f"(attempt {retry_state.attempt_number})."
+ f'Performing HTTP request in {cls.__name__} to url="{url}" '
+ f"with params={params} (attempt {retry_state.attempt_number})."
  )
  else:
  logger.debug(f"retry_state is {retry_state}; not logging before.")
 
  @classmethod
  def _log_before_sleep(
- cls, search_string: str, retry_state: RetryCallState | None
+ cls, url: str, params: dict | None, retry_state: RetryCallState | None
  ) -> None:
- """Context aware logging before sleeping after a failed request."""
+ """Context aware logging before sleeping after a failed HTTP request."""
  if retry_state and retry_state.outcome:
  logger.warning(
- f'Attempt {retry_state.attempt_number} of {cls.__name__} search with q="{search_string}" '
+ f"Attempt {retry_state.attempt_number} of {cls.__name__} HTTP request "
+ f'to url="{url}" with params="{params}" '
  f"failed with error: {retry_state.outcome.exception()}. "
  f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
  )
  else:
  logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
 
- def _create_search_result(self, url: str) -> SearchResult:
- """From a given url it creates the class:`SearchResult` instance."""
- # Get marketplace name
- domain = self._get_domain(url=url)
+ async def http_client_get(
+ self, url: str, params: dict | None = None, headers: dict | None = None
+ ) -> httpx.Response:
+ """Performs a GET request with retries.
 
- # Create and return the SearchResult object
- result = SearchResult(
- url=url,
- domain=domain,
- search_engine_name=self._search_engine_name,
+ Args:
+ retry: The retry strategy to use.
+ url: The URL to request.
+ params: Query parameters for the request.
+ headers: HTTP headers to use for the request.
+ """
+ # Perform the request and retry if necessary. There is some context aware logging:
+ # - `before`: before the request is made (and before retrying)
+ # - `before_sleep`: if the request fails before sleeping
+ retry = get_async_retry()
+ retry.before = lambda retry_state: self._log_before(
+ url=url, params=params, retry_state=retry_state
  )
- return result
+ retry.before_sleep = lambda retry_state: self._log_before_sleep(
+ url=url, params=params, retry_state=retry_state
+ )
+
+ async for attempt in retry:
+ with attempt:
+ response = await self._http_client.get(
+ url=url,
+ params=params,
+ headers=headers,
+ )
+ response.raise_for_status()
+ return response
+
+ # In case of not entering the for loop (for some strange reason)
+ raise RuntimeError("Retry exhausted without success")
 
 
  class SerpAPI(SearchEngine):
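All engines now route their GET requests through http_client_get, which drives an async retry object from get_async_retry() (defined in fraudcrawler/base/retry.py, which is unchanged and not shown in this diff). As an assumption of roughly what such a factory could look like, built on tenacity's AsyncRetrying so the retry.before / retry.before_sleep hooks above can be attached afterwards:

    import httpx
    from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential

    def get_async_retry(stop_after: int = 3) -> AsyncRetrying:
        # Illustrative only: retry transient HTTP failures with exponential backoff.
        return AsyncRetrying(
            retry=retry_if_exception_type(httpx.HTTPError),
            stop=stop_after_attempt(stop_after),
            wait=wait_exponential(multiplier=1, max=30),
            reraise=True,
        )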
@@ -109,7 +153,7 @@ class SerpAPI(SearchEngine):
  http_client: An httpx.AsyncClient to use for the async requests.
  api_key: The API key for SerpAPI.
  """
- self._http_client = http_client
+ super().__init__(http_client=http_client)
  self._api_key = api_key
 
  @property
@@ -205,22 +249,10 @@ class SerpAPI(SearchEngine):
  }
  logger.debug(f"SerpAPI search with params: {params}")
 
- # Perform the request and retry if necessary. There is some context aware logging:
- # - `before`: before the request is made (and before retrying)
- # - `before_sleep`: if the request fails before sleeping
- retry = get_async_retry()
- retry.before = lambda retry_state: self._log_before(
- search_string=search_string, retry_state=retry_state
- )
- retry.before_sleep = lambda retry_state: self._log_before_sleep(
- search_string=search_string, retry_state=retry_state
+ # Perform the search request
+ response: httpx.Response = await self.http_client_get(
+ url=self._endpoint, params=params
  )
- async for attempt in retry:
- with attempt:
- response = await self._http_client.get(
- url=self._endpoint, params=params
- )
- response.raise_for_status()
 
  # Extract the URLs from the response
  data = response.json()
@@ -336,7 +368,21 @@ class SerpAPIGoogleShopping(SerpAPI):
  """
  results = data.get("shopping_results")
  if results is not None:
- return [url for res in results if (url := res.get("product_link"))]
+ # return [url for res in results if (url := res.get("product_link"))] # c.f. https://github.com/serpapi/public-roadmap/issues/3045
+ return [
+ url
+ for res in results
+ if (url := res.get("serpapi_immersive_product_api"))
+ ]
+ return []
+
+ @staticmethod
+ def _extract_product_urls_from_immersive_product_api(data: dict) -> List[str]:
+ """Extracts product urls from the serpapi immersive product API data."""
+ if results := data.get("product_results"):
+ stores = results.get("stores", [])
+ urls = [url for sre in stores if (url := sre.get("link"))]
+ return list(set(urls))
  return []
 
  async def search(
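Because SerpAPI's product_link field is affected by the linked roadmap issue, the search now collects serpapi_immersive_product_api links and later resolves them to merchant URLs via the immersive product payload. A rough sketch of the payload shape the extractor above expects; the field names come from the code, the values are invented:

    data = {
        "product_results": {
            "stores": [
                {"link": "https://shop-a.example/product/123"},
                {"link": "https://shop-b.example/p/456"},
            ]
        }
    }

    # Mirrors _extract_product_urls_from_immersive_product_api: collect store links, de-duplicate.
    stores = data.get("product_results", {}).get("stores", [])
    urls = list({url for store in stores if (url := store.get("link"))})
    print(urls)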
@@ -349,6 +395,9 @@ class SerpAPIGoogleShopping(SerpAPI):
  ) -> List[SearchResult]:
  """Performs a google shopping search using SerpApi and returns SearchResults.
 
+ Similar to Toppreise, this method extracts merchant URLs from Google Shopping product pages
+ and creates multiple SearchResult objects for each merchant URL found.
+
  Args:
  search_term: The search term to use for the query.
  language: The language to use for the query ('hl' parameter).
@@ -362,7 +411,7 @@ class SerpAPIGoogleShopping(SerpAPI):
  marketplaces=marketplaces,
  )
 
- # Perform the search
+ # Perform the search to get Google Shopping URLs
  urls = await self._search(
  search_string=search_string,
  language=language,
@@ -375,10 +424,10 @@ class SerpAPIGoogleShopping(SerpAPI):
  # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
  urls = urls[:num_results]
 
- # Create and return SearchResult objects from the URLs
+ # Create SearchResult objects from merchant URLs (similar to Toppreise pattern)
  results = [self._create_search_result(url=url) for url in urls]
  logger.debug(
- f'Produced {len(results)} results from SerpAPI with engine="{self._engine}" and q="{search_string}".'
+ f'Produced {len(results)} results from Google Shopping search with q="{search_string}".'
  )
  return results
 
@@ -387,14 +436,6 @@ class Toppreise(SearchEngine):
  """Search engine for toppreise.ch."""
 
  _endpoint = "https://www.toppreise.ch/"
- _headers = {
- "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
- "Accept-Language": "en-US,en;q=0.5",
- "Accept-Encoding": "gzip, deflate",
- "Connection": "keep-alive",
- "Upgrade-Insecure-Requests": "1",
- }
 
  def __init__(self, http_client: httpx.AsyncClient, zyteapi_key: str):
  """Initializes the Toppreise client.
@@ -403,9 +444,42 @@ class Toppreise(SearchEngine):
  http_client: An httpx.AsyncClient to use for the async requests.
  zyteapi_key: ZyteAPI key for fallback when direct access fails.
  """
- self._http_client = http_client
+ super().__init__(http_client=http_client)
  self._zyteapi = ZyteAPI(http_client=http_client, api_key=zyteapi_key)
 
+ async def http_client_get_with_fallback(self, url: str) -> bytes:
+ """Performs a GET request with retries.
+
+ If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+ content using Zyte proxy mode.
+
+ Args:
+ url: The URL to request.
+ """
+ # Try to access the URL directly
+ try:
+ response: httpx.Response = await self.http_client_get(
+ url=url, headers=self._headers
+ )
+ content = response.content
+
+ # If we get a 403 Error (can happen depending on IP/location of deployment),
+ # we try to unblock the URL using Zyte proxy mode
+ except httpx.HTTPStatusError as err_direct:
+ if err_direct.response.status_code == 403:
+ logger.warning(
+ f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
+ )
+ try:
+ content = await self._zyteapi.unblock_url_content(url)
+ except Exception as err_resolve:
+ msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
+ logger.error(msg)
+ raise httpx.HTTPError(msg) from err_resolve
+ else:
+ raise err_direct
+ return content
+
  @classmethod
  def _get_search_endpoint(cls, language: Language) -> str:
  """Get the search endpoint based on the language."""
@@ -502,46 +576,6 @@ class Toppreise(SearchEngine):
  """The name of the search engine."""
  return SearchEngineName.TOPPREISE.value
 
- async def http_client_get_with_fallback(
- self, url: str, retry: AsyncRetrying
- ) -> bytes:
- """Performs a GET request with retries.
-
- If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
- content using Zyte proxy mode.
-
- Args:
- url: The URL to request.
- retry: The retry strategy to use.
- """
- # Try to access the URL directly
- try:
- async for attempt in retry:
- with attempt:
- response = await self._http_client.get(
- url=url,
- headers=self._headers,
- )
- response.raise_for_status()
- content = response.content
-
- # If we get a 403 Error (can happen depending on IP/location of deployment),
- # we try to unblock the URL using Zyte proxy mode
- except httpx.HTTPStatusError as err_direct:
- if err_direct.response.status_code == 403:
- logger.warning(
- f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
- )
- try:
- content = await self._zyteapi.unblock_url_content(url)
- except Exception as err_resolve:
- msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
- logger.error(msg)
- raise httpx.HTTPError(msg) from err_resolve
- else:
- raise err_direct
- return content
-
  async def _search(
  self, search_string: str, language: Language, num_results: int
  ) -> List[str]:
@@ -561,17 +595,8 @@ class Toppreise(SearchEngine):
  url = f"{endpoint}?q={encoded_search}"
  logger.debug(f"Toppreise search URL: {url}")
 
- # Perform the request and retry if necessary. There is some context aware logging:
- # - `before`: before the request is made (and before retrying)
- # - `before_sleep`: if the request fails before sleeping
- retry = get_async_retry()
- retry.before = lambda retry_state: self._log_before(
- search_string=search_string, retry_state=retry_state
- )
- retry.before_sleep = lambda retry_state: self._log_before_sleep(
- search_string=search_string, retry_state=retry_state
- )
- content = await self.http_client_get_with_fallback(url=url, retry=retry)
+ # Perform the request with fallback if necessary
+ content = await self.http_client_get_with_fallback(url=url)
 
  # Get external product urls from the content
  urls = self._extract_product_urls_from_search_page(content=content)
@@ -633,61 +658,44 @@ class Searcher(DomainUtils):
  zyteapi_key=zyteapi_key,
  )
 
- @staticmethod
- def _post_search_log_before(url: str, retry_state: RetryCallState | None) -> None:
- """Context aware logging before the request is made."""
- if retry_state:
- logger.debug(
- f'Performing post search for url="{url}" '
- f"(attempt {retry_state.attempt_number})."
- )
- else:
- logger.debug(f"retry_state is {retry_state}; not logging before.")
+ async def _post_search_google_shopping_immersive(self, url: str) -> List[str]:
+ """Post-search for product URLs from a Google Shopping immersive product page.
 
- @staticmethod
- def _post_search_log_before_sleep(
- url: str, retry_state: RetryCallState | None
- ) -> None:
- """Context aware logging before sleeping after a failed request."""
- if retry_state and retry_state.outcome:
- logger.warning(
- f'Attempt {retry_state.attempt_number} of post search for url="{url}" '
- f"failed with error: {retry_state.outcome.exception()}. "
- f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
- )
- else:
- logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
+ Args:
+ url: The URL of the Google Shopping product page.
+ """
+ # Add SerpAPI key to the url
+ sep = "&" if "?" in url else "?"
+ url = f"{url}{sep}api_key={self._google_shopping._api_key}"
+
+ # Fetch the content of the Google Shopping product page
+ response = await self._google_shopping.http_client_get(url=url)
+
+ # Get external product urls from the data
+ data = response.json()
+ urls = self._google_shopping._extract_product_urls_from_immersive_product_api(
+ data=data
+ )
+ return urls
 
  async def _post_search_toppreise_comparison(self, url: str) -> List[str]:
  """Post-search for product URLs from a Toppreise product comparison page.
 
  Note:
  In comparison to the function Toppreise._search, here we extract the urls from
- product comparison pages (f.e. https://www.toppreise.ch/preisvergleich/). They can
- also be found in the results of a google search.
+ product comparison pages (f.e. https://www.toppreise.ch/preisvergleich/). These
+ pages can also be found in the results of a google search.
 
  Args:
  url: The URL of the Toppreise product listing page.
  """
- # Perform the request and retry if necessary. There is some context aware logging:
- # - `before`: before the request is made (and before retrying)
- # - `before_sleep`: if the request fails before sleeping
- retry = get_async_retry(stop_after=self._post_search_retry_stop_after)
- retry.before = lambda retry_state: self._post_search_log_before(
- url=url, retry_state=retry_state
- )
- retry.before_sleep = lambda retry_state: self._post_search_log_before_sleep(
- url=url, retry_state=retry_state
- )
- content = await self._toppreise.http_client_get_with_fallback(
- url=url, retry=retry
- )
+ # Perform the request with fallback if necessary
+ content = await self._toppreise.http_client_get_with_fallback(url=url)
 
  # Get external product urls from the content
  urls = self._toppreise._extract_product_urls_from_comparison_page(
  content=content
  )
-
  return urls
 
  async def _post_search(self, results: List[SearchResult]) -> List[SearchResult]:
@@ -703,9 +711,22 @@ class Searcher(DomainUtils):
  post_search_results: List[SearchResult] = []
  for res in results:
  url = res.url
+ post_search_urls: List[str] = []
+
+ # Extract embedded product URLs from the Google Shopping immersive product page
+ if "engine=google_immersive_product" in url:
+ logger.debug(
+ f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+ )
+ post_search_urls = await self._post_search_google_shopping_immersive(
+ url=url
+ )
+ logger.debug(
+ f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+ )
 
  # Extract embedded product URLs from the Toppreise product listing page
- if any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
+ elif any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
  logger.debug(
  f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
  )
@@ -714,15 +735,16 @@ class Searcher(DomainUtils):
  f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
  )
 
- psr = [
- SearchResult(
- url=psu,
- domain=self._get_domain(url=psu),
- search_engine_name=res.search_engine_name,
- )
- for psu in post_search_urls
- ]
- post_search_results.extend(psr)
+ # Add the extracted product URLs as SearchResult objects
+ psr = [
+ SearchResult(
+ url=psu,
+ domain=self._get_domain(url=psu),
+ search_engine_name=res.search_engine_name,
+ )
+ for psu in post_search_urls
+ ]
+ post_search_results.extend(psr)
 
  return post_search_results
 
fraudcrawler/scraping/zyte.py CHANGED
@@ -2,11 +2,12 @@ from base64 import b64decode
  import logging
  from typing import List
 
+ from bs4 import BeautifulSoup
  import httpx
  from tenacity import RetryCallState
 
  from fraudcrawler.settings import ZYTE_DEFALUT_PROBABILITY_THRESHOLD
- from fraudcrawler.base.base import DomainUtils
+ from fraudcrawler.base.base import DomainUtils, ProductItem
  from fraudcrawler.base.retry import get_async_retry
 
  logger = logging.getLogger(__name__)
@@ -61,77 +62,8 @@ class ZyteAPI(DomainUtils):
  else:
  logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
 
- async def details(self, url: str) -> dict:
- """Fetches product details for a single URL.
-
- Args:
- url: The URL to fetch product details from.
-
- Returns:
- A dictionary containing the product details, fields include:
- (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
- {
- "url": str,
- "statusCode": str,
- "product": {
- "name": str,
- "price": str,
- "mainImage": {"url": str},
- "images": [{"url": str}],
- "description": str,
- "metadata": {
- "probability": float,
- },
- },
- "httpResponseBody": base64
- }
- """
- logger.info(f"Fetching product details by Zyte for URL {url}.")
-
- # Perform the request and retry if necessary. There is some context aware logging:
- # - `before`: before the request is made (and before retrying)
- # - `before_sleep`: if the request fails before sleeping
- retry = get_async_retry()
- retry.before = lambda retry_state: self._log_before(
- url=url, retry_state=retry_state
- )
- retry.before_sleep = lambda retry_state: self._log_before_sleep(
- url=url, retry_state=retry_state
- )
- async for attempt in retry:
- with attempt:
- response = await self._http_client.post(
- url=self._endpoint,
- json={"url": url, **self._config},
- auth=(self._api_key, ""), # API key as username, empty password
- )
- response.raise_for_status()
-
- details = response.json()
- return details
-
- @staticmethod
- def keep_product(
- details: dict,
- threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
- ) -> bool:
- """Determines whether to keep the product based on the probability threshold.
-
- Args:
- details: A product details data dictionary.
- threshold: The probability threshold used to filter the products.
- """
- try:
- prob = float(details["product"]["metadata"]["probability"])
- except KeyError:
- logger.warning(
- f"Product with url={details.get('url')} has no probability value - product is ignored"
- )
- return False
- return prob > threshold
-
  @staticmethod
- def extract_product_name(details: dict) -> str | None:
+ def _extract_product_name(details: dict) -> str | None:
  """Extracts the product name from the product data.
 
  The input argument is a dictionary of the following structure:
@@ -144,7 +76,7 @@ class ZyteAPI(DomainUtils):
  return details.get("product", {}).get("name")
 
  @staticmethod
- def extract_url_resolved(details: dict) -> str | None:
+ def _extract_url_resolved(details: dict) -> str | None:
  """Extracts the resolved URL from the product data - this is automatically resolved by Zyte.
 
  The input argument is a dictionary of the following structure:
@@ -157,7 +89,7 @@ class ZyteAPI(DomainUtils):
  return details.get("product", {}).get("url")
 
  @staticmethod
- def extract_product_price(details: dict) -> str | None:
+ def _extract_product_price(details: dict) -> str | None:
  """Extracts the product price from the product data.
 
  The input argument is a dictionary of the following structure:
@@ -170,7 +102,7 @@ class ZyteAPI(DomainUtils):
  return details.get("product", {}).get("price")
 
  @staticmethod
- def extract_product_description(details: dict) -> str | None:
+ def _extract_product_description(details: dict) -> str | None:
  """Extracts the product description from the product data.
 
  The input argument is a dictionary of the following structure:
@@ -183,7 +115,7 @@ class ZyteAPI(DomainUtils):
  return details.get("product", {}).get("description")
 
  @staticmethod
- def extract_image_urls(details: dict) -> List[str]:
+ def _extract_image_urls(details: dict) -> List[str]:
  """Extracts the images from the product data.
 
  The input argument is a dictionary of the following structure:
@@ -206,7 +138,7 @@ class ZyteAPI(DomainUtils):
  return images
 
  @staticmethod
- def extract_probability(details: dict) -> float:
+ def _extract_probability(details: dict) -> float:
  """Extracts the probability from the product data.
 
  The input argument is a dictionary of the following structure:
@@ -223,7 +155,7 @@ class ZyteAPI(DomainUtils):
  )
 
  @staticmethod
- def extract_html(details: dict) -> str | None:
+ def _extract_html(details: dict) -> str | None:
  """Extracts the HTML from the Zyte API response.
 
  The input argument is a dictionary of the following structure:
@@ -243,6 +175,51 @@ class ZyteAPI(DomainUtils):
  return decoded_string
  return None
 
+ def enrich_context(self, product: ProductItem, details: dict) -> ProductItem:
+ product.product_name = self._extract_product_name(details=details)
+
+ url_resolved = self._extract_url_resolved(details=details)
+ if url_resolved:
+ product.url_resolved = url_resolved
+
+ # If the resolved URL is different from the original URL, we also need to update the domain as
+ # otherwise the unresolved domain will be shown.
+ # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
+ if url_resolved and url_resolved != product.url:
+ logger.debug(f"URL resolved for {product.url} is {url_resolved}")
+ product.domain = self._get_domain(url=url_resolved)
+
+ product.product_price = self._extract_product_price(details=details)
+ product.product_description = self._extract_product_description(details=details)
+ product.product_images = self._extract_image_urls(details=details)
+ product.probability = self._extract_probability(details=details)
+ product.html = self._extract_html(details=details)
+ if product.html:
+ soup = BeautifulSoup(product.html, "html.parser")
+ product.html_clean = soup.get_text(separator=" ", strip=True)
+
+ return product
+
+ @staticmethod
+ def keep_product(
+ details: dict,
+ threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
+ ) -> bool:
+ """Determines whether to keep the product based on the probability threshold.
+
+ Args:
+ details: A product details data dictionary.
+ threshold: The probability threshold used to filter the products.
+ """
+ try:
+ prob = float(details["product"]["metadata"]["probability"])
+ except KeyError:
+ logger.warning(
+ f"Product with url={details.get('url')} has no probability value - product is ignored"
+ )
+ return False
+ return prob > threshold
+
  async def unblock_url_content(self, url: str) -> bytes:
  """Unblock the content of an URL using Zyte proxy mode.
 
@@ -256,3 +233,52 @@ class ZyteAPI(DomainUtils):
  raise httpx.HTTPError("No httpResponseBody in Zyte response")
 
  return b64decode(details["httpResponseBody"])
+
+ async def details(self, url: str) -> dict:
+ """Fetches product details for a single URL.
+
+ Args:
+ url: The URL to fetch product details from.
+
+ Returns:
+ A dictionary containing the product details, fields include:
+ (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
+ {
+ "url": str,
+ "statusCode": str,
+ "product": {
+ "name": str,
+ "price": str,
+ "mainImage": {"url": str},
+ "images": [{"url": str}],
+ "description": str,
+ "metadata": {
+ "probability": float,
+ },
+ },
+ "httpResponseBody": base64
+ }
+ """
+ logger.info(f"Fetching product details by Zyte for URL {url}.")
+
+ # Perform the request and retry if necessary. There is some context aware logging:
+ # - `before`: before the request is made (and before retrying)
+ # - `before_sleep`: if the request fails before sleeping
+ retry = get_async_retry()
+ retry.before = lambda retry_state: self._log_before(
+ url=url, retry_state=retry_state
+ )
+ retry.before_sleep = lambda retry_state: self._log_before_sleep(
+ url=url, retry_state=retry_state
+ )
+ async for attempt in retry:
+ with attempt:
+ response = await self._http_client.post(
+ url=self._endpoint,
+ json={"url": url, **self._config},
+ auth=(self._api_key, ""), # API key as username, empty password
+ )
+ response.raise_for_status()
+
+ details = response.json()
+ return details
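For reference, a minimal Zyte-style details payload of the shape documented in the details() docstring above, and the kind of values enrich_context pulls out of it; the concrete values here are invented:

    details = {
        "url": "https://www.toppreise.ch/redirect/123",
        "product": {
            "name": "Kaffeebohnen 1kg",
            "url": "https://www.digitec.ch/de/product/123",  # resolved URL
            "price": "19.90",
            "description": "Ganze Bohnen",
            "images": [{"url": "https://example.ch/img/1.jpg"}],
            "metadata": {"probability": 0.87},
        },
    }

    # Simplified view of what enrich_context extracts (no ProductItem involved here):
    name = details.get("product", {}).get("name")
    url_resolved = details.get("product", {}).get("url")
    probability = details.get("product", {}).get("metadata", {}).get("probability", 0.0)
    print(name, url_resolved, probability > 0.1)  # 0.1 is ZYTE_DEFALUT_PROBABILITY_THRESHOLD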
fraudcrawler/settings.py CHANGED
@@ -78,6 +78,14 @@ ENRICHMENT_DEFAULT_LIMIT = 10
  # Zyte settings
  ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
 
+ # Exact match settings
+ EXACT_MATCH_PRODUCT_FIELDS = {
+ "url_resolvedproduct_name",
+ "product_description",
+ "html",
+ }
+ EXACT_MATCH_FIELD_SEPARATOR = "\n"
+
  # Processor settings
  PROCESSOR_DEFAULT_MODEL = "gpt-4o"
  PROCESSOR_DEFAULT_IF_MISSING = -1
fraudcrawler-0.6.0.dist-info/METADATA → fraudcrawler-0.6.2.dist-info/METADATA
@@ -1,9 +1,9 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.4
  Name: fraudcrawler
- Version: 0.6.0
+ Version: 0.6.2
  Summary: Intelligent Market Monitoring
- Home-page: https://github.com/open-veanu/fraudcrawler
  License: MIT
+ License-File: LICENSE
  Author: Domingo Bertus
  Author-email: hello@veanu.ch
  Requires-Python: >=3.11,<4.0
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
  Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
  Requires-Dist: httpx (>=0.28.1,<0.29.0)
  Requires-Dist: openai (>=1.68.2,<2.0.0)
fraudcrawler-0.6.0.dist-info/RECORD → fraudcrawler-0.6.2.dist-info/RECORD
@@ -1,22 +1,22 @@
  fraudcrawler/__init__.py,sha256=oSwuiyVBBk_HZfeZxXJR0ELtA4mc-upsBMVHSwuokEo,846
  fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/base/base.py,sha256=IbkPookAAkqDCztzAvVRnhh8rCsYGlY69eI6cw-Kiw0,7294
+ fraudcrawler/base/base.py,sha256=mTmojNyVrPEB69-aI-43dl0Jct174G4ziBiOudDFfTY,7795
  fraudcrawler/base/client.py,sha256=obxrd65pYja--XQbgpIMsMO6erMNdRG68SzNUs_YvLM,5856
  fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
  fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
- fraudcrawler/base/orchestrator.py,sha256=n0xrMJ9a3g3cRAMmhKEgyrwwrbgsaMno9DeyE93jB5U,27006
+ fraudcrawler/base/orchestrator.py,sha256=TiLKAJTBIPf0dxJuyZnCGIMWReC9gNvmEXqWwE0Ykbs,29002
  fraudcrawler/base/retry.py,sha256=1Ox7RsnnF62dP53rkidRHetA5mr2HS1R-7FskCVbwug,1178
- fraudcrawler/launch_demo_pipeline.py,sha256=hTzGFQDEwchDSwUx0HgG_TW5h9J7BXM7jn_iB8iI838,4636
+ fraudcrawler/launch_demo_pipeline.py,sha256=_aDqaPdxE_DMwQY5_vpqF2YjwLkWIZq5Z9Tz3sqLKdg,4629
  fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  fraudcrawler/processing/processor.py,sha256=zetp_G5g4z8sBUq-5qOxVRF2W2h9FIwolVxvMqhTmXs,7619
  fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
- fraudcrawler/scraping/search.py,sha256=pMjTQEewa-jP6l2ndhHy8CNIcO4svhZOm6N_LNuv3gs,33925
+ fraudcrawler/scraping/search.py,sha256=Anm8ymjCH3BVttogHY-_03YRc64yJswJ8OP8DW56O48,34546
  fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
- fraudcrawler/scraping/zyte.py,sha256=SxucVH_wtVhPNImIXvijM528IwL6zl6I3ndf0OdVXY0,8860
- fraudcrawler/settings.py,sha256=Bp9_9w_RRr_-PtZXcy30EKbT9YiOc8OLjEMaNZh06vc,3875
- fraudcrawler-0.6.0.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
- fraudcrawler-0.6.0.dist-info/METADATA,sha256=adpYLe_ToSth-YOZE3eh-KNUsNmcwcM_SE7pqKikNmU,6704
- fraudcrawler-0.6.0.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
- fraudcrawler-0.6.0.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
- fraudcrawler-0.6.0.dist-info/RECORD,,
+ fraudcrawler/scraping/zyte.py,sha256=sYpfwMuGE9MYpKvma_8x5Th2VBFn25Mqb4Wd7UChL_g,10215
+ fraudcrawler/settings.py,sha256=9ukAkxEzDtvy3xA-jSF3asr9uLIAATNQ-FqrsgCEDUk,4038
+ fraudcrawler-0.6.2.dist-info/METADATA,sha256=5hzWjCm1eQJ19Pm3vxUsS_EciUmbuppEpECi8ye2Wyw,6723
+ fraudcrawler-0.6.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ fraudcrawler-0.6.2.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+ fraudcrawler-0.6.2.dist-info/licenses/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+ fraudcrawler-0.6.2.dist-info/RECORD,,
fraudcrawler-0.6.0.dist-info/WHEEL → fraudcrawler-0.6.2.dist-info/WHEEL
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.0.0
+ Generator: poetry-core 2.2.1
  Root-Is-Purelib: true
  Tag: py3-none-any