fraudcrawler 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of fraudcrawler might be problematic.

fraudcrawler/__init__.py CHANGED
@@ -2,7 +2,9 @@ from fraudcrawler.scraping.search import Searcher, SearchEngineName
  from fraudcrawler.scraping.enrich import Enricher
  from fraudcrawler.scraping.url import URLCollector
  from fraudcrawler.scraping.zyte import ZyteAPI
+ from fraudcrawler.scraping.config import ScrapingConfig
  from fraudcrawler.processing.processor import Processor
+ from fraudcrawler.processing.config import ProcessingConfig
  from fraudcrawler.base.orchestrator import Orchestrator
  from fraudcrawler.base.client import FraudCrawlerClient
  from fraudcrawler.base.base import (
@@ -22,7 +24,9 @@ __all__ = [
  "Enricher",
  "URLCollector",
  "ZyteAPI",
+ "ScrapingConfig",
  "Processor",
+ "ProcessingConfig",
  "Orchestrator",
  "ProductItem",
  "FraudCrawlerClient",
fraudcrawler/base/base.py CHANGED
@@ -45,6 +45,7 @@ class Setup(BaseSettings):
  dataforseo_pwd: str
  zyteapi_key: str
  openaiapi_key: str
+ pypy_token: str

  class Config:
  env_file = ".env"
@@ -140,6 +141,8 @@ class ProductItem(BaseModel):
  url_resolved: str
  search_engine_name: str
  domain: str
+ exact_search: bool = False
+ exact_search_match: bool = False

  # Context parameters
  product_name: str | None = None
fraudcrawler/base/client.py CHANGED
@@ -19,7 +19,9 @@ from fraudcrawler.base.base import (
  ProductItem,
  )
  from fraudcrawler.base.orchestrator import Orchestrator
+ from fraudcrawler.scraping.config import ScrapingConfig
  from fraudcrawler.scraping.search import SearchEngineName
+ from fraudcrawler.processing.config import ProcessingConfig

  logger = logging.getLogger(__name__)

@@ -141,15 +143,19 @@ class FraudCrawlerClient(Orchestrator):

  asyncio.run(
  _run(
- search_term=search_term,
- search_engines=nrm_search_engines,
- language=language,
- location=location,
- deepness=deepness,
- prompts=prompts,
- marketplaces=marketplaces,
- excluded_urls=excluded_urls,
- previously_collected_urls=previously_collected_urls,
+ scraping_config=ScrapingConfig(
+ search_term=search_term,
+ search_engines=nrm_search_engines,
+ language=language,
+ location=location,
+ deepness=deepness,
+ marketplaces=marketplaces,
+ excluded_urls=excluded_urls,
+ previously_collected_urls=previously_collected_urls,
+ ),
+ processing_config=ProcessingConfig(
+ prompts=prompts,
+ ),
  )
  )

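Downstream code that calls the orchestrator directly has to migrate to the new signature; the client shown above keeps its flat keyword arguments and performs the wrapping itself. A rough migration sketch (the `orchestrator`, `language`, `location`, `deepness` and `prompts` objects are placeholders, not defined in this diff):

    from fraudcrawler import ScrapingConfig, ProcessingConfig, SearchEngineName

    # 0.6.1 (old): await orchestrator.run(search_term=..., search_engines=..., language=...,
    #                                      location=..., deepness=..., prompts=..., ...)

    # 0.6.3 (new): the same values travel inside the two config models
    await orchestrator.run(  # inside an async function
        scraping_config=ScrapingConfig(
            search_term="Kaffeebohnen",
            search_engines=list(SearchEngineName),
            language=language,
            location=location,
            deepness=deepness,
        ),
        processing_config=ProcessingConfig(prompts=prompts),
    )
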
fraudcrawler/base/orchestrator.py CHANGED
@@ -3,10 +3,12 @@ import asyncio
  import logging
  from typing import cast, Dict, List, Self

- from bs4 import BeautifulSoup
  import httpx
+ import re

  from fraudcrawler.settings import (
+ EXACT_MATCH_PRODUCT_FIELDS,
+ EXACT_MATCH_FIELD_SEPARATOR,
  PROCESSOR_DEFAULT_MODEL,
  )
  from fraudcrawler.settings import (
@@ -15,11 +17,9 @@ from fraudcrawler.settings import (
  DEFAULT_N_PROC_WKRS,
  )
  from fraudcrawler.base.base import (
- Deepness,
  Host,
  Language,
  Location,
- Prompt,
  ProductItem,
  HttpxAsyncClient,
  )
@@ -27,9 +27,11 @@ from fraudcrawler import (
  Searcher,
  SearchEngineName,
  Enricher,
- URLCollector,
  ZyteAPI,
+ URLCollector,
+ ScrapingConfig,
  Processor,
+ ProcessingConfig,
  )

  logger = logging.getLogger(__name__)
@@ -227,44 +229,29 @@ class Orchestrator(ABC):

  if not product.filtered:
  try:
- # Fetch the product context from Zyte API
+ # Fetch and enrich the product context from Zyte API
  details = await self._zyteapi.details(url=product.url)
- url_resolved = self._zyteapi.extract_url_resolved(details=details)
- if url_resolved:
- product.url_resolved = url_resolved
- product.product_name = self._zyteapi.extract_product_name(
- details=details
+ product = self._zyteapi.enrich_context(
+ product=product, details=details
  )

- # If the resolved URL is different from the original URL, we also need to update the domain as
- # otherwise the unresolved domain will be shown.
- # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
- if url_resolved and url_resolved != product.url:
- logger.debug(
- f"URL resolved for {product.url} is {url_resolved}"
- )
- product.domain = self._searcher._get_domain(url_resolved)
-
- product.product_price = self._zyteapi.extract_product_price(
- details=details
- )
- product.product_description = (
- self._zyteapi.extract_product_description(details=details)
- )
- product.product_images = self._zyteapi.extract_image_urls(
- details=details
- )
- product.probability = self._zyteapi.extract_probability(
- details=details
- )
- product.html = self._zyteapi.extract_html(details=details)
- if product.html:
- soup = BeautifulSoup(product.html, "html.parser")
- product.html_clean = soup.get_text(separator=" ", strip=True)
  # Filter the product based on the probability threshold
  if not self._zyteapi.keep_product(details=details):
  product.filtered = True
- product.filtered_at_stage = "Zyte probability threshold"
+ product.filtered_at_stage = (
+ "Context (Zyte probability threshold)"
+ )
+
+ # Check for exact match inside the full product context
+ product = self._check_exact_search(product=product)
+ if (
+ not product.filtered
+ and product.exact_search
+ and not product.exact_search_match
+ ):
+ product.filtered = True
+ product.filtered_at_stage = "Context (exact search)"
+
  except Exception as e:
  logger.warning(f"Error executing Zyte API search: {e}.")
  await queue_out.put(product)
@@ -274,14 +261,14 @@
  self,
  queue_in: asyncio.Queue[ProductItem | None],
  queue_out: asyncio.Queue[ProductItem | None],
- prompts: List[Prompt],
+ processing_config: ProcessingConfig,
  ) -> None:
  """Collects the product details from the queue_in, processes them (filtering, relevance, etc.) and puts the results into queue_out.

  Args:
  queue_in: The input queue containing the product details.
  queue_out: The output queue to put the processed product details.
- prompts: The list of prompts to use for classification.
+ processing_config: Sets up the processing pipeline step.
  """

  # Process the products
@@ -295,7 +282,7 @@
  if not product.filtered:
  try:
  # Run all the configured prompts
- for prompt in prompts:
+ for prompt in processing_config.prompts:
  classification = await self._processor.classify(
  product=product,
  prompt=prompt,
@@ -331,7 +318,7 @@
  n_srch_wkrs: int,
  n_cntx_wkrs: int,
  n_proc_wkrs: int,
- prompts: List[Prompt],
+ processing_config: ProcessingConfig,
  ) -> None:
  """Sets up the necessary queues and workers for the async framework.

@@ -339,7 +326,7 @@
  n_srch_wkrs: Number of async workers for search.
  n_cntx_wkrs: Number of async workers for context extraction.
  n_proc_wkrs: Number of async workers for processing.
- prompts: The list of prompts used for the classification by func:`Processor.classify`.
+ processing_config: Sets up the processing pipeline step.
  """

  # Setup the input/output queues for the workers
@@ -382,7 +369,7 @@
  self._proc_execute(
  queue_in=proc_queue,
  queue_out=res_queue,
- prompts=prompts,
+ processing_config=processing_config,
  )
  )
  for _ in range(n_proc_wkrs)
@@ -436,13 +423,7 @@
  async def _add_srch_items(
  self,
  queue: asyncio.Queue[dict | None],
- search_term: str,
- search_engines: List[SearchEngineName],
- language: Language,
- location: Location,
- deepness: Deepness,
- marketplaces: List[Host] | None,
- excluded_urls: List[Host] | None,
+ scraping_config: ScrapingConfig,
  ) -> None:
  """Adds all the (enriched) search_term (as srch items) to the queue.

@@ -461,12 +442,17 @@
  for each search_engine
  add item to queue
  """
+ search_term = scraping_config.search_term
+ search_engines = scraping_config.search_engines
+ language = scraping_config.language
+ location = scraping_config.location
+ deepness = scraping_config.deepness
  common_kwargs = {
  "queue": queue,
  "language": language,
  "location": location,
- "marketplaces": marketplaces,
- "excluded_urls": excluded_urls,
+ "marketplaces": scraping_config.marketplaces,
+ "excluded_urls": scraping_config.excluded_urls,
  }

  # Add initial items to the queue
@@ -502,48 +488,101 @@
  **common_kwargs, # type: ignore[arg-type]
  )

+ @staticmethod
+ def _is_exact_search(search_term: str) -> bool:
+ """Check if the search term is an exact search (contains double quotation marks).
+
+ Args:
+ search_term: The search term to check.
+ """
+ return '"' in search_term
+
+ @staticmethod
+ def _extract_exact_search_terms(search_term: str) -> list[str]:
+ """Extract all exact search terms from within double quotation marks (empty if no quotes found).
+
+ Args:
+ search_term: The search term that may contain double quotation marks.
+ """
+ # Find all double-quoted strings
+ double_quote_matches = re.findall(r'"([^"]*)"', search_term)
+ return double_quote_matches
+
+ @staticmethod
+ def _check_exact_search_terms_match(
+ product: ProductItem,
+ exact_search_terms: list[str],
+ ) -> bool:
+ """Check if the product, represented by a string of selected attributes, matches ALL of the exact search terms.
+
+ Args:
+ product: The product item.
+ exact_search_terms: List of exact search terms to match against.
+ """
+ field_values = [
+ str(val)
+ for fld in EXACT_MATCH_PRODUCT_FIELDS
+ if (val := getattr(product, fld, None)) is not None
+ ]
+ product_str_lower = EXACT_MATCH_FIELD_SEPARATOR.join(field_values).lower()
+
+ return all(
+ re.search(re.escape(est.lower()), product_str_lower)
+ for est in exact_search_terms
+ )
+
+ def _check_exact_search(self, product: ProductItem) -> ProductItem:
+ """Checks if the search term requests an exact search and if yes, checks for conformity."""
+ # Check for exact search and apply regex matching
+ exact_search = self._is_exact_search(product.search_term)
+ product.exact_search = exact_search
+
+ # Only set exact_search_match if this was an exact search (contains quotes)
+ if exact_search:
+ exact_search_terms = self._extract_exact_search_terms(product.search_term)
+ if exact_search_terms:
+ product.exact_search_match = self._check_exact_search_terms_match(
+ product=product, exact_search_terms=exact_search_terms
+ )
+ logger.debug(
+ f"Exact search terms {exact_search_terms} matched: {product.exact_search_match} "
+ f"for offer with url={product.url}"
+ )
+ else:
+ logger.warning(
+ f"is_exact_search=True but no exact search terms found in search_term='{product.search_term}' "
+ f"for offer with url={product.url}"
+ )
+ # If exact_search is False, product.exact_search_match remains False (default value)
+ return product
+
  async def run(
  self,
- search_term: str,
- search_engines: List[SearchEngineName],
- language: Language,
- location: Location,
- deepness: Deepness,
- prompts: List[Prompt],
- marketplaces: List[Host] | None = None,
- excluded_urls: List[Host] | None = None,
- previously_collected_urls: List[str] | None = None,
+ scraping_config: ScrapingConfig,
+ processing_config: ProcessingConfig,
  ) -> None:
  """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.

  Args:
- search_term: The search term for the query.
- search_engines: The list of search engines to use for the search query.
- language: The language to use for the query.
- location: The location to use for the query.
- deepness: The search depth and enrichment details.
- prompts: The list of prompt to use for classification.
- marketplaces: The marketplaces to include in the search.
- excluded_urls: The URLs to exclude from the search.
- previously_collected_urls: The urls that have been collected previously and are ignored.
+ scraping_config: Sets up the scraping pipeline step.
+ processing_config: Sets up the processing pipeline step.
  """
  # ---------------------------
  # INITIAL SETUP
  # ---------------------------
- # Ensure we have at least one search engine
- if not search_engines:
+ # Ensure we have at least one search engine (the list might be empty)
+ if not scraping_config.search_engines:
  logger.warning(
  "No search engines specified, using all available search engines"
  )
- search_engines = list(SearchEngineName)
+ scraping_config.search_engines = list(SearchEngineName)

  # Handle previously collected URLs
- if previously_collected_urls:
- self._url_collector.add_previously_collected_urls(
- urls=previously_collected_urls
- )
+ if pcurls := scraping_config.previously_collected_urls:
+ self._url_collector.add_previously_collected_urls(urls=pcurls)

  # Setup the async framework
+ deepness = scraping_config.deepness
  n_terms_max = 1 + (
  deepness.enrichment.additional_terms if deepness.enrichment else 0
  )
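The convention introduced by these helpers: any double-quoted fragment of the search term becomes a mandatory, case-insensitive substring of the product context assembled from EXACT_MATCH_PRODUCT_FIELDS. A standalone illustration of the same two regex steps (the sample strings are invented):

    import re

    search_term = 'Gorenje "R619FEW5"'
    exact_terms = re.findall(r'"([^"]*)"', search_term)
    # exact_terms == ['R619FEW5']

    product_text = "Gorenje R619FEW5 fridge freezer, white".lower()
    matched = all(re.search(re.escape(t.lower()), product_text) for t in exact_terms)
    # matched is True, so this offer would pass the "Context (exact search)" filter
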
@@ -558,7 +597,7 @@
  n_srch_wkrs=n_srch_wkrs,
  n_cntx_wkrs=n_cntx_wkrs,
  n_proc_wkrs=n_proc_wkrs,
- prompts=prompts,
+ processing_config=processing_config,
  )

  # Check setup of async framework
@@ -581,13 +620,7 @@
  srch_queue = self._queues["srch"]
  await self._add_srch_items(
  queue=srch_queue,
- search_term=search_term,
- search_engines=search_engines,
- language=language,
- location=location,
- deepness=deepness,
- marketplaces=marketplaces,
- excluded_urls=excluded_urls,
+ scraping_config=scraping_config,
  )

  # -----------------------------
fraudcrawler/launch_demo_pipeline.py CHANGED
@@ -97,4 +97,4 @@ def search(search_term: str):


  if __name__ == "__main__":
- search(search_term='Gorenje "R619FEW5"')
+ search(search_term="Kaffeebohnen")
fraudcrawler/processing/config.py ADDED
@@ -0,0 +1,12 @@
+ from pydantic import BaseModel, Field
+ from typing import List
+
+ from fraudcrawler.base.base import Prompt
+
+
+ class ProcessingConfig(BaseModel):
+ """Sets up the processing pipeline step."""
+
+ prompts: List[Prompt] = Field(
+ description="The list of prompts to use for classification."
+ )
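Because `prompts` has no default, building the model without it fails validation. A small sketch of that behaviour (the empty call is for illustration only):

    from pydantic import ValidationError
    from fraudcrawler import ProcessingConfig

    try:
        ProcessingConfig()  # no prompts supplied
    except ValidationError as exc:
        print(exc)  # reports the missing "prompts" field
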
fraudcrawler/scraping/config.py ADDED
@@ -0,0 +1,32 @@
+ from pydantic import BaseModel, Field
+ from typing import List
+
+ from fraudcrawler.scraping.search import SearchEngineName
+ from fraudcrawler.base.base import (
+ Language,
+ Location,
+ Deepness,
+ Host,
+ )
+
+
+ class ScrapingConfig(BaseModel):
+ """Sets up the scraping pipeline step."""
+
+ search_term: str = Field(description="The search term for the query.")
+ search_engines: List[SearchEngineName] = Field(
+ description="The list of search engines to use for the search query."
+ )
+ language: Language = Field(description="The language to use for the query.")
+ location: Location = Field(description="The location to use for the query.")
+ deepness: Deepness = Field(description="The search depth and enrichment details.")
+ marketplaces: List[Host] | None = Field(
+ default=None, description="The marketplaces to include in the search."
+ )
+ excluded_urls: List[Host] | None = Field(
+ default=None, description="The URLs to exclude from the search."
+ )
+ previously_collected_urls: List[str] | None = Field(
+ default=None,
+ description="The URLs that have been collected previously and are ignored.",
+ )
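The three URL-related fields are the only optional ones and default to None. A hedged construction sketch (the `language`, `location` and `deepness` values are placeholders for instances of the package's own Language, Location and Deepness types, whose constructors are not part of this diff):

    from fraudcrawler import ScrapingConfig, SearchEngineName

    cfg = ScrapingConfig(
        search_term='Gorenje "R619FEW5"',      # the quoted part triggers the exact-search check
        search_engines=list(SearchEngineName),
        language=language,
        location=location,
        deepness=deepness,
    )
    assert cfg.marketplaces is None
    assert cfg.excluded_urls is None
    assert cfg.previously_collected_urls is None
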
fraudcrawler/scraping/zyte.py CHANGED
@@ -2,11 +2,12 @@ from base64 import b64decode
  import logging
  from typing import List

+ from bs4 import BeautifulSoup
  import httpx
  from tenacity import RetryCallState

  from fraudcrawler.settings import ZYTE_DEFALUT_PROBABILITY_THRESHOLD
- from fraudcrawler.base.base import DomainUtils
+ from fraudcrawler.base.base import DomainUtils, ProductItem
  from fraudcrawler.base.retry import get_async_retry

  logger = logging.getLogger(__name__)
@@ -61,77 +62,8 @@ class ZyteAPI(DomainUtils):
  else:
  logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")

- async def details(self, url: str) -> dict:
- """Fetches product details for a single URL.
-
- Args:
- url: The URL to fetch product details from.
-
- Returns:
- A dictionary containing the product details, fields include:
- (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
- {
- "url": str,
- "statusCode": str,
- "product": {
- "name": str,
- "price": str,
- "mainImage": {"url": str},
- "images": [{"url": str}],
- "description": str,
- "metadata": {
- "probability": float,
- },
- },
- "httpResponseBody": base64
- }
- """
- logger.info(f"Fetching product details by Zyte for URL {url}.")
-
- # Perform the request and retry if necessary. There is some context aware logging:
- # - `before`: before the request is made (and before retrying)
- # - `before_sleep`: if the request fails before sleeping
- retry = get_async_retry()
- retry.before = lambda retry_state: self._log_before(
- url=url, retry_state=retry_state
- )
- retry.before_sleep = lambda retry_state: self._log_before_sleep(
- url=url, retry_state=retry_state
- )
- async for attempt in retry:
- with attempt:
- response = await self._http_client.post(
- url=self._endpoint,
- json={"url": url, **self._config},
- auth=(self._api_key, ""), # API key as username, empty password
- )
- response.raise_for_status()
-
- details = response.json()
- return details
-
- @staticmethod
- def keep_product(
- details: dict,
- threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
- ) -> bool:
- """Determines whether to keep the product based on the probability threshold.
-
- Args:
- details: A product details data dictionary.
- threshold: The probability threshold used to filter the products.
- """
- try:
- prob = float(details["product"]["metadata"]["probability"])
- except KeyError:
- logger.warning(
- f"Product with url={details.get('url')} has no probability value - product is ignored"
- )
- return False
- return prob > threshold
-
  @staticmethod
- def extract_product_name(details: dict) -> str | None:
+ def _extract_product_name(details: dict) -> str | None:
  """Extracts the product name from the product data.

  The input argument is a dictionary of the following structure:
@@ -144,7 +76,7 @@
  return details.get("product", {}).get("name")

  @staticmethod
- def extract_url_resolved(details: dict) -> str | None:
+ def _extract_url_resolved(details: dict) -> str | None:
  """Extracts the resolved URL from the product data - this is automatically resolved by Zyte.

  The input argument is a dictionary of the following structure:
@@ -157,7 +89,7 @@
  return details.get("product", {}).get("url")

  @staticmethod
- def extract_product_price(details: dict) -> str | None:
+ def _extract_product_price(details: dict) -> str | None:
  """Extracts the product price from the product data.

  The input argument is a dictionary of the following structure:
@@ -170,7 +102,7 @@
  return details.get("product", {}).get("price")

  @staticmethod
- def extract_product_description(details: dict) -> str | None:
+ def _extract_product_description(details: dict) -> str | None:
  """Extracts the product description from the product data.

  The input argument is a dictionary of the following structure:
@@ -183,7 +115,7 @@
  return details.get("product", {}).get("description")

  @staticmethod
- def extract_image_urls(details: dict) -> List[str]:
+ def _extract_image_urls(details: dict) -> List[str]:
  """Extracts the images from the product data.

  The input argument is a dictionary of the following structure:
@@ -206,7 +138,7 @@
  return images

  @staticmethod
- def extract_probability(details: dict) -> float:
+ def _extract_probability(details: dict) -> float:
  """Extracts the probability from the product data.

  The input argument is a dictionary of the following structure:
@@ -223,7 +155,7 @@
  )

  @staticmethod
- def extract_html(details: dict) -> str | None:
+ def _extract_html(details: dict) -> str | None:
  """Extracts the HTML from the Zyte API response.

  The input argument is a dictionary of the following structure:
@@ -243,6 +175,51 @@
  return decoded_string
  return None

+ def enrich_context(self, product: ProductItem, details: dict) -> ProductItem:
+ product.product_name = self._extract_product_name(details=details)
+
+ url_resolved = self._extract_url_resolved(details=details)
+ if url_resolved:
+ product.url_resolved = url_resolved
+
+ # If the resolved URL is different from the original URL, we also need to update the domain as
+ # otherwise the unresolved domain will be shown.
+ # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
+ if url_resolved and url_resolved != product.url:
+ logger.debug(f"URL resolved for {product.url} is {url_resolved}")
+ product.domain = self._get_domain(url=url_resolved)
+
+ product.product_price = self._extract_product_price(details=details)
+ product.product_description = self._extract_product_description(details=details)
+ product.product_images = self._extract_image_urls(details=details)
+ product.probability = self._extract_probability(details=details)
+ product.html = self._extract_html(details=details)
+ if product.html:
+ soup = BeautifulSoup(product.html, "html.parser")
+ product.html_clean = soup.get_text(separator=" ", strip=True)
+
+ return product
+
+ @staticmethod
+ def keep_product(
+ details: dict,
+ threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
+ ) -> bool:
+ """Determines whether to keep the product based on the probability threshold.
+
+ Args:
+ details: A product details data dictionary.
+ threshold: The probability threshold used to filter the products.
+ """
+ try:
+ prob = float(details["product"]["metadata"]["probability"])
+ except KeyError:
+ logger.warning(
+ f"Product with url={details.get('url')} has no probability value - product is ignored"
+ )
+ return False
+ return prob > threshold
+
  async def unblock_url_content(self, url: str) -> bytes:
  """Unblock the content of an URL using Zyte proxy mode.

@@ -256,3 +233,52 @@
  raise httpx.HTTPError("No httpResponseBody in Zyte response")

  return b64decode(details["httpResponseBody"])
+
+ async def details(self, url: str) -> dict:
+ """Fetches product details for a single URL.
+
+ Args:
+ url: The URL to fetch product details from.
+
+ Returns:
+ A dictionary containing the product details, fields include:
+ (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
+ {
+ "url": str,
+ "statusCode": str,
+ "product": {
+ "name": str,
+ "price": str,
+ "mainImage": {"url": str},
+ "images": [{"url": str}],
+ "description": str,
+ "metadata": {
+ "probability": float,
+ },
+ },
+ "httpResponseBody": base64
+ }
+ """
+ logger.info(f"Fetching product details by Zyte for URL {url}.")
+
+ # Perform the request and retry if necessary. There is some context aware logging:
+ # - `before`: before the request is made (and before retrying)
+ # - `before_sleep`: if the request fails before sleeping
+ retry = get_async_retry()
+ retry.before = lambda retry_state: self._log_before(
+ url=url, retry_state=retry_state
+ )
+ retry.before_sleep = lambda retry_state: self._log_before_sleep(
+ url=url, retry_state=retry_state
+ )
+ async for attempt in retry:
+ with attempt:
+ response = await self._http_client.post(
+ url=self._endpoint,
+ json={"url": url, **self._config},
+ auth=(self._api_key, ""), # API key as username, empty password
+ )
+ response.raise_for_status()
+
+ details = response.json()
+ return details
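keep_product keeps its behaviour and only moves below enrich_context: the extracted probability must exceed the threshold, which defaults to ZYTE_DEFALUT_PROBABILITY_THRESHOLD (0.1). A minimal sketch with an invented payload (keep_product is a staticmethod, so no API key is needed to call it):

    from fraudcrawler import ZyteAPI

    details = {
        "url": "https://example.com/item",
        "product": {"name": "Example item", "metadata": {"probability": 0.42}},
    }
    ZyteAPI.keep_product(details=details)                 # True  (0.42 > 0.1)
    ZyteAPI.keep_product(details=details, threshold=0.5)  # False (0.42 <= 0.5)
    ZyteAPI.keep_product(details={"url": "https://example.com/x"})  # False, logs a warning
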
fraudcrawler/settings.py CHANGED
@@ -78,6 +78,14 @@ ENRICHMENT_DEFAULT_LIMIT = 10
  # Zyte settings
  ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1

+ # Exact match settings
+ EXACT_MATCH_PRODUCT_FIELDS = {
+ "url_resolvedproduct_name",
+ "product_description",
+ "html",
+ }
+ EXACT_MATCH_FIELD_SEPARATOR = "\n"
+
  # Processor settings
  PROCESSOR_DEFAULT_MODEL = "gpt-4o"
  PROCESSOR_DEFAULT_IF_MISSING = -1
fraudcrawler-0.6.3.dist-info/METADATA CHANGED
@@ -1,9 +1,9 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.4
  Name: fraudcrawler
- Version: 0.6.1
+ Version: 0.6.3
  Summary: Intelligent Market Monitoring
- Home-page: https://github.com/open-veanu/fraudcrawler
  License: MIT
+ License-File: LICENSE
  Author: Domingo Bertus
  Author-email: hello@veanu.ch
  Requires-Python: >=3.11,<4.0
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
  Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
  Requires-Dist: httpx (>=0.28.1,<0.29.0)
  Requires-Dist: openai (>=1.68.2,<2.0.0)
fraudcrawler-0.6.3.dist-info/RECORD ADDED
@@ -0,0 +1,24 @@
+ fraudcrawler/__init__.py,sha256=YEbaofjs8pKkwqz4T-kGk7vHIQ_3XtDlF6D63wfuXjE,1008
+ fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/base/base.py,sha256=l3rsXKYKmN_I0GkAXDjpjh_s07cE4siFAwMq0byroQM,7815
+ fraudcrawler/base/client.py,sha256=6xAhQ7hdWYa7CQ84Ps1XUompCTXiSk0e3PywmTMUGng,6146
+ fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
+ fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
+ fraudcrawler/base/orchestrator.py,sha256=UxjJMLm2kpxG76m2TXShuLkqhUxSuYvZyPP2yy708JA,28082
+ fraudcrawler/base/retry.py,sha256=1Ox7RsnnF62dP53rkidRHetA5mr2HS1R-7FskCVbwug,1178
+ fraudcrawler/launch_demo_pipeline.py,sha256=_aDqaPdxE_DMwQY5_vpqF2YjwLkWIZq5Z9Tz3sqLKdg,4629
+ fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/processing/config.py,sha256=xqqTXK7zFA-7zwk76eZwrF97NtMzMOipUY6imeBIjQ8,301
+ fraudcrawler/processing/processor.py,sha256=zetp_G5g4z8sBUq-5qOxVRF2W2h9FIwolVxvMqhTmXs,7619
+ fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/scraping/config.py,sha256=8gyfB0VLi_FZr4J7a-HCTSYt8bRgSKXHo-Y9tlsD2MQ,1179
+ fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
+ fraudcrawler/scraping/search.py,sha256=Anm8ymjCH3BVttogHY-_03YRc64yJswJ8OP8DW56O48,34546
+ fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
+ fraudcrawler/scraping/zyte.py,sha256=sYpfwMuGE9MYpKvma_8x5Th2VBFn25Mqb4Wd7UChL_g,10215
+ fraudcrawler/settings.py,sha256=9ukAkxEzDtvy3xA-jSF3asr9uLIAATNQ-FqrsgCEDUk,4038
+ fraudcrawler-0.6.3.dist-info/METADATA,sha256=pP2S9-MFvCkNwWO7YB9Q9oYmZhsHlIxyljFXvTIfgus,6723
+ fraudcrawler-0.6.3.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ fraudcrawler-0.6.3.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+ fraudcrawler-0.6.3.dist-info/licenses/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+ fraudcrawler-0.6.3.dist-info/RECORD,,
fraudcrawler-0.6.3.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.0.0
+ Generator: poetry-core 2.2.1
  Root-Is-Purelib: true
  Tag: py3-none-any
fraudcrawler-0.6.1.dist-info/RECORD REMOVED
@@ -1,22 +0,0 @@
- fraudcrawler/__init__.py,sha256=oSwuiyVBBk_HZfeZxXJR0ELtA4mc-upsBMVHSwuokEo,846
- fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/base/base.py,sha256=74qwevU8sZBvXAladam0rmjcdn3AiT39MScpxZtD95I,7727
- fraudcrawler/base/client.py,sha256=obxrd65pYja--XQbgpIMsMO6erMNdRG68SzNUs_YvLM,5856
- fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
- fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
- fraudcrawler/base/orchestrator.py,sha256=n0xrMJ9a3g3cRAMmhKEgyrwwrbgsaMno9DeyE93jB5U,27006
- fraudcrawler/base/retry.py,sha256=1Ox7RsnnF62dP53rkidRHetA5mr2HS1R-7FskCVbwug,1178
- fraudcrawler/launch_demo_pipeline.py,sha256=TqlQrs8raT9jIJ3TJK3BOQMLm2qNn2dKaMGL-MyhC70,4635
- fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/processing/processor.py,sha256=zetp_G5g4z8sBUq-5qOxVRF2W2h9FIwolVxvMqhTmXs,7619
- fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
- fraudcrawler/scraping/search.py,sha256=Anm8ymjCH3BVttogHY-_03YRc64yJswJ8OP8DW56O48,34546
- fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
- fraudcrawler/scraping/zyte.py,sha256=SxucVH_wtVhPNImIXvijM528IwL6zl6I3ndf0OdVXY0,8860
- fraudcrawler/settings.py,sha256=Bp9_9w_RRr_-PtZXcy30EKbT9YiOc8OLjEMaNZh06vc,3875
- fraudcrawler-0.6.1.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
- fraudcrawler-0.6.1.dist-info/METADATA,sha256=_LcfOKayMQjAXoCxlJfqYtiSfitegUuQgFUD5XEGFog,6704
- fraudcrawler-0.6.1.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
- fraudcrawler-0.6.1.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
- fraudcrawler-0.6.1.dist-info/RECORD,,