fraudcrawler 0.6.1__tar.gz → 0.6.3__tar.gz
This diff shows the changes between publicly released versions of the package as they appear in their public registries; it is provided for informational purposes only.
Potentially problematic release: this version of fraudcrawler might be problematic.
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/PKG-INFO +4 -3
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/__init__.py +4 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/base/base.py +3 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/base/client.py +15 -9
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/base/orchestrator.py +117 -84
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/launch_demo_pipeline.py +1 -1
- fraudcrawler-0.6.3/fraudcrawler/processing/config.py +12 -0
- fraudcrawler-0.6.3/fraudcrawler/scraping/config.py +32 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/scraping/zyte.py +103 -77
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/settings.py +8 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/pyproject.toml +1 -1
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/LICENSE +0 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/README.md +0 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/base/retry.py +0 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/processing/processor.py +0 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/scraping/enrich.py +0 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/scraping/search.py +0 -0
- {fraudcrawler-0.6.1 → fraudcrawler-0.6.3}/fraudcrawler/scraping/url.py +0 -0
PKG-INFO
@@ -1,9 +1,9 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: fraudcrawler
-Version: 0.6.1
+Version: 0.6.3
 Summary: Intelligent Market Monitoring
-Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
+License-File: LICENSE
 Author: Domingo Bertus
 Author-email: hello@veanu.ch
 Requires-Python: >=3.11,<4.0
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
 Requires-Dist: httpx (>=0.28.1,<0.29.0)
 Requires-Dist: openai (>=1.68.2,<2.0.0)
fraudcrawler/__init__.py
@@ -2,7 +2,9 @@ from fraudcrawler.scraping.search import Searcher, SearchEngineName
 from fraudcrawler.scraping.enrich import Enricher
 from fraudcrawler.scraping.url import URLCollector
 from fraudcrawler.scraping.zyte import ZyteAPI
+from fraudcrawler.scraping.config import ScrapingConfig
 from fraudcrawler.processing.processor import Processor
+from fraudcrawler.processing.config import ProcessingConfig
 from fraudcrawler.base.orchestrator import Orchestrator
 from fraudcrawler.base.client import FraudCrawlerClient
 from fraudcrawler.base.base import (
@@ -22,7 +24,9 @@ __all__ = [
     "Enricher",
     "URLCollector",
     "ZyteAPI",
+    "ScrapingConfig",
     "Processor",
+    "ProcessingConfig",
     "Orchestrator",
     "ProductItem",
     "FraudCrawlerClient",
fraudcrawler/base/base.py
@@ -45,6 +45,7 @@ class Setup(BaseSettings):
     dataforseo_pwd: str
     zyteapi_key: str
     openaiapi_key: str
+    pypy_token: str
 
     class Config:
         env_file = ".env"
@@ -140,6 +141,8 @@ class ProductItem(BaseModel):
     url_resolved: str
     search_engine_name: str
     domain: str
+    exact_search: bool = False
+    exact_search_match: bool = False
 
     # Context parameters
     product_name: str | None = None
fraudcrawler/base/client.py
@@ -19,7 +19,9 @@ from fraudcrawler.base.base import (
     ProductItem,
 )
 from fraudcrawler.base.orchestrator import Orchestrator
+from fraudcrawler.scraping.config import ScrapingConfig
 from fraudcrawler.scraping.search import SearchEngineName
+from fraudcrawler.processing.config import ProcessingConfig
 
 logger = logging.getLogger(__name__)
 
@@ -141,15 +143,19 @@ class FraudCrawlerClient(Orchestrator):
 
         asyncio.run(
             _run(
-
-
-
-
-
-
-
-
-
+                scraping_config=ScrapingConfig(
+                    search_term=search_term,
+                    search_engines=nrm_search_engines,
+                    language=language,
+                    location=location,
+                    deepness=deepness,
+                    marketplaces=marketplaces,
+                    excluded_urls=excluded_urls,
+                    previously_collected_urls=previously_collected_urls,
+                ),
+                processing_config=ProcessingConfig(
+                    prompts=prompts,
+                ),
             )
         )
fraudcrawler/base/orchestrator.py
@@ -3,10 +3,12 @@ import asyncio
 import logging
 from typing import cast, Dict, List, Self
 
-from bs4 import BeautifulSoup
 import httpx
+import re
 
 from fraudcrawler.settings import (
+    EXACT_MATCH_PRODUCT_FIELDS,
+    EXACT_MATCH_FIELD_SEPARATOR,
     PROCESSOR_DEFAULT_MODEL,
 )
 from fraudcrawler.settings import (
@@ -15,11 +17,9 @@ from fraudcrawler.settings import (
     DEFAULT_N_PROC_WKRS,
 )
 from fraudcrawler.base.base import (
-    Deepness,
     Host,
     Language,
     Location,
-    Prompt,
     ProductItem,
     HttpxAsyncClient,
 )
@@ -27,9 +27,11 @@ from fraudcrawler import (
     Searcher,
     SearchEngineName,
     Enricher,
-    URLCollector,
     ZyteAPI,
+    URLCollector,
+    ScrapingConfig,
     Processor,
+    ProcessingConfig,
 )
 
 logger = logging.getLogger(__name__)
@@ -227,44 +229,29 @@ class Orchestrator(ABC):
 
             if not product.filtered:
                 try:
-                    # Fetch the product context from Zyte API
+                    # Fetch and enrich the product context from Zyte API
                     details = await self._zyteapi.details(url=product.url)
-
-
-                    product.url_resolved = url_resolved
-                    product.product_name = self._zyteapi.extract_product_name(
-                        details=details
+                    product = self._zyteapi.enrich_context(
+                        product=product, details=details
                     )
 
-                    # If the resolved URL is different from the original URL, we also need to update the domain as
-                    # otherwise the unresolved domain will be shown.
-                    # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
-                    if url_resolved and url_resolved != product.url:
-                        logger.debug(
-                            f"URL resolved for {product.url} is {url_resolved}"
-                        )
-                        product.domain = self._searcher._get_domain(url_resolved)
-
-                    product.product_price = self._zyteapi.extract_product_price(
-                        details=details
-                    )
-                    product.product_description = (
-                        self._zyteapi.extract_product_description(details=details)
-                    )
-                    product.product_images = self._zyteapi.extract_image_urls(
-                        details=details
-                    )
-                    product.probability = self._zyteapi.extract_probability(
-                        details=details
-                    )
-                    product.html = self._zyteapi.extract_html(details=details)
-                    if product.html:
-                        soup = BeautifulSoup(product.html, "html.parser")
-                        product.html_clean = soup.get_text(separator=" ", strip=True)
                     # Filter the product based on the probability threshold
                     if not self._zyteapi.keep_product(details=details):
                         product.filtered = True
-                        product.filtered_at_stage =
+                        product.filtered_at_stage = (
+                            "Context (Zyte probability threshold)"
+                        )
+
+                    # Check for exact match inside the full product context
+                    product = self._check_exact_search(product=product)
+                    if (
+                        not product.filtered
+                        and product.exact_search
+                        and not product.exact_search_match
+                    ):
+                        product.filtered = True
+                        product.filtered_at_stage = "Context (exact search)"
+
                 except Exception as e:
                     logger.warning(f"Error executing Zyte API search: {e}.")
             await queue_out.put(product)
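Condensed, the context worker above now does: enrich the product from the Zyte payload in a single enrich_context call, then apply two filters in order, the Zyte probability threshold and the new exact-search check. A rough standalone sketch of that order (a simplification, not the actual queue-driven worker; the three callables stand in for the ZyteAPI and Orchestrator methods shown in the hunk):

# Sketch only: mirrors the decision order of the hunk above, not the real worker.
async def context_stage(product, zyteapi, check_exact_search):
    details = await zyteapi.details(url=product.url)
    product = zyteapi.enrich_context(product=product, details=details)

    # Gate 1: Zyte's extraction-probability threshold.
    if not zyteapi.keep_product(details=details):
        product.filtered = True
        product.filtered_at_stage = "Context (Zyte probability threshold)"

    # Gate 2: quoted ("exact") search terms must occur in the enriched context.
    product = check_exact_search(product=product)
    if not product.filtered and product.exact_search and not product.exact_search_match:
        product.filtered = True
        product.filtered_at_stage = "Context (exact search)"

    return product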
@@ -274,14 +261,14 @@
         self,
         queue_in: asyncio.Queue[ProductItem | None],
         queue_out: asyncio.Queue[ProductItem | None],
-
+        processing_config: ProcessingConfig,
     ) -> None:
         """Collects the product details from the queue_in, processes them (filtering, relevance, etc.) and puts the results into queue_out.
 
         Args:
             queue_in: The input queue containing the product details.
             queue_out: The output queue to put the processed product details.
-
+            processing_config: Sets up the processing pipeline step.
         """
 
         # Process the products
@@ -295,7 +282,7 @@
             if not product.filtered:
                 try:
                     # Run all the configured prompts
-                    for prompt in prompts:
+                    for prompt in processing_config.prompts:
                         classification = await self._processor.classify(
                             product=product,
                             prompt=prompt,
@@ -331,7 +318,7 @@
         n_srch_wkrs: int,
         n_cntx_wkrs: int,
         n_proc_wkrs: int,
-
+        processing_config: ProcessingConfig,
     ) -> None:
         """Sets up the necessary queues and workers for the async framework.
 
@@ -339,7 +326,7 @@
             n_srch_wkrs: Number of async workers for search.
             n_cntx_wkrs: Number of async workers for context extraction.
             n_proc_wkrs: Number of async workers for processing.
-
+            processing_config: Sets up the processing pipeline step.
         """
 
         # Setup the input/output queues for the workers
@@ -382,7 +369,7 @@
                 self._proc_execute(
                     queue_in=proc_queue,
                     queue_out=res_queue,
-
+                    processing_config=processing_config,
                 )
             )
             for _ in range(n_proc_wkrs)
@@ -436,13 +423,7 @@
     async def _add_srch_items(
         self,
         queue: asyncio.Queue[dict | None],
-
-        search_engines: List[SearchEngineName],
-        language: Language,
-        location: Location,
-        deepness: Deepness,
-        marketplaces: List[Host] | None,
-        excluded_urls: List[Host] | None,
+        scraping_config: ScrapingConfig,
     ) -> None:
         """Adds all the (enriched) search_term (as srch items) to the queue.
 
@@ -461,12 +442,17 @@
             for each search_engine
                 add item to queue
         """
+        search_term = scraping_config.search_term
+        search_engines = scraping_config.search_engines
+        language = scraping_config.language
+        location = scraping_config.location
+        deepness = scraping_config.deepness
         common_kwargs = {
             "queue": queue,
             "language": language,
             "location": location,
-            "marketplaces": marketplaces,
-            "excluded_urls": excluded_urls,
+            "marketplaces": scraping_config.marketplaces,
+            "excluded_urls": scraping_config.excluded_urls,
         }
 
         # Add initial items to the queue
@@ -502,48 +488,101 @@
             **common_kwargs, # type: ignore[arg-type]
         )
 
+    @staticmethod
+    def _is_exact_search(search_term: str) -> bool:
+        """Check if the search term is an exact search (contains double quotation marks).
+
+        Args:
+            search_term: The search term to check.
+        """
+        return '"' in search_term
+
+    @staticmethod
+    def _extract_exact_search_terms(search_term: str) -> list[str]:
+        """Extract all exact search terms from within double quotation marks (empty if no quotes found).
+
+        Args:
+            search_term: The search term that may contain double quotation marks.
+        """
+        # Find all double-quoted strings
+        double_quote_matches = re.findall(r'"([^"]*)"', search_term)
+        return double_quote_matches
+
+    @staticmethod
+    def _check_exact_search_terms_match(
+        product: ProductItem,
+        exact_search_terms: list[str],
+    ) -> bool:
+        """Check if the product, represented by a string of selected attributes, matches ALL of the exact search terms.
+
+        Args:
+            product: The product item.
+            exact_search_terms: List of exact search terms to match against.
+        """
+        field_values = [
+            str(val)
+            for fld in EXACT_MATCH_PRODUCT_FIELDS
+            if (val := getattr(product, fld, None)) is not None
+        ]
+        product_str_lower = EXACT_MATCH_FIELD_SEPARATOR.join(field_values).lower()
+
+        return all(
+            re.search(re.escape(est.lower()), product_str_lower)
+            for est in exact_search_terms
+        )
+
+    def _check_exact_search(self, product: ProductItem) -> ProductItem:
+        """Checks if the search term requests an exact search and if yes, checks for conformity."""
+        # Check for exact search and apply regex matching
+        exact_search = self._is_exact_search(product.search_term)
+        product.exact_search = exact_search
+
+        # Only set exact_search_match if this was an exact search (contains quotes)
+        if exact_search:
+            exact_search_terms = self._extract_exact_search_terms(product.search_term)
+            if exact_search_terms:
+                product.exact_search_match = self._check_exact_search_terms_match(
+                    product=product, exact_search_terms=exact_search_terms
+                )
+                logger.debug(
+                    f"Exact search terms {exact_search_terms} matched: {product.exact_search_match} "
+                    f"for offer with url={product.url}"
+                )
+            else:
+                logger.warning(
+                    f"is_exact_search=True but no exact search terms found in search_term='{product.search_term}' "
+                    f"for offer with url={product.url}"
+                )
+        # If exact_search is False, product.exact_search_match remains False (default value)
+        return product
+
     async def run(
         self,
-
-
-        language: Language,
-        location: Location,
-        deepness: Deepness,
-        prompts: List[Prompt],
-        marketplaces: List[Host] | None = None,
-        excluded_urls: List[Host] | None = None,
-        previously_collected_urls: List[str] | None = None,
+        scraping_config: ScrapingConfig,
+        processing_config: ProcessingConfig,
     ) -> None:
         """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.
 
         Args:
-
-
-            language: The language to use for the query.
-            location: The location to use for the query.
-            deepness: The search depth and enrichment details.
-            prompts: The list of prompt to use for classification.
-            marketplaces: The marketplaces to include in the search.
-            excluded_urls: The URLs to exclude from the search.
-            previously_collected_urls: The urls that have been collected previously and are ignored.
+            scraping_config: Sets up the scraping pipeline step.
+            processing_config: Sets up the processing pipeline step.
         """
         # ---------------------------
         # INITIAL SETUP
         # ---------------------------
-        # Ensure we have at least one search engine
-        if not search_engines:
+        # Ensure we have at least one search engine (the list might be empty)
+        if not scraping_config.search_engines:
             logger.warning(
                 "No search engines specified, using all available search engines"
             )
-            search_engines = list(SearchEngineName)
+            scraping_config.search_engines = list(SearchEngineName)
 
         # Handle previously collected URLs
-        if previously_collected_urls:
-            self._url_collector.add_previously_collected_urls(
-                urls=previously_collected_urls
-            )
+        if pcurls := scraping_config.previously_collected_urls:
+            self._url_collector.add_previously_collected_urls(urls=pcurls)
 
         # Setup the async framework
+        deepness = scraping_config.deepness
         n_terms_max = 1 + (
             deepness.enrichment.additional_terms if deepness.enrichment else 0
         )
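The exact-search logic added above can be read in isolation: double-quoted fragments of the search term are pulled out with a regex, and every one of them must occur, case-insensitively, in a haystack joined from a few product fields. A minimal sketch with made-up data; the field tuple and separator mirror the new EXACT_MATCH_PRODUCT_FIELDS / EXACT_MATCH_FIELD_SEPARATOR settings shown further down:

import re

# Mirrors EXACT_MATCH_PRODUCT_FIELDS and EXACT_MATCH_FIELD_SEPARATOR from the settings;
# the product dict below is invented demo data, not library output.
FIELDS = ("url_resolved", "product_name", "product_description", "html")
SEPARATOR = "\n"


def extract_exact_terms(search_term: str) -> list[str]:
    # Everything between double quotes counts as an exact term.
    return re.findall(r'"([^"]*)"', search_term)


def matches_all_terms(product: dict, terms: list[str]) -> bool:
    # Join the selected fields into one lowercase haystack, then require every term.
    haystack = SEPARATOR.join(
        str(product[field]) for field in FIELDS if product.get(field) is not None
    ).lower()
    return all(re.search(re.escape(term.lower()), haystack) for term in terms)


query = 'melatonin "5 mg" "gummies"'
product = {
    "url_resolved": "https://shop.example/melatonin-5mg-gummies",
    "product_name": "Melatonin 5 mg Gummies",
    "product_description": "60 gummies per pack.",
    "html": None,  # missing fields are simply skipped
}

terms = extract_exact_terms(query)        # ['5 mg', 'gummies']
print(matches_all_terms(product, terms))  # True: both quoted terms occur in the haystack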
@@ -558,7 +597,7 @@
             n_srch_wkrs=n_srch_wkrs,
             n_cntx_wkrs=n_cntx_wkrs,
             n_proc_wkrs=n_proc_wkrs,
-
+            processing_config=processing_config,
         )
 
         # Check setup of async framework
@@ -581,13 +620,7 @@
         srch_queue = self._queues["srch"]
         await self._add_srch_items(
             queue=srch_queue,
-
-            search_engines=search_engines,
-            language=language,
-            location=location,
-            deepness=deepness,
-            marketplaces=marketplaces,
-            excluded_urls=excluded_urls,
+            scraping_config=scraping_config,
         )
 
         # -----------------------------
fraudcrawler/processing/config.py (new file)
@@ -0,0 +1,12 @@
+from pydantic import BaseModel, Field
+from typing import List
+
+from fraudcrawler.base.base import Prompt
+
+
+class ProcessingConfig(BaseModel):
+    """Sets up the processing pipeline step."""
+
+    prompts: List[Prompt] = Field(
+        description="The list of prompts to use for classification."
+    )
fraudcrawler/scraping/config.py (new file)
@@ -0,0 +1,32 @@
+from pydantic import BaseModel, Field
+from typing import List
+
+from fraudcrawler.scraping.search import SearchEngineName
+from fraudcrawler.base.base import (
+    Language,
+    Location,
+    Deepness,
+    Host,
+)
+
+
+class ScrapingConfig(BaseModel):
+    """Sets up the scraping pipeline step."""
+
+    search_term: str = Field(description="The search term for the query.")
+    search_engines: List[SearchEngineName] = Field(
+        description="The list of search engines to use for the search query."
+    )
+    language: Language = Field(description="The language to use for the query.")
+    location: Location = Field(description="The location to use for the query.")
+    deepness: Deepness = Field(description="The search depth and enrichment details.")
+    marketplaces: List[Host] | None = Field(
+        default=None, description="The marketplaces to include in the search."
+    )
+    excluded_urls: List[Host] | None = Field(
+        default=None, description="The URLs to exclude from the search."
+    )
+    previously_collected_urls: List[str] | None = Field(
+        default=None,
+        description="The URLs that have been collected previously and are ignored.",
+    )
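Both new config models use the same pydantic pattern: a field declared only with Field(description=...) is required, while the `... | None = Field(default=None, ...)` fields are optional. A small stand-alone illustration with a hypothetical look-alike model (not the library's own classes; the real ScrapingConfig also needs Language, Location and Deepness objects that are built elsewhere):

from typing import List

from pydantic import BaseModel, Field, ValidationError


class DemoConfig(BaseModel):
    # Same declaration style as ScrapingConfig: no default makes the field required.
    search_term: str = Field(description="The search term for the query.")
    # Optional, like marketplaces / excluded_urls / previously_collected_urls above.
    excluded_urls: List[str] | None = Field(
        default=None, description="The URLs to exclude from the search."
    )


print(DemoConfig(search_term="melatonin").excluded_urls)  # -> None

try:
    DemoConfig()  # the required search_term is missing
except ValidationError as exc:
    print(len(exc.errors()), "validation error")  # -> 1 validation error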
fraudcrawler/scraping/zyte.py
@@ -2,11 +2,12 @@ from base64 import b64decode
 import logging
 from typing import List
 
+from bs4 import BeautifulSoup
 import httpx
 from tenacity import RetryCallState
 
 from fraudcrawler.settings import ZYTE_DEFALUT_PROBABILITY_THRESHOLD
-from fraudcrawler.base.base import DomainUtils
+from fraudcrawler.base.base import DomainUtils, ProductItem
 from fraudcrawler.base.retry import get_async_retry
 
 logger = logging.getLogger(__name__)
@@ -61,77 +62,8 @@ class ZyteAPI(DomainUtils):
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
 
-    async def details(self, url: str) -> dict:
-        """Fetches product details for a single URL.
-
-        Args:
-            url: The URL to fetch product details from.
-
-        Returns:
-            A dictionary containing the product details, fields include:
-            (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
-            {
-                "url": str,
-                "statusCode": str,
-                "product": {
-                    "name": str,
-                    "price": str,
-                    "mainImage": {"url": str},
-                    "images": [{"url": str}],
-                    "description": str,
-                    "metadata": {
-                        "probability": float,
-                    },
-                },
-                "httpResponseBody": base64
-            }
-        """
-        logger.info(f"Fetching product details by Zyte for URL {url}.")
-
-        # Perform the request and retry if necessary. There is some context aware logging:
-        # - `before`: before the request is made (and before retrying)
-        # - `before_sleep`: if the request fails before sleeping
-        retry = get_async_retry()
-        retry.before = lambda retry_state: self._log_before(
-            url=url, retry_state=retry_state
-        )
-        retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            url=url, retry_state=retry_state
-        )
-        async for attempt in retry:
-            with attempt:
-                response = await self._http_client.post(
-                    url=self._endpoint,
-                    json={"url": url, **self._config},
-                    auth=(self._api_key, ""), # API key as username, empty password
-                )
-                response.raise_for_status()
-
-        details = response.json()
-        return details
-
-    @staticmethod
-    def keep_product(
-        details: dict,
-        threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
-    ) -> bool:
-        """Determines whether to keep the product based on the probability threshold.
-
-        Args:
-            details: A product details data dictionary.
-            threshold: The probability threshold used to filter the products.
-        """
-        try:
-            prob = float(details["product"]["metadata"]["probability"])
-        except KeyError:
-            logger.warning(
-                f"Product with url={details.get('url')} has no probability value - product is ignored"
-            )
-            return False
-        return prob > threshold
-
     @staticmethod
-    def
+    def _extract_product_name(details: dict) -> str | None:
         """Extracts the product name from the product data.
 
         The input argument is a dictionary of the following structure:
@@ -144,7 +76,7 @@
         return details.get("product", {}).get("name")
 
     @staticmethod
-    def
+    def _extract_url_resolved(details: dict) -> str | None:
         """Extracts the resolved URL from the product data - this is automatically resolved by Zyte.
 
         The input argument is a dictionary of the following structure:
@@ -157,7 +89,7 @@
         return details.get("product", {}).get("url")
 
     @staticmethod
-    def
+    def _extract_product_price(details: dict) -> str | None:
         """Extracts the product price from the product data.
 
         The input argument is a dictionary of the following structure:
@@ -170,7 +102,7 @@
         return details.get("product", {}).get("price")
 
     @staticmethod
-    def
+    def _extract_product_description(details: dict) -> str | None:
         """Extracts the product description from the product data.
 
         The input argument is a dictionary of the following structure:
@@ -183,7 +115,7 @@
         return details.get("product", {}).get("description")
 
     @staticmethod
-    def
+    def _extract_image_urls(details: dict) -> List[str]:
         """Extracts the images from the product data.
 
         The input argument is a dictionary of the following structure:
@@ -206,7 +138,7 @@
         return images
 
     @staticmethod
-    def
+    def _extract_probability(details: dict) -> float:
         """Extracts the probability from the product data.
 
         The input argument is a dictionary of the following structure:
@@ -223,7 +155,7 @@
         )
 
     @staticmethod
-    def
+    def _extract_html(details: dict) -> str | None:
         """Extracts the HTML from the Zyte API response.
 
         The input argument is a dictionary of the following structure:
@@ -243,6 +175,51 @@
             return decoded_string
         return None
 
+    def enrich_context(self, product: ProductItem, details: dict) -> ProductItem:
+        product.product_name = self._extract_product_name(details=details)
+
+        url_resolved = self._extract_url_resolved(details=details)
+        if url_resolved:
+            product.url_resolved = url_resolved
+
+        # If the resolved URL is different from the original URL, we also need to update the domain as
+        # otherwise the unresolved domain will be shown.
+        # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
+        if url_resolved and url_resolved != product.url:
+            logger.debug(f"URL resolved for {product.url} is {url_resolved}")
+            product.domain = self._get_domain(url=url_resolved)
+
+        product.product_price = self._extract_product_price(details=details)
+        product.product_description = self._extract_product_description(details=details)
+        product.product_images = self._extract_image_urls(details=details)
+        product.probability = self._extract_probability(details=details)
+        product.html = self._extract_html(details=details)
+        if product.html:
+            soup = BeautifulSoup(product.html, "html.parser")
+            product.html_clean = soup.get_text(separator=" ", strip=True)
+
+        return product
+
+    @staticmethod
+    def keep_product(
+        details: dict,
+        threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
+    ) -> bool:
+        """Determines whether to keep the product based on the probability threshold.
+
+        Args:
+            details: A product details data dictionary.
+            threshold: The probability threshold used to filter the products.
+        """
+        try:
+            prob = float(details["product"]["metadata"]["probability"])
+        except KeyError:
+            logger.warning(
+                f"Product with url={details.get('url')} has no probability value - product is ignored"
+            )
+            return False
+        return prob > threshold
+
     async def unblock_url_content(self, url: str) -> bytes:
         """Unblock the content of an URL using Zyte proxy mode.
 
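html_clean is produced from the raw page HTML with BeautifulSoup's get_text call, exactly as the enrich_context lines above do. A tiny illustration with an invented snippet standing in for the decoded httpResponseBody:

from bs4 import BeautifulSoup

# Invented HTML standing in for a decoded httpResponseBody.
html = "<html><body><h1>Melatonin 5 mg</h1><p>60 gummies per pack.</p></body></html>"

soup = BeautifulSoup(html, "html.parser")
# The same call enrich_context uses to fill product.html_clean:
# tags are dropped and the text nodes are joined with single spaces.
print(soup.get_text(separator=" ", strip=True))
# -> "Melatonin 5 mg 60 gummies per pack."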
@@ -256,3 +233,52 @@
             raise httpx.HTTPError("No httpResponseBody in Zyte response")
 
         return b64decode(details["httpResponseBody"])
+
+    async def details(self, url: str) -> dict:
+        """Fetches product details for a single URL.
+
+        Args:
+            url: The URL to fetch product details from.
+
+        Returns:
+            A dictionary containing the product details, fields include:
+            (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
+            {
+                "url": str,
+                "statusCode": str,
+                "product": {
+                    "name": str,
+                    "price": str,
+                    "mainImage": {"url": str},
+                    "images": [{"url": str}],
+                    "description": str,
+                    "metadata": {
+                        "probability": float,
+                    },
+                },
+                "httpResponseBody": base64
+            }
+        """
+        logger.info(f"Fetching product details by Zyte for URL {url}.")
+
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            url=url, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            url=url, retry_state=retry_state
+        )
+        async for attempt in retry:
+            with attempt:
+                response = await self._http_client.post(
+                    url=self._endpoint,
+                    json={"url": url, **self._config},
+                    auth=(self._api_key, ""), # API key as username, empty password
+                )
+                response.raise_for_status()
+
+        details = response.json()
+        return details
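For orientation, a hand-written payload in the shape the details() docstring describes, together with the probability comparison keep_product applies to it; the values are invented and the 0.1 threshold is ZYTE_DEFALUT_PROBABILITY_THRESHOLD from the settings below:

# Invented example of the structure documented in the details() docstring above.
details = {
    "url": "https://shop.example/melatonin-5mg-gummies",
    "statusCode": "200",
    "product": {
        "name": "Melatonin 5 mg Gummies",
        "price": "12.90",
        "mainImage": {"url": "https://shop.example/img/main.jpg"},
        "images": [{"url": "https://shop.example/img/1.jpg"}],
        "description": "60 gummies per pack.",
        "metadata": {"probability": 0.87},
    },
}

# keep_product() keeps an item only if this probability exceeds the threshold (0.1 by default),
# and drops it with a warning if the probability key is missing entirely.
probability = float(details["product"]["metadata"]["probability"])
print(probability > 0.1)  # -> True, so this product would be kept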
fraudcrawler/settings.py
@@ -78,6 +78,14 @@ ENRICHMENT_DEFAULT_LIMIT = 10
 # Zyte settings
 ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
 
+# Exact match settings
+EXACT_MATCH_PRODUCT_FIELDS = {
+    "url_resolved",
+    "product_name",
+    "product_description",
+    "html",
+}
+EXACT_MATCH_FIELD_SEPARATOR = "\n"
+
 # Processor settings
 PROCESSOR_DEFAULT_MODEL = "gpt-4o"
 PROCESSOR_DEFAULT_IF_MISSING = -1