fraudcrawler 0.6.2.tar.gz → 0.6.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/PKG-INFO +1 -1
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/__init__.py +4 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/base.py +1 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/client.py +15 -9
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/orchestrator.py +29 -57
- fraudcrawler-0.6.3/fraudcrawler/processing/config.py +12 -0
- fraudcrawler-0.6.3/fraudcrawler/scraping/config.py +32 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/pyproject.toml +1 -1
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/LICENSE +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/README.md +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/retry.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/launch_demo_pipeline.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/processing/processor.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/enrich.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/search.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/url.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/zyte.py +0 -0
- {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/settings.py +0 -0
```diff
--- a/fraudcrawler/__init__.py
+++ b/fraudcrawler/__init__.py
@@ -2,7 +2,9 @@ from fraudcrawler.scraping.search import Searcher, SearchEngineName
 from fraudcrawler.scraping.enrich import Enricher
 from fraudcrawler.scraping.url import URLCollector
 from fraudcrawler.scraping.zyte import ZyteAPI
+from fraudcrawler.scraping.config import ScrapingConfig
 from fraudcrawler.processing.processor import Processor
+from fraudcrawler.processing.config import ProcessingConfig
 from fraudcrawler.base.orchestrator import Orchestrator
 from fraudcrawler.base.client import FraudCrawlerClient
 from fraudcrawler.base.base import (
@@ -22,7 +24,9 @@ __all__ = [
     "Enricher",
     "URLCollector",
     "ZyteAPI",
+    "ScrapingConfig",
     "Processor",
+    "ProcessingConfig",
     "Orchestrator",
     "ProductItem",
     "FraudCrawlerClient",
```
```diff
--- a/fraudcrawler/base/client.py
+++ b/fraudcrawler/base/client.py
@@ -19,7 +19,9 @@ from fraudcrawler.base.base import (
     ProductItem,
 )
 from fraudcrawler.base.orchestrator import Orchestrator
+from fraudcrawler.scraping.config import ScrapingConfig
 from fraudcrawler.scraping.search import SearchEngineName
+from fraudcrawler.processing.config import ProcessingConfig

 logger = logging.getLogger(__name__)

@@ -141,15 +143,19 @@ class FraudCrawlerClient(Orchestrator):

         asyncio.run(
             _run(
-
-
-
-
-
-
-
-
-
+                scraping_config=ScrapingConfig(
+                    search_term=search_term,
+                    search_engines=nrm_search_engines,
+                    language=language,
+                    location=location,
+                    deepness=deepness,
+                    marketplaces=marketplaces,
+                    excluded_urls=excluded_urls,
+                    previously_collected_urls=previously_collected_urls,
+                ),
+                processing_config=ProcessingConfig(
+                    prompts=prompts,
+                ),
             )
         )

```
```diff
--- a/fraudcrawler/base/orchestrator.py
+++ b/fraudcrawler/base/orchestrator.py
@@ -17,11 +17,9 @@ from fraudcrawler.settings import (
     DEFAULT_N_PROC_WKRS,
 )
 from fraudcrawler.base.base import (
-    Deepness,
     Host,
     Language,
     Location,
-    Prompt,
     ProductItem,
     HttpxAsyncClient,
 )
@@ -31,7 +29,9 @@ from fraudcrawler import (
     Enricher,
     ZyteAPI,
     URLCollector,
+    ScrapingConfig,
     Processor,
+    ProcessingConfig,
 )

 logger = logging.getLogger(__name__)
@@ -261,14 +261,14 @@ class Orchestrator(ABC):
         self,
         queue_in: asyncio.Queue[ProductItem | None],
         queue_out: asyncio.Queue[ProductItem | None],
-
+        processing_config: ProcessingConfig,
     ) -> None:
         """Collects the product details from the queue_in, processes them (filtering, relevance, etc.) and puts the results into queue_out.

         Args:
             queue_in: The input queue containing the product details.
             queue_out: The output queue to put the processed product details.
-
+            processing_config: Sets up the processing pipeline step.
         """

         # Process the products
@@ -282,7 +282,7 @@ class Orchestrator(ABC):
             if not product.filtered:
                 try:
                     # Run all the configured prompts
-                    for prompt in prompts:
+                    for prompt in processing_config.prompts:
                         classification = await self._processor.classify(
                             product=product,
                             prompt=prompt,
@@ -318,7 +318,7 @@ class Orchestrator(ABC):
         n_srch_wkrs: int,
         n_cntx_wkrs: int,
         n_proc_wkrs: int,
-
+        processing_config: ProcessingConfig,
     ) -> None:
         """Sets up the necessary queues and workers for the async framework.

@@ -326,7 +326,7 @@ class Orchestrator(ABC):
             n_srch_wkrs: Number of async workers for search.
             n_cntx_wkrs: Number of async workers for context extraction.
             n_proc_wkrs: Number of async workers for processing.
-
+            processing_config: Sets up the processing pipeline step.
         """

         # Setup the input/output queues for the workers
@@ -369,7 +369,7 @@ class Orchestrator(ABC):
                 self._proc_execute(
                     queue_in=proc_queue,
                     queue_out=res_queue,
-
+                    processing_config=processing_config,
                 )
             )
             for _ in range(n_proc_wkrs)
@@ -423,13 +423,7 @@ class Orchestrator(ABC):
     async def _add_srch_items(
         self,
         queue: asyncio.Queue[dict | None],
-
-        search_engines: List[SearchEngineName],
-        language: Language,
-        location: Location,
-        deepness: Deepness,
-        marketplaces: List[Host] | None,
-        excluded_urls: List[Host] | None,
+        scraping_config: ScrapingConfig,
     ) -> None:
         """Adds all the (enriched) search_term (as srch items) to the queue.

@@ -448,12 +442,17 @@ class Orchestrator(ABC):
             for each search_engine
                 add item to queue
         """
+        search_term = scraping_config.search_term
+        search_engines = scraping_config.search_engines
+        language = scraping_config.language
+        location = scraping_config.location
+        deepness = scraping_config.deepness
         common_kwargs = {
             "queue": queue,
             "language": language,
             "location": location,
-            "marketplaces": marketplaces,
-            "excluded_urls": excluded_urls,
+            "marketplaces": scraping_config.marketplaces,
+            "excluded_urls": scraping_config.excluded_urls,
         }

         # Add initial items to the queue
@@ -495,21 +494,15 @@ class Orchestrator(ABC):

         Args:
             search_term: The search term to check.
-
-        Returns:
-            True if the search term contains double quotation marks, False otherwise.
         """
         return '"' in search_term

     @staticmethod
     def _extract_exact_search_terms(search_term: str) -> list[str]:
-        """Extract all exact search terms from within double quotation marks.
+        """Extract all exact search terms from within double quotation marks (empty if no quotes found).

         Args:
             search_term: The search term that may contain double quotation marks.
-
-        Returns:
-            A list of extracted search terms without quotes, or empty list if no quotes found.
         """
         # Find all double-quoted strings
         double_quote_matches = re.findall(r'"([^"]*)"', search_term)
@@ -565,46 +558,31 @@ class Orchestrator(ABC):

     async def run(
         self,
-
-
-        language: Language,
-        location: Location,
-        deepness: Deepness,
-        prompts: List[Prompt],
-        marketplaces: List[Host] | None = None,
-        excluded_urls: List[Host] | None = None,
-        previously_collected_urls: List[str] | None = None,
+        scraping_config: ScrapingConfig,
+        processing_config: ProcessingConfig,
     ) -> None:
         """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.

         Args:
-
-
-            language: The language to use for the query.
-            location: The location to use for the query.
-            deepness: The search depth and enrichment details.
-            prompts: The list of prompt to use for classification.
-            marketplaces: The marketplaces to include in the search.
-            excluded_urls: The URLs to exclude from the search.
-            previously_collected_urls: The urls that have been collected previously and are ignored.
+            scraping_config: Sets up the scraping pipeline step.
+            processing_config: Sets up the processing pipeline step.
         """
         # ---------------------------
         # INITIAL SETUP
         # ---------------------------
-        # Ensure we have at least one search engine
-        if not search_engines:
+        # Ensure we have at least one search engine (the list might be empty)
+        if not scraping_config.search_engines:
             logger.warning(
                 "No search engines specified, using all available search engines"
             )
-            search_engines = list(SearchEngineName)
+            scraping_config.search_engines = list(SearchEngineName)

         # Handle previously collected URLs
-        if previously_collected_urls:
-            self._url_collector.add_previously_collected_urls(
-                urls=previously_collected_urls
-            )
+        if pcurls := scraping_config.previously_collected_urls:
+            self._url_collector.add_previously_collected_urls(urls=pcurls)

         # Setup the async framework
+        deepness = scraping_config.deepness
         n_terms_max = 1 + (
             deepness.enrichment.additional_terms if deepness.enrichment else 0
         )
@@ -619,7 +597,7 @@ class Orchestrator(ABC):
             n_srch_wkrs=n_srch_wkrs,
             n_cntx_wkrs=n_cntx_wkrs,
             n_proc_wkrs=n_proc_wkrs,
-
+            processing_config=processing_config,
         )

         # Check setup of async framework
@@ -642,13 +620,7 @@ class Orchestrator(ABC):
         srch_queue = self._queues["srch"]
         await self._add_srch_items(
             queue=srch_queue,
-
-            search_engines=search_engines,
-            language=language,
-            location=location,
-            deepness=deepness,
-            marketplaces=marketplaces,
-            excluded_urls=excluded_urls,
+            scraping_config=scraping_config,
         )

         # -----------------------------
```
```diff
--- /dev/null
+++ b/fraudcrawler/processing/config.py
@@ -0,0 +1,12 @@
+from pydantic import BaseModel, Field
+from typing import List
+
+from fraudcrawler.base.base import Prompt
+
+
+class ProcessingConfig(BaseModel):
+    """Sets up the processing pipeline step."""
+
+    prompts: List[Prompt] = Field(
+        description="The list of prompts to use for classification."
+    )
```
```diff
--- /dev/null
+++ b/fraudcrawler/scraping/config.py
@@ -0,0 +1,32 @@
+from pydantic import BaseModel, Field
+from typing import List
+
+from fraudcrawler.scraping.search import SearchEngineName
+from fraudcrawler.base.base import (
+    Language,
+    Location,
+    Deepness,
+    Host,
+)
+
+
+class ScrapingConfig(BaseModel):
+    """Sets up the scraping pipeline step."""
+
+    search_term: str = Field(description="The search term for the query.")
+    search_engines: List[SearchEngineName] = Field(
+        description="The list of search engines to use for the search query."
+    )
+    language: Language = Field(description="The language to use for the query.")
+    location: Location = Field(description="The location to use for the query.")
+    deepness: Deepness = Field(description="The search depth and enrichment details.")
+    marketplaces: List[Host] | None = Field(
+        default=None, description="The marketplaces to include in the search."
+    )
+    excluded_urls: List[Host] | None = Field(
+        default=None, description="The URLs to exclude from the search."
+    )
+    previously_collected_urls: List[str] | None = Field(
+        default=None,
+        description="The URLs that have been collected previously and are ignored.",
+    )
```
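Taken together, these hunks replace the flat keyword arguments of `Orchestrator.run()` and `_add_srch_items()` with two pydantic models, `ScrapingConfig` and `ProcessingConfig`, both re-exported from the package root. The sketch below is only illustrative of that new surface: the construction of `Language`, `Location`, `Deepness`, and `Prompt` is unchanged by this release and not shown in the diff (the `...` placeholders stand in for those objects), and the `FraudCrawlerClient()` constructor arguments are likewise assumed.

```python
import asyncio

from fraudcrawler import FraudCrawlerClient, ProcessingConfig, ScrapingConfig

# Assumed to exist already, e.g. built exactly as in a 0.6.2 call site;
# their construction is not part of this diff.
language = ...   # fraudcrawler Language
location = ...   # fraudcrawler Location
deepness = ...   # fraudcrawler Deepness
prompts = [...]  # list of fraudcrawler Prompt objects

scraping_config = ScrapingConfig(
    search_term='"vape pen"',  # double quotes mark exact search terms
    search_engines=[],         # per run(), an empty list falls back to all available engines
    language=language,
    location=location,
    deepness=deepness,
    marketplaces=None,
    excluded_urls=None,
    previously_collected_urls=None,
)
processing_config = ProcessingConfig(prompts=prompts)


async def main() -> None:
    client = FraudCrawlerClient()  # constructor arguments assumed, not shown in this diff
    # Orchestrator.run() (inherited by FraudCrawlerClient) now takes the two
    # config objects instead of the former flat keyword arguments.
    await client.run(
        scraping_config=scraping_config,
        processing_config=processing_config,
    )


asyncio.run(main())
```

Note that the synchronous wrapper in `FraudCrawlerClient` (the `asyncio.run(_run(...))` call shown in the client.py hunk) still accepts the old flat keyword arguments and assembles the two config objects internally, so existing callers of the client are unaffected; only the async `Orchestrator.run()` signature changes.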