fraudcrawler 0.6.2__tar.gz → 0.6.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of fraudcrawler might be problematic.

Files changed (23)
  1. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/PKG-INFO +1 -1
  2. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/__init__.py +4 -0
  3. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/base.py +1 -0
  4. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/client.py +15 -9
  5. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/orchestrator.py +29 -57
  6. fraudcrawler-0.6.3/fraudcrawler/processing/config.py +12 -0
  7. fraudcrawler-0.6.3/fraudcrawler/scraping/config.py +32 -0
  8. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/pyproject.toml +1 -1
  9. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/LICENSE +0 -0
  10. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/README.md +0 -0
  11. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/__init__.py +0 -0
  12. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/google-languages.json +0 -0
  13. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/google-locations.json +0 -0
  14. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/base/retry.py +0 -0
  15. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/launch_demo_pipeline.py +0 -0
  16. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/processing/__init__.py +0 -0
  17. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/processing/processor.py +0 -0
  18. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/__init__.py +0 -0
  19. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/enrich.py +0 -0
  20. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/search.py +0 -0
  21. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/url.py +0 -0
  22. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/scraping/zyte.py +0 -0
  23. {fraudcrawler-0.6.2 → fraudcrawler-0.6.3}/fraudcrawler/settings.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fraudcrawler
-Version: 0.6.2
+Version: 0.6.3
 Summary: Intelligent Market Monitoring
 License: MIT
 License-File: LICENSE
fraudcrawler/__init__.py
@@ -2,7 +2,9 @@ from fraudcrawler.scraping.search import Searcher, SearchEngineName
 from fraudcrawler.scraping.enrich import Enricher
 from fraudcrawler.scraping.url import URLCollector
 from fraudcrawler.scraping.zyte import ZyteAPI
+from fraudcrawler.scraping.config import ScrapingConfig
 from fraudcrawler.processing.processor import Processor
+from fraudcrawler.processing.config import ProcessingConfig
 from fraudcrawler.base.orchestrator import Orchestrator
 from fraudcrawler.base.client import FraudCrawlerClient
 from fraudcrawler.base.base import (
@@ -22,7 +24,9 @@ __all__ = [
     "Enricher",
     "URLCollector",
     "ZyteAPI",
+    "ScrapingConfig",
     "Processor",
+    "ProcessingConfig",
     "Orchestrator",
     "ProductItem",
     "FraudCrawlerClient",
fraudcrawler/base/base.py
@@ -45,6 +45,7 @@ class Setup(BaseSettings):
     dataforseo_pwd: str
     zyteapi_key: str
     openaiapi_key: str
+    pypy_token: str

     class Config:
         env_file = ".env"
fraudcrawler/base/client.py
@@ -19,7 +19,9 @@ from fraudcrawler.base.base import (
     ProductItem,
 )
 from fraudcrawler.base.orchestrator import Orchestrator
+from fraudcrawler.scraping.config import ScrapingConfig
 from fraudcrawler.scraping.search import SearchEngineName
+from fraudcrawler.processing.config import ProcessingConfig

 logger = logging.getLogger(__name__)

@@ -141,15 +143,19 @@ class FraudCrawlerClient(Orchestrator):

         asyncio.run(
             _run(
-                search_term=search_term,
-                search_engines=nrm_search_engines,
-                language=language,
-                location=location,
-                deepness=deepness,
-                prompts=prompts,
-                marketplaces=marketplaces,
-                excluded_urls=excluded_urls,
-                previously_collected_urls=previously_collected_urls,
+                scraping_config=ScrapingConfig(
+                    search_term=search_term,
+                    search_engines=nrm_search_engines,
+                    language=language,
+                    location=location,
+                    deepness=deepness,
+                    marketplaces=marketplaces,
+                    excluded_urls=excluded_urls,
+                    previously_collected_urls=previously_collected_urls,
+                ),
+                processing_config=ProcessingConfig(
+                    prompts=prompts,
+                ),
             )
         )

fraudcrawler/base/orchestrator.py
@@ -17,11 +17,9 @@ from fraudcrawler.settings import (
     DEFAULT_N_PROC_WKRS,
 )
 from fraudcrawler.base.base import (
-    Deepness,
     Host,
     Language,
     Location,
-    Prompt,
     ProductItem,
     HttpxAsyncClient,
 )
@@ -31,7 +29,9 @@ from fraudcrawler import (
     Enricher,
     ZyteAPI,
     URLCollector,
+    ScrapingConfig,
     Processor,
+    ProcessingConfig,
 )

 logger = logging.getLogger(__name__)
@@ -261,14 +261,14 @@ class Orchestrator(ABC):
         self,
         queue_in: asyncio.Queue[ProductItem | None],
         queue_out: asyncio.Queue[ProductItem | None],
-        prompts: List[Prompt],
+        processing_config: ProcessingConfig,
     ) -> None:
         """Collects the product details from the queue_in, processes them (filtering, relevance, etc.) and puts the results into queue_out.

         Args:
             queue_in: The input queue containing the product details.
             queue_out: The output queue to put the processed product details.
-            prompts: The list of prompts to use for classification.
+            processing_config: Sets up the processing pipeline step.
         """

         # Process the products
@@ -282,7 +282,7 @@ class Orchestrator(ABC):
             if not product.filtered:
                 try:
                     # Run all the configured prompts
-                    for prompt in prompts:
+                    for prompt in processing_config.prompts:
                         classification = await self._processor.classify(
                             product=product,
                             prompt=prompt,
@@ -318,7 +318,7 @@ class Orchestrator(ABC):
         n_srch_wkrs: int,
         n_cntx_wkrs: int,
         n_proc_wkrs: int,
-        prompts: List[Prompt],
+        processing_config: ProcessingConfig,
     ) -> None:
         """Sets up the necessary queues and workers for the async framework.

@@ -326,7 +326,7 @@ class Orchestrator(ABC):
             n_srch_wkrs: Number of async workers for search.
             n_cntx_wkrs: Number of async workers for context extraction.
             n_proc_wkrs: Number of async workers for processing.
-            prompts: The list of prompts used for the classification by func:`Processor.classify`.
+            processing_config: Sets up the processing pipeline step.
         """

         # Setup the input/output queues for the workers
@@ -369,7 +369,7 @@ class Orchestrator(ABC):
                 self._proc_execute(
                     queue_in=proc_queue,
                     queue_out=res_queue,
-                    prompts=prompts,
+                    processing_config=processing_config,
                 )
             )
             for _ in range(n_proc_wkrs)
@@ -423,13 +423,7 @@ class Orchestrator(ABC):
     async def _add_srch_items(
         self,
         queue: asyncio.Queue[dict | None],
-        search_term: str,
-        search_engines: List[SearchEngineName],
-        language: Language,
-        location: Location,
-        deepness: Deepness,
-        marketplaces: List[Host] | None,
-        excluded_urls: List[Host] | None,
+        scraping_config: ScrapingConfig,
     ) -> None:
         """Adds all the (enriched) search_term (as srch items) to the queue.

@@ -448,12 +442,17 @@ class Orchestrator(ABC):
             for each search_engine
                 add item to queue
        """
+        search_term = scraping_config.search_term
+        search_engines = scraping_config.search_engines
+        language = scraping_config.language
+        location = scraping_config.location
+        deepness = scraping_config.deepness
        common_kwargs = {
            "queue": queue,
            "language": language,
            "location": location,
-            "marketplaces": marketplaces,
-            "excluded_urls": excluded_urls,
+            "marketplaces": scraping_config.marketplaces,
+            "excluded_urls": scraping_config.excluded_urls,
        }

        # Add initial items to the queue
@@ -495,21 +494,15 @@ class Orchestrator(ABC):

         Args:
             search_term: The search term to check.
-
-        Returns:
-            True if the search term contains double quotation marks, False otherwise.
        """
        return '"' in search_term

    @staticmethod
    def _extract_exact_search_terms(search_term: str) -> list[str]:
-        """Extract all exact search terms from within double quotation marks.
+        """Extract all exact search terms from within double quotation marks (empty if no quotes found).

        Args:
            search_term: The search term that may contain double quotation marks.
-
-        Returns:
-            A list of extracted search terms without quotes, or empty list if no quotes found.
        """
        # Find all double-quoted strings
        double_quote_matches = re.findall(r'"([^"]*)"', search_term)
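
The helper above checks for quotes with '"' in search_term and pulls the quoted phrases out with re.findall. A minimal standalone illustration of that behaviour, using a made-up search term that is not taken from the package:

import re

# The pattern used by _extract_exact_search_terms: capture everything between
# a pair of double quotation marks.
search_term = 'buy "brand x" online "free shipping"'   # hypothetical input
print('"' in search_term)                     # True  (the _contains_exact_search_term check)
print(re.findall(r'"([^"]*)"', search_term))  # ['brand x', 'free shipping']
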
@@ -565,46 +558,31 @@ class Orchestrator(ABC):

     async def run(
         self,
-        search_term: str,
-        search_engines: List[SearchEngineName],
-        language: Language,
-        location: Location,
-        deepness: Deepness,
-        prompts: List[Prompt],
-        marketplaces: List[Host] | None = None,
-        excluded_urls: List[Host] | None = None,
-        previously_collected_urls: List[str] | None = None,
+        scraping_config: ScrapingConfig,
+        processing_config: ProcessingConfig,
     ) -> None:
         """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.

         Args:
-            search_term: The search term for the query.
-            search_engines: The list of search engines to use for the search query.
-            language: The language to use for the query.
-            location: The location to use for the query.
-            deepness: The search depth and enrichment details.
-            prompts: The list of prompt to use for classification.
-            marketplaces: The marketplaces to include in the search.
-            excluded_urls: The URLs to exclude from the search.
-            previously_collected_urls: The urls that have been collected previously and are ignored.
+            scraping_config: Sets up the scraping pipeline step.
+            processing_config: Sets up the processing pipeline step.
        """
        # ---------------------------
        # INITIAL SETUP
        # ---------------------------
-        # Ensure we have at least one search engine
-        if not search_engines:
+        # Ensure we have at least one search engine (the list might be empty)
+        if not scraping_config.search_engines:
            logger.warning(
                "No search engines specified, using all available search engines"
            )
-            search_engines = list(SearchEngineName)
+            scraping_config.search_engines = list(SearchEngineName)

        # Handle previously collected URLs
-        if previously_collected_urls:
-            self._url_collector.add_previously_collected_urls(
-                urls=previously_collected_urls
-            )
+        if pcurls := scraping_config.previously_collected_urls:
+            self._url_collector.add_previously_collected_urls(urls=pcurls)

        # Setup the async framework
+        deepness = scraping_config.deepness
        n_terms_max = 1 + (
            deepness.enrichment.additional_terms if deepness.enrichment else 0
        )
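
The previously-collected-URLs branch in the hunk above now binds and tests in one step with an assignment expression (pcurls := ...), so the collector is only called when the list is non-empty. A minimal standalone sketch of the same pattern, with made-up names:

from typing import List, Optional

def register(urls: Optional[List[str]]) -> None:
    # The walrus operator assigns and evaluates in a single expression:
    # the body runs only when urls is neither None nor empty.
    if found := urls:
        print(f"registering {len(found)} previously collected URLs")

register(None)                     # prints nothing
register([])                       # prints nothing
register(["https://example.com"])  # registering 1 previously collected URLs
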
@@ -619,7 +597,7 @@ class Orchestrator(ABC):
             n_srch_wkrs=n_srch_wkrs,
             n_cntx_wkrs=n_cntx_wkrs,
             n_proc_wkrs=n_proc_wkrs,
-            prompts=prompts,
+            processing_config=processing_config,
         )

         # Check setup of async framework
@@ -642,13 +620,7 @@ class Orchestrator(ABC):
         srch_queue = self._queues["srch"]
         await self._add_srch_items(
             queue=srch_queue,
-            search_term=search_term,
-            search_engines=search_engines,
-            language=language,
-            location=location,
-            deepness=deepness,
-            marketplaces=marketplaces,
-            excluded_urls=excluded_urls,
+            scraping_config=scraping_config,
         )

         # -----------------------------
fraudcrawler/processing/config.py (new file)
@@ -0,0 +1,12 @@
+from pydantic import BaseModel, Field
+from typing import List
+
+from fraudcrawler.base.base import Prompt
+
+
+class ProcessingConfig(BaseModel):
+    """Sets up the processing pipeline step."""
+
+    prompts: List[Prompt] = Field(
+        description="The list of prompts to use for classification."
+    )
fraudcrawler/scraping/config.py (new file)
@@ -0,0 +1,32 @@
+from pydantic import BaseModel, Field
+from typing import List
+
+from fraudcrawler.scraping.search import SearchEngineName
+from fraudcrawler.base.base import (
+    Language,
+    Location,
+    Deepness,
+    Host,
+)
+
+
+class ScrapingConfig(BaseModel):
+    """Sets up the scraping pipeline step."""
+
+    search_term: str = Field(description="The search term for the query.")
+    search_engines: List[SearchEngineName] = Field(
+        description="The list of search engines to use for the search query."
+    )
+    language: Language = Field(description="The language to use for the query.")
+    location: Location = Field(description="The location to use for the query.")
+    deepness: Deepness = Field(description="The search depth and enrichment details.")
+    marketplaces: List[Host] | None = Field(
+        default=None, description="The marketplaces to include in the search."
+    )
+    excluded_urls: List[Host] | None = Field(
+        default=None, description="The URLs to exclude from the search."
+    )
+    previously_collected_urls: List[str] | None = Field(
+        default=None,
+        description="The URLs that have been collected previously and are ignored.",
+    )
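
Both new config modules follow the same pydantic pattern: required fields are declared with Field(description=...) and optional fields add default=None. A minimal sketch of what that buys the caller, using a stand-in model rather than the actual fraudcrawler classes:

from typing import List
from pydantic import BaseModel, Field, ValidationError

class DemoConfig(BaseModel):
    """Stand-in for ScrapingConfig/ProcessingConfig; illustration only."""

    search_term: str = Field(description="The search term for the query.")
    excluded_urls: List[str] | None = Field(
        default=None, description="The URLs to exclude from the search."
    )

cfg = DemoConfig(search_term="counterfeit watches")
print(cfg.excluded_urls)  # None: optional fields fall back to their defaults

try:
    DemoConfig()  # the required search_term is missing
except ValidationError as exc:
    print(type(exc).__name__)  # ValidationError: required fields are enforced
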
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.poetry]
 name = "fraudcrawler"
-version = "0.6.2"
+version = "0.6.3"
 description = "Intelligent Market Monitoring"
 authors = [
     "Domingo Bertus <hello@veanu.ch>",