fraudcrawler-0.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -0,0 +1,626 @@
+ from abc import ABC, abstractmethod
+ import asyncio
+ import logging
+ from pydantic import BaseModel, Field
+ from typing import Dict, List, Set, cast
+
+ from fraudcrawler.settings import PROCESSOR_DEFAULT_MODEL, MAX_RETRIES, RETRY_DELAY
+ from fraudcrawler.settings import (
+     DEFAULT_N_SERP_WKRS,
+     DEFAULT_N_ZYTE_WKRS,
+     DEFAULT_N_PROC_WKRS,
+ )
+ from fraudcrawler.settings import PRODUCT_ITEM_DEFAULT_IS_RELEVANT
+ from fraudcrawler.base.base import Deepness, Host, Language, Location, Prompt
+ from fraudcrawler import SerpApi, Enricher, ZyteApi, Processor
+
+ logger = logging.getLogger(__name__)
+
+
+ class ProductItem(BaseModel):
+     """Model representing a product item."""
+
+     # Serp/Enrich parameters
+     search_term: str
+     search_term_type: str
+     url: str
+     marketplace_name: str
+     domain: str
+
+     # Zyte parameters
+     product_name: str | None = None
+     product_price: str | None = None
+     product_description: str | None = None
+     product_images: List[str] | None = None
+     probability: float | None = None
+
+     # Processor parameters are set dynamically, so they are stored in a dict
+     classifications: Dict[str, int] = Field(default_factory=dict)
+
+     # Filtering parameters
+     filtered: bool = False
+     filtered_at_stage: str | None = None
+     is_relevant: int = PRODUCT_ITEM_DEFAULT_IS_RELEVANT
+
+
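For reference, the SERP stage only fills the search metadata above; the Zyte and Processor stages populate the remaining fields later by mutating the same object in place. A minimal construction sketch (the field values and the "relevance" classification key are purely illustrative, not taken from the package):

item = ProductItem(
    search_term="melatonin",
    search_term_type="initial",
    url="https://example.com/product/123",
    marketplace_name="example.com",
    domain="example.com",
)
# Later pipeline stages fill in the remaining fields on the same instance:
item.product_name = "Melatonin 5 mg"
item.probability = 0.93
item.classifications["relevance"] = 1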
+ class Orchestrator(ABC):
+     """Abstract base class for orchestrating the different actors (crawling, processing).
+
+     Abstract methods:
+         _collect_results: Collects the results from the given queue_in.
+
+     Each subclass of :class:`Orchestrator` must implement the abstract method :meth:`_collect_results`.
+     This method is responsible for collecting and handling the results from the given queue_in; it might
+     save the results to a file, a database, or any other storage.
+
+     For each pipeline step, :class:`Orchestrator` deploys a number of async workers to handle the tasks.
+     In addition, it ensures that the workers are shut down only after their relevant workload is done.
+
+     For more information on the orchestration pattern, see README.md.
+     """
+
+     def __init__(
+         self,
+         serpapi_key: str,
+         dataforseo_user: str,
+         dataforseo_pwd: str,
+         zyteapi_key: str,
+         openaiapi_key: str,
+         openai_model: str = PROCESSOR_DEFAULT_MODEL,
+         max_retries: int = MAX_RETRIES,
+         retry_delay: int = RETRY_DELAY,
+         n_serp_wkrs: int = DEFAULT_N_SERP_WKRS,
+         n_zyte_wkrs: int = DEFAULT_N_ZYTE_WKRS,
+         n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
+     ):
+         """Initializes the orchestrator with the given settings.
+
+         Args:
+             serpapi_key: The API key for SERP API.
+             dataforseo_user: The user for DataForSEO.
+             dataforseo_pwd: The password for DataForSEO.
+             zyteapi_key: The API key for Zyte API.
+             openaiapi_key: The API key for OpenAI.
+             openai_model: The model to use for the processing (optional).
+             max_retries: Maximum number of retries for API calls (optional).
+             retry_delay: Delay between retries in seconds (optional).
+             n_serp_wkrs: Number of async workers for serp (optional).
+             n_zyte_wkrs: Number of async workers for zyte (optional).
+             n_proc_wkrs: Number of async workers for the processor (optional).
+         """
+         # Setup the variables
+         self._collected_urls_current_run: Set[str] = set()
+         self._collected_urls_previous_runs: Set[str] = set()
+
+         # Setup the clients
+         self._serpapi = SerpApi(
+             api_key=serpapi_key, max_retries=max_retries, retry_delay=retry_delay
+         )
+         self._enricher = Enricher(user=dataforseo_user, pwd=dataforseo_pwd)
+         self._zyteapi = ZyteApi(
+             api_key=zyteapi_key, max_retries=max_retries, retry_delay=retry_delay
+         )
+         self._processor = Processor(api_key=openaiapi_key, model=openai_model)
+
+         # Setup the async framework
+         self._n_serp_wkrs = n_serp_wkrs
+         self._n_zyte_wkrs = n_zyte_wkrs
+         self._n_proc_wkrs = n_proc_wkrs
+         self._queues: Dict[str, asyncio.Queue] | None = None
+         self._workers: Dict[str, List[asyncio.Task] | asyncio.Task] | None = None
+
+     async def _serp_execute(
+         self,
+         queue_in: asyncio.Queue[dict | None],
+         queue_out: asyncio.Queue[ProductItem | None],
+     ) -> None:
+         """Collects the SerpApi search setups from the queue_in, executes the search, filters the results (country_code) and puts them into queue_out.
+
+         Args:
+             queue_in: The input queue containing the search parameters.
+             queue_out: The output queue to put the found urls.
+         """
+         while True:
+             item = await queue_in.get()
+             if item is None:
+                 queue_in.task_done()
+                 break
+
+             try:
+                 search_term_type = item.pop("search_term_type")
+                 results = await self._serpapi.apply(**item)
+                 logger.debug(
+                     f"SERP API search for {item['search_term']} returned {len(results)} results"
+                 )
+                 for res in results:
+                     product = ProductItem(
+                         search_term=item["search_term"],
+                         search_term_type=search_term_type,
+                         url=res.url,
+                         marketplace_name=res.marketplace_name,
+                         domain=res.domain,
+                         filtered=res.filtered,
+                         filtered_at_stage=res.filtered_at_stage,
+                     )
+                     await queue_out.put(product)
+             except Exception as e:
+                 logger.error(f"Error executing SERP API search: {e}")
+             queue_in.task_done()
+
+     async def _collect_url(
+         self,
+         queue_in: asyncio.Queue[ProductItem | None],
+         queue_out: asyncio.Queue[ProductItem | None],
+     ) -> None:
+         """Collects the URLs from the given queue_in, checks for duplicates, and puts them into the queue_out.
+
+         Args:
+             queue_in: The input queue containing the URLs.
+             queue_out: The output queue to put the URLs.
+         """
+         while True:
+             product = await queue_in.get()
+             if product is None:
+                 queue_in.task_done()
+                 break
+
+             if not product.filtered:
+                 url = product.url
+
+                 if url in self._collected_urls_current_run:
+                     # deduplicate on current run
+                     product.filtered = True
+                     product.filtered_at_stage = (
+                         "URL collection (current run deduplication)"
+                     )
+                     logger.debug(f"URL {url} already collected in current run")
+                 elif url in self._collected_urls_previous_runs:
+                     # deduplicate on previous runs coming from a db
+                     product.filtered = True
+                     product.filtered_at_stage = (
+                         "URL collection (previous run deduplication)"
+                     )
+                     logger.debug(f"URL {url} was already collected in previous run")
+                 else:
+                     self._collected_urls_current_run.add(url)
+
+             await queue_out.put(product)
+             queue_in.task_done()
+
+     async def _zyte_execute(
+         self,
+         queue_in: asyncio.Queue[ProductItem | None],
+         queue_out: asyncio.Queue[ProductItem | None],
+     ) -> None:
+         """Collects the URLs from the queue_in, enriches them with product details metadata, filters them (probability), and puts them into queue_out.
+
+         Args:
+             queue_in: The input queue containing URLs to fetch product details from.
+             queue_out: The output queue to put the enriched product items.
+         """
+         while True:
+             product = await queue_in.get()
+             if product is None:
+                 queue_in.task_done()
+                 break
+
+             if not product.filtered:
+                 try:
+                     # Fetch the product details from Zyte API
+                     details = await self._zyteapi.get_details(url=product.url)
+                     product.product_name = self._zyteapi.extract_product_name(
+                         details=details
+                     )
+                     product.product_price = self._zyteapi.extract_product_price(
+                         details=details
+                     )
+                     product.product_description = (
+                         self._zyteapi.extract_product_description(details=details)
+                     )
+                     product.product_images = self._zyteapi.extract_image_urls(
+                         details=details
+                     )
+                     product.probability = self._zyteapi.extract_probability(
+                         details=details
+                     )
+
+                     # Filter the product based on the probability threshold
+                     if not self._zyteapi.keep_product(details=details):
+                         product.filtered = True
+                         product.filtered_at_stage = "Zyte probability threshold"
+
+                 except Exception as e:
+                     logger.warning(f"Error executing Zyte API search: {e}.")
+
+             await queue_out.put(product)
+             queue_in.task_done()
+
+     async def _proc_execute(
+         self,
+         queue_in: asyncio.Queue[ProductItem | None],
+         queue_out: asyncio.Queue[ProductItem | None],
+         prompts: List[Prompt],
+     ) -> None:
+         """Collects the product details from the queue_in, processes them (filtering, relevance, etc.) and puts the results into queue_out.
+
+         Args:
+             queue_in: The input queue containing the product details.
+             queue_out: The output queue to put the processed product details.
+             prompts: The list of prompts to use for classification.
+         """
+         # Process the products
+         while True:
+             product = await queue_in.get()
+             if product is None:
+                 # End of queue signal
+                 queue_in.task_done()
+                 break
+
+             if not product.filtered:
+                 try:
+                     url = product.url
+                     name = product.product_name
+                     description = product.product_description
+
+                     # Run all the configured prompts
+                     for prompt in prompts:
+                         logger.debug(
+                             f"Classify product {name} with prompt {prompt.name}"
+                         )
+                         classification = await self._processor.classify(
+                             prompt=prompt,
+                             url=url,
+                             name=name,
+                             description=description,
+                         )
+                         product.classifications[prompt.name] = classification
+                 except Exception as e:
+                     logger.warning(f"Error processing product: {e}.")
+
+             await queue_out.put(product)
+             queue_in.task_done()
+
+     @abstractmethod
+     async def _collect_results(
+         self, queue_in: asyncio.Queue[ProductItem | None]
+     ) -> None:
+         """Collects the results from the given queue_in.
+
+         Args:
+             queue_in: The input queue containing the results.
+         """
+         pass
+
+     def _setup_async_framework(
+         self,
+         n_serp_wkrs: int,
+         n_zyte_wkrs: int,
+         n_proc_wkrs: int,
+         prompts: List[Prompt],
+     ) -> None:
+         """Sets up the necessary queues and workers for the async framework.
+
+         Args:
+             n_serp_wkrs: Number of async workers for serp.
+             n_zyte_wkrs: Number of async workers for zyte.
+             n_proc_wkrs: Number of async workers for processor.
+             prompts: The list of prompts used for the classification by :meth:`Processor.classify`.
+         """
+
+         # Setup the input/output queues for the workers
+         serp_queue: asyncio.Queue[dict | None] = asyncio.Queue()
+         url_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
+         zyte_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
+         proc_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
+         res_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
+
+         # Setup the Serp workers
+         serp_wkrs = [
+             asyncio.create_task(
+                 self._serp_execute(
+                     queue_in=serp_queue,
+                     queue_out=url_queue,
+                 )
+             )
+             for _ in range(n_serp_wkrs)
+         ]
+
+         # Setup the URL collector
+         url_col = asyncio.create_task(
+             self._collect_url(queue_in=url_queue, queue_out=zyte_queue)
+         )
+
+         # Setup the Zyte workers
+         zyte_wkrs = [
+             asyncio.create_task(
+                 self._zyte_execute(
+                     queue_in=zyte_queue,
+                     queue_out=proc_queue,
+                 )
+             )
+             for _ in range(n_zyte_wkrs)
+         ]
+
+         # Setup the processing workers
+         proc_wkrs = [
+             asyncio.create_task(
+                 self._proc_execute(
+                     queue_in=proc_queue,
+                     queue_out=res_queue,
+                     prompts=prompts,
+                 )
+             )
+             for _ in range(n_proc_wkrs)
+         ]
+
+         # Setup the result collector
+         res_col = asyncio.create_task(self._collect_results(queue_in=res_queue))
+
+         # Add the setup to the instance variables
+         self._queues = {
+             "serp": serp_queue,
+             "url": url_queue,
+             "zyte": zyte_queue,
+             "proc": proc_queue,
+             "res": res_queue,
+         }
+         self._workers = {
+             "serp": serp_wkrs,
+             "url": url_col,
+             "zyte": zyte_wkrs,
+             "proc": proc_wkrs,
+             "res": res_col,
+         }
+
+     @staticmethod
+     async def _add_serp_items_for_search_term(
+         queue: asyncio.Queue[dict | None],
+         search_term: str,
+         search_term_type: str,
+         language: Language,
+         location: Location,
+         num_results: int,
+         marketplaces: List[Host] | None,
+         excluded_urls: List[Host] | None,
+     ) -> None:
+         """Adds a search-item to the queue."""
+         item = {
+             "search_term": search_term,
+             "search_term_type": search_term_type,
+             "language": language,
+             "location": location,
+             "num_results": num_results,
+             "marketplaces": marketplaces,
+             "excluded_urls": excluded_urls,
+         }
+         logger.debug(f'Adding item="{item}" to serp_queue')
+         await queue.put(item)
+
+     async def _add_serp_items(
+         self,
+         queue: asyncio.Queue[dict | None],
+         search_term: str,
+         language: Language,
+         location: Location,
+         deepness: Deepness,
+         marketplaces: List[Host] | None,
+         excluded_urls: List[Host] | None,
+     ) -> None:
+         """Adds the initial and all enriched search terms (as serp items) to the queue."""
+         common_kwargs = {
+             "queue": queue,
+             "language": language,
+             "location": location,
+             "marketplaces": marketplaces,
+             "excluded_urls": excluded_urls,
+         }
+
+         # Add initial items to the serp_queue
+         await self._add_serp_items_for_search_term(
+             search_term=search_term,
+             search_term_type="initial",
+             num_results=deepness.num_results,
+             **common_kwargs,  # type: ignore[arg-type]
+         )
+
+         # Enrich the search_terms
+         enrichment = deepness.enrichment
+         if enrichment:
+             # Call DataForSEO to get additional terms
+             n_terms = enrichment.additional_terms
+             terms = await self._enricher.apply(
+                 search_term=search_term,
+                 language=language,
+                 location=location,
+                 n_terms=n_terms,
+             )
+
+             # Add the enriched search terms to the serp_queue
+             for trm in terms:
+                 await self._add_serp_items_for_search_term(
+                     search_term=trm,
+                     search_term_type="enriched",
+                     num_results=enrichment.additional_urls_per_term,
+                     **common_kwargs,  # type: ignore[arg-type]
+                 )
+
+     async def run(
+         self,
+         search_term: str,
+         language: Language,
+         location: Location,
+         deepness: Deepness,
+         prompts: List[Prompt],
+         marketplaces: List[Host] | None = None,
+         excluded_urls: List[Host] | None = None,
+         previously_collected_urls: List[str] | None = None,
+     ) -> None:
+         """Runs the pipeline steps (serp, enrich, zyte, process) and collects the results.
+
+         Args:
+             search_term: The search term for the query.
+             language: The language to use for the query.
+             location: The location to use for the query.
+             deepness: The search depth and enrichment details.
+             prompts: The list of prompts to use for classification.
+             marketplaces: The marketplaces to include in the search.
+             excluded_urls: The URLs to exclude from the search.
+             previously_collected_urls: The URLs collected in previous runs; these are skipped during URL collection.
+         """
+
+         # ---------------------------
+         # INITIAL SETUP
+         # ---------------------------
+         if previously_collected_urls:
+             self._collected_urls_previous_runs = set(previously_collected_urls)
+
+         # Setup the async framework
+         n_terms_max = 1 + (
+             deepness.enrichment.additional_terms if deepness.enrichment else 0
+         )
+         n_serp_wkrs = min(self._n_serp_wkrs, n_terms_max)
+         n_zyte_wkrs = min(self._n_zyte_wkrs, deepness.num_results)
+         n_proc_wkrs = min(self._n_proc_wkrs, deepness.num_results)
+
+         logger.debug(
+             f"setting up async framework (#workers: serp={n_serp_wkrs}, zyte={n_zyte_wkrs}, proc={n_proc_wkrs})"
+         )
+         self._setup_async_framework(
+             n_serp_wkrs=n_serp_wkrs,
+             n_zyte_wkrs=n_zyte_wkrs,
+             n_proc_wkrs=n_proc_wkrs,
+             prompts=prompts,
+         )
+
+         # Check setup of async framework
+         if self._queues is None or self._workers is None:
+             raise ValueError(
+                 "Async framework is not set up. Please call _setup_async_framework() first."
+             )
+         if not all([k in self._queues for k in ["serp", "url", "zyte", "proc", "res"]]):
+             raise ValueError(
+                 "The queues of the async framework are not set up correctly."
+             )
+         if not all(
+             [k in self._workers for k in ["serp", "url", "zyte", "proc", "res"]]
+         ):
+             raise ValueError(
+                 "The workers of the async framework are not set up correctly."
+             )
+
+         # Add the search items to the serp_queue
+         serp_queue = self._queues["serp"]
+         await self._add_serp_items(
+             queue=serp_queue,
+             search_term=search_term,
+             language=language,
+             location=location,
+             deepness=deepness,
+             marketplaces=marketplaces,
+             excluded_urls=excluded_urls,
+         )
+
+         # ---------------------------
+         # ORCHESTRATE SERP WORKERS
+         # ---------------------------
+         # Add the sentinels to the serp_queue
+         for _ in range(n_serp_wkrs):
+             await serp_queue.put(None)
+
+         # Wait for the serp workers to conclude before adding the sentinel to the url_queue
+         serp_workers = self._workers["serp"]
+         try:
+             logger.debug("Waiting for serp_workers to conclude their tasks...")
+             serp_res = await asyncio.gather(*serp_workers, return_exceptions=True)
+             for i, res in enumerate(serp_res):
+                 if isinstance(res, Exception):
+                     logger.error(f"Error in serp_worker {i}: {res}")
+             logger.debug("...serp_workers concluded their tasks")
+         except Exception as e:
+             logger.error(f"Gathering serp_workers failed: {e}")
+         finally:
+             await serp_queue.join()
+
+         # ---------------------------
+         # ORCHESTRATE URL COLLECTOR
+         # ---------------------------
+         # Add the sentinel to the url_queue
+         url_queue = self._queues["url"]
+         await url_queue.put(None)
+
+         # Wait for the url_collector to conclude before adding the sentinels to the zyte_queue
+         url_collector = cast(asyncio.Task, self._workers["url"])
+         try:
+             logger.debug("Waiting for url_collector to conclude its tasks...")
+             await url_collector
+             logger.debug("...url_collector concluded its tasks")
+         except Exception as e:
+             logger.error(f"Awaiting url_collector failed: {e}")
+         finally:
+             await url_queue.join()
+
+         # ---------------------------
+         # ORCHESTRATE ZYTE WORKERS
+         # ---------------------------
+         # Add the sentinels to the zyte_queue
+         zyte_queue = self._queues["zyte"]
+         for _ in range(n_zyte_wkrs):
+             await zyte_queue.put(None)
+
+         # Wait for the zyte_workers to conclude before adding the sentinels to the proc_queue
+         zyte_workers = self._workers["zyte"]
+         try:
+             logger.debug("Waiting for zyte_workers to conclude their tasks...")
+             zyte_res = await asyncio.gather(*zyte_workers, return_exceptions=True)
+             for i, res in enumerate(zyte_res):
+                 if isinstance(res, Exception):
+                     logger.error(f"Error in zyte_worker {i}: {res}")
+             logger.debug("...zyte_workers concluded their tasks")
+         except Exception as e:
+             logger.error(f"Gathering zyte_workers failed: {e}")
+         finally:
+             await zyte_queue.join()
+
+         # ---------------------------
+         # ORCHESTRATE PROC WORKERS
+         # ---------------------------
+         # Add the sentinels to the proc_queue
+         proc_queue = self._queues["proc"]
+         for _ in range(n_proc_wkrs):
+             await proc_queue.put(None)
+
+         # Wait for the proc_workers to conclude before adding the sentinel to the res_queue
+         proc_workers = self._workers["proc"]
+         try:
+             logger.debug("Waiting for proc_workers to conclude their tasks...")
+             proc_res = await asyncio.gather(*proc_workers, return_exceptions=True)
+             for i, res in enumerate(proc_res):
+                 if isinstance(res, Exception):
+                     logger.error(f"Error in proc_worker {i}: {res}")
+             logger.debug("...proc_workers concluded their tasks")
+         except Exception as e:
+             logger.error(f"Gathering proc_workers failed: {e}")
+         finally:
+             await proc_queue.join()
+
+         # ---------------------------
+         # ORCHESTRATE RES COLLECTOR
+         # ---------------------------
+         # Add the sentinel to the res_queue
+         res_queue = self._queues["res"]
+         await res_queue.put(None)
+
+         # Wait for the res_collector to conclude
+         res_collector = cast(asyncio.Task, self._workers["res"])
+         try:
+             logger.debug("Waiting for res_collector to conclude its tasks...")
+             await res_collector
+             logger.debug("...res_collector concluded its tasks")
+         except Exception as e:
+             logger.error(f"Awaiting res_collector failed: {e}")
+         finally:
+             await res_queue.join()
+
+         logger.info("Pipeline concluded; async framework is closed")
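
Putting it together, a concrete subclass such as the InMemoryOrchestrator sketched earlier drives all four stages and their shutdown with a single call to run(). The snippet below is a usage sketch, assuming the names from this module (and the InMemoryOrchestrator sketch) are in scope; the environment-variable names are illustrative, and the Language, Location, Deepness and Prompt instances are built from fraudcrawler.base.base, whose constructors are not shown in this diff:

import os

async def crawl(
    language: Language,
    location: Location,
    deepness: Deepness,
    prompts: List[Prompt],
) -> List[ProductItem]:
    orchestrator = InMemoryOrchestrator(
        serpapi_key=os.environ["SERPAPI_KEY"],
        dataforseo_user=os.environ["DATAFORSEO_USER"],
        dataforseo_pwd=os.environ["DATAFORSEO_PWD"],
        zyteapi_key=os.environ["ZYTEAPI_KEY"],
        openaiapi_key=os.environ["OPENAI_API_KEY"],
    )
    await orchestrator.run(
        search_term="melatonin",
        language=language,
        location=location,
        deepness=deepness,
        prompts=prompts,
        previously_collected_urls=[],  # URLs from earlier runs would be deduplicated here
    )
    # Keep only the items that survived every filtering stage; each one carries the
    # Zyte details plus one classification per prompt name.
    return [product for product in orchestrator.results if not product.filtered]

Running asyncio.run(crawl(language, location, deepness, prompts)) then executes the whole pipeline end to end.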