fraudcrawler 0.5.8-py3-none-any.whl → 0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of fraudcrawler might be problematic.
fraudcrawler/__init__.py CHANGED
@@ -1,4 +1,4 @@
- from fraudcrawler.scraping.search import Search, SearchEngineName
+ from fraudcrawler.scraping.search import Searcher, SearchEngineName
  from fraudcrawler.scraping.enrich import Enricher
  from fraudcrawler.scraping.url import URLCollector
  from fraudcrawler.scraping.zyte import ZyteAPI
@@ -17,7 +17,7 @@ from fraudcrawler.base.base import (
  )

  __all__ = [
- "Search",
+ "Searcher",
  "SearchEngineName",
  "Enricher",
  "URLCollector",
fraudcrawler/base/base.py CHANGED
@@ -9,7 +9,8 @@ from pydantic import (
  from pydantic_settings import BaseSettings
  from urllib.parse import urlparse
  import re
- from typing import Any, Dict, List, TYPE_CHECKING
+ from typing import Any, Dict, List
+

  import httpx

@@ -23,9 +24,6 @@ from fraudcrawler.settings import (
  DEFAULT_HTTPX_REDIRECTS,
  )

- if TYPE_CHECKING:
- from fraudcrawler.scraping.zyte import ZyteAPI
-
  logger = logging.getLogger(__name__)

  # Load google locations and languages
@@ -135,7 +133,7 @@ class Deepness(BaseModel):
  class ProductItem(BaseModel):
  """Model representing a product item."""

- # Serp/Enrich parameters
+ # Search parameters
  search_term: str
  search_term_type: str
  url: str
@@ -143,7 +141,7 @@ class ProductItem(BaseModel):
  search_engine_name: str
  domain: str

- # Zyte parameters
+ # Context parameters
  product_name: str | None = None
  product_price: str | None = None
  product_description: str | None = None
@@ -244,35 +242,3 @@ class DomainUtils:
  if hostname and hostname.startswith("www."):
  hostname = hostname[4:]
  return hostname.lower()
-
- async def _unblock_url(self, url: str, zyte_api: "ZyteAPI") -> bytes | None:
- """Attempts to unblock a URL using Zyte proxy mode when direct access fails.
-
- This method is specifically designed to handle 403 Forbidden errors for domains
- that may be blocking requests from certain IP ranges (like cloud providers).
-
- Args:
- url: The URL to fetch using Zyte proxy mode.
- zyte_api: An instance of ZyteAPI to use for the request.
-
- Returns:
- The HTML content as bytes if successful, None if failed.
- """
- try:
- logger.info(f"Attempting to unblock URL using Zyte proxy: {url}")
- details = await zyte_api.details(url)
-
- if details and "httpResponseBody" in details:
- # Decode the base64 content
- import base64
-
- html_content = base64.b64decode(details["httpResponseBody"])
- logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
- return html_content
- else:
- logger.warning(f"Zyte proxy request failed for URL: {url}")
- return None
-
- except Exception as e:
- logger.error(f"Error unblocking URL with Zyte proxy: {url}, error: {e}")
- return None
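
The removed helper's one non-obvious detail is that Zyte's `httpResponseBody` comes back base64-encoded. A standalone sketch of just that decoding step, reconstructed from the removed lines (the shape of the `details` payload is assumed to match what `ZyteAPI.details` returned):

```python
import base64


def decode_zyte_body(details: dict | None) -> bytes | None:
    """Decode Zyte's base64-encoded httpResponseBody (mirrors the removed helper)."""
    if not details or "httpResponseBody" not in details:
        return None
    return base64.b64decode(details["httpResponseBody"])
```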
@@ -103,7 +103,7 @@ class FraudCrawlerClient(Orchestrator):
  search_engines: List[SearchEngineName | str] | None = None,
  previously_collected_urls: List[str] | None = None,
  ) -> None:
- """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.
+ """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.

  Args:
  search_term: The search term for the query.
@@ -10,8 +10,8 @@ from fraudcrawler.settings import (
  PROCESSOR_DEFAULT_MODEL,
  )
  from fraudcrawler.settings import (
- DEFAULT_N_SERP_WKRS,
- DEFAULT_N_ZYTE_WKRS,
+ DEFAULT_N_SRCH_WKRS,
+ DEFAULT_N_CNTX_WKRS,
  DEFAULT_N_PROC_WKRS,
  )
  from fraudcrawler.base.base import (
@@ -24,7 +24,7 @@ from fraudcrawler.base.base import (
  HttpxAsyncClient,
  )
  from fraudcrawler import (
- Search,
+ Searcher,
  SearchEngineName,
  Enricher,
  URLCollector,
@@ -59,8 +59,8 @@ class Orchestrator(ABC):
  zyteapi_key: str,
  openaiapi_key: str,
  openai_model: str = PROCESSOR_DEFAULT_MODEL,
- n_serp_wkrs: int = DEFAULT_N_SERP_WKRS,
- n_zyte_wkrs: int = DEFAULT_N_ZYTE_WKRS,
+ n_srch_wkrs: int = DEFAULT_N_SRCH_WKRS,
+ n_cntx_wkrs: int = DEFAULT_N_CNTX_WKRS,
  n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
  # Configure a custom httpx client.
  # We provide a `HttpxAsyncClient` class that you can pass
@@ -81,8 +81,8 @@ class Orchestrator(ABC):
  zyteapi_key: The API key for Zyte API.
  openaiapi_key: The API key for OpenAI.
  openai_model: The model to use for the processing (optional).
- n_serp_wkrs: Number of async workers for serp (optional).
- n_zyte_wkrs: Number of async workers for zyte (optional).
+ n_srch_wkrs: Number of async workers for the search (optional).
+ n_cntx_wkrs: Number of async workers for context extraction (optional).
  n_proc_wkrs: Number of async workers for the processor (optional).
  http_client: An httpx.AsyncClient to use for the async requests (optional).
  """
@@ -96,8 +96,8 @@ class Orchestrator(ABC):
  self._openai_model = openai_model

  # Setup the async framework
- self._n_serp_wkrs = n_serp_wkrs
- self._n_zyte_wkrs = n_zyte_wkrs
+ self._n_srch_wkrs = n_srch_wkrs
+ self._n_cntx_wkrs = n_cntx_wkrs
  self._n_proc_wkrs = n_proc_wkrs
  self._queues: Dict[str, asyncio.Queue] | None = None
  self._workers: Dict[str, List[asyncio.Task] | asyncio.Task] | None = None
@@ -114,13 +114,10 @@ class Orchestrator(ABC):
  self._owns_http_client = True

  # Setup the clients
- self._zyteapi = ZyteAPI(
- http_client=self._http_client, api_key=self._zyteapi_key
- )
- self._search = Search(
+ self._searcher = Searcher(
  http_client=self._http_client,
  serpapi_key=self._serpapi_key,
- zyte_api=self._zyteapi,
+ zyteapi_key=self._zyteapi_key,
  )
  self._enricher = Enricher(
  http_client=self._http_client,
@@ -128,6 +125,10 @@ class Orchestrator(ABC):
  pwd=self._dataforseo_pwd,
  )
  self._url_collector = URLCollector()
+ self._zyteapi = ZyteAPI(
+ http_client=self._http_client,
+ api_key=self._zyteapi_key,
+ )
  self._processor = Processor(
  http_client=self._http_client,
  api_key=self._openaiapi_key,
@@ -142,7 +143,7 @@ class Orchestrator(ABC):
  await self._http_client.aclose()
  self._http_client = None

- async def _serp_execute(
+ async def _srch_execute(
  self,
  queue_in: asyncio.Queue[dict | None],
  queue_out: asyncio.Queue[ProductItem | None],
@@ -160,17 +161,14 @@ class Orchestrator(ABC):
  break

  try:
+ # Execute the search
  search_term_type = item.pop("search_term_type")
- # The search_engines are already SearchEngineName enum values
- search_engines = item.pop("search_engines")
-
- results = await self._search.apply(
- **item, search_engines=search_engines
- )
-
+ results = await self._searcher.apply(**item)
  logger.debug(
  f"Search for {item['search_term']} returned {len(results)} results"
  )
+
+ # Create ProductItems for each result
  for res in results:
  product = ProductItem(
  search_term=item["search_term"],
@@ -205,31 +203,12 @@ class Orchestrator(ABC):
  break

  if not product.filtered:
- # Clean the URL by removing tracking parameters
- url = self._url_collector.remove_tracking_parameters(product.url)
- product.url = url
-
- if url in self._url_collector.collected_currently:
- # deduplicate on current run
- product.filtered = True
- product.filtered_at_stage = (
- "URL collection (current run deduplication)"
- )
- logger.debug(f"URL {url} already collected in current run")
- elif url in self._url_collector.collected_previously:
- # deduplicate on previous runs coming from a db
- product.filtered = True
- product.filtered_at_stage = (
- "URL collection (previous run deduplication)"
- )
- logger.debug(f"URL {url} as already collected in previous run")
- else:
- self._url_collector.collected_currently.add(url)
+ product = await self._url_collector.apply(product=product)

  await queue_out.put(product)
  queue_in.task_done()

- async def _zyte_execute(
+ async def _cntx_execute(
  self,
  queue_in: asyncio.Queue[ProductItem | None],
  queue_out: asyncio.Queue[ProductItem | None],
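
The inline logic removed above now sits behind `URLCollector.apply`; a sketch of what that method plausibly does, reconstructed from the deleted lines (attribute and method names are taken from the removed code, not from the new `url` module itself):

```python
# Reconstruction of the removed inline deduplication as it would look inside
# URLCollector.apply; a sketch, not the shipped implementation.
async def apply(self, product: ProductItem) -> ProductItem:
    # Strip tracking parameters before comparing URLs.
    url = self.remove_tracking_parameters(product.url)
    product.url = url

    if url in self.collected_currently:
        product.filtered = True
        product.filtered_at_stage = "URL collection (current run deduplication)"
    elif url in self.collected_previously:
        product.filtered = True
        product.filtered_at_stage = "URL collection (previous run deduplication)"
    else:
        self.collected_currently.add(url)
    return product
```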
@@ -248,7 +227,7 @@ class Orchestrator(ABC):

  if not product.filtered:
  try:
- # Fetch the product details from Zyte API
+ # Fetch the product context from Zyte API
  details = await self._zyteapi.details(url=product.url)
  url_resolved = self._zyteapi.extract_url_resolved(details=details)
  if url_resolved:
@@ -258,12 +237,13 @@
  )

  # If the resolved URL is different from the original URL, we also need to update the domain as
- # otherwise the unresolved domain will be shown, for example for unresolved domain toppreis.ch but resolved digitec.ch
+ # otherwise the unresolved domain will be shown.
+ # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
  if url_resolved and url_resolved != product.url:
  logger.debug(
  f"URL resolved for {product.url} is {url_resolved}"
  )
- product.domain = self._search._get_domain(url_resolved)
+ product.domain = self._searcher._get_domain(url_resolved)

  product.product_price = self._zyteapi.extract_product_price(
  details=details
@@ -348,52 +328,52 @@

  def _setup_async_framework(
  self,
- n_serp_wkrs: int,
- n_zyte_wkrs: int,
+ n_srch_wkrs: int,
+ n_cntx_wkrs: int,
  n_proc_wkrs: int,
  prompts: List[Prompt],
  ) -> None:
  """Sets up the necessary queues and workers for the async framework.

  Args:
- n_serp_wkrs: Number of async workers for serp.
- n_zyte_wkrs: Number of async workers for zyte.
- n_proc_wkrs: Number of async workers for processor.
+ n_srch_wkrs: Number of async workers for search.
+ n_cntx_wkrs: Number of async workers for context extraction.
+ n_proc_wkrs: Number of async workers for processing.
  prompts: The list of prompts used for the classification by func:`Processor.classify`.
  """

  # Setup the input/output queues for the workers
- serp_queue: asyncio.Queue[dict | None] = asyncio.Queue()
+ srch_queue: asyncio.Queue[dict | None] = asyncio.Queue()
  url_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
- zyte_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
+ cntx_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
  proc_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
  res_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()

- # Setup the Serp workers
- serp_wkrs = [
+ # Setup the Search workers
+ srch_wkrs = [
  asyncio.create_task(
- self._serp_execute(
- queue_in=serp_queue,
+ self._srch_execute(
+ queue_in=srch_queue,
  queue_out=url_queue,
  )
  )
- for _ in range(n_serp_wkrs)
+ for _ in range(n_srch_wkrs)
  ]

  # Setup the URL collector
  url_col = asyncio.create_task(
- self._collect_url(queue_in=url_queue, queue_out=zyte_queue)
+ self._collect_url(queue_in=url_queue, queue_out=cntx_queue)
  )

- # Setup the Zyte workers
- zyte_wkrs = [
+ # Setup the context extraction workers
+ cntx_wkrs = [
  asyncio.create_task(
- self._zyte_execute(
- queue_in=zyte_queue,
+ self._cntx_execute(
+ queue_in=cntx_queue,
  queue_out=proc_queue,
  )
  )
- for _ in range(n_zyte_wkrs)
+ for _ in range(n_cntx_wkrs)
  ]

  # Setup the processing workers
@@ -413,26 +393,26 @@

  # Add the setup to the instance variables
  self._queues = {
- "serp": serp_queue,
+ "srch": srch_queue,
  "url": url_queue,
- "zyte": zyte_queue,
+ "cntx": cntx_queue,
  "proc": proc_queue,
  "res": res_queue,
  }
  self._workers = {
- "serp": serp_wkrs,
+ "srch": srch_wkrs,
  "url": url_col,
- "zyte": zyte_wkrs,
+ "cntx": cntx_wkrs,
  "proc": proc_wkrs,
  "res": res_col,
  }

  @staticmethod
- async def _add_serp_items_for_search_term(
+ async def _add_search_items_for_search_term(
  queue: asyncio.Queue[dict | None],
  search_term: str,
  search_term_type: str,
- search_engines: List[SearchEngineName],
+ search_engine: SearchEngineName,
  language: Language,
  location: Location,
  num_results: int,
@@ -443,17 +423,17 @@
  item = {
  "search_term": search_term,
  "search_term_type": search_term_type,
- "search_engines": search_engines,
+ "search_engine": search_engine,
  "language": language,
  "location": location,
  "num_results": num_results,
  "marketplaces": marketplaces,
  "excluded_urls": excluded_urls,
  }
- logger.debug(f'Adding item="{item}" to serp_queue')
+ logger.debug(f'Adding item="{item}" to srch_queue')
  await queue.put(item)

- async def _add_serp_items(
+ async def _add_srch_items(
  self,
  queue: asyncio.Queue[dict | None],
  search_term: str,
@@ -464,7 +444,23 @@
  marketplaces: List[Host] | None,
  excluded_urls: List[Host] | None,
  ) -> None:
- """Adds all the (enriched) search_term (as serp items) to the queue."""
+ """Adds all the (enriched) search_term (as srch items) to the queue.
+
+ One item consists of the following parameters:
+ - search_term: The search term for the query.
+ - search_term_type: The type of the search term (initial or enriched).
+ - search_engines: The search engines to use for the query.
+ - language: The language to use for the query.
+ - location: The location to use for the query.
+ - num_results: The number of results to return.
+ - marketplaces: The marketplaces to include in the search.
+ - excluded_urls: The URLs to exclude from the search.
+
+ For constructing such items we essentially have two loops:
+ for each search_term (initial + enriched)
+ for each search_engine
+ add item to queue
+ """
  common_kwargs = {
  "queue": queue,
  "language": language,
@@ -473,14 +469,15 @@
  "excluded_urls": excluded_urls,
  }

- # Add initial items to the serp_queue
- await self._add_serp_items_for_search_term(
- search_term=search_term,
- search_term_type="initial",
- search_engines=search_engines,
- num_results=deepness.num_results,
- **common_kwargs,  # type: ignore[arg-type]
- )
+ # Add initial items to the queue
+ for se in search_engines:
+ await self._add_search_items_for_search_term(
+ search_term=search_term,
+ search_term_type="initial",
+ search_engine=se,
+ num_results=deepness.num_results,
+ **common_kwargs,  # type: ignore[arg-type]
+ )

  # Enrich the search_terms
  enrichment = deepness.enrichment
@@ -494,15 +491,16 @@
  n_terms=n_terms,
  )

- # Add the enriched search terms to the serp_queue
+ # Add the enriched search terms to the queue
  for trm in terms:
- await self._add_serp_items_for_search_term(
- search_term=trm,
- search_term_type="enriched",
- search_engines=search_engines,
- num_results=enrichment.additional_urls_per_term,
- **common_kwargs,  # type: ignore[arg-type]
- )
+ for se in search_engines:
+ await self._add_search_items_for_search_term(
+ search_term=trm,
+ search_term_type="enriched",
+ search_engine=se,
+ num_results=enrichment.additional_urls_per_term,
+ **common_kwargs,  # type: ignore[arg-type]
+ )

  async def run(
  self,
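
Taken together, the two rewritten loops fan the work out to one queue item per (search_term, search_engine) pair instead of one item carrying the whole engine list; a toy illustration of the resulting item count (terms and engine names are made up):

```python
from itertools import product

search_terms = ["term-initial", "term-enriched-1", "term-enriched-2"]  # illustrative
search_engines = ["engine-a", "engine-b"]                              # illustrative

items = [
    {"search_term": term, "search_engine": engine}
    for term, engine in product(search_terms, search_engines)
]
print(len(items))  # 6 -> one srch_queue item per combination
```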
@@ -516,7 +514,7 @@
  excluded_urls: List[Host] | None = None,
  previously_collected_urls: List[str] | None = None,
  ) -> None:
- """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.
+ """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.

  Args:
  search_term: The search term for the query.
@@ -541,22 +539,24 @@

  # Handle previously collected URLs
  if previously_collected_urls:
- self._url_collector.collected_previously = set(previously_collected_urls)
+ self._url_collector.add_previously_collected_urls(
+ urls=previously_collected_urls
+ )

  # Setup the async framework
  n_terms_max = 1 + (
  deepness.enrichment.additional_terms if deepness.enrichment else 0
  )
- n_serp_wkrs = min(self._n_serp_wkrs, n_terms_max)
- n_zyte_wkrs = min(self._n_zyte_wkrs, deepness.num_results)
+ n_srch_wkrs = min(self._n_srch_wkrs, n_terms_max)
+ n_cntx_wkrs = min(self._n_cntx_wkrs, deepness.num_results)
  n_proc_wkrs = min(self._n_proc_wkrs, deepness.num_results)

  logger.debug(
- f"setting up async framework (#workers: serp={n_serp_wkrs}, zyte={n_zyte_wkrs}, proc={n_proc_wkrs})"
+ f"setting up async framework (#workers: srch={n_srch_wkrs}, cntx={n_cntx_wkrs}, proc={n_proc_wkrs})"
  )
  self._setup_async_framework(
- n_serp_wkrs=n_serp_wkrs,
- n_zyte_wkrs=n_zyte_wkrs,
+ n_srch_wkrs=n_srch_wkrs,
+ n_cntx_wkrs=n_cntx_wkrs,
  n_proc_wkrs=n_proc_wkrs,
  prompts=prompts,
  )
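
The sizing above caps each worker pool by the amount of work it can actually receive; a worked sketch of that arithmetic with illustrative numbers:

```python
# Illustrative values only.
configured_srch_wkrs = 10          # self._n_srch_wkrs
configured_cntx_wkrs = 10          # self._n_cntx_wkrs
additional_terms = 3               # deepness.enrichment.additional_terms
num_results = 5                    # deepness.num_results

n_terms_max = 1 + additional_terms                       # initial + enriched terms
n_srch_wkrs = min(configured_srch_wkrs, n_terms_max)     # -> 4
n_cntx_wkrs = min(configured_cntx_wkrs, num_results)     # -> 5
print(n_srch_wkrs, n_cntx_wkrs)
```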
@@ -566,21 +566,21 @@
  raise ValueError(
  "Async framework is not setup. Please call _setup_async_framework() first."
  )
- if not all([k in self._queues for k in ["serp", "url", "zyte", "proc", "res"]]):
+ if not all([k in self._queues for k in ["srch", "url", "cntx", "proc", "res"]]):
  raise ValueError(
  "The queues of the async framework are not setup correctly."
  )
  if not all(
- [k in self._workers for k in ["serp", "url", "zyte", "proc", "res"]]
+ [k in self._workers for k in ["srch", "url", "cntx", "proc", "res"]]
  ):
  raise ValueError(
  "The workers of the async framework are not setup correctly."
  )

- # Add the search items to the serp_queue
- serp_queue = self._queues["serp"]
- await self._add_serp_items(
- queue=serp_queue,
+ # Add the search items to the srch_queue
+ srch_queue = self._queues["srch"]
+ await self._add_srch_items(
+ queue=srch_queue,
  search_term=search_term,
  search_engines=search_engines,
  language=language,
@@ -590,26 +590,26 @@
  excluded_urls=excluded_urls,
  )

- # ---------------------------
- # ORCHESTRATE SERP WORKERS
- # ---------------------------
- # Add the sentinels to the serp_queue
- for _ in range(n_serp_wkrs):
- await serp_queue.put(None)
+ # -----------------------------
+ # ORCHESTRATE SEARCH WORKERS
+ # -----------------------------
+ # Add the sentinels to the srch_queue
+ for _ in range(n_srch_wkrs):
+ await srch_queue.put(None)

- # Wait for the serp workers to be concluded before adding the sentinels to the url_queue
- serp_workers = self._workers["serp"]
+ # Wait for the srch workers to be concluded before adding the sentinels to the url_queue
+ srch_workers = self._workers["srch"]
  try:
- logger.debug("Waiting for serp_workers to conclude their tasks...")
- serp_res = await asyncio.gather(*serp_workers, return_exceptions=True)
- for i, res in enumerate(serp_res):
+ logger.debug("Waiting for srch_workers to conclude their tasks...")
+ srch_res = await asyncio.gather(*srch_workers, return_exceptions=True)
+ for i, res in enumerate(srch_res):
  if isinstance(res, Exception):
- logger.error(f"Error in serp_worker {i}: {res}")
- logger.debug("...serp_workers concluded their tasks")
+ logger.error(f"Error in srch_worker {i}: {res}")
+ logger.debug("...srch_workers concluded their tasks")
  except Exception as e:
- logger.error(f"Gathering serp_workers failed: {e}")
+ logger.error(f"Gathering srch_workers failed: {e}")
  finally:
- await serp_queue.join()
+ await srch_queue.join()

  # ---------------------------
  # ORCHESTRATE URL COLLECTOR
@@ -618,7 +618,7 @@
  url_queue = self._queues["url"]
  await url_queue.put(None)

- # Wait for the url_collector to be concluded before adding the sentinels to the zyte_queue
+ # Wait for the url_collector to be concluded before adding the sentinels to the cntx_queue
  url_collector = cast(asyncio.Task, self._workers["url"])
  try:
  logger.debug("Waiting for url_collector to conclude its tasks...")
@@ -629,27 +629,27 @@
  finally:
  await url_queue.join()

- # ---------------------------
- # ORCHESTRATE ZYTE WORKERS
- # ---------------------------
- # Add the sentinels to the zyte_queue
- zyte_queue = self._queues["zyte"]
- for _ in range(n_zyte_wkrs):
- await zyte_queue.put(None)
+ # -----------------------------
+ # ORCHESTRATE CONTEXT WORKERS
+ # -----------------------------
+ # Add the sentinels to the cntx_queue
+ cntx_queue = self._queues["cntx"]
+ for _ in range(n_cntx_wkrs):
+ await cntx_queue.put(None)

- # Wait for the zyte_workers to be concluded before adding the sentinels to the proc_queue
- zyte_workers = self._workers["zyte"]
+ # Wait for the cntx_workers to be concluded before adding the sentinels to the proc_queue
+ cntx_workers = self._workers["cntx"]
  try:
- logger.debug("Waiting for zyte_workers to conclude their tasks...")
- zyte_res = await asyncio.gather(*zyte_workers, return_exceptions=True)
- for i, res in enumerate(zyte_res):
+ logger.debug("Waiting for cntx_workers to conclude their tasks...")
+ cntx_res = await asyncio.gather(*cntx_workers, return_exceptions=True)
+ for i, res in enumerate(cntx_res):
  if isinstance(res, Exception):
- logger.error(f"Error in zyte_worker {i}: {res}")
- logger.debug("...zyte_workers concluded their tasks")
+ logger.error(f"Error in cntx_worker {i}: {res}")
+ logger.debug("...cntx_workers concluded their tasks")
  except Exception as e:
- logger.error(f"Gathering zyte_workers failed: {e}")
+ logger.error(f"Gathering cntx_workers failed: {e}")
  finally:
- await zyte_queue.join()
+ await cntx_queue.join()

  # ---------------------------
  # ORCHESTRATE PROC WORKERS
@@ -25,16 +25,22 @@ def _is_retryable_exception(err: BaseException) -> bool:
  return True


- def get_async_retry() -> AsyncRetrying:
+ def get_async_retry(
+ stop_after: int = RETRY_STOP_AFTER_ATTEMPT,
+ initial_delay: int = RETRY_INITIAL_DELAY,
+ max_delay: int = RETRY_MAX_DELAY,
+ exp_base: int = RETRY_EXP_BASE,
+ jitter: int = RETRY_JITTER,
+ ) -> AsyncRetrying:
  """returns the retry configuration for async operations."""
  return AsyncRetrying(
  retry=retry_if_exception(_is_retryable_exception),
- stop=stop_after_attempt(RETRY_STOP_AFTER_ATTEMPT),
+ stop=stop_after_attempt(stop_after),
  wait=wait_exponential_jitter(
- initial=RETRY_INITIAL_DELAY,
- max=RETRY_MAX_DELAY,
- exp_base=RETRY_EXP_BASE,
- jitter=RETRY_JITTER,
+ initial=initial_delay,
+ max=max_delay,
+ exp_base=exp_base,
+ jitter=jitter,
  ),
  reraise=True,
  )
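
Because every new parameter defaults to the existing settings constant, call sites that use `get_async_retry()` bare are unaffected; a usage sketch with a tighter policy, following tenacity's `AsyncRetrying` iteration pattern (the HTTP call inside the loop is illustrative, not part of the package):

```python
import httpx


async def fetch_with_retry(url: str) -> bytes:
    # Override the defaults: 3 attempts, exponential backoff capped at 10 s.
    async for attempt in get_async_retry(stop_after=3, initial_delay=1, max_delay=10):
        with attempt:
            async with httpx.AsyncClient() as client:
                resp = await client.get(url)
                resp.raise_for_status()
                return resp.content
    raise RuntimeError("unreachable: reraise=True re-raises the final error")
```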
@@ -97,4 +97,4 @@ def search(search_term: str):


  if __name__ == "__main__":
- search(search_term='Liebherr "TP1410"')
+ search(search_term="electric cigarettes")