fraudcrawler-0.7.21-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -0,0 +1,696 @@
1
+ from abc import ABC, abstractmethod
2
+ import asyncio
3
+ import logging
4
+ from typing import cast, Dict, List
5
+
6
+ import re
7
+
8
+ from fraudcrawler.settings import (
9
+ EXACT_MATCH_PRODUCT_FIELDS,
10
+ EXACT_MATCH_FIELD_SEPARATOR,
11
+ )
12
+ from fraudcrawler.settings import (
13
+ DEFAULT_N_SRCH_WKRS,
14
+ DEFAULT_N_CNTX_WKRS,
15
+ DEFAULT_N_PROC_WKRS,
16
+ )
17
+ from fraudcrawler.base.base import (
18
+ Host,
19
+ Language,
20
+ Location,
21
+ Deepness,
22
+ ProductItem,
23
+ )
24
+ from fraudcrawler import (
25
+ Searcher,
26
+ SearchEngineName,
27
+ Enricher,
28
+ ZyteAPI,
29
+ URLCollector,
30
+ Processor,
31
+ )
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ class Orchestrator(ABC):
37
+ """Abstract base class for orchestrating the different actors (scraping, processing).
38
+
39
+ Any subclass of :class:`Orchestrator` orchestrates the complete pipeline: search,
40
+ deduplication, context extraction, processing (classification), and result collection.
41
+
42
+ Abstract methods:
43
+ _collect_results: Collects the results from the given queue_in.
44
+ This function is responsible for collecting and handling the results from the given queue_in. It might
45
+ save the results to a file, a database, or any other storage.
46
+
47
+ For each pipeline step :class:`Orchestrator` will deploy a number of async workers to handle the tasks.
48
+ In addition, it ensures that the workers are cancelled only after the relevant workload is done.
49
+
50
+ For more information on the orchestration pattern, see README.md.
51
+ """
52
+
53
+ def __init__(
54
+ self,
55
+ searcher: Searcher,
56
+ enricher: Enricher,
57
+ url_collector: URLCollector,
58
+ zyteapi: ZyteAPI,
59
+ processor: Processor,
60
+ n_srch_wkrs: int = DEFAULT_N_SRCH_WKRS,
61
+ n_cntx_wkrs: int = DEFAULT_N_CNTX_WKRS,
62
+ n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
63
+ ):
64
+ """Initializes the orchestrator with the given settings.
65
+
66
+ Args:
67
+ searcher: Client for searching step.
68
+ enricher: Client for enrichment step.
69
+ url_collector: Client for deduplication.
70
+ zyteapi: Client for metadata extraction.
71
+ processor: Client for product classification.
72
+ n_srch_wkrs: Number of async workers for the search (optional).
73
+ n_cntx_wkrs: Number of async workers for context extraction (optional).
74
+ n_proc_wkrs: Number of async workers for the processor (optional).
75
+ """
76
+
77
+ # Pipeline clients
78
+ self._searcher = searcher
79
+ self._enricher = enricher
80
+ self._url_collector = url_collector
81
+ self._zyteapi = zyteapi
82
+ self._processor = processor
83
+
84
+ # Setup the async framework
85
+ self._n_srch_wkrs = n_srch_wkrs
86
+ self._n_cntx_wkrs = n_cntx_wkrs
87
+ self._n_proc_wkrs = n_proc_wkrs
88
+ self._queues: Dict[str, asyncio.Queue] | None = None
89
+ self._workers: Dict[str, List[asyncio.Task] | asyncio.Task] | None = None
90
+
91
+ async def _srch_execute(
92
+ self,
93
+ queue_in: asyncio.Queue[dict | None],
94
+ queue_out: asyncio.Queue[ProductItem | None],
95
+ ) -> None:
96
+ """Collects the search setups from the queue_in, executes the search, filters the results and puts them into queue_out.
97
+
98
+ Args:
99
+ queue_in: The input queue containing the search parameters.
100
+ queue_out: The output queue to put the found URLs.
101
+ """
102
+ while True:
103
+ item = await queue_in.get()
104
+ if item is None:
105
+ queue_in.task_done()
106
+ break
107
+
108
+ try:
109
+ # Execute the search
110
+ search_term_type = item.pop("search_term_type")
111
+ results = await self._searcher.apply(**item)
112
+ logger.debug(
113
+ f"Search for {item['search_term']} returned {len(results)} results"
114
+ )
115
+
116
+ # Create ProductItems for each result
117
+ for res in results:
118
+ product = ProductItem(
119
+ search_term=item["search_term"],
120
+ search_term_type=search_term_type,
121
+ url=res.url,
122
+ url_resolved=res.url, # Set initial value, will be updated by Zyte
123
+ search_engine_name=res.search_engine_name,
124
+ domain=res.domain,
125
+ filtered=res.filtered,
126
+ filtered_at_stage=res.filtered_at_stage,
127
+ )
128
+ await queue_out.put(product)
129
+ except Exception:
130
+ logger.error(
131
+ f"Running search failed with item={item}",
132
+ exc_info=True,
133
+ )
134
+ queue_in.task_done()
135
+
136
+ async def _collect_url(
137
+ self,
138
+ queue_in: asyncio.Queue[ProductItem | None],
139
+ queue_out: asyncio.Queue[ProductItem | None],
140
+ ) -> None:
141
+ """Collects the URLs from the given queue_in, checks for duplicates, and puts them into the queue_out.
142
+
143
+ Args:
144
+ queue_in: The input queue containing the URLs.
145
+ queue_out: The output queue to put the URLs.
146
+ """
147
+ while True:
148
+ product = await queue_in.get()
149
+ if product is None:
150
+ queue_in.task_done()
151
+ break
152
+
153
+ if not product.filtered:
154
+ product = await self._url_collector.apply(product=product)
155
+
156
+ await queue_out.put(product)
157
+ queue_in.task_done()
158
+
159
+ async def _cntx_execute(
160
+ self,
161
+ queue_in: asyncio.Queue[ProductItem | None],
162
+ queue_out: asyncio.Queue[ProductItem | None],
163
+ ) -> None:
164
+ """Collects the URLs from the queue_in, enriches it with product details metadata, filters them (probability), and puts them into queue_out.
165
+
166
+ Args:
167
+ queue_in: The input queue containing URLs to fetch product details from.
168
+ queue_out: The output queue to put the product details as dictionaries.
169
+ """
170
+ while True:
171
+ product = await queue_in.get()
172
+ if product is None:
173
+ queue_in.task_done()
174
+ break
175
+
176
+ if not product.filtered:
177
+ try:
178
+ # Fetch and enrich the product context from Zyte API
179
+ details = await self._zyteapi.details(url=product.url)
180
+ product = self._zyteapi.enrich_context(
181
+ product=product, details=details
182
+ )
183
+
184
+ # Filter the product based on the probability threshold
185
+ if not self._zyteapi.keep_product(details=details):
186
+ product.filtered = True
187
+ product.filtered_at_stage = (
188
+ "Context (Zyte probability threshold)"
189
+ )
190
+
191
+ # Check for exact match inside the full product context
192
+ product = self._check_exact_search(product=product)
193
+ if (
194
+ not product.filtered
195
+ and product.exact_search
196
+ and not product.exact_search_match
197
+ ):
198
+ product.filtered = True
199
+ product.filtered_at_stage = "Context (exact search)"
200
+
201
+ except Exception:
202
+ logger.error(
203
+ f"Running Zyte API search failed for product with url={product.url_resolved}",
204
+ exc_info=True,
205
+ )
206
+ await queue_out.put(product)
207
+ queue_in.task_done()
208
+
209
+ async def _proc_execute(
210
+ self,
211
+ queue_in: asyncio.Queue[ProductItem | None],
212
+ queue_out: asyncio.Queue[ProductItem | None],
213
+ ) -> None:
214
+ """Collects the product details from the queue_in, processes them (filtering, relevance, etc.) and puts the results into queue_out.
215
+
216
+ Args:
217
+ queue_in: The input queue containing the product details.
218
+ queue_out: The output queue to put the processed product details.
219
+ """
220
+
221
+ # Process the products
222
+ while True:
223
+ product = await queue_in.get()
224
+ if product is None:
225
+ # End of queue signal
226
+ queue_in.task_done()
227
+ break
228
+
229
+ if not product.filtered:
230
+ try:
231
+ # Run the configured workflows
232
+ product = await self._processor.run(product=product)
233
+ except Exception:
234
+ logger.error(
235
+ f"Processing product with url={product.url_resolved} failed",
236
+ exc_info=True,
237
+ )
238
+
239
+ await queue_out.put(product)
240
+ queue_in.task_done()
241
+
242
+ @abstractmethod
243
+ async def _collect_results(
244
+ self, queue_in: asyncio.Queue[ProductItem | None]
245
+ ) -> None:
246
+ """Collects the results from the given queue_in.
247
+
248
+ Args:
249
+ queue_in: The input queue containing the results.
250
+ """
251
+ pass
252
+
253
+ def _setup_async_framework(
254
+ self,
255
+ n_srch_wkrs: int,
256
+ n_cntx_wkrs: int,
257
+ n_proc_wkrs: int,
258
+ ) -> None:
259
+ """Sets up the necessary queues and workers for the async framework.
260
+
261
+ Args:
262
+ n_srch_wkrs: Number of async workers for search.
263
+ n_cntx_wkrs: Number of async workers for context extraction.
264
+ n_proc_wkrs: Number of async workers for processing.
265
+ """
266
+
267
+ # Setup the input/output queues for the workers
268
+ srch_queue: asyncio.Queue[dict | None] = asyncio.Queue()
269
+ url_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
270
+ cntx_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
271
+ proc_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
272
+ res_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
273
+
274
+ # Setup the Search workers
275
+ srch_wkrs = [
276
+ asyncio.create_task(
277
+ self._srch_execute(
278
+ queue_in=srch_queue,
279
+ queue_out=url_queue,
280
+ )
281
+ )
282
+ for _ in range(n_srch_wkrs)
283
+ ]
284
+
285
+ # Setup the URL collector
286
+ url_col = asyncio.create_task(
287
+ self._collect_url(queue_in=url_queue, queue_out=cntx_queue)
288
+ )
289
+
290
+ # Setup the context extraction workers
291
+ cntx_wkrs = [
292
+ asyncio.create_task(
293
+ self._cntx_execute(
294
+ queue_in=cntx_queue,
295
+ queue_out=proc_queue,
296
+ )
297
+ )
298
+ for _ in range(n_cntx_wkrs)
299
+ ]
300
+
301
+ # Setup the processing workers
302
+ proc_wkrs = [
303
+ asyncio.create_task(
304
+ self._proc_execute(
305
+ queue_in=proc_queue,
306
+ queue_out=res_queue,
307
+ )
308
+ )
309
+ for _ in range(n_proc_wkrs)
310
+ ]
311
+
312
+ # Setup the result collector
313
+ res_col = asyncio.create_task(self._collect_results(queue_in=res_queue))
314
+
315
+ # Add the setup to the instance variables
316
+ self._queues = {
317
+ "srch": srch_queue,
318
+ "url": url_queue,
319
+ "cntx": cntx_queue,
320
+ "proc": proc_queue,
321
+ "res": res_queue,
322
+ }
323
+ self._workers = {
324
+ "srch": srch_wkrs,
325
+ "url": url_col,
326
+ "cntx": cntx_wkrs,
327
+ "proc": proc_wkrs,
328
+ "res": res_col,
329
+ }
330
+
331
+ @staticmethod
332
+ async def _add_search_items_for_search_term(
333
+ queue: asyncio.Queue[dict | None],
334
+ search_term: str,
335
+ search_term_type: str,
336
+ search_engine: SearchEngineName,
337
+ language: Language,
338
+ location: Location,
339
+ num_results: int,
340
+ marketplaces: List[Host] | None,
341
+ excluded_urls: List[Host] | None,
342
+ ) -> None:
343
+ """Adds a search-item to the queue."""
344
+ item = {
345
+ "search_term": search_term,
346
+ "search_term_type": search_term_type,
347
+ "search_engine": search_engine,
348
+ "language": language,
349
+ "location": location,
350
+ "num_results": num_results,
351
+ "marketplaces": marketplaces,
352
+ "excluded_urls": excluded_urls,
353
+ }
354
+ logger.debug(f'Adding item="{item}" to srch_queue')
355
+ await queue.put(item)
356
+
357
+ async def _add_srch_items(
358
+ self,
359
+ queue: asyncio.Queue[dict | None],
360
+ search_term: str,
361
+ search_engines: List[SearchEngineName],
362
+ language: Language,
363
+ location: Location,
364
+ deepness: Deepness,
365
+ marketplaces: List[Host] | None,
366
+ excluded_urls: List[Host] | None,
367
+ ) -> None:
368
+ """Adds all the (enriched) search_term (as srch items) to the queue.
369
+
370
+ One item consists of the following parameters:
371
+ - search_term: The search term for the query.
372
+ - search_term_type: The type of the search term (initial or enriched).
373
+ - search_engine: The search engine to use for the query.
374
+ - language: The language to use for the query.
375
+ - location: The location to use for the query.
376
+ - num_results: The number of results to return.
377
+ - marketplaces: The marketplaces to include in the search.
378
+ - excluded_urls: The URLs to exclude from the search.
379
+
380
+ For constructing such items we essentially have two loops:
381
+ for each search_term (initial + enriched)
382
+ for each search_engine
383
+ add item to queue
384
+ """
385
+ common_kwargs = {
386
+ "queue": queue,
387
+ "language": language,
388
+ "location": location,
389
+ "marketplaces": marketplaces,
390
+ "excluded_urls": excluded_urls,
391
+ }
392
+
393
+ # Add initial items to the queue
394
+ for se in search_engines:
395
+ await self._add_search_items_for_search_term(
396
+ search_term=search_term,
397
+ search_term_type="initial",
398
+ search_engine=se,
399
+ num_results=deepness.num_results,
400
+ **common_kwargs, # type: ignore[arg-type]
401
+ )
402
+
403
+ # Enrich the search_terms
404
+ enrichment = deepness.enrichment
405
+ if enrichment:
406
+ # Call DataForSEO to get additional terms
407
+ n_terms = enrichment.additional_terms
408
+ terms = await self._enricher.enrich(
409
+ search_term=search_term,
410
+ language=language,
411
+ location=location,
412
+ n_terms=n_terms,
413
+ )
414
+
415
+ # Add the enriched search terms to the queue
416
+ for trm in terms:
417
+ for se in search_engines:
418
+ await self._add_search_items_for_search_term(
419
+ search_term=trm,
420
+ search_term_type="enriched",
421
+ search_engine=se,
422
+ num_results=enrichment.additional_urls_per_term,
423
+ **common_kwargs, # type: ignore[arg-type]
424
+ )
425
+
426
+ @staticmethod
427
+ def _is_exact_search(search_term: str) -> bool:
428
+ """Check if the search term is an exact search (contains double quotation marks).
429
+
430
+ Args:
431
+ search_term: The search term to check.
432
+ """
433
+ return '"' in search_term
434
+
435
+ @staticmethod
436
+ def _extract_exact_search_terms(search_term: str) -> list[str]:
437
+ """Extract all exact search terms from within double quotation marks (empty if no quotes found).
438
+
439
+ Args:
440
+ search_term: The search term that may contain double quotation marks.
441
+ """
442
+ # Find all double-quoted strings
443
+ double_quote_matches = re.findall(r'"([^"]*)"', search_term)
444
+ return double_quote_matches
445
+
446
+ @staticmethod
447
+ def _check_exact_search_terms_match(
448
+ product: ProductItem,
449
+ exact_search_terms: list[str],
450
+ ) -> bool:
451
+ """Check if the product, represented by a string of selected attributes, matches ALL of the exact search terms.
452
+
453
+ Args:
454
+ product: The product item.
455
+ exact_search_terms: List of exact search terms to match against.
456
+ """
457
+ field_values = [
458
+ str(val)
459
+ for fld in EXACT_MATCH_PRODUCT_FIELDS
460
+ if (val := getattr(product, fld, None)) is not None
461
+ ]
462
+ product_str_lower = EXACT_MATCH_FIELD_SEPARATOR.join(field_values).lower()
463
+
464
+ return all(
465
+ re.search(re.escape(est.lower()), product_str_lower)
466
+ for est in exact_search_terms
467
+ )
468
+
469
+ def _check_exact_search(self, product: ProductItem) -> ProductItem:
470
+ """Checks if the search term requests an exact search and if yes, checks for conformity."""
471
+ # Check for exact search and apply regex matching
472
+ exact_search = self._is_exact_search(product.search_term)
473
+ product.exact_search = exact_search
474
+
475
+ # Only set exact_search_match if this was an exact search (contains quotes)
476
+ if exact_search:
477
+ exact_search_terms = self._extract_exact_search_terms(product.search_term)
478
+ if exact_search_terms:
479
+ product.exact_search_match = self._check_exact_search_terms_match(
480
+ product=product, exact_search_terms=exact_search_terms
481
+ )
482
+ logger.debug(
483
+ f"Exact search terms {exact_search_terms} matched: {product.exact_search_match} "
484
+ f"for offer with url={product.url}"
485
+ )
486
+ else:
487
+ logger.warning(
488
+ f"is_exact_search=True but no exact search terms found in search_term='{product.search_term}' "
489
+ f"for offer with url={product.url}"
490
+ )
491
+ # If exact_search is False, product.exact_search_match remains False (default value)
492
+ return product
493
+
494
+ async def run(
495
+ self,
496
+ search_term: str,
497
+ search_engines: List[SearchEngineName],
498
+ language: Language,
499
+ location: Location,
500
+ deepness: Deepness,
501
+ marketplaces: List[Host] | None = None,
502
+ excluded_urls: List[Host] | None = None,
503
+ previously_collected_urls: List[str] | None = None,
504
+ ) -> None:
505
+ """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.
506
+
507
+ Args:
508
+ search_term: The search term for the query.
509
+ search_engines: The list of search engines to use for the search query.
510
+ language: The language to use for the query.
511
+ location: The location to use for the query.
512
+ deepness: The search depth and enrichment details.
513
+ marketplaces: The marketplaces to include in the search.
514
+ excluded_urls: The URLs to exclude from the search.
515
+ previously_collected_urls: The URLs that have been collected previously and are therefore ignored.
516
+ """
517
+ # ---------------------------
518
+ # INITIAL SETUP
519
+ # ---------------------------
520
+ # Ensure we have at least one search engine (the list might be empty)
521
+ if not search_engines:
522
+ logger.warning(
523
+ "No search engines specified, using all available search engines"
524
+ )
525
+ search_engines = list(SearchEngineName)
526
+
527
+ # Handle previously collected URLs
528
+ if pcurls := previously_collected_urls:
529
+ self._url_collector.add_previously_collected_urls(urls=pcurls)
530
+
531
+ # Setup the async framework
532
+ n_terms_max = 1 + (
533
+ deepness.enrichment.additional_terms if deepness.enrichment else 0
534
+ )
535
+ n_srch_wkrs = min(self._n_srch_wkrs, n_terms_max)
536
+ n_cntx_wkrs = min(self._n_cntx_wkrs, deepness.num_results)
537
+ n_proc_wkrs = min(self._n_proc_wkrs, deepness.num_results)
538
+
539
+ logger.debug(
540
+ f"setting up async framework (#workers: srch={n_srch_wkrs}, cntx={n_cntx_wkrs}, proc={n_proc_wkrs})"
541
+ )
542
+ self._setup_async_framework(
543
+ n_srch_wkrs=n_srch_wkrs,
544
+ n_cntx_wkrs=n_cntx_wkrs,
545
+ n_proc_wkrs=n_proc_wkrs,
546
+ )
547
+
548
+ # Check setup of async framework
549
+ if self._queues is None or self._workers is None:
550
+ raise ValueError(
551
+ "Async framework is not setup. Please call _setup_async_framework() first."
552
+ )
553
+ if not all([k in self._queues for k in ["srch", "url", "cntx", "proc", "res"]]):
554
+ raise ValueError(
555
+ "The queues of the async framework are not setup correctly."
556
+ )
557
+ if not all(
558
+ [k in self._workers for k in ["srch", "url", "cntx", "proc", "res"]]
559
+ ):
560
+ raise ValueError(
561
+ "The workers of the async framework are not setup correctly."
562
+ )
563
+
564
+ # Add the search items to the srch_queue
565
+ srch_queue = self._queues["srch"]
566
+ await self._add_srch_items(
567
+ queue=srch_queue,
568
+ search_term=search_term,
569
+ search_engines=search_engines,
570
+ language=language,
571
+ location=location,
572
+ deepness=deepness,
573
+ marketplaces=marketplaces,
574
+ excluded_urls=excluded_urls,
575
+ )
576
+
577
+ # -----------------------------
578
+ # ORCHESTRATE SEARCH WORKERS
579
+ # -----------------------------
580
+ # Add the sentinels to the srch_queue
581
+ for _ in range(n_srch_wkrs):
582
+ await srch_queue.put(None)
583
+
584
+ # Wait for the srch_workers to conclude before adding the sentinel to the url_queue
585
+ srch_workers = self._workers["srch"]
586
+ try:
587
+ logger.debug("Waiting for srch_workers to conclude their tasks...")
588
+ srch_res = await asyncio.gather(*srch_workers, return_exceptions=True)
589
+ for i, res in enumerate(srch_res):
590
+ if isinstance(res, Exception):
591
+ logger.error(f"Error in srch_worker {i}: {res}")
592
+ logger.debug("...srch_workers concluded their tasks")
593
+ except Exception:
594
+ logger.error(
595
+ "Gathering srch_workers failed",
596
+ exc_info=True,
597
+ )
598
+ finally:
599
+ await srch_queue.join()
600
+
601
+ # ---------------------------
602
+ # ORCHESTRATE URL COLLECTOR
603
+ # ---------------------------
604
+ # Add the sentinel to the url_queue
605
+ url_queue = self._queues["url"]
606
+ await url_queue.put(None)
607
+
608
+ # Wait for the url_collector to conclude before adding the sentinels to the cntx_queue
609
+ url_collector = cast(asyncio.Task, self._workers["url"])
610
+ try:
611
+ logger.debug("Waiting for url_collector to conclude its tasks...")
612
+ await url_collector
613
+ logger.debug("...url_collector concluded its tasks")
614
+ except Exception:
615
+ logger.error(
616
+ "Gathering url_collector failed",
617
+ exc_info=True,
618
+ )
619
+ finally:
620
+ await url_queue.join()
621
+
622
+ # -----------------------------
623
+ # ORCHESTRATE CONTEXT WORKERS
624
+ # -----------------------------
625
+ # Add the sentinels to the cntx_queue
626
+ cntx_queue = self._queues["cntx"]
627
+ for _ in range(n_cntx_wkrs):
628
+ await cntx_queue.put(None)
629
+
630
+ # Wait for the cntx_workers to conclude before adding the sentinels to the proc_queue
631
+ cntx_workers = self._workers["cntx"]
632
+ try:
633
+ logger.debug("Waiting for cntx_workers to conclude their tasks...")
634
+ cntx_res = await asyncio.gather(*cntx_workers, return_exceptions=True)
635
+ for i, res in enumerate(cntx_res):
636
+ if isinstance(res, Exception):
637
+ logger.error(f"Error in cntx_worker {i}: {res}")
638
+ logger.debug("...cntx_workers concluded their tasks")
639
+ except Exception:
640
+ logger.error(
641
+ "Gathering cntx_workers failed",
642
+ exc_info=True,
643
+ )
644
+ finally:
645
+ await cntx_queue.join()
646
+
647
+ # ---------------------------
648
+ # ORCHESTRATE PROC WORKERS
649
+ # ---------------------------
650
+ # Add the sentinels to the proc_queue
651
+ proc_queue = self._queues["proc"]
652
+ for _ in range(n_proc_wkrs):
653
+ await proc_queue.put(None)
654
+
655
+ # Wait for the proc_workers to conclude before adding the sentinel to the res_queue
656
+ proc_workers = self._workers["proc"]
657
+ try:
658
+ logger.debug("Waiting for proc_workers to conclude their tasks...")
659
+ proc_res = await asyncio.gather(*proc_workers, return_exceptions=True)
660
+ for i, res in enumerate(proc_res):
661
+ if isinstance(res, Exception):
662
+ logger.error(f"Error in proc_worker {i}: {res}")
663
+ logger.debug("...proc_workers concluded their tasks")
664
+ except Exception:
665
+ logger.error(
666
+ "Gathering proc_workers failed",
667
+ exc_info=True,
668
+ )
669
+ finally:
670
+ await proc_queue.join()
671
+
672
+ # ---------------------------
673
+ # ORCHESTRATE RES COLLECTOR
674
+ # ---------------------------
675
+ # Add the sentinel to the res_queue
676
+ res_queue = self._queues["res"]
677
+ await res_queue.put(None)
678
+
679
+ # Wait for the res_collector to conclude
680
+ res_collector = cast(asyncio.Task, self._workers["res"])
681
+ try:
682
+ logger.debug("Waiting for res_collector to conclude its tasks...")
683
+ await res_collector
684
+ logger.debug("...res_collector concluded its tasks")
685
+ except Exception:
686
+ logger.error(
687
+ "Gathering res_collector failed",
688
+ exc_info=True,
689
+ )
690
+ finally:
691
+ await res_queue.join()
692
+
693
+ # ---------------------------
694
+ # CLOSING PIPELINE
695
+ # ---------------------------
696
+ logger.info("Pipeline concluded; async framework is closed")