fraudcrawler 0.5.0__py3-none-any.whl → 0.7.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,34 +1,32 @@
  from abc import ABC, abstractmethod
  import asyncio
  import logging
- from typing import cast, Dict, List, Self
+ from typing import cast, Dict, List

- from bs4 import BeautifulSoup
- import httpx
+ import re

  from fraudcrawler.settings import (
- PROCESSOR_DEFAULT_MODEL,
+ EXACT_MATCH_PRODUCT_FIELDS,
+ EXACT_MATCH_FIELD_SEPARATOR,
  )
  from fraudcrawler.settings import (
- DEFAULT_N_SERP_WKRS,
- DEFAULT_N_ZYTE_WKRS,
+ DEFAULT_N_SRCH_WKRS,
+ DEFAULT_N_CNTX_WKRS,
  DEFAULT_N_PROC_WKRS,
  )
  from fraudcrawler.base.base import (
- Deepness,
  Host,
  Language,
  Location,
- Prompt,
+ Deepness,
  ProductItem,
- HttpxAsyncClient,
  )
  from fraudcrawler import (
- Search,
+ Searcher,
  SearchEngineName,
  Enricher,
- URLCollector,
  ZyteAPI,
+ URLCollector,
  Processor,
  )

@@ -36,16 +34,17 @@ logger = logging.getLogger(__name__)


  class Orchestrator(ABC):
- """Abstract base class for orchestrating the different actors (crawling, processing).
+ """Abstract base class for orchestrating the different actors (scraping, processing).
+
+ Any subclass of :class:`Orchestrator` orchestrates the complete pipeline: search,
+ deduplication, context extraction, processing (classification), and result collection.

  Abstract methods:
  _collect_results: Collects the results from the given queue_in.
+ This function is responsible for collecting and handling the results from the given queue_in. It might
+ save the results to a file, a database, or any other storage.

- Each subclass of class:`Orchestrator` must implement the abstract method func:`_collect_results`.
- This function is responsible for collecting and handling the results from the given queue_in. It might
- save the results to a file, a database, or any other storage.
-
- For each pipeline step class:`Orchestrator` will deploy a number of async workers to handle the tasks.
+ For each pipeline step :class:`Orchestrator` will deploy a number of async workers to handle the tasks.
  In addition it makes sure to orchestrate the canceling of the workers only after the relevant workload is done.

  For more information on the orchestrating pattern see README.md.
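
Note: the docstring above describes the contract a concrete subclass has to fulfil. Below is a minimal sketch of such a subclass; it is not part of the package, and the exact signature of `_collect_results` is an assumption based on the worker pattern visible in this diff (read from `queue_in` until a `None` sentinel, call `task_done()` for every item). `Orchestrator` and `ProductItem` are assumed to be importable as in the module above.

import asyncio
from typing import List


class InMemoryOrchestrator(Orchestrator):
    """Hypothetical subclass that simply gathers all results into a list."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.results: List[ProductItem] = []

    async def _collect_results(self, queue_in: asyncio.Queue) -> None:
        # Drain the result queue until the sentinel arrives.
        while True:
            product = await queue_in.get()
            if product is None:  # sentinel: the pipeline is done
                queue_in.task_done()
                break
            self.results.append(product)
            queue_in.task_done()
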
@@ -53,94 +52,43 @@ class Orchestrator(ABC):

  def __init__(
  self,
- serpapi_key: str,
- dataforseo_user: str,
- dataforseo_pwd: str,
- zyteapi_key: str,
- openaiapi_key: str,
- openai_model: str = PROCESSOR_DEFAULT_MODEL,
- n_serp_wkrs: int = DEFAULT_N_SERP_WKRS,
- n_zyte_wkrs: int = DEFAULT_N_ZYTE_WKRS,
+ searcher: Searcher,
+ enricher: Enricher,
+ url_collector: URLCollector,
+ zyteapi: ZyteAPI,
+ processor: Processor,
+ n_srch_wkrs: int = DEFAULT_N_SRCH_WKRS,
+ n_cntx_wkrs: int = DEFAULT_N_CNTX_WKRS,
  n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
- # Configure a custom httpx client.
- # We provide a `HttpxAsyncClient` class that you can pass
- # to retain the default values we use for `limits`, `timeout` & `follow_redirects`.
- http_client: httpx.AsyncClient | None = None,
  ):
  """Initializes the orchestrator with the given settings.

- NOTE:
- The class:`Orchestrator` must be used as context manager as follows:
- async with Orchestrator(...) as orchestrator:
- await orchestrator.run()
-
  Args:
- serpapi_key: The API key for SERP API.
- dataforseo_user: The user for DataForSEO.
- dataforseo_pwd: The password for DataForSEO.
- zyteapi_key: The API key for Zyte API.
- openaiapi_key: The API key for OpenAI.
- openai_model: The model to use for the processing (optional).
- n_serp_wkrs: Number of async workers for serp (optional).
- n_zyte_wkrs: Number of async workers for zyte (optional).
+ searcher: Client for searching step.
+ enricher: Client for enrichment step.
+ url_collector: Client for deduplication.
+ zyteapi: Client for metadata extraction.
+ processor: Client for product classification.
+ n_srch_wkrs: Number of async workers for the search (optional).
+ n_cntx_wkrs: Number of async workers for context extraction (optional).
  n_proc_wkrs: Number of async workers for the processor (optional).
- http_client: An httpx.AsyncClient to use for the async requests (optional).
  """

- # Store the variables for setting up the clients
- self._serpapi_key = serpapi_key
- self._dataforseo_user = dataforseo_user
- self._dataforseo_pwd = dataforseo_pwd
- self._zyteapi_key = zyteapi_key
- self._openaiapi_key = openaiapi_key
- self._openai_model = openai_model
+ # Pipeline clients
+ self._searcher = searcher
+ self._enricher = enricher
+ self._url_collector = url_collector
+ self._zyteapi = zyteapi
+ self._processor = processor

  # Setup the async framework
- self._n_serp_wkrs = n_serp_wkrs
- self._n_zyte_wkrs = n_zyte_wkrs
+ self._n_srch_wkrs = n_srch_wkrs
+ self._n_cntx_wkrs = n_cntx_wkrs
  self._n_proc_wkrs = n_proc_wkrs
  self._queues: Dict[str, asyncio.Queue] | None = None
  self._workers: Dict[str, List[asyncio.Task] | asyncio.Task] | None = None

- # Setup the httpx client
- self._http_client = http_client
- self._owns_http_client = http_client is None
-
- async def __aenter__(self) -> Self:
- """Creates and starts an httpx.AsyncClient if not provided."""
- if self._http_client is None:
- logger.debug("Creating a new httpx.AsyncClient owned by the orchestrator")
- self._http_client = HttpxAsyncClient()
- self._owns_http_client = True
-
- # Setup the clients
- self._search = Search(
- http_client=self._http_client, serpapi_key=self._serpapi_key
- )
- self._enricher = Enricher(
- http_client=self._http_client,
- user=self._dataforseo_user,
- pwd=self._dataforseo_pwd,
- )
- self._url_collector = URLCollector()
- self._zyteapi = ZyteAPI(
- http_client=self._http_client, api_key=self._zyteapi_key
- )
- self._processor = Processor(
- http_client=self._http_client,
- api_key=self._openaiapi_key,
- model=self._openai_model,
- )
- return self
-
- async def __aexit__(self, *args, **kwargs) -> None:
- """Closes the httpx.AsyncClient if it was created by this orchestrator."""
- if self._owns_http_client and self._http_client is not None:
- logger.debug("Closing the httpx.AsyncClient owned by the orchestrator")
- await self._http_client.aclose()
- self._http_client = None
-
- async def _serp_execute(
+ async def _srch_execute(
  self,
  queue_in: asyncio.Queue[dict | None],
  queue_out: asyncio.Queue[ProductItem | None],
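
Note: with the 0.7.x constructor the orchestrator receives ready-made clients instead of API keys, and the `async with` usage from 0.5.0 is gone. A rough wiring sketch follows; how the individual clients are constructed in 0.7.x is not shown in this diff, so their setup is only indicated, and `MyOrchestrator` stands for any concrete subclass.

searcher = ...       # a configured Searcher instance (constructor not shown in this diff)
enricher = ...       # a configured Enricher instance
url_collector = ...  # a URLCollector instance
zyteapi = ...        # a configured ZyteAPI instance
processor = ...      # a configured Processor instance

orchestrator = MyOrchestrator(
    searcher=searcher,
    enricher=enricher,
    url_collector=url_collector,
    zyteapi=zyteapi,
    processor=processor,
)
# No `async with` block is needed any more; the __aenter__/__aexit__ hooks were removed.
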
@@ -158,17 +106,14 @@ class Orchestrator(ABC):
  break

  try:
+ # Execute the search
  search_term_type = item.pop("search_term_type")
- # The search_engines are already SearchEngineName enum values
- search_engines = item.pop("search_engines")
-
- results = await self._search.apply(
- **item, search_engines=search_engines
- )
-
+ results = await self._searcher.apply(**item)
  logger.debug(
  f"Search for {item['search_term']} returned {len(results)} results"
  )
+
+ # Create ProductItems for each result
  for res in results:
  product = ProductItem(
  search_term=item["search_term"],
@@ -181,8 +126,11 @@ class Orchestrator(ABC):
  filtered_at_stage=res.filtered_at_stage,
  )
  await queue_out.put(product)
- except Exception as e:
- logger.error(f"Error executing search: {e}")
+ except Exception:
+ logger.error(
+ f"Running search failed with item={item}",
+ exc_info=True,
+ )
  queue_in.task_done()

  async def _collect_url(
@@ -203,31 +151,12 @@ class Orchestrator(ABC):
  break

  if not product.filtered:
- # Clean the URL by removing tracking parameters
- url = self._url_collector.remove_tracking_parameters(product.url)
- product.url = url
-
- if url in self._url_collector.collected_currently:
- # deduplicate on current run
- product.filtered = True
- product.filtered_at_stage = (
- "URL collection (current run deduplication)"
- )
- logger.debug(f"URL {url} already collected in current run")
- elif url in self._url_collector.collected_previously:
- # deduplicate on previous runs coming from a db
- product.filtered = True
- product.filtered_at_stage = (
- "URL collection (previous run deduplication)"
- )
- logger.debug(f"URL {url} as already collected in previous run")
- else:
- self._url_collector.collected_currently.add(url)
+ product = await self._url_collector.apply(product=product)

  await queue_out.put(product)
  queue_in.task_done()

- async def _zyte_execute(
+ async def _cntx_execute(
  self,
  queue_in: asyncio.Queue[ProductItem | None],
  queue_out: asyncio.Queue[ProductItem | None],
@@ -246,45 +175,34 @@ class Orchestrator(ABC):

  if not product.filtered:
  try:
- # Fetch the product details from Zyte API
+ # Fetch and enrich the product context from Zyte API
  details = await self._zyteapi.details(url=product.url)
- url_resolved = self._zyteapi.extract_url_resolved(details=details)
- if url_resolved:
- product.url_resolved = url_resolved
- product.product_name = self._zyteapi.extract_product_name(
- details=details
+ product = self._zyteapi.enrich_context(
+ product=product, details=details
  )

- # If the resolved URL is different from the original URL, we also need to update the domain as
- # otherwise the unresolved domain will be shown, for example for unresolved domain toppreis.ch but resolved digitec.ch
- if url_resolved and url_resolved != product.url:
- logger.debug(
- f"URL resolved for {product.url} is {url_resolved}"
- )
- product.domain = self._search._get_domain(url_resolved)
-
- product.product_price = self._zyteapi.extract_product_price(
- details=details
- )
- product.product_description = (
- self._zyteapi.extract_product_description(details=details)
- )
- product.product_images = self._zyteapi.extract_image_urls(
- details=details
- )
- product.probability = self._zyteapi.extract_probability(
- details=details
- )
- product.html = self._zyteapi.extract_html(details=details)
- if product.html:
- soup = BeautifulSoup(product.html, "html.parser")
- product.html_clean = soup.get_text(separator=" ", strip=True)
  # Filter the product based on the probability threshold
  if not self._zyteapi.keep_product(details=details):
  product.filtered = True
- product.filtered_at_stage = "Zyte probability threshold"
- except Exception as e:
- logger.warning(f"Error executing Zyte API search: {e}.")
+ product.filtered_at_stage = (
+ "Context (Zyte probability threshold)"
+ )
+
+ # Check for exact match inside the full product context
+ product = self._check_exact_search(product=product)
+ if (
+ not product.filtered
+ and product.exact_search
+ and not product.exact_search_match
+ ):
+ product.filtered = True
+ product.filtered_at_stage = "Context (exact search)"
+
+ except Exception:
+ logger.error(
+ f"Running Zyte API search failed for product with url={product.url_resolved}",
+ exc_info=True,
+ )
  await queue_out.put(product)
  queue_in.task_done()

@@ -292,14 +210,12 @@ class Orchestrator(ABC):
  self,
  queue_in: asyncio.Queue[ProductItem | None],
  queue_out: asyncio.Queue[ProductItem | None],
- prompts: List[Prompt],
  ) -> None:
  """Collects the product details from the queue_in, processes them (filtering, relevance, etc.) and puts the results into queue_out.

  Args:
  queue_in: The input queue containing the product details.
  queue_out: The output queue to put the processed product details.
- prompts: The list of prompts to use for classification.
  """

  # Process the products
@@ -312,22 +228,12 @@ class Orchestrator(ABC):

  if not product.filtered:
  try:
- # Run all the configured prompts
- for prompt in prompts:
- classification = await self._processor.classify(
- product=product,
- prompt=prompt,
- )
- product.classifications[prompt.name] = int(
- classification.result
- )
- product.usage[prompt.name] = {
- "input_tokens": classification.input_tokens,
- "output_tokens": classification.output_tokens,
- }
- except Exception as e:
- logger.warning(
- f"Error processing product with url={product.url}: {e}."
+ # Run the configured workflows
+ product = await self._processor.run(product=product)
+ except Exception:
+ logger.error(
+ f"Processing product with url={product.url_resolved} failed",
+ exc_info=True,
  )

  await queue_out.put(product)
@@ -346,52 +252,50 @@ class Orchestrator(ABC):

  def _setup_async_framework(
  self,
- n_serp_wkrs: int,
- n_zyte_wkrs: int,
+ n_srch_wkrs: int,
+ n_cntx_wkrs: int,
  n_proc_wkrs: int,
- prompts: List[Prompt],
  ) -> None:
  """Sets up the necessary queues and workers for the async framework.

  Args:
- n_serp_wkrs: Number of async workers for serp.
- n_zyte_wkrs: Number of async workers for zyte.
- n_proc_wkrs: Number of async workers for processor.
- prompts: The list of prompts used for the classification by func:`Processor.classify`.
+ n_srch_wkrs: Number of async workers for search.
+ n_cntx_wkrs: Number of async workers for context extraction.
+ n_proc_wkrs: Number of async workers for processing.
  """

  # Setup the input/output queues for the workers
- serp_queue: asyncio.Queue[dict | None] = asyncio.Queue()
+ srch_queue: asyncio.Queue[dict | None] = asyncio.Queue()
  url_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
- zyte_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
+ cntx_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
  proc_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
  res_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()

- # Setup the Serp workers
- serp_wkrs = [
+ # Setup the Search workers
+ srch_wkrs = [
  asyncio.create_task(
- self._serp_execute(
- queue_in=serp_queue,
+ self._srch_execute(
+ queue_in=srch_queue,
  queue_out=url_queue,
  )
  )
- for _ in range(n_serp_wkrs)
+ for _ in range(n_srch_wkrs)
  ]

  # Setup the URL collector
  url_col = asyncio.create_task(
- self._collect_url(queue_in=url_queue, queue_out=zyte_queue)
+ self._collect_url(queue_in=url_queue, queue_out=cntx_queue)
  )

- # Setup the Zyte workers
- zyte_wkrs = [
+ # Setup the context extraction workers
+ cntx_wkrs = [
  asyncio.create_task(
- self._zyte_execute(
- queue_in=zyte_queue,
+ self._cntx_execute(
+ queue_in=cntx_queue,
  queue_out=proc_queue,
  )
  )
- for _ in range(n_zyte_wkrs)
+ for _ in range(n_cntx_wkrs)
  ]

  # Setup the processing workers
@@ -400,7 +304,6 @@ class Orchestrator(ABC):
  self._proc_execute(
  queue_in=proc_queue,
  queue_out=res_queue,
- prompts=prompts,
  )
  )
  for _ in range(n_proc_wkrs)
@@ -411,26 +314,26 @@ class Orchestrator(ABC):

  # Add the setup to the instance variables
  self._queues = {
- "serp": serp_queue,
+ "srch": srch_queue,
  "url": url_queue,
- "zyte": zyte_queue,
+ "cntx": cntx_queue,
  "proc": proc_queue,
  "res": res_queue,
  }
  self._workers = {
- "serp": serp_wkrs,
+ "srch": srch_wkrs,
  "url": url_col,
- "zyte": zyte_wkrs,
+ "cntx": cntx_wkrs,
  "proc": proc_wkrs,
  "res": res_col,
  }

  @staticmethod
- async def _add_serp_items_for_search_term(
+ async def _add_search_items_for_search_term(
  queue: asyncio.Queue[dict | None],
  search_term: str,
  search_term_type: str,
- search_engines: List[SearchEngineName],
+ search_engine: SearchEngineName,
  language: Language,
  location: Location,
  num_results: int,
@@ -441,17 +344,17 @@ class Orchestrator(ABC):
  item = {
  "search_term": search_term,
  "search_term_type": search_term_type,
- "search_engines": search_engines,
+ "search_engine": search_engine,
  "language": language,
  "location": location,
  "num_results": num_results,
  "marketplaces": marketplaces,
  "excluded_urls": excluded_urls,
  }
- logger.debug(f'Adding item="{item}" to serp_queue')
+ logger.debug(f'Adding item="{item}" to srch_queue')
  await queue.put(item)

- async def _add_serp_items(
+ async def _add_srch_items(
  self,
  queue: asyncio.Queue[dict | None],
  search_term: str,
@@ -462,7 +365,23 @@ class Orchestrator(ABC):
  marketplaces: List[Host] | None,
  excluded_urls: List[Host] | None,
  ) -> None:
- """Adds all the (enriched) search_term (as serp items) to the queue."""
+ """Adds all the (enriched) search_term (as srch items) to the queue.
+
+ One item consists of the following parameters:
+ - search_term: The search term for the query.
+ - search_term_type: The type of the search term (initial or enriched).
+ - search_engines: The search engines to use for the query.
+ - language: The language to use for the query.
+ - location: The location to use for the query.
+ - num_results: The number of results to return.
+ - marketplaces: The marketplaces to include in the search.
+ - excluded_urls: The URLs to exclude from the search.
+
+ For constructing such items we essentially have two loops:
+ for each search_term (initial + enriched)
+ for each search_engine
+ add item to queue
+ """
  common_kwargs = {
  "queue": queue,
  "language": language,
@@ -471,14 +390,15 @@ class Orchestrator(ABC):
  "excluded_urls": excluded_urls,
  }

- # Add initial items to the serp_queue
- await self._add_serp_items_for_search_term(
- search_term=search_term,
- search_term_type="initial",
- search_engines=search_engines,
- num_results=deepness.num_results,
- **common_kwargs, # type: ignore[arg-type]
- )
+ # Add initial items to the queue
+ for se in search_engines:
+ await self._add_search_items_for_search_term(
+ search_term=search_term,
+ search_term_type="initial",
+ search_engine=se,
+ num_results=deepness.num_results,
+ **common_kwargs, # type: ignore[arg-type]
+ )

  # Enrich the search_terms
  enrichment = deepness.enrichment
@@ -492,15 +412,84 @@ class Orchestrator(ABC):
  n_terms=n_terms,
  )

- # Add the enriched search terms to the serp_queue
+ # Add the enriched search terms to the queue
  for trm in terms:
- await self._add_serp_items_for_search_term(
- search_term=trm,
- search_term_type="enriched",
- search_engines=search_engines,
- num_results=enrichment.additional_urls_per_term,
- **common_kwargs, # type: ignore[arg-type]
+ for se in search_engines:
+ await self._add_search_items_for_search_term(
+ search_term=trm,
+ search_term_type="enriched",
+ search_engine=se,
+ num_results=enrichment.additional_urls_per_term,
+ **common_kwargs, # type: ignore[arg-type]
+ )
+
+ @staticmethod
+ def _is_exact_search(search_term: str) -> bool:
+ """Check if the search term is an exact search (contains double quotation marks).
+
+ Args:
+ search_term: The search term to check.
+ """
+ return '"' in search_term
+
+ @staticmethod
+ def _extract_exact_search_terms(search_term: str) -> list[str]:
+ """Extract all exact search terms from within double quotation marks (empty if no quotes found).
+
+ Args:
+ search_term: The search term that may contain double quotation marks.
+ """
+ # Find all double-quoted strings
+ double_quote_matches = re.findall(r'"([^"]*)"', search_term)
+ return double_quote_matches
+
+ @staticmethod
+ def _check_exact_search_terms_match(
+ product: ProductItem,
+ exact_search_terms: list[str],
+ ) -> bool:
+ """Check if the product, represented by a string of selected attributes, matches ALL of the exact search terms.
+
+ Args:
+ product: The product item.
+ exact_search_terms: List of exact search terms to match against.
+ """
+ field_values = [
+ str(val)
+ for fld in EXACT_MATCH_PRODUCT_FIELDS
+ if (val := getattr(product, fld, None)) is not None
+ ]
+ product_str_lower = EXACT_MATCH_FIELD_SEPARATOR.join(field_values).lower()
+
+ return all(
+ re.search(re.escape(est.lower()), product_str_lower)
+ for est in exact_search_terms
+ )
+
+ def _check_exact_search(self, product: ProductItem) -> ProductItem:
+ """Checks if the search term requests an exact search and if yes, checks for conformity."""
+ # Check for exact search and apply regex matching
+ exact_search = self._is_exact_search(product.search_term)
+ product.exact_search = exact_search
+
+ # Only set exact_search_match if this was an exact search (contains quotes)
+ if exact_search:
+ exact_search_terms = self._extract_exact_search_terms(product.search_term)
+ if exact_search_terms:
+ product.exact_search_match = self._check_exact_search_terms_match(
+ product=product, exact_search_terms=exact_search_terms
+ )
+ logger.debug(
+ f"Exact search terms {exact_search_terms} matched: {product.exact_search_match} "
+ f"for offer with url={product.url}"
+ )
+ else:
+ logger.warning(
+ f"is_exact_search=True but no exact search terms found in search_term='{product.search_term}' "
+ f"for offer with url={product.url}"
  )
+ # If exact_search is False, product.exact_search_match remains False (default value)
+ return product

  async def run(
  self,
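
Note: the exact-search helpers added above can be illustrated with a standalone re-statement of the same regex logic (the example data below is hypothetical): quoted phrases are extracted from the search term and ALL of them must occur, case-insensitively, in the concatenated product fields.

import re

search_term = 'nike "air max" "size 42"'
exact_terms = re.findall(r'"([^"]*)"', search_term)  # ['air max', 'size 42']

product_text = "Nike Air Max 90 | running shoe | size 42 | CHF 129".lower()
is_match = all(re.search(re.escape(t.lower()), product_text) for t in exact_terms)
print(exact_terms, is_match)  # ['air max', 'size 42'] True
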
@@ -509,12 +498,11 @@ class Orchestrator(ABC):
  language: Language,
  location: Location,
  deepness: Deepness,
- prompts: List[Prompt],
  marketplaces: List[Host] | None = None,
  excluded_urls: List[Host] | None = None,
  previously_collected_urls: List[str] | None = None,
  ) -> None:
- """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.
+ """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.

  Args:
  search_term: The search term for the query.
@@ -522,7 +510,6 @@ class Orchestrator(ABC):
  language: The language to use for the query.
  location: The location to use for the query.
  deepness: The search depth and enrichment details.
- prompts: The list of prompt to use for classification.
  marketplaces: The marketplaces to include in the search.
  excluded_urls: The URLs to exclude from the search.
  previously_collected_urls: The urls that have been collected previously and are ignored.
@@ -530,7 +517,7 @@ class Orchestrator(ABC):
  # ---------------------------
  # INITIAL SETUP
  # ---------------------------
- # Ensure we have at least one search engine
+ # Ensure we have at least one search engine (the list might be empty)
  if not search_engines:
  logger.warning(
  "No search engines specified, using all available search engines"
@@ -538,25 +525,24 @@ class Orchestrator(ABC):
  search_engines = list(SearchEngineName)

  # Handle previously collected URLs
- if previously_collected_urls:
- self._url_collector.collected_previously = set(previously_collected_urls)
+ if pcurls := previously_collected_urls:
+ self._url_collector.add_previously_collected_urls(urls=pcurls)

  # Setup the async framework
  n_terms_max = 1 + (
  deepness.enrichment.additional_terms if deepness.enrichment else 0
  )
- n_serp_wkrs = min(self._n_serp_wkrs, n_terms_max)
- n_zyte_wkrs = min(self._n_zyte_wkrs, deepness.num_results)
+ n_srch_wkrs = min(self._n_srch_wkrs, n_terms_max)
+ n_cntx_wkrs = min(self._n_cntx_wkrs, deepness.num_results)
  n_proc_wkrs = min(self._n_proc_wkrs, deepness.num_results)

  logger.debug(
- f"setting up async framework (#workers: serp={n_serp_wkrs}, zyte={n_zyte_wkrs}, proc={n_proc_wkrs})"
+ f"setting up async framework (#workers: srch={n_srch_wkrs}, cntx={n_cntx_wkrs}, proc={n_proc_wkrs})"
  )
  self._setup_async_framework(
- n_serp_wkrs=n_serp_wkrs,
- n_zyte_wkrs=n_zyte_wkrs,
+ n_srch_wkrs=n_srch_wkrs,
+ n_cntx_wkrs=n_cntx_wkrs,
  n_proc_wkrs=n_proc_wkrs,
- prompts=prompts,
  )

  # Check setup of async framework
@@ -564,21 +550,21 @@ class Orchestrator(ABC):
  raise ValueError(
  "Async framework is not setup. Please call _setup_async_framework() first."
  )
- if not all([k in self._queues for k in ["serp", "url", "zyte", "proc", "res"]]):
+ if not all([k in self._queues for k in ["srch", "url", "cntx", "proc", "res"]]):
  raise ValueError(
  "The queues of the async framework are not setup correctly."
  )
  if not all(
- [k in self._workers for k in ["serp", "url", "zyte", "proc", "res"]]
+ [k in self._workers for k in ["srch", "url", "cntx", "proc", "res"]]
  ):
  raise ValueError(
  "The workers of the async framework are not setup correctly."
  )

- # Add the search items to the serp_queue
- serp_queue = self._queues["serp"]
- await self._add_serp_items(
- queue=serp_queue,
+ # Add the search items to the srch_queue
+ srch_queue = self._queues["srch"]
+ await self._add_srch_items(
+ queue=srch_queue,
  search_term=search_term,
  search_engines=search_engines,
  language=language,
@@ -588,26 +574,29 @@ class Orchestrator(ABC):
  excluded_urls=excluded_urls,
  )

- # ---------------------------
- # ORCHESTRATE SERP WORKERS
- # ---------------------------
- # Add the sentinels to the serp_queue
- for _ in range(n_serp_wkrs):
- await serp_queue.put(None)
+ # -----------------------------
+ # ORCHESTRATE SEARCH WORKERS
+ # -----------------------------
+ # Add the sentinels to the srch_queue
+ for _ in range(n_srch_wkrs):
+ await srch_queue.put(None)

- # Wait for the serp workers to be concluded before adding the sentinels to the url_queue
- serp_workers = self._workers["serp"]
+ # Wait for the srch workers to be concluded before adding the sentinels to the url_queue
+ srch_workers = self._workers["srch"]
  try:
- logger.debug("Waiting for serp_workers to conclude their tasks...")
- serp_res = await asyncio.gather(*serp_workers, return_exceptions=True)
- for i, res in enumerate(serp_res):
+ logger.debug("Waiting for srch_workers to conclude their tasks...")
+ srch_res = await asyncio.gather(*srch_workers, return_exceptions=True)
+ for i, res in enumerate(srch_res):
  if isinstance(res, Exception):
- logger.error(f"Error in serp_worker {i}: {res}")
- logger.debug("...serp_workers concluded their tasks")
- except Exception as e:
- logger.error(f"Gathering serp_workers failed: {e}")
+ logger.error(f"Error in srch_worker {i}: {res}")
+ logger.debug("...srch_workers concluded their tasks")
+ except Exception:
+ logger.error(
+ "Gathering srch_workers failed",
+ exc_info=True,
+ )
  finally:
- await serp_queue.join()
+ await srch_queue.join()

  # ---------------------------
  # ORCHESTRATE URL COLLECTOR
@@ -616,38 +605,44 @@ class Orchestrator(ABC):
  url_queue = self._queues["url"]
  await url_queue.put(None)

- # Wait for the url_collector to be concluded before adding the sentinels to the zyte_queue
+ # Wait for the url_collector to be concluded before adding the sentinels to the cntx_queue
  url_collector = cast(asyncio.Task, self._workers["url"])
  try:
  logger.debug("Waiting for url_collector to conclude its tasks...")
  await url_collector
  logger.debug("...url_collector concluded its tasks")
- except Exception as e:
- logger.error(f"Gathering url_collector failed: {e}")
+ except Exception:
+ logger.error(
+ "Gathering url_collector failed",
+ exc_info=True,
+ )
  finally:
  await url_queue.join()

- # ---------------------------
- # ORCHESTRATE ZYTE WORKERS
- # ---------------------------
- # Add the sentinels to the zyte_queue
- zyte_queue = self._queues["zyte"]
- for _ in range(n_zyte_wkrs):
- await zyte_queue.put(None)
+ # -----------------------------
+ # ORCHESTRATE CONTEXT WORKERS
+ # -----------------------------
+ # Add the sentinels to the cntx_queue
+ cntx_queue = self._queues["cntx"]
+ for _ in range(n_cntx_wkrs):
+ await cntx_queue.put(None)

- # Wait for the zyte_workers to be concluded before adding the sentinels to the proc_queue
- zyte_workers = self._workers["zyte"]
+ # Wait for the cntx_workers to be concluded before adding the sentinels to the proc_queue
+ cntx_workers = self._workers["cntx"]
  try:
- logger.debug("Waiting for zyte_workers to conclude their tasks...")
- zyte_res = await asyncio.gather(*zyte_workers, return_exceptions=True)
- for i, res in enumerate(zyte_res):
+ logger.debug("Waiting for cntx_workers to conclude their tasks...")
+ cntx_res = await asyncio.gather(*cntx_workers, return_exceptions=True)
+ for i, res in enumerate(cntx_res):
  if isinstance(res, Exception):
- logger.error(f"Error in zyte_worker {i}: {res}")
- logger.debug("...zyte_workers concluded their tasks")
- except Exception as e:
- logger.error(f"Gathering zyte_workers failed: {e}")
+ logger.error(f"Error in cntx_worker {i}: {res}")
+ logger.debug("...cntx_workers concluded their tasks")
+ except Exception:
+ logger.error(
+ "Gathering cntx_workers failed",
+ exc_info=True,
+ )
  finally:
- await zyte_queue.join()
+ await cntx_queue.join()

  # ---------------------------
  # ORCHESTRATE PROC WORKERS
@@ -666,8 +661,11 @@ class Orchestrator(ABC):
  if isinstance(res, Exception):
  logger.error(f"Error in proc_worker {i}: {res}")
  logger.debug("...proc_workers concluded their tasks")
- except Exception as e:
- logger.error(f"Gathering proc_workers failed: {e}")
+ except Exception:
+ logger.error(
+ "Gathering proc_workers failed",
+ exc_info=True,
+ )
  finally:
  await proc_queue.join()

@@ -684,8 +682,11 @@ class Orchestrator(ABC):
  logger.debug("Waiting for res_collector to conclude its tasks...")
  await res_collector
  logger.debug("...res_collector concluded its tasks")
- except Exception as e:
- logger.error(f"Gathering res_collector failed: {e}")
+ except Exception:
+ logger.error(
+ "Gathering res_collector failed",
+ exc_info=True,
+ )
  finally:
  await res_queue.join()