fraudcrawler 0.5.0__py3-none-any.whl → 0.7.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fraudcrawler/__init__.py +21 -5
- fraudcrawler/base/base.py +18 -38
- fraudcrawler/base/client.py +57 -60
- fraudcrawler/base/orchestrator.py +277 -276
- fraudcrawler/base/retry.py +25 -11
- fraudcrawler/launch_demo_pipeline.py +103 -41
- fraudcrawler/processing/base.py +129 -0
- fraudcrawler/processing/openai.py +520 -0
- fraudcrawler/scraping/enrich.py +6 -4
- fraudcrawler/scraping/search.py +370 -110
- fraudcrawler/scraping/url.py +42 -3
- fraudcrawler/scraping/zyte.py +146 -80
- fraudcrawler/settings.py +22 -10
- fraudcrawler-0.7.22.dist-info/METADATA +173 -0
- fraudcrawler-0.7.22.dist-info/RECORD +23 -0
- fraudcrawler/processing/processor.py +0 -199
- fraudcrawler-0.5.0.dist-info/METADATA +0 -167
- fraudcrawler-0.5.0.dist-info/RECORD +0 -22
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.22.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.22.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.22.dist-info}/entry_points.txt +0 -0
@@ -1,34 +1,32 @@
 from abc import ABC, abstractmethod
 import asyncio
 import logging
-from typing import cast, Dict, List
+from typing import cast, Dict, List
 
-
-import httpx
+import re
 
 from fraudcrawler.settings import (
-
+    EXACT_MATCH_PRODUCT_FIELDS,
+    EXACT_MATCH_FIELD_SEPARATOR,
 )
 from fraudcrawler.settings import (
-
-
+    DEFAULT_N_SRCH_WKRS,
+    DEFAULT_N_CNTX_WKRS,
     DEFAULT_N_PROC_WKRS,
 )
 from fraudcrawler.base.base import (
-    Deepness,
     Host,
     Language,
     Location,
-
+    Deepness,
     ProductItem,
-    HttpxAsyncClient,
 )
 from fraudcrawler import (
-
+    Searcher,
     SearchEngineName,
     Enricher,
-    URLCollector,
     ZyteAPI,
+    URLCollector,
     Processor,
 )
 
@@ -36,16 +34,17 @@ logger = logging.getLogger(__name__)
 
 
 class Orchestrator(ABC):
-    """Abstract base class for orchestrating the different actors (
+    """Abstract base class for orchestrating the different actors (scraping, processing).
+
+    Any subclass of :class:`Orchestrator` orchestrates the complete pipeline: search,
+    deduplication, context extraction, processing (classification), and result collection.
 
     Abstract methods:
         _collect_results: Collects the results from the given queue_in.
+            This function is responsible for collecting and handling the results from the given queue_in. It might
+            save the results to a file, a database, or any other storage.
 
-
-            This function is responsible for collecting and handling the results from the given queue_in. It might
-            save the results to a file, a database, or any other storage.
-
-    For each pipeline step class:`Orchestrator` will deploy a number of async workers to handle the tasks.
+    For each pipeline step :class:`Orchestrator` will deploy a number of async workers to handle the tasks.
     In addition it makes sure to orchestrate the canceling of the workers only after the relevant workload is done.
 
     For more information on the orchestrating pattern see README.md.
@@ -53,94 +52,43 @@ class Orchestrator(ABC):
 
     def __init__(
         self,
-
-
-
-
-
-
-
-        n_zyte_wkrs: int = DEFAULT_N_ZYTE_WKRS,
+        searcher: Searcher,
+        enricher: Enricher,
+        url_collector: URLCollector,
+        zyteapi: ZyteAPI,
+        processor: Processor,
+        n_srch_wkrs: int = DEFAULT_N_SRCH_WKRS,
+        n_cntx_wkrs: int = DEFAULT_N_CNTX_WKRS,
         n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
-        # Configure a custom httpx client.
-        # We provide a `HttpxAsyncClient` class that you can pass
-        # to retain the default values we use for `limits`, `timeout` & `follow_redirects`.
-        http_client: httpx.AsyncClient | None = None,
     ):
         """Initializes the orchestrator with the given settings.
 
-        NOTE:
-            The class:`Orchestrator` must be used as context manager as follows:
-                async with Orchestrator(...) as orchestrator:
-                    await orchestrator.run()
-
         Args:
-
-
-
-
-
-
-
-            n_zyte_wkrs: Number of async workers for zyte (optional).
+            searcher: Client for searching step.
+            enricher: Client for enrichment step.
+            url_collector: Client for deduplication.
+            zyteapi: Client for metadata extraction.
+            processor: Client for product classification.
+            n_srch_wkrs: Number of async workers for the search (optional).
+            n_cntx_wkrs: Number of async workers for context extraction (optional).
             n_proc_wkrs: Number of async workers for the processor (optional).
-            http_client: An httpx.AsyncClient to use for the async requests (optional).
         """
 
-        #
-        self.
-        self.
-        self.
-        self.
-        self.
-        self._openai_model = openai_model
+        # Pipeline clients
+        self._searcher = searcher
+        self._enricher = enricher
+        self._url_collector = url_collector
+        self._zyteapi = zyteapi
+        self._processor = processor
 
         # Setup the async framework
-        self.
-        self.
+        self._n_srch_wkrs = n_srch_wkrs
+        self._n_cntx_wkrs = n_cntx_wkrs
         self._n_proc_wkrs = n_proc_wkrs
         self._queues: Dict[str, asyncio.Queue] | None = None
         self._workers: Dict[str, List[asyncio.Task] | asyncio.Task] | None = None
 
-
-        self._http_client = http_client
-        self._owns_http_client = http_client is None
-
-    async def __aenter__(self) -> Self:
-        """Creates and starts an httpx.AsyncClient if not provided."""
-        if self._http_client is None:
-            logger.debug("Creating a new httpx.AsyncClient owned by the orchestrator")
-            self._http_client = HttpxAsyncClient()
-            self._owns_http_client = True
-
-        # Setup the clients
-        self._search = Search(
-            http_client=self._http_client, serpapi_key=self._serpapi_key
-        )
-        self._enricher = Enricher(
-            http_client=self._http_client,
-            user=self._dataforseo_user,
-            pwd=self._dataforseo_pwd,
-        )
-        self._url_collector = URLCollector()
-        self._zyteapi = ZyteAPI(
-            http_client=self._http_client, api_key=self._zyteapi_key
-        )
-        self._processor = Processor(
-            http_client=self._http_client,
-            api_key=self._openaiapi_key,
-            model=self._openai_model,
-        )
-        return self
-
-    async def __aexit__(self, *args, **kwargs) -> None:
-        """Closes the httpx.AsyncClient if it was created by this orchestrator."""
-        if self._owns_http_client and self._http_client is not None:
-            logger.debug("Closing the httpx.AsyncClient owned by the orchestrator")
-            await self._http_client.aclose()
-            self._http_client = None
-
-    async def _serp_execute(
+    async def _srch_execute(
         self,
         queue_in: asyncio.Queue[dict | None],
         queue_out: asyncio.Queue[ProductItem | None],
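The `__init__` rewrite above moves the orchestrator to dependency injection: the search, enrichment, Zyte and processing clients that `__aenter__` used to build from API keys are now constructed by the caller and passed in, and the `async with` requirement disappears. A rough sketch of the new wiring (MyOrchestrator stands in for a hypothetical concrete subclass implementing `_collect_results`; the client constructor arguments are placeholders, since those signatures live in other files of this release and are not shown in this diff):

from fraudcrawler import Searcher, Enricher, URLCollector, ZyteAPI, Processor

# Placeholder arguments only: the real constructor parameters (API keys, model,
# workflows) belong to the client classes and are not part of this diff.
orchestrator = MyOrchestrator(
    searcher=Searcher(...),
    enricher=Enricher(...),
    url_collector=URLCollector(),
    zyteapi=ZyteAPI(...),
    processor=Processor(...),
    n_srch_wkrs=4,
    n_cntx_wkrs=8,
    n_proc_wkrs=8,
)
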
@@ -158,17 +106,14 @@ class Orchestrator(ABC):
                 break
 
             try:
+                # Execute the search
                 search_term_type = item.pop("search_term_type")
-
-                search_engines = item.pop("search_engines")
-
-                results = await self._search.apply(
-                    **item, search_engines=search_engines
-                )
-
+                results = await self._searcher.apply(**item)
                 logger.debug(
                     f"Search for {item['search_term']} returned {len(results)} results"
                 )
+
+                # Create ProductItems for each result
                 for res in results:
                     product = ProductItem(
                         search_term=item["search_term"],
@@ -181,8 +126,11 @@ class Orchestrator(ABC):
                         filtered_at_stage=res.filtered_at_stage,
                     )
                     await queue_out.put(product)
-            except Exception
-                logger.error(
+            except Exception:
+                logger.error(
+                    f"Running search failed with item={item}",
+                    exc_info=True,
+                )
             queue_in.task_done()
 
     async def _collect_url(
@@ -203,31 +151,12 @@ class Orchestrator(ABC):
                 break
 
             if not product.filtered:
-
-                url = self._url_collector.remove_tracking_parameters(product.url)
-                product.url = url
-
-                if url in self._url_collector.collected_currently:
-                    # deduplicate on current run
-                    product.filtered = True
-                    product.filtered_at_stage = (
-                        "URL collection (current run deduplication)"
-                    )
-                    logger.debug(f"URL {url} already collected in current run")
-                elif url in self._url_collector.collected_previously:
-                    # deduplicate on previous runs coming from a db
-                    product.filtered = True
-                    product.filtered_at_stage = (
-                        "URL collection (previous run deduplication)"
-                    )
-                    logger.debug(f"URL {url} as already collected in previous run")
-                else:
-                    self._url_collector.collected_currently.add(url)
+                product = await self._url_collector.apply(product=product)
 
             await queue_out.put(product)
             queue_in.task_done()
 
-    async def
+    async def _cntx_execute(
         self,
         queue_in: asyncio.Queue[ProductItem | None],
         queue_out: asyncio.Queue[ProductItem | None],
@@ -246,45 +175,34 @@ class Orchestrator(ABC):
 
             if not product.filtered:
                 try:
-                    # Fetch the product
+                    # Fetch and enrich the product context from Zyte API
                     details = await self._zyteapi.details(url=product.url)
-
-
-                    product.url_resolved = url_resolved
-                    product.product_name = self._zyteapi.extract_product_name(
-                        details=details
+                    product = self._zyteapi.enrich_context(
+                        product=product, details=details
                     )
 
-                    # If the resolved URL is different from the original URL, we also need to update the domain as
-                    # otherwise the unresolved domain will be shown, for example for unresolved domain toppreis.ch but resolved digitec.ch
-                    if url_resolved and url_resolved != product.url:
-                        logger.debug(
-                            f"URL resolved for {product.url} is {url_resolved}"
-                        )
-                        product.domain = self._search._get_domain(url_resolved)
-
-                    product.product_price = self._zyteapi.extract_product_price(
-                        details=details
-                    )
-                    product.product_description = (
-                        self._zyteapi.extract_product_description(details=details)
-                    )
-                    product.product_images = self._zyteapi.extract_image_urls(
-                        details=details
-                    )
-                    product.probability = self._zyteapi.extract_probability(
-                        details=details
-                    )
-                    product.html = self._zyteapi.extract_html(details=details)
-                    if product.html:
-                        soup = BeautifulSoup(product.html, "html.parser")
-                        product.html_clean = soup.get_text(separator=" ", strip=True)
                     # Filter the product based on the probability threshold
                     if not self._zyteapi.keep_product(details=details):
                         product.filtered = True
-                        product.filtered_at_stage =
-
-
+                        product.filtered_at_stage = (
+                            "Context (Zyte probability threshold)"
+                        )
+
+                    # Check for exact match inside the full product context
+                    product = self._check_exact_search(product=product)
+                    if (
+                        not product.filtered
+                        and product.exact_search
+                        and not product.exact_search_match
+                    ):
+                        product.filtered = True
+                        product.filtered_at_stage = "Context (exact search)"
+
+                except Exception:
+                    logger.error(
+                        f"Running Zyte API search failed for product with url={product.url_resolved}",
+                        exc_info=True,
+                    )
             await queue_out.put(product)
             queue_in.task_done()
 
@@ -292,14 +210,12 @@ class Orchestrator(ABC):
         self,
         queue_in: asyncio.Queue[ProductItem | None],
         queue_out: asyncio.Queue[ProductItem | None],
-        prompts: List[Prompt],
     ) -> None:
         """Collects the product details from the queue_in, processes them (filtering, relevance, etc.) and puts the results into queue_out.
 
         Args:
             queue_in: The input queue containing the product details.
             queue_out: The output queue to put the processed product details.
-            prompts: The list of prompts to use for classification.
         """
 
         # Process the products
@@ -312,22 +228,12 @@ class Orchestrator(ABC):
 
             if not product.filtered:
                 try:
-                    # Run
-
-
-
-
-
-                        product.classifications[prompt.name] = int(
-                            classification.result
-                        )
-                        product.usage[prompt.name] = {
-                            "input_tokens": classification.input_tokens,
-                            "output_tokens": classification.output_tokens,
-                        }
-                except Exception as e:
-                    logger.warning(
-                        f"Error processing product with url={product.url}: {e}."
+                    # Run the configured workflows
+                    product = await self._processor.run(product=product)
+                except Exception:
+                    logger.error(
+                        f"Processing product with url={product.url_resolved} failed",
+                        exc_info=True,
                     )
 
             await queue_out.put(product)
@@ -346,52 +252,50 @@ class Orchestrator(ABC):
 
     def _setup_async_framework(
         self,
-
-
+        n_srch_wkrs: int,
+        n_cntx_wkrs: int,
         n_proc_wkrs: int,
-        prompts: List[Prompt],
     ) -> None:
         """Sets up the necessary queues and workers for the async framework.
 
         Args:
-
-
-            n_proc_wkrs: Number of async workers for
-            prompts: The list of prompts used for the classification by func:`Processor.classify`.
+            n_srch_wkrs: Number of async workers for search.
+            n_cntx_wkrs: Number of async workers for context extraction.
+            n_proc_wkrs: Number of async workers for processing.
         """
 
         # Setup the input/output queues for the workers
-
+        srch_queue: asyncio.Queue[dict | None] = asyncio.Queue()
         url_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
-
+        cntx_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
         proc_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
         res_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
 
-        # Setup the
-
+        # Setup the Search workers
+        srch_wkrs = [
             asyncio.create_task(
-                self.
-                    queue_in=
+                self._srch_execute(
+                    queue_in=srch_queue,
                     queue_out=url_queue,
                 )
             )
-            for _ in range(
+            for _ in range(n_srch_wkrs)
         ]
 
         # Setup the URL collector
         url_col = asyncio.create_task(
-            self._collect_url(queue_in=url_queue, queue_out=
+            self._collect_url(queue_in=url_queue, queue_out=cntx_queue)
         )
 
-        # Setup the
-
+        # Setup the context extraction workers
+        cntx_wkrs = [
             asyncio.create_task(
-                self.
-                    queue_in=
+                self._cntx_execute(
+                    queue_in=cntx_queue,
                     queue_out=proc_queue,
                 )
             )
-            for _ in range(
+            for _ in range(n_cntx_wkrs)
         ]
 
         # Setup the processing workers
@@ -400,7 +304,6 @@ class Orchestrator(ABC):
                 self._proc_execute(
                     queue_in=proc_queue,
                     queue_out=res_queue,
-                    prompts=prompts,
                 )
             )
             for _ in range(n_proc_wkrs)
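The `_setup_async_framework` changes keep the same fan-out pattern the class already used, just with the renamed srch/cntx stages: each stage gets an `asyncio.Queue` plus N worker tasks that exit on a `None` sentinel, and `run()` later pushes one sentinel per worker, gathers the tasks, then joins the queue. A self-contained sketch of that pattern (generic names, not the package's API):

import asyncio


async def worker(queue_in: asyncio.Queue, queue_out: asyncio.Queue) -> None:
    """Consume items until a None sentinel arrives, forward results downstream."""
    while True:
        item = await queue_in.get()
        if item is None:
            queue_in.task_done()
            break
        await queue_out.put(item * 2)  # stand-in for the real stage work
        queue_in.task_done()


async def main() -> None:
    n_workers = 3
    queue_in: asyncio.Queue = asyncio.Queue()
    queue_out: asyncio.Queue = asyncio.Queue()

    workers = [
        asyncio.create_task(worker(queue_in, queue_out)) for _ in range(n_workers)
    ]

    for item in range(10):
        await queue_in.put(item)

    # One sentinel per worker so every task sees its own shutdown signal.
    for _ in range(n_workers):
        await queue_in.put(None)

    # Wait for the workers, then make sure every item was task_done()'d.
    results = await asyncio.gather(*workers, return_exceptions=True)
    for i, res in enumerate(results):
        if isinstance(res, Exception):
            print(f"Error in worker {i}: {res}")
    await queue_in.join()

    print(queue_out.qsize(), "items forwarded downstream")


asyncio.run(main())
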
@@ -411,26 +314,26 @@ class Orchestrator(ABC):
 
         # Add the setup to the instance variables
         self._queues = {
-            "
+            "srch": srch_queue,
             "url": url_queue,
-            "
+            "cntx": cntx_queue,
             "proc": proc_queue,
             "res": res_queue,
         }
         self._workers = {
-            "
+            "srch": srch_wkrs,
             "url": url_col,
-            "
+            "cntx": cntx_wkrs,
             "proc": proc_wkrs,
             "res": res_col,
         }
 
     @staticmethod
-    async def
+    async def _add_search_items_for_search_term(
         queue: asyncio.Queue[dict | None],
         search_term: str,
         search_term_type: str,
-
+        search_engine: SearchEngineName,
         language: Language,
         location: Location,
         num_results: int,
@@ -441,17 +344,17 @@ class Orchestrator(ABC):
         item = {
             "search_term": search_term,
             "search_term_type": search_term_type,
-            "
+            "search_engine": search_engine,
             "language": language,
             "location": location,
             "num_results": num_results,
             "marketplaces": marketplaces,
             "excluded_urls": excluded_urls,
         }
-        logger.debug(f'Adding item="{item}" to
+        logger.debug(f'Adding item="{item}" to srch_queue')
         await queue.put(item)
 
-    async def
+    async def _add_srch_items(
         self,
         queue: asyncio.Queue[dict | None],
         search_term: str,
@@ -462,7 +365,23 @@ class Orchestrator(ABC):
         marketplaces: List[Host] | None,
         excluded_urls: List[Host] | None,
     ) -> None:
-        """Adds all the (enriched) search_term (as
+        """Adds all the (enriched) search_term (as srch items) to the queue.
+
+        One item consists of the following parameters:
+            - search_term: The search term for the query.
+            - search_term_type: The type of the search term (initial or enriched).
+            - search_engines: The search engines to use for the query.
+            - language: The language to use for the query.
+            - location: The location to use for the query.
+            - num_results: The number of results to return.
+            - marketplaces: The marketplaces to include in the search.
+            - excluded_urls: The URLs to exclude from the search.
+
+        For constructing such items we essentially have two loops:
+            for each search_term (initial + enriched)
+                for each search_engine
+                    add item to queue
+        """
         common_kwargs = {
             "queue": queue,
             "language": language,
@@ -471,14 +390,15 @@ class Orchestrator(ABC):
             "excluded_urls": excluded_urls,
         }
 
-        # Add initial items to the
-
-
-
-
-
-
-
+        # Add initial items to the queue
+        for se in search_engines:
+            await self._add_search_items_for_search_term(
+                search_term=search_term,
+                search_term_type="initial",
+                search_engine=se,
+                num_results=deepness.num_results,
+                **common_kwargs,  # type: ignore[arg-type]
+            )
 
         # Enrich the search_terms
         enrichment = deepness.enrichment
@@ -492,15 +412,84 @@ class Orchestrator(ABC):
                 n_terms=n_terms,
             )
 
-            # Add the enriched search terms to the
+            # Add the enriched search terms to the queue
             for trm in terms:
-
-
-
-
-
-
+                for se in search_engines:
+                    await self._add_search_items_for_search_term(
+                        search_term=trm,
+                        search_term_type="enriched",
+                        search_engine=se,
+                        num_results=enrichment.additional_urls_per_term,
+                        **common_kwargs,  # type: ignore[arg-type]
+                    )
+
+    @staticmethod
+    def _is_exact_search(search_term: str) -> bool:
+        """Check if the search term is an exact search (contains double quotation marks).
+
+        Args:
+            search_term: The search term to check.
+        """
+        return '"' in search_term
+
+    @staticmethod
+    def _extract_exact_search_terms(search_term: str) -> list[str]:
+        """Extract all exact search terms from within double quotation marks (empty if no quotes found).
+
+        Args:
+            search_term: The search term that may contain double quotation marks.
+        """
+        # Find all double-quoted strings
+        double_quote_matches = re.findall(r'"([^"]*)"', search_term)
+        return double_quote_matches
+
+    @staticmethod
+    def _check_exact_search_terms_match(
+        product: ProductItem,
+        exact_search_terms: list[str],
+    ) -> bool:
+        """Check if the product, represented by a string of selected attributes, matches ALL of the exact search terms.
+
+        Args:
+            product: The product item.
+            exact_search_terms: List of exact search terms to match against.
+        """
+        field_values = [
+            str(val)
+            for fld in EXACT_MATCH_PRODUCT_FIELDS
+            if (val := getattr(product, fld, None)) is not None
+        ]
+        product_str_lower = EXACT_MATCH_FIELD_SEPARATOR.join(field_values).lower()
+
+        return all(
+            re.search(re.escape(est.lower()), product_str_lower)
+            for est in exact_search_terms
+        )
+
+    def _check_exact_search(self, product: ProductItem) -> ProductItem:
+        """Checks if the search term requests an exact search and if yes, checks for conformity."""
+        # Check for exact search and apply regex matching
+        exact_search = self._is_exact_search(product.search_term)
+        product.exact_search = exact_search
+
+        # Only set exact_search_match if this was an exact search (contains quotes)
+        if exact_search:
+            exact_search_terms = self._extract_exact_search_terms(product.search_term)
+            if exact_search_terms:
+                product.exact_search_match = self._check_exact_search_terms_match(
+                    product=product, exact_search_terms=exact_search_terms
+                )
+                logger.debug(
+                    f"Exact search terms {exact_search_terms} matched: {product.exact_search_match} "
+                    f"for offer with url={product.url}"
+                )
+            else:
+                logger.warning(
+                    f"is_exact_search=True but no exact search terms found in search_term='{product.search_term}' "
+                    f"for offer with url={product.url}"
                 )
+        # If exact_search is False, product.exact_search_match remains False (default value)
+        return product
 
     async def run(
         self,
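The new exact-search helpers above implement a simple contract: every double-quoted phrase in the search term must appear (case-insensitively) in the concatenation of selected product fields, otherwise the offer gets filtered at the context stage. A standalone illustration of the same matching logic (plain functions, not the class methods; the joined fields here are examples, the real field list comes from EXACT_MATCH_PRODUCT_FIELDS in settings):

import re


def extract_exact_terms(search_term: str) -> list[str]:
    """Return every double-quoted phrase in the search term."""
    return re.findall(r'"([^"]*)"', search_term)


def matches_all_terms(product_fields: list[str], exact_terms: list[str]) -> bool:
    """True only if every quoted phrase occurs in the joined product fields."""
    haystack = " ".join(product_fields).lower()
    return all(re.search(re.escape(term.lower()), haystack) for term in exact_terms)


search_term = 'charger "acme ultra" "fast charge"'
terms = extract_exact_terms(search_term)          # ['acme ultra', 'fast charge']

fields = ["Acme Ultra travel charger", "Supports fast charge up to 30 W"]
print(matches_all_terms(fields, terms))           # True: both phrases found

fields = ["Acme basic charger"]
print(matches_all_terms(fields, terms))           # False: 'acme ultra' missing
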
@@ -509,12 +498,11 @@ class Orchestrator(ABC):
         language: Language,
         location: Location,
         deepness: Deepness,
-        prompts: List[Prompt],
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
         previously_collected_urls: List[str] | None = None,
     ) -> None:
-        """Runs the pipeline steps:
+        """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.
 
         Args:
             search_term: The search term for the query.
@@ -522,7 +510,6 @@ class Orchestrator(ABC):
             language: The language to use for the query.
             location: The location to use for the query.
             deepness: The search depth and enrichment details.
-            prompts: The list of prompt to use for classification.
             marketplaces: The marketplaces to include in the search.
             excluded_urls: The URLs to exclude from the search.
             previously_collected_urls: The urls that have been collected previously and are ignored.
@@ -530,7 +517,7 @@ class Orchestrator(ABC):
         # ---------------------------
         # INITIAL SETUP
         # ---------------------------
-        # Ensure we have at least one search engine
+        # Ensure we have at least one search engine (the list might be empty)
         if not search_engines:
             logger.warning(
                 "No search engines specified, using all available search engines"
@@ -538,25 +525,24 @@ class Orchestrator(ABC):
             search_engines = list(SearchEngineName)
 
         # Handle previously collected URLs
-        if previously_collected_urls:
-            self._url_collector.
+        if pcurls := previously_collected_urls:
+            self._url_collector.add_previously_collected_urls(urls=pcurls)
 
         # Setup the async framework
         n_terms_max = 1 + (
             deepness.enrichment.additional_terms if deepness.enrichment else 0
         )
-
-
+        n_srch_wkrs = min(self._n_srch_wkrs, n_terms_max)
+        n_cntx_wkrs = min(self._n_cntx_wkrs, deepness.num_results)
         n_proc_wkrs = min(self._n_proc_wkrs, deepness.num_results)
 
         logger.debug(
-            f"setting up async framework (#workers:
+            f"setting up async framework (#workers: srch={n_srch_wkrs}, cntx={n_cntx_wkrs}, proc={n_proc_wkrs})"
         )
         self._setup_async_framework(
-
-
+            n_srch_wkrs=n_srch_wkrs,
+            n_cntx_wkrs=n_cntx_wkrs,
             n_proc_wkrs=n_proc_wkrs,
-            prompts=prompts,
         )
 
         # Check setup of async framework
@@ -564,21 +550,21 @@ class Orchestrator(ABC):
             raise ValueError(
                 "Async framework is not setup. Please call _setup_async_framework() first."
             )
-        if not all([k in self._queues for k in ["
+        if not all([k in self._queues for k in ["srch", "url", "cntx", "proc", "res"]]):
             raise ValueError(
                 "The queues of the async framework are not setup correctly."
             )
         if not all(
-            [k in self._workers for k in ["
+            [k in self._workers for k in ["srch", "url", "cntx", "proc", "res"]]
         ):
             raise ValueError(
                 "The workers of the async framework are not setup correctly."
             )
 
-        # Add the search items to the
-
-        await self.
-            queue=
+        # Add the search items to the srch_queue
+        srch_queue = self._queues["srch"]
+        await self._add_srch_items(
+            queue=srch_queue,
             search_term=search_term,
             search_engines=search_engines,
             language=language,
@@ -588,26 +574,29 @@ class Orchestrator(ABC):
             excluded_urls=excluded_urls,
         )
 
-        #
-        #
-        #
-        # Add the sentinels to the
-        for _ in range(
-            await
+        # -----------------------------
+        # ORCHESTRATE SEARCH WORKERS
+        # -----------------------------
+        # Add the sentinels to the srch_queue
+        for _ in range(n_srch_wkrs):
+            await srch_queue.put(None)
 
-        # Wait for the
-
+        # Wait for the srch workers to be concluded before adding the sentinels to the url_queue
+        srch_workers = self._workers["srch"]
         try:
-            logger.debug("Waiting for
-
-            for i, res in enumerate(
+            logger.debug("Waiting for srch_workers to conclude their tasks...")
+            srch_res = await asyncio.gather(*srch_workers, return_exceptions=True)
+            for i, res in enumerate(srch_res):
                 if isinstance(res, Exception):
-                    logger.error(f"Error in
-            logger.debug("...
-        except Exception
-            logger.error(
+                    logger.error(f"Error in srch_worker {i}: {res}")
+            logger.debug("...srch_workers concluded their tasks")
+        except Exception:
+            logger.error(
+                "Gathering srch_workers failed",
+                exc_info=True,
+            )
         finally:
-            await
+            await srch_queue.join()
 
         # ---------------------------
         # ORCHESTRATE URL COLLECTOR
@@ -616,38 +605,44 @@ class Orchestrator(ABC):
         url_queue = self._queues["url"]
         await url_queue.put(None)
 
-        # Wait for the url_collector to be concluded before adding the sentinels to the
+        # Wait for the url_collector to be concluded before adding the sentinels to the cntx_queue
         url_collector = cast(asyncio.Task, self._workers["url"])
         try:
             logger.debug("Waiting for url_collector to conclude its tasks...")
             await url_collector
             logger.debug("...url_collector concluded its tasks")
-        except Exception
-            logger.error(
+        except Exception:
+            logger.error(
+                "Gathering url_collector failed",
+                exc_info=True,
+            )
         finally:
             await url_queue.join()
 
-        #
-        # ORCHESTRATE
-        #
-        # Add the sentinels to the
-
-        for _ in range(
-            await
+        # -----------------------------
+        # ORCHESTRATE CONTEXT WORKERS
+        # -----------------------------
+        # Add the sentinels to the cntx_queue
+        cntx_queue = self._queues["cntx"]
+        for _ in range(n_cntx_wkrs):
+            await cntx_queue.put(None)
 
-        # Wait for the
-
+        # Wait for the cntx_workers to be concluded before adding the sentinels to the proc_queue
+        cntx_workers = self._workers["cntx"]
         try:
-            logger.debug("Waiting for
-
-            for i, res in enumerate(
+            logger.debug("Waiting for cntx_workers to conclude their tasks...")
+            cntx_res = await asyncio.gather(*cntx_workers, return_exceptions=True)
+            for i, res in enumerate(cntx_res):
                 if isinstance(res, Exception):
-                    logger.error(f"Error in
-            logger.debug("...
-        except Exception
-            logger.error(
+                    logger.error(f"Error in cntx_worker {i}: {res}")
+            logger.debug("...cntx_workers concluded their tasks")
+        except Exception:
+            logger.error(
+                "Gathering cntx_workers failed",
+                exc_info=True,
+            )
         finally:
-            await
+            await cntx_queue.join()
 
         # ---------------------------
         # ORCHESTRATE PROC WORKERS
@@ -666,8 +661,11 @@ class Orchestrator(ABC):
                 if isinstance(res, Exception):
                     logger.error(f"Error in proc_worker {i}: {res}")
             logger.debug("...proc_workers concluded their tasks")
-        except Exception
-            logger.error(
+        except Exception:
+            logger.error(
+                "Gathering proc_workers failed",
+                exc_info=True,
+            )
         finally:
             await proc_queue.join()
 
@@ -684,8 +682,11 @@ class Orchestrator(ABC):
             logger.debug("Waiting for res_collector to conclude its tasks...")
             await res_collector
             logger.debug("...res_collector concluded its tasks")
-        except Exception
-            logger.error(
+        except Exception:
+            logger.error(
+                "Gathering res_collector failed",
+                exc_info=True,
+            )
         finally:
             await res_queue.join()
 
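Downstream of all of the above, `run()` loses its `prompts` argument: classification is presumably configured on the injected Processor (the new fraudcrawler/processing/openai.py in this release) rather than passed per run. A hedged sketch of the new call, assuming `orchestrator`, `language`, `location` and `deepness` were built with the same types as before; `SearchEngineName.GOOGLE` is an assumed enum member name, and the literal arguments are examples only:

from fraudcrawler import SearchEngineName


async def launch(orchestrator, language, location, deepness) -> None:
    # search_term / previously_collected_urls below are illustrative values.
    await orchestrator.run(
        search_term='charger "acme ultra"',
        search_engines=[SearchEngineName.GOOGLE],
        language=language,
        location=location,
        deepness=deepness,
        # prompts=... no longer exists in 0.7.22; prompt configuration moves to
        # the Processor passed into the constructor.
        previously_collected_urls=["https://example.com/already-seen"],
    )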