fraudcrawler 0.5.9__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of fraudcrawler might be problematic.
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/PKG-INFO +4 -3
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/README.md +1 -1
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/__init__.py +2 -2
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/base/base.py +11 -32
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/base/client.py +1 -1
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/base/orchestrator.py +135 -135
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/base/retry.py +12 -6
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/launch_demo_pipeline.py +1 -1
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/processing/processor.py +3 -3
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/scraping/search.py +352 -125
- fraudcrawler-0.6.1/fraudcrawler/scraping/url.py +96 -0
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/scraping/zyte.py +15 -1
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/settings.py +13 -3
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/pyproject.toml +1 -1
- fraudcrawler-0.5.9/fraudcrawler/scraping/url.py +0 -57
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/LICENSE +0 -0
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/scraping/enrich.py +0 -0
{fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.5.9
+Version: 0.6.1
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
@@ -11,6 +11,7 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
 Requires-Dist: httpx (>=0.28.1,<0.29.0)
 Requires-Dist: openai (>=1.68.2,<2.0.0)
@@ -160,7 +161,7 @@ see `CONTRIBUTING.md`
 ### Async Setup
 The `Orchestrator` class in `src/base/orchestrator.py` is designed to coordinate multiple services that may have interdependencies, allowing them to run in a semi-iterative manner. This means, for example, that product A can be at stage III of the pipeline while product B is still at stage I.
 
-This behavior is enabled through an asynchronous pipeline setup. The three main steps, `
+This behavior is enabled through an asynchronous pipeline setup. The three main steps, `Search`, `Context Extraction`, and `Processing`, all utilize `httpx.AsyncClient`. It is both possible and highly recommended to manage a single AsyncClient instance per application for efficiency. We provide a `HttpxAsyncClient` class that you can pass For more details, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
 
 The following image provides a schematic representation of the package's async setup.
 
{fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/README.md

@@ -137,7 +137,7 @@ see `CONTRIBUTING.md`
 ### Async Setup
 The `Orchestrator` class in `src/base/orchestrator.py` is designed to coordinate multiple services that may have interdependencies, allowing them to run in a semi-iterative manner. This means, for example, that product A can be at stage III of the pipeline while product B is still at stage I.
 
-This behavior is enabled through an asynchronous pipeline setup. The three main steps, `
+This behavior is enabled through an asynchronous pipeline setup. The three main steps, `Search`, `Context Extraction`, and `Processing`, all utilize `httpx.AsyncClient`. It is both possible and highly recommended to manage a single AsyncClient instance per application for efficiency. We provide a `HttpxAsyncClient` class that you can pass For more details, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
 
 The following image provides a schematic representation of the package's async setup.
 
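The README paragraph above recommends a single `AsyncClient` per application. A minimal sketch of that usage, assuming `FraudCrawlerClient` is importable from `fraudcrawler/base/client.py` and forwards the `http_client` argument documented in the `Orchestrator` hunks below; the API-key keyword names and the full argument list are illustrative, not taken from this diff:

```python
# Sketch only: the exact FraudCrawlerClient signature is not part of this diff,
# so the keyword arguments below are illustrative (other credentials may be required).
import asyncio

import httpx

from fraudcrawler.base.client import FraudCrawlerClient


async def main() -> None:
    # One AsyncClient per application, shared by the Search, Context Extraction,
    # and Processing steps, as the README recommends.
    async with httpx.AsyncClient(timeout=30.0) as http_client:
        crawler = FraudCrawlerClient(
            serpapi_key="...",
            zyteapi_key="...",
            openaiapi_key="...",
            http_client=http_client,
        )
        # run the pipeline with `crawler` here


asyncio.run(main())
```

The `HttpxAsyncClient` class mentioned in the README (and imported in `orchestrator.py` below) can presumably be passed in place of a plain `httpx.AsyncClient`.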
{fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/__init__.py

@@ -1,4 +1,4 @@
-from fraudcrawler.scraping.search import 
+from fraudcrawler.scraping.search import Searcher, SearchEngineName
 from fraudcrawler.scraping.enrich import Enricher
 from fraudcrawler.scraping.url import URLCollector
 from fraudcrawler.scraping.zyte import ZyteAPI
@@ -17,7 +17,7 @@ from fraudcrawler.base.base import (
 )
 
 __all__ = [
-    "
+    "Searcher",
     "SearchEngineName",
     "Enricher",
     "URLCollector",
{fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/base/base.py

@@ -1,6 +1,5 @@
 import json
 import logging
-import base64
 from pydantic import (
     BaseModel,
     Field,
@@ -12,6 +11,7 @@ from urllib.parse import urlparse
 import re
 from typing import Any, Dict, List
 
+
 import httpx
 
 from fraudcrawler.settings import (
@@ -133,7 +133,7 @@ class Deepness(BaseModel):
 class ProductItem(BaseModel):
     """Model representing a product item."""
 
-    #
+    # Search parameters
     search_term: str
     search_term_type: str
     url: str
@@ -141,7 +141,7 @@ class ProductItem(BaseModel):
     search_engine_name: str
     domain: str
 
-    #
+    # Context parameters
     product_name: str | None = None
     product_price: str | None = None
     product_description: str | None = None
@@ -217,6 +217,14 @@ class DomainUtils:
     """
 
     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
+    _headers = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Accept-Encoding": "gzip, deflate",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+    }
 
     def _get_domain(self, url: str) -> str:
         """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
@@ -242,32 +250,3 @@ class DomainUtils:
         if hostname and hostname.startswith("www."):
             hostname = hostname[4:]
         return hostname.lower()
-
-    async def _unblock_url(self, url: str, zyte_api: Any) -> bytes | None:
-        """Attempts to unblock a URL using Zyte proxy mode when direct access fails.
-
-        This method is specifically designed to handle 403 Forbidden errors for domains
-        that may be blocking requests from certain IP ranges (like cloud providers).
-
-        Args:
-            url: The URL to fetch using Zyte proxy mode.
-            zyte_api: An instance of ZyteAPI to use for the request.
-
-        Returns:
-            The HTML content as bytes if successful, None if failed.
-        """
-        try:
-            logger.info(f"Attempting to unblock URL using Zyte proxy: {url}")
-            details = await zyte_api.details(url)
-
-            if details and "httpResponseBody" in details:
-                html_content = base64.b64decode(details["httpResponseBody"])
-                logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
-                return html_content
-            else:
-                logger.warning(f"Zyte proxy request failed for URL: {url}")
-                return None
-
-        except Exception as e:
-            logger.error(f"Error unblocking URL with Zyte proxy: {url}, error: {e}")
-            return None
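Aside from dropping `_unblock_url` (and the now-unused `base64` import), this file only gains browser-style default headers on `DomainUtils`. For orientation, the hostname handling the class keeps is roughly the following, reconstructed only from the lines visible above; the elided middle of `_get_domain` may do more than this:

```python
# Standalone illustration of the hostname handling visible in the DomainUtils hunks above.
import re

_hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"


def get_domain_sketch(url: str) -> str:
    # Pull the host part out of the URL (scheme optional), then strip a leading "www.".
    match = re.match(_hostname_pattern, url)
    hostname = match.group(1) if match else ""
    if hostname and hostname.startswith("www."):
        hostname = hostname[4:]
    return hostname.lower()


print(get_domain_sketch("https://www.Example.com/p?id=1"))  # -> "example.com"
```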
{fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/base/client.py

@@ -103,7 +103,7 @@ class FraudCrawlerClient(Orchestrator):
         search_engines: List[SearchEngineName | str] | None = None,
         previously_collected_urls: List[str] | None = None,
     ) -> None:
-        """Runs the pipeline steps: 
+        """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.
 
         Args:
             search_term: The search term for the query.
{fraudcrawler-0.5.9 → fraudcrawler-0.6.1}/fraudcrawler/base/orchestrator.py

@@ -10,8 +10,8 @@ from fraudcrawler.settings import (
     PROCESSOR_DEFAULT_MODEL,
 )
 from fraudcrawler.settings import (
-
-
+    DEFAULT_N_SRCH_WKRS,
+    DEFAULT_N_CNTX_WKRS,
     DEFAULT_N_PROC_WKRS,
 )
 from fraudcrawler.base.base import (
@@ -24,7 +24,7 @@ from fraudcrawler.base.base import (
     HttpxAsyncClient,
 )
 from fraudcrawler import (
-
+    Searcher,
     SearchEngineName,
     Enricher,
     URLCollector,
@@ -59,8 +59,8 @@ class Orchestrator(ABC):
         zyteapi_key: str,
         openaiapi_key: str,
         openai_model: str = PROCESSOR_DEFAULT_MODEL,
-
-
+        n_srch_wkrs: int = DEFAULT_N_SRCH_WKRS,
+        n_cntx_wkrs: int = DEFAULT_N_CNTX_WKRS,
         n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
         # Configure a custom httpx client.
         # We provide a `HttpxAsyncClient` class that you can pass
@@ -81,8 +81,8 @@ class Orchestrator(ABC):
             zyteapi_key: The API key for Zyte API.
             openaiapi_key: The API key for OpenAI.
             openai_model: The model to use for the processing (optional).
-
-
+            n_srch_wkrs: Number of async workers for the search (optional).
+            n_cntx_wkrs: Number of async workers for context extraction (optional).
             n_proc_wkrs: Number of async workers for the processor (optional).
             http_client: An httpx.AsyncClient to use for the async requests (optional).
         """
@@ -96,8 +96,8 @@ class Orchestrator(ABC):
         self._openai_model = openai_model
 
         # Setup the async framework
-        self.
-        self.
+        self._n_srch_wkrs = n_srch_wkrs
+        self._n_cntx_wkrs = n_cntx_wkrs
         self._n_proc_wkrs = n_proc_wkrs
         self._queues: Dict[str, asyncio.Queue] | None = None
         self._workers: Dict[str, List[asyncio.Task] | asyncio.Task] | None = None
@@ -114,13 +114,10 @@ class Orchestrator(ABC):
         self._owns_http_client = True
 
         # Setup the clients
-        self.
-            http_client=self._http_client, api_key=self._zyteapi_key
-        )
-        self._search = Search(
+        self._searcher = Searcher(
             http_client=self._http_client,
             serpapi_key=self._serpapi_key,
-
+            zyteapi_key=self._zyteapi_key,
         )
         self._enricher = Enricher(
             http_client=self._http_client,
@@ -128,6 +125,10 @@ class Orchestrator(ABC):
             pwd=self._dataforseo_pwd,
         )
         self._url_collector = URLCollector()
+        self._zyteapi = ZyteAPI(
+            http_client=self._http_client,
+            api_key=self._zyteapi_key,
+        )
         self._processor = Processor(
             http_client=self._http_client,
             api_key=self._openaiapi_key,
@@ -142,7 +143,7 @@ class Orchestrator(ABC):
             await self._http_client.aclose()
             self._http_client = None
 
-    async def 
+    async def _srch_execute(
         self,
         queue_in: asyncio.Queue[dict | None],
         queue_out: asyncio.Queue[ProductItem | None],
@@ -160,17 +161,14 @@ class Orchestrator(ABC):
                 break
 
             try:
+                # Execute the search
                 search_term_type = item.pop("search_term_type")
-
-                search_engines = item.pop("search_engines")
-
-                results = await self._search.apply(
-                    **item, search_engines=search_engines
-                )
-
+                results = await self._searcher.apply(**item)
                 logger.debug(
                     f"Search for {item['search_term']} returned {len(results)} results"
                 )
+
+                # Create ProductItems for each result
                 for res in results:
                     product = ProductItem(
                         search_term=item["search_term"],
@@ -205,31 +203,12 @@ class Orchestrator(ABC):
                 break
 
             if not product.filtered:
-
-                url = self._url_collector.remove_tracking_parameters(product.url)
-                product.url = url
-
-                if url in self._url_collector.collected_currently:
-                    # deduplicate on current run
-                    product.filtered = True
-                    product.filtered_at_stage = (
-                        "URL collection (current run deduplication)"
-                    )
-                    logger.debug(f"URL {url} already collected in current run")
-                elif url in self._url_collector.collected_previously:
-                    # deduplicate on previous runs coming from a db
-                    product.filtered = True
-                    product.filtered_at_stage = (
-                        "URL collection (previous run deduplication)"
-                    )
-                    logger.debug(f"URL {url} as already collected in previous run")
-                else:
-                    self._url_collector.collected_currently.add(url)
+                product = await self._url_collector.apply(product=product)
 
             await queue_out.put(product)
             queue_in.task_done()
 
-    async def 
+    async def _cntx_execute(
         self,
         queue_in: asyncio.Queue[ProductItem | None],
         queue_out: asyncio.Queue[ProductItem | None],
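The inline deduplication removed in this hunk appears to have been consolidated into `URLCollector.apply` (the rewritten `fraudcrawler/scraping/url.py` is not included in this excerpt). Based only on the removed lines, the behaviour being consolidated is roughly the following sketch; the tracking-parameter list and any names not visible above are assumptions:

```python
# Rough sketch of the deduplication visible in the removed lines above; treat it
# as an assumption about what URLCollector.apply does in 0.6.1, not its source.
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse

TRACKING_PARAMS = {"utm_source", "utm_medium", "utm_campaign", "gclid"}  # illustrative set


class UrlDedupSketch:
    def __init__(self, previously_collected: set[str] | None = None) -> None:
        self.collected_currently: set[str] = set()
        self.collected_previously: set[str] = set(previously_collected or ())

    @staticmethod
    def remove_tracking_parameters(url: str) -> str:
        # Strip known tracking query parameters so equivalent URLs compare equal.
        parts = urlparse(url)
        query = [(k, v) for k, v in parse_qsl(parts.query) if k not in TRACKING_PARAMS]
        return urlunparse(parts._replace(query=urlencode(query)))

    def check(self, url: str) -> str | None:
        """Returns the stage at which the URL is filtered, or None if it is new."""
        url = self.remove_tracking_parameters(url)
        if url in self.collected_currently:
            return "URL collection (current run deduplication)"
        if url in self.collected_previously:
            return "URL collection (previous run deduplication)"
        self.collected_currently.add(url)
        return None
```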
@@ -248,7 +227,7 @@ class Orchestrator(ABC):
 
             if not product.filtered:
                 try:
-                    # Fetch the product
+                    # Fetch the product context from Zyte API
                     details = await self._zyteapi.details(url=product.url)
                     url_resolved = self._zyteapi.extract_url_resolved(details=details)
                     if url_resolved:
@@ -258,12 +237,13 @@ class Orchestrator(ABC):
                         )
 
                     # If the resolved URL is different from the original URL, we also need to update the domain as
-                    # otherwise the unresolved domain will be shown
+                    # otherwise the unresolved domain will be shown.
+                    # For example for an unresolved domain "toppreise.ch" but resolved "digitec.ch
                     if url_resolved and url_resolved != product.url:
                         logger.debug(
                             f"URL resolved for {product.url} is {url_resolved}"
                         )
-                        product.domain = self.
+                        product.domain = self._searcher._get_domain(url_resolved)
 
                     product.product_price = self._zyteapi.extract_product_price(
                         details=details
@@ -348,52 +328,52 @@ class Orchestrator(ABC):
 
     def _setup_async_framework(
         self,
-
-
+        n_srch_wkrs: int,
+        n_cntx_wkrs: int,
         n_proc_wkrs: int,
         prompts: List[Prompt],
     ) -> None:
         """Sets up the necessary queues and workers for the async framework.
 
         Args:
-
-
-            n_proc_wkrs: Number of async workers for 
+            n_srch_wkrs: Number of async workers for search.
+            n_cntx_wkrs: Number of async workers for context extraction.
+            n_proc_wkrs: Number of async workers for processing.
             prompts: The list of prompts used for the classification by func:`Processor.classify`.
         """
 
         # Setup the input/output queues for the workers
-
+        srch_queue: asyncio.Queue[dict | None] = asyncio.Queue()
         url_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
-
+        cntx_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
         proc_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
         res_queue: asyncio.Queue[ProductItem | None] = asyncio.Queue()
 
-        # Setup the 
-
+        # Setup the Search workers
+        srch_wkrs = [
             asyncio.create_task(
-                self.
-                    queue_in=
+                self._srch_execute(
+                    queue_in=srch_queue,
                     queue_out=url_queue,
                 )
             )
-            for _ in range(
+            for _ in range(n_srch_wkrs)
         ]
 
         # Setup the URL collector
         url_col = asyncio.create_task(
-            self._collect_url(queue_in=url_queue, queue_out=
+            self._collect_url(queue_in=url_queue, queue_out=cntx_queue)
         )
 
-        # Setup the 
-
+        # Setup the context extraction workers
+        cntx_wkrs = [
             asyncio.create_task(
-                self.
-                    queue_in=
+                self._cntx_execute(
+                    queue_in=cntx_queue,
                     queue_out=proc_queue,
                 )
             )
-            for _ in range(
+            for _ in range(n_cntx_wkrs)
         ]
 
         # Setup the processing workers
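The wiring above is the standard asyncio queue/worker pattern: one `asyncio.Queue` per stage, a small pool of worker tasks per stage, `None` sentinels for shutdown, and `queue.join()` to drain in-flight items (the sentinel handling appears in the `run` hunks further down). A minimal self-contained version of that pattern, independent of the fraudcrawler classes:

```python
import asyncio


async def worker(name: str, queue_in: asyncio.Queue, queue_out: asyncio.Queue) -> None:
    while True:
        item = await queue_in.get()
        if item is None:           # sentinel: stop this worker
            queue_in.task_done()
            break
        await queue_out.put(f"{name} handled {item}")
        queue_in.task_done()


async def main() -> None:
    stage_a: asyncio.Queue = asyncio.Queue()
    stage_b: asyncio.Queue = asyncio.Queue()

    n_workers = 2
    workers = [
        asyncio.create_task(worker(f"w{i}", stage_a, stage_b)) for i in range(n_workers)
    ]

    for item in ("x", "y", "z"):
        await stage_a.put(item)
    for _ in range(n_workers):     # one sentinel per worker
        await stage_a.put(None)

    results = await asyncio.gather(*workers, return_exceptions=True)
    for i, res in enumerate(results):
        if isinstance(res, Exception):
            print(f"worker {i} failed: {res}")
    await stage_a.join()           # every item (and sentinel) has been marked done

    while not stage_b.empty():
        print(stage_b.get_nowait())


asyncio.run(main())
```

This is why the `run` method below puts exactly one sentinel per worker into each queue before gathering the corresponding tasks and joining the queue.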
@@ -413,26 +393,26 @@ class Orchestrator(ABC):
 
         # Add the setup to the instance variables
         self._queues = {
-            "
+            "srch": srch_queue,
             "url": url_queue,
-            "
+            "cntx": cntx_queue,
             "proc": proc_queue,
             "res": res_queue,
         }
         self._workers = {
-            "
+            "srch": srch_wkrs,
             "url": url_col,
-            "
+            "cntx": cntx_wkrs,
             "proc": proc_wkrs,
             "res": res_col,
         }
 
     @staticmethod
-    async def 
+    async def _add_search_items_for_search_term(
         queue: asyncio.Queue[dict | None],
         search_term: str,
         search_term_type: str,
-
+        search_engine: SearchEngineName,
         language: Language,
         location: Location,
         num_results: int,
@@ -443,17 +423,17 @@ class Orchestrator(ABC):
         item = {
             "search_term": search_term,
             "search_term_type": search_term_type,
-            "
+            "search_engine": search_engine,
             "language": language,
             "location": location,
             "num_results": num_results,
             "marketplaces": marketplaces,
             "excluded_urls": excluded_urls,
         }
-        logger.debug(f'Adding item="{item}" to 
+        logger.debug(f'Adding item="{item}" to srch_queue')
         await queue.put(item)
 
-    async def 
+    async def _add_srch_items(
         self,
         queue: asyncio.Queue[dict | None],
         search_term: str,
@@ -464,7 +444,23 @@ class Orchestrator(ABC):
         marketplaces: List[Host] | None,
         excluded_urls: List[Host] | None,
     ) -> None:
-        """Adds all the (enriched) search_term (as 
+        """Adds all the (enriched) search_term (as srch items) to the queue.
+
+        One item consists of the following parameters:
+        - search_term: The search term for the query.
+        - search_term_type: The type of the search term (initial or enriched).
+        - search_engines: The search engines to use for the query.
+        - language: The language to use for the query.
+        - location: The location to use for the query.
+        - num_results: The number of results to return.
+        - marketplaces: The marketplaces to include in the search.
+        - excluded_urls: The URLs to exclude from the search.
+
+        For constructing such items we essentially have two loops:
+            for each search_term (initial + enriched)
+                for each search_engine
+                    add item to queue
+        """
         common_kwargs = {
             "queue": queue,
             "language": language,
@@ -473,14 +469,15 @@ class Orchestrator(ABC):
             "excluded_urls": excluded_urls,
         }
 
-        # Add initial items to the 
-
-
-
-
-
-
-
+        # Add initial items to the queue
+        for se in search_engines:
+            await self._add_search_items_for_search_term(
+                search_term=search_term,
+                search_term_type="initial",
+                search_engine=se,
+                num_results=deepness.num_results,
+                **common_kwargs,  # type: ignore[arg-type]
+            )
 
         # Enrich the search_terms
         enrichment = deepness.enrichment
@@ -494,15 +491,16 @@ class Orchestrator(ABC):
                 n_terms=n_terms,
             )
 
-            # Add the enriched search terms to the 
+            # Add the enriched search terms to the queue
             for trm in terms:
-
-
-
-
-
-
-
+                for se in search_engines:
+                    await self._add_search_items_for_search_term(
+                        search_term=trm,
+                        search_term_type="enriched",
+                        search_engine=se,
+                        num_results=enrichment.additional_urls_per_term,
+                        **common_kwargs,  # type: ignore[arg-type]
+                    )
 
     async def run(
         self,
@@ -516,7 +514,7 @@ class Orchestrator(ABC):
         excluded_urls: List[Host] | None = None,
         previously_collected_urls: List[str] | None = None,
     ) -> None:
-        """Runs the pipeline steps: 
+        """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.
 
         Args:
             search_term: The search term for the query.
@@ -541,22 +539,24 @@ class Orchestrator(ABC):
 
         # Handle previously collected URLs
         if previously_collected_urls:
-            self._url_collector.
+            self._url_collector.add_previously_collected_urls(
+                urls=previously_collected_urls
+            )
 
         # Setup the async framework
         n_terms_max = 1 + (
             deepness.enrichment.additional_terms if deepness.enrichment else 0
         )
-
-
+        n_srch_wkrs = min(self._n_srch_wkrs, n_terms_max)
+        n_cntx_wkrs = min(self._n_cntx_wkrs, deepness.num_results)
         n_proc_wkrs = min(self._n_proc_wkrs, deepness.num_results)
 
         logger.debug(
-            f"setting up async framework (#workers: 
+            f"setting up async framework (#workers: srch={n_srch_wkrs}, cntx={n_cntx_wkrs}, proc={n_proc_wkrs})"
         )
         self._setup_async_framework(
-
-
+            n_srch_wkrs=n_srch_wkrs,
+            n_cntx_wkrs=n_cntx_wkrs,
             n_proc_wkrs=n_proc_wkrs,
             prompts=prompts,
         )
@@ -566,21 +566,21 @@ class Orchestrator(ABC):
             raise ValueError(
                 "Async framework is not setup. Please call _setup_async_framework() first."
            )
-        if not all([k in self._queues for k in ["
+        if not all([k in self._queues for k in ["srch", "url", "cntx", "proc", "res"]]):
             raise ValueError(
                 "The queues of the async framework are not setup correctly."
             )
         if not all(
-            [k in self._workers for k in ["
+            [k in self._workers for k in ["srch", "url", "cntx", "proc", "res"]]
         ):
             raise ValueError(
                 "The workers of the async framework are not setup correctly."
             )
 
-        # Add the search items to the 
-
-        await self.
-            queue=
+        # Add the search items to the srch_queue
+        srch_queue = self._queues["srch"]
+        await self._add_srch_items(
+            queue=srch_queue,
             search_term=search_term,
             search_engines=search_engines,
             language=language,
@@ -590,26 +590,26 @@ class Orchestrator(ABC):
             excluded_urls=excluded_urls,
         )
 
-        # 
-        # 
-        # 
-        # Add the sentinels to the 
-        for _ in range(
-            await 
+        # -----------------------------
+        # ORCHESTRATE SEARCH WORKERS
+        # -----------------------------
+        # Add the sentinels to the srch_queue
+        for _ in range(n_srch_wkrs):
+            await srch_queue.put(None)
 
-        # Wait for the 
-
+        # Wait for the srch workers to be concluded before adding the sentinels to the url_queue
+        srch_workers = self._workers["srch"]
         try:
-            logger.debug("Waiting for 
-
-            for i, res in enumerate(
+            logger.debug("Waiting for srch_workers to conclude their tasks...")
+            srch_res = await asyncio.gather(*srch_workers, return_exceptions=True)
+            for i, res in enumerate(srch_res):
                 if isinstance(res, Exception):
-                    logger.error(f"Error in 
-            logger.debug("...
+                    logger.error(f"Error in srch_worker {i}: {res}")
+            logger.debug("...srch_workers concluded their tasks")
         except Exception as e:
-            logger.error(f"Gathering 
+            logger.error(f"Gathering srch_workers failed: {e}")
         finally:
-            await 
+            await srch_queue.join()
 
         # ---------------------------
         # ORCHESTRATE URL COLLECTOR
@@ -618,7 +618,7 @@ class Orchestrator(ABC):
         url_queue = self._queues["url"]
         await url_queue.put(None)
 
-        # Wait for the url_collector to be concluded before adding the sentinels to the 
+        # Wait for the url_collector to be concluded before adding the sentinels to the cntx_queue
         url_collector = cast(asyncio.Task, self._workers["url"])
         try:
             logger.debug("Waiting for url_collector to conclude its tasks...")
@@ -629,27 +629,27 @@ class Orchestrator(ABC):
         finally:
             await url_queue.join()
 
-        # 
-        # ORCHESTRATE 
-        # 
-        # Add the sentinels to the 
-
-        for _ in range(
-            await 
+        # -----------------------------
+        # ORCHESTRATE CONTEXT WORKERS
+        # -----------------------------
+        # Add the sentinels to the cntx_queue
+        cntx_queue = self._queues["cntx"]
+        for _ in range(n_cntx_wkrs):
+            await cntx_queue.put(None)
 
-        # Wait for the 
-
+        # Wait for the cntx_workers to be concluded before adding the sentinels to the proc_queue
+        cntx_workers = self._workers["cntx"]
         try:
-            logger.debug("Waiting for 
-
-            for i, res in enumerate(
+            logger.debug("Waiting for cntx_workers to conclude their tasks...")
+            cntx_res = await asyncio.gather(*cntx_workers, return_exceptions=True)
+            for i, res in enumerate(cntx_res):
                 if isinstance(res, Exception):
-                    logger.error(f"Error in 
-            logger.debug("...
+                    logger.error(f"Error in cntx_worker {i}: {res}")
+            logger.debug("...cntx_workers concluded their tasks")
         except Exception as e:
-            logger.error(f"Gathering 
+            logger.error(f"Gathering cntx_workers failed: {e}")
         finally:
-            await 
+            await cntx_queue.join()
 
         # ---------------------------
         # ORCHESTRATE PROC WORKERS