fraudcrawler 0.4.7__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fraudcrawler/__init__.py +7 -5
- fraudcrawler/base/base.py +64 -32
- fraudcrawler/base/client.py +27 -11
- fraudcrawler/base/orchestrator.py +103 -25
- fraudcrawler/base/retry.py +5 -2
- fraudcrawler/launch_demo_pipeline.py +9 -9
- fraudcrawler/processing/processor.py +9 -5
- fraudcrawler/scraping/enrich.py +38 -21
- fraudcrawler/scraping/search.py +664 -0
- fraudcrawler/scraping/zyte.py +37 -15
- fraudcrawler/settings.py +13 -2
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.0.dist-info}/METADATA +6 -2
- fraudcrawler-0.5.0.dist-info/RECORD +22 -0
- fraudcrawler/scraping/serp.py +0 -515
- fraudcrawler-0.4.7.dist-info/RECORD +0 -22
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.0.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.0.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.0.dist-info}/entry_points.txt +0 -0
fraudcrawler/__init__.py
CHANGED
@@ -1,7 +1,7 @@
-from fraudcrawler.scraping.
+from fraudcrawler.scraping.search import Search, SearchEngineName
 from fraudcrawler.scraping.enrich import Enricher
 from fraudcrawler.scraping.url import URLCollector
-from fraudcrawler.scraping.zyte import
+from fraudcrawler.scraping.zyte import ZyteAPI
 from fraudcrawler.processing.processor import Processor
 from fraudcrawler.base.orchestrator import Orchestrator
 from fraudcrawler.base.client import FraudCrawlerClient
@@ -13,14 +13,15 @@ from fraudcrawler.base.base import (
     Location,
     Prompt,
     ProductItem,
+    HttpxAsyncClient,
 )

 __all__ = [
-    "
-    "
+    "Search",
+    "SearchEngineName",
     "Enricher",
     "URLCollector",
-    "
+    "ZyteAPI",
     "Processor",
     "Orchestrator",
     "ProductItem",
@@ -31,4 +32,5 @@ __all__ = [
     "Deepness",
     "Enrichment",
     "Prompt",
+    "HttpxAsyncClient",
 ]
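The net effect of these hunks is a renamed scraping surface: the truncated old imports give way to Search, SearchEngineName, and ZyteAPI, and HttpxAsyncClient is newly re-exported. A minimal sketch of importing the 0.5.0 public API, using only names visible in this diff (the removed 0.4.7 names are cut off in this view and are not reconstructed here):

# Sketch: the 0.5.0 public surface as re-exported by fraudcrawler/__init__.py.
from fraudcrawler import (
    Search,
    SearchEngineName,
    Enricher,
    URLCollector,
    ZyteAPI,
    Processor,
    Orchestrator,
    ProductItem,
    HttpxAsyncClient,
)

# SearchEngineName behaves like an enum in 0.5.0 (FraudCrawlerClient.execute coerces
# strings via SearchEngineName(se)), so listing it shows the available engines.
print(list(SearchEngineName))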
fraudcrawler/base/base.py
CHANGED
@@ -7,15 +7,21 @@ from pydantic import (
     model_validator,
 )
 from pydantic_settings import BaseSettings
+from urllib.parse import urlparse
 import re
-from typing import
+from typing import Any, Dict, List

-import
+import httpx

 from fraudcrawler.settings import (
     GOOGLE_LANGUAGES_FILENAME,
     GOOGLE_LOCATIONS_FILENAME,
 )
+from fraudcrawler.settings import (
+    DEFAULT_HTTPX_TIMEOUT,
+    DEFAULT_HTTPX_LIMITS,
+    DEFAULT_HTTPX_REDIRECTS,
+)

 logger = logging.getLogger(__name__)

@@ -130,7 +136,8 @@ class ProductItem(BaseModel):
     search_term: str
     search_term_type: str
     url: str
-
+    url_resolved: str
+    search_engine_name: str
     domain: str

     # Zyte parameters
@@ -180,32 +187,57 @@ class Prompt(BaseModel):
         return val


-class AsyncClient:
-    """
-    … (27 further removed lines; their content is not rendered in this view)
+class HttpxAsyncClient(httpx.AsyncClient):
+    """Httpx async client that can be used to retain the default settings."""
+
+    def __init__(
+        self,
+        timeout: httpx.Timeout | Dict[str, Any] = DEFAULT_HTTPX_TIMEOUT,
+        limits: httpx.Limits | Dict[str, Any] = DEFAULT_HTTPX_LIMITS,
+        follow_redirects: bool = DEFAULT_HTTPX_REDIRECTS,
+        **kwargs: Any,
+    ) -> None:
+        if isinstance(timeout, dict):
+            timeout = httpx.Timeout(**timeout)
+        if isinstance(limits, dict):
+            limits = httpx.Limits(**limits)
+
+        kwargs.setdefault("timeout", timeout)
+        kwargs.setdefault("limits", limits)
+        kwargs.setdefault("follow_redirects", follow_redirects)
+        super().__init__(**kwargs)
+
+
+class DomainUtils:
+    """Utility class for domain extraction and normalization.
+
+    Handles domain parsing from URLs, removes common prefixes (www, http/https),
+    and provides consistent domain formatting for search and scraping operations.
+    """
+
+    _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
+
+    def _get_domain(self, url: str) -> str:
+        """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
+
+        Args:
+            url: The URL to be processed.
+        """
+        # Add scheme; urlparse requires it
+        if not url.startswith(("http://", "https://")):
+            url = "http://" + url
+
+        # Get the hostname
+        hostname = urlparse(url).hostname
+        if hostname is None and (match := re.search(self._hostname_pattern, url)):
+            hostname = match.group(1)
+        if hostname is None:
+            logger.warning(
+                f'Failed to extract domain from url="{url}"; full url is returned'
+            )
+            return url.lower()
+
+        # Remove www. prefix
+        if hostname and hostname.startswith("www."):
+            hostname = hostname[4:]
+        return hostname.lower()
fraudcrawler/base/client.py
CHANGED
@@ -4,7 +4,7 @@ from datetime import datetime
 import logging
 from pathlib import Path
 from pydantic import BaseModel
-from typing import List
+from typing import List, Self

 import pandas as pd

@@ -19,7 +19,7 @@ from fraudcrawler.base.base import (
     ProductItem,
 )
 from fraudcrawler.base.orchestrator import Orchestrator
-from fraudcrawler.scraping.
+from fraudcrawler.scraping.search import SearchEngineName

 logger = logging.getLogger(__name__)

@@ -53,6 +53,13 @@ class FraudCrawlerClient(Orchestrator):
         self._results_dir.mkdir(parents=True)
         self._results: List[Results] = []

+    async def __aenter__(self) -> Self:
+        await super().__aenter__()  # let base set itself up
+        return self  # so `async with FraudCrawlerClient()` gives you this instance
+
+    async def __aexit__(self, *args, **kwargs) -> None:
+        await super().__aexit__(*args, **kwargs)
+
     async def _collect_results(
         self, queue_in: asyncio.Queue[ProductItem | None]
     ) -> None:
@@ -93,7 +100,8 @@ class FraudCrawlerClient(Orchestrator):
         prompts: List[Prompt],
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
-        search_engines: List[
+        search_engines: List[SearchEngineName | str] | None = None,
+        previously_collected_urls: List[str] | None = None,
     ) -> None:
         """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.

@@ -103,8 +111,10 @@ class FraudCrawlerClient(Orchestrator):
             location: The location to use for the query.
             deepness: The search depth and enrichment details.
             prompts: The list of prompts to use for classification.
-            marketplaces: The marketplaces to include in the search.
-            excluded_urls: The URLs to exclude from the search.
+            marketplaces: The marketplaces to include in the search (optional).
+            excluded_urls: The URLs to exclude from the search (optional).
+            search_engines: The list of search engines to use for the search (optional).
+            previously_collected_urls: The urls that have been collected previously and are ignored (optional).
         """
         # Handle results files
         timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
@@ -116,24 +126,30 @@ class FraudCrawlerClient(Orchestrator):
         )
         self._results.append(Results(search_term=search_term, filename=filename))

-        # Normalize inputs
-
+        # Normalize inputs - convert strings to SearchEngineName enum values
+        nrm_search_engines = list(SearchEngineName)
         if search_engines:
-
-
+            nrm_search_engines = [
+                SearchEngineName(se) if isinstance(se, str) else se
+                for se in search_engines
             ]

         # Run the pipeline by calling the orchestrator's run method
+        async def _run(*args, **kwargs):
+            async with self:
+                return await super(FraudCrawlerClient, self).run(*args, **kwargs)
+
         asyncio.run(
-
+            _run(
                 search_term=search_term,
-                search_engines=
+                search_engines=nrm_search_engines,
                 language=language,
                 location=location,
                 deepness=deepness,
                 prompts=prompts,
                 marketplaces=marketplaces,
                 excluded_urls=excluded_urls,
+                previously_collected_urls=previously_collected_urls,
             )
         )

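The client now accepts search_engines as enum members or plain strings and normalizes them before handing off to the orchestrator, and execute() wraps the orchestrator's run() in `async with self` so the shared httpx client is opened and closed around the pipeline. A standalone sketch of the normalization step; the engine values below ("google", "bing") are illustrative stand-ins, the real members of SearchEngineName are defined in fraudcrawler/scraping/search.py, which is not shown here:

# Sketch of the search_engines normalization done in FraudCrawlerClient.execute.
from enum import Enum
from typing import List


class SearchEngineName(str, Enum):  # stand-in for the real enum
    GOOGLE = "google"
    BING = "bing"


def normalize(search_engines: List[SearchEngineName | str] | None) -> List[SearchEngineName]:
    # Default to every engine, mirroring `nrm_search_engines = list(SearchEngineName)`.
    if not search_engines:
        return list(SearchEngineName)
    # Strings are coerced via the enum's value lookup; enum members pass through.
    return [SearchEngineName(se) if isinstance(se, str) else se for se in search_engines]


print(normalize(None))                                   # all engines
print(normalize(["google", SearchEngineName.BING]))      # mixed input is fine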
fraudcrawler/base/orchestrator.py
CHANGED

@@ -1,9 +1,10 @@
 from abc import ABC, abstractmethod
 import asyncio
 import logging
-from typing import Dict, List,
+from typing import cast, Dict, List, Self

 from bs4 import BeautifulSoup
+import httpx

 from fraudcrawler.settings import (
     PROCESSOR_DEFAULT_MODEL,
@@ -20,13 +21,14 @@ from fraudcrawler.base.base import (
     Location,
     Prompt,
     ProductItem,
+    HttpxAsyncClient,
 )
 from fraudcrawler import (
-
-
+    Search,
+    SearchEngineName,
     Enricher,
     URLCollector,
-
+    ZyteAPI,
     Processor,
 )

@@ -60,9 +62,18 @@ class Orchestrator(ABC):
         n_serp_wkrs: int = DEFAULT_N_SERP_WKRS,
         n_zyte_wkrs: int = DEFAULT_N_ZYTE_WKRS,
         n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
+        # Configure a custom httpx client.
+        # We provide a `HttpxAsyncClient` class that you can pass
+        # to retain the default values we use for `limits`, `timeout` & `follow_redirects`.
+        http_client: httpx.AsyncClient | None = None,
     ):
         """Initializes the orchestrator with the given settings.

+        NOTE:
+            The class:`Orchestrator` must be used as context manager as follows:
+                async with Orchestrator(...) as orchestrator:
+                    await orchestrator.run()
+
         Args:
             serpapi_key: The API key for SERP API.
             dataforseo_user: The user for DataForSEO.
@@ -73,16 +84,16 @@ class Orchestrator(ABC):
             n_serp_wkrs: Number of async workers for serp (optional).
             n_zyte_wkrs: Number of async workers for zyte (optional).
             n_proc_wkrs: Number of async workers for the processor (optional).
+            http_client: An httpx.AsyncClient to use for the async requests (optional).
         """
-
-
-        self.
-        self.
-        self.
-        self.
-
-
-        )
+
+        # Store the variables for setting up the clients
+        self._serpapi_key = serpapi_key
+        self._dataforseo_user = dataforseo_user
+        self._dataforseo_pwd = dataforseo_pwd
+        self._zyteapi_key = zyteapi_key
+        self._openaiapi_key = openaiapi_key
+        self._openai_model = openai_model

         # Setup the async framework
         self._n_serp_wkrs = n_serp_wkrs
@@ -91,12 +102,50 @@ class Orchestrator(ABC):
         self._queues: Dict[str, asyncio.Queue] | None = None
         self._workers: Dict[str, List[asyncio.Task] | asyncio.Task] | None = None

+        # Setup the httpx client
+        self._http_client = http_client
+        self._owns_http_client = http_client is None
+
+    async def __aenter__(self) -> Self:
+        """Creates and starts an httpx.AsyncClient if not provided."""
+        if self._http_client is None:
+            logger.debug("Creating a new httpx.AsyncClient owned by the orchestrator")
+            self._http_client = HttpxAsyncClient()
+            self._owns_http_client = True
+
+        # Setup the clients
+        self._search = Search(
+            http_client=self._http_client, serpapi_key=self._serpapi_key
+        )
+        self._enricher = Enricher(
+            http_client=self._http_client,
+            user=self._dataforseo_user,
+            pwd=self._dataforseo_pwd,
+        )
+        self._url_collector = URLCollector()
+        self._zyteapi = ZyteAPI(
+            http_client=self._http_client, api_key=self._zyteapi_key
+        )
+        self._processor = Processor(
+            http_client=self._http_client,
+            api_key=self._openaiapi_key,
+            model=self._openai_model,
+        )
+        return self
+
+    async def __aexit__(self, *args, **kwargs) -> None:
+        """Closes the httpx.AsyncClient if it was created by this orchestrator."""
+        if self._owns_http_client and self._http_client is not None:
+            logger.debug("Closing the httpx.AsyncClient owned by the orchestrator")
+            await self._http_client.aclose()
+            self._http_client = None
+
     async def _serp_execute(
         self,
         queue_in: asyncio.Queue[dict | None],
         queue_out: asyncio.Queue[ProductItem | None],
     ) -> None:
-        """Collects the
+        """Collects the search setups from the queue_in, executes the search, filters the results and puts them into queue_out.

         Args:
             queue_in: The input queue containing the search parameters.
@@ -110,23 +159,30 @@ class Orchestrator(ABC):

             try:
                 search_term_type = item.pop("search_term_type")
-
+                # The search_engines are already SearchEngineName enum values
+                search_engines = item.pop("search_engines")
+
+                results = await self._search.apply(
+                    **item, search_engines=search_engines
+                )
+
                 logger.debug(
-                    f"
+                    f"Search for {item['search_term']} returned {len(results)} results"
                 )
                 for res in results:
                     product = ProductItem(
                         search_term=item["search_term"],
                         search_term_type=search_term_type,
                         url=res.url,
-
+                        url_resolved=res.url,  # Set initial value, will be updated by Zyte
+                        search_engine_name=res.search_engine_name,
                         domain=res.domain,
                         filtered=res.filtered,
                         filtered_at_stage=res.filtered_at_stage,
                     )
                     await queue_out.put(product)
             except Exception as e:
-                logger.error(f"Error executing
+                logger.error(f"Error executing search: {e}")
             queue_in.task_done()

     async def _collect_url(
@@ -191,10 +247,22 @@ class Orchestrator(ABC):
             if not product.filtered:
                 try:
                     # Fetch the product details from Zyte API
-                    details = await self._zyteapi.
+                    details = await self._zyteapi.details(url=product.url)
+                    url_resolved = self._zyteapi.extract_url_resolved(details=details)
+                    if url_resolved:
+                        product.url_resolved = url_resolved
                     product.product_name = self._zyteapi.extract_product_name(
                         details=details
                     )
+
+                    # If the resolved URL is different from the original URL, we also need to update the domain as
+                    # otherwise the unresolved domain will be shown, for example for unresolved domain toppreis.ch but resolved digitec.ch
+                    if url_resolved and url_resolved != product.url:
+                        logger.debug(
+                            f"URL resolved for {product.url} is {url_resolved}"
+                        )
+                        product.domain = self._search._get_domain(url_resolved)
+
                     product.product_price = self._zyteapi.extract_product_price(
                         details=details
                     )
@@ -362,7 +430,7 @@ class Orchestrator(ABC):
         queue: asyncio.Queue[dict | None],
         search_term: str,
         search_term_type: str,
-        search_engines: List[
+        search_engines: List[SearchEngineName],
         language: Language,
         location: Location,
         num_results: int,
@@ -387,7 +455,7 @@ class Orchestrator(ABC):
         self,
         queue: asyncio.Queue[dict | None],
         search_term: str,
-        search_engines: List[
+        search_engines: List[SearchEngineName],
         language: Language,
         location: Location,
         deepness: Deepness,
@@ -417,7 +485,7 @@ class Orchestrator(ABC):
         if enrichment:
             # Call DataForSEO to get additional terms
             n_terms = enrichment.additional_terms
-            terms = await self._enricher.
+            terms = await self._enricher.enrich(
                 search_term=search_term,
                 language=language,
                 location=location,
@@ -437,7 +505,7 @@ class Orchestrator(ABC):
     async def run(
         self,
         search_term: str,
-        search_engines: List[
+        search_engines: List[SearchEngineName],
         language: Language,
         location: Location,
         deepness: Deepness,
@@ -450,7 +518,7 @@ class Orchestrator(ABC):

         Args:
             search_term: The search term for the query.
-            search_engines: The list of search engines to use for the
+            search_engines: The list of search engines to use for the search query.
             language: The language to use for the query.
             location: The location to use for the query.
             deepness: The search depth and enrichment details.
@@ -459,10 +527,17 @@ class Orchestrator(ABC):
             excluded_urls: The URLs to exclude from the search.
             previously_collected_urls: The urls that have been collected previously and are ignored.
         """
-
         # ---------------------------
         # INITIAL SETUP
         # ---------------------------
+        # Ensure we have at least one search engine
+        if not search_engines:
+            logger.warning(
+                "No search engines specified, using all available search engines"
+            )
+            search_engines = list(SearchEngineName)
+
+        # Handle previously collected URLs
         if previously_collected_urls:
             self._url_collector.collected_previously = set(previously_collected_urls)

@@ -614,4 +689,7 @@ class Orchestrator(ABC):
         finally:
             await res_queue.join()

+        # ---------------------------
+        # CLOSING PIPELINE
+        # ---------------------------
         logger.info("Pipeline concluded; async framework is closed")
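The orchestrator is now an async context manager: __aenter__ builds (or adopts) the shared httpx.AsyncClient and wires it into Search, Enricher, ZyteAPI and Processor, while __aexit__ closes the client only when the orchestrator created it itself. A sketch of that ownership pattern in isolation; the ClientOwner class below is invented for illustration and is not part of the package:

import asyncio

import httpx


class ClientOwner:
    """Minimal stand-in for the orchestrator's httpx-client lifecycle."""

    def __init__(self, http_client: httpx.AsyncClient | None = None) -> None:
        self._http_client = http_client
        self._owns_http_client = http_client is None  # only close what we created

    async def __aenter__(self) -> "ClientOwner":
        if self._http_client is None:
            self._http_client = httpx.AsyncClient()
            self._owns_http_client = True
        return self

    async def __aexit__(self, *exc) -> None:
        if self._owns_http_client and self._http_client is not None:
            await self._http_client.aclose()
            self._http_client = None


async def main() -> None:
    # Caller-supplied client: left open after the block, reusable elsewhere.
    async with httpx.AsyncClient() as shared:
        async with ClientOwner(http_client=shared):
            pass
        assert not shared.is_closed
    # No client supplied: the owner creates and closes its own.
    async with ClientOwner():
        pass


asyncio.run(main())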
fraudcrawler/base/retry.py
CHANGED
@@ -1,4 +1,4 @@
-from
+from httpx import HTTPStatusError
 from tenacity import (
     AsyncRetrying,
     retry_if_exception,
@@ -17,7 +17,10 @@ from fraudcrawler.settings import (


 def _is_retryable_exception(err: BaseException) -> bool:
-    if
+    if (
+        isinstance(err, HTTPStatusError)
+        and err.response.status_code in RETRY_SKIP_IF_CODE
+    ):
         return False
     return True

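retry.py now keys its predicate on httpx's HTTPStatusError and skips retries for status codes listed in RETRY_SKIP_IF_CODE. A hedged sketch of how such a predicate plugs into tenacity's AsyncRetrying; the skip codes below are placeholders, not the package's actual setting:

import httpx
from tenacity import AsyncRetrying, retry_if_exception, stop_after_attempt, wait_fixed

ASSUMED_SKIP_CODES = {401, 403, 404}  # placeholder for RETRY_SKIP_IF_CODE


def _is_retryable_exception(err: BaseException) -> bool:
    # Don't retry responses whose status code marks a permanent failure.
    if isinstance(err, httpx.HTTPStatusError) and err.response.status_code in ASSUMED_SKIP_CODES:
        return False
    return True


async def fetch(url: str) -> httpx.Response:
    async with httpx.AsyncClient() as client:
        async for attempt in AsyncRetrying(
            retry=retry_if_exception(_is_retryable_exception),
            stop=stop_after_attempt(3),
            wait=wait_fixed(1),
            reraise=True,
        ):
            with attempt:
                response = await client.get(url)
                response.raise_for_status()  # raises HTTPStatusError on 4xx/5xx
                return response
    raise RuntimeError("unreachable")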
fraudcrawler/launch_demo_pipeline.py
CHANGED

@@ -54,17 +54,17 @@ def search(search_term: str):

     # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)

-    #
-
+    # Optional: Add MARKETPLACES and EXCLUDED_URLS
+    from fraudcrawler import Host

     # marketplaces = [
     #     Host(name="International", domains="zavamed.com,apomeds.com"),
-    #     Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
-    # ]
-    # excluded_urls = [
-    #     Host(name="Digitec", domains="digitec.ch"),
-    #     Host(name="Brack", domains="brack.ch"),
+    #     # Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
     # ]
+    excluded_urls = [
+        Host(name="Digitec", domains="digitec.ch"),
+        Host(name="Brack", domains="brack.ch"),
+    ]

     # Execute the pipeline
     client.execute(
@@ -74,7 +74,7 @@ def search(search_term: str):
         deepness=deepness,
         prompts=prompts,
         # marketplaces=marketplaces,
-
+        excluded_urls=excluded_urls,
     )

     # Show results
@@ -97,4 +97,4 @@ def search(search_term: str):


 if __name__ == "__main__":
-    search(search_term
+    search(search_term='Liebherr "TP1410"')
fraudcrawler/processing/processor.py
CHANGED

@@ -1,5 +1,6 @@
 import logging

+import httpx
 from openai import AsyncOpenAI
 from tenacity import RetryCallState

@@ -21,6 +22,7 @@ class Processor:

     def __init__(
         self,
+        http_client: httpx.AsyncClient,
         api_key: str,
         model: str,
         default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
@@ -29,12 +31,13 @@ class Processor:
         """Initializes the Processor.

         Args:
+            http_client: An httpx.AsyncClient to use for the async requests.
             api_key: The OpenAI API key.
             model: The OpenAI model to use.
             default_if_missing: The default classification to return if error occurs.
             empty_token_count: The default value to return as tokensif the classification is empty.
         """
-        self._client = AsyncOpenAI(api_key=api_key)
+        self._client = AsyncOpenAI(http_client=http_client, api_key=api_key)
         self._model = model
         self._error_response = ClassificationResult(
             result=default_if_missing,
@@ -59,7 +62,7 @@ class Processor:
                    )
                )
            else:
-                logger.
+                logger.warning(
                    f'Field "{field}" is missing in ProductItem with url="{product.url}"'
                )
        return "\n\n".join(details)
@@ -101,9 +104,10 @@ class Processor:
             ],
             **kwargs,
         )
-        content
-
-
+        if not response or not (content := response.choices[0].message.content):
+            raise ValueError(
+                f'Error calling OpenAI API or empty response="{response}".'
+            )

         # Convert the content to an integer
         try: