fraudcrawler 0.7.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic. Click here for more details.
- fraudcrawler/__init__.py +52 -0
- fraudcrawler/base/__init__.py +0 -0
- fraudcrawler/base/base.py +222 -0
- fraudcrawler/base/client.py +167 -0
- fraudcrawler/base/google-languages.json +630 -0
- fraudcrawler/base/google-locations.json +1 -0
- fraudcrawler/base/orchestrator.py +696 -0
- fraudcrawler/base/retry.py +54 -0
- fraudcrawler/launch_demo_pipeline.py +162 -0
- fraudcrawler/processing/__init__.py +0 -0
- fraudcrawler/processing/base.py +129 -0
- fraudcrawler/processing/openai.py +520 -0
- fraudcrawler/scraping/__init__.py +0 -0
- fraudcrawler/scraping/enrich.py +361 -0
- fraudcrawler/scraping/search.py +924 -0
- fraudcrawler/scraping/url.py +96 -0
- fraudcrawler/scraping/zyte.py +287 -0
- fraudcrawler/settings.py +104 -0
- fraudcrawler-0.7.21.dist-info/METADATA +175 -0
- fraudcrawler-0.7.21.dist-info/RECORD +23 -0
- fraudcrawler-0.7.21.dist-info/WHEEL +4 -0
- fraudcrawler-0.7.21.dist-info/entry_points.txt +3 -0
- fraudcrawler-0.7.21.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
from base64 import b64encode
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
import logging
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
from typing import Dict, Iterator, List
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
from tenacity import RetryCallState
|
|
9
|
+
|
|
10
|
+
from fraudcrawler.settings import ENRICHMENT_DEFAULT_LIMIT
|
|
11
|
+
from fraudcrawler.base.base import Location, Language
|
|
12
|
+
from fraudcrawler.base.retry import get_async_retry
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Keyword(BaseModel):
    """Model for keyword details (e.g. `Keyword(text="sildenafil", volume=100)`)."""

    # The keyword text as returned by DataForSEO.
    text: str
    # Search volume reported by DataForSEO for this keyword
    # (presumably a monthly figure — TODO confirm against the API docs).
    volume: int
|
|
24
|
+
|
|
25
|
+
class Enricher:
    """A client to interact with the DataForSEO API for enhancing searches (producing alternative search_terms)."""

    # Text encoding used when building the HTTP Basic-auth token in __init__.
    _auth_encoding = "ascii"
    # DataForSEO API host; endpoint paths below are appended to it.
    _base_endpoint = "https://api.dataforseo.com"
    # Live "keyword suggestions" endpoint (used by _get_suggested_keywords).
    _suggestions_endpoint = "/v3/dataforseo_labs/google/keyword_suggestions/live"
    # Live "related keywords" endpoint (used by _get_related_keywords).
    _keywords_endpoint = "/v3/dataforseo_labs/google/related_keywords/live"
|
|
32
|
+
|
|
33
|
+
def __init__(self, http_client: httpx.AsyncClient, user: str, pwd: str):
    """Initializes the DataForSeoApiClient with the given username and password.

    Args:
        http_client: An httpx.AsyncClient to use for the async requests.
        user: The username for DataForSEO API.
        pwd: The password for DataForSEO API.
    """
    self._http_client = http_client
    self._user = user
    self._pwd = pwd
    # DataForSEO uses HTTP Basic auth: base64("user:pwd").
    raw_credentials = f"{user}:{pwd}".encode(self._auth_encoding)
    token = b64encode(raw_credentials).decode(self._auth_encoding)
    self._headers = {
        "Authorization": f"Basic {token}",
        "Content-Encoding": "gzip",
    }
|
|
50
|
+
|
|
51
|
+
@staticmethod
def _log_before(search_term: str, retry_state: RetryCallState | None) -> None:
    """Context aware logging before the request is made."""
    # Guard: tenacity may invoke hooks without a retry state.
    if not retry_state:
        logger.debug(f"retry_state is {retry_state}, not logging before.")
        return
    logger.debug(
        f'DataForSEO suggested search with search="{search_term}" (attempt {retry_state.attempt_number}).'
    )
|
|
60
|
+
|
|
61
|
+
@staticmethod
def _log_before_sleep(search_term: str, retry_state: RetryCallState | None) -> None:
    """Context aware logging before sleeping after a failed request."""
    # Guard: both the state and its outcome must be present to report the failure.
    if not (retry_state and retry_state.outcome):
        logger.debug(f"retry_state is {retry_state}, not logging before_sleep.")
        return
    logger.warning(
        f'Attempt {retry_state.attempt_number} DataForSEO suggested search with search_term="{search_term}" '
        f"failed with error: {retry_state.outcome.exception()}. "
        f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
    )
|
|
72
|
+
|
|
73
|
+
@staticmethod
|
|
74
|
+
def _extract_items_from_data(data: dict) -> Iterator[dict]:
|
|
75
|
+
"""Extracts the items from the DataForSEO response.
|
|
76
|
+
|
|
77
|
+
Args:
|
|
78
|
+
data: The response data from DataForSEO.
|
|
79
|
+
"""
|
|
80
|
+
tasks = (
|
|
81
|
+
data.get("tasks") or []
|
|
82
|
+
) # in contrast to data.get("tasks", []) this handles the case where data["tasks"] is set to None
|
|
83
|
+
for task in tasks:
|
|
84
|
+
results = task.get("result") or []
|
|
85
|
+
for result in results:
|
|
86
|
+
items = result.get("items") or []
|
|
87
|
+
yield from items
|
|
88
|
+
|
|
89
|
+
@staticmethod
def _parse_suggested_keyword(item: dict) -> Keyword:
    """Parses a keyword from an item in the DataForSEO suggested keyword search response.

    Args:
        item: An item from the DataForSEO response.
    """
    # A KeyError on a malformed item is caught by the caller, which skips it.
    return Keyword(
        text=item["keyword"],
        volume=item["keyword_info"]["search_volume"],
    )
|
|
99
|
+
|
|
100
|
+
def _extract_suggested_keywords(self, data: dict) -> List[Keyword]:
    """Extracts the keywords from the DataForSEO response for suggested keywords.

    The DataForSEO results are of the form
    (c.f. https://docs.dataforseo.com/v3/dataforseo_labs/google/keyword_suggestions/live/?bash):
    {
        "tasks": [
            {
                "result": [
                    {
                        "items": [
                            {
                                "keyword": <suggested-keyword>,
                                "keyword_info": {
                                    "search_volume": <volume>
                                }
                            }
                        ]
                    }
                ]
            }
        ]
    }

    Args:
        data: The response data from DataForSEO.
    """
    parsed: List[Keyword] = []
    for item in self._extract_items_from_data(data=data):
        # Best effort: a malformed item is logged and skipped, never fatal.
        try:
            parsed.append(self._parse_suggested_keyword(item))
        except Exception as e:
            logger.warning(f"Ignoring keyword due to error: {e}.")
    return parsed
|
|
138
|
+
|
|
139
|
+
async def _get_suggested_keywords(
    self,
    search_term: str,
    language: Language,
    location: Location,
    limit: int = ENRICHMENT_DEFAULT_LIMIT,
) -> List[Keyword]:
    """Get keyword suggestions for a given search_term.

    Args:
        search_term: The search term to use for the query.
        language: The language to use for the search.
        location: The location to use for the search.
        limit: The upper limit of suggestions to get.
    """
    # The endpoint expects a list of task dicts; we submit a single task.
    payload = [
        {
            "keyword": search_term,
            "language_name": language.name,
            "location_name": location.name,
            "limit": limit,
            "include_serp_info": True,
            "include_seed_keyword": True,
        }
    ]
    url = f"{self._base_endpoint}{self._suggestions_endpoint}"
    logger.debug(
        f'DataForSEO search suggested keywords with url="{url}" and data="{payload}".'
    )

    # Retry with context-aware logging hooks:
    # - `before`: runs before each attempt (including retries)
    # - `before_sleep`: runs after a failed attempt, before the retry sleep
    retry = get_async_retry()
    retry.before = lambda state: self._log_before(
        search_term=search_term, retry_state=state
    )
    retry.before_sleep = lambda state: self._log_before_sleep(
        search_term=search_term, retry_state=state
    )
    async for attempt in retry:
        with attempt:
            response = await self._http_client.post(
                url=url, headers=self._headers, json=payload
            )
            response.raise_for_status()

    # Parse the keyword suggestions out of the JSON body.
    suggestions = self._extract_suggested_keywords(data=response.json())
    logger.debug(f"Found {len(suggestions)} suggestions from DataForSEO search.")
    return suggestions
|
|
194
|
+
|
|
195
|
+
@staticmethod
def _parse_related_keyword(item: dict) -> Keyword:
    """Parses a keyword from an item in the DataForSEO related keyword search response.

    Args:
        item: An item from the DataForSEO response.
    """
    # Related-keyword items nest everything one level deeper, under "keyword_data".
    keyword_data = item["keyword_data"]
    return Keyword(
        text=keyword_data["keyword"],
        volume=keyword_data["keyword_info"]["search_volume"],
    )
|
|
205
|
+
|
|
206
|
+
def _extract_related_keywords(self, data: dict) -> List[Keyword]:
    """Extracts the keywords from the DataForSEO response for related keywords.

    The DataForSEO results are of the form
    (c.f. https://docs.dataforseo.com/v3/dataforseo_labs/google/related_keywords/live/?bash):
    {
        "tasks": [
            {
                "result": [
                    {
                        "items": [
                            {
                                "keyword_data": {
                                    "keyword": <related-keyword>,
                                    "keyword_info": {
                                        "search_volume": <volume>
                                    }
                                }
                            }
                        ]
                    }
                ]
            }
        ]
    }

    Args:
        data: The response data from DataForSEO.
    """
    parsed: List[Keyword] = []
    for item in self._extract_items_from_data(data=data):
        # Best effort: a malformed item is logged and skipped, never fatal.
        try:
            parsed.append(self._parse_related_keyword(item))
        except Exception as e:
            logger.warning(f"Ignoring keyword due to error: {e}.")
    return parsed
|
|
246
|
+
|
|
247
|
+
async def _get_related_keywords(
    self,
    search_term: str,
    language: Language,
    location: Location,
    limit: int = ENRICHMENT_DEFAULT_LIMIT,
) -> List[Keyword]:
    """Get related keywords for a given search_term.

    Args:
        search_term: The search term to use for the query.
        language: The language to use for the search.
        location: The location to use for the search.
        limit: The upper limit of suggestions to get.
    """
    # The endpoint expects a list of task dicts; we submit a single task.
    payload = [
        {
            "keyword": search_term,
            "language_name": language.name,
            "location_name": location.name,
            "limit": limit,
        }
    ]
    url = f"{self._base_endpoint}{self._keywords_endpoint}"
    logger.debug(
        f'DataForSEO search related keywords with url="{url}" and data="{payload}".'
    )

    # Retry with context-aware logging hooks:
    # - `before`: runs before each attempt (including retries)
    # - `before_sleep`: runs after a failed attempt, before the retry sleep
    retry = get_async_retry()
    retry.before = lambda state: self._log_before(
        search_term=search_term, retry_state=state
    )
    retry.before_sleep = lambda state: self._log_before_sleep(
        search_term=search_term, retry_state=state
    )
    async for attempt in retry:
        with attempt:
            response = await self._http_client.post(
                url=url, headers=self._headers, json=payload
            )
            response.raise_for_status()

    # Parse the related keywords out of the JSON body.
    related = self._extract_related_keywords(data=response.json())
    logger.debug(f"Found {len(related)} related keywords from DataForSEO search.")
    return related
|
|
300
|
+
|
|
301
|
+
async def enrich(
    self,
    search_term: str,
    language: Language,
    location: Location,
    n_terms: int,
) -> List[str]:
    """Applies the enrichment to a search_term.

    Args:
        search_term: The search term to use for the query.
        language: The language to use for the search.
        location: The location to use for the search.
        n_terms: The number of additional terms
    """
    logger.info(
        f'Applying enrichment for search_term="{search_term}" and n_terms="{n_terms}".'
    )
    # Fetch suggested keywords (best effort: a failure is logged, not raised).
    try:
        suggested = await self._get_suggested_keywords(
            search_term=search_term,
            location=location,
            language=language,
            limit=n_terms,
        )
    except Exception:
        logger.error(
            f"Fetching suggested keywords for search_term='{search_term}' failed",
            exc_info=True,
        )
        suggested = []

    # Fetch related keywords (same best-effort handling).
    try:
        related = await self._get_related_keywords(
            search_term=search_term,
            location=location,
            language=language,
            limit=n_terms,
        )
    except Exception:
        logger.error(
            f"Fetching related keywords for search_term='{search_term}' failed",
            exc_info=True,
        )
        related = []

    # Drop the original term and keep the highest volume observed per text
    # (the same keyword can appear in both result sets).
    volume_by_text: Dict[str, int] = defaultdict(int)
    for kw in suggested + related:
        if kw.text != search_term:
            volume_by_text[kw.text] = max(kw.volume, volume_by_text[kw.text])
    unique = [Keyword(text=t, volume=v) for t, v in volume_by_text.items()]
    logger.debug(f"Found {len(unique)} additional unique keywords.")

    # Highest-volume keywords first; return at most n_terms of their texts.
    ranked = sorted(unique, key=lambda kw: kw.volume, reverse=True)
    terms = [kw.text for kw in ranked[:n_terms]]
    logger.info(f"Produced {len(terms)} additional search_terms.")
    return terms
|