fraudcrawler 0.4.7__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic. Click here for more details.
- fraudcrawler/__init__.py +7 -5
- fraudcrawler/base/base.py +64 -32
- fraudcrawler/base/client.py +27 -11
- fraudcrawler/base/orchestrator.py +103 -25
- fraudcrawler/base/retry.py +5 -2
- fraudcrawler/launch_demo_pipeline.py +9 -9
- fraudcrawler/processing/processor.py +9 -5
- fraudcrawler/scraping/enrich.py +38 -21
- fraudcrawler/scraping/search.py +664 -0
- fraudcrawler/scraping/zyte.py +37 -15
- fraudcrawler/settings.py +13 -2
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.1.dist-info}/METADATA +6 -2
- fraudcrawler-0.5.1.dist-info/RECORD +22 -0
- fraudcrawler/scraping/serp.py +0 -515
- fraudcrawler-0.4.7.dist-info/RECORD +0 -22
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.1.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.1.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.1.dist-info}/entry_points.txt +0 -0
fraudcrawler/scraping/enrich.py
CHANGED
|
@@ -2,12 +2,13 @@ from base64 import b64encode
|
|
|
2
2
|
from collections import defaultdict
|
|
3
3
|
import logging
|
|
4
4
|
from pydantic import BaseModel
|
|
5
|
-
from typing import Dict,
|
|
5
|
+
from typing import Dict, Iterator, List
|
|
6
6
|
|
|
7
|
+
import httpx
|
|
7
8
|
from tenacity import RetryCallState
|
|
8
9
|
|
|
9
10
|
from fraudcrawler.settings import ENRICHMENT_DEFAULT_LIMIT
|
|
10
|
-
from fraudcrawler.base.base import Location, Language
|
|
11
|
+
from fraudcrawler.base.base import Location, Language
|
|
11
12
|
from fraudcrawler.base.retry import get_async_retry
|
|
12
13
|
|
|
13
14
|
|
|
@@ -21,7 +22,7 @@ class Keyword(BaseModel):
|
|
|
21
22
|
volume: int
|
|
22
23
|
|
|
23
24
|
|
|
24
|
-
class Enricher
|
|
25
|
+
class Enricher:
|
|
25
26
|
"""A client to interact with the DataForSEO API for enhancing searches (producing alternative search_terms)."""
|
|
26
27
|
|
|
27
28
|
_auth_encoding = "ascii"
|
|
@@ -29,13 +30,15 @@ class Enricher(AsyncClient):
|
|
|
29
30
|
_suggestions_endpoint = "/v3/dataforseo_labs/google/keyword_suggestions/live"
|
|
30
31
|
_keywords_endpoint = "/v3/dataforseo_labs/google/related_keywords/live"
|
|
31
32
|
|
|
32
|
-
def __init__(self, user: str, pwd: str):
|
|
33
|
+
def __init__(self, http_client: httpx.AsyncClient, user: str, pwd: str):
|
|
33
34
|
"""Initializes the DataForSeoApiClient with the given username and password.
|
|
34
35
|
|
|
35
36
|
Args:
|
|
37
|
+
http_client: An httpx.AsyncClient to use for the async requests.
|
|
36
38
|
user: The username for DataForSEO API.
|
|
37
39
|
pwd: The password for DataForSEO API.
|
|
38
40
|
"""
|
|
41
|
+
self._http_client = http_client
|
|
39
42
|
self._user = user
|
|
40
43
|
self._pwd = pwd
|
|
41
44
|
auth = f"{user}:{pwd}"
|
|
@@ -161,7 +164,9 @@ class Enricher(AsyncClient):
|
|
|
161
164
|
}
|
|
162
165
|
]
|
|
163
166
|
url = f"{self._base_endpoint}{self._suggestions_endpoint}"
|
|
164
|
-
logger.debug(
|
|
167
|
+
logger.debug(
|
|
168
|
+
f'DataForSEO search suggested keywords with url="{url}" and data="{data}".'
|
|
169
|
+
)
|
|
165
170
|
|
|
166
171
|
# Perform the request and retry if necessary. There is some context aware logging
|
|
167
172
|
# - `before`: before the request is made (or before retrying)
|
|
@@ -175,10 +180,14 @@ class Enricher(AsyncClient):
|
|
|
175
180
|
)
|
|
176
181
|
async for attempt in retry:
|
|
177
182
|
with attempt:
|
|
178
|
-
|
|
183
|
+
response = await self._http_client.post(
|
|
184
|
+
url=url, headers=self._headers, json=data
|
|
185
|
+
)
|
|
186
|
+
response.raise_for_status()
|
|
179
187
|
|
|
180
188
|
# Extract the keywords from the response
|
|
181
|
-
|
|
189
|
+
data_suggested_keywords = response.json()
|
|
190
|
+
keywords = self._extract_suggested_keywords(data=data_suggested_keywords)
|
|
182
191
|
|
|
183
192
|
logger.debug(f"Found {len(keywords)} suggestions from DataForSEO search.")
|
|
184
193
|
return keywords
|
|
@@ -260,28 +269,36 @@ class Enricher(AsyncClient):
|
|
|
260
269
|
"limit": limit,
|
|
261
270
|
}
|
|
262
271
|
]
|
|
272
|
+
url = f"{self._base_endpoint}{self._keywords_endpoint}"
|
|
263
273
|
logger.debug(
|
|
264
|
-
f'DataForSEO search
|
|
274
|
+
f'DataForSEO search related keywords with url="{url}" and data="{data}".'
|
|
265
275
|
)
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
276
|
+
|
|
277
|
+
# Perform the request and retry if necessary. There is some context aware logging
|
|
278
|
+
# - `before`: before the request is made (or before retrying)
|
|
279
|
+
# - `before_sleep`: if the request fails before sleeping
|
|
280
|
+
retry = get_async_retry()
|
|
281
|
+
retry.before = lambda retry_state: self._log_before(
|
|
282
|
+
search_term=search_term, retry_state=retry_state
|
|
283
|
+
)
|
|
284
|
+
retry.before_sleep = lambda retry_state: self._log_before_sleep(
|
|
285
|
+
search_term=search_term, retry_state=retry_state
|
|
286
|
+
)
|
|
287
|
+
async for attempt in retry:
|
|
288
|
+
with attempt:
|
|
289
|
+
response = await self._http_client.post(
|
|
290
|
+
url=url, headers=self._headers, json=data
|
|
291
|
+
)
|
|
292
|
+
response.raise_for_status()
|
|
272
293
|
|
|
273
294
|
# Extract the keywords from the response
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
except Exception as e:
|
|
277
|
-
logger.error(
|
|
278
|
-
f"Failed to extract related keywords from DataForSEO response with error: {e}."
|
|
279
|
-
)
|
|
295
|
+
data_related_keywords = response.json()
|
|
296
|
+
keywords = self._extract_related_keywords(data=data_related_keywords)
|
|
280
297
|
|
|
281
298
|
logger.debug(f"Found {len(keywords)} related keywords from DataForSEO search.")
|
|
282
299
|
return keywords
|
|
283
300
|
|
|
284
|
-
async def
|
|
301
|
+
async def enrich(
|
|
285
302
|
self,
|
|
286
303
|
search_term: str,
|
|
287
304
|
language: Language,
|