fraudcrawler 0.7.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fraudcrawler might be problematic. Click here for more details.

@@ -0,0 +1,361 @@
1
+ from base64 import b64encode
2
+ from collections import defaultdict
3
+ import logging
4
+ from pydantic import BaseModel
5
+ from typing import Dict, Iterator, List
6
+
7
+ import httpx
8
+ from tenacity import RetryCallState
9
+
10
+ from fraudcrawler.settings import ENRICHMENT_DEFAULT_LIMIT
11
+ from fraudcrawler.base.base import Location, Language
12
+ from fraudcrawler.base.retry import get_async_retry
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class Keyword(BaseModel):
    """A single keyword returned by a DataForSEO search.

    Pairs the keyword text with its reported search volume,
    e.g. ``Keyword(text="sildenafil", volume=100)``.
    """

    # The keyword string itself.
    text: str
    # Search volume reported by DataForSEO for this keyword.
    volume: int
23
+
24
+
25
class Enricher:
    """A client to interact with the DataForSEO API for enhancing searches (producing alternative search_terms)."""

    _auth_encoding = "ascii"
    _base_endpoint = "https://api.dataforseo.com"
    _suggestions_endpoint = "/v3/dataforseo_labs/google/keyword_suggestions/live"
    _keywords_endpoint = "/v3/dataforseo_labs/google/related_keywords/live"

    def __init__(self, http_client: httpx.AsyncClient, user: str, pwd: str):
        """Initializes the DataForSeoApiClient with the given username and password.

        Args:
            http_client: An httpx.AsyncClient to use for the async requests.
            user: The username for DataForSEO API.
            pwd: The password for DataForSEO API.
        """
        self._http_client = http_client
        self._user = user
        self._pwd = pwd
        # DataForSEO uses HTTP Basic auth: "user:pwd", base64-encoded.
        auth = f"{user}:{pwd}"
        auth = b64encode(auth.encode(self._auth_encoding)).decode(self._auth_encoding)
        self._headers = {
            "Authorization": f"Basic {auth}",
            # NOTE(review): this mirrors DataForSEO's sample requests, but the
            # request body sent below is NOT gzip-compressed — confirm intent.
            "Content-Encoding": "gzip",
        }

    @staticmethod
    def _log_before(search_term: str, retry_state: RetryCallState | None) -> None:
        """Context aware logging before the request is made."""
        if retry_state:
            logger.debug(
                f'DataForSEO suggested search with search="{search_term}" (attempt {retry_state.attempt_number}).'
            )
        else:
            logger.debug(f"retry_state is {retry_state}, not logging before.")

    @staticmethod
    def _log_before_sleep(search_term: str, retry_state: RetryCallState | None) -> None:
        """Context aware logging before sleeping after a failed request."""
        if retry_state and retry_state.outcome:
            logger.warning(
                f'Attempt {retry_state.attempt_number} DataForSEO suggested search with search_term="{search_term}" '
                f"failed with error: {retry_state.outcome.exception()}. "
                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
            )
        else:
            logger.debug(f"retry_state is {retry_state}, not logging before_sleep.")

    @staticmethod
    def _extract_items_from_data(data: dict) -> Iterator[dict]:
        """Extracts the items from the DataForSEO response.

        Yields every entry of tasks[*].result[*].items[*].

        Args:
            data: The response data from DataForSEO.
        """
        # `or []` (in contrast to data.get("tasks", [])) also handles the case
        # where the key is present but explicitly set to None.
        tasks = data.get("tasks") or []
        for task in tasks:
            results = task.get("result") or []
            for result in results:
                items = result.get("items") or []
                yield from items

    @staticmethod
    def _parse_suggested_keyword(item: dict) -> Keyword:
        """Parses a keyword from an item in the DataForSEO suggested keyword search response.

        Args:
            item: An item from the DataForSEO response.

        Raises:
            KeyError: If the expected keys are missing from the item.
        """
        text = item["keyword"]
        volume = item["keyword_info"]["search_volume"]
        return Keyword(text=text, volume=volume)

    def _extract_suggested_keywords(self, data: dict) -> List[Keyword]:
        """Extracts the keywords from the DataForSEO response for suggested keywords.

        Items that fail to parse are skipped with a warning.

        The DataForSEO results are of the form
        (c.f. https://docs.dataforseo.com/v3/dataforseo_labs/google/keyword_suggestions/live/?bash):
        {
            "tasks": [
                {
                    "result": [
                        {
                            "items": [
                                {
                                    "keyword": <suggested-keyword>,
                                    "keyword_info": {
                                        "search_volume": <volume>
                                    }
                                }
                            ]
                        }
                    ]
                }
            ]
        }

        Args:
            data: The response data from DataForSEO.
        """
        keywords = []
        for item in self._extract_items_from_data(data=data):
            try:
                keyword = self._parse_suggested_keyword(item)
                keywords.append(keyword)
            except Exception as e:
                # Best-effort parsing: one malformed item must not drop the batch.
                logger.warning(f"Ignoring keyword due to error: {e}.")
        return keywords

    async def _post_with_retry(
        self, url: str, data: List[dict], search_term: str
    ) -> dict:
        """POSTs `data` to the DataForSEO `url` with retries and returns the parsed JSON.

        There is some context aware logging:
        - `before`: before the request is made (or before retrying)
        - `before_sleep`: if the request fails, before sleeping

        Args:
            url: The full endpoint URL to POST to.
            data: The task list to send as the JSON body.
            search_term: The search term (used only for logging context).

        Raises:
            httpx.HTTPStatusError: If the final attempt returns an error status.
        """
        retry = get_async_retry()
        retry.before = lambda retry_state: self._log_before(
            search_term=search_term, retry_state=retry_state
        )
        retry.before_sleep = lambda retry_state: self._log_before_sleep(
            search_term=search_term, retry_state=retry_state
        )
        async for attempt in retry:
            with attempt:
                response = await self._http_client.post(
                    url=url, headers=self._headers, json=data
                )
                response.raise_for_status()
        return response.json()

    async def _get_suggested_keywords(
        self,
        search_term: str,
        language: Language,
        location: Location,
        limit: int = ENRICHMENT_DEFAULT_LIMIT,
    ) -> List[Keyword]:
        """Get keyword suggestions for a given search_term.

        Args:
            search_term: The search term to use for the query.
            language: The language to use for the search.
            location: The location to use for the search.
            limit: The upper limit of suggestions to get.
        """
        # Data must be a list of dictionaries, setting a number of search tasks; here we only have one task.
        data = [
            {
                "keyword": search_term,
                "language_name": language.name,
                "location_name": location.name,
                "limit": limit,
                "include_serp_info": True,
                "include_seed_keyword": True,
            }
        ]
        url = f"{self._base_endpoint}{self._suggestions_endpoint}"
        logger.debug(
            f'DataForSEO search suggested keywords with url="{url}" and data="{data}".'
        )

        # Perform the request (with retries) and extract the keywords.
        data_suggested_keywords = await self._post_with_retry(
            url=url, data=data, search_term=search_term
        )
        keywords = self._extract_suggested_keywords(data=data_suggested_keywords)

        logger.debug(f"Found {len(keywords)} suggestions from DataForSEO search.")
        return keywords

    @staticmethod
    def _parse_related_keyword(item: dict) -> Keyword:
        """Parses a keyword from an item in the DataForSEO related keyword search response.

        Args:
            item: An item from the DataForSEO response.

        Raises:
            KeyError: If the expected keys are missing from the item.
        """
        text = item["keyword_data"]["keyword"]
        volume = item["keyword_data"]["keyword_info"]["search_volume"]
        return Keyword(text=text, volume=volume)

    def _extract_related_keywords(self, data: dict) -> List[Keyword]:
        """Extracts the keywords from the DataForSEO response for related keywords.

        Items that fail to parse are skipped with a warning.

        The DataForSEO results are of the form
        (c.f. https://docs.dataforseo.com/v3/dataforseo_labs/google/related_keywords/live/?bash):
        {
            "tasks": [
                {
                    "result": [
                        {
                            "items": [
                                {
                                    "keyword_data": {
                                        "keyword": <related-keyword>,
                                        "keyword_info": {
                                            "search_volume": <volume>
                                        }
                                    }
                                }
                            ]
                        }
                    ]
                }
            ]
        }

        Args:
            data: The response data from DataForSEO.
        """
        keywords = []
        for item in self._extract_items_from_data(data=data):
            try:
                keyword = self._parse_related_keyword(item)
                keywords.append(keyword)
            except Exception as e:
                # Best-effort parsing: one malformed item must not drop the batch.
                logger.warning(f"Ignoring keyword due to error: {e}.")
        return keywords

    async def _get_related_keywords(
        self,
        search_term: str,
        language: Language,
        location: Location,
        limit: int = ENRICHMENT_DEFAULT_LIMIT,
    ) -> List[Keyword]:
        """Get related keywords for a given search_term.

        Args:
            search_term: The search term to use for the query.
            language: The language to use for the search.
            location: The location to use for the search.
            limit: The upper limit of suggestions to get.
        """
        # Data must be a list of dictionaries setting a number of search tasks; here we only have one task.
        data = [
            {
                "keyword": search_term,
                "language_name": language.name,
                "location_name": location.name,
                "limit": limit,
            }
        ]
        url = f"{self._base_endpoint}{self._keywords_endpoint}"
        logger.debug(
            f'DataForSEO search related keywords with url="{url}" and data="{data}".'
        )

        # Perform the request (with retries) and extract the keywords.
        data_related_keywords = await self._post_with_retry(
            url=url, data=data, search_term=search_term
        )
        keywords = self._extract_related_keywords(data=data_related_keywords)

        logger.debug(f"Found {len(keywords)} related keywords from DataForSEO search.")
        return keywords

    async def enrich(
        self,
        search_term: str,
        language: Language,
        location: Location,
        n_terms: int,
    ) -> List[str]:
        """Applies the enrichment to a search_term.

        Fetches suggested and related keywords (each branch is best-effort:
        a failure is logged and treated as an empty result), deduplicates
        them by keeping the highest volume per text, and returns the top
        `n_terms` keyword texts sorted by descending volume.

        Args:
            search_term: The search term to use for the query.
            language: The language to use for the search.
            location: The location to use for the search.
            n_terms: The number of additional terms.
        """
        logger.info(
            f'Applying enrichment for search_term="{search_term}" and n_terms="{n_terms}".'
        )
        # Get the additional suggested keywords
        try:
            suggested = await self._get_suggested_keywords(
                search_term=search_term,
                location=location,
                language=language,
                limit=n_terms,
            )
        except Exception:
            logger.error(
                f"Fetching suggested keywords for search_term='{search_term}' failed",
                exc_info=True,
            )
            suggested = []

        # Get the additional related keywords
        try:
            related = await self._get_related_keywords(
                search_term=search_term,
                location=location,
                language=language,
                limit=n_terms,
            )
        except Exception:
            logger.error(
                f"Fetching related keywords for search_term='{search_term}' failed",
                exc_info=True,
            )
            related = []

        # Remove original keyword and aggregate them by volume
        keywords = [kw for kw in suggested + related if kw.text != search_term]
        kw_vol: Dict[str, int] = defaultdict(int)
        for kw in keywords:
            # Keep the highest volume seen for each keyword text.
            kw_vol[kw.text] = max(kw.volume, kw_vol[kw.text])
        keywords = [Keyword(text=k, volume=v) for k, v in kw_vol.items()]
        logger.debug(f"Found {len(keywords)} additional unique keywords.")

        # Sort the keywords by volume and get the top n_terms
        keywords = sorted(keywords, key=lambda kw: kw.volume, reverse=True)
        terms = [kw.text for kw in keywords[:n_terms]]
        logger.info(f"Produced {len(terms)} additional search_terms.")
        return terms