webscout 2.6__py3-none-any.whl → 2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,43 +1,11 @@
1
1
  import asyncio
2
- import logging
3
- import warnings
4
- from concurrent.futures import ThreadPoolExecutor
5
- from contextlib import suppress
6
- from datetime import datetime, timezone
7
- from decimal import Decimal
8
- from functools import cached_property, partial
9
- from itertools import cycle, islice
10
2
  from types import TracebackType
11
- from typing import Dict, List, Optional, Tuple, Type, Union, cast
3
+ from typing import Dict, List, Optional, Type, Union
12
4
 
13
- from curl_cffi import requests
5
+ from .webscout_search import WEBS
14
6
 
15
- try:
16
- from lxml.html import HTMLParser as LHTMLParser
17
- from lxml.html import document_fromstring
18
-
19
- LXML_AVAILABLE = True
20
- except ImportError:
21
- LXML_AVAILABLE = False
22
-
23
- from .exceptions import WebscoutE, RatelimitE, TimeoutE
24
- from .utils import (
25
- _calculate_distance,
26
- _extract_vqd,
27
- _normalize,
28
- _normalize_url,
29
- _text_extract_json,
30
- json_loads,
31
- )
32
-
33
- logger = logging.getLogger("webscout_search.AsyncWEBS")
34
-
35
-
36
- class AsyncWEBS:
37
- """webscout_search async class to get search results from duckduckgo.com."""
38
-
39
- _executor: Optional[ThreadPoolExecutor] = None
40
7
 
8
+ class AsyncWEBS(WEBS):
41
9
  def __init__(
42
10
  self,
43
11
  headers: Optional[Dict[str, str]] = None,
@@ -53,83 +21,35 @@ class AsyncWEBS:
53
21
  example: "http://user:pass@example.com:3128". Defaults to None.
54
22
  timeout (int, optional): Timeout value for the HTTP client. Defaults to 10.
55
23
  """
56
- self.proxy: Optional[str] = proxy
57
- assert self.proxy is None or isinstance(self.proxy, str), "proxy must be a str"
58
- if not proxy and proxies:
59
- warnings.warn("'proxies' is deprecated, use 'proxy' instead.", stacklevel=1)
60
- self.proxy = proxies.get("http") or proxies.get("https") if isinstance(proxies, dict) else proxies
61
- self._asession = requests.AsyncSession(
62
- headers=headers,
63
- proxy=self.proxy,
64
- timeout=timeout,
65
- impersonate="chrome",
66
- allow_redirects=False,
67
- )
68
- self._asession.headers["Referer"] = "https://duckduckgo.com/"
69
- self._exception_event = asyncio.Event()
24
+ super().__init__(headers=headers, proxy=proxy, proxies=proxies, timeout=timeout)
25
+ self._loop = asyncio.get_running_loop()
26
+ self._executor = super()._executor
70
27
 
71
28
  async def __aenter__(self) -> "AsyncWEBS":
72
29
  return self
73
30
 
74
31
  async def __aexit__(
75
32
  self,
76
- exc_type: Optional[Type[BaseException]] = None,
77
- exc_val: Optional[BaseException] = None,
78
- exc_tb: Optional[TracebackType] = None,
33
+ exc_type: Optional[Type[BaseException]],
34
+ exc_val: Optional[BaseException],
35
+ exc_tb: Optional[TracebackType],
79
36
  ) -> None:
80
- await self._asession.__aexit__(exc_type, exc_val, exc_tb) # type: ignore
81
-
82
- def __del__(self) -> None:
83
- if hasattr(self, "_asession") and self._asession._closed is False:
84
- with suppress(RuntimeError, RuntimeWarning):
85
- asyncio.create_task(self._asession.close()) # type: ignore
86
-
87
- @cached_property
88
- def parser(self) -> Optional["LHTMLParser"]:
89
- """Get HTML parser."""
90
- return LHTMLParser(remove_blank_text=True, remove_comments=True, remove_pis=True, collect_ids=False)
91
-
92
- @classmethod
93
- def _get_executor(cls, max_workers: int = 1) -> ThreadPoolExecutor:
94
- """Get ThreadPoolExecutor. Default max_workers=1, because >=2 leads to a big overhead"""
95
- if cls._executor is None:
96
- cls._executor = ThreadPoolExecutor(max_workers=max_workers)
97
- return cls._executor
37
+ pass
98
38
 
99
- @property
100
- def executor(cls) -> Optional[ThreadPoolExecutor]:
101
- return cls._get_executor()
39
+ async def achat(self, keywords: str, model: str = "gpt-3.5") -> str:
40
+ """Initiates async chat session with Webscout AI.
102
41
 
103
- async def _aget_url(
104
- self,
105
- method: str,
106
- url: str,
107
- data: Optional[Union[Dict[str, str], bytes]] = None,
108
- params: Optional[Dict[str, str]] = None,
109
- ) -> bytes:
110
- if self._exception_event.is_set():
111
- raise WebscoutE("Exception occurred in previous call.")
112
- try:
113
- resp = await self._asession.request(method, url, data=data, params=params)
114
- except Exception as ex:
115
- self._exception_event.set()
116
- if "time" in str(ex).lower():
117
- raise TimeoutE(f"{url} {type(ex).__name__}: {ex}") from ex
118
- raise WebscoutE(f"{url} {type(ex).__name__}: {ex}") from ex
119
- logger.debug(f"_aget_url() {resp.url} {resp.status_code} {resp.elapsed:.2f} {len(resp.content)}")
120
- if resp.status_code == 200:
121
- return cast(bytes, resp.content)
122
- self._exception_event.set()
123
- if resp.status_code in (202, 301, 403):
124
- raise RatelimitE(f"{resp.url} {resp.status_code} Ratelimit")
125
- raise WebscoutE(f"{resp.url} return None. {params=} {data=}")
42
+ Args:
43
+ keywords (str): The initial message or question to send to the AI.
44
+ model (str): The model to use: "gpt-3.5", "claude-3-haiku". Defaults to "gpt-3.5".
126
45
 
127
- async def _aget_vqd(self, keywords: str) -> str:
128
- """Get vqd value for a search query."""
129
- resp_content = await self._aget_url("POST", "https://duckduckgo.com", data={"q": keywords})
130
- return _extract_vqd(resp_content, keywords)
46
+ Returns:
47
+ str: The response from the AI.
48
+ """
49
+ result = await self._loop.run_in_executor(self._executor, super().chat, keywords, model)
50
+ return result
131
51
 
132
- async def text(
52
+ async def atext(
133
53
  self,
134
54
  keywords: str,
135
55
  region: str = "wt-wt",
@@ -138,7 +58,7 @@ class AsyncWEBS:
138
58
  backend: str = "api",
139
59
  max_results: Optional[int] = None,
140
60
  ) -> List[Dict[str, str]]:
141
- """webscout text search generator. Query params: https://duckduckgo.com/params.
61
+ """Webscout async text search. Query params: https://duckduckgo.com/params.
142
62
 
143
63
  Args:
144
64
  keywords: keywords for query.
@@ -155,296 +75,16 @@ class AsyncWEBS:
155
75
  List of dictionaries with search results, or None if there was an error.
156
76
 
157
77
  Raises:
158
- WebscoutE: Base exception for webscout_search errors.
159
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
160
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
161
- """
162
- if LXML_AVAILABLE is False and backend != "api":
163
- backend = "api"
164
- warnings.warn("lxml is not installed. Using backend='api'.", stacklevel=2)
165
-
166
- if backend == "api":
167
- results = await self._text_api(keywords, region, safesearch, timelimit, max_results)
168
- elif backend == "html":
169
- results = await self._text_html(keywords, region, safesearch, timelimit, max_results)
170
- elif backend == "lite":
171
- results = await self._text_lite(keywords, region, timelimit, max_results)
172
- return results
173
-
174
- async def _text_api(
175
- self,
176
- keywords: str,
177
- region: str = "wt-wt",
178
- safesearch: str = "moderate",
179
- timelimit: Optional[str] = None,
180
- max_results: Optional[int] = None,
181
- ) -> List[Dict[str, str]]:
182
- """webscout text search generator. Query params: https://duckduckgo.com/params.
183
-
184
- Args:
185
- keywords: keywords for query.
186
- region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
187
- safesearch: on, moderate, off. Defaults to "moderate".
188
- timelimit: d, w, m, y. Defaults to None.
189
- max_results: max number of results. If None, returns results only from the first response. Defaults to None.
190
-
191
- Returns:
192
- List of dictionaries with search results.
193
-
194
- Raises:
195
- WebscoutE: Base exception for webscout_search errors.
196
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
197
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
198
- """
199
- assert keywords, "keywords is mandatory"
200
-
201
- vqd = await self._aget_vqd(keywords)
202
-
203
- payload = {
204
- "q": keywords,
205
- "kl": region,
206
- "l": region,
207
- "p": "",
208
- "s": "0",
209
- "df": "",
210
- "vqd": vqd,
211
- "ex": "",
212
- }
213
- safesearch = safesearch.lower()
214
- if safesearch == "moderate":
215
- payload["ex"] = "-1"
216
- elif safesearch == "off":
217
- payload["ex"] = "-2"
218
- elif safesearch == "on": # strict
219
- payload["p"] = "1"
220
- if timelimit:
221
- payload["df"] = timelimit
222
-
223
- cache = set()
224
- results: List[Optional[Dict[str, str]]] = [None] * 1100
225
-
226
- async def _text_api_page(s: int, page: int) -> None:
227
- priority = page * 100
228
- payload["s"] = f"{s}"
229
- resp_content = await self._aget_url("GET", "https://links.duckduckgo.com/d.js", params=payload)
230
- page_data = _text_extract_json(resp_content, keywords)
231
-
232
- for row in page_data:
233
- href = row.get("u", None)
234
- if href and href not in cache and href != f"http://www.google.com/search?q={keywords}":
235
- cache.add(href)
236
- body = _normalize(row["a"])
237
- if body:
238
- priority += 1
239
- result = {
240
- "title": _normalize(row["t"]),
241
- "href": _normalize_url(href),
242
- "body": body,
243
- }
244
- results[priority] = result
245
-
246
- tasks = [asyncio.create_task(_text_api_page(0, 0))]
247
- if max_results:
248
- max_results = min(max_results, 500)
249
- tasks.extend(
250
- asyncio.create_task(_text_api_page(s, i)) for i, s in enumerate(range(23, max_results, 50), start=1)
251
- )
252
- try:
253
- await asyncio.gather(*tasks)
254
- except Exception as e:
255
- for task in tasks:
256
- task.cancel()
257
- await asyncio.gather(*tasks, return_exceptions=True)
258
- raise e
259
-
260
- return list(islice(filter(None, results), max_results))
261
-
262
- async def _text_html(
263
- self,
264
- keywords: str,
265
- region: str = "wt-wt",
266
- safesearch: str = "moderate",
267
- timelimit: Optional[str] = None,
268
- max_results: Optional[int] = None,
269
- ) -> List[Dict[str, str]]:
270
- """webscout text search generator. Query params: https://duckduckgo.com/params.
271
-
272
- Args:
273
- keywords: keywords for query.
274
- region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
275
- safesearch: on, moderate, off. Defaults to "moderate".
276
- timelimit: d, w, m, y. Defaults to None.
277
- max_results: max number of results. If None, returns results only from the first response. Defaults to None.
278
-
279
- Returns:
280
- List of dictionaries with search results.
281
-
282
- Raises:
283
- WebscoutE: Base exception for webscout_search errors.
284
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
285
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
286
- """
287
- assert keywords, "keywords is mandatory"
288
-
289
- self._asession.headers["Referer"] = "https://html.duckduckgo.com/"
290
- safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
291
- payload = {
292
- "q": keywords,
293
- "kl": region,
294
- "p": safesearch_base[safesearch.lower()],
295
- "o": "json",
296
- "api": "d.js",
297
- }
298
- if timelimit:
299
- payload["df"] = timelimit
300
- if max_results and max_results > 20:
301
- vqd = await self._aget_vqd(keywords)
302
- payload["vqd"] = vqd
303
-
304
- cache = set()
305
- results: List[Optional[Dict[str, str]]] = [None] * 1100
306
-
307
- async def _text_html_page(s: int, page: int) -> None:
308
- priority = page * 100
309
- payload["s"] = f"{s}"
310
- resp_content = await self._aget_url("POST", "https://html.duckduckgo.com/html", data=payload)
311
- if b"No results." in resp_content:
312
- return
313
-
314
- tree = await self._asession.loop.run_in_executor(
315
- self.executor, partial(document_fromstring, resp_content, self.parser)
316
- )
317
-
318
- for e in tree.xpath("//div[h2]"):
319
- href = e.xpath("./a/@href")
320
- href = href[0] if href else None
321
- if (
322
- href
323
- and href not in cache
324
- and not href.startswith(
325
- ("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
326
- )
327
- ):
328
- cache.add(href)
329
- title = e.xpath("./h2/a/text()")
330
- body = e.xpath("./a//text()")
331
-
332
- priority += 1
333
- result = {
334
- "title": _normalize(title[0]),
335
- "href": _normalize_url(href),
336
- "body": _normalize("".join(body)),
337
- }
338
- results[priority] = result
339
-
340
- tasks = [asyncio.create_task(_text_html_page(0, 0))]
341
- if max_results:
342
- max_results = min(max_results, 500)
343
- tasks.extend(
344
- asyncio.create_task(_text_html_page(s, i)) for i, s in enumerate(range(23, max_results, 50), start=1)
345
- )
346
- try:
347
- await asyncio.gather(*tasks)
348
- except Exception as e:
349
- for task in tasks:
350
- task.cancel()
351
- await asyncio.gather(*tasks, return_exceptions=True)
352
- raise e
353
-
354
- return list(islice(filter(None, results), max_results))
355
-
356
- async def _text_lite(
357
- self,
358
- keywords: str,
359
- region: str = "wt-wt",
360
- timelimit: Optional[str] = None,
361
- max_results: Optional[int] = None,
362
- ) -> List[Dict[str, str]]:
363
- """webscout text search generator. Query params: https://duckduckgo.com/params.
364
-
365
- Args:
366
- keywords: keywords for query.
367
- region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
368
- timelimit: d, w, m, y. Defaults to None.
369
- max_results: max number of results. If None, returns results only from the first response. Defaults to None.
370
-
371
- Returns:
372
- List of dictionaries with search results.
373
-
374
- Raises:
375
- WebscoutE: Base exception for webscout_search errors.
376
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
377
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
78
+ DuckDuckGoSearchException: Base exception for duckduckgo_search errors.
79
+ RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
80
+ TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
378
81
  """
379
- assert keywords, "keywords is mandatory"
380
-
381
- self._asession.headers["Referer"] = "https://lite.duckduckgo.com/"
382
- payload = {
383
- "q": keywords,
384
- "o": "json",
385
- "api": "d.js",
386
- "kl": region,
387
- }
388
- if timelimit:
389
- payload["df"] = timelimit
390
-
391
- cache = set()
392
- results: List[Optional[Dict[str, str]]] = [None] * 1100
393
-
394
- async def _text_lite_page(s: int, page: int) -> None:
395
- priority = page * 100
396
- payload["s"] = f"{s}"
397
- resp_content = await self._aget_url("POST", "https://lite.duckduckgo.com/lite/", data=payload)
398
- if b"No more results." in resp_content:
399
- return
400
-
401
- tree = await self._asession.loop.run_in_executor(
402
- self.executor, partial(document_fromstring, resp_content, self.parser)
403
- )
404
-
405
- data = zip(cycle(range(1, 5)), tree.xpath("//table[last()]//tr"))
406
- for i, e in data:
407
- if i == 1:
408
- href = e.xpath(".//a//@href")
409
- href = href[0] if href else None
410
- if (
411
- href is None
412
- or href in cache
413
- or href.startswith(("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain"))
414
- ):
415
- [next(data, None) for _ in range(3)] # skip block(i=1,2,3,4)
416
- else:
417
- cache.add(href)
418
- title = e.xpath(".//a//text()")[0]
419
- elif i == 2:
420
- body = e.xpath(".//td[@class='result-snippet']//text()")
421
- body = "".join(body).strip()
422
- elif i == 3:
423
- priority += 1
424
- result = {
425
- "title": _normalize(title),
426
- "href": _normalize_url(href),
427
- "body": _normalize(body),
428
- }
429
- results[priority] = result
430
-
431
- tasks = [asyncio.create_task(_text_lite_page(0, 0))]
432
- if max_results:
433
- max_results = min(max_results, 500)
434
- tasks.extend(
435
- asyncio.create_task(_text_lite_page(s, i)) for i, s in enumerate(range(23, max_results, 50), start=1)
436
- )
437
- try:
438
- await asyncio.gather(*tasks)
439
- except Exception as e:
440
- for task in tasks:
441
- task.cancel()
442
- await asyncio.gather(*tasks, return_exceptions=True)
443
- raise e
444
-
445
- return list(islice(filter(None, results), max_results))
82
+ result = await self._loop.run_in_executor(
83
+ self._executor, super().text, keywords, region, safesearch, timelimit, backend, max_results
84
+ )
85
+ return result
446
86
 
447
- async def images(
87
+ async def aimages(
448
88
  self,
449
89
  keywords: str,
450
90
  region: str = "wt-wt",
@@ -457,7 +97,7 @@ class AsyncWEBS:
457
97
  license_image: Optional[str] = None,
458
98
  max_results: Optional[int] = None,
459
99
  ) -> List[Dict[str, str]]:
460
- """webscout images search. Query params: https://duckduckgo.com/params.
100
+ """Webscout async images search. Query params: https://duckduckgo.com/params.
461
101
 
462
102
  Args:
463
103
  keywords: keywords for query.
@@ -480,74 +120,27 @@ class AsyncWEBS:
480
120
  List of dictionaries with images search results.
481
121
 
482
122
  Raises:
483
- WebscoutE: Base exception for webscout_search errors.
484
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
485
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
123
+ DuckDuckGoSearchException: Base exception for duckduckgo_search errors.
124
+ RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
125
+ TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
486
126
  """
487
- assert keywords, "keywords is mandatory"
488
-
489
- vqd = await self._aget_vqd(keywords)
490
-
491
- safesearch_base = {"on": "1", "moderate": "1", "off": "-1"}
492
- timelimit = f"time:{timelimit}" if timelimit else ""
493
- size = f"size:{size}" if size else ""
494
- color = f"color:{color}" if color else ""
495
- type_image = f"type:{type_image}" if type_image else ""
496
- layout = f"layout:{layout}" if layout else ""
497
- license_image = f"license:{license_image}" if license_image else ""
498
- payload = {
499
- "l": region,
500
- "o": "json",
501
- "q": keywords,
502
- "vqd": vqd,
503
- "f": f"{timelimit},{size},{color},{type_image},{layout},{license_image}",
504
- "p": safesearch_base[safesearch.lower()],
505
- }
506
-
507
- cache = set()
508
- results: List[Optional[Dict[str, str]]] = [None] * 600
509
-
510
- async def _images_page(s: int, page: int) -> None:
511
- priority = page * 100
512
- payload["s"] = f"{s}"
513
- resp_content = await self._aget_url("GET", "https://duckduckgo.com/i.js", params=payload)
514
- resp_json = json_loads(resp_content)
515
-
516
- page_data = resp_json.get("results", [])
517
-
518
- for row in page_data:
519
- image_url = row.get("image")
520
- if image_url and image_url not in cache:
521
- cache.add(image_url)
522
- priority += 1
523
- result = {
524
- "title": row["title"],
525
- "image": _normalize_url(image_url),
526
- "thumbnail": _normalize_url(row["thumbnail"]),
527
- "url": _normalize_url(row["url"]),
528
- "height": row["height"],
529
- "width": row["width"],
530
- "source": row["source"],
531
- }
532
- results[priority] = result
533
-
534
- tasks = [asyncio.create_task(_images_page(0, page=0))]
535
- if max_results:
536
- max_results = min(max_results, 500)
537
- tasks.extend(
538
- asyncio.create_task(_images_page(s, i)) for i, s in enumerate(range(100, max_results, 100), start=1)
539
- )
540
- try:
541
- await asyncio.gather(*tasks)
542
- except Exception as e:
543
- for task in tasks:
544
- task.cancel()
545
- await asyncio.gather(*tasks, return_exceptions=True)
546
- raise e
547
-
548
- return list(islice(filter(None, results), max_results))
127
+ result = await self._loop.run_in_executor(
128
+ self._executor,
129
+ super().images,
130
+ keywords,
131
+ region,
132
+ safesearch,
133
+ timelimit,
134
+ size,
135
+ color,
136
+ type_image,
137
+ layout,
138
+ license_image,
139
+ max_results,
140
+ )
141
+ return result
549
142
 
550
- async def videos(
143
+ async def avideos(
551
144
  self,
552
145
  keywords: str,
553
146
  region: str = "wt-wt",
@@ -558,7 +151,7 @@ class AsyncWEBS:
558
151
  license_videos: Optional[str] = None,
559
152
  max_results: Optional[int] = None,
560
153
  ) -> List[Dict[str, str]]:
561
- """webscout videos search. Query params: https://duckduckgo.com/params.
154
+ """Webscout async videos search. Query params: https://duckduckgo.com/params.
562
155
 
563
156
  Args:
564
157
  keywords: keywords for query.
@@ -574,62 +167,25 @@ class AsyncWEBS:
574
167
  List of dictionaries with videos search results.
575
168
 
576
169
  Raises:
577
- WebscoutE: Base exception for webscout_search errors.
578
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
579
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
170
+ DuckDuckGoSearchException: Base exception for duckduckgo_search errors.
171
+ RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
172
+ TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
580
173
  """
581
- assert keywords, "keywords is mandatory"
582
-
583
- vqd = await self._aget_vqd(keywords)
584
-
585
- safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
586
- timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
587
- resolution = f"videoDefinition:{resolution}" if resolution else ""
588
- duration = f"videoDuration:{duration}" if duration else ""
589
- license_videos = f"videoLicense:{license_videos}" if license_videos else ""
590
- payload = {
591
- "l": region,
592
- "o": "json",
593
- "q": keywords,
594
- "vqd": vqd,
595
- "f": f"{timelimit},{resolution},{duration},{license_videos}",
596
- "p": safesearch_base[safesearch.lower()],
597
- }
598
-
599
- cache = set()
600
- results: List[Optional[Dict[str, str]]] = [None] * 700
601
-
602
- async def _videos_page(s: int, page: int) -> None:
603
- priority = page * 100
604
- payload["s"] = f"{s}"
605
- resp_content = await self._aget_url("GET", "https://duckduckgo.com/v.js", params=payload)
606
- resp_json = json_loads(resp_content)
607
-
608
- page_data = resp_json.get("results", [])
609
-
610
- for row in page_data:
611
- if row["content"] not in cache:
612
- cache.add(row["content"])
613
- priority += 1
614
- results[priority] = row
615
-
616
- tasks = [asyncio.create_task(_videos_page(0, 0))]
617
- if max_results:
618
- max_results = min(max_results, 400)
619
- tasks.extend(
620
- asyncio.create_task(_videos_page(s, i)) for i, s in enumerate(range(59, max_results, 59), start=1)
621
- )
622
- try:
623
- await asyncio.gather(*tasks)
624
- except Exception as e:
625
- for task in tasks:
626
- task.cancel()
627
- await asyncio.gather(*tasks, return_exceptions=True)
628
- raise e
629
-
630
- return list(islice(filter(None, results), max_results))
174
+ result = await self._loop.run_in_executor(
175
+ self._executor,
176
+ super().videos,
177
+ keywords,
178
+ region,
179
+ safesearch,
180
+ timelimit,
181
+ resolution,
182
+ duration,
183
+ license_videos,
184
+ max_results,
185
+ )
186
+ return result
631
187
 
632
- async def news(
188
+ async def anews(
633
189
  self,
634
190
  keywords: str,
635
191
  region: str = "wt-wt",
@@ -637,7 +193,7 @@ class AsyncWEBS:
637
193
  timelimit: Optional[str] = None,
638
194
  max_results: Optional[int] = None,
639
195
  ) -> List[Dict[str, str]]:
640
- """webscout news search. Query params: https://duckduckgo.com/params.
196
+ """Webscout async news search. Query params: https://duckduckgo.com/params.
641
197
 
642
198
  Args:
643
199
  keywords: keywords for query.
@@ -650,69 +206,26 @@ class AsyncWEBS:
650
206
  List of dictionaries with news search results.
651
207
 
652
208
  Raises:
653
- WebscoutE: Base exception for webscout_search errors.
654
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
655
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
209
+ DuckDuckGoSearchException: Base exception for duckduckgo_search errors.
210
+ RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
211
+ TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
656
212
  """
657
- assert keywords, "keywords is mandatory"
658
-
659
- vqd = await self._aget_vqd(keywords)
660
-
661
- safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
662
- payload = {
663
- "l": region,
664
- "o": "json",
665
- "noamp": "1",
666
- "q": keywords,
667
- "vqd": vqd,
668
- "p": safesearch_base[safesearch.lower()],
669
- }
670
- if timelimit:
671
- payload["df"] = timelimit
672
-
673
- cache = set()
674
- results: List[Optional[Dict[str, str]]] = [None] * 700
675
-
676
- async def _news_page(s: int, page: int) -> None:
677
- priority = page * 100
678
- payload["s"] = f"{s}"
679
- resp_content = await self._aget_url("GET", "https://duckduckgo.com/news.js", params=payload)
680
- resp_json = json_loads(resp_content)
681
- page_data = resp_json.get("results", [])
682
-
683
- for row in page_data:
684
- if row["url"] not in cache:
685
- cache.add(row["url"])
686
- image_url = row.get("image", None)
687
- priority += 1
688
- result = {
689
- "date": datetime.fromtimestamp(row["date"], timezone.utc).isoformat(),
690
- "title": row["title"],
691
- "body": _normalize(row["excerpt"]),
692
- "url": _normalize_url(row["url"]),
693
- "image": _normalize_url(image_url),
694
- "source": row["source"],
695
- }
696
- results[priority] = result
697
-
698
- tasks = [asyncio.create_task(_news_page(0, 0))]
699
- if max_results:
700
- max_results = min(max_results, 200)
701
- tasks.extend(
702
- asyncio.create_task(_news_page(s, i)) for i, s in enumerate(range(29, max_results, 29), start=1)
703
- )
704
- try:
705
- await asyncio.gather(*tasks)
706
- except Exception as e:
707
- for task in tasks:
708
- task.cancel()
709
- await asyncio.gather(*tasks, return_exceptions=True)
710
- raise e
711
-
712
- return list(islice(filter(None, results), max_results))
213
+ result = await self._loop.run_in_executor(
214
+ self._executor,
215
+ super().news,
216
+ keywords,
217
+ region,
218
+ safesearch,
219
+ timelimit,
220
+ max_results,
221
+ )
222
+ return result
713
223
 
714
- async def answers(self, keywords: str) -> List[Dict[str, str]]:
715
- """webscout instant answers. Query params: https://duckduckgo.com/params.
224
+ async def aanswers(
225
+ self,
226
+ keywords: str,
227
+ ) -> List[Dict[str, str]]:
228
+ """Webscout async instant answers. Query params: https://duckduckgo.com/params.
716
229
 
717
230
  Args:
718
231
  keywords: keywords for query,
@@ -721,69 +234,23 @@ class AsyncWEBS:
721
234
  List of dictionaries with instant answers results.
722
235
 
723
236
  Raises:
724
- WebscoutE: Base exception for webscout_search errors.
725
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
726
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
237
+ DuckDuckGoSearchException: Base exception for duckduckgo_search errors.
238
+ RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
239
+ TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
727
240
  """
728
- assert keywords, "keywords is mandatory"
729
-
730
- payload = {
731
- "q": f"what is {keywords}",
732
- "format": "json",
733
- }
734
- resp_content = await self._aget_url("GET", "https://api.duckduckgo.com/", params=payload)
735
- page_data = json_loads(resp_content)
736
-
737
- results = []
738
- answer = page_data.get("AbstractText")
739
- url = page_data.get("AbstractURL")
740
- if answer:
741
- results.append(
742
- {
743
- "icon": None,
744
- "text": answer,
745
- "topic": None,
746
- "url": url,
747
- }
748
- )
749
-
750
- # related
751
- payload = {
752
- "q": f"{keywords}",
753
- "format": "json",
754
- }
755
- resp_content = await self._aget_url("GET", "https://api.duckduckgo.com/", params=payload)
756
- resp_json = json_loads(resp_content)
757
- page_data = resp_json.get("RelatedTopics", [])
758
-
759
- for row in page_data:
760
- topic = row.get("Name")
761
- if not topic:
762
- icon = row["Icon"].get("URL")
763
- results.append(
764
- {
765
- "icon": f"https://duckduckgo.com{icon}" if icon else "",
766
- "text": row["Text"],
767
- "topic": None,
768
- "url": row["FirstURL"],
769
- }
770
- )
771
- else:
772
- for subrow in row["Topics"]:
773
- icon = subrow["Icon"].get("URL")
774
- results.append(
775
- {
776
- "icon": f"https://duckduckgo.com{icon}" if icon else "",
777
- "text": subrow["Text"],
778
- "topic": topic,
779
- "url": subrow["FirstURL"],
780
- }
781
- )
782
-
783
- return results
241
+ result = await self._loop.run_in_executor(
242
+ self._executor,
243
+ super().answers,
244
+ keywords,
245
+ )
246
+ return result
784
247
 
785
- async def suggestions(self, keywords: str, region: str = "wt-wt") -> List[Dict[str, str]]:
786
- """webscout suggestions. Query params: https://duckduckgo.com/params.
248
+ async def asuggestions(
249
+ self,
250
+ keywords: str,
251
+ region: str = "wt-wt",
252
+ ) -> List[Dict[str, str]]:
253
+ """Webscout async suggestions. Query params: https://duckduckgo.com/params.
787
254
 
788
255
  Args:
789
256
  keywords: keywords for query.
@@ -793,21 +260,19 @@ class AsyncWEBS:
793
260
  List of dictionaries with suggestions results.
794
261
 
795
262
  Raises:
796
- WebscoutE: Base exception for webscout_search errors.
797
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
798
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
263
+ DuckDuckGoSearchException: Base exception for duckduckgo_search errors.
264
+ RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
265
+ TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
799
266
  """
800
- assert keywords, "keywords is mandatory"
801
-
802
- payload = {
803
- "q": keywords,
804
- "kl": region,
805
- }
806
- resp_content = await self._aget_url("GET", "https://duckduckgo.com/ac/", params=payload)
807
- page_data = json_loads(resp_content)
808
- return [r for r in page_data]
267
+ result = await self._loop.run_in_executor(
268
+ self._executor,
269
+ super().suggestions,
270
+ keywords,
271
+ region,
272
+ )
273
+ return result
809
274
 
810
- async def maps(
275
+ async def amaps(
811
276
  self,
812
277
  keywords: str,
813
278
  place: Optional[str] = None,
@@ -822,7 +287,7 @@ class AsyncWEBS:
822
287
  radius: int = 0,
823
288
  max_results: Optional[int] = None,
824
289
  ) -> List[Dict[str, str]]:
825
- """webscout maps search. Query params: https://duckduckgo.com/params.
290
+ """Webscout async maps search. Query params: https://duckduckgo.com/params.
826
291
 
827
292
  Args:
828
293
  keywords: keywords for query
@@ -843,159 +308,35 @@ class AsyncWEBS:
843
308
  List of dictionaries with maps search results, or None if there was an error.
844
309
 
845
310
  Raises:
846
- WebscoutE: Base exception for webscout_search errors.
847
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
848
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
311
+ DuckDuckGoSearchException: Base exception for duckduckgo_search errors.
312
+ RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
313
+ TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
849
314
  """
850
- assert keywords, "keywords is mandatory"
851
-
852
- vqd = await self._aget_vqd(keywords)
853
-
854
- # if longitude and latitude are specified, skip the request about bbox to the nominatim api
855
- if latitude and longitude:
856
- lat_t = Decimal(latitude.replace(",", "."))
857
- lat_b = Decimal(latitude.replace(",", "."))
858
- lon_l = Decimal(longitude.replace(",", "."))
859
- lon_r = Decimal(longitude.replace(",", "."))
860
- if radius == 0:
861
- radius = 1
862
- # otherwise request about bbox to nominatim api
863
- else:
864
- if place:
865
- params = {
866
- "q": place,
867
- "polygon_geojson": "0",
868
- "format": "jsonv2",
869
- }
870
- else:
871
- params = {
872
- "polygon_geojson": "0",
873
- "format": "jsonv2",
874
- }
875
- if street:
876
- params["street"] = street
877
- if city:
878
- params["city"] = city
879
- if county:
880
- params["county"] = county
881
- if state:
882
- params["state"] = state
883
- if country:
884
- params["country"] = country
885
- if postalcode:
886
- params["postalcode"] = postalcode
887
- # request nominatim api to get coordinates box
888
- resp_content = await self._aget_url(
889
- "GET",
890
- "https://nominatim.openstreetmap.org/search.php",
891
- params=params,
892
- )
893
- if resp_content == b"[]":
894
- raise WebscoutE("maps() Сoordinates are not found, check function parameters.")
895
- resp_json = json_loads(resp_content)
896
- coordinates = resp_json[0]["boundingbox"]
897
- lat_t, lon_l = Decimal(coordinates[1]), Decimal(coordinates[2])
898
- lat_b, lon_r = Decimal(coordinates[0]), Decimal(coordinates[3])
899
-
900
- # if a radius is specified, expand the search square
901
- lat_t += Decimal(radius) * Decimal(0.008983)
902
- lat_b -= Decimal(radius) * Decimal(0.008983)
903
- lon_l -= Decimal(radius) * Decimal(0.008983)
904
- lon_r += Decimal(radius) * Decimal(0.008983)
905
- logger.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")
906
-
907
- cache = set()
908
- results: List[Dict[str, str]] = []
909
-
910
- async def _maps_page(
911
- bbox: Tuple[Decimal, Decimal, Decimal, Decimal],
912
- ) -> Optional[List[Dict[str, str]]]:
913
- if max_results and len(results) >= max_results:
914
- return None
915
- lat_t, lon_l, lat_b, lon_r = bbox
916
- params = {
917
- "q": keywords,
918
- "vqd": vqd,
919
- "tg": "maps_places",
920
- "rt": "D",
921
- "mkexp": "b",
922
- "wiki_info": "1",
923
- "is_requery": "1",
924
- "bbox_tl": f"{lat_t},{lon_l}",
925
- "bbox_br": f"{lat_b},{lon_r}",
926
- "strict_bbox": "1",
927
- }
928
- resp_content = await self._aget_url("GET", "https://duckduckgo.com/local.js", params=params)
929
- resp_json = json_loads(resp_content)
930
- page_data = resp_json.get("results", [])
931
-
932
- page_results = []
933
- for res in page_data:
934
- r_name = f'{res["name"]} {res["address"]}'
935
- if r_name in cache:
936
- continue
937
- else:
938
- cache.add(r_name)
939
- result = {
940
- "title": res["name"],
941
- "address": res["address"],
942
- "country_code": res["country_code"],
943
- "url": _normalize_url(res["website"]),
944
- "phone": res["phone"] or "",
945
- "latitude": res["coordinates"]["latitude"],
946
- "longitude": res["coordinates"]["longitude"],
947
- "source": _normalize_url(res["url"]),
948
- "image": x.get("image", "") if (x := res["embed"]) else "",
949
- "desc": x.get("description", "") if (x := res["embed"]) else "",
950
- "hours": res["hours"] or "",
951
- "category": res["ddg_category"] or "",
952
- "facebook": f"www.facebook.com/profile.php?id={x}" if (x := res["facebook_id"]) else "",
953
- "instagram": f"https://www.instagram.com/{x}" if (x := res["instagram_id"]) else "",
954
- "twitter": f"https://twitter.com/{x}" if (x := res["twitter_id"]) else "",
955
- }
956
- page_results.append(result)
957
-
958
- return page_results
959
-
960
- # search squares (bboxes)
961
- start_bbox = (lat_t, lon_l, lat_b, lon_r)
962
- work_bboxes = [start_bbox]
963
- while work_bboxes:
964
- queue_bboxes = [] # for next iteration, at the end of the iteration work_bboxes = queue_bboxes
965
- tasks = []
966
- for bbox in work_bboxes:
967
- tasks.append(asyncio.create_task(_maps_page(bbox)))
968
- # if distance between coordinates > 1, divide the square into 4 parts and save them in queue_bboxes
969
- if _calculate_distance(lat_t, lon_l, lat_b, lon_r) > 1:
970
- lat_t, lon_l, lat_b, lon_r = bbox
971
- lat_middle = (lat_t + lat_b) / 2
972
- lon_middle = (lon_l + lon_r) / 2
973
- bbox1 = (lat_t, lon_l, lat_middle, lon_middle)
974
- bbox2 = (lat_t, lon_middle, lat_middle, lon_r)
975
- bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
976
- bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
977
- queue_bboxes.extend([bbox1, bbox2, bbox3, bbox4])
978
-
979
- # gather tasks using asyncio.wait_for and timeout
980
- with suppress(Exception):
981
- work_bboxes_results = await asyncio.gather(*[asyncio.wait_for(task, timeout=10) for task in tasks])
982
-
983
- for x in work_bboxes_results:
984
- if isinstance(x, list):
985
- results.extend(x)
986
- elif isinstance(x, dict):
987
- results.append(x)
988
-
989
- work_bboxes = queue_bboxes
990
- if not max_results or len(results) >= max_results or len(work_bboxes_results) == 0:
991
- break
992
-
993
- return list(islice(results, max_results))
315
+ result = await self._loop.run_in_executor(
316
+ self._executor,
317
+ super().maps,
318
+ keywords,
319
+ place,
320
+ street,
321
+ city,
322
+ county,
323
+ state,
324
+ country,
325
+ postalcode,
326
+ latitude,
327
+ longitude,
328
+ radius,
329
+ max_results,
330
+ )
331
+ return result
994
332
 
995
- async def translate(
996
- self, keywords: Union[List[str], str], from_: Optional[str] = None, to: str = "en"
333
+ async def atranslate(
334
+ self,
335
+ keywords: Union[List[str], str],
336
+ from_: Optional[str] = None,
337
+ to: str = "en",
997
338
  ) -> List[Dict[str, str]]:
998
- """webscout translate.
339
+ """Webscout async translate.
999
340
 
1000
341
  Args:
1001
342
  keywords: string or list of strings to translate.
@@ -1006,44 +347,15 @@ class AsyncWEBS:
1006
347
  List of dictionaries with translated keywords.
1007
348
 
1008
349
  Raises:
1009
- WebscoutE: Base exception for webscout_search errors.
1010
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
1011
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
350
+ DuckDuckGoSearchException: Base exception for duckduckgo_search errors.
351
+ RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
352
+ TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
1012
353
  """
1013
- assert keywords, "keywords is mandatory"
1014
-
1015
- vqd = await self._aget_vqd("translate")
1016
-
1017
- payload = {
1018
- "vqd": vqd,
1019
- "query": "translate",
1020
- "to": to,
1021
- }
1022
- if from_:
1023
- payload["from"] = from_
1024
-
1025
- results = []
1026
-
1027
- async def _translate_keyword(keyword: str) -> None:
1028
- resp_content = await self._aget_url(
1029
- "POST",
1030
- "https://duckduckgo.com/translation.js",
1031
- params=payload,
1032
- data=keyword.encode(),
1033
- )
1034
- page_data = json_loads(resp_content)
1035
- page_data["original"] = keyword
1036
- results.append(page_data)
1037
-
1038
- if isinstance(keywords, str):
1039
- keywords = [keywords]
1040
- tasks = [asyncio.create_task(_translate_keyword(keyword)) for keyword in keywords]
1041
- try:
1042
- await asyncio.gather(*tasks)
1043
- except Exception as e:
1044
- for task in tasks:
1045
- task.cancel()
1046
- await asyncio.gather(*tasks, return_exceptions=True)
1047
- raise e
1048
-
1049
- return results
354
+ result = await self._loop.run_in_executor(
355
+ self._executor,
356
+ super().translate,
357
+ keywords,
358
+ from_,
359
+ to,
360
+ )
361
+ return result