webscout 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic. Click here for more details.
- webscout/__init__.py +18 -0
- webscout/__main__.py +5 -0
- webscout/cli.py +432 -0
- webscout/exceptions.py +2 -0
- webscout/models.py +23 -0
- webscout/utils.py +48 -0
- webscout/version.py +1 -0
- webscout/webscout_search.py +65 -0
- webscout/webscout_search_async.py +861 -0
- webscout-1.0.0.dist-info/LICENSE.md +21 -0
- webscout-1.0.0.dist-info/METADATA +627 -0
- webscout-1.0.0.dist-info/RECORD +15 -0
- webscout-1.0.0.dist-info/WHEEL +5 -0
- webscout-1.0.0.dist-info/entry_points.txt +2 -0
- webscout-1.0.0.dist-info/top_level.txt +1 -0
webscout/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""webscout.
|
|
2
|
+
|
|
3
|
+
Search for words, documents, images, videos, news, maps and text translation
|
|
4
|
+
using the DuckDuckGo.com search engine.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
|
|
9
|
+
# votext: noqa: F401
|
|
10
|
+
from .webscout_search import DDGS
|
|
11
|
+
from .webscout_search_async import AsyncDDGS
|
|
12
|
+
from .version import __version__
|
|
13
|
+
|
|
14
|
+
__all__ = ["DDGS", "AsyncDDGS", "__version__", "cli"]
|
|
15
|
+
|
|
16
|
+
# A do-nothing logging handler
|
|
17
|
+
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
|
|
18
|
+
logging.getLogger("duckduckgo_search").addHandler(logging.NullHandler())
|
webscout/__main__.py
ADDED
webscout/cli.py
ADDED
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from urllib.parse import unquote
|
|
8
|
+
|
|
9
|
+
import click
|
|
10
|
+
from curl_cffi import requests
|
|
11
|
+
|
|
12
|
+
from .webscout_search import DDGS
|
|
13
|
+
from .version import __version__
|
|
14
|
+
|
|
15
|
+
# Module-level logger, following the standard getLogger(__name__) convention.
logger = logging.getLogger(__name__)

# Map of small ints to click color names. _print_data indexes this with the
# 1-based field position of each result entry, giving every field a distinct
# foreground color (entries with more than 16 fields would raise KeyError —
# presumably result dicts never get that wide; TODO confirm).
COLORS = {
    0: "black",
    1: "red",
    2: "green",
    3: "yellow",
    4: "blue",
    5: "magenta",
    6: "cyan",
    7: "bright_black",
    8: "bright_red",
    9: "bright_green",
    10: "bright_yellow",
    11: "bright_blue",
    12: "bright_magenta",
    13: "bright_cyan",
    14: "white",
    15: "bright_white",
}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _save_json(jsonfile, data):
|
|
38
|
+
with open(jsonfile, "w", encoding="utf-8") as file:
|
|
39
|
+
json.dump(data, file, ensure_ascii=False, indent=4)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _save_csv(csvfile, data):
|
|
43
|
+
with open(csvfile, "w", newline="", encoding="utf-8") as file:
|
|
44
|
+
if data:
|
|
45
|
+
headers = data[0].keys()
|
|
46
|
+
writer = csv.DictWriter(file, fieldnames=headers, quoting=csv.QUOTE_MINIMAL)
|
|
47
|
+
writer.writeheader()
|
|
48
|
+
writer.writerows(data)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _print_data(data):
    """Pretty-print search results to the terminal, colorizing each field and
    pausing for Enter after every result entry."""
    if not data:
        return
    # These fields hold long URLs/blobs, so they get a much wider wrap width.
    wide_fields = ("content", "href", "image", "source", "thumbnail", "url")
    for position, entry in enumerate(data, start=1):
        click.secho(f"{position}.\t {'=' * 78}", bg="black", fg="white")
        for field_idx, (key, value) in enumerate(entry.items(), start=1):
            if value:
                width = 300 if key in wide_fields else 78
                if key == "detected_language":
                    key = "language"
                text = click.wrap_text(
                    f"{value}", width=width, initial_indent="", subsequent_indent=" " * 12, preserve_paragraphs=True
                )
            else:
                text = value
            click.secho(f"{key:<12}{text}", bg="black", fg=COLORS[field_idx], overline=True)
        input()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _sanitize_keywords(keywords):
|
|
69
|
+
keywords = (
|
|
70
|
+
keywords.replace("filetype", "")
|
|
71
|
+
.replace(":", "")
|
|
72
|
+
.replace('"', "'")
|
|
73
|
+
.replace("site", "")
|
|
74
|
+
.replace(" ", "_")
|
|
75
|
+
.replace("/", "_")
|
|
76
|
+
.replace("\\", "_")
|
|
77
|
+
.replace(" ", "")
|
|
78
|
+
)
|
|
79
|
+
return keywords
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _download_file(url, dir_path, filename, proxy):
    """Fetch *url* and write the response body to dir_path/filename
    (filename truncated to 200 chars). Best-effort: any failure is logged
    at DEBUG level and otherwise swallowed."""
    try:
        response = requests.get(url, proxies=proxy, impersonate="chrome", timeout=10)
        response.raise_for_status()
        target = os.path.join(dir_path, filename[:200])
        with open(target, "wb") as fp:
            fp.write(response.content)
    except Exception as ex:
        logger.debug(f"download_file url={url} {type(ex).__name__} {ex}")
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _download_results(keywords, results, images=False, proxy=None, threads=None):
    """Download every result's target (image URL or page href) into a fresh,
    timestamped folder, showing a progress bar.

    Args:
        keywords: sanitized keywords, used in the destination folder name.
        results: iterable of result dicts holding "image" or "href" keys.
        images: when True, fetch the "image" field instead of "href".
        proxy: optional proxy URL, applied to both http and https.
        threads: worker-thread count (defaults to 10).
    """
    path_type = "images" if images else "text"
    path = f"{path_type}_{keywords}_{datetime.now():%Y%m%d_%H%M%S}"
    os.makedirs(path, exist_ok=True)
    proxy = {"http": proxy, "https": proxy}

    threads = 10 if threads is None else threads
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures = []
        for i, res in enumerate(results, start=1):
            url = res["image"] if images else res["href"]
            # Derive a readable filename from the URL's last path segment.
            filename = unquote(url.split("/")[-1].split("?")[0])
            # Fix: `filename` was computed but never used, so every download
            # was saved under a constant per-index name.
            f = executor.submit(_download_file, url, path, f"{i}_{filename}", proxy)
            futures.append(f)

        with click.progressbar(
            length=len(futures), label="Downloading", show_percent=True, show_pos=True, width=50
        ) as bar:
            for future in as_completed(futures):
                future.result()
                bar.update(1)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@click.group(chain=True)
def cli():
    """webscout command-line entry point — a chained click command group.

    `chain=True` lets several subcommands be invoked in a single call,
    e.g. `ddgs text -k foo suggestions -k bar`.
    """
    pass
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@cli.command()
def version():
    """Print and return the current webscout version string."""
    current = __version__
    print(current)
    return current
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@cli.command()
@click.option("-k", "--keywords", required=True, help="text search, keywords for query")
@click.option("-r", "--region", default="wt-wt", help="wt-wt, us-en, ru-ru, etc. -region https://duckduckgo.com/params")
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
@click.option("-t", "--timelimit", default=None, type=click.Choice(["d", "w", "m", "y"]), help="day, week, month, year")
@click.option("-m", "--max_results", default=20, help="maximum number of results, default=20")
@click.option("-o", "--output", default="print", help="csv, json (save the results to a csv or json file)")
@click.option("-d", "--download", is_flag=True, default=False, help="download results to 'keywords' folder")
@click.option("-b", "--backend", default="api", type=click.Choice(["api", "html", "lite"]), help="which backend to use")
@click.option("-th", "--threads", default=10, help="download threads, default=10")
@click.option("-p", "--proxy", default=None, help="the proxy to send requests, example: socks5://localhost:9150")
def text(keywords, region, safesearch, timelimit, backend, output, download, threads, max_results, proxy):
    """CLI function to perform a text search using DuckDuckGo API.

    Collects all results, then prints them, saves them to a timestamped
    csv/json file, and/or downloads each hit to a local folder.
    """
    data = []
    for r in DDGS(proxies=proxy).text(
        keywords=keywords,
        region=region,
        safesearch=safesearch,
        timelimit=timelimit,
        backend=backend,
        max_results=max_results,
    ):
        data.append(r)
    keywords = _sanitize_keywords(keywords)
    filename = f"text_{keywords}_{datetime.now():%Y%m%d_%H%M%S}"
    if output == "print" and not download:
        _print_data(data)
    elif output == "csv":
        # Fix: `filename` was computed but never used — output went to a
        # constant placeholder name, clobbering earlier saves.
        _save_csv(f"{filename}.csv", data)
    elif output == "json":
        _save_json(f"{filename}.json", data)
    if download:
        _download_results(keywords, data, proxy=proxy, threads=threads)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@cli.command()
@click.option("-k", "--keywords", required=True, help="answers search, keywords for query")
@click.option("-o", "--output", default="print", help="csv, json (save the results to a csv or json file)")
@click.option("-p", "--proxy", default=None, help="the proxy to send requests, example: socks5://localhost:9150")
def answers(keywords, output, proxy):
    """CLI function to perform an answers (instant-answer) search using
    DuckDuckGo API, printing or saving results to a timestamped file."""
    data = []
    for r in DDGS(proxies=proxy).answers(keywords=keywords):
        data.append(r)
    filename = f"answers_{_sanitize_keywords(keywords)}_{datetime.now():%Y%m%d_%H%M%S}"
    if output == "print":
        _print_data(data)
    elif output == "csv":
        # Fix: `filename` was computed but never used — output went to a
        # constant placeholder name.
        _save_csv(f"{filename}.csv", data)
    elif output == "json":
        _save_json(f"{filename}.json", data)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
@cli.command()
@click.option("-k", "--keywords", required=True, help="keywords for query")
@click.option("-r", "--region", default="wt-wt", help="wt-wt, us-en, ru-ru, etc. -region https://duckduckgo.com/params")
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
@click.option("-t", "--timelimit", default=None, type=click.Choice(["Day", "Week", "Month", "Year"]))
@click.option("-size", "--size", default=None, type=click.Choice(["Small", "Medium", "Large", "Wallpaper"]))
@click.option(
    "-c",
    "--color",
    default=None,
    type=click.Choice(
        [
            "color",
            "Monochrome",
            "Red",
            "Orange",
            "Yellow",
            "Green",
            "Blue",
            "Purple",
            "Pink",
            "Brown",
            "Black",
            "Gray",
            "Teal",
            "White",
        ]
    ),
)
@click.option(
    "-type", "--type_image", default=None, type=click.Choice(["photo", "clipart", "gif", "transparent", "line"])
)
@click.option("-l", "--layout", default=None, type=click.Choice(["Square", "Tall", "Wide"]))
@click.option(
    "-lic",
    "--license_image",
    default=None,
    type=click.Choice(["any", "Public", "Share", "Modify", "ModifyCommercially"]),
)
@click.option("-m", "--max_results", default=90, help="maximum number of results, default=90")
@click.option("-o", "--output", default="print", help="csv, json (save the results to a csv or json file)")
@click.option("-d", "--download", is_flag=True, default=False, help="download and save images to 'keywords' folder")
@click.option("-th", "--threads", default=10, help="download threads, default=10")
@click.option("-p", "--proxy", default=None, help="the proxy to send requests, example: socks5://localhost:9150")
def images(
    keywords,
    region,
    safesearch,
    timelimit,
    size,
    color,
    type_image,
    layout,
    license_image,
    download,
    threads,
    max_results,
    output,
    proxy,
):
    """CLI function to perform an images search using DuckDuckGo API,
    printing, saving, and/or downloading the resulting image files."""
    data = []
    for r in DDGS(proxies=proxy).images(
        keywords=keywords,
        region=region,
        safesearch=safesearch,
        timelimit=timelimit,
        size=size,
        color=color,
        type_image=type_image,
        layout=layout,
        license_image=license_image,
        max_results=max_results,
    ):
        data.append(r)
    keywords = _sanitize_keywords(keywords)
    # Fix: keywords is already sanitized above — sanitizing a second time was
    # redundant (and not idempotent for pathological inputs).
    filename = f"images_{keywords}_{datetime.now():%Y%m%d_%H%M%S}"
    if output == "print" and not download:
        _print_data(data)
    elif output == "csv":
        # Fix: `filename` was computed but never used — output went to a
        # constant placeholder name.
        _save_csv(f"{filename}.csv", data)
    elif output == "json":
        _save_json(f"{filename}.json", data)
    if download:
        _download_results(keywords, data, images=True, proxy=proxy, threads=threads)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
@cli.command()
@click.option("-k", "--keywords", required=True, help="keywords for query")
@click.option("-r", "--region", default="wt-wt", help="wt-wt, us-en, ru-ru, etc. -region https://duckduckgo.com/params")
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
@click.option("-t", "--timelimit", default=None, type=click.Choice(["d", "w", "m"]), help="day, week, month")
@click.option("-res", "--resolution", default=None, type=click.Choice(["high", "standart"]))
@click.option("-d", "--duration", default=None, type=click.Choice(["short", "medium", "long"]))
@click.option("-lic", "--license_videos", default=None, type=click.Choice(["creativeCommon", "youtube"]))
@click.option("-m", "--max_results", default=50, help="maximum number of results, default=50")
@click.option("-o", "--output", default="print", help="csv, json (save the results to a csv or json file)")
@click.option("-p", "--proxy", default=None, help="the proxy to send requests, example: socks5://localhost:9150")
def videos(keywords, region, safesearch, timelimit, resolution, duration, license_videos, max_results, output, proxy):
    """CLI function to perform a videos search using DuckDuckGo API,
    printing or saving results to a timestamped csv/json file."""
    data = []
    for r in DDGS(proxies=proxy).videos(
        keywords=keywords,
        region=region,
        safesearch=safesearch,
        timelimit=timelimit,
        resolution=resolution,
        duration=duration,
        license_videos=license_videos,
        max_results=max_results,
    ):
        data.append(r)
    filename = f"videos_{_sanitize_keywords(keywords)}_{datetime.now():%Y%m%d_%H%M%S}"
    if output == "print":
        _print_data(data)
    elif output == "csv":
        # Fix: `filename` was computed but never used — output went to a
        # constant placeholder name.
        _save_csv(f"{filename}.csv", data)
    elif output == "json":
        _save_json(f"{filename}.json", data)
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
@cli.command()
@click.option("-k", "--keywords", required=True, help="keywords for query")
@click.option("-r", "--region", default="wt-wt", help="wt-wt, us-en, ru-ru, etc. -region https://duckduckgo.com/params")
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
@click.option("-t", "--timelimit", default=None, type=click.Choice(["d", "w", "m", "y"]), help="day, week, month, year")
@click.option("-m", "--max_results", default=25, help="maximum number of results, default=25")
@click.option("-o", "--output", default="print", help="csv, json (save the results to a csv or json file)")
@click.option("-p", "--proxy", default=None, help="the proxy to send requests, example: socks5://localhost:9150")
def news(keywords, region, safesearch, timelimit, max_results, output, proxy):
    """CLI function to perform a news search using DuckDuckGo API,
    printing or saving results to a timestamped csv/json file."""
    data = []
    for r in DDGS(proxies=proxy).news(
        keywords=keywords, region=region, safesearch=safesearch, timelimit=timelimit, max_results=max_results
    ):
        data.append(r)
    filename = f"news_{_sanitize_keywords(keywords)}_{datetime.now():%Y%m%d_%H%M%S}"
    if output == "print":
        _print_data(data)
    elif output == "csv":
        # Fix: `filename` was computed but never used — output went to a
        # constant placeholder name.
        _save_csv(f"{filename}.csv", data)
    elif output == "json":
        _save_json(f"{filename}.json", data)
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
@cli.command()
@click.option("-k", "--keywords", required=True, help="keywords for query")
@click.option("-p", "--place", default=None, help="simplified search - if set, the other parameters are not used")
@click.option("-s", "--street", default=None, help="house number/street")
@click.option("-c", "--city", default=None, help="city of search")
@click.option("-county", "--county", default=None, help="county of search")
@click.option("-state", "--state", default=None, help="state of search")
@click.option("-country", "--country", default=None, help="country of search")
@click.option("-post", "--postalcode", default=None, help="postalcode of search")
@click.option("-lat", "--latitude", default=None, help="""if lat and long are set, the other params are not used""")
@click.option("-lon", "--longitude", default=None, help="""if lat and long are set, the other params are not used""")
@click.option("-r", "--radius", default=0, help="expand the search square by the distance in kilometers")
@click.option("-m", "--max_results", default=50, help="number of results, default=50")
@click.option("-o", "--output", default="print", help="csv, json (save the results to a csv or json file)")
@click.option("-proxy", "--proxy", default=None, help="the proxy to send requests, example: socks5://localhost:9150")
def maps(
    keywords,
    place,
    street,
    city,
    county,
    state,
    country,
    postalcode,
    latitude,
    longitude,
    radius,
    max_results,
    output,
    proxy,
):
    """CLI function to perform a maps search using DuckDuckGo API,
    printing a progress count every 100 results and saving output to a
    timestamped csv/json file when requested."""
    data = []
    for i, r in enumerate(
        DDGS(proxies=proxy).maps(
            keywords=keywords,
            place=place,
            street=street,
            city=city,
            county=county,
            state=state,
            country=country,
            postalcode=postalcode,
            latitude=latitude,
            longitude=longitude,
            radius=radius,
            max_results=max_results,
        ),
        start=1,
    ):
        data.append(r)
        # Lightweight progress indicator — maps searches can return many rows.
        if i % 100 == 0:
            print(i)
    filename = f"maps_{_sanitize_keywords(keywords)}_{datetime.now():%Y%m%d_%H%M%S}"
    if output == "print":
        _print_data(data)
    elif output == "csv":
        # Fix: `filename` was computed but never used — output went to a
        # constant placeholder name.
        _save_csv(f"{filename}.csv", data)
    elif output == "json":
        _save_json(f"{filename}.json", data)
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
@cli.command()
@click.option("-k", "--keywords", required=True, help="text for translation")
@click.option("-f", "--from_", help="What language to translate from (defaults automatically)")
@click.option("-t", "--to", default="en", help="de, ru, fr, etc. What language to translate, defaults='en'")
@click.option("-o", "--output", default="print", help="csv, json (save the results to a csv or json file)")
@click.option("-p", "--proxy", default=None, help="the proxy to send requests, example: socks5://localhost:9150")
def translate(keywords, from_, to, output, proxy):
    """CLI function to perform translate using DuckDuckGo API, printing or
    saving the single translation result to a timestamped file."""
    data = DDGS(proxies=proxy).translate(keywords=keywords, from_=from_, to=to)
    # Wrap the single result dict in a list so the shared print/save helpers work.
    data = [data]
    filename = f"translate_{_sanitize_keywords(keywords)}_{datetime.now():%Y%m%d_%H%M%S}"
    if output == "print":
        _print_data(data)
    elif output == "csv":
        # Fix: `filename` was computed but never used — output went to a
        # constant placeholder name.
        _save_csv(f"{filename}.csv", data)
    elif output == "json":
        _save_json(f"{filename}.json", data)
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
@cli.command()
@click.option("-k", "--keywords", required=True, help="keywords for query")
@click.option("-r", "--region", default="wt-wt", help="wt-wt, us-en, ru-ru, etc. -region https://duckduckgo.com/params")
@click.option("-o", "--output", default="print", help="csv, json (save the results to a csv or json file)")
@click.option("-p", "--proxy", default=None, help="the proxy to send requests, example: socks5://localhost:9150")
def suggestions(keywords, region, output, proxy):
    """CLI function to perform a suggestions search using DuckDuckGo API,
    printing or saving results to a timestamped csv/json file."""
    data = []
    for r in DDGS(proxies=proxy).suggestions(keywords=keywords, region=region):
        data.append(r)
    filename = f"suggestions_{_sanitize_keywords(keywords)}_{datetime.now():%Y%m%d_%H%M%S}"
    if output == "print":
        _print_data(data)
    elif output == "csv":
        # Fix: `filename` was computed but never used — output went to a
        # constant placeholder name.
        _save_csv(f"{filename}.csv", data)
    elif output == "json":
        _save_json(f"{filename}.json", data)
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
if __name__ == "__main__":
    # Allow running this module directly; "ddgs" is the program name shown in --help.
    cli(prog_name="ddgs")
|
webscout/exceptions.py
ADDED
webscout/models.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Dict, Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class MapsResult:
    """Represents a result from the maps search."""

    # Every field is optional; the maps backend fills in whatever it returns.
    title: Optional[str] = None  # place name
    address: Optional[str] = None  # street address
    country_code: Optional[str] = None
    latitude: Optional[str] = None  # kept as string, not float — TODO confirm callers expect str
    longitude: Optional[str] = None
    url: Optional[str] = None  # place website
    desc: Optional[str] = None  # free-text description
    phone: Optional[str] = None
    image: Optional[str] = None  # image URL
    source: Optional[str] = None  # originating data source URL
    hours: Optional[Dict[str, str]] = None  # presumably day -> opening hours; verify against backend
    category: Optional[str] = None
    facebook: Optional[str] = None  # social profile links
    instagram: Optional[str] = None
    twitter: Optional[str] = None
|
webscout/utils.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
from html import unescape
|
|
4
|
+
from typing import Optional
|
|
5
|
+
from urllib.parse import unquote
|
|
6
|
+
|
|
7
|
+
from .exceptions import DuckDuckGoSearchException
|
|
8
|
+
|
|
9
|
+
# Matches URL segments like '506-00.js' (used by _is_500_in_url to spot
# server-error assets inside a result URL).
REGEX_500_IN_URL = re.compile(r"(?:\d{3}-\d{2}\.js)")
# Naive tag stripper: removes anything between '<' and '>' (non-greedy).
REGEX_STRIP_TAGS = re.compile("<.*?>")
# Extracts the vqd token from raw response bytes: vqd=..., vqd='...' or vqd="...".
REGEX_VQD = re.compile(rb"""vqd=['"]?([^&"']+)""")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _extract_vqd(html_bytes: bytes, keywords: str) -> Optional[str]:
    """Extract the DuckDuckGo vqd token from a raw HTML response.

    Raises DuckDuckGoSearchException when no token can be extracted.
    """
    token = None
    try:
        found = REGEX_VQD.search(html_bytes)
        token = found.group(1).decode() if found else None
    except Exception:
        token = None
    if token is not None:
        return token
    raise DuckDuckGoSearchException(f"_extract_vqd() {keywords=} Could not extract vqd.")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _text_extract_json(html_bytes: bytes, keywords: str) -> Optional[str]:
|
|
26
|
+
"""text(backend="api") -> extract json from html."""
|
|
27
|
+
try:
|
|
28
|
+
start = html_bytes.index(b"DDG.pageLayout.load('d',") + 24
|
|
29
|
+
end = html_bytes.index(b");DDG.duckbar.load(", start)
|
|
30
|
+
data = html_bytes[start:end]
|
|
31
|
+
return json.loads(data)
|
|
32
|
+
except Exception as ex:
|
|
33
|
+
raise DuckDuckGoSearchException(f"_text_extract_json() {keywords=} {type(ex).__name__}: {ex}") from ex
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _is_500_in_url(url: str) -> bool:
    """Return True when the url contains a server-error asset name
    like '506-00.js'."""
    return REGEX_500_IN_URL.search(url) is not None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _normalize(raw_html: str) -> str:
    """Strip HTML tags from raw_html and unescape HTML entities;
    falsy input yields ''."""
    if not raw_html:
        return ""
    return unescape(REGEX_STRIP_TAGS.sub("", raw_html))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _normalize_url(url: str) -> str:
|
|
47
|
+
"""Unquote URL and replace spaces with '+'."""
|
|
48
|
+
return unquote(url.replace(" ", "+")) if url else ""
|
webscout/version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.0.0" # Replace with your actual version number
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import warnings
|
|
4
|
+
from typing import Dict, Generator, Optional
|
|
5
|
+
|
|
6
|
+
import nest_asyncio
|
|
7
|
+
|
|
8
|
+
from .webscout_search_async import AsyncDDGS
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger("duckduckgo_search.DDGS")
|
|
11
|
+
nest_asyncio.apply()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DDGS(AsyncDDGS):
    """Synchronous facade over AsyncDDGS.

    Each search method obtains the async generator (or coroutine) from the
    parent class and drives it to completion on the captured event loop via
    run_until_complete (made re-entrant by nest_asyncio.apply() above).
    """

    def __init__(self, headers=None, proxies=None, timeout=10):
        # Warn when constructed inside a running loop: run_until_complete on a
        # running loop only works thanks to nest_asyncio and may misbehave.
        if asyncio.get_event_loop().is_running():
            warnings.warn("DDGS running in an async loop. This may cause errors. Use AsyncDDGS instead.", stacklevel=2)
        super().__init__(headers, proxies, timeout)
        # Capture the loop once; all sync wrappers below run on it.
        self._loop = asyncio.get_event_loop()

    def __enter__(self) -> "DDGS":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # NOTE(review): this only schedules the async cleanup as a task — it is
        # not awaited, so cleanup may not have completed when __exit__ returns.
        self._loop.create_task(self.__aexit__(exc_type, exc_val, exc_tb))

    def _iter_over_async(self, async_gen):
        """Iterate over an async generator."""
        # Pull items one at a time by driving __anext__ to completion;
        # StopAsyncIteration marks the end of the underlying generator.
        while True:
            try:
                yield self._loop.run_until_complete(async_gen.__anext__())
            except StopAsyncIteration:
                break

    def text(self, *args, **kwargs) -> Generator[Dict[str, Optional[str]], None, None]:
        """Synchronous version of AsyncDDGS.text (same arguments)."""
        async_gen = super().text(*args, **kwargs)
        return self._iter_over_async(async_gen)

    def images(self, *args, **kwargs) -> Generator[Dict[str, Optional[str]], None, None]:
        """Synchronous version of AsyncDDGS.images (same arguments)."""
        async_gen = super().images(*args, **kwargs)
        return self._iter_over_async(async_gen)

    def videos(self, *args, **kwargs) -> Generator[Dict[str, Optional[str]], None, None]:
        """Synchronous version of AsyncDDGS.videos (same arguments)."""
        async_gen = super().videos(*args, **kwargs)
        return self._iter_over_async(async_gen)

    def news(self, *args, **kwargs) -> Generator[Dict[str, Optional[str]], None, None]:
        """Synchronous version of AsyncDDGS.news (same arguments)."""
        async_gen = super().news(*args, **kwargs)
        return self._iter_over_async(async_gen)

    def answers(self, *args, **kwargs) -> Generator[Dict[str, Optional[str]], None, None]:
        """Synchronous version of AsyncDDGS.answers (same arguments)."""
        async_gen = super().answers(*args, **kwargs)
        return self._iter_over_async(async_gen)

    def suggestions(self, *args, **kwargs) -> Generator[Dict[str, Optional[str]], None, None]:
        """Synchronous version of AsyncDDGS.suggestions (same arguments)."""
        async_gen = super().suggestions(*args, **kwargs)
        return self._iter_over_async(async_gen)

    def maps(self, *args, **kwargs) -> Generator[Dict[str, Optional[str]], None, None]:
        """Synchronous version of AsyncDDGS.maps (same arguments)."""
        async_gen = super().maps(*args, **kwargs)
        return self._iter_over_async(async_gen)

    def translate(self, *args, **kwargs) -> Optional[Dict[str, Optional[str]]]:
        """Synchronous version of AsyncDDGS.translate — a coroutine, not a
        generator, so it is awaited directly."""
        async_coro = super().translate(*args, **kwargs)
        return self._loop.run_until_complete(async_coro)
|