webscout 7.0__py3-none-any.whl → 7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic. Click here for more details.
- webscout/AIauto.py +191 -191
- webscout/AIbase.py +122 -122
- webscout/AIutel.py +440 -440
- webscout/Bard.py +343 -161
- webscout/DWEBS.py +489 -492
- webscout/Extra/YTToolkit/YTdownloader.py +995 -995
- webscout/Extra/YTToolkit/__init__.py +2 -2
- webscout/Extra/YTToolkit/transcriber.py +476 -479
- webscout/Extra/YTToolkit/ytapi/channel.py +307 -307
- webscout/Extra/YTToolkit/ytapi/playlist.py +58 -58
- webscout/Extra/YTToolkit/ytapi/pool.py +7 -7
- webscout/Extra/YTToolkit/ytapi/utils.py +62 -62
- webscout/Extra/YTToolkit/ytapi/video.py +103 -103
- webscout/Extra/autocoder/__init__.py +9 -9
- webscout/Extra/autocoder/autocoder_utiles.py +199 -199
- webscout/Extra/autocoder/rawdog.py +5 -7
- webscout/Extra/autollama.py +230 -230
- webscout/Extra/gguf.py +3 -3
- webscout/Extra/weather.py +171 -171
- webscout/LLM.py +442 -442
- webscout/Litlogger/__init__.py +67 -681
- webscout/Litlogger/core/__init__.py +6 -0
- webscout/Litlogger/core/level.py +20 -0
- webscout/Litlogger/core/logger.py +123 -0
- webscout/Litlogger/handlers/__init__.py +12 -0
- webscout/Litlogger/handlers/console.py +50 -0
- webscout/Litlogger/handlers/file.py +143 -0
- webscout/Litlogger/handlers/network.py +174 -0
- webscout/Litlogger/styles/__init__.py +7 -0
- webscout/Litlogger/styles/colors.py +231 -0
- webscout/Litlogger/styles/formats.py +377 -0
- webscout/Litlogger/styles/text.py +87 -0
- webscout/Litlogger/utils/__init__.py +6 -0
- webscout/Litlogger/utils/detectors.py +154 -0
- webscout/Litlogger/utils/formatters.py +200 -0
- webscout/Provider/AISEARCH/DeepFind.py +250 -250
- webscout/Provider/Blackboxai.py +136 -137
- webscout/Provider/ChatGPTGratis.py +226 -0
- webscout/Provider/Cloudflare.py +91 -78
- webscout/Provider/DeepSeek.py +218 -0
- webscout/Provider/Deepinfra.py +59 -35
- webscout/Provider/Free2GPT.py +131 -124
- webscout/Provider/Gemini.py +100 -115
- webscout/Provider/Glider.py +74 -59
- webscout/Provider/Groq.py +30 -18
- webscout/Provider/Jadve.py +108 -77
- webscout/Provider/Llama3.py +117 -94
- webscout/Provider/Marcus.py +191 -137
- webscout/Provider/Netwrck.py +62 -50
- webscout/Provider/PI.py +79 -124
- webscout/Provider/PizzaGPT.py +129 -83
- webscout/Provider/QwenLM.py +311 -0
- webscout/Provider/TTI/AiForce/__init__.py +22 -22
- webscout/Provider/TTI/AiForce/async_aiforce.py +257 -257
- webscout/Provider/TTI/AiForce/sync_aiforce.py +242 -242
- webscout/Provider/TTI/Nexra/__init__.py +22 -22
- webscout/Provider/TTI/Nexra/async_nexra.py +286 -286
- webscout/Provider/TTI/Nexra/sync_nexra.py +258 -258
- webscout/Provider/TTI/PollinationsAI/__init__.py +23 -23
- webscout/Provider/TTI/PollinationsAI/async_pollinations.py +330 -330
- webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +285 -285
- webscout/Provider/TTI/artbit/__init__.py +22 -22
- webscout/Provider/TTI/artbit/async_artbit.py +184 -184
- webscout/Provider/TTI/artbit/sync_artbit.py +176 -176
- webscout/Provider/TTI/blackbox/__init__.py +4 -4
- webscout/Provider/TTI/blackbox/async_blackbox.py +212 -212
- webscout/Provider/TTI/blackbox/sync_blackbox.py +199 -199
- webscout/Provider/TTI/deepinfra/__init__.py +4 -4
- webscout/Provider/TTI/deepinfra/async_deepinfra.py +227 -227
- webscout/Provider/TTI/deepinfra/sync_deepinfra.py +199 -199
- webscout/Provider/TTI/huggingface/__init__.py +22 -22
- webscout/Provider/TTI/huggingface/async_huggingface.py +199 -199
- webscout/Provider/TTI/huggingface/sync_huggingface.py +195 -195
- webscout/Provider/TTI/imgninza/__init__.py +4 -4
- webscout/Provider/TTI/imgninza/async_ninza.py +214 -214
- webscout/Provider/TTI/imgninza/sync_ninza.py +209 -209
- webscout/Provider/TTI/talkai/__init__.py +4 -4
- webscout/Provider/TTI/talkai/async_talkai.py +229 -229
- webscout/Provider/TTI/talkai/sync_talkai.py +207 -207
- webscout/Provider/TTS/deepgram.py +182 -182
- webscout/Provider/TTS/elevenlabs.py +136 -136
- webscout/Provider/TTS/gesserit.py +150 -150
- webscout/Provider/TTS/murfai.py +138 -138
- webscout/Provider/TTS/parler.py +133 -134
- webscout/Provider/TTS/streamElements.py +360 -360
- webscout/Provider/TTS/utils.py +280 -280
- webscout/Provider/TTS/voicepod.py +116 -116
- webscout/Provider/TextPollinationsAI.py +74 -47
- webscout/Provider/WiseCat.py +193 -0
- webscout/Provider/__init__.py +144 -136
- webscout/Provider/cerebras.py +242 -227
- webscout/Provider/chatglm.py +204 -204
- webscout/Provider/dgaf.py +67 -39
- webscout/Provider/gaurish.py +105 -66
- webscout/Provider/geminiapi.py +208 -208
- webscout/Provider/granite.py +223 -0
- webscout/Provider/hermes.py +218 -218
- webscout/Provider/llama3mitril.py +179 -179
- webscout/Provider/llamatutor.py +72 -62
- webscout/Provider/llmchat.py +60 -35
- webscout/Provider/meta.py +794 -794
- webscout/Provider/multichat.py +331 -230
- webscout/Provider/typegpt.py +359 -356
- webscout/Provider/yep.py +5 -5
- webscout/__main__.py +5 -5
- webscout/cli.py +319 -319
- webscout/conversation.py +241 -242
- webscout/exceptions.py +328 -328
- webscout/litagent/__init__.py +28 -28
- webscout/litagent/agent.py +2 -3
- webscout/litprinter/__init__.py +0 -58
- webscout/scout/__init__.py +8 -8
- webscout/scout/core.py +884 -884
- webscout/scout/element.py +459 -459
- webscout/scout/parsers/__init__.py +69 -69
- webscout/scout/parsers/html5lib_parser.py +172 -172
- webscout/scout/parsers/html_parser.py +236 -236
- webscout/scout/parsers/lxml_parser.py +178 -178
- webscout/scout/utils.py +38 -38
- webscout/swiftcli/__init__.py +811 -811
- webscout/update_checker.py +2 -12
- webscout/version.py +1 -1
- webscout/webscout_search.py +1142 -1140
- webscout/webscout_search_async.py +635 -635
- webscout/zeroart/__init__.py +54 -54
- webscout/zeroart/base.py +60 -60
- webscout/zeroart/effects.py +99 -99
- webscout/zeroart/fonts.py +816 -816
- {webscout-7.0.dist-info → webscout-7.2.dist-info}/METADATA +21 -28
- webscout-7.2.dist-info/RECORD +217 -0
- webstoken/__init__.py +30 -30
- webstoken/classifier.py +189 -189
- webstoken/keywords.py +216 -216
- webstoken/language.py +128 -128
- webstoken/ner.py +164 -164
- webstoken/normalizer.py +35 -35
- webstoken/processor.py +77 -77
- webstoken/sentiment.py +206 -206
- webstoken/stemmer.py +73 -73
- webstoken/tagger.py +60 -60
- webstoken/tokenizer.py +158 -158
- webscout/Provider/RUBIKSAI.py +0 -272
- webscout-7.0.dist-info/RECORD +0 -199
- {webscout-7.0.dist-info → webscout-7.2.dist-info}/LICENSE.md +0 -0
- {webscout-7.0.dist-info → webscout-7.2.dist-info}/WHEEL +0 -0
- {webscout-7.0.dist-info → webscout-7.2.dist-info}/entry_points.txt +0 -0
- {webscout-7.0.dist-info → webscout-7.2.dist-info}/top_level.txt +0 -0
webscout/DWEBS.py
CHANGED
|
@@ -1,493 +1,490 @@
|
|
|
1
|
-
import requests
|
|
2
|
-
from typing import Dict, List, Optional, Union, Any
|
|
3
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
4
|
-
from webscout.scout import Scout
|
|
5
|
-
from urllib.parse import quote, urljoin
|
|
6
|
-
from webscout.litagent import LitAgent
|
|
7
|
-
|
|
8
|
-
import time
|
|
9
|
-
import random
|
|
10
|
-
import json
|
|
11
|
-
import os
|
|
12
|
-
from datetime import datetime, timedelta
|
|
13
|
-
from functools import lru_cache
|
|
14
|
-
from webscout.Litlogger import
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
>>>
|
|
24
|
-
>>>
|
|
25
|
-
>>>
|
|
26
|
-
>>>
|
|
27
|
-
|
|
28
|
-
... print(f"
|
|
29
|
-
... print(f"
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
>>>
|
|
34
|
-
|
|
35
|
-
...
|
|
36
|
-
...
|
|
37
|
-
...
|
|
38
|
-
...
|
|
39
|
-
...
|
|
40
|
-
|
|
41
|
-
>>>
|
|
42
|
-
|
|
43
|
-
... '
|
|
44
|
-
... '
|
|
45
|
-
... '
|
|
46
|
-
... '
|
|
47
|
-
... '
|
|
48
|
-
...
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
>>>
|
|
53
|
-
|
|
54
|
-
...
|
|
55
|
-
...
|
|
56
|
-
...
|
|
57
|
-
...
|
|
58
|
-
...
|
|
59
|
-
|
|
60
|
-
>>>
|
|
61
|
-
|
|
62
|
-
... '
|
|
63
|
-
... '
|
|
64
|
-
... '
|
|
65
|
-
...
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
-
|
|
70
|
-
-
|
|
71
|
-
-
|
|
72
|
-
-
|
|
73
|
-
-
|
|
74
|
-
-
|
|
75
|
-
-
|
|
76
|
-
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
'
|
|
83
|
-
'
|
|
84
|
-
'
|
|
85
|
-
'
|
|
86
|
-
'
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
'
|
|
93
|
-
'
|
|
94
|
-
'
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
"
|
|
101
|
-
"
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
self.
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
self.
|
|
128
|
-
self.client
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
self.
|
|
133
|
-
self.
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
self.
|
|
138
|
-
self.
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
sleep_time
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
retry_count
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
retry_delay
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
retry_delay
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
if
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
- '
|
|
265
|
-
- '
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
- '
|
|
269
|
-
- '
|
|
270
|
-
|
|
271
|
-
- '
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
- '
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
...
|
|
288
|
-
...
|
|
289
|
-
|
|
290
|
-
...
|
|
291
|
-
... )
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
soup =
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
if
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
"
|
|
390
|
-
"
|
|
391
|
-
"
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
"
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
"
|
|
424
|
-
"
|
|
425
|
-
"
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
visible_text
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
return
|
|
479
|
-
|
|
480
|
-
def
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
searcher = GoogleS(rate_limit=3.0, use_litlogger=True)
|
|
491
|
-
results = searcher.search("HelpingAI-9B", max_results=5, extract_text=False, max_text_length=200)
|
|
492
|
-
for result in results:
|
|
1
|
+
import requests
|
|
2
|
+
from typing import Dict, List, Optional, Union, Any
|
|
3
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
4
|
+
from webscout.scout import Scout
|
|
5
|
+
from urllib.parse import quote, urljoin
|
|
6
|
+
from webscout.litagent import LitAgent
|
|
7
|
+
|
|
8
|
+
import time
|
|
9
|
+
import random
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
from datetime import datetime, timedelta
|
|
13
|
+
from functools import lru_cache
|
|
14
|
+
from webscout.Litlogger import Logger, LogFormat
|
|
15
|
+
class GoogleS:
    """Google search client with web, image, and suggestion support.

    Wraps Google's public search endpoints, parses responses with the Scout
    HTML parser, and layers on practical niceties: request rate limiting,
    on-disk result caching, retry with exponential backoff, optional visible
    text extraction, and concurrent page fetching via a thread pool.

    Basic usage::

        >>> from webscout.DWEBS import GoogleS
        >>> searcher = GoogleS()
        >>> for result in searcher.search("Python programming"):
        ...     print(result['title'], result['href'], result['abstract'])

    Web search results are dicts with keys ``title``, ``href``, ``abstract``,
    ``index``, ``type`` (always ``'web'``) and ``visible_text`` (filled only
    when ``extract_text=True``). Image results carry ``title``, ``thumbnail``,
    ``full_url`` and ``type`` (always ``'image'``).

    Advanced filters on :meth:`search` include site restriction, file type,
    time period, excluded terms, and exact-phrase matching; see the method
    docstring for details.
    """

    # Endpoint URLs keyed by search category.
    SEARCH_TYPES = {
        "web": "https://www.google.com/search",
        "image": "https://www.google.com/images",
        "news": "https://www.google.com/news",
    }
|
|
103
|
+
|
|
104
|
+
def __init__(
    self,
    headers: Optional[Dict[str, str]] = None,
    proxy: Optional[str] = None,
    timeout: Optional[int] = 10,
    max_workers: int = 20,
    cache_dir: Optional[str] = None,
    rate_limit: float = 2.0,
    use_litlogger: bool = False
):
    """Set up the HTTP session, thread pool, cache, and rate limiter.

    Args:
        headers: Custom request headers; when omitted (or empty) a random
            User-Agent is generated via LitAgent.
        proxy: Proxy URL applied to both http and https traffic.
        timeout: Per-request timeout in seconds.
        max_workers: Size of the thread pool used for concurrent requests.
        cache_dir: Directory to store search result cache.
        rate_limit: Minimum time between requests in seconds.
        use_litlogger: Whether to use LitLogger for logging (default: False).
    """
    self.proxy = proxy
    self.timeout = timeout
    self.rate_limit = rate_limit
    self.last_request_time = 0
    self.use_litlogger = use_litlogger

    self.headers = headers or {
        "User-Agent": LitAgent().random()  # Use LitAgent to generate user agent
    }
    self.headers["Referer"] = "https://www.google.com/"

    self.client = requests.Session()
    self.client.headers.update(self.headers)
    if proxy:
        self.client.proxies.update({"http": proxy, "https": proxy})

    self._executor = ThreadPoolExecutor(max_workers=max_workers)

    self.cache_dir = cache_dir
    if cache_dir and not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    # Optional pretty console logging via LitLogger.
    if self.use_litlogger:
        self.logger = Logger(
            name="GoogleS",
            format=LogFormat.MODERN_EMOJI,
        )
|
|
146
|
+
|
|
147
|
+
def _respect_rate_limit(self):
|
|
148
|
+
"""Ensure minimum time between requests"""
|
|
149
|
+
current_time = time.time()
|
|
150
|
+
time_since_last = current_time - self.last_request_time
|
|
151
|
+
if time_since_last < self.rate_limit:
|
|
152
|
+
sleep_time = self.rate_limit - time_since_last
|
|
153
|
+
if self.use_litlogger:
|
|
154
|
+
self.logger.debug(f"Rate limiting: Waiting {sleep_time:.2f} seconds")
|
|
155
|
+
time.sleep(sleep_time)
|
|
156
|
+
self.last_request_time = time.time()
|
|
157
|
+
|
|
158
|
+
def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
             data: Optional[Union[Dict[str, str], bytes]] = None, max_retries: int = 3) -> bytes:
    """
    Perform an HTTP request with rate limiting and exponential-backoff retries.

    Retries on request exceptions and on HTTP 429 (rate limited), waiting
    ``5 * 2**attempt`` seconds between attempts.

    Args:
        method (str): HTTP method (GET, POST, etc.)
        url (str): Target URL
        params (Optional[Dict[str, str]]): Query parameters
        data (Optional[Union[Dict[str, str], bytes]]): Request payload
        max_retries (int): Maximum number of retry attempts

    Returns:
        bytes: Response content

    Raises:
        requests.exceptions.RequestException: when the final attempt fails.
        Exception: when every attempt was consumed by 429 responses.
    """
    base_delay = 5  # Base delay in seconds

    for attempt in range(max_retries):
        backoff = base_delay * (2 ** attempt)  # exponential backoff
        try:
            self._respect_rate_limit()
            response = self.client.request(
                method=method,
                url=url,
                params=params,
                data=data,
                timeout=self.timeout
            )

            # Google signals rate limiting with 429; back off and retry.
            if response.status_code == 429:
                if self.use_litlogger:
                    self.logger.warning(f"Rate limited by Google. Waiting {backoff} seconds before retry...")
                time.sleep(backoff)
                continue

            response.raise_for_status()
            return response.content

        except requests.exceptions.RequestException as err:
            if attempt == max_retries - 1:
                if self.use_litlogger:
                    self.logger.error(f"Max retries reached. Last error: {str(err)}")
                raise
            if self.use_litlogger:
                self.logger.warning(f"Request failed. Retrying in {backoff} seconds... Error: {str(err)}")
            time.sleep(backoff)

    raise Exception("Max retries reached")
|
|
211
|
+
|
|
212
|
+
@lru_cache(maxsize=100)
|
|
213
|
+
def _cache_key(self, query: str, **kwargs) -> str:
|
|
214
|
+
"""Generate a cache key from search parameters"""
|
|
215
|
+
cache_data = {'query': query, **kwargs}
|
|
216
|
+
return json.dumps(cache_data, sort_keys=True)
|
|
217
|
+
|
|
218
|
+
def _get_cached_results(self, cache_key: str) -> Optional[List[Dict[str, Any]]]:
|
|
219
|
+
"""Retrieve cached results if they exist and are not expired"""
|
|
220
|
+
if not self.cache_dir:
|
|
221
|
+
return None
|
|
222
|
+
cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
|
|
223
|
+
if os.path.exists(cache_file):
|
|
224
|
+
with open(cache_file, 'r') as f:
|
|
225
|
+
cached_data = json.load(f)
|
|
226
|
+
if datetime.fromisoformat(cached_data['timestamp']) + timedelta(hours=24) > datetime.now():
|
|
227
|
+
if self.use_litlogger:
|
|
228
|
+
self.logger.info(f"Using cached results for: {cache_key}")
|
|
229
|
+
return cached_data['results']
|
|
230
|
+
if self.use_litlogger:
|
|
231
|
+
self.logger.debug(f"No valid cache found for: {cache_key}")
|
|
232
|
+
return None
|
|
233
|
+
|
|
234
|
+
def _cache_results(self, cache_key: str, results: List[Dict[str, Any]]):
|
|
235
|
+
"""Cache search results"""
|
|
236
|
+
if not self.cache_dir:
|
|
237
|
+
return
|
|
238
|
+
cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
|
|
239
|
+
with open(cache_file, 'w') as f:
|
|
240
|
+
json.dump({
|
|
241
|
+
'timestamp': datetime.now().isoformat(),
|
|
242
|
+
'results': results
|
|
243
|
+
}, f)
|
|
244
|
+
|
|
245
|
+
def search_images(
    self,
    query: str,
    max_results: int = 10,
    size: Optional[str] = None,
    color: Optional[str] = None,
    type_filter: Optional[str] = None,
    **kwargs
) -> List[Dict[str, str]]:
    """Search Google Images.

    Args:
        query (str): Search terms.
        max_results (int): Maximum number of results (default: 10).
        size (Optional[str]): Image size filter — 'large', 'medium', 'icon'.
        color (Optional[str]): Color filter — 'color', 'gray', 'transparent'.
        type_filter (Optional[str]): Image type — 'face', 'photo',
            'clipart', 'lineart'.
        **kwargs: Accepted for forward compatibility; currently unused.

    Returns:
        List[Dict[str, str]]: Results with keys 'thumbnail', 'title',
        'type' (always 'image'), and — when a source link is found —
        'full_url'.

    Example:
        >>> searcher = GoogleS()
        >>> images = searcher.search_images(
        ...     query="beautiful landscapes",
        ...     size="large",
        ...     color="color",
        ...     max_results=5
        ... )
        >>> for img in images:
        ...     print(img['title'], img.get('full_url'))
    """
    params = {
        "q": query,
        "tbm": "isch",
        "num": max_results
    }

    # BUG FIX: each filter used to overwrite params["tbs"], so only the
    # last of size/color/type_filter took effect. Google's `tbs` parameter
    # combines multiple filters with commas (e.g. "isz:large,ic:color").
    tbs_filters = []
    if size:
        tbs_filters.append(f"isz:{size}")
    if color:
        tbs_filters.append(f"ic:{color}")
    if type_filter:
        tbs_filters.append(f"itp:{type_filter}")
    if tbs_filters:
        params["tbs"] = ",".join(tbs_filters)

    content = self._get_url("GET", self.SEARCH_TYPES["image"], params=params)
    soup = Scout(content)  # Use Scout parser

    results = []
    for img in soup.find_all("img", class_="rg_i"):
        if len(results) >= max_results:
            break

        img_data = {
            "thumbnail": img.get("src", ""),
            "title": img.get("alt", ""),
            "type": "image"
        }

        # Extract full resolution image URL if available.
        parent = img.parent
        if parent and parent.get("href"):
            img_data["full_url"] = urljoin("https://www.google.com", parent["href"])

        results.append(img_data)

    return results
|
|
328
|
+
|
|
329
|
+
def search(
    self,
    query: str,
    region: str = "us-en",
    language: str = "en",
    safe: str = "off",
    time_period: Optional[str] = None,
    max_results: int = 10,
    extract_text: bool = False,
    max_text_length: Optional[int] = 100,
    site: Optional[str] = None,  # Search within specific site
    file_type: Optional[str] = None,  # Filter by file type
    sort_by: str = "relevance",  # relevance, date
    exclude_terms: Optional[List[str]] = None,  # Terms to exclude
    exact_phrase: Optional[str] = None,  # Exact phrase match
) -> List[Dict[str, Union[str, int]]]:
    """
    Perform a Google web search with advanced filters.

    Args:
        query: Search terms.
        region: Google `gl` region code (e.g. "us-en").
        language: Google `hl` interface language.
        safe: SafeSearch setting ("off"/"active").
        time_period: Google `qdr` filter (e.g. "d", "w", "month").
        max_results: Maximum number of results to return.
        extract_text: When True, fetch each result page and fill
            'visible_text'.
        max_text_length: Truncation limit for extracted text.
        site: Limit search to specific website.
        file_type: Filter by file type (pdf, doc, etc.).
        sort_by: Sort results by relevance or date (cache-key only here).
        exclude_terms: List of terms to exclude from search.
        exact_phrase: Exact phrase to match.

    Returns:
        List of result dicts with keys 'title', 'href', 'abstract',
        'index', 'type' ('web') and 'visible_text'.

    Fixes relative to the previous version:
        * Pages are now submitted once (``ceil(max_results / 10)`` of
          them) instead of a ``while len(results) < max_results`` loop
          that never grew ``results`` in its own body and re-iterated
          already-completed futures, producing duplicates.
        * Extracted page text is paired with its own result; the old
          ``enumerate(as_completed(...))`` matched completion order
          against list order and attached text to the wrong results.
    """
    if self.use_litlogger:
        self.logger.info(f"Starting search for: {query}")

    # Build the advanced query string from the optional filters.
    advanced_query = query
    if site:
        advanced_query += f" site:{site}"
    if file_type:
        advanced_query += f" filetype:{file_type}"
    if exclude_terms:
        advanced_query += " " + " ".join(f"-{term}" for term in exclude_terms)
    if exact_phrase:
        advanced_query = f'"{exact_phrase}"' + advanced_query

    if self.use_litlogger:
        self.logger.debug(f"Advanced query: {advanced_query}")

    # Check cache first.
    cache_key = self._cache_key(advanced_query, region=region, language=language,
                                safe=safe, time_period=time_period, sort_by=sort_by)
    cached_results = self._get_cached_results(cache_key)
    if cached_results:
        return cached_results[:max_results]

    results: List[Dict[str, Union[str, int]]] = []

    # Submit one request per 10-result page, up front.
    num_pages = (max_results + 9) // 10
    futures = []
    for page in range(num_pages):
        params = {
            "q": advanced_query,
            "num": 10,
            "hl": language,
            "start": page * 10,
            "safe": safe,
            "gl": region,
        }
        if time_period:
            params["tbs"] = f"qdr:{time_period}"
        futures.append(self._executor.submit(self._get_url, "GET", self.SEARCH_TYPES["web"], params=params))

    # Process pages in submission order so result indices follow page order.
    for future in futures:
        try:
            resp_content = future.result()
            soup = Scout(resp_content)  # Use Scout parser

            result_blocks = soup.find_all("div", class_="g")
            if not result_blocks:
                break

            for result_block in result_blocks:
                link = result_block.find("a", href=True)
                title = result_block.find("h3")
                description_box = result_block.find(
                    "div", {"style": "-webkit-line-clamp:2"}
                )

                if link and title and description_box:
                    results.append({
                        "title": title.text,
                        "href": link["href"],
                        "abstract": description_box.text,
                        "index": len(results),
                        "type": "web",
                        "visible_text": ""  # Filled below when extract_text is set
                    })

                if len(results) >= max_results:
                    break
        except Exception as e:
            print(f"Error: {e}")
        if len(results) >= max_results:
            break

    # Optionally fetch each result page and extract its visible text.
    if extract_text:
        with ThreadPoolExecutor(max_workers=self._executor._max_workers) as text_extractor:
            # Keep (result, future) pairs so each extraction lands on the
            # result it was launched for.
            pairs = [
                (result,
                 text_extractor.submit(self._extract_text_from_webpage,
                                       self._get_url("GET", result['href']),
                                       max_characters=max_text_length))
                for result in results
                if 'href' in result
            ]
            for result, text_future in pairs:
                try:
                    result['visible_text'] = text_future.result()
                except Exception as e:
                    print(f"Error extracting text: {e}")

    # Cache results before returning.
    self._cache_results(cache_key, results)
    return results
|
|
453
|
+
|
|
454
|
+
def get_search_suggestions(self, query: str) -> List[str]:
|
|
455
|
+
"""Get search suggestions for a query"""
|
|
456
|
+
params = {
|
|
457
|
+
"client": "chrome",
|
|
458
|
+
"q": query
|
|
459
|
+
}
|
|
460
|
+
content = self._get_url("GET", "https://suggestqueries.google.com/complete/search",
|
|
461
|
+
params=params)
|
|
462
|
+
suggestions = json.loads(content.decode('utf-8'))[1]
|
|
463
|
+
return suggestions
|
|
464
|
+
|
|
465
|
+
def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
    """
    Extract visible text from raw HTML using the Scout parser.

    Removes non-content tags (scripts, styles, page chrome) before
    collecting text; optionally truncates to *max_characters*.
    """
    soup = Scout(html_content)  # Use Scout parser
    # Drop tags whose text is never user-visible page content.
    for tag in soup(["script", "style", "header", "footer", "nav"]):
        tag.extract()
    visible_text = soup.get_text(strip=True)
    return visible_text[:max_characters] if max_characters else visible_text
|
|
476
|
+
|
|
477
|
+
def __enter__(self):
|
|
478
|
+
return self
|
|
479
|
+
|
|
480
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
481
|
+
self.client.close()
|
|
482
|
+
self._executor.shutdown()
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
if __name__ == "__main__":
    # Smoke test: run a small search and pretty-print each hit with rich.
    from rich import print

    searcher = GoogleS(rate_limit=3.0, use_litlogger=True)
    hits = searcher.search(
        "HelpingAI-9B",
        max_results=5,
        extract_text=False,
        max_text_length=200,
    )
    for hit in hits:
        print(hit)