webscout 4.8__py3-none-any.whl → 5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic. Click here for more details.
- webscout/Agents/functioncall.py +58 -102
- webscout/Bing_search.py +124 -0
- webscout/DWEBS.py +141 -777
- webscout/Provider/Cloudflare.py +286 -0
- webscout/Provider/DiscordRocks.py +5 -4
- webscout/Provider/Farfalle.py +3 -3
- webscout/Provider/Llama3.py +3 -3
- webscout/Provider/PI.py +208 -0
- webscout/Provider/Youchat.py +247 -0
- webscout/Provider/__init__.py +16 -2
- webscout/Provider/felo_search.py +238 -0
- webscout/Provider/julius.py +263 -0
- webscout/Provider/turboseek.py +237 -0
- webscout/Provider/xdash.py +202 -0
- webscout/Provider/yep.py +258 -0
- webscout/__init__.py +1 -59
- {webscout-4.8.dist-info → webscout-5.0.dist-info}/METADATA +25 -74
- {webscout-4.8.dist-info → webscout-5.0.dist-info}/RECORD +22 -14
- webscout/GoogleS.py +0 -342
- {webscout-4.8.dist-info → webscout-5.0.dist-info}/LICENSE.md +0 -0
- {webscout-4.8.dist-info → webscout-5.0.dist-info}/WHEEL +0 -0
- {webscout-4.8.dist-info → webscout-5.0.dist-info}/entry_points.txt +0 -0
- {webscout-4.8.dist-info → webscout-5.0.dist-info}/top_level.txt +0 -0
|
@@ -2,11 +2,11 @@ webscout/AIauto.py,sha256=gC01wLPpnqONf9DwKqkmbC_gIWo5Lh5V8YPu4OmYnhE,19923
|
|
|
2
2
|
webscout/AIbase.py,sha256=GoHbN8r0gq2saYRZv6LA-Fr9Jlcjv80STKFXUq2ZeGU,4710
|
|
3
3
|
webscout/AIutel.py,sha256=e1RbQHMMPL_sB_P_lNk8DKWDNiTGteMiCK-_uUKagbw,34248
|
|
4
4
|
webscout/Bard.py,sha256=8XkV_j0gJ-krYYR7bd5UORWMk7VlyTd3z66RPYBtdxg,13134
|
|
5
|
-
webscout/
|
|
6
|
-
webscout/
|
|
5
|
+
webscout/Bing_search.py,sha256=cjlmQtD1OrumLiWUdv1UN7X-VE__4-HxfD_HdvnuL9g,4647
|
|
6
|
+
webscout/DWEBS.py,sha256=9Jtq6weBAYfAy0bMenPn1fdJyzCPHyptc6hGywrB2Ro,6203
|
|
7
7
|
webscout/LLM.py,sha256=LbGCZdJf8A5dwfoGS4tyy39tAh5BDdhMZP0ScKaaQfU,4184
|
|
8
8
|
webscout/YTdownloader.py,sha256=uWpUWnw9pxeEGw9KJ_3XDyQ5gd38gH1dJpr-HJo4vzU,39144
|
|
9
|
-
webscout/__init__.py,sha256=
|
|
9
|
+
webscout/__init__.py,sha256=6nDNd2K22SOVYGpMBoDZnpdXHV624oAi0HLco7iKtwA,1254
|
|
10
10
|
webscout/__main__.py,sha256=ZtTRgsRjUi2JOvYFLF1ZCh55Sdoz94I-BS-TlJC7WDU,126
|
|
11
11
|
webscout/async_providers.py,sha256=MRj0klEhBYVQXnzZGG_15d0e-TPA0nOc2nn735H-wR4,622
|
|
12
12
|
webscout/cli.py,sha256=RlBKeS9CSIsiBMqlzxevWtKjbY9htkZvA7J0bM_hHE8,14999
|
|
@@ -24,7 +24,7 @@ webscout/webscout_search_async.py,sha256=dooKGwLm0cwTml55Vy6NHPPY-nymEqX2h8laX94
|
|
|
24
24
|
webscout/websx_search.py,sha256=5hfkkmGFhyQzojUpvMzIOJ3DBZIBNS90UReaacsfu6s,521
|
|
25
25
|
webscout/Agents/Onlinesearcher.py,sha256=GzF2JcMfj07d74mxQEoaxwtxahgLHl3b_ugTbXjOwq4,7113
|
|
26
26
|
webscout/Agents/__init__.py,sha256=VbGyW5pulh3LRqbVTv54n5TwWsrTqOANRioG18xtdJ0,58
|
|
27
|
-
webscout/Agents/functioncall.py,sha256=
|
|
27
|
+
webscout/Agents/functioncall.py,sha256=qH1Tofi4h5CK5RhXaLQhXu8swEUmcyK9R5xpS6jMLrs,5784
|
|
28
28
|
webscout/Extra/__init__.py,sha256=GG1qUwS-HspT4TeeAIT4qFpM8PaO1ZdQhpelctaM7Rs,99
|
|
29
29
|
webscout/Extra/autollama.py,sha256=8lyodIWAgJABzlMMHytlolPCgvUKh8ynkZD6MMEltXs,5970
|
|
30
30
|
webscout/Extra/gguf.py,sha256=RvSp7xuaD6epAA9iAzthUnAQ3HA5N-svMyKUadAVnw8,7009
|
|
@@ -42,19 +42,21 @@ webscout/Provider/Andi.py,sha256=uBME1v8lZbvpPHq5e_IOiOUC766uGTWMfNx9lWACssU,106
|
|
|
42
42
|
webscout/Provider/BasedGPT.py,sha256=pQd6_eDgdjv5_J0HZGugZ5ghqPLv2Hs18szq1G0bIh8,8905
|
|
43
43
|
webscout/Provider/Berlin4h.py,sha256=-mO-ljQUV6pCnm-nKEeV7sePDn7wiGO_WG9XgVh2z10,8774
|
|
44
44
|
webscout/Provider/Blackboxai.py,sha256=OXq8rF0EDHkTK65HVXPXLrJ9sp950h4c56sc-YxbsjU,17378
|
|
45
|
+
webscout/Provider/Cloudflare.py,sha256=yIjsHsIvJjnZebTLhJBH3yfg-zJ2dKJLsuhkpJlpGaM,11530
|
|
45
46
|
webscout/Provider/Cohere.py,sha256=OZ7-0iaJ8L5e4Sy-L2UGm8SnBmS7CbaFIj6a08bABVw,8941
|
|
46
47
|
webscout/Provider/DARKAI.py,sha256=JpfFcPfd2kp15KSJ7GJ5Zy4zrwYQ_zHpqdFD2904Ii0,9065
|
|
47
48
|
webscout/Provider/Deepinfra.py,sha256=tdST5aQjaCs9_B5mrnrXmihDei73MjB-F8cpES-noc4,18756
|
|
48
49
|
webscout/Provider/Deepseek.py,sha256=jp8cZhbmscDjlXLCGI8MhDGORkbbxyeUlCqu5Z5GGSI,9210
|
|
49
|
-
webscout/Provider/DiscordRocks.py,sha256=
|
|
50
|
-
webscout/Provider/Farfalle.py,sha256=
|
|
50
|
+
webscout/Provider/DiscordRocks.py,sha256=AgpAofgHY8MMKYhuqhtwLM8qGiYatStc2Aa1XX-3PPU,15028
|
|
51
|
+
webscout/Provider/Farfalle.py,sha256=zl2AD5NomuHCkW21tDfI1Z-KIlhiuQ32eiNM-1B4KWQ,9010
|
|
51
52
|
webscout/Provider/Gemini.py,sha256=V79nIi5vhPfvjlGYg5XuH6RfY7AyNnBqnJM-OBK99hE,8453
|
|
52
53
|
webscout/Provider/Groq.py,sha256=h_dPKwqXRwmgvmEmkDYKdXwrlI4Zm2vZuCnSMItoa2w,28662
|
|
53
54
|
webscout/Provider/Koboldai.py,sha256=KwWx2yPlvT9BGx37iNvSbgzWkJ9I8kSOmeg7sL1hb0M,15806
|
|
54
55
|
webscout/Provider/Llama.py,sha256=pqjpB09MFufImzTav1PwTWsukSCr3QiB-yFGJIIBAu8,8293
|
|
55
|
-
webscout/Provider/Llama3.py,sha256=
|
|
56
|
+
webscout/Provider/Llama3.py,sha256=qO5R5mNznrobi7eKZR8opb_UekmO0_PUEOkPTnNw9nE,7583
|
|
56
57
|
webscout/Provider/OLLAMA.py,sha256=Modmkp_WiZaBYsv4-_5y7fHpqBJY20zWxyZZwtSfqVs,7117
|
|
57
58
|
webscout/Provider/Openai.py,sha256=SjfVOwY94unVnXhvN0Fkome-q2-wi4mPJk_vCGq5Fjc,20617
|
|
59
|
+
webscout/Provider/PI.py,sha256=IodVvGR_RIZpGJ0ypFF4U6NBMZAZ5O1BlRFMelot8so,8364
|
|
58
60
|
webscout/Provider/Perplexity.py,sha256=gUnXyVNbl6tWAqirwHEoPkjCzxpORcKxL77aoFKepBk,21485
|
|
59
61
|
webscout/Provider/Phind.py,sha256=_3y4CHn_uOsK6j2IP5n9RbnIAS6uTm5tI7IZccaDrMQ,39260
|
|
60
62
|
webscout/Provider/PizzaGPT.py,sha256=EiHSt0sK9kgmcIbBmkVtuniDvOrlhdi6zim5t_EZo30,7216
|
|
@@ -62,14 +64,20 @@ webscout/Provider/Poe.py,sha256=ObUxa-Fa2Dq7sJcV0hc65m09StS9uWsB2-bR2rSjXDY,7510
|
|
|
62
64
|
webscout/Provider/RUBIKSAI.py,sha256=HPY8klGBNVVkfAXb-RziNrEtJGItjiqbSyXKXTOIHW4,7954
|
|
63
65
|
webscout/Provider/Reka.py,sha256=F0ZXENkhARprj5biK3mRxwiuPH0BW3ga7EWsi8agbtE,8917
|
|
64
66
|
webscout/Provider/ThinkAnyAI.py,sha256=_qFjj0djxxrranyEY33w14oizyRjzlVwMv_hzvVtwNc,11616
|
|
65
|
-
webscout/Provider/
|
|
67
|
+
webscout/Provider/Youchat.py,sha256=p4rIodsNP3qxA78VpzZwUymSAs-uADQ_9CKuf_Nf9Ng,9582
|
|
68
|
+
webscout/Provider/__init__.py,sha256=7M8o6_tqIrjI6G2XF3PYToZq9kWcp9JLsyk1qkJzGiQ,2038
|
|
66
69
|
webscout/Provider/ai4chat.py,sha256=UB77kWH5vxSqSpz7PPgM4FH0aDpGOpwHJEv42Fa1W_U,7798
|
|
70
|
+
webscout/Provider/felo_search.py,sha256=mYi1xW9egUMZ47bJb0MOD9364VLYgGJsOW2NQUbe190,9314
|
|
71
|
+
webscout/Provider/julius.py,sha256=ffm-9oeHYwuQMMkSXu_3ly0Xqvj-0Dh7DlatebCl1ls,10331
|
|
67
72
|
webscout/Provider/koala.py,sha256=x5OoT7hM8V-camPNMevqddHvfmzjKvLER2tpCDB6X4o,10059
|
|
68
73
|
webscout/Provider/liaobots.py,sha256=s2VxS4epBLVxoLCyQR0bdxiRm9Q6ZYUf019TC3xQCtM,10362
|
|
69
74
|
webscout/Provider/meta.py,sha256=3iBylmAk9d673Axvw6hFi0-0x_Fq7ZgtH_1j2_rcDwY,30715
|
|
70
|
-
webscout
|
|
71
|
-
webscout
|
|
72
|
-
webscout
|
|
73
|
-
webscout-
|
|
74
|
-
webscout-
|
|
75
|
-
webscout-
|
|
75
|
+
webscout/Provider/turboseek.py,sha256=BNx_urbs6Ixr7SEOgL4Uo1iZdjYC7CxoefJcsN4LK6I,9138
|
|
76
|
+
webscout/Provider/xdash.py,sha256=KUDTEX8I0z72bIDi-w5Se7xmB_lbmaX7KlCmIl2ad4c,7925
|
|
77
|
+
webscout/Provider/yep.py,sha256=RbEBzHeEFxgfdnHXHuBny6NKHcYYYNA6bvTggvAzoLk,10399
|
|
78
|
+
webscout-5.0.dist-info/LICENSE.md,sha256=9P0imsudI7MEvZe2pOcg8rKBn6E5FGHQ-riYozZI-Bk,2942
|
|
79
|
+
webscout-5.0.dist-info/METADATA,sha256=anlsD-HmXJT4_UV8LyrT5mxdnEKznprEDn2oPcf-Ucg,50819
|
|
80
|
+
webscout-5.0.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
|
|
81
|
+
webscout-5.0.dist-info/entry_points.txt,sha256=Hh4YIIjvkqB9SVxZ2ri4DZUkgEu_WF_5_r_nZDIvfG8,73
|
|
82
|
+
webscout-5.0.dist-info/top_level.txt,sha256=nYIw7OKBQDr_Z33IzZUKidRD3zQEo8jOJYkMVMeN334,9
|
|
83
|
+
webscout-5.0.dist-info/RECORD,,
|
webscout/GoogleS.py
DELETED
|
@@ -1,342 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import random
|
|
3
|
-
import sys
|
|
4
|
-
import time
|
|
5
|
-
import ssl
|
|
6
|
-
|
|
7
|
-
if sys.version_info[0] > 2:
|
|
8
|
-
from http.cookiejar import LWPCookieJar
|
|
9
|
-
from urllib.request import Request, urlopen
|
|
10
|
-
from urllib.parse import quote_plus, urlparse, parse_qs
|
|
11
|
-
else:
|
|
12
|
-
from cookielib import LWPCookieJar
|
|
13
|
-
from urllib import quote_plus
|
|
14
|
-
from urllib2 import Request, urlopen
|
|
15
|
-
from urlparse import urlparse, parse_qs
|
|
16
|
-
|
|
17
|
-
try:
|
|
18
|
-
from bs4 import BeautifulSoup
|
|
19
|
-
is_bs4 = True
|
|
20
|
-
except ImportError:
|
|
21
|
-
from BeautifulSoup import BeautifulSoup # type: ignore
|
|
22
|
-
is_bs4 = False
|
|
23
|
-
|
|
24
|
-
__all__ = [
|
|
25
|
-
|
|
26
|
-
# Main search function.
|
|
27
|
-
'search',
|
|
28
|
-
|
|
29
|
-
# Shortcut for "get lucky" search.
|
|
30
|
-
'lucky',
|
|
31
|
-
|
|
32
|
-
# Miscellaneous utility functions.
|
|
33
|
-
'get_random_user_agent', 'get_tbs',
|
|
34
|
-
]
|
|
35
|
-
|
|
36
|
-
# URL templates to make Google searches.
|
|
37
|
-
url_home = "https://www.google.%(tld)s/"
|
|
38
|
-
url_search = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
|
|
39
|
-
"btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
|
|
40
|
-
"cr=%(country)s"
|
|
41
|
-
url_next_page = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
|
|
42
|
-
"start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&" \
|
|
43
|
-
"cr=%(country)s"
|
|
44
|
-
url_search_num = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
|
|
45
|
-
"num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
|
|
46
|
-
"cr=%(country)s"
|
|
47
|
-
url_next_page_num = "https://www.google.%(tld)s/search?hl=%(lang)s&" \
|
|
48
|
-
"q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
|
|
49
|
-
"safe=%(safe)s&cr=%(country)s"
|
|
50
|
-
url_parameters = (
|
|
51
|
-
'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'cr')
|
|
52
|
-
|
|
53
|
-
# Cookie jar. Stored at the user's home folder.
|
|
54
|
-
# If the cookie jar is inaccessible, the errors are ignored.
|
|
55
|
-
home_folder = os.getenv('HOME')
|
|
56
|
-
if not home_folder:
|
|
57
|
-
home_folder = os.getenv('USERHOME')
|
|
58
|
-
if not home_folder:
|
|
59
|
-
home_folder = '.' # Use the current folder on error.
|
|
60
|
-
cookie_jar = LWPCookieJar(os.path.join(home_folder, '.google-cookie'))
|
|
61
|
-
try:
|
|
62
|
-
cookie_jar.load()
|
|
63
|
-
except Exception:
|
|
64
|
-
pass
|
|
65
|
-
|
|
66
|
-
# Default user agent, unless instructed by the user to change it.
|
|
67
|
-
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'
|
|
68
|
-
|
|
69
|
-
try:
|
|
70
|
-
install_folder = os.path.abspath(os.path.split(__file__)[0])
|
|
71
|
-
try:
|
|
72
|
-
user_agents_file = os.path.join(install_folder, 'user_agents.txt.gz')
|
|
73
|
-
import gzip
|
|
74
|
-
fp = gzip.open(user_agents_file, 'rb')
|
|
75
|
-
try:
|
|
76
|
-
user_agents_list = [_.strip() for _ in fp.readlines()]
|
|
77
|
-
finally:
|
|
78
|
-
fp.close()
|
|
79
|
-
del fp
|
|
80
|
-
except Exception:
|
|
81
|
-
user_agents_file = os.path.join(install_folder, 'user_agents.txt')
|
|
82
|
-
with open(user_agents_file) as fp:
|
|
83
|
-
user_agents_list = [_.strip() for _ in fp.readlines()]
|
|
84
|
-
except Exception:
|
|
85
|
-
user_agents_list = [USER_AGENT]
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
# Get a random user agent.
|
|
89
|
-
def get_random_user_agent():
|
|
90
|
-
"""
|
|
91
|
-
Get a random user agent string.
|
|
92
|
-
|
|
93
|
-
:rtype: str
|
|
94
|
-
:return: Random user agent string.
|
|
95
|
-
"""
|
|
96
|
-
return random.choice(user_agents_list)
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
# Helper function to format the tbs parameter.
|
|
100
|
-
def get_tbs(from_date, to_date):
|
|
101
|
-
"""
|
|
102
|
-
Helper function to format the tbs parameter.
|
|
103
|
-
|
|
104
|
-
:param datetime.date from_date: Python date object.
|
|
105
|
-
:param datetime.date to_date: Python date object.
|
|
106
|
-
|
|
107
|
-
:rtype: str
|
|
108
|
-
:return: Dates encoded in tbs format.
|
|
109
|
-
"""
|
|
110
|
-
from_date = from_date.strftime('%m/%d/%Y')
|
|
111
|
-
to_date = to_date.strftime('%m/%d/%Y')
|
|
112
|
-
return 'cdr:1,cd_min:%(from_date)s,cd_max:%(to_date)s' % vars()
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
# Request the given URL and return the response page, using the cookie jar.
|
|
116
|
-
# If the cookie jar is inaccessible, the errors are ignored.
|
|
117
|
-
def get_page(url, user_agent=None, verify_ssl=True):
|
|
118
|
-
"""
|
|
119
|
-
Request the given URL and return the response page, using the cookie jar.
|
|
120
|
-
|
|
121
|
-
:param str url: URL to retrieve.
|
|
122
|
-
:param str user_agent: User agent for the HTTP requests.
|
|
123
|
-
Use None for the default.
|
|
124
|
-
:param bool verify_ssl: Verify the SSL certificate to prevent
|
|
125
|
-
traffic interception attacks. Defaults to True.
|
|
126
|
-
|
|
127
|
-
:rtype: str
|
|
128
|
-
:return: Web page retrieved for the given URL.
|
|
129
|
-
|
|
130
|
-
:raises IOError: An exception is raised on error.
|
|
131
|
-
:raises urllib2.URLError: An exception is raised on error.
|
|
132
|
-
:raises urllib2.HTTPError: An exception is raised on error.
|
|
133
|
-
"""
|
|
134
|
-
if user_agent is None:
|
|
135
|
-
user_agent = USER_AGENT
|
|
136
|
-
request = Request(url)
|
|
137
|
-
request.add_header('User-Agent', user_agent)
|
|
138
|
-
cookie_jar.add_cookie_header(request)
|
|
139
|
-
if verify_ssl:
|
|
140
|
-
response = urlopen(request)
|
|
141
|
-
else:
|
|
142
|
-
context = ssl._create_unverified_context()
|
|
143
|
-
response = urlopen(request, context=context)
|
|
144
|
-
cookie_jar.extract_cookies(response, request)
|
|
145
|
-
html = response.read()
|
|
146
|
-
response.close()
|
|
147
|
-
try:
|
|
148
|
-
cookie_jar.save()
|
|
149
|
-
except Exception:
|
|
150
|
-
pass
|
|
151
|
-
return html
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
# Filter links found in the Google result pages HTML code.
|
|
155
|
-
# Returns None if the link doesn't yield a valid result.
|
|
156
|
-
def filter_result(link):
|
|
157
|
-
try:
|
|
158
|
-
|
|
159
|
-
# Decode hidden URLs.
|
|
160
|
-
if link.startswith('/url?'):
|
|
161
|
-
o = urlparse(link, 'http')
|
|
162
|
-
link = parse_qs(o.query)['q'][0]
|
|
163
|
-
|
|
164
|
-
# Valid results are absolute URLs not pointing to a Google domain,
|
|
165
|
-
# like images.google.com or googleusercontent.com for example.
|
|
166
|
-
# TODO this could be improved!
|
|
167
|
-
o = urlparse(link, 'http')
|
|
168
|
-
if o.netloc and 'google' not in o.netloc:
|
|
169
|
-
return link
|
|
170
|
-
|
|
171
|
-
# On error, return None.
|
|
172
|
-
except Exception:
|
|
173
|
-
pass
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
# Returns a generator that yields URLs.
|
|
177
|
-
def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
|
|
178
|
-
stop=None, pause=2.0, country='', extra_params=None,
|
|
179
|
-
user_agent=None, verify_ssl=True):
|
|
180
|
-
"""
|
|
181
|
-
Search the given query string using Google.
|
|
182
|
-
|
|
183
|
-
:param str query: Query string. Must NOT be url-encoded.
|
|
184
|
-
:param str tld: Top level domain.
|
|
185
|
-
:param str lang: Language.
|
|
186
|
-
:param str tbs: Time limits (i.e "qdr:h" => last hour,
|
|
187
|
-
"qdr:d" => last 24 hours, "qdr:m" => last month).
|
|
188
|
-
:param str safe: Safe search.
|
|
189
|
-
:param int num: Number of results per page.
|
|
190
|
-
:param int start: First result to retrieve.
|
|
191
|
-
:param int stop: Last result to retrieve.
|
|
192
|
-
Use None to keep searching forever.
|
|
193
|
-
:param float pause: Lapse to wait between HTTP requests.
|
|
194
|
-
A lapse too long will make the search slow, but a lapse too short may
|
|
195
|
-
cause Google to block your IP. Your mileage may vary!
|
|
196
|
-
:param str country: Country or region to focus the search on. Similar to
|
|
197
|
-
changing the TLD, but does not yield exactly the same results.
|
|
198
|
-
Only Google knows why...
|
|
199
|
-
:param dict extra_params: A dictionary of extra HTTP GET
|
|
200
|
-
parameters, which must be URL encoded. For example if you don't want
|
|
201
|
-
Google to filter similar results you can set the extra_params to
|
|
202
|
-
{'filter': '0'} which will append '&filter=0' to every query.
|
|
203
|
-
:param str user_agent: User agent for the HTTP requests.
|
|
204
|
-
Use None for the default.
|
|
205
|
-
:param bool verify_ssl: Verify the SSL certificate to prevent
|
|
206
|
-
traffic interception attacks. Defaults to True.
|
|
207
|
-
|
|
208
|
-
:rtype: generator of str
|
|
209
|
-
:return: Generator (iterator) that yields found URLs.
|
|
210
|
-
If the stop parameter is None the iterator will loop forever.
|
|
211
|
-
"""
|
|
212
|
-
# Set of hashes for the results found.
|
|
213
|
-
# This is used to avoid repeated results.
|
|
214
|
-
hashes = set()
|
|
215
|
-
|
|
216
|
-
# Count the number of links yielded.
|
|
217
|
-
count = 0
|
|
218
|
-
|
|
219
|
-
# Prepare the search string.
|
|
220
|
-
query = quote_plus(query)
|
|
221
|
-
|
|
222
|
-
# If no extra_params is given, create an empty dictionary.
|
|
223
|
-
# We should avoid using an empty dictionary as a default value
|
|
224
|
-
# in a function parameter in Python.
|
|
225
|
-
if not extra_params:
|
|
226
|
-
extra_params = {}
|
|
227
|
-
|
|
228
|
-
# Check extra_params for overlapping.
|
|
229
|
-
for builtin_param in url_parameters:
|
|
230
|
-
if builtin_param in extra_params.keys():
|
|
231
|
-
raise ValueError(
|
|
232
|
-
'GET parameter "%s" is overlapping with \
|
|
233
|
-
the built-in GET parameter',
|
|
234
|
-
builtin_param
|
|
235
|
-
)
|
|
236
|
-
|
|
237
|
-
# Grab the cookie from the home page.
|
|
238
|
-
get_page(url_home % vars(), user_agent, verify_ssl)
|
|
239
|
-
|
|
240
|
-
# Prepare the URL of the first request.
|
|
241
|
-
if start:
|
|
242
|
-
if num == 10:
|
|
243
|
-
url = url_next_page % vars()
|
|
244
|
-
else:
|
|
245
|
-
url = url_next_page_num % vars()
|
|
246
|
-
else:
|
|
247
|
-
if num == 10:
|
|
248
|
-
url = url_search % vars()
|
|
249
|
-
else:
|
|
250
|
-
url = url_search_num % vars()
|
|
251
|
-
|
|
252
|
-
# Loop until we reach the maximum result, if any (otherwise, loop forever).
|
|
253
|
-
while not stop or count < stop:
|
|
254
|
-
|
|
255
|
-
# Remeber last count to detect the end of results.
|
|
256
|
-
last_count = count
|
|
257
|
-
|
|
258
|
-
# Append extra GET parameters to the URL.
|
|
259
|
-
# This is done on every iteration because we're
|
|
260
|
-
# rebuilding the entire URL at the end of this loop.
|
|
261
|
-
for k, v in extra_params.items():
|
|
262
|
-
k = quote_plus(k)
|
|
263
|
-
v = quote_plus(v)
|
|
264
|
-
url = url + ('&%s=%s' % (k, v))
|
|
265
|
-
|
|
266
|
-
# Sleep between requests.
|
|
267
|
-
# Keeps Google from banning you for making too many requests.
|
|
268
|
-
time.sleep(pause)
|
|
269
|
-
|
|
270
|
-
# Request the Google Search results page.
|
|
271
|
-
html = get_page(url, user_agent, verify_ssl)
|
|
272
|
-
|
|
273
|
-
# Parse the response and get every anchored URL.
|
|
274
|
-
if is_bs4:
|
|
275
|
-
soup = BeautifulSoup(html, 'html.parser')
|
|
276
|
-
else:
|
|
277
|
-
soup = BeautifulSoup(html)
|
|
278
|
-
try:
|
|
279
|
-
anchors = soup.find(id='search').findAll('a')
|
|
280
|
-
# Sometimes (depending on the User-agent) there is
|
|
281
|
-
# no id "search" in html response...
|
|
282
|
-
except AttributeError:
|
|
283
|
-
# Remove links of the top bar.
|
|
284
|
-
gbar = soup.find(id='gbar')
|
|
285
|
-
if gbar:
|
|
286
|
-
gbar.clear()
|
|
287
|
-
anchors = soup.findAll('a')
|
|
288
|
-
|
|
289
|
-
# Process every anchored URL.
|
|
290
|
-
for a in anchors:
|
|
291
|
-
|
|
292
|
-
# Get the URL from the anchor tag.
|
|
293
|
-
try:
|
|
294
|
-
link = a['href']
|
|
295
|
-
except KeyError:
|
|
296
|
-
continue
|
|
297
|
-
|
|
298
|
-
# Filter invalid links and links pointing to Google itself.
|
|
299
|
-
link = filter_result(link)
|
|
300
|
-
if not link:
|
|
301
|
-
continue
|
|
302
|
-
|
|
303
|
-
# Discard repeated results.
|
|
304
|
-
h = hash(link)
|
|
305
|
-
if h in hashes:
|
|
306
|
-
continue
|
|
307
|
-
hashes.add(h)
|
|
308
|
-
|
|
309
|
-
# Yield the result.
|
|
310
|
-
yield link
|
|
311
|
-
|
|
312
|
-
# Increase the results counter.
|
|
313
|
-
# If we reached the limit, stop.
|
|
314
|
-
count += 1
|
|
315
|
-
if stop and count >= stop:
|
|
316
|
-
return
|
|
317
|
-
|
|
318
|
-
# End if there are no more results.
|
|
319
|
-
# XXX TODO review this logic, not sure if this is still true!
|
|
320
|
-
if last_count == count:
|
|
321
|
-
break
|
|
322
|
-
|
|
323
|
-
# Prepare the URL for the next request.
|
|
324
|
-
start += num
|
|
325
|
-
if num == 10:
|
|
326
|
-
url = url_next_page % vars()
|
|
327
|
-
else:
|
|
328
|
-
url = url_next_page_num % vars()
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
# Shortcut to single-item search.
|
|
332
|
-
# Evaluates the iterator to return the single URL as a string.
|
|
333
|
-
def lucky(*args, **kwargs):
|
|
334
|
-
"""
|
|
335
|
-
Shortcut to single-item search.
|
|
336
|
-
|
|
337
|
-
Same arguments as the main search function, but the return value changes.
|
|
338
|
-
|
|
339
|
-
:rtype: str
|
|
340
|
-
:return: URL found by Google.
|
|
341
|
-
"""
|
|
342
|
-
return next(search(*args, **kwargs))
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|