py2ls 0.1.8.4__py3-none-any.whl → 0.1.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/netfinder.py
CHANGED
@@ -33,13 +33,13 @@ from datetime import datetime
 import time
 from py2ls import ips
 
-dir_save=
+dir_save = "/Users/macjianfeng/Dropbox/Downloads/"
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Suppress WDM INFO logs
-logging.getLogger(
-proxies_glob=None
+logging.getLogger("WDM").setLevel(logging.WARNING)
+proxies_glob = None
 
 # Define supported content types and corresponding parsers
 CONTENT_PARSERS = {
@@ -49,13 +49,23 @@ CONTENT_PARSERS = {
     "text/plain": lambda text, parser: text.text,
 }
 
-
-
+
+def user_agent(
+    browsers=["chrome", "edge", "firefox", "safari"],
+    platforms=["pc", "tablet"],
+    verbose=False,
+    os=["windows", "macos", "linux"],
+):
+    ua = UserAgent(browsers=browsers, platforms=platforms, os=os)
     output_ua = ua.random
     if verbose:
         print(output_ua)
-    return output_ua
-
+    return output_ua
+
+
+def extract_text_from_content(
+    content, content_type="text/html", where=None, what=None, extend=True, **kwargs
+):
     """
     Extracts text from the given content based on the specified content type and search criteria.
 
@@ -70,6 +80,7 @@ def extract_text_from_content(content, content_type="text/html", where=None, wha
     Returns:
     - list: A list of extracted text segments.
     """
+
     def extract_text(element):
         texts = ""
         if isinstance(element, str) and element.strip():
@@ -78,6 +89,7 @@ def extract_text_from_content(content, content_type="text/html", where=None, wha
         for child in element.children:
             texts += extract_text(child)
         return texts
+
     if content is None:
         logger.error("Content is None, cannot extract text.")
         return []
@@ -88,39 +100,48 @@ def extract_text_from_content(content, content_type="text/html", where=None, wha
     if "json" in content_type:
         where = None
         return extract_text_from_json(content, where)
-    elif
+    elif "text" in content_type:
         if isinstance(where, list):
-            res=[]
+            res = []
             for where_ in where:
-                res.extend(
+                res.extend(
+                    extract_text_from_content(
+                        content,
+                        content_type="text/html",
+                        where=where_,
+                        what=what,
+                        extend=extend,
+                        **kwargs,
+                    )
+                )
             return res
         else:
             search_kwargs = {**kwargs}
             # correct 'class_'
             # dict_=dict(class_="gsc_mnd_art_info")
-            if
-            search_kwargs["class"]=search_kwargs["class_"]
-            del search_kwargs[
+            if "class_" in search_kwargs:
+                search_kwargs["class"] = search_kwargs["class_"]
+                del search_kwargs["class_"]
             if what:
                 search_kwargs["class"] = what
-            if
+            if "attrs" in kwargs:
                 result_set = content.find_all(where, **search_kwargs)
                 print(f"attrs =>{search_kwargs}")
             else:
                 result_set = content.find_all(where, attrs=dict(**search_kwargs))
                 print(f"{search_kwargs}")
-
+
             if not result_set:
                 print("Failed: check the 'attrs' setting: attrs={'id':'xample'}")
             if extend:
                 texts = ""
                 for tag in result_set:
-                    texts =texts+" "+ extract_text(tag) + " \n"
+                    texts = texts + " " + extract_text(tag) + " \n"
                 text_list = [tx.strip() for tx in texts.split(" \n") if tx.strip()]
                 return text_list
             else:
                 # texts_ = " ".join(tag.get_text() for tag in result_set)
-                texts_=[]
+                texts_ = []
                 for tag in result_set:
                     for child in tag.children:
                         if child.name is None:
@@ -130,35 +151,42 @@ def extract_text_from_content(content, content_type="text/html", where=None, wha
                 texts = [tx.strip() for tx in texts_ if tx.strip()]
                 return texts
 
+
 def extract_text_from_json(content, key=None):
     if key:
         if isinstance(content, list):
-            return [str(item.get(key,
+            return [str(item.get(key, "")) for item in content if key in item]
         if isinstance(content, dict):
-            return [str(content.get(key,
+            return [str(content.get(key, ""))]
     else:
         return [str(value) for key, value in flatten_json(content).items()]
 
+
 def flatten_json(y):
     out = {}
-
+
+    def flatten(x, name=""):
         if isinstance(x, dict):
             for a in x:
-                flatten(x[a], name + a +
+                flatten(x[a], name + a + "_")
         elif isinstance(x, list):
             i = 0
             for a in x:
-                flatten(a, name + str(i) +
+                flatten(a, name + str(i) + "_")
             i += 1
         else:
             out[name[:-1]] = x
+
     flatten(y)
     return out
 
+
 def get_proxy():
     list_ = []
     headers = {"User-Agent": user_agent()}
-    response = requests.get(
+    response = requests.get(
+        "https://free-proxy-list.net", headers=headers, timeout=30, stream=True
+    )
     content = BeautifulSoup(response.content, "html.parser")
     info = extract_text_from_content(content, where="td", extend=0)[0].split()
     count, pair_proxy = 0, 2
@@ -175,11 +203,18 @@ def get_proxy():
         "https": f"http://" + prox[1],
     }
     return proxies
+
+
 # proxies_glob=get_proxy()
-def get_soup(url, driver=
-    _,soup_=fetch_all(url, driver=driver)
+def get_soup(url, driver="req"):
+    _, soup_ = fetch_all(url, driver=driver)
     return soup_
-
+
+
+def fetch_all(
+    url,
+    parser="lxml",
+    driver="request",  # request or selenium
     by=By.TAG_NAME,
     timeout=10,
     retry=2,
@@ -196,7 +231,8 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
     proxy=None,  # Add proxy parameter
     javascript=True,  # Add JavaScript option
     disable_images=False,  # Add option to disable images
-    iframe_name=None
+    iframe_name=None,
+):  # Add option to handle iframe): # lxml is faster, # parser="html.parser"
     try:
         # # Generate a random user-agent string
         # response = requests.get(url)
@@ -205,21 +241,31 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
         # # get token from cookies
         # scrf_token=re.findall(r'csrf-token=(.*?);', response.headers.get('Set-Cookie'))[0]
         # headers = {"User-Agent": user_agent(), "X-CSRF-Token":scrf_token}
-
+
         headers = {"User-Agent": user_agent()}
-        if
-            response = requests.get(
+        if "req" in driver.lower():
+            response = requests.get(
+                url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
+            )
 
             # If the response is a redirect, follow it
             while response.is_redirect:
                 logger.info(f"Redirecting to: {response.headers['Location']}")
-                response = requests.get(
+                response = requests.get(
+                    response.headers["Location"],
+                    headers=headers,
+                    proxies=proxies_glob,
+                    timeout=30,
+                    stream=True,
+                )
             # Check for a 403 error
             if response.status_code == 403:
                 logger.warning("403 Forbidden error. Retrying...")
                 # Retry the request after a short delay
                 sleep(random.uniform(1, 3))
-                response = requests.get(
+                response = requests.get(
+                    url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
+                )
                 # Raise an error if retry also fails
                 response.raise_for_status()
 
@@ -227,11 +273,13 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
             response.raise_for_status()
 
             # Get the content type
-            content_type =
+            content_type = (
+                response.headers.get("content-type", "").split(";")[0].lower()
+            )
             if response.encoding:
                 content = response.content.decode(response.encoding)
             else:
-                content=None
+                content = None
             # logger.info(f"Content type: {content_type}")
 
             # Check if content type is supported
@@ -240,14 +288,14 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
             else:
                 logger.warning("Unsupported content type")
                 return None, None
-        elif
+        elif "se" in driver.lower():
             chrome_options = Options()
             chrome_options.add_argument("--headless")
             chrome_options.add_argument("--no-sandbox")
             chrome_options.add_argument("--disable-dev-shm-usage")
             chrome_options.add_argument(f"user-agent={user_agent()}")
             if proxy:
-                chrome_options.add_argument(f
+                chrome_options.add_argument(f"--proxy-server={proxy}")
             if disable_images:
                 prefs = {"profile.managed_default_content_settings.images": 2}
                 chrome_options.add_experimental_option("prefs", prefs)
@@ -255,9 +303,11 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
             service = Service(ChromeDriverManager().install())
 
             driver_ = webdriver.Chrome(service=service, options=chrome_options)
-
+
             if not javascript:
-                driver_.execute_cdp_cmd(
+                driver_.execute_cdp_cmd(
+                    "Emulation.setScriptExecutionDisabled", {"value": True}
+                )
 
             if login_url:
                 driver_.get(login_url)
@@ -272,7 +322,7 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
                 ).click()
 
             driver_.get(url)
-
+
             if iframe_name:
                 iframe = WebDriverWait(driver_, timeout).until(
                     EC.presence_of_element_located((By.NAME, iframe_name))
@@ -286,14 +336,16 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
             driver_.quit()
 
             content = BeautifulSoup(page_source, "html.parser")
-            if content:
-                return
+            if content:
+                return "text/html", content
             else:
                 logger.warning("Selenium could not fetch content")
                 return None, None
     except requests.RequestException as e:
-        logger.error(f"Error fetching URL '{url}': {e}")
-        return None, None
+        logger.error(f"Error fetching URL '{url}': {e}")
+        return None, None
+
+
 # # Function to change Tor IP address
 # def renew_tor_ip():
 # with Controller.from_port(port=9051) as controller:
@@ -324,14 +376,14 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
 # content_type, soup = fetch_all(url,driver=driver)
 # if soup:
 # base_url = urlparse(url)
-
+
 # # Extract links from both 'href' and 'src' attributes across relevant tags
 # tags_with_links = ['a', 'img', 'script', 'link', 'iframe', 'embed','span']
 # elements = []
 # for tag in tags_with_links:
 # elements.extend(soup.find_all(tag, href=True))
 # elements.extend(soup.find_all(tag, src=True))
-
+
 # for element in elements:
 # link_href = element.get('href') or element.get('src')
 # if link_href:
@@ -339,39 +391,40 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
 # link_href = "http:" + link_href
 # elif not link_href.startswith(("http", "https")):
 # link_href = urljoin(base_url.geturl(), link_href)
-
+
 # if all(exclusion not in link_href for exclusion in cond_ex):
 # links_href.append(link_href)
-
+
 # return list(set(links_href)) # Remove duplicates
 
+
 # elif url.split('.')[-1] in ['pdf']:
 # return url
 # else:
 # return None
-def find_links(url, driver=
+def find_links(url, driver="request", booster=False):
     links_href, cond_ex = [], ["javascript:", "mailto:", "tel:", "fax:"]
     content_type, soup = fetch_all(url, driver=driver)
-
-    if soup and content_type==
+
+    if soup and content_type == "text/html":
         base_url = urlparse(url)
-
+
         # Extract links from all tags with 'href' and 'src' attributes
         elements = soup.find_all(True, href=True) + soup.find_all(True, src=True)
-
+
         for element in elements:
-            link_href = element.get(
+            link_href = element.get("href") or element.get("src")
             if link_href:
                 if link_href.startswith("//"):
                     link_href = "http:" + link_href
                 elif not link_href.startswith(("http", "https")):
                     link_href = urljoin(base_url.geturl(), link_href)
-
+
                 if all(exclusion not in link_href for exclusion in cond_ex):
                     links_href.append(link_href)
-
+
         unique_links = list(set(links_href))  # Remove duplicates
-
+
         if booster:
             for link in unique_links:
                 if link != url:  # Avoid infinite recursion
@@ -379,27 +432,27 @@ def find_links(url, driver='request', booster=False):
                     if sub_links:
                         links_href.extend(sub_links)
             links_href = list(set(links_href))  # Remove duplicates again
-
+
         return links_href
 
-    elif url.split(
+    elif url.split(".")[-1] in ["pdf"]:
         return [url]
     else:
         return None
-
+
 
 # To determine which links are related to target domains(e.g., pages) you are interested in
-def filter_links(links, contains="html",driver=
+def filter_links(links, contains="html", driver="requ", booster=False):
     filtered_links = []
     if isinstance(contains, str):
         contains = [contains]
-    if isinstance(links,str):
-        links=find_links(links,driver=driver,booster=booster)
+    if isinstance(links, str):
+        links = find_links(links, driver=driver, booster=booster)
     for link in links:
-        parsed_link = urlparse(link)
-        condition = (
-            and "javascript:" not in parsed_link
-        )
+        parsed_link = urlparse(link)
+        condition = (
+            all([i in link for i in contains]) and "javascript:" not in parsed_link
+        )
         if condition:
             filtered_links.append(link)
     return filtered_links
@@ -421,31 +474,33 @@ def find_domain(links):
     return None
 
 
-def pdf_detector(url, contains
+def pdf_detector(url, contains=None, dir_save=None, booster=False):
     print("usage: pdf_detector(url, dir_save, booster=True")
+
     def fname_pdf_corr(fname):
-        if fname[-4:]!=
-            fname = fname[:-4] +
+        if fname[-4:] != ".pdf":
+            fname = fname[:-4] + ".pdf"
         return fname
+
     if isinstance(contains, str):
         contains = [contains]
-    if isinstance(url,str):
-        if
-            pdf_links=url
+    if isinstance(url, str):
+        if ".pdf" in url:
+            pdf_links = url
         else:
             if booster:
-                links_all=[]
-                if
-                    [links_all.append(i) for i in find_links(url) if
+                links_all = []
+                if "http" in url and url:
+                    [links_all.append(i) for i in find_links(url) if "http" in i]
                 print(links_all)
             else:
-                links_all=url
+                links_all = url
             if contains is not None:
                 pdf_links = filter_links(links=links_all, contains=[".pdf"] + contains)
             else:
                 pdf_links = filter_links(links=links_all, contains=[".pdf"])
-    elif isinstance(url,list):
-        links_all=url
+    elif isinstance(url, list):
+        links_all = url
         if contains is not None:
             pdf_links = filter_links(links=links_all, contains=["pdf"] + contains)
         else:
@@ -460,7 +515,7 @@ def pdf_detector(url, contains = None, dir_save = None, booster = False):
     if pdf_links:
         pp(f"pdf detected{pdf_links}")
     else:
-        print(
+        print("no pdf file")
     if dir_save:
         print("... is trying to download to local")
         fnames = [pdf_link_.split("/")[-1] for pdf_link_ in pdf_links]
@@ -477,40 +532,67 @@ def pdf_detector(url, contains = None, dir_save = None, booster = False):
             else:
                 print("Failed to download PDF:", response.status_code)
             idx += 1
-    print(f
+    print(f"{len(fnames)} files are downloaded:\n{fnames}\n to local: \n{dir_save}")
 
-
+
+def downloader(
+    url,
+    dir_save=dir_save,
+    kind=[".pdf"],
+    contains=None,
+    rm_folder=False,
+    booster=False,
+    verbose=True,
+    timeout=30,
+    n_try=3,
+    timestamp=False,
+):
     if verbose:
-        print(
+        print(
+            "usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)"
+        )
+
     def fname_corrector(fname, ext):
         if not ext.startswith("."):
-            ext="."+ext
-        if not fname.endswith("ext")
-            fname = fname[
+            ext = "." + ext
+        if not fname.endswith("ext"):  # if not ext in fname:
+            fname = fname[: -len(ext)] + ext
         return fname
+
     def check_and_modify_filename(directory, filename):
         base, ext = os.path.splitext(filename)
         counter = 1
         new_filename = filename
         while os.path.exists(os.path.join(directory, new_filename)):
-            if counter<=9:
-                counter_=
+            if counter <= 9:
+                counter_ = "0" + str(counter)
             else:
-                counter_=str(counter)
+                counter_ = str(counter)
             new_filename = f"{base}_{counter_}{ext}"
             counter += 1
         return new_filename
-
-
-
+
+    fpath_tmp, corrected_fname = None, None
+    if not isinstance(kind, list):
+        kind = [kind]
     if isinstance(url, list):
         for url_ in url:
-            downloader(
+            downloader(
+                url_,
+                dir_save=dir_save,
+                kind=kind,
+                contains=contains,
+                booster=booster,
+                verbose=verbose,
+                timeout=timeout,
+                n_try=n_try,
+                timestamp=timestamp,
+            )
             # sleep(random.uniform(1, 3))
-    for i,k in enumerate(kind):
-        if not k.startswith(
-            kind[i]=
-    file_links_all=[]
+    for i, k in enumerate(kind):
+        if not k.startswith("."):
+            kind[i] = "." + kind[i]
+    file_links_all = []
     for kind_ in kind:
         print(kind_)
         if isinstance(contains, str):
@@ -521,16 +603,16 @@ def downloader(url, dir_save=dir_save, kind=['.pdf'], contains=None, rm_folder=F
             else:
                 if booster:
                     links_all = []
-                    if
-                        links_all = find_links(url)
+                    if "http" in url:
+                        links_all = find_links(url)
                 else:
                     links_all = url
                 if contains is not None:
                     file_links = filter_links(links_all, contains=contains + kind_)
                 else:
-                    file_links = links_all#filter_links(links_all, contains=kind_)
+                    file_links = links_all  # filter_links(links_all, contains=kind_)
         elif isinstance(url, list):
-            links_all = url
+            links_all = url
             if contains is not None:
                 file_links = filter_links(links_all, contains=contains + kind_)
             else:
@@ -540,14 +622,14 @@ def downloader(url, dir_save=dir_save, kind=['.pdf'], contains=None, rm_folder=F
         if contains is not None:
             file_links = filter_links(links_all, contains=contains + kind_)
         else:
-            file_links = filter_links(links_all, contains=kind_)
+            file_links = filter_links(links_all, contains=kind_)
         if verbose:
             if file_links:
                 print("Files detected:")
                 pp(file_links)
             else:
-                file_links=[]
-                print(
+                file_links = []
+                print("No files detected")
         file_links_all.extend(file_links)
     if dir_save:
         if rm_folder:
@@ -557,18 +639,27 @@ def downloader(url, dir_save=dir_save, kind=['.pdf'], contains=None, rm_folder=F
         fnames = [file_link.split("/")[-1] for file_link in file_links_all]
         for idx, file_link in enumerate(file_links_all):
             headers = {"User-Agent": user_agent()}
-            itry = 0
+            itry = 0  # Retry logic with exception handling
            while itry < n_try:
                try:
                     # streaming to handle large files and reduce memory usage.
-                    response = requests.get(
+                    response = requests.get(
+                        file_link, headers=headers, timeout=timeout, stream=True
+                    )
                     if response.status_code == 200:
-                        ext = next(
+                        ext = next(
+                            (ftype for ftype in kind if ftype in file_link), None
+                        )
                         if ext:
                             corrected_fname = fname_corrector(fnames[idx], ext)
-                            corrected_fname = check_and_modify_filename(
+                            corrected_fname = check_and_modify_filename(
+                                dir_save, corrected_fname
+                            )
                             if timestamp:
-                                corrected_fname=
+                                corrected_fname = (
+                                    datetime.now().strftime("%y%m%d_%H%M%S_")
+                                    + corrected_fname
+                                )
                             fpath_tmp = os.path.join(dir_save, corrected_fname)
                             with open(fpath_tmp, "wb") as file:
                                 for chunk in response.iter_content(chunk_size=8192):
@@ -582,26 +673,29 @@ def downloader(url, dir_save=dir_save, kind=['.pdf'], contains=None, rm_folder=F
                         break  # Exit the retry loop if successful
                     else:
                         if verbose:
-                            print(
+                            print(
+                                f"Failed to download file: HTTP status code {response.status_code}"
+                            )
                 except (ChunkedEncodingError, ConnectionError) as e:
                     print(f"Attempt {itry+1} failed: {e}. Retrying in a few seconds...")
                     # time.sleep(random.uniform(0, 2))  # Random sleep to mitigate server issues
                     if fpath_tmp and os.path.exists(fpath_tmp):
-
+                        os.remove(fpath_tmp)
                     itry += 1
 
             if itry == n_try:
                 print(f"Failed to download {file_link} after {n_try} attempts.")
 
-        print(f
+        print(f"\n{len(fnames)} files were downloaded:")
         if verbose:
             if corrected_fname:
-                pp(corrected_fname)
-            else:
+                pp(corrected_fname)
+            else:
                 pp(fnames)
         print(f"\n\nsaved @:\n{dir_save}")
 
-
-
+
+
+def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=True):
     """
     Save images referenced in HTML content locally.
     Args:
@@ -614,7 +708,7 @@ def find_img(url, driver='request',dir_save="images", rm_folder=False, verbose=T
     """
     if rm_folder:
         ips.rm_folder(dir_save)
-    content_type, content = fetch_all(url,driver=driver)
+    content_type, content = fetch_all(url, driver=driver)
     print(content_type)
     if "html" in content_type.lower():
         # Create the directory if it doesn't exist
@@ -626,7 +720,7 @@ def find_img(url, driver='request',dir_save="images", rm_folder=False, verbose=T
         # Extracting images
         images = content.find_all("img", src=True)
         if not images:
-            content_type, content = fetch_all(url,driver=
+            content_type, content = fetch_all(url, driver="selenium")
             images = content.find_all("img", src=True)
         for i, image in enumerate(images):
             try:
@@ -658,7 +752,9 @@ def find_img(url, driver='request',dir_save="images", rm_folder=False, verbose=T
                 parsed_url = urlparse(absolute_image_url)
                 image_extension = os.path.splitext(parsed_url.path)[1]
                 # Download the image
-                image_response = requests.get(
+                image_response = requests.get(
+                    absolute_image_url, proxies=proxies_glob
+                )
                 # Save the image to a file
                 image_filename = os.path.join(
                     dir_save, f"image_{i}{image_extension}"
@@ -671,18 +767,21 @@ def find_img(url, driver='request',dir_save="images", rm_folder=False, verbose=T
                 print(f"Failed to process image {image_url}: {e}")
     print(f"images were saved at\n{dir_save}")
     if verbose:
-        display_thumbnail_figure(flist(dir_save,kind=
+        display_thumbnail_figure(flist(dir_save, kind="img"), dpi=100)
     return content
 
+
 def svg_to_png(svg_file):
     with WandImage(filename=svg_file, resolution=300) as img:
-        img.format =
+        img.format = "png"
         png_image = img.make_blob()
     return Image.open(io.BytesIO(png_image))
 
-
+
+def display_thumbnail_figure(dir_img_list, figsize=(10, 10), dpi=100):
     import matplotlib.pyplot as plt
     from PIL import Image
+
     """
     Display a thumbnail figure of all images in the specified directory.
     Args:
@@ -692,13 +791,13 @@ def display_thumbnail_figure(dir_img_list,figsize=(10,10),dpi=100):
     if num_images == 0:
         print("No images found to display.")
         return
-    grid_size = int(num_images
-    fig, axs = plt.subplots(grid_size, grid_size, figsize=figsize,dpi=dpi)
+    grid_size = int(num_images**0.5) + 1
+    fig, axs = plt.subplots(grid_size, grid_size, figsize=figsize, dpi=dpi)
     for ax, image_file in zip(axs.flatten(), dir_img_list):
         try:
             img = Image.open(image_file)
             ax.imshow(img)
-            ax.axis(
+            ax.axis("off")  # Hide axes
         except:
             continue
     try:
@@ -708,6 +807,7 @@ def display_thumbnail_figure(dir_img_list,figsize=(10,10),dpi=100):
     except:
         pass
 
+
 def content_div_class(content, div="div", div_class="highlight"):
     texts = [div.text for div in content.find_all(div, class_=div_class)]
     return texts
@@ -735,7 +835,7 @@ def fetch_selenium(
     javascript=True,  # Add JavaScript option
     disable_images=False,  # Add option to disable images
     iframe_name=None,  # Add option to handle iframe
-    **kwargs
+    **kwargs,
 ):
     chrome_options = Options()
     chrome_options.add_argument("--headless")
@@ -743,7 +843,7 @@ def fetch_selenium(
     chrome_options.add_argument("--disable-dev-shm-usage")
     chrome_options.add_argument(f"user-agent={user_agent()}")
     if proxy:
-        chrome_options.add_argument(f
+        chrome_options.add_argument(f"--proxy-server={proxy}")
     if disable_images:
         prefs = {"profile.managed_default_content_settings.images": 2}
         chrome_options.add_experimental_option("prefs", prefs)
@@ -752,9 +852,11 @@ def fetch_selenium(
     for attempt in range(retry):
         try:
             driver = webdriver.Chrome(service=service, options=chrome_options)
-
+
             if not javascript:
-                driver.execute_cdp_cmd(
+                driver.execute_cdp_cmd(
+                    "Emulation.setScriptExecutionDisabled", {"value": True}
+                )
 
             if login_url:
                 driver.get(login_url)
@@ -769,7 +871,7 @@ def fetch_selenium(
                 ).click()
 
             driver.get(url)
-
+
             if iframe_name:
                 iframe = WebDriverWait(driver, timeout).until(
                     EC.presence_of_element_located((By.NAME, iframe_name))
@@ -783,7 +885,9 @@ def fetch_selenium(
             driver.quit()
 
             content = BeautifulSoup(page_source, "html.parser")
-            texts = extract_text_from_content(
+            texts = extract_text_from_content(
+                content, where=where, what=what, extend=extend, **kwargs
+            )
             return texts
         except Exception as e:
             # logger.error(f"Attempt {attempt + 1} failed with error ")
@@ -797,33 +901,69 @@ def fetch_selenium(
     return []
 
 
-def fetch(
+def fetch(
+    url,
+    where="div",
+    driver="request",
+    what=None,
+    extend=True,
+    booster=False,
+    retry=2,
+    verbose=False,
+    output="text",
+    **kws,
+):
     print(f"output is {output}")
-    if
+    if "xt" in output.lower():
         for attempt in range(retry):
-            if verbose and attempt==0:
+            if verbose and attempt == 0:
                 xample = 'fetch(url,where="div",what=None,extend=True,by=By.TAG_NAME,timeout=10,retry=3,login_url=None,username=None,password=None,username_field="username",password_field="password",submit_field="submit",username_by=By.NAME,password_by=By.NAME,submit_by=By.NAME)'
                 print(xample)
-
-
-
+            if isinstance(url, str):
+                content_type, content = fetch_all(
+                    url, parser="html.parser", driver=driver
+                )
+            else:
+                content_type, content = "text/html", url
+            texts = extract_text_from_content(
+                content,
+                content_type=content_type,
+                where=where,
+                what=what,
+                extend=extend,
+                **kws,
+            )
+            if isinstance(texts, pd.core.frame.DataFrame):
                 if not texts.empty:
                     break
-            else:
+            else:
                 if texts:
                     break
             sleep(random.uniform(0.5, 1.5))
-        if isinstance(texts,pd.core.frame.DataFrame):
-            condition_=[texts.empty, booster]
+        if isinstance(texts, pd.core.frame.DataFrame):
+            condition_ = [texts.empty, booster]
         else:
-            condition_=[not texts, booster]
+            condition_ = [not texts, booster]
         if any(condition_):
             print("trying to use 'fetcher2'...")
-            texts = fetch_selenium(
+            texts = fetch_selenium(
+                url=url, where=where, what=what, extend=extend, **kws
+            )
         if texts:
             return texts
         else:
-            return fetch(
+            return fetch(
+                url,
+                where=where,
+                driver=driver,
+                what=what,
+                extend=extend,
+                booster=booster,
+                retry=retry,
+                verbose=verbose,
+                output="soup",
+                **kws,
+            )
     elif "url" in output.lower():
         base_url = urlparse(url)
         if verbose:
@@ -831,15 +971,15 @@ def fetch(url, where="div", driver='request',what=None, extend=True, booster=Fal
         return base_url.geturl()
     else:
         try:
-            content_type, content = fetch_all(url, parser="html.parser",driver=driver)
+            content_type, content = fetch_all(url, parser="html.parser", driver=driver)
             search_kwargs = {**kws}
             print(search_kwargs)
-            if
-            search_kwargs["class"]=search_kwargs["class_"]
-            del search_kwargs[
+            if "class_" in search_kwargs:
+                search_kwargs["class"] = search_kwargs["class_"]
+                del search_kwargs["class_"]
             if what:
                 search_kwargs["class"] = what
-            if
+            if "attrs" in kws:
                 result_set = content.find_all(where, **search_kwargs)
                 print(f"attrs =>{search_kwargs}")
             else:
@@ -853,26 +993,27 @@ def fetch(url, where="div", driver='request',what=None, extend=True, booster=Fal
 
 def extract_from_content(content, where="div", what=None):
     if what is None:
-        result_set = content.find_all(where,recursive=True)
+        result_set = content.find_all(where, recursive=True)
         texts_ = " ".join(tag.get_text() + "\n" for tag in result_set)
         texts = [tx for tx in texts_.split("\n") if tx]
     else:
         texts_ = " ".join(
-            div.get_text() + "\n"
+            div.get_text() + "\n"
+            for div in content.find_all(where, class_=what, recursive=True)
         )
         texts = [tx for tx in texts_.split("\n") if tx]
     return texts
 
 
-def find_forms(url, driver=
-    content_type, content = fetch_all(url,driver=driver)
+def find_forms(url, driver="requ"):
+    content_type, content = fetch_all(url, driver=driver)
     df = pd.DataFrame()
     # Extracting forms and inputs
-    forms = content.find_all("form",recursive=True)
+    forms = content.find_all("form", recursive=True)
     form_data = []
     for form in forms:
         if form:
-            form_inputs = form.find_all("input",recursive=True)
+            form_inputs = form.find_all("input", recursive=True)
             input_data = {}
             for input_tag in form_inputs:
                 input_type = input_tag.get("type")
@@ -891,24 +1032,30 @@ def clean_string(value):
     return value
 
 
-def find_all(url, dir_save=None, driver=
-    content_type, content = fetch_all(url,driver=driver)
+def find_all(url, dir_save=None, driver="req"):
+    content_type, content = fetch_all(url, driver=driver)
     paragraphs_text = extract_from_content(content, where="p")
     # Extracting specific elements by class
     specific_elements_text = [
-        element.text
+        element.text
+        for element in content.find_all(class_="specific-class", recursive=True)
+        if element
     ]
     # Extracting links (anchor tags)
     links_href = find_links(url)
     links_href = filter_links(links_href)
 
     # Extracting images
-    images_src = [
+    images_src = [
+        image["src"]
+        for image in content.find_all("img", src=True, recursive=True)
+        if image
+    ]
 
     # Extracting headings (h1, h2, h3, etc.)
     headings = [f"h{i}" for i in range(1, 7)]
     headings_text = {
-        heading: [tag.text for tag in content.find_all(heading,recursive=True)]
+        heading: [tag.text for tag in content.find_all(heading, recursive=True)]
         for heading in headings
         if heading
     }
@@ -916,15 +1063,15 @@ def find_all(url, dir_save=None, driver='req'):
     # Extracting lists (ul, ol, li)
     list_items_text = [
         item.text
-        for list_ in content.find_all(["ul", "ol"],recursive=True)
-        for item in list_.find_all("li",recursive=True)
+        for list_ in content.find_all(["ul", "ol"], recursive=True)
+        for item in list_.find_all("li", recursive=True)
         if item
     ]
 
     # Extracting tables (table, tr, td)
     table_cells_text = [
         cell.text
-        for table in content.find_all("table",recursive=True)
+        for table in content.find_all("table", recursive=True)
         for row in table.find_all("tr")
         for cell in row.find_all("td")
         if cell
@@ -933,10 +1080,12 @@ def find_all(url, dir_save=None, driver='req'):
     # Extracting other elements
     divs_content = extract_from_content(content, where="div")
     headers_footer_content = [
-        tag.text
+        tag.text
+        for tag in content.find_all(["header", "footer"], recursive=True)
+        if tag
     ]
     meta_tags_content = [
-        (tag.name, tag.attrs) for tag in content.find_all("meta",recursive=True) if tag
+        (tag.name, tag.attrs) for tag in content.find_all("meta", recursive=True) if tag
     ]
     spans_content = extract_from_content(content, where="span")
     bold_text_content = extract_from_content(content, where="b")
@@ -996,15 +1145,21 @@ def find_all(url, dir_save=None, driver='req'):
     script_texts = content_div_class(content, div="div", div_class="highlight")
     lists_to_fill.append(script_texts)
 
-    audio_src = [
-
-
+    audio_src = [
+        audio["src"] for audio in content.find_all("audio", src=True, recursive=True)
+    ]
+    video_src = [
+        video["src"] for video in content.find_all("video", src=True, recursive=True)
+    ]
+    iframe_src = [
+        iframe["src"] for iframe in content.find_all("iframe", src=True, recursive=True)
+    ]
     lists_to_fill.extend([audio_src, video_src, iframe_src])
 
     rss_links = [
         link["href"]
         for link in content.find_all(
-            "link", type=["application/rss+xml", "application/atom+xml"],recursive=True
+            "link", type=["application/rss+xml", "application/atom+xml"], recursive=True
         )
     ]
     lists_to_fill.append(rss_links)
@@ -1074,12 +1229,16 @@ def find_all(url, dir_save=None, driver='req'):
 
 
 def flist(fpath, kind="all"):
-    all_files = [
-
+    all_files = [
+        os.path.join(fpath, f)
+        for f in os.listdir(fpath)
+        if os.path.isfile(os.path.join(fpath, f))
+    ]
+    if kind == "all" or "all" in kind:
         return all_files
     if isinstance(kind, str):
-        kind=[kind]
-    filt_files=[]
+        kind = [kind]
+    filt_files = []
     for f in all_files:
         for kind_ in kind:
             if isa(f, kind_):
@@ -1087,7 +1246,8 @@ def flist(fpath, kind="all"):
                 break
     return filt_files
 
-
+
+def isa(fpath, kind="img"):
     """
     kinds file paths based on the specified kind.
     Args:
@@ -1097,51 +1257,66 @@ def isa(fpath, kind='img'):
     Returns:
         bool: True if the file matches the kind, False otherwise.
     """
-    if
+    if "img" in kind.lower():
         return is_image(fpath)
-    elif
+    elif "doc" in kind.lower():
         return is_document(fpath)
-    elif
+    elif "zip" in kind.lower():
         return is_zip(fpath)
     else:
         return False
 
+
 def is_image(fpath):
     mime_type, _ = mimetypes.guess_type(fpath)
-    if mime_type and mime_type.startswith(
+    if mime_type and mime_type.startswith("image"):
         return True
     else:
         return False
 
+
 def is_document(fpath):
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type and (
-        mime_type.startswith(
-        mime_type ==
-        mime_type ==
-        mime_type
-
-        mime_type ==
-        mime_type
-
+        mime_type.startswith("text/")
+        or mime_type == "application/pdf"
+        or mime_type == "application/msword"
+        or mime_type
+        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+        or mime_type == "application/vnd.ms-excel"
+        or mime_type
+        == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+        or mime_type == "application/vnd.ms-powerpoint"
+        or mime_type
+        == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
     ):
         return True
     else:
         return False
 
+
 def is_zip(fpath):
     mime_type, _ = mimetypes.guess_type(fpath)
-    if mime_type ==
+    if mime_type == "application/zip":
         return True
     else:
         return False
 
-def search(query, limit=5, kind='text', output='df',verbose=False,download=False, dir_save=dir_save):
 
-
+def search(
+    query,
+    limit=5,
+    kind="text",
+    output="df",
+    verbose=False,
+    download=False,
+    dir_save=dir_save,
+):
+
+    if "te" in kind.lower():
         results = DDGS().text(query, max_results=limit)
-        res=pd.DataFrame(results)
-        res.rename(columns={"href":"links"},inplace=True)
+        res = pd.DataFrame(results)
+        res.rename(columns={"href": "links"}, inplace=True)
         if verbose:
             print(f'searching "{query}": got the results below\n{res}')
         if download:
@@ -1152,16 +1327,18 @@ def search(query, limit=5, kind='text', output='df',verbose=False,download=False
                 print(f"failed link")
         return res
 
+
 def echo(query, model="gpt", verbose=True, log=True, dir_save=dir_save):
     def is_in_any(str_candi_short, str_full, ignore_case=True):
         if isinstance(str_candi_short, str):
-            str_candi_short=[str_candi_short]
-        res_bool=[]
+            str_candi_short = [str_candi_short]
+        res_bool = []
         if ignore_case:
-            [res_bool.append(i in str_full.lower())
+            [res_bool.append(i in str_full.lower()) for i in str_candi_short]
         else:
-            [res_bool.append(i in str_full)
+            [res_bool.append(i in str_full) for i in str_candi_short]
         return any(res_bool)
+
     def valid_mod_name(str_fly):
         if is_in_any(str_fly, "claude-3-haiku"):
             return "claude-3-haiku"
@@ -1172,27 +1349,32 @@ def echo(query, model="gpt", verbose=True, log=True, dir_save=dir_save):
         elif is_in_any(str_fly, "mixtral-8x7b"):
             return "mixtral-8x7b"
         else:
-            print(
-
+            print(
+                f"not support your model{model}, supported models: 'claude','gpt(default)', 'llama','mixtral'"
+            )
+            return "gpt-3.5"  # default model
+
     model_valid = valid_mod_name(model)
-    res=DDGS().chat(query, model=model_valid)
+    res = DDGS().chat(query, model=model_valid)
     if verbose:
         pp(res)
     if log:
-        dt_str=datetime.fromtimestamp(time.time()).strftime(
+        dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
         res_ = f"###{dt_str}\n\n>{res}\n"
         os.makedirs(dir_save, exist_ok=True)
         fpath = os.path.join(dir_save, f"log_ai.md")
-        ips.fupdate(fpath=fpath,content=res_)
+        ips.fupdate(fpath=fpath, content=res_)
        print(f"log file:{fpath}")
    return res
 
+
 def chat(*args, **kwargs):
     if len(args) == 1 and isinstance(args[0], str):
-        kwargs[
+        kwargs["query"] = args[0]
     return echo(**kwargs)
 
+
 def ai(*args, **kwargs):
     if len(args) == 1 and isinstance(args[0], str):
-        kwargs[
-    return echo(**kwargs)
+        kwargs["query"] = args[0]
+    return echo(**kwargs)