py2ls 0.1.4.6__py3-none-any.whl → 0.1.4.8__py3-none-any.whl
This diff compares the contents of publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- py2ls/.git/config +1 -0
- py2ls/ips.py +581 -118
- py2ls/netfinder.py +452 -128
- py2ls/translator.py +172 -121
- {py2ls-0.1.4.6.dist-info → py2ls-0.1.4.8.dist-info}/METADATA +1 -1
- {py2ls-0.1.4.6.dist-info → py2ls-0.1.4.8.dist-info}/RECORD +7 -7
- {py2ls-0.1.4.6.dist-info → py2ls-0.1.4.8.dist-info}/WHEEL +1 -1
py2ls/netfinder.py
CHANGED
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 import requests
+from requests.utils import dict_from_cookiejar
 import os
 from urllib.parse import urlparse, urljoin
 import base64
@@ -26,7 +27,12 @@ import mimetypes
 import io
 import matplotlib.pyplot as plt
 from PIL import Image
+from duckduckgo_search import DDGS
+from datetime import datetime
+import time
+from py2ls import ips
 
+dir_save='/Users/macjianfeng/Dropbox/Downloads/'
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -63,6 +69,14 @@ def extract_text_from_content(content, content_type="text/html", where=None, wha
     Returns:
     - list: A list of extracted text segments.
     """
+    def extract_text(element):
+        texts = ""
+        if isinstance(element, str) and element.strip():
+            texts += element.strip()
+        elif hasattr(element, "children"):
+            for child in element.children:
+                texts += extract_text(child)
+        return texts
     if content is None:
         logger.error("Content is None, cannot extract text.")
         return []
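The added extract_text helper walks a BeautifulSoup tree recursively and concatenates every non-empty string node. A minimal standalone sketch of the same idea; the sample HTML is illustrative and not part of the package:

    from bs4 import BeautifulSoup

    def extract_text(element):
        # Recursively collect text from a tag and all of its children
        texts = ""
        if isinstance(element, str) and element.strip():
            texts += element.strip()
        elif hasattr(element, "children"):
            for child in element.children:
                texts += extract_text(child)
        return texts

    soup = BeautifulSoup("<div>Hello <b>world</b></div>", "html.parser")
    print(extract_text(soup))  # -> "Helloworld": strings are stripped, then concatenated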
@@ -70,7 +84,6 @@ def extract_text_from_content(content, content_type="text/html", where=None, wha
     if content_type not in CONTENT_PARSERS:
         logger.error(f"Unsupported content type: {content_type}")
         return []
-
     if "json" in content_type:
         where = None
         return extract_text_from_json(content, where)
@@ -81,32 +94,30 @@ def extract_text_from_content(content, content_type="text/html", where=None, wha
         res.extend(extract_text_from_content(content, content_type="text/html", where=where_, what=what, extend=extend, **kwargs))
         return res
     else:
-
-
-
-
-
-
-
-
-
-
-        search_kwargs = {**kwargs}
-        if what:
-            search_kwargs["class_"] = what
-
+        search_kwargs = {**kwargs}
+        # correct 'class_'
+        # dict_=dict(class_="gsc_mnd_art_info")
+        if 'class_' in search_kwargs:
+            search_kwargs["class"]=search_kwargs["class_"]
+            del search_kwargs['class_']
+        if what:
+            search_kwargs["class"] = what
+        if 'attrs' in kwargs:
             result_set = content.find_all(where, **search_kwargs)
+            print(f"attrs =>{search_kwargs}")
+        else:
+            result_set = content.find_all(where, attrs=dict(**search_kwargs))
+            print(f"{search_kwargs}")
+
+        if not result_set:
+            print("Failed: check the 'attrs' setting: attrs={'id':'xample'}")
+        if extend:
             texts = ""
             for tag in result_set:
                 texts += extract_text(tag) + "\n"
             text_list = [tx.strip() for tx in texts.split("\n") if tx.strip()]
             return text_list
         else:
-            search_kwargs = {**kwargs}
-            if what:
-                search_kwargs["class_"] = what
-
-            result_set = content.find_all(where, **search_kwargs)
             texts_ = " ".join(tag.get_text() for tag in result_set)
             texts = [tx.strip() for tx in texts_.split("\n") if tx.strip()]
             return texts
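The reworked branch normalizes keyword arguments before calling BeautifulSoup's find_all: a class_ keyword is rewritten to class, what overrides it, and anything else is passed via attrs. A small self-contained sketch of that normalization; the HTML and attribute values are made up for illustration:

    from bs4 import BeautifulSoup

    html = '<div class="gsc_mnd_art_info">a</div><div id="xample">b</div>'
    soup = BeautifulSoup(html, "html.parser")

    def find_tags(content, where="div", what=None, **kwargs):
        # mirror the diff's normalization: 'class_' -> 'class', 'what' wins, rest goes to attrs
        search_kwargs = {**kwargs}
        if "class_" in search_kwargs:
            search_kwargs["class"] = search_kwargs.pop("class_")
        if what:
            search_kwargs["class"] = what
        return content.find_all(where, attrs=dict(**search_kwargs))

    print(find_tags(soup, what="gsc_mnd_art_info"))  # matches the first div
    print(find_tags(soup, id="xample"))              # matches the second div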
@@ -157,51 +168,124 @@ def get_proxy():
     }
     return proxies
 # proxies_glob=get_proxy()
-
-
+def get_soup(url, driver='req'):
+    _,soup_=fetch_all(url, driver=driver)
+    return soup_
+def fetch_all(url, parser="lxml", driver='request', # request or selenium
+              by=By.TAG_NAME,
+              timeout=10,
+              retry=2,
+              login_url=None,
+              username=None,
+              password=None,
+              username_field="username",
+              password_field="password",
+              submit_field="submit",
+              username_by=By.NAME,
+              password_by=By.NAME,
+              submit_by=By.NAME,
+              # capability='eager', # eager or none
+              proxy=None, # Add proxy parameter
+              javascript=True, # Add JavaScript option
+              disable_images=False, # Add option to disable images
+              iframe_name=None): # Add option to handle iframe): # lxml is faster, # parser="html.parser"
     try:
-        # Generate a random user-agent string
+        # # Generate a random user-agent string
+        # response = requests.get(url)
+        # # get cookies
+        # cookie=dict_from_cookiejar(response.cookies)
+        # # get token from cookies
+        # scrf_token=re.findall(r'csrf-token=(.*?);', response.headers.get('Set-Cookie'))[0]
+        # headers = {"User-Agent": user_agent(), "X-CSRF-Token":scrf_token}
+
         headers = {"User-Agent": user_agent()}
-
-        # Send the initial request
-        response = requests.get(url, headers=headers,proxies=proxies_glob)
-
-        # If the response is a redirect, follow it
-        while response.is_redirect:
-            logger.info(f"Redirecting to: {response.headers['Location']}")
-            response = requests.get(response.headers["Location"], headers=headers,proxies=proxies_glob)
-        # Check for a 403 error
-        if response.status_code == 403:
-            logger.warning("403 Forbidden error. Retrying...")
-            # Retry the request after a short delay
-            sleep(random.uniform(1, 3))
+        if 'req' in driver.lower():
             response = requests.get(url, headers=headers,proxies=proxies_glob)
-
+
+            # If the response is a redirect, follow it
+            while response.is_redirect:
+                logger.info(f"Redirecting to: {response.headers['Location']}")
+                response = requests.get(response.headers["Location"], headers=headers,proxies=proxies_glob)
+            # Check for a 403 error
+            if response.status_code == 403:
+                logger.warning("403 Forbidden error. Retrying...")
+                # Retry the request after a short delay
+                sleep(random.uniform(1, 3))
+                response = requests.get(url, headers=headers,proxies=proxies_glob)
+                # Raise an error if retry also fails
+                response.raise_for_status()
+
+            # Raise an error for other HTTP status codes
             response.raise_for_status()
 
-
-
+            # Get the content type
+            content_type = response.headers.get("content-type", "").split(";")[0].lower()
+            if response.encoding:
+                content = response.content.decode(response.encoding)
+            else:
+                content=None
+            # logger.info(f"Content type: {content_type}")
 
-
-
-
-
-
-
-
+            # Check if content type is supported
+            if content_type in CONTENT_PARSERS and content:
+                return content_type, CONTENT_PARSERS[content_type](content, parser)
+            else:
+                logger.warning("Unsupported content type")
+                return None, None
+        elif 'se' in driver.lower():
+            chrome_options = Options()
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument(f"user-agent={user_agent()}")
+            if proxy:
+                chrome_options.add_argument(f'--proxy-server={proxy}')
+            if disable_images:
+                prefs = {"profile.managed_default_content_settings.images": 2}
+                chrome_options.add_experimental_option("prefs", prefs)
+            # chrome_options.page_load_strategy = capability
+            service = Service(ChromeDriverManager().install())
+
+            driver_ = webdriver.Chrome(service=service, options=chrome_options)
+
+            if not javascript:
+                driver_.execute_cdp_cmd("Emulation.setScriptExecutionDisabled", {"value": True})
 
-
-
-
-
-
-
-
-
-
+            if login_url:
+                driver_.get(login_url)
+                WebDriverWait(driver_, timeout).until(
+                    EC.presence_of_element_located((username_by, username_field))
+                ).send_keys(username)
+                WebDriverWait(driver_, timeout).until(
+                    EC.presence_of_element_located((password_by, password_field))
+                ).send_keys(password)
+                WebDriverWait(driver_, timeout).until(
+                    EC.element_to_be_clickable((submit_by, submit_field))
+                ).click()
 
+            driver_.get(url)
+
+            if iframe_name:
+                iframe = WebDriverWait(driver_, timeout).until(
+                    EC.presence_of_element_located((By.NAME, iframe_name))
+                )
+                driver_.switch_to.frame(iframe)
 
+            # WebDriverWait(driver, timeout).until(
+            #     EC.presence_of_element_located((by, where))
+            # )
+            page_source = driver_.page_source
+            driver_.quit()
 
+            content = BeautifulSoup(page_source, "html.parser")
+            if content:
+                return 'text/html', content
+            else:
+                logger.warning("Selenium could not fetch content")
+                return None, None
+    except requests.RequestException as e:
+        logger.error(f"Error fetching URL '{url}': {e}")
+        return None, None
 # # Function to change Tor IP address
 # def renew_tor_ip():
 #     with Controller.from_port(port=9051) as controller:
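Based on the signatures added above, fetch_all now switches between a plain requests fetch and a headless-Chrome Selenium fetch via the driver argument, and get_soup is a thin wrapper that returns only the parsed soup. A hedged usage sketch; the URL is a placeholder and the exact return values depend on the page:

    from py2ls import netfinder

    # plain requests path (any driver string containing 'req')
    content_type, soup = netfinder.fetch_all("https://example.com", driver="request")

    # Selenium path (any driver string containing 'se'), useful for JS-rendered pages
    content_type, soup = netfinder.fetch_all("https://example.com", driver="selenium",
                                             disable_images=True, timeout=10)

    # convenience wrapper returning only the soup
    soup = netfinder.get_soup("https://example.com", driver="req")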
@@ -227,24 +311,90 @@ def fetch_all(url, parser="lxml"): # lxml is faster, # parser="html.parser"
 #     return None
 
 
-def find_links(url):
-
-
-
+# def find_links(url,driver='request'):
+#     links_href,cond_ex= [],["javascript:","mailto:","tel:","fax:"]
+#     content_type, soup = fetch_all(url,driver=driver)
+#     if soup:
+#         base_url = urlparse(url)
+
+#         # Extract links from both 'href' and 'src' attributes across relevant tags
+#         tags_with_links = ['a', 'img', 'script', 'link', 'iframe', 'embed','span']
+#         elements = []
+#         for tag in tags_with_links:
+#             elements.extend(soup.find_all(tag, href=True))
+#             elements.extend(soup.find_all(tag, src=True))
+
+#         for element in elements:
+#             link_href = element.get('href') or element.get('src')
+#             if link_href:
+#                 if link_href.startswith("//"):
+#                     link_href = "http:" + link_href
+#                 elif not link_href.startswith(("http", "https")):
+#                     link_href = urljoin(base_url.geturl(), link_href)
+
+#                 if all(exclusion not in link_href for exclusion in cond_ex):
+#                     links_href.append(link_href)
+
+#         return list(set(links_href))  # Remove duplicates
+
+#     elif url.split('.')[-1] in ['pdf']:
+#         return url
+#     else:
+#         return None
+def find_links(url, driver='request', booster=False):
+    links_href, cond_ex = [], ["javascript:", "mailto:", "tel:", "fax:"]
+    content_type, soup = fetch_all(url, driver=driver)
+
+    if soup and content_type=='text/html':
         base_url = urlparse(url)
-
-
-
-
-
-
-        if
-
-
+
+        # Extract links from all tags with 'href' and 'src' attributes
+        elements = soup.find_all(True, href=True) + soup.find_all(True, src=True)
+
+        for element in elements:
+            link_href = element.get('href') or element.get('src')
+            if link_href:
+                if link_href.startswith("//"):
+                    link_href = "http:" + link_href
+                elif not link_href.startswith(("http", "https")):
+                    link_href = urljoin(base_url.geturl(), link_href)
+
+                if all(exclusion not in link_href for exclusion in cond_ex):
+                    links_href.append(link_href)
+
+        unique_links = list(set(links_href))  # Remove duplicates
+
+        if booster:
+            for link in unique_links:
+                if link != url:  # Avoid infinite recursion
+                    sub_links = find_links(link, driver=driver, booster=False)
+                    if sub_links:
+                        links_href.extend(sub_links)
+            links_href = list(set(links_href))  # Remove duplicates again
+
+        return links_href
+
     elif url.split('.')[-1] in ['pdf']:
-        return url
+        return [url]
     else:
         return None
+
+
+# To determine which links are related to target domains(e.g., pages) you are interested in
+def filter_links(links, contains="html",driver='requ', booster=False):
+    filtered_links = []
+    if isinstance(contains, str):
+        contains = [contains]
+    if isinstance(links,str):
+        links=find_links(links,driver=driver,booster=booster)
+    for link in links:
+        parsed_link = urlparse(link)
+        condition = (all([i in link for i in contains])
+                     and "javascript:" not in parsed_link
+                     )
+        if condition:
+            filtered_links.append(link)
+    return filtered_links
 
 
 def find_domain(links):
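find_links now accepts driver and booster arguments (booster recurses one level into the discovered links), and the relocated filter_links keeps only links containing every substring in contains. A usage sketch with placeholder values that are not part of the diff:

    from py2ls import netfinder

    links = netfinder.find_links("https://example.com", driver="request", booster=False)

    # keep only links whose URL contains all of the given substrings (values are illustrative)
    pdf_like = netfinder.filter_links(links, contains=["pdf"])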
@@ -263,24 +413,8 @@ def find_domain(links):
     return None
 
 
-
-
-    filtered_links = []
-    if isinstance(contains, str):
-        contains = [contains]
-    if isinstance(links,str):
-        links=find_links(links)
-    for link in links:
-        parsed_link = urlparse(link)
-        condition = (all([i in link for i in contains])
-                     and "javascript:" not in parsed_link
-                     )
-        if condition:
-            filtered_links.append(link)
-    return filtered_links
-
-
-def pdf_detector(url, contains=None, dir_save=None,booster=False):
+def pdf_detector(url, contains = None, dir_save = None, booster = False):
+    print("usage: pdf_detector(url, dir_save, booster=True")
     def fname_pdf_corr(fname):
         if fname[-4:]!='.pdf':
             fname = fname[:-4] + '.pdf'
@@ -337,8 +471,101 @@ def pdf_detector(url, contains=None, dir_save=None,booster=False):
             idx += 1
     print(f'{len(fnames)} files are downloaded:\n{fnames}\n to local: \n{dir_save}')
 
-
-
+def downloader(url, dir_save=dir_save, kind=['.pdf'], contains=None, rm_folder=False, booster=False,verbose=True):
+    if verbose:
+        print("usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)")
+    def fname_corrector(fname, ext):
+        if not ext.startswith("."):
+            ext="."+ext
+        if not fname.endswith("ext"):#if not ext in fname:
+            fname = fname[:-len(ext)] + ext
+        return fname
+    def check_and_modify_filename(directory, filename):
+        base, ext = os.path.splitext(filename)
+        counter = 2
+        new_filename = filename
+        while os.path.exists(os.path.join(directory, new_filename)):
+            new_filename = f"{base}_{counter}{ext}"
+            counter += 1
+        return new_filename
+    if not isinstance(kind,list):
+        kind=[kind]
+    if isinstance(url, list):
+        for url_ in url:
+            downloader(url_, dir_save=dir_save, kind=kind, contains=contains, booster=booster,verbose=verbose)
+            # sleep(random.uniform(1, 3))
+    for i,k in enumerate(kind):
+        if not k.startswith('.'):
+            kind[i]='.'+kind[i]
+    file_links_all=[]
+    for kind_ in kind:
+        print(kind_)
+        if isinstance(contains, str):
+            contains = [contains]
+        if isinstance(url, str):
+            if any(ext in url for ext in kind):
+                file_links = [url]
+            else:
+                if booster:
+                    links_all = []
+                    if 'http' in url:
+                        links_all = find_links(url)
+                    else:
+                        links_all = url
+                if contains is not None:
+                    file_links = filter_links(links_all, contains=contains + kind_)
+                else:
+                    file_links = links_all#filter_links(links_all, contains=kind_)
+        elif isinstance(url, list):
+            links_all = url
+            if contains is not None:
+                file_links = filter_links(links_all, contains=contains + kind_)
+            else:
+                file_links = filter_links(links_all, contains=kind_)
+        else:
+            links_all = find_links(url)
+            if contains is not None:
+                file_links = filter_links(links_all, contains=contains + kind_)
+            else:
+                file_links = filter_links(links_all, contains=kind_)
+        if verbose:
+            if file_links:
+                print("Files detected:")
+                pp(file_links)
+            else:
+                file_links=[]
+                print('No files detected')
+        file_links_all.extend(file_links)
+    if dir_save:
+        if rm_folder:
+            ips.rm_folder(dir_save)
+        if verbose:
+            print(f"\n... attempting to download to local\n")
+        fnames = [file_link.split("/")[-1] for file_link in file_links_all]
+        for idx, file_link in enumerate(file_links_all):
+            headers = {"User-Agent": user_agent()}
+            response = requests.get(file_link, headers=headers)
+            if response.status_code == 200:
+                ext = next((ftype for ftype in kind if ftype in file_link), None)
+                if ext:
+                    corrected_fname = fname_corrector(fnames[idx], ext)
+                    corrected_fname = check_and_modify_filename(dir_save, corrected_fname)
+                    with open(os.path.join(dir_save, corrected_fname), "wb") as file:
+                        file.write(response.content)
+                    if verbose:
+                        print(f"Done! {fnames[idx]}")
+                else:
+                    if verbose:
+                        print(f"Unknown file type for {file_link}")
+            else:
+                if verbose:
+                    print(f"Failed to download file: {response.status_code}")
+        print(f'\n{len(fnames)} files were downloaded:')
+        if verbose:
+            pp(fnames)
+        print(f"\n\nsaved @:\n{dir_save}")
+
+def find_img(url, driver='request',dir_save="images", rm_folder=False, verbose=True):
     """
     Save images referenced in HTML content locally.
     Args:
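The new downloader combines find_links/filter_links with requests to save matching files, de-duplicating file names via check_and_modify_filename. A hedged call sketch; the URL and target directory are placeholders, not part of the diff:

    from py2ls import netfinder

    # download every linked PDF and Excel file from a page (illustrative values)
    netfinder.downloader("https://example.com/reports",
                         dir_save="/tmp/downloads",
                         kind=[".pdf", ".xls"],
                         contains=None,
                         booster=False,
                         verbose=True)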
@@ -349,7 +576,10 @@ def find_img(url, dir_save="images", verbose=True):
     Returns:
         str: HTML content with updated image URLs pointing to local files.
     """
-
+    if rm_folder:
+        ips.rm_folder(dir_save)
+    content_type, content = fetch_all(url,driver=driver)
+    print(content_type)
     if "html" in content_type.lower():
         # Create the directory if it doesn't exist
         os.makedirs(dir_save, exist_ok=True)
@@ -359,6 +589,9 @@ def find_img(url, dir_save="images", verbose=True):
         image_links = []
         # Extracting images
         images = content.find_all("img", src=True)
+        if not images:
+            content_type, content = fetch_all(url,driver='selenium')
+            images = content.find_all("img", src=True)
         for i, image in enumerate(images):
             try:
                 image_url = image["src"]
@@ -380,8 +613,8 @@ def find_img(url, dir_save="images", verbose=True):
                     with open(image_filename, "wb") as image_file:
                         image_file.write(image_data)
                     image["src"] = image_filename
-                    if verbose:
-
+                    # if verbose:
+                    #     plt.imshow(image_data)
                 else:
                     # Construct the absolute image URL
                     absolute_image_url = urljoin(url, image_url)
@@ -404,11 +637,13 @@ def find_img(url, dir_save="images", verbose=True):
     if verbose:
         display_thumbnail_figure(flist(dir_save,filter='img'),dpi=100)
     return content
+
 def svg_to_png(svg_file):
     with WandImage(filename=svg_file, resolution=300) as img:
         img.format = 'png'
         png_image = img.make_blob()
         return Image.open(io.BytesIO(png_image))
+
 def display_thumbnail_figure(dir_img_list,figsize=(10,10),dpi=100):
     import matplotlib.pyplot as plt
     from PIL import Image
@@ -418,16 +653,11 @@ def display_thumbnail_figure(dir_img_list,figsize=(10,10),dpi=100):
         dir_img_list (list): List of the Directory containing the images.
     """
     num_images = len(dir_img_list)
-
     if num_images == 0:
         print("No images found to display.")
         return
-
-    # Determine grid size
     grid_size = int(num_images ** 0.5) + 1
-
     fig, axs = plt.subplots(grid_size, grid_size, figsize=figsize,dpi=dpi)
-
     for ax, image_file in zip(axs.flatten(), dir_img_list):
         try:
             img = Image.open(image_file)
@@ -435,11 +665,12 @@ def display_thumbnail_figure(dir_img_list,figsize=(10,10),dpi=100):
             ax.axis('off')  # Hide axes
         except:
             continue
-
-
-
-
-
+    try:
+        [ax.axis("off") for ax in axs.flatten()]
+        plt.tight_layout()
+        plt.show()
+    except:
+        pass
 
 def content_div_class(content, div="div", div_class="highlight"):
     texts = [div.text for div in content.find_all(div, class_=div_class)]
@@ -530,28 +761,58 @@ def fetch_selenium(
         return []
 
 
-def fetch(url, where="div", what=None, extend=True, booster=False,retry=2,verbose=False, **kws):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def fetch(url, where="div", driver='request',what=None, extend=True, booster=False,retry=2,verbose=False, output="text", **kws):
+    print(f"output is {output}")
+    if 'xt' in output.lower():
+        for attempt in range(retry):
+            if verbose and attempt==0:
+                xample = 'fetch(url,where="div",what=None,extend=True,by=By.TAG_NAME,timeout=10,retry=3,login_url=None,username=None,password=None,username_field="username",password_field="password",submit_field="submit",username_by=By.NAME,password_by=By.NAME,submit_by=By.NAME)'
+                print(xample)
+            content_type, content = fetch_all(url, parser="html.parser",driver=driver)
+            texts=extract_text_from_content(content,content_type=content_type,where=where,what=what,extend=extend, **kws)
+            if isinstance(texts, pd.core.frame.DataFrame):
+                if not texts.empty:
+                    break
+            else:
+                if texts:
+                    break
+            sleep(random.uniform(0.5, 1.5))
+        if isinstance(texts,pd.core.frame.DataFrame):
+            condition_=[texts.empty, booster]
+        else:
+            condition_=[not texts, booster]
+        if any(condition_):
+            print("trying to use 'fetcher2'...")
+            texts = fetch_selenium(url=url, where=where, what=what, extend=extend, **kws)
+        if texts:
+            return texts
+        else:
+            return fetch(url, where=where, driver=driver,what=what, extend=extend, booster=booster,retry=retry,verbose=verbose, output="soup", **kws)
+    elif "url" in output.lower():
+        base_url = urlparse(url)
+        if verbose:
+            print("urljoin(urlparse(url), link_part)")
+        return base_url.geturl()
     else:
-
-
-
-
-
+        try:
+            content_type, content = fetch_all(url, parser="html.parser",driver=driver)
+            search_kwargs = {**kws}
+            print(search_kwargs)
+            if 'class_' in search_kwargs:
+                search_kwargs["class"]=search_kwargs["class_"]
+                del search_kwargs['class_']
+            if what:
+                search_kwargs["class"] = what
+            if 'attrs' in kws:
+                result_set = content.find_all(where, **search_kwargs)
+                print(f"attrs =>{search_kwargs}")
+            else:
+                result_set = content.find_all(where, attrs=dict(**search_kwargs))
+                print(f"{search_kwargs}")
+            return result_set
+        except:
+            print("got nothing")
+            return None
 
 
 def extract_from_content(content, where="div", what=None):
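fetch gains driver and output parameters: an output value containing "xt" (e.g. the default "text") returns extracted text with retries and a Selenium fallback, an output containing "url" returns the normalized base URL, and anything else returns the raw BeautifulSoup result set. A sketch of the three modes; the URL and selector values are illustrative, not from the diff:

    from py2ls import netfinder

    # 1) extracted text (default output="text")
    texts = netfinder.fetch("https://example.com", where="div", what="article-body")

    # 2) just the normalized URL
    base = netfinder.fetch("https://example.com/page?q=1", output="url")

    # 3) raw BeautifulSoup result set (any other output value, e.g. "soup")
    tags = netfinder.fetch("https://example.com", where="a", output="soup")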
@@ -567,8 +828,8 @@ def extract_from_content(content, where="div", what=None):
     return texts
 
 
-def find_forms(url):
-    content_type, content = fetch_all(url)
+def find_forms(url, driver='requ'):
+    content_type, content = fetch_all(url,driver=driver)
     df = pd.DataFrame()
     # Extracting forms and inputs
     forms = content.find_all("form",recursive=True)
@@ -594,8 +855,8 @@ def clean_string(value):
     return value
 
 
-def find_all(url, dir_save=None):
-    content_type, content = fetch_all(url)
+def find_all(url, dir_save=None, driver='req'):
+    content_type, content = fetch_all(url,driver=driver)
     paragraphs_text = extract_from_content(content, where="p")
     # Extracting specific elements by class
     specific_elements_text = [
@@ -778,6 +1039,8 @@ def find_all(url, dir_save=None):
 
 def flist(fpath, filter="all"):
     all_files = [os.path.join(fpath, f) for f in os.listdir(fpath) if os.path.isfile(os.path.join(fpath, f))]
+    if isinstance(filter, str):
+        filter=[filter]
     if isinstance(filter, list):
         filt_files=[]
         for filter_ in filter:
@@ -837,4 +1100,65 @@ def is_zip(fpath):
     if mime_type == 'application/zip':
         return True
     else:
-        return False
+        return False
+
+def search(query, limit=5, kind='text', output='df',verbose=False,download=False, dir_save=dir_save):
+
+    if 'te' in kind.lower():
+        results = DDGS().text(query, max_results=limit)
+        res=pd.DataFrame(results)
+        res.rename(columns={"href":"links"},inplace=True)
+    if verbose:
+        print(f'searching "{query}": got the results below\n{res}')
+    if download:
+        try:
+            downloader(url=res.links.tolist(), dir_save=dir_save, verbose=verbose)
+        except:
+            if verbose:
+                print(f"failed link")
+    return res
+
+def echo(query, model="gpt", verbose=True, log=True, dir_save=dir_save):
+    def is_in_any(str_candi_short, str_full, ignore_case=True):
+        if isinstance(str_candi_short, str):
+            str_candi_short=[str_candi_short]
+        res_bool=[]
+        if ignore_case:
+            [res_bool.append(i in str_full.lower()) for i in str_candi_short ]
+        else:
+            [res_bool.append(i in str_full) for i in str_candi_short ]
+        return any(res_bool)
+    def valid_mod_name(str_fly):
+        if is_in_any(str_fly, "claude-3-haiku"):
+            return "claude-3-haiku"
+        elif is_in_any(str_fly, "gpt-3.5"):
+            return "gpt-3.5"
+        elif is_in_any(str_fly, "llama-3-70b"):
+            return "llama-3-70b"
+        elif is_in_any(str_fly, "mixtral-8x7b"):
+            return "mixtral-8x7b"
+        else:
+            print(f"not support your model{model}, supported models: 'claude','gpt(default)', 'llama','mixtral'")
+            return "gpt-3.5" # default model
+    model_valid = valid_mod_name(model)
+    res=DDGS().chat(query, model=model_valid)
+    if verbose:
+        pp(res)
+    if log:
+        dt_str=datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S')
+        res_ = f"###{dt_str}\n\n>{res}\n"
+        os.makedirs(dir_save, exist_ok=True)
+        fpath = os.path.join(dir_save, f"log_ai.md")
+        ips.fupdate(fpath=fpath,content=res_)
+        print(f"log file:{fpath}")
+    return res
+
+def chat(*args, **kwargs):
+    if len(args) == 1 and isinstance(args[0], str):
+        kwargs['query'] = args[0]
+    return echo(**kwargs)
+
+def ai(*args, **kwargs):
+    if len(args) == 1 and isinstance(args[0], str):
+        kwargs['query'] = args[0]
+    return echo(**kwargs)
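The new search, echo, chat, and ai helpers wrap duckduckgo_search: search runs a DDGS text search (optionally downloading the hit links via downloader), while echo calls DDGS().chat and can append the reply to a log_ai.md file under dir_save; chat and ai just forward a single positional query to echo. A usage sketch; the query strings are placeholders, not from the diff:

    from py2ls import netfinder

    # DuckDuckGo text search returned as a DataFrame with a 'links' column
    df = netfinder.search("py2ls python package", limit=5, verbose=True)

    # ask a chat model via DDGS().chat; 'gpt' maps to "gpt-3.5" in valid_mod_name
    reply = netfinder.echo("Summarize what web scraping is.", model="gpt",
                           verbose=True, log=False)

    # chat()/ai() are thin aliases that forward a single positional query to echo()
    reply = netfinder.chat("Hello!", log=False)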