py2ls 0.1.8.4__py3-none-any.whl → 0.1.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published. In py2ls/netfinder.py the changes are almost entirely mechanical black-style reformatting (quote normalization, argument wrapping, and blank-line spacing); the main functional change is in fetch(), which now also accepts pre-fetched content in place of a URL string.
py2ls/netfinder.py CHANGED
@@ -33,13 +33,13 @@ from datetime import datetime
  import time
  from py2ls import ips
 
- dir_save='/Users/macjianfeng/Dropbox/Downloads/'
+ dir_save = "/Users/macjianfeng/Dropbox/Downloads/"
  # Set up logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)
  # Suppress WDM INFO logs
- logging.getLogger('WDM').setLevel(logging.WARNING)
- proxies_glob=None
+ logging.getLogger("WDM").setLevel(logging.WARNING)
+ proxies_glob = None
 
  # Define supported content types and corresponding parsers
  CONTENT_PARSERS = {
@@ -49,13 +49,23 @@ CONTENT_PARSERS = {
      "text/plain": lambda text, parser: text.text,
  }
 
- def user_agent(browsers=["chrome", "edge", "firefox", "safari"], platforms=["pc", "tablet"],verbose=False,os=["windows", "macos", "linux"]):
-     ua = UserAgent(browsers=browsers, platforms=platforms,os=os)
+
+ def user_agent(
+     browsers=["chrome", "edge", "firefox", "safari"],
+     platforms=["pc", "tablet"],
+     verbose=False,
+     os=["windows", "macos", "linux"],
+ ):
+     ua = UserAgent(browsers=browsers, platforms=platforms, os=os)
      output_ua = ua.random
      if verbose:
          print(output_ua)
-     return output_ua
- def extract_text_from_content(content, content_type="text/html", where=None, what=None, extend=True, **kwargs):
+     return output_ua
+
+
+ def extract_text_from_content(
+     content, content_type="text/html", where=None, what=None, extend=True, **kwargs
+ ):
      """
      Extracts text from the given content based on the specified content type and search criteria.
 
@@ -70,6 +80,7 @@ def extract_text_from_content(content, content_type="text/html", where=None, wha
      Returns:
      - list: A list of extracted text segments.
      """
+
      def extract_text(element):
          texts = ""
          if isinstance(element, str) and element.strip():
@@ -78,6 +89,7 @@ def extract_text_from_content(content, content_type="text/html", where=None, wha
              for child in element.children:
                  texts += extract_text(child)
          return texts
+
      if content is None:
          logger.error("Content is None, cannot extract text.")
          return []
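
Note: the hunks above are black-style reformatting of user_agent and of the extract_text_from_content signature; behavior is unchanged. A minimal usage sketch of user_agent, assuming the module is importable as py2ls.netfinder (argument values are illustrative):

    from py2ls.netfinder import user_agent

    # pick a random Chrome-on-PC user-agent string and print it
    ua = user_agent(browsers=["chrome"], platforms=["pc"], verbose=True)
    headers = {"User-Agent": ua}  # ready to pass to requests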
@@ -88,39 +100,48 @@ def extract_text_from_content(content, content_type="text/html", where=None, wha
      if "json" in content_type:
          where = None
          return extract_text_from_json(content, where)
-     elif 'text' in content_type:
+     elif "text" in content_type:
          if isinstance(where, list):
-             res=[]
+             res = []
              for where_ in where:
-                 res.extend(extract_text_from_content(content, content_type="text/html", where=where_, what=what, extend=extend, **kwargs))
+                 res.extend(
+                     extract_text_from_content(
+                         content,
+                         content_type="text/html",
+                         where=where_,
+                         what=what,
+                         extend=extend,
+                         **kwargs,
+                     )
+                 )
              return res
          else:
              search_kwargs = {**kwargs}
              # correct 'class_'
              # dict_=dict(class_="gsc_mnd_art_info")
-             if 'class_' in search_kwargs:
-                 search_kwargs["class"]=search_kwargs["class_"]
-                 del search_kwargs['class_']
+             if "class_" in search_kwargs:
+                 search_kwargs["class"] = search_kwargs["class_"]
+                 del search_kwargs["class_"]
              if what:
                  search_kwargs["class"] = what
-             if 'attrs' in kwargs:
+             if "attrs" in kwargs:
                  result_set = content.find_all(where, **search_kwargs)
                  print(f"attrs =>{search_kwargs}")
              else:
                  result_set = content.find_all(where, attrs=dict(**search_kwargs))
                  print(f"{search_kwargs}")
-
+
              if not result_set:
                  print("Failed: check the 'attrs' setting: attrs={'id':'xample'}")
              if extend:
                  texts = ""
                  for tag in result_set:
-                     texts =texts+" "+ extract_text(tag) + " \n"
+                     texts = texts + " " + extract_text(tag) + " \n"
                  text_list = [tx.strip() for tx in texts.split(" \n") if tx.strip()]
                  return text_list
              else:
                  # texts_ = " ".join(tag.get_text() for tag in result_set)
-                 texts_=[]
+                 texts_ = []
                  for tag in result_set:
                      for child in tag.children:
                          if child.name is None:
@@ -130,35 +151,42 @@ def extract_text_from_content(content, content_type="text/html", where=None, wha
                  texts = [tx.strip() for tx in texts_ if tx.strip()]
                  return texts
 
+
  def extract_text_from_json(content, key=None):
      if key:
          if isinstance(content, list):
-             return [str(item.get(key, '')) for item in content if key in item]
+             return [str(item.get(key, "")) for item in content if key in item]
          if isinstance(content, dict):
-             return [str(content.get(key, ''))]
+             return [str(content.get(key, ""))]
      else:
          return [str(value) for key, value in flatten_json(content).items()]
 
+
  def flatten_json(y):
      out = {}
-     def flatten(x, name=''):
+
+     def flatten(x, name=""):
          if isinstance(x, dict):
              for a in x:
-                 flatten(x[a], name + a + '_')
+                 flatten(x[a], name + a + "_")
          elif isinstance(x, list):
              i = 0
              for a in x:
-                 flatten(a, name + str(i) + '_')
+                 flatten(a, name + str(i) + "_")
                  i += 1
          else:
              out[name[:-1]] = x
+
      flatten(y)
      return out
 
+
  def get_proxy():
      list_ = []
      headers = {"User-Agent": user_agent()}
-     response = requests.get("https://free-proxy-list.net", headers=headers,timeout=30,stream=True)
+     response = requests.get(
+         "https://free-proxy-list.net", headers=headers, timeout=30, stream=True
+     )
      content = BeautifulSoup(response.content, "html.parser")
      info = extract_text_from_content(content, where="td", extend=0)[0].split()
      count, pair_proxy = 0, 2
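
Note: flatten_json collapses nested dicts and lists into a flat dict with underscore-joined keys; extract_text_from_json uses it as the no-key fallback. A worked example under the same import assumption:

    from py2ls.netfinder import flatten_json

    nested = {"a": {"b": 1}, "c": [10, 20]}
    print(flatten_json(nested))
    # {'a_b': 1, 'c_0': 10, 'c_1': 20}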
@@ -175,11 +203,18 @@ def get_proxy():
              "https": f"http://" + prox[1],
          }
      return proxies
+
+
  # proxies_glob=get_proxy()
- def get_soup(url, driver='req'):
-     _,soup_=fetch_all(url, driver=driver)
+ def get_soup(url, driver="req"):
+     _, soup_ = fetch_all(url, driver=driver)
      return soup_
- def fetch_all(url, parser="lxml", driver='request', # request or selenium
+
+
+ def fetch_all(
+     url,
+     parser="lxml",
+     driver="request",  # request or selenium
      by=By.TAG_NAME,
      timeout=10,
      retry=2,
@@ -196,7 +231,8 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
      proxy=None,  # Add proxy parameter
      javascript=True,  # Add JavaScript option
      disable_images=False,  # Add option to disable images
-     iframe_name=None): # Add option to handle iframe): # lxml is faster, # parser="html.parser"
+     iframe_name=None,
+ ):  # Add option to handle iframe): # lxml is faster, # parser="html.parser"
      try:
          # # Generate a random user-agent string
          # response = requests.get(url)
@@ -205,21 +241,31 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
          # # get token from cookies
          # scrf_token=re.findall(r'csrf-token=(.*?);', response.headers.get('Set-Cookie'))[0]
          # headers = {"User-Agent": user_agent(), "X-CSRF-Token":scrf_token}
-
+
          headers = {"User-Agent": user_agent()}
-         if 'req' in driver.lower():
-             response = requests.get(url, headers=headers,proxies=proxies_glob,timeout=30,stream=True)
+         if "req" in driver.lower():
+             response = requests.get(
+                 url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
+             )
 
              # If the response is a redirect, follow it
              while response.is_redirect:
                  logger.info(f"Redirecting to: {response.headers['Location']}")
-                 response = requests.get(response.headers["Location"], headers=headers,proxies=proxies_glob,timeout=30,stream=True)
+                 response = requests.get(
+                     response.headers["Location"],
+                     headers=headers,
+                     proxies=proxies_glob,
+                     timeout=30,
+                     stream=True,
+                 )
              # Check for a 403 error
              if response.status_code == 403:
                  logger.warning("403 Forbidden error. Retrying...")
                  # Retry the request after a short delay
                  sleep(random.uniform(1, 3))
-                 response = requests.get(url, headers=headers,proxies=proxies_glob,timeout=30,stream=True)
+                 response = requests.get(
+                     url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
+                 )
                  # Raise an error if retry also fails
                  response.raise_for_status()
 
@@ -227,11 +273,13 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
              response.raise_for_status()
 
              # Get the content type
-             content_type = response.headers.get("content-type", "").split(";")[0].lower()
+             content_type = (
+                 response.headers.get("content-type", "").split(";")[0].lower()
+             )
              if response.encoding:
                  content = response.content.decode(response.encoding)
              else:
-                 content=None
+                 content = None
              # logger.info(f"Content type: {content_type}")
 
              # Check if content type is supported
@@ -240,14 +288,14 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
          else:
              logger.warning("Unsupported content type")
              return None, None
-     elif 'se' in driver.lower():
+     elif "se" in driver.lower():
          chrome_options = Options()
          chrome_options.add_argument("--headless")
          chrome_options.add_argument("--no-sandbox")
          chrome_options.add_argument("--disable-dev-shm-usage")
          chrome_options.add_argument(f"user-agent={user_agent()}")
          if proxy:
-             chrome_options.add_argument(f'--proxy-server={proxy}')
+             chrome_options.add_argument(f"--proxy-server={proxy}")
          if disable_images:
              prefs = {"profile.managed_default_content_settings.images": 2}
              chrome_options.add_experimental_option("prefs", prefs)
@@ -255,9 +303,11 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
          service = Service(ChromeDriverManager().install())
 
          driver_ = webdriver.Chrome(service=service, options=chrome_options)
-
+
          if not javascript:
-             driver_.execute_cdp_cmd("Emulation.setScriptExecutionDisabled", {"value": True})
+             driver_.execute_cdp_cmd(
+                 "Emulation.setScriptExecutionDisabled", {"value": True}
+             )
 
          if login_url:
              driver_.get(login_url)
@@ -272,7 +322,7 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
              ).click()
 
          driver_.get(url)
-
+
          if iframe_name:
              iframe = WebDriverWait(driver_, timeout).until(
                  EC.presence_of_element_located((By.NAME, iframe_name))
@@ -286,14 +336,16 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
          driver_.quit()
 
          content = BeautifulSoup(page_source, "html.parser")
-         if content:
-             return 'text/html', content
+         if content:
+             return "text/html", content
          else:
              logger.warning("Selenium could not fetch content")
              return None, None
      except requests.RequestException as e:
-         logger.error(f"Error fetching URL '{url}': {e}")
-         return None, None
+         logger.error(f"Error fetching URL '{url}': {e}")
+         return None, None
+
+
  # # Function to change Tor IP address
  # def renew_tor_ip():
  #     with Controller.from_port(port=9051) as controller:
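
Note: the fetch_all hunks above only rewrap long calls; the control flow is untouched. A minimal sketch of both driver paths (placeholder URL; the selenium path additionally needs Chrome and webdriver-manager installed):

    from py2ls.netfinder import fetch_all

    # requests-based: returns (content_type, parsed content), e.g. ("text/html", soup)
    content_type, soup = fetch_all("https://example.com", driver="request")

    # selenium-based: renders JavaScript-heavy pages in headless Chrome
    content_type, soup = fetch_all("https://example.com", driver="selenium")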
@@ -324,14 +376,14 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
  # content_type, soup = fetch_all(url,driver=driver)
  # if soup:
  #     base_url = urlparse(url)
-
+
  #     # Extract links from both 'href' and 'src' attributes across relevant tags
  #     tags_with_links = ['a', 'img', 'script', 'link', 'iframe', 'embed','span']
  #     elements = []
  #     for tag in tags_with_links:
  #         elements.extend(soup.find_all(tag, href=True))
  #         elements.extend(soup.find_all(tag, src=True))
-
+
  #     for element in elements:
  #         link_href = element.get('href') or element.get('src')
  #         if link_href:
@@ -339,39 +391,40 @@ def fetch_all(url, parser="lxml", driver='request', # request or selenium
  #                 link_href = "http:" + link_href
  #             elif not link_href.startswith(("http", "https")):
  #                 link_href = urljoin(base_url.geturl(), link_href)
-
+
  #             if all(exclusion not in link_href for exclusion in cond_ex):
  #                 links_href.append(link_href)
-
+
  #     return list(set(links_href))  # Remove duplicates
 
+
  # elif url.split('.')[-1] in ['pdf']:
  #     return url
  # else:
  #     return None
- def find_links(url, driver='request', booster=False):
+ def find_links(url, driver="request", booster=False):
      links_href, cond_ex = [], ["javascript:", "mailto:", "tel:", "fax:"]
      content_type, soup = fetch_all(url, driver=driver)
-
-     if soup and content_type=='text/html':
+
+     if soup and content_type == "text/html":
          base_url = urlparse(url)
-
+
          # Extract links from all tags with 'href' and 'src' attributes
          elements = soup.find_all(True, href=True) + soup.find_all(True, src=True)
-
+
          for element in elements:
-             link_href = element.get('href') or element.get('src')
+             link_href = element.get("href") or element.get("src")
              if link_href:
                  if link_href.startswith("//"):
                      link_href = "http:" + link_href
                  elif not link_href.startswith(("http", "https")):
                      link_href = urljoin(base_url.geturl(), link_href)
-
+
                  if all(exclusion not in link_href for exclusion in cond_ex):
                      links_href.append(link_href)
-
+
          unique_links = list(set(links_href))  # Remove duplicates
-
+
          if booster:
              for link in unique_links:
                  if link != url:  # Avoid infinite recursion
@@ -379,27 +432,27 @@ def find_links(url, driver='request', booster=False):
                      if sub_links:
                          links_href.extend(sub_links)
                          links_href = list(set(links_href))  # Remove duplicates again
-
+
          return links_href
 
-     elif url.split('.')[-1] in ['pdf']:
+     elif url.split(".")[-1] in ["pdf"]:
          return [url]
      else:
          return None
-
+
 
  # To determine which links are related to target domains(e.g., pages) you are interested in
- def filter_links(links, contains="html",driver='requ', booster=False):
+ def filter_links(links, contains="html", driver="requ", booster=False):
      filtered_links = []
      if isinstance(contains, str):
          contains = [contains]
-     if isinstance(links,str):
-         links=find_links(links,driver=driver,booster=booster)
+     if isinstance(links, str):
+         links = find_links(links, driver=driver, booster=booster)
      for link in links:
-         parsed_link = urlparse(link)
-         condition = (all([i in link for i in contains])
-                      and "javascript:" not in parsed_link
-                      )
+         parsed_link = urlparse(link)
+         condition = (
+             all([i in link for i in contains]) and "javascript:" not in parsed_link
+         )
          if condition:
              filtered_links.append(link)
      return filtered_links
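
Note: a usage sketch of the two link helpers reformatted above (placeholder URL; filter_links keeps only links that contain every substring in contains):

    from py2ls.netfinder import find_links, filter_links

    links = find_links("https://example.com", driver="request")  # absolute, de-duplicated
    html_links = filter_links(links, contains="html")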
@@ -421,31 +474,33 @@ def find_domain(links):
          return None
 
 
- def pdf_detector(url, contains = None, dir_save = None, booster = False):
+ def pdf_detector(url, contains=None, dir_save=None, booster=False):
      print("usage: pdf_detector(url, dir_save, booster=True")
+
      def fname_pdf_corr(fname):
-         if fname[-4:]!='.pdf':
-             fname = fname[:-4] + '.pdf'
+         if fname[-4:] != ".pdf":
+             fname = fname[:-4] + ".pdf"
          return fname
+
      if isinstance(contains, str):
          contains = [contains]
-     if isinstance(url,str):
-         if '.pdf' in url:
-             pdf_links=url
+     if isinstance(url, str):
+         if ".pdf" in url:
+             pdf_links = url
          else:
              if booster:
-                 links_all=[]
-                 if 'http' in url and url:
-                     [links_all.append(i) for i in find_links(url) if 'http' in i]
+                 links_all = []
+                 if "http" in url and url:
+                     [links_all.append(i) for i in find_links(url) if "http" in i]
                  print(links_all)
              else:
-                 links_all=url
+                 links_all = url
              if contains is not None:
                  pdf_links = filter_links(links=links_all, contains=[".pdf"] + contains)
              else:
                  pdf_links = filter_links(links=links_all, contains=[".pdf"])
-     elif isinstance(url,list):
-         links_all=url
+     elif isinstance(url, list):
+         links_all = url
          if contains is not None:
              pdf_links = filter_links(links=links_all, contains=["pdf"] + contains)
          else:
@@ -460,7 +515,7 @@ def pdf_detector(url, contains = None, dir_save = None, booster = False):
      if pdf_links:
          pp(f"pdf detected{pdf_links}")
      else:
-         print('no pdf file')
+         print("no pdf file")
      if dir_save:
          print("... is trying to download to local")
          fnames = [pdf_link_.split("/")[-1] for pdf_link_ in pdf_links]
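
Note: a usage sketch of pdf_detector (placeholder URL and folder; when dir_save is given, detected PDFs are downloaded there):

    from py2ls.netfinder import pdf_detector

    pdf_detector("https://example.com/papers", dir_save="downloads/", booster=True)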
@@ -477,40 +532,67 @@ def pdf_detector(url, contains = None, dir_save = None, booster = False):
              else:
                  print("Failed to download PDF:", response.status_code)
              idx += 1
-         print(f'{len(fnames)} files are downloaded:\n{fnames}\n to local: \n{dir_save}')
+         print(f"{len(fnames)} files are downloaded:\n{fnames}\n to local: \n{dir_save}")
 
- def downloader(url, dir_save=dir_save, kind=['.pdf'], contains=None, rm_folder=False, booster=False,verbose=True, timeout=30, n_try=3,timestamp=False):
+
+ def downloader(
+     url,
+     dir_save=dir_save,
+     kind=[".pdf"],
+     contains=None,
+     rm_folder=False,
+     booster=False,
+     verbose=True,
+     timeout=30,
+     n_try=3,
+     timestamp=False,
+ ):
      if verbose:
-         print("usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)")
+         print(
+             "usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)"
+         )
+
      def fname_corrector(fname, ext):
          if not ext.startswith("."):
-             ext="."+ext
-         if not fname.endswith("ext"):#if not ext in fname:
-             fname = fname[:-len(ext)] + ext
+             ext = "." + ext
+         if not fname.endswith("ext"):  # if not ext in fname:
+             fname = fname[: -len(ext)] + ext
          return fname
+
      def check_and_modify_filename(directory, filename):
          base, ext = os.path.splitext(filename)
          counter = 1
          new_filename = filename
          while os.path.exists(os.path.join(directory, new_filename)):
-             if counter<=9:
-                 counter_='0'+str(counter)
+             if counter <= 9:
+                 counter_ = "0" + str(counter)
              else:
-                 counter_=str(counter)
+                 counter_ = str(counter)
              new_filename = f"{base}_{counter_}{ext}"
              counter += 1
          return new_filename
-     fpath_tmp, corrected_fname=None, None
-     if not isinstance(kind,list):
-         kind=[kind]
+
+     fpath_tmp, corrected_fname = None, None
+     if not isinstance(kind, list):
+         kind = [kind]
      if isinstance(url, list):
          for url_ in url:
-             downloader(url_, dir_save=dir_save, kind=kind, contains=contains, booster=booster,verbose=verbose,timeout=timeout,n_try=n_try,timestamp=timestamp)
+             downloader(
+                 url_,
+                 dir_save=dir_save,
+                 kind=kind,
+                 contains=contains,
+                 booster=booster,
+                 verbose=verbose,
+                 timeout=timeout,
+                 n_try=n_try,
+                 timestamp=timestamp,
+             )
              # sleep(random.uniform(1, 3))
-     for i,k in enumerate(kind):
-         if not k.startswith('.'):
-             kind[i]='.'+kind[i]
-     file_links_all=[]
+     for i, k in enumerate(kind):
+         if not k.startswith("."):
+             kind[i] = "." + kind[i]
+     file_links_all = []
      for kind_ in kind:
          print(kind_)
          if isinstance(contains, str):
@@ -521,16 +603,16 @@ def downloader(url, dir_save=dir_save, kind=['.pdf'], contains=None, rm_folder=F
          else:
              if booster:
                  links_all = []
-                 if 'http' in url:
-                     links_all = find_links(url)
+                 if "http" in url:
+                     links_all = find_links(url)
              else:
                  links_all = url
              if contains is not None:
                  file_links = filter_links(links_all, contains=contains + kind_)
              else:
-                 file_links = links_all#filter_links(links_all, contains=kind_)
+                 file_links = links_all  # filter_links(links_all, contains=kind_)
          elif isinstance(url, list):
-             links_all = url
+             links_all = url
          if contains is not None:
              file_links = filter_links(links_all, contains=contains + kind_)
          else:
@@ -540,14 +622,14 @@ def downloader(url, dir_save=dir_save, kind=['.pdf'], contains=None, rm_folder=F
          if contains is not None:
              file_links = filter_links(links_all, contains=contains + kind_)
          else:
-             file_links = filter_links(links_all, contains=kind_)
+             file_links = filter_links(links_all, contains=kind_)
          if verbose:
              if file_links:
                  print("Files detected:")
                  pp(file_links)
              else:
-                 file_links=[]
-                 print('No files detected')
+                 file_links = []
+                 print("No files detected")
          file_links_all.extend(file_links)
      if dir_save:
          if rm_folder:
@@ -557,18 +639,27 @@ def downloader(url, dir_save=dir_save, kind=['.pdf'], contains=None, rm_folder=F
          fnames = [file_link.split("/")[-1] for file_link in file_links_all]
          for idx, file_link in enumerate(file_links_all):
              headers = {"User-Agent": user_agent()}
-             itry = 0 # Retry logic with exception handling
+             itry = 0  # Retry logic with exception handling
              while itry < n_try:
                  try:
                      # streaming to handle large files and reduce memory usage.
-                     response = requests.get(file_link, headers=headers, timeout=timeout, stream=True)
+                     response = requests.get(
+                         file_link, headers=headers, timeout=timeout, stream=True
+                     )
                      if response.status_code == 200:
-                         ext = next((ftype for ftype in kind if ftype in file_link), None)
+                         ext = next(
+                             (ftype for ftype in kind if ftype in file_link), None
+                         )
                          if ext:
                              corrected_fname = fname_corrector(fnames[idx], ext)
-                             corrected_fname = check_and_modify_filename(dir_save, corrected_fname)
+                             corrected_fname = check_and_modify_filename(
+                                 dir_save, corrected_fname
+                             )
                              if timestamp:
-                                 corrected_fname=datetime.now().strftime("%y%m%d_%H%M%S_")+corrected_fname
+                                 corrected_fname = (
+                                     datetime.now().strftime("%y%m%d_%H%M%S_")
+                                     + corrected_fname
+                                 )
                              fpath_tmp = os.path.join(dir_save, corrected_fname)
                              with open(fpath_tmp, "wb") as file:
                                  for chunk in response.iter_content(chunk_size=8192):
@@ -582,26 +673,29 @@ def downloader(url, dir_save=dir_save, kind=['.pdf'], contains=None, rm_folder=F
                          break  # Exit the retry loop if successful
                      else:
                          if verbose:
-                             print(f"Failed to download file: HTTP status code {response.status_code}")
+                             print(
+                                 f"Failed to download file: HTTP status code {response.status_code}"
+                             )
                  except (ChunkedEncodingError, ConnectionError) as e:
                      print(f"Attempt {itry+1} failed: {e}. Retrying in a few seconds...")
                      # time.sleep(random.uniform(0, 2)) # Random sleep to mitigate server issues
                      if fpath_tmp and os.path.exists(fpath_tmp):
-                         os.remove(fpath_tmp)
+                         os.remove(fpath_tmp)
                      itry += 1
 
              if itry == n_try:
                  print(f"Failed to download {file_link} after {n_try} attempts.")
 
-         print(f'\n{len(fnames)} files were downloaded:')
+         print(f"\n{len(fnames)} files were downloaded:")
          if verbose:
              if corrected_fname:
-                 pp(corrected_fname)
-             else:
+                 pp(corrected_fname)
+             else:
                  pp(fnames)
          print(f"\n\nsaved @:\n{dir_save}")
-
- def find_img(url, driver='request',dir_save="images", rm_folder=False, verbose=True):
+
+
+ def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=True):
      """
      Save images referenced in HTML content locally.
      Args:
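
Note: the hunks above reflow downloader without changing its retry or filename-deduplication logic. A usage sketch (placeholder URL and folder):

    from py2ls.netfinder import downloader

    downloader(
        "https://example.com/reports",
        dir_save="downloads/",
        kind=[".pdf", ".xls"],  # extensions are normalized to start with "."
        n_try=3,                # per-file retries on connection errors
        timestamp=True,         # prefix saved names with yymmdd_HHMMSS_
    )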
@@ -614,7 +708,7 @@ def find_img(url, driver='request',dir_save="images", rm_folder=False, verbose=T
      """
      if rm_folder:
          ips.rm_folder(dir_save)
-     content_type, content = fetch_all(url,driver=driver)
+     content_type, content = fetch_all(url, driver=driver)
      print(content_type)
      if "html" in content_type.lower():
          # Create the directory if it doesn't exist
@@ -626,7 +720,7 @@ def find_img(url, driver='request',dir_save="images", rm_folder=False, verbose=T
          # Extracting images
          images = content.find_all("img", src=True)
          if not images:
-             content_type, content = fetch_all(url,driver='selenium')
+             content_type, content = fetch_all(url, driver="selenium")
              images = content.find_all("img", src=True)
          for i, image in enumerate(images):
              try:
@@ -658,7 +752,9 @@ def find_img(url, driver='request',dir_save="images", rm_folder=False, verbose=T
                  parsed_url = urlparse(absolute_image_url)
                  image_extension = os.path.splitext(parsed_url.path)[1]
                  # Download the image
-                 image_response = requests.get(absolute_image_url,proxies=proxies_glob)
+                 image_response = requests.get(
+                     absolute_image_url, proxies=proxies_glob
+                 )
                  # Save the image to a file
                  image_filename = os.path.join(
                      dir_save, f"image_{i}{image_extension}"
@@ -671,18 +767,21 @@ def find_img(url, driver='request',dir_save="images", rm_folder=False, verbose=T
                  print(f"Failed to process image {image_url}: {e}")
      print(f"images were saved at\n{dir_save}")
      if verbose:
-         display_thumbnail_figure(flist(dir_save,kind='img'),dpi=100)
+         display_thumbnail_figure(flist(dir_save, kind="img"), dpi=100)
      return content
 
+
  def svg_to_png(svg_file):
      with WandImage(filename=svg_file, resolution=300) as img:
-         img.format = 'png'
+         img.format = "png"
          png_image = img.make_blob()
          return Image.open(io.BytesIO(png_image))
 
- def display_thumbnail_figure(dir_img_list,figsize=(10,10),dpi=100):
+
+ def display_thumbnail_figure(dir_img_list, figsize=(10, 10), dpi=100):
      import matplotlib.pyplot as plt
      from PIL import Image
+
      """
      Display a thumbnail figure of all images in the specified directory.
      Args:
@@ -692,13 +791,13 @@ def display_thumbnail_figure(dir_img_list,figsize=(10,10),dpi=100):
      if num_images == 0:
          print("No images found to display.")
          return
-     grid_size = int(num_images ** 0.5) + 1
-     fig, axs = plt.subplots(grid_size, grid_size, figsize=figsize,dpi=dpi)
+     grid_size = int(num_images**0.5) + 1
+     fig, axs = plt.subplots(grid_size, grid_size, figsize=figsize, dpi=dpi)
      for ax, image_file in zip(axs.flatten(), dir_img_list):
          try:
              img = Image.open(image_file)
              ax.imshow(img)
-             ax.axis('off') # Hide axes
+             ax.axis("off")  # Hide axes
          except:
              continue
      try:
@@ -708,6 +807,7 @@ def display_thumbnail_figure(dir_img_list,figsize=(10,10),dpi=100):
      except:
          pass
 
+
  def content_div_class(content, div="div", div_class="highlight"):
      texts = [div.text for div in content.find_all(div, class_=div_class)]
      return texts
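
Note: a usage sketch of find_img, which falls back to the selenium driver when the static HTML contains no img tags (placeholder URL; verbose=True also renders the thumbnail grid via display_thumbnail_figure):

    from py2ls.netfinder import find_img

    # saves image_0.png, image_1.jpg, ... under dir_save
    find_img("https://example.com/gallery", dir_save="images", verbose=True)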
@@ -735,7 +835,7 @@ def fetch_selenium(
      javascript=True,  # Add JavaScript option
      disable_images=False,  # Add option to disable images
      iframe_name=None,  # Add option to handle iframe
-     **kwargs
+     **kwargs,
  ):
      chrome_options = Options()
      chrome_options.add_argument("--headless")
@@ -743,7 +843,7 @@ def fetch_selenium(
      chrome_options.add_argument("--disable-dev-shm-usage")
      chrome_options.add_argument(f"user-agent={user_agent()}")
      if proxy:
-         chrome_options.add_argument(f'--proxy-server={proxy}')
+         chrome_options.add_argument(f"--proxy-server={proxy}")
      if disable_images:
          prefs = {"profile.managed_default_content_settings.images": 2}
          chrome_options.add_experimental_option("prefs", prefs)
@@ -752,9 +852,11 @@ def fetch_selenium(
      for attempt in range(retry):
          try:
              driver = webdriver.Chrome(service=service, options=chrome_options)
-
+
              if not javascript:
-                 driver.execute_cdp_cmd("Emulation.setScriptExecutionDisabled", {"value": True})
+                 driver.execute_cdp_cmd(
+                     "Emulation.setScriptExecutionDisabled", {"value": True}
+                 )
 
              if login_url:
                  driver.get(login_url)
@@ -769,7 +871,7 @@ def fetch_selenium(
                  ).click()
 
              driver.get(url)
-
+
              if iframe_name:
                  iframe = WebDriverWait(driver, timeout).until(
                      EC.presence_of_element_located((By.NAME, iframe_name))
@@ -783,7 +885,9 @@ def fetch_selenium(
              driver.quit()
 
              content = BeautifulSoup(page_source, "html.parser")
-             texts = extract_text_from_content(content, where=where, what=what, extend=extend, **kwargs)
+             texts = extract_text_from_content(
+                 content, where=where, what=what, extend=extend, **kwargs
+             )
              return texts
          except Exception as e:
              # logger.error(f"Attempt {attempt + 1} failed with error ")
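
Note: a usage sketch of fetch_selenium (placeholder URL and class name; requires headless Chrome):

    from py2ls.netfinder import fetch_selenium

    # render the page, then pull text from <div class="post"> elements
    texts = fetch_selenium(url="https://example.com/blog", where="div", what="post")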
@@ -797,33 +901,69 @@ def fetch_selenium(
      return []
 
 
- def fetch(url, where="div", driver='request',what=None, extend=True, booster=False,retry=2,verbose=False, output="text", **kws):
+ def fetch(
+     url,
+     where="div",
+     driver="request",
+     what=None,
+     extend=True,
+     booster=False,
+     retry=2,
+     verbose=False,
+     output="text",
+     **kws,
+ ):
      print(f"output is {output}")
-     if 'xt' in output.lower():
+     if "xt" in output.lower():
          for attempt in range(retry):
-             if verbose and attempt==0:
+             if verbose and attempt == 0:
                  xample = 'fetch(url,where="div",what=None,extend=True,by=By.TAG_NAME,timeout=10,retry=3,login_url=None,username=None,password=None,username_field="username",password_field="password",submit_field="submit",username_by=By.NAME,password_by=By.NAME,submit_by=By.NAME)'
                  print(xample)
-             content_type, content = fetch_all(url, parser="html.parser",driver=driver)
-             texts=extract_text_from_content(content,content_type=content_type,where=where,what=what,extend=extend, **kws)
-             if isinstance(texts, pd.core.frame.DataFrame):
+             if isinstance(url, str):
+                 content_type, content = fetch_all(
+                     url, parser="html.parser", driver=driver
+                 )
+             else:
+                 content_type, content = "text/html", url
+             texts = extract_text_from_content(
+                 content,
+                 content_type=content_type,
+                 where=where,
+                 what=what,
+                 extend=extend,
+                 **kws,
+             )
+             if isinstance(texts, pd.core.frame.DataFrame):
                  if not texts.empty:
                      break
-             else:
+             else:
                  if texts:
                      break
              sleep(random.uniform(0.5, 1.5))
-         if isinstance(texts,pd.core.frame.DataFrame):
-             condition_=[texts.empty, booster]
+         if isinstance(texts, pd.core.frame.DataFrame):
+             condition_ = [texts.empty, booster]
          else:
-             condition_=[not texts, booster]
+             condition_ = [not texts, booster]
          if any(condition_):
              print("trying to use 'fetcher2'...")
-             texts = fetch_selenium(url=url, where=where, what=what, extend=extend, **kws)
+             texts = fetch_selenium(
+                 url=url, where=where, what=what, extend=extend, **kws
+             )
          if texts:
              return texts
          else:
-             return fetch(url, where=where, driver=driver,what=what, extend=extend, booster=booster,retry=retry,verbose=verbose, output="soup", **kws)
+             return fetch(
+                 url,
+                 where=where,
+                 driver=driver,
+                 what=what,
+                 extend=extend,
+                 booster=booster,
+                 retry=retry,
+                 verbose=verbose,
+                 output="soup",
+                 **kws,
+             )
      elif "url" in output.lower():
          base_url = urlparse(url)
          if verbose:
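
Note: the hunk above carries the one clear functional change in the file: fetch now checks isinstance(url, str) and, for non-string input, treats url as already-parsed "text/html" content instead of fetching it again. Both call styles, sketched with a placeholder URL:

    from py2ls.netfinder import fetch, fetch_all

    texts = fetch("https://example.com", where="div")  # fetch + extract in one step

    # new in 0.1.8.5: reuse content you already fetched; no second network round-trip
    _, soup = fetch_all("https://example.com", driver="request")
    texts = fetch(soup, where="div")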
@@ -831,15 +971,15 @@ def fetch(url, where="div", driver='request',what=None, extend=True, booster=Fal
          return base_url.geturl()
      else:
          try:
-             content_type, content = fetch_all(url, parser="html.parser",driver=driver)
+             content_type, content = fetch_all(url, parser="html.parser", driver=driver)
              search_kwargs = {**kws}
              print(search_kwargs)
-             if 'class_' in search_kwargs:
-                 search_kwargs["class"]=search_kwargs["class_"]
-                 del search_kwargs['class_']
+             if "class_" in search_kwargs:
+                 search_kwargs["class"] = search_kwargs["class_"]
+                 del search_kwargs["class_"]
              if what:
                  search_kwargs["class"] = what
-             if 'attrs' in kws:
+             if "attrs" in kws:
                  result_set = content.find_all(where, **search_kwargs)
                  print(f"attrs =>{search_kwargs}")
              else:
@@ -853,26 +993,27 @@ def fetch(url, where="div", driver='request',what=None, extend=True, booster=Fal
 
  def extract_from_content(content, where="div", what=None):
      if what is None:
-         result_set = content.find_all(where,recursive=True)
+         result_set = content.find_all(where, recursive=True)
          texts_ = " ".join(tag.get_text() + "\n" for tag in result_set)
          texts = [tx for tx in texts_.split("\n") if tx]
      else:
          texts_ = " ".join(
-             div.get_text() + "\n" for div in content.find_all(where, class_=what,recursive=True)
+             div.get_text() + "\n"
+             for div in content.find_all(where, class_=what, recursive=True)
          )
          texts = [tx for tx in texts_.split("\n") if tx]
      return texts
 
 
- def find_forms(url, driver='requ'):
-     content_type, content = fetch_all(url,driver=driver)
+ def find_forms(url, driver="requ"):
+     content_type, content = fetch_all(url, driver=driver)
      df = pd.DataFrame()
      # Extracting forms and inputs
-     forms = content.find_all("form",recursive=True)
+     forms = content.find_all("form", recursive=True)
      form_data = []
      for form in forms:
          if form:
-             form_inputs = form.find_all("input",recursive=True)
+             form_inputs = form.find_all("input", recursive=True)
              input_data = {}
              for input_tag in form_inputs:
                  input_type = input_tag.get("type")
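
Note: a usage sketch of find_forms, which collects each form's input attributes into a DataFrame (placeholder URL; the return value is assumed from the visible df construction, as the function tail is outside this diff):

    from py2ls.netfinder import find_forms

    df = find_forms("https://example.com/login")  # assumed: one row of input metadata per form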
@@ -891,24 +1032,30 @@ def clean_string(value):
      return value
 
 
- def find_all(url, dir_save=None, driver='req'):
-     content_type, content = fetch_all(url,driver=driver)
+ def find_all(url, dir_save=None, driver="req"):
+     content_type, content = fetch_all(url, driver=driver)
      paragraphs_text = extract_from_content(content, where="p")
      # Extracting specific elements by class
      specific_elements_text = [
-         element.text for element in content.find_all(class_="specific-class",recursive=True) if element
+         element.text
+         for element in content.find_all(class_="specific-class", recursive=True)
+         if element
      ]
      # Extracting links (anchor tags)
      links_href = find_links(url)
      links_href = filter_links(links_href)
 
      # Extracting images
-     images_src = [image["src"] for image in content.find_all("img", src=True,recursive=True) if image]
+     images_src = [
+         image["src"]
+         for image in content.find_all("img", src=True, recursive=True)
+         if image
+     ]
 
      # Extracting headings (h1, h2, h3, etc.)
      headings = [f"h{i}" for i in range(1, 7)]
      headings_text = {
-         heading: [tag.text for tag in content.find_all(heading,recursive=True)]
+         heading: [tag.text for tag in content.find_all(heading, recursive=True)]
          for heading in headings
          if heading
      }
@@ -916,15 +1063,15 @@ def find_all(url, dir_save=None, driver='req'):
      # Extracting lists (ul, ol, li)
      list_items_text = [
          item.text
-         for list_ in content.find_all(["ul", "ol"],recursive=True)
-         for item in list_.find_all("li",recursive=True)
+         for list_ in content.find_all(["ul", "ol"], recursive=True)
+         for item in list_.find_all("li", recursive=True)
          if item
      ]
 
      # Extracting tables (table, tr, td)
      table_cells_text = [
          cell.text
-         for table in content.find_all("table",recursive=True)
+         for table in content.find_all("table", recursive=True)
          for row in table.find_all("tr")
          for cell in row.find_all("td")
          if cell
@@ -933,10 +1080,12 @@ def find_all(url, dir_save=None, driver='req'):
      # Extracting other elements
      divs_content = extract_from_content(content, where="div")
      headers_footer_content = [
-         tag.text for tag in content.find_all(["header", "footer"],recursive=True) if tag
+         tag.text
+         for tag in content.find_all(["header", "footer"], recursive=True)
+         if tag
      ]
      meta_tags_content = [
-         (tag.name, tag.attrs) for tag in content.find_all("meta",recursive=True) if tag
+         (tag.name, tag.attrs) for tag in content.find_all("meta", recursive=True) if tag
      ]
      spans_content = extract_from_content(content, where="span")
      bold_text_content = extract_from_content(content, where="b")
@@ -996,15 +1145,21 @@ def find_all(url, dir_save=None, driver='req'):
      script_texts = content_div_class(content, div="div", div_class="highlight")
      lists_to_fill.append(script_texts)
 
-     audio_src = [audio["src"] for audio in content.find_all("audio", src=True,recursive=True)]
-     video_src = [video["src"] for video in content.find_all("video", src=True,recursive=True)]
-     iframe_src = [iframe["src"] for iframe in content.find_all("iframe", src=True,recursive=True)]
+     audio_src = [
+         audio["src"] for audio in content.find_all("audio", src=True, recursive=True)
+     ]
+     video_src = [
+         video["src"] for video in content.find_all("video", src=True, recursive=True)
+     ]
+     iframe_src = [
+         iframe["src"] for iframe in content.find_all("iframe", src=True, recursive=True)
+     ]
      lists_to_fill.extend([audio_src, video_src, iframe_src])
 
      rss_links = [
          link["href"]
          for link in content.find_all(
-             "link", type=["application/rss+xml", "application/atom+xml"],recursive=True
+             "link", type=["application/rss+xml", "application/atom+xml"], recursive=True
          )
      ]
      lists_to_fill.append(rss_links)
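
Note: find_all aggregates many of the helpers above (paragraphs, links, images, headings, lists, tables, media sources, RSS links) in one pass. A usage sketch (placeholder URL; what it returns or saves lies outside the visible hunks, so treat the assignment as an assumption):

    from py2ls.netfinder import find_all

    results = find_all("https://example.com", dir_save=None, driver="req")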
@@ -1074,12 +1229,16 @@ def find_all(url, dir_save=None, driver='req'):
 
 
  def flist(fpath, kind="all"):
-     all_files = [os.path.join(fpath, f) for f in os.listdir(fpath) if os.path.isfile(os.path.join(fpath, f))]
-     if kind == "all" or 'all' in kind:
+     all_files = [
+         os.path.join(fpath, f)
+         for f in os.listdir(fpath)
+         if os.path.isfile(os.path.join(fpath, f))
+     ]
+     if kind == "all" or "all" in kind:
          return all_files
      if isinstance(kind, str):
-         kind=[kind]
-     filt_files=[]
+         kind = [kind]
+     filt_files = []
      for f in all_files:
          for kind_ in kind:
              if isa(f, kind_):
@@ -1087,7 +1246,8 @@ def flist(fpath, kind="all"):
                  break
      return filt_files
 
- def isa(fpath, kind='img'):
+
+ def isa(fpath, kind="img"):
      """
      kinds file paths based on the specified kind.
      Args:
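
Note: a usage sketch of the MIME-type helpers (placeholder paths):

    from py2ls.netfinder import flist, isa

    images = flist("downloads/", kind="img")          # files whose guessed MIME type is image/*
    is_doc = isa("downloads/report.pdf", kind="doc")  # True for text/PDF/Office MIME types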
@@ -1097,51 +1257,66 @@ def isa(fpath, kind='img'):
      Returns:
      bool: True if the file matches the kind, False otherwise.
      """
-     if 'img' in kind.lower():
+     if "img" in kind.lower():
          return is_image(fpath)
-     elif 'doc' in kind.lower():
+     elif "doc" in kind.lower():
          return is_document(fpath)
-     elif 'zip' in kind.lower():
+     elif "zip" in kind.lower():
          return is_zip(fpath)
      else:
          return False
 
+
  def is_image(fpath):
      mime_type, _ = mimetypes.guess_type(fpath)
-     if mime_type and mime_type.startswith('image'):
+     if mime_type and mime_type.startswith("image"):
          return True
      else:
          return False
 
+
  def is_document(fpath):
      mime_type, _ = mimetypes.guess_type(fpath)
      if mime_type and (
-         mime_type.startswith('text/') or
-         mime_type == 'application/pdf' or
-         mime_type == 'application/msword' or
-         mime_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' or
-         mime_type == 'application/vnd.ms-excel' or
-         mime_type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' or
-         mime_type == 'application/vnd.ms-powerpoint' or
-         mime_type == 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
+         mime_type.startswith("text/")
+         or mime_type == "application/pdf"
+         or mime_type == "application/msword"
+         or mime_type
+         == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+         or mime_type == "application/vnd.ms-excel"
+         or mime_type
+         == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+         or mime_type == "application/vnd.ms-powerpoint"
+         or mime_type
+         == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
      ):
          return True
      else:
          return False
 
+
  def is_zip(fpath):
      mime_type, _ = mimetypes.guess_type(fpath)
-     if mime_type == 'application/zip':
+     if mime_type == "application/zip":
          return True
      else:
          return False
 
- def search(query, limit=5, kind='text', output='df',verbose=False,download=False, dir_save=dir_save):
 
-     if 'te' in kind.lower():
+ def search(
+     query,
+     limit=5,
+     kind="text",
+     output="df",
+     verbose=False,
+     download=False,
+     dir_save=dir_save,
+ ):
+
+     if "te" in kind.lower():
          results = DDGS().text(query, max_results=limit)
-         res=pd.DataFrame(results)
-         res.rename(columns={"href":"links"},inplace=True)
+         res = pd.DataFrame(results)
+         res.rename(columns={"href": "links"}, inplace=True)
          if verbose:
              print(f'searching "{query}": got the results below\n{res}')
          if download:
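
Note: a usage sketch of search, which wraps DuckDuckGo text search and renames the href column to links (query is illustrative; requires the duckduckgo_search dependency):

    from py2ls.netfinder import search

    res = search("py2ls netfinder", limit=5)  # pandas DataFrame of results
    print(res.links.tolist())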
@@ -1152,16 +1327,18 @@ def search(query, limit=5, kind='text', output='df',verbose=False,download=False
                  print(f"failed link")
          return res
 
+
  def echo(query, model="gpt", verbose=True, log=True, dir_save=dir_save):
      def is_in_any(str_candi_short, str_full, ignore_case=True):
          if isinstance(str_candi_short, str):
-             str_candi_short=[str_candi_short]
-         res_bool=[]
+             str_candi_short = [str_candi_short]
+         res_bool = []
          if ignore_case:
-             [res_bool.append(i in str_full.lower()) for i in str_candi_short ]
+             [res_bool.append(i in str_full.lower()) for i in str_candi_short]
          else:
-             [res_bool.append(i in str_full) for i in str_candi_short ]
+             [res_bool.append(i in str_full) for i in str_candi_short]
          return any(res_bool)
+
      def valid_mod_name(str_fly):
          if is_in_any(str_fly, "claude-3-haiku"):
              return "claude-3-haiku"
@@ -1172,27 +1349,32 @@ def echo(query, model="gpt", verbose=True, log=True, dir_save=dir_save):
          elif is_in_any(str_fly, "mixtral-8x7b"):
              return "mixtral-8x7b"
          else:
-             print(f"not support your model{model}, supported models: 'claude','gpt(default)', 'llama','mixtral'")
-             return "gpt-3.5" # default model
+             print(
+                 f"not support your model{model}, supported models: 'claude','gpt(default)', 'llama','mixtral'"
+             )
+             return "gpt-3.5"  # default model
+
      model_valid = valid_mod_name(model)
-     res=DDGS().chat(query, model=model_valid)
+     res = DDGS().chat(query, model=model_valid)
      if verbose:
          pp(res)
      if log:
-         dt_str=datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S')
+         dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
          res_ = f"###{dt_str}\n\n>{res}\n"
          os.makedirs(dir_save, exist_ok=True)
          fpath = os.path.join(dir_save, f"log_ai.md")
-         ips.fupdate(fpath=fpath,content=res_)
+         ips.fupdate(fpath=fpath, content=res_)
          print(f"log file:{fpath}")
      return res
 
+
  def chat(*args, **kwargs):
      if len(args) == 1 and isinstance(args[0], str):
-         kwargs['query'] = args[0]
+         kwargs["query"] = args[0]
      return echo(**kwargs)
 
+
  def ai(*args, **kwargs):
      if len(args) == 1 and isinstance(args[0], str):
-         kwargs['query'] = args[0]
-         return echo(**kwargs)
+         kwargs["query"] = args[0]
+     return echo(**kwargs)
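
Note: chat and ai are thin wrappers that forward a single positional string to echo as query=. A usage sketch (requires the duckduckgo_search DDGS chat backend; model names are mapped via valid_mod_name):

    from py2ls.netfinder import chat, echo

    reply = echo("What does BeautifulSoup do?", model="gpt", log=False)
    reply = chat("Same question, positional style.")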