py2ls 0.1.4.6__py3-none-any.whl → 0.1.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/netfinder.py CHANGED
@@ -1,5 +1,6 @@
  from bs4 import BeautifulSoup
  import requests
+ from requests.utils import dict_from_cookiejar
  import os
  from urllib.parse import urlparse, urljoin
  import base64
@@ -26,7 +27,12 @@ import mimetypes
  import io
  import matplotlib.pyplot as plt
  from PIL import Image
+ from duckduckgo_search import DDGS
+ from datetime import datetime
+ import time
+ from py2ls import ips

+ dir_save='/Users/macjianfeng/Dropbox/Downloads/'
  # Set up logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)
@@ -63,6 +69,14 @@ def extract_text_from_content(content, content_type="text/html", where=None, wha
      Returns:
      - list: A list of extracted text segments.
      """
+     def extract_text(element):
+         texts = ""
+         if isinstance(element, str) and element.strip():
+             texts += element.strip()
+         elif hasattr(element, "children"):
+             for child in element.children:
+                 texts += extract_text(child)
+         return texts
      if content is None:
          logger.error("Content is None, cannot extract text.")
          return []
@@ -70,7 +84,6 @@ def extract_text_from_content(content, content_type="text/html", where=None, wha
      if content_type not in CONTENT_PARSERS:
          logger.error(f"Unsupported content type: {content_type}")
          return []
-
      if "json" in content_type:
          where = None
          return extract_text_from_json(content, where)
@@ -81,32 +94,30 @@ def extract_text_from_content(content, content_type="text/html", where=None, wha
              res.extend(extract_text_from_content(content, content_type="text/html", where=where_, what=what, extend=extend, **kwargs))
          return res
      else:
-         if extend:
-             def extract_text(element):
-                 texts = ""
-                 if isinstance(element, str) and element.strip():
-                     texts += element.strip()
-                 elif hasattr(element, "children"):
-                     for child in element.children:
-                         texts += extract_text(child)
-                 return texts
-
-             search_kwargs = {**kwargs}
-             if what:
-                 search_kwargs["class_"] = what
-
+         search_kwargs = {**kwargs}
+         # correct 'class_'
+         # dict_=dict(class_="gsc_mnd_art_info")
+         if 'class_' in search_kwargs:
+             search_kwargs["class"]=search_kwargs["class_"]
+             del search_kwargs['class_']
+         if what:
+             search_kwargs["class"] = what
+         if 'attrs' in kwargs:
              result_set = content.find_all(where, **search_kwargs)
+             print(f"attrs =>{search_kwargs}")
+         else:
+             result_set = content.find_all(where, attrs=dict(**search_kwargs))
+             print(f"{search_kwargs}")
+
+         if not result_set:
+             print("Failed: check the 'attrs' setting: attrs={'id':'xample'}")
+         if extend:
              texts = ""
              for tag in result_set:
                  texts += extract_text(tag) + "\n"
              text_list = [tx.strip() for tx in texts.split("\n") if tx.strip()]
              return text_list
          else:
-             search_kwargs = {**kwargs}
-             if what:
-                 search_kwargs["class_"] = what
-
-             result_set = content.find_all(where, **search_kwargs)
              texts_ = " ".join(tag.get_text() for tag in result_set)
              texts = [tx.strip() for tx in texts_.split("\n") if tx.strip()]
              return texts
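
A minimal sketch of how the reworked keyword handling above might be called (assuming the module is importable as py2ls.netfinder; the URL, class name, and id are hypothetical):

    from py2ls import netfinder  # assumed import path
    soup = netfinder.get_soup("https://example.com")  # hypothetical URL
    # 'what' (or class_=...) is normalized to the HTML 'class' attribute internally
    by_class = netfinder.extract_text_from_content(soup, where="div", what="gsc_mnd_art_info")
    # other attributes are passed through explicitly, e.g. attrs={'id': 'xample'}
    by_id = netfinder.extract_text_from_content(soup, where="div", attrs={"id": "xample"})
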
@@ -157,51 +168,124 @@ def get_proxy():
      }
      return proxies
  # proxies_glob=get_proxy()
-
- def fetch_all(url, parser="lxml"): # lxml is faster, # parser="html.parser"
+ def get_soup(url, driver='req'):
+     _,soup_=fetch_all(url, driver=driver)
+     return soup_
+ def fetch_all(url, parser="lxml", driver='request', # request or selenium
+               by=By.TAG_NAME,
+               timeout=10,
+               retry=2,
+               login_url=None,
+               username=None,
+               password=None,
+               username_field="username",
+               password_field="password",
+               submit_field="submit",
+               username_by=By.NAME,
+               password_by=By.NAME,
+               submit_by=By.NAME,
+               # capability='eager', # eager or none
+               proxy=None, # Add proxy parameter
+               javascript=True, # Add JavaScript option
+               disable_images=False, # Add option to disable images
+               iframe_name=None): # Add option to handle iframe): # lxml is faster, # parser="html.parser"
      try:
-         # Generate a random user-agent string
+         # # Generate a random user-agent string
+         # response = requests.get(url)
+         # # get cookies
+         # cookie=dict_from_cookiejar(response.cookies)
+         # # get token from cookies
+         # scrf_token=re.findall(r'csrf-token=(.*?);', response.headers.get('Set-Cookie'))[0]
+         # headers = {"User-Agent": user_agent(), "X-CSRF-Token":scrf_token}
+
          headers = {"User-Agent": user_agent()}
-
-         # Send the initial request
-         response = requests.get(url, headers=headers,proxies=proxies_glob)
-
-         # If the response is a redirect, follow it
-         while response.is_redirect:
-             logger.info(f"Redirecting to: {response.headers['Location']}")
-             response = requests.get(response.headers["Location"], headers=headers,proxies=proxies_glob)
-         # Check for a 403 error
-         if response.status_code == 403:
-             logger.warning("403 Forbidden error. Retrying...")
-             # Retry the request after a short delay
-             sleep(random.uniform(1, 3))
+         if 'req' in driver.lower():
              response = requests.get(url, headers=headers,proxies=proxies_glob)
-             # Raise an error if retry also fails
+
+             # If the response is a redirect, follow it
+             while response.is_redirect:
+                 logger.info(f"Redirecting to: {response.headers['Location']}")
+                 response = requests.get(response.headers["Location"], headers=headers,proxies=proxies_glob)
+             # Check for a 403 error
+             if response.status_code == 403:
+                 logger.warning("403 Forbidden error. Retrying...")
+                 # Retry the request after a short delay
+                 sleep(random.uniform(1, 3))
+                 response = requests.get(url, headers=headers,proxies=proxies_glob)
+                 # Raise an error if retry also fails
+                 response.raise_for_status()
+
+             # Raise an error for other HTTP status codes
              response.raise_for_status()

-         # Raise an error for other HTTP status codes
-         response.raise_for_status()
+             # Get the content type
+             content_type = response.headers.get("content-type", "").split(";")[0].lower()
+             if response.encoding:
+                 content = response.content.decode(response.encoding)
+             else:
+                 content=None
+             # logger.info(f"Content type: {content_type}")

-         # Get the content type
-         content_type = response.headers.get("content-type", "").split(";")[0].lower()
-         if response.encoding:
-             content = response.content.decode(response.encoding)
-         else:
-             content=None
-         # logger.info(f"Content type: {content_type}")
+             # Check if content type is supported
+             if content_type in CONTENT_PARSERS and content:
+                 return content_type, CONTENT_PARSERS[content_type](content, parser)
+             else:
+                 logger.warning("Unsupported content type")
+                 return None, None
+         elif 'se' in driver.lower():
+             chrome_options = Options()
+             chrome_options.add_argument("--headless")
+             chrome_options.add_argument("--no-sandbox")
+             chrome_options.add_argument("--disable-dev-shm-usage")
+             chrome_options.add_argument(f"user-agent={user_agent()}")
+             if proxy:
+                 chrome_options.add_argument(f'--proxy-server={proxy}')
+             if disable_images:
+                 prefs = {"profile.managed_default_content_settings.images": 2}
+                 chrome_options.add_experimental_option("prefs", prefs)
+             # chrome_options.page_load_strategy = capability
+             service = Service(ChromeDriverManager().install())
+
+             driver_ = webdriver.Chrome(service=service, options=chrome_options)
+
+             if not javascript:
+                 driver_.execute_cdp_cmd("Emulation.setScriptExecutionDisabled", {"value": True})

-         # Check if content type is supported
-         if content_type in CONTENT_PARSERS and content:
-             return content_type, CONTENT_PARSERS[content_type](content, parser)
-         else:
-             logger.warning("Unsupported content type")
-             return None, None
-     except requests.RequestException as e:
-         logger.error(f"Error fetching URL '{url}': {e}")
-         return None, None
+             if login_url:
+                 driver_.get(login_url)
+                 WebDriverWait(driver_, timeout).until(
+                     EC.presence_of_element_located((username_by, username_field))
+                 ).send_keys(username)
+                 WebDriverWait(driver_, timeout).until(
+                     EC.presence_of_element_located((password_by, password_field))
+                 ).send_keys(password)
+                 WebDriverWait(driver_, timeout).until(
+                     EC.element_to_be_clickable((submit_by, submit_field))
+                 ).click()

+             driver_.get(url)
+
+             if iframe_name:
+                 iframe = WebDriverWait(driver_, timeout).until(
+                     EC.presence_of_element_located((By.NAME, iframe_name))
+                 )
+                 driver_.switch_to.frame(iframe)

+             # WebDriverWait(driver, timeout).until(
+             #     EC.presence_of_element_located((by, where))
+             # )
+             page_source = driver_.page_source
+             driver_.quit()

+             content = BeautifulSoup(page_source, "html.parser")
+             if content:
+                 return 'text/html', content
+             else:
+                 logger.warning("Selenium could not fetch content")
+                 return None, None
+     except requests.RequestException as e:
+         logger.error(f"Error fetching URL '{url}': {e}")
+         return None, None
  # # Function to change Tor IP address
  # def renew_tor_ip():
  #     with Controller.from_port(port=9051) as controller:
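
A short usage sketch for the driver switch above (assuming Chrome plus a chromedriver installed via webdriver-manager are available; the URL is hypothetical):

    from py2ls import netfinder  # assumed import path
    # default: a plain requests round-trip, returning (content_type, parsed content)
    content_type, soup = netfinder.fetch_all("https://example.com", driver="request")
    # JavaScript-heavy pages: render the page in headless Chrome instead
    _, soup_js = netfinder.fetch_all("https://example.com", driver="selenium")
    soup_quick = netfinder.get_soup("https://example.com")  # thin wrapper, requests-based by default
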
@@ -227,24 +311,90 @@ def fetch_all(url, parser="lxml"): # lxml is faster, # parser="html.parser"
  #     return None


- def find_links(url):
-     links_href,cond_ex= [],["javascript:","mailto:","tel:","fax:"]
-     content_type, soup = fetch_all(url)
-     if soup:
+ # def find_links(url,driver='request'):
+ #     links_href,cond_ex= [],["javascript:","mailto:","tel:","fax:"]
+ #     content_type, soup = fetch_all(url,driver=driver)
+ #     if soup:
+ #         base_url = urlparse(url)
+
+ #         # Extract links from both 'href' and 'src' attributes across relevant tags
+ #         tags_with_links = ['a', 'img', 'script', 'link', 'iframe', 'embed','span']
+ #         elements = []
+ #         for tag in tags_with_links:
+ #             elements.extend(soup.find_all(tag, href=True))
+ #             elements.extend(soup.find_all(tag, src=True))
+
+ #         for element in elements:
+ #             link_href = element.get('href') or element.get('src')
+ #             if link_href:
+ #                 if link_href.startswith("//"):
+ #                     link_href = "http:" + link_href
+ #                 elif not link_href.startswith(("http", "https")):
+ #                     link_href = urljoin(base_url.geturl(), link_href)
+
+ #                 if all(exclusion not in link_href for exclusion in cond_ex):
+ #                     links_href.append(link_href)
+
+ #         return list(set(links_href)) # Remove duplicates
+
+ #     elif url.split('.')[-1] in ['pdf']:
+ #         return url
+ #     else:
+ #         return None
+ def find_links(url, driver='request', booster=False):
+     links_href, cond_ex = [], ["javascript:", "mailto:", "tel:", "fax:"]
+     content_type, soup = fetch_all(url, driver=driver)
+
+     if soup and content_type=='text/html':
          base_url = urlparse(url)
-         links = soup.find_all("a", href=True, recursive=True)
-         for link in links:
-             link_href = link["href"]
-             if not link_href.startswith(("http")):
-                 link_href = urljoin(base_url.geturl(), link_href)
-             cond_ex_ = all([i not in link_href for i in cond_ex])
-             if cond_ex_:
-                 links_href.append(link_href)
-         return list(set(links_href))
+
+         # Extract links from all tags with 'href' and 'src' attributes
+         elements = soup.find_all(True, href=True) + soup.find_all(True, src=True)
+
+         for element in elements:
+             link_href = element.get('href') or element.get('src')
+             if link_href:
+                 if link_href.startswith("//"):
+                     link_href = "http:" + link_href
+                 elif not link_href.startswith(("http", "https")):
+                     link_href = urljoin(base_url.geturl(), link_href)
+
+                 if all(exclusion not in link_href for exclusion in cond_ex):
+                     links_href.append(link_href)
+
+         unique_links = list(set(links_href)) # Remove duplicates
+
+         if booster:
+             for link in unique_links:
+                 if link != url: # Avoid infinite recursion
+                     sub_links = find_links(link, driver=driver, booster=False)
+                     if sub_links:
+                         links_href.extend(sub_links)
+             links_href = list(set(links_href)) # Remove duplicates again
+
+         return links_href
+
      elif url.split('.')[-1] in ['pdf']:
-         return url
+         return [url]
      else:
          return None
+
+
+ # To determine which links are related to target domains(e.g., pages) you are interested in
+ def filter_links(links, contains="html",driver='requ', booster=False):
+     filtered_links = []
+     if isinstance(contains, str):
+         contains = [contains]
+     if isinstance(links,str):
+         links=find_links(links,driver=driver,booster=booster)
+     for link in links:
+         parsed_link = urlparse(link)
+         condition = (all([i in link for i in contains])
+                      and "javascript:" not in parsed_link
+                      )
+         if condition:
+             filtered_links.append(link)
+     return filtered_links


  def find_domain(links):
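
A sketch of combining find_links and filter_links (hypothetical URL; booster=True re-crawls each discovered link one level deep):

    from py2ls import netfinder  # assumed import path
    links = netfinder.find_links("https://example.com", driver="request", booster=False)
    # keep only links whose URL contains every substring in 'contains'
    pdf_like = netfinder.filter_links(links, contains=".pdf")
    html_pages = netfinder.filter_links("https://example.com", contains="html")  # a str input is expanded via find_links
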
@@ -263,24 +413,8 @@ def find_domain(links):
          return None


- # To determine which links are related to target domains(e.g., pages) you are interested in
- def filter_links(links, contains="html"):
-     filtered_links = []
-     if isinstance(contains, str):
-         contains = [contains]
-     if isinstance(links,str):
-         links=find_links(links)
-     for link in links:
-         parsed_link = urlparse(link)
-         condition = (all([i in link for i in contains])
-                      and "javascript:" not in parsed_link
-                      )
-         if condition:
-             filtered_links.append(link)
-     return filtered_links
-
-
- def pdf_detector(url, contains=None, dir_save=None,booster=False):
+ def pdf_detector(url, contains = None, dir_save = None, booster = False):
+     print("usage: pdf_detector(url, dir_save, booster=True")
      def fname_pdf_corr(fname):
          if fname[-4:]!='.pdf':
              fname = fname[:-4] + '.pdf'
@@ -337,8 +471,101 @@ def pdf_detector(url, contains=None, dir_save=None,booster=False):
          idx += 1
      print(f'{len(fnames)} files are downloaded:\n{fnames}\n to local: \n{dir_save}')

-
- def find_img(url, dir_save="images", verbose=True):
+ def downloader(url, dir_save=dir_save, kind=['.pdf'], contains=None, rm_folder=False, booster=False,verbose=True):
+     if verbose:
+         print("usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)")
+     def fname_corrector(fname, ext):
+         if not ext.startswith("."):
+             ext="."+ext
+         if not fname.endswith(ext):  # if not ext in fname:
+             fname = fname[:-len(ext)] + ext
+         return fname
+     def check_and_modify_filename(directory, filename):
+         base, ext = os.path.splitext(filename)
+         counter = 2
+         new_filename = filename
+         while os.path.exists(os.path.join(directory, new_filename)):
+             new_filename = f"{base}_{counter}{ext}"
+             counter += 1
+         return new_filename
+     if not isinstance(kind,list):
+         kind=[kind]
+     if isinstance(url, list):
+         for url_ in url:
+             downloader(url_, dir_save=dir_save, kind=kind, contains=contains, booster=booster,verbose=verbose)
+             # sleep(random.uniform(1, 3))
+     for i,k in enumerate(kind):
+         if not k.startswith('.'):
+             kind[i]='.'+kind[i]
+     file_links_all=[]
+     for kind_ in kind:
+         print(kind_)
+         if isinstance(contains, str):
+             contains = [contains]
+         if isinstance(url, str):
+             if any(ext in url for ext in kind):
+                 file_links = [url]
+             else:
+                 if booster:
+                     links_all = []
+                     if 'http' in url:
+                         links_all = find_links(url)
+                 else:
+                     links_all = url
+                 if contains is not None:
+                     file_links = filter_links(links_all, contains=contains + kind_)
+                 else:
+                     file_links = links_all#filter_links(links_all, contains=kind_)
+         elif isinstance(url, list):
+             links_all = url
+             if contains is not None:
+                 file_links = filter_links(links_all, contains=contains + kind_)
+             else:
+                 file_links = filter_links(links_all, contains=kind_)
+         else:
+             links_all = find_links(url)
+             if contains is not None:
+                 file_links = filter_links(links_all, contains=contains + kind_)
+             else:
+                 file_links = filter_links(links_all, contains=kind_)
+         if verbose:
+             if file_links:
+                 print("Files detected:")
+                 pp(file_links)
+             else:
+                 file_links=[]
+                 print('No files detected')
+         file_links_all.extend(file_links)
+     if dir_save:
+         if rm_folder:
+             ips.rm_folder(dir_save)
+         if verbose:
+             print(f"\n... attempting to download to local\n")
+         fnames = [file_link.split("/")[-1] for file_link in file_links_all]
+         for idx, file_link in enumerate(file_links_all):
+             headers = {"User-Agent": user_agent()}
+             response = requests.get(file_link, headers=headers)
+             if response.status_code == 200:
+                 ext = next((ftype for ftype in kind if ftype in file_link), None)
+                 if ext:
+                     corrected_fname = fname_corrector(fnames[idx], ext)
+                     corrected_fname = check_and_modify_filename(dir_save, corrected_fname)
+                     with open(os.path.join(dir_save, corrected_fname), "wb") as file:
+                         file.write(response.content)
+                     if verbose:
+                         print(f"Done! {fnames[idx]}")
+                 else:
+                     if verbose:
+                         print(f"Unknown file type for {file_link}")
+             else:
+                 if verbose:
+                     print(f"Failed to download file: {response.status_code}")
+         print(f'\n{len(fnames)} files were downloaded:')
+         if verbose:
+             pp(fnames)
+         print(f"\n\nsaved @:\n{dir_save}")
+
+ def find_img(url, driver='request',dir_save="images", rm_folder=False, verbose=True):
      """
      Save images referenced in HTML content locally.
      Args:
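
A minimal sketch of the downloader added above (assuming py2ls.ips is available as imported at the top of the module; the URL and target folder are hypothetical):

    from py2ls import netfinder  # assumed import path
    netfinder.downloader(
        "https://example.com/reports",  # hypothetical page listing files
        dir_save="/tmp/reports",        # hypothetical local folder
        kind=[".pdf", ".xls"],          # extensions are normalized to start with '.'
        booster=True,                   # collect the page's links first via find_links
    )
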
@@ -349,7 +576,10 @@ def find_img(url, dir_save="images", verbose=True):
      Returns:
      str: HTML content with updated image URLs pointing to local files.
      """
-     content_type, content = fetch_all(url)
+     if rm_folder:
+         ips.rm_folder(dir_save)
+     content_type, content = fetch_all(url,driver=driver)
+     print(content_type)
      if "html" in content_type.lower():
          # Create the directory if it doesn't exist
          os.makedirs(dir_save, exist_ok=True)
@@ -359,6 +589,9 @@ def find_img(url, dir_save="images", verbose=True):
          image_links = []
          # Extracting images
          images = content.find_all("img", src=True)
+         if not images:
+             content_type, content = fetch_all(url,driver='selenium')
+             images = content.find_all("img", src=True)
          for i, image in enumerate(images):
              try:
                  image_url = image["src"]
@@ -380,8 +613,8 @@ def find_img(url, dir_save="images", verbose=True):
                      with open(image_filename, "wb") as image_file:
                          image_file.write(image_data)
                      image["src"] = image_filename
-                     if verbose:
-                         plt.imshow(image_data)
+                     # if verbose:
+                     #     plt.imshow(image_data)
                  else:
                      # Construct the absolute image URL
                      absolute_image_url = urljoin(url, image_url)
@@ -404,11 +637,13 @@ def find_img(url, dir_save="images", verbose=True):
      if verbose:
          display_thumbnail_figure(flist(dir_save,filter='img'),dpi=100)
      return content
+
  def svg_to_png(svg_file):
      with WandImage(filename=svg_file, resolution=300) as img:
          img.format = 'png'
          png_image = img.make_blob()
          return Image.open(io.BytesIO(png_image))
+
  def display_thumbnail_figure(dir_img_list,figsize=(10,10),dpi=100):
      import matplotlib.pyplot as plt
      from PIL import Image
@@ -418,16 +653,11 @@ def display_thumbnail_figure(dir_img_list,figsize=(10,10),dpi=100):
          dir_img_list (list): List of the Directory containing the images.
      """
      num_images = len(dir_img_list)
-
      if num_images == 0:
          print("No images found to display.")
          return
-
-     # Determine grid size
      grid_size = int(num_images ** 0.5) + 1
-
      fig, axs = plt.subplots(grid_size, grid_size, figsize=figsize,dpi=dpi)
-
      for ax, image_file in zip(axs.flatten(), dir_img_list):
          try:
              img = Image.open(image_file)
@@ -435,11 +665,12 @@ def display_thumbnail_figure(dir_img_list,figsize=(10,10),dpi=100):
              ax.axis('off') # Hide axes
          except:
              continue
-     # Hide remaining subplots
-     [ax.axis("off") for ax in axs.flatten()]
-
-     plt.tight_layout()
-     plt.show()
+     try:
+         [ax.axis("off") for ax in axs.flatten()]
+         plt.tight_layout()
+         plt.show()
+     except:
+         pass

  def content_div_class(content, div="div", div_class="highlight"):
      texts = [div.text for div in content.find_all(div, class_=div_class)]
@@ -530,28 +761,58 @@ def fetch_selenium(
      return []


- def fetch(url, where="div", what=None, extend=True, booster=False,retry=2,verbose=False, **kws):
-     for attempt in range(retry):
-         if verbose and attempt==0:
-             xample = 'fetch(url,where="div",what=None,extend=True,by=By.TAG_NAME,timeout=10,retry=3,login_url=None,username=None,password=None,username_field="username",password_field="password",submit_field="submit",username_by=By.NAME,password_by=By.NAME,submit_by=By.NAME)'
-             print(xample)
-         content_type, content = fetch_all(url, parser="html.parser")
-         texts=extract_text_from_content(content,content_type=content_type,where=where,what=what,extend=extend, **kws)
-         if isinstance(texts, pd.core.frame.DataFrame):
-             if not texts.empty:
-                 break
-         else:
-             if texts:
-                 break
-         sleep(random.uniform(0.5, 1.5))
-     if isinstance(texts,pd.core.frame.DataFrame):
-         condition_=[texts.empty, booster]
+ def fetch(url, where="div", driver='request',what=None, extend=True, booster=False,retry=2,verbose=False, output="text", **kws):
+     print(f"output is {output}")
+     if 'xt' in output.lower():
+         for attempt in range(retry):
+             if verbose and attempt==0:
+                 xample = 'fetch(url,where="div",what=None,extend=True,by=By.TAG_NAME,timeout=10,retry=3,login_url=None,username=None,password=None,username_field="username",password_field="password",submit_field="submit",username_by=By.NAME,password_by=By.NAME,submit_by=By.NAME)'
+                 print(xample)
+             content_type, content = fetch_all(url, parser="html.parser",driver=driver)
+             texts=extract_text_from_content(content,content_type=content_type,where=where,what=what,extend=extend, **kws)
+             if isinstance(texts, pd.core.frame.DataFrame):
+                 if not texts.empty:
+                     break
+             else:
+                 if texts:
+                     break
+             sleep(random.uniform(0.5, 1.5))
+         if isinstance(texts,pd.core.frame.DataFrame):
+             condition_=[texts.empty, booster]
+         else:
+             condition_=[not texts, booster]
+         if any(condition_):
+             print("trying to use 'fetcher2'...")
+             texts = fetch_selenium(url=url, where=where, what=what, extend=extend, **kws)
+         if texts:
+             return texts
+         else:
+             return fetch(url, where=where, driver=driver,what=what, extend=extend, booster=booster,retry=retry,verbose=verbose, output="soup", **kws)
+     elif "url" in output.lower():
+         base_url = urlparse(url)
+         if verbose:
+             print("urljoin(urlparse(url), link_part)")
+         return base_url.geturl()
      else:
-         condition_=[not texts, booster]
-     if any(condition_):
-         print("trying to use 'fetcher2'...")
-         texts = fetch_selenium(url=url, where=where, what=what, extend=extend, **kws)
-     return texts
+         try:
+             content_type, content = fetch_all(url, parser="html.parser",driver=driver)
+             search_kwargs = {**kws}
+             print(search_kwargs)
+             if 'class_' in search_kwargs:
+                 search_kwargs["class"]=search_kwargs["class_"]
+                 del search_kwargs['class_']
+             if what:
+                 search_kwargs["class"] = what
+             if 'attrs' in kws:
+                 result_set = content.find_all(where, **search_kwargs)
+                 print(f"attrs =>{search_kwargs}")
+             else:
+                 result_set = content.find_all(where, attrs=dict(**search_kwargs))
+                 print(f"{search_kwargs}")
+             return result_set
+         except:
+             print("got nothing")
+             return None


  def extract_from_content(content, where="div", what=None):
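
A sketch of the three output modes of fetch (the URL and class name are hypothetical; "text" is the default):

    from py2ls import netfinder  # assumed import path
    texts = netfinder.fetch("https://example.com", where="p", output="text")  # list of text segments
    base = netfinder.fetch("https://example.com", output="url")               # the parsed URL re-serialized, for urljoin()
    tags = netfinder.fetch("https://example.com", where="div", what="article", output="soup")  # raw find_all() result set
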
@@ -567,8 +828,8 @@ def extract_from_content(content, where="div", what=None):
      return texts


- def find_forms(url):
-     content_type, content = fetch_all(url)
+ def find_forms(url, driver='requ'):
+     content_type, content = fetch_all(url,driver=driver)
      df = pd.DataFrame()
      # Extracting forms and inputs
      forms = content.find_all("form",recursive=True)
@@ -594,8 +855,8 @@ def clean_string(value):
      return value


- def find_all(url, dir_save=None):
-     content_type, content = fetch_all(url)
+ def find_all(url, dir_save=None, driver='req'):
+     content_type, content = fetch_all(url,driver=driver)
      paragraphs_text = extract_from_content(content, where="p")
      # Extracting specific elements by class
      specific_elements_text = [
@@ -778,6 +1039,8 @@ def find_all(url, dir_save=None):

  def flist(fpath, filter="all"):
      all_files = [os.path.join(fpath, f) for f in os.listdir(fpath) if os.path.isfile(os.path.join(fpath, f))]
+     if isinstance(filter, str):
+         filter=[filter]
      if isinstance(filter, list):
          filt_files=[]
          for filter_ in filter:
@@ -837,4 +1100,65 @@ def is_zip(fpath):
      if mime_type == 'application/zip':
          return True
      else:
-         return False
+         return False
+
+ def search(query, limit=5, kind='text', output='df',verbose=False,download=False, dir_save=dir_save):
+
+     if 'te' in kind.lower():
+         results = DDGS().text(query, max_results=limit)
+         res=pd.DataFrame(results)
+         res.rename(columns={"href":"links"},inplace=True)
+     if verbose:
+         print(f'searching "{query}": got the results below\n{res}')
+     if download:
+         try:
+             downloader(url=res.links.tolist(), dir_save=dir_save, verbose=verbose)
+         except:
+             if verbose:
+                 print(f"failed link")
+     return res
+
+ def echo(query, model="gpt", verbose=True, log=True, dir_save=dir_save):
+     def is_in_any(str_candi_short, str_full, ignore_case=True):
+         if isinstance(str_candi_short, str):
+             str_candi_short=[str_candi_short]
+         res_bool=[]
+         if ignore_case:
+             [res_bool.append(i in str_full.lower()) for i in str_candi_short ]
+         else:
+             [res_bool.append(i in str_full) for i in str_candi_short ]
+         return any(res_bool)
+     def valid_mod_name(str_fly):
+         if is_in_any(str_fly, "claude-3-haiku"):
+             return "claude-3-haiku"
+         elif is_in_any(str_fly, "gpt-3.5"):
+             return "gpt-3.5"
+         elif is_in_any(str_fly, "llama-3-70b"):
+             return "llama-3-70b"
+         elif is_in_any(str_fly, "mixtral-8x7b"):
+             return "mixtral-8x7b"
+         else:
+             print(f"not support your model{model}, supported models: 'claude','gpt(default)', 'llama','mixtral'")
+             return "gpt-3.5" # default model
+     model_valid = valid_mod_name(model)
+     res=DDGS().chat(query, model=model_valid)
+     if verbose:
+         pp(res)
+     if log:
+         dt_str=datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S')
+         res_ = f"###{dt_str}\n\n>{res}\n"
+         os.makedirs(dir_save, exist_ok=True)
+         fpath = os.path.join(dir_save, f"log_ai.md")
+         ips.fupdate(fpath=fpath,content=res_)
+         print(f"log file:{fpath}")
+     return res
+
+ def chat(*args, **kwargs):
+     if len(args) == 1 and isinstance(args[0], str):
+         kwargs['query'] = args[0]
+     return echo(**kwargs)
+
+ def ai(*args, **kwargs):
+     if len(args) == 1 and isinstance(args[0], str):
+         kwargs['query'] = args[0]
+     return echo(**kwargs)
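
A sketch of the DuckDuckGo helpers added above (network access and the duckduckgo_search package are assumed; the query and folder are hypothetical):

    from py2ls import netfinder  # assumed import path
    df = netfinder.search("open-access neuroscience pdf", limit=5)  # DataFrame with a 'links' column
    netfinder.search("open-access neuroscience pdf", download=True, dir_save="/tmp/ddg")  # also hands the links to downloader()
    answer = netfinder.echo("Summarize what BeautifulSoup does", model="gpt", log=False)
    answer = netfinder.chat("Summarize what BeautifulSoup does", log=False)  # chat()/ai() are thin wrappers around echo()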