py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.27__py3-none-any.whl

py2ls/netfinder.py CHANGED
@@ -626,7 +626,7 @@ def filter_links(links, contains="html", driver="requ", booster=False):
         )
         if condition:
             filtered_links.append(link)
-    return filtered_links
+    return ips.unique(filtered_links)
 
 
 def find_domain(links):
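
filter_links() now deduplicates its result through ips.unique from py2ls.ips. A minimal sketch, assuming ips.unique is an order-preserving dedup (the actual implementation lives in py2ls/ips.py and may differ):

    def unique(seq):
        # Drop duplicates while keeping first-seen order
        seen = set()
        return [x for x in seq if not (x in seen or seen.add(x))]

    unique(["a.pdf", "b.pdf", "a.pdf"])  # ['a.pdf', 'b.pdf']
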
@@ -717,7 +717,7 @@ def downloader(
     kind=[".pdf"],
     contains=None,
     rm_folder=False,
-    booster=False,
+    booster=True,# use find_links
     verbose=True,
     timeout=30,
     n_try=3,
@@ -726,7 +726,7 @@ def downloader(
 
     from requests.exceptions import ChunkedEncodingError, ConnectionError
 
-    if verbose:
+    if verbose and ips.run_once_within():
         print(
             "usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)"
         )
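
The usage banner is now additionally gated by ips.run_once_within(). Its semantics are assumed here: a guard that returns True only the first time it fires within some time window. A hypothetical sketch of such a guard (the real helper is in py2ls/ips.py and may behave differently):

    import time

    _last_fired = {}

    def run_once_within(seconds=60, key="default"):
        # True only if `key` has not fired within the last `seconds`
        now = time.time()
        if now - _last_fired.get(key, 0.0) > seconds:
            _last_fired[key] = now
            return True
        return False
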
@@ -734,8 +734,11 @@ def downloader(
     def fname_corrector(fname, ext):
         if not ext.startswith("."):
             ext = "." + ext
-        if not fname.endswith("ext"): # if not ext in fname:
+        if not fname.endswith(ext): # if not ext in fname:
             fname = fname[: -len(ext)] + ext
+        if not any(fname[: -len(ext)]):
+            from datetime import datetime
+            fname = datetime.now().strftime("%H%M%S") + ext
         return fname
 
     def check_and_modify_filename(directory, filename):
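
Two fixes land in fname_corrector: the extension check previously compared against the literal string "ext" rather than the ext variable, and a filename consisting of nothing but an extension now falls back to a timestamp stem. A self-contained sketch of the corrected behavior:

    from datetime import datetime

    def fname_corrector(fname, ext=".pdf"):
        if not ext.startswith("."):
            ext = "." + ext
        if not fname.endswith(ext):        # was endswith("ext"): always False for normal names
            fname = fname[: -len(ext)] + ext
        if not any(fname[: -len(ext)]):    # empty stem, e.g. fname == ".pdf"
            fname = datetime.now().strftime("%H%M%S") + ext
        return fname

    print(fname_corrector("report.pdx"))   # report.pdf
    print(fname_corrector(".pdf"))         # e.g. 142311.pdf
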
@@ -784,8 +787,8 @@ def downloader(
             kind[i] = "." + kind[i]
     file_links_all = []
     for kind_ in kind:
-        if isinstance(contains, str):
-            contains = [contains]
+        # if isinstance(contains, str):
+        #     contains = [contains]
         if isinstance(url, str):
             if any(ext in url for ext in kind):
                 file_links = [url]
@@ -799,7 +802,7 @@ def downloader(
             if contains is not None:
                 file_links = filter_links(links_all, contains=contains + kind_)
             else:
-                file_links = links_all # filter_links(links_all, contains=kind_)
+                file_links = filter_links(links_all, contains=kind_)#links_all #
         elif isinstance(url, list):
             links_all = url
             if contains is not None:
@@ -812,6 +815,7 @@ def downloader(
                 file_links = filter_links(links_all, contains=contains + kind_)
             else:
                 file_links = filter_links(links_all, contains=kind_)
+        file_links=ips.unique(file_links)
         if verbose:
             if file_links:
                 from pprint import pp
@@ -825,6 +829,7 @@ def downloader(
             file_links_all = [file_links]
         elif isinstance(file_links, list):
             file_links_all.extend(file_links)
+    file_links_all=ips.unique(file_links_all)
    if dir_save:
        if rm_folder:
            ips.rm_folder(dir_save)
@@ -847,7 +852,7 @@ def downloader(
                     )
                     if ext is None:
                         ext = kind_
-                    print("ehereerere", ext)
+
                     if ext:
                         corrected_fname = fname_corrector(fnames[idx], ext)
                         corrected_fname = check_and_modify_filename(
@@ -860,13 +865,13 @@ def downloader(
                                 datetime.now().strftime("%y%m%d_%H%M%S_")
                                 + corrected_fname
                             )
-                        fpath_tmp = os.path.join(dir_save, corrected_fname)
+                        fpath_tmp = os.path.join(dir_save, corrected_fname)
                         with open(fpath_tmp, "wb") as file:
                             for chunk in response.iter_content(chunk_size=8192):
                                 if chunk: # Filter out keep-alive chunks
                                     file.write(chunk)
                         if verbose:
-                            print(f"Done! {fnames[idx]}")
+                            print(f"Done{fnames[idx]}")
                     else:
                         if verbose:
                             print(f"Unknown file type for {file_link}")
@@ -886,16 +891,7 @@ def downloader(
 
             if itry == n_try:
                 print(f"Failed to download {file_link} after {n_try} attempts.")
-
-    # print(f"\n{len(fnames)} files were downloaded:")
-    if verbose:
-        from pprint import pp
-
-        if corrected_fname:
-            pp(corrected_fname)
-            print(f"\n\nsaved @:\n{dir_save}")
-        else:
-            pp(fnames)
+
 
 
 def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=True):
@@ -1612,3 +1608,191 @@ def ai(*args, **kwargs):
     if len(args) == 1 and isinstance(args[0], str):
         kwargs["query"] = args[0]
     return echo(**kwargs)
+
+
+#! get_ip()
+def get_ip(ip=None):
+    """
+    Usage:
+        from py2ls import netfinder as nt
+        ip = nt.get_ip()
+    """
+    import requests
+    import time
+    import logging
+    from datetime import datetime, timedelta
+
+    # Set up logging configuration
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        handlers=[
+            logging.StreamHandler(),
+            logging.FileHandler("public_ip_log.log"),  # Log to a file
+        ],
+    )
+
+    cache = {}
+
+    # Function to fetch IP addresses synchronously
+    def fetch_ip(url, retries, timeout, headers):
+        """
+        Synchronous function to fetch the IP address with retries.
+        """
+        for attempt in range(retries):
+            try:
+                response = requests.get(url, timeout=timeout, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.RequestException as e:
+                logging.error(f"Attempt {attempt + 1} failed: {e}")
+                if attempt < retries - 1:
+                    time.sleep(2**attempt)  # Exponential backoff
+                else:
+                    logging.error("Max retries reached.")
+                    return {"error": f"Error fetching IP: {e}"}
+            except requests.Timeout:
+                logging.error("Request timed out")
+                time.sleep(2**attempt)
+        return {"error": "Failed to fetch IP after retries"}
+
+    # Function to fetch geolocation synchronously
+    def fetch_geolocation(url, retries, timeout, headers):
+        """
+        Synchronous function to fetch geolocation data by IP address.
+        """
+        for attempt in range(retries):
+            try:
+                response = requests.get(url, timeout=timeout, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.RequestException as e:
+                logging.error(f"Geolocation request attempt {attempt + 1} failed: {e}")
+                if attempt < retries - 1:
+                    time.sleep(2**attempt)  # Exponential backoff
+                else:
+                    logging.error("Max retries reached.")
+                    return {"error": f"Error fetching geolocation: {e}"}
+            except requests.Timeout:
+                logging.error("Geolocation request timed out")
+                time.sleep(2**attempt)
+        return {"error": "Failed to fetch geolocation after retries"}
+
+    # Main function to get public IP and geolocation
+    def get_public_ip(
+        ip4=True,
+        ip6=True,
+        verbose=True,
+        retries=3,
+        timeout=5,
+        geolocation=True,
+        headers=None,
+        cache_duration=5,
+    ):
+        """
+        Synchronously fetches public IPv4 and IPv6 addresses, along with optional geolocation info.
+        """
+        # Use the cache if it's still valid
+        cache_key_ip4 = "public_ip4"
+        cache_key_ip6 = "public_ip6"
+        cache_key_geolocation = "geolocation"
+
+        if (
+            cache
+            and cache_key_ip4 in cache
+            and datetime.now() < cache[cache_key_ip4]["expires"]
+        ):
+            logging.info("Cache hit for IPv4, using cached data.")
+            ip4_data = cache[cache_key_ip4]["data"]
+        else:
+            ip4_data = None
+
+        if (
+            cache
+            and cache_key_ip6 in cache
+            and datetime.now() < cache[cache_key_ip6]["expires"]
+        ):
+            logging.info("Cache hit for IPv6, using cached data.")
+            ip6_data = cache[cache_key_ip6]["data"]
+        else:
+            ip6_data = None
+
+        if (
+            cache
+            and cache_key_geolocation in cache
+            and datetime.now() < cache[cache_key_geolocation]["expires"]
+        ):
+            logging.info("Cache hit for Geolocation, using cached data.")
+            geolocation_data = cache[cache_key_geolocation]["data"]
+        else:
+            geolocation_data = None
+
+        # Fetch IPv4 if requested
+        if ip4 and not ip4_data:
+            logging.info("Fetching IPv4...")
+            ip4_data = fetch_ip(
+                "https://api.ipify.org?format=json", retries, timeout, headers
+            )
+            cache[cache_key_ip4] = {
+                "data": ip4_data,
+                "expires": datetime.now() + timedelta(minutes=cache_duration),
+            }
+
+        # Fetch IPv6 if requested
+        if ip6 and not ip6_data:
+            logging.info("Fetching IPv6...")
+            ip6_data = fetch_ip(
+                "https://api6.ipify.org?format=json", retries, timeout, headers
+            )
+            cache[cache_key_ip6] = {
+                "data": ip6_data,
+                "expires": datetime.now() + timedelta(minutes=cache_duration),
+            }
+
+        # Fetch geolocation if requested
+        if geolocation and not geolocation_data:
+            logging.info("Fetching Geolocation...")
+            geolocation_data = fetch_geolocation(
+                "https://ipinfo.io/json", retries, timeout, headers
+            )
+            cache[cache_key_geolocation] = {
+                "data": geolocation_data,
+                "expires": datetime.now() + timedelta(minutes=cache_duration),
+            }
+
+        # Prepare the results
+        ip_info = {
+            "ip4": ip4_data.get("ip") if ip4_data else "N/A",
+            "ip6": ip6_data.get("ip") if ip6_data else "N/A",
+            "geolocation": geolocation_data if geolocation_data else "N/A",
+        }
+
+        # Verbose output if requested
+        if verbose:
+            print(f"Public IPv4: {ip_info['ip4']}")
+            print(f"Public IPv6: {ip_info['ip6']}")
+            print(f"Geolocation: {ip_info['geolocation']}")
+
+        return ip_info
+
+    # Function to get geolocation data by IP
+    def get_geolocation_by_ip(ip, retries=3, timeout=5, headers=None):
+        """
+        Fetches geolocation data for a given IP address.
+        """
+        url = f"https://ipinfo.io/{ip}/json"
+        geolocation_data = fetch_geolocation(url, retries, timeout, headers)
+        return geolocation_data
+
+    #! here starting get_ip()
+    headers = {"User-Agent": user_agent()}
+    if ip is None:
+        try:
+            ip_data = get_public_ip(headers=headers, verbose=True)
+        except Exception as e:
+            print(e)
+            ip_data = None
+        return ip_data
+    else:
+        geolocation_data = get_geolocation_by_ip(ip, headers=headers)
+        return geolocation_data
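
Usage of the new helper follows its docstring: with no argument it reports this machine's public IPv4/IPv6 (via api.ipify.org) plus geolocation (via ipinfo.io); with an IP argument it looks up geolocation for that address:

    from py2ls import netfinder as nt

    info = nt.get_ip()              # {'ip4': ..., 'ip6': ..., 'geolocation': {...}}
    print(info["ip4"], info["ip6"])

    geo = nt.get_ip("8.8.8.8")      # geolocation dict for the given address

One design caveat: cache is rebuilt on every get_ip() call, so the cache_duration window only takes effect within a single invocation.
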
py2ls/ocr.py CHANGED
@@ -486,6 +486,18 @@ def preprocess_img(
 
     return img_preprocessed
 
+def convert_image_to_bytes(image):
+    """
+    Convert a CV2 or numpy image to bytes for ddddocr.
+    """
+    import io
+    # Convert OpenCV image (numpy array) to PIL image
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+    # Save PIL image to a byte stream
+    img_byte_arr = io.BytesIO()
+    image.save(img_byte_arr, format='PNG')
+    return img_byte_arr.getvalue()
 
 
 def text_postprocess(
     text,
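
ddddocr consumes raw encoded image bytes rather than NumPy arrays, which is what this helper provides. A quick round-trip check, assuming a hypothetical input file and the cv2/PIL/numpy imports ocr.py already uses:

    import cv2

    img = cv2.imread("captcha.png")          # hypothetical path; BGR ndarray
    png_bytes = convert_image_to_bytes(img)  # re-encoded as PNG bytes
    assert png_bytes[:4] == b"\x89PNG"       # PNG magic number
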
@@ -604,10 +616,11 @@ def get_text(
     """
     )
 
-    models = ["easyocr", "paddleocr", "pytesseract"]
+    models = ["easyocr", "paddleocr", "pytesseract", "ddddocr"]
     model = strcmp(model, models)[0]
     lang = lang_auto_detect(lang, model)
     if isinstance(image, str):
+        dir_img=image
         image = cv2.imread(image)
 
     # Ensure lang is always a list
@@ -705,9 +718,10 @@ def get_text(
        )  # PaddleOCR supports only one language at a time
        result = ocr.ocr(image_process, **kwargs)
        detections = []
-        for line in result[0]:
-            bbox, (text, score) = line
-            detections.append((bbox, text, score))
+        if result[0] is not None:
+            for line in result[0]:
+                bbox, (text, score) = line
+                detections.append((bbox, text, score))
         if postprocess is None:
             postprocess = dict(
                 spell_check=True,
@@ -787,7 +801,49 @@ def get_text(
         else:
             # By default, return all detection info
             return detections
+    elif "ddddocr" in model.lower():
+        import ddddocr
+
+        ocr = ddddocr.DdddOcr(det=False, ocr=True)
+        image_bytes = convert_image_to_bytes(image_process)
+
+        results = ocr.classification(image_bytes)  # Text extraction
+
+        # Optional: Perform detection for bounding boxes
+        detections = []
+        if kwargs.get("det", False):
+            det_ocr = ddddocr.DdddOcr(det=True)
+            det_results = det_ocr.detect(image_bytes)
+            for box in det_results:
+                top_left = (box[0], box[1])
+                bottom_right = (box[2], box[3])
+                detections.append((top_left, bottom_right))
 
+        if postprocess is None:
+            postprocess = dict(
+                spell_check=True,
+                clean=True,
+                filter=dict(min_length=2),
+                pattern=None,
+                merge=True,
+            )
+        text_corr = []
+        [
+            text_corr.extend(text_postprocess(text, **postprocess))
+            for _, text, _ in detections
+        ]
+        # Visualization
+        if show:
+            if ax is None:
+                ax = plt.gca()
+            image_vis = image.copy()
+            if detections:
+                for top_left, bottom_right in detections:
+                    cv2.rectangle(image_vis, top_left, bottom_right, box_color, 2)
+            image_vis = cv2.cvtColor(image_vis, cmap)
+            ax.imshow(image_vis)
+            ax.axis("off")
+        return detections
     else:  # "pytesseract"
         if ax is None:
             ax = plt.gca()
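
For reference, the recognition mode this new branch wraps can be exercised directly; a minimal sketch assuming a hypothetical captcha file (within py2ls the same path is reached via get_text(image, model="ddddocr")):

    import ddddocr

    ocr = ddddocr.DdddOcr(det=False, ocr=True)  # recognition mode, as in the diff
    with open("captcha.png", "rb") as f:        # hypothetical input file
        text = ocr.classification(f.read())
    print(text)
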