py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/netfinder.py CHANGED
@@ -626,7 +626,7 @@ def filter_links(links, contains="html", driver="requ", booster=False):
         )
         if condition:
             filtered_links.append(link)
-    return filtered_links
+    return ips.unique(filtered_links)
 
 
 def find_domain(links):
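
Note: ips.unique() is py2ls's own helper from its ips module. Its source is not part of this diff, but from how it is used here and in the later hunks it is presumably an order-preserving deduplication, roughly this sketch:

    def unique(seq):
        # keep the first occurrence of each item, preserving order (assumed behavior of ips.unique)
        seen = set()
        return [x for x in seq if not (x in seen or seen.add(x))]
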
@@ -717,7 +717,7 @@ def downloader(
     kind=[".pdf"],
     contains=None,
     rm_folder=False,
-    booster=False,
+    booster=True,# use find_links
     verbose=True,
     timeout=30,
     n_try=3,
@@ -726,7 +726,7 @@ def downloader(
 
     from requests.exceptions import ChunkedEncodingError, ConnectionError
 
-    if verbose:
+    if verbose and ips.run_once_within():
         print(
             "usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)"
         )
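
ips.run_once_within() is likewise not shown in this diff; judging by the name and the effect (the usage hint no longer prints on every call), it presumably returns True only once within some time window. A minimal sketch of such a throttle, with the window length as a guess:

    import time

    _last = {}

    def run_once_within(seconds=60, key="default"):
        # return True at most once per `seconds` per key (assumed semantics, not the actual ips source)
        now = time.time()
        if now - _last.get(key, 0.0) >= seconds:
            _last[key] = now
            return True
        return False
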
@@ -734,8 +734,11 @@ def downloader(
     def fname_corrector(fname, ext):
         if not ext.startswith("."):
             ext = "." + ext
-        if not fname.endswith("ext"): # if not ext in fname:
+        if not fname.endswith(ext): # if not ext in fname:
             fname = fname[: -len(ext)] + ext
+        if not any(fname[: -len(ext)]):
+            from datetime import datetime
+            fname = datetime.now().strftime("%H%M%S") + ext
         return fname
 
     def check_and_modify_filename(directory, filename):
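
The fname_corrector changes fix two things: the old suffix check compared against the literal string "ext" rather than the ext variable, and a filename whose stem comes out empty now falls back to a timestamp. Assuming the fixed version:

    fname_corrector("report.pdf", "pdf")  # ext normalized to ".pdf"; suffix already correct -> "report.pdf"
    fname_corrector(".pdf", ".pdf")       # empty stem -> time-based name such as "142301.pdf"
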
@@ -784,8 +787,8 @@ def downloader(
             kind[i] = "." + kind[i]
     file_links_all = []
     for kind_ in kind:
-        if isinstance(contains, str):
-            contains = [contains]
+        # if isinstance(contains, str):
+        #     contains = [contains]
         if isinstance(url, str):
             if any(ext in url for ext in kind):
                 file_links = [url]
@@ -799,7 +802,7 @@ def downloader(
             if contains is not None:
                 file_links = filter_links(links_all, contains=contains + kind_)
             else:
-                file_links = links_all # filter_links(links_all, contains=kind_)
+                file_links = filter_links(links_all, contains=kind_)#links_all #
         elif isinstance(url, list):
             links_all = url
             if contains is not None:
@@ -812,6 +815,7 @@ def downloader(
                 file_links = filter_links(links_all, contains=contains + kind_)
             else:
                 file_links = filter_links(links_all, contains=kind_)
+        file_links=ips.unique(file_links)
         if verbose:
             if file_links:
                 from pprint import pp
@@ -825,6 +829,7 @@ def downloader(
             file_links_all = [file_links]
         elif isinstance(file_links, list):
             file_links_all.extend(file_links)
+    file_links_all=ips.unique(file_links_all)
     if dir_save:
         if rm_folder:
             ips.rm_folder(dir_save)
@@ -847,7 +852,7 @@ def downloader(
                         )
                         if ext is None:
                             ext = kind_
-                        print("ehereerere", ext)
+
                         if ext:
                             corrected_fname = fname_corrector(fnames[idx], ext)
                             corrected_fname = check_and_modify_filename(
@@ -860,13 +865,13 @@ def downloader(
                                     datetime.now().strftime("%y%m%d_%H%M%S_")
                                     + corrected_fname
                                 )
-                                fpath_tmp = os.path.join(dir_save, corrected_fname)
+                            fpath_tmp = os.path.join(dir_save, corrected_fname)
                             with open(fpath_tmp, "wb") as file:
                                 for chunk in response.iter_content(chunk_size=8192):
                                     if chunk: # Filter out keep-alive chunks
                                         file.write(chunk)
                             if verbose:
-                                print(f"Done! {fnames[idx]}")
+                                print(f"Done{fnames[idx]}")
                         else:
                             if verbose:
                                 print(f"Unknown file type for {file_link}")
@@ -886,16 +891,7 @@ def downloader(
 
             if itry == n_try:
                 print(f"Failed to download {file_link} after {n_try} attempts.")
-
-    # print(f"\n{len(fnames)} files were downloaded:")
-    if verbose:
-        from pprint import pp
-
-        if corrected_fname:
-            pp(corrected_fname)
-            print(f"\n\nsaved @:\n{dir_save}")
-        else:
-            pp(fnames)
+
 
 
 def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=True):
@@ -1612,3 +1608,191 @@ def ai(*args, **kwargs):
     if len(args) == 1 and isinstance(args[0], str):
         kwargs["query"] = args[0]
     return echo(**kwargs)
+
+
+#! get_ip()
+def get_ip(ip=None):
+    """
+    Usage:
+        from py2ls import netfinder as nt
+        ip = nt.get_ip()
+    """
+
+    import requests
+    import time
+    import logging
+    from datetime import datetime, timedelta
+
+    # Set up logging configuration
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        handlers=[
+            logging.StreamHandler(),
+            logging.FileHandler("public_ip_log.log"), # Log to a file
+        ],
+    )
+
+    cache = {}
+
+    # Function to fetch IP addresses synchronously
+    def fetch_ip(url, retries, timeout, headers):
+        """
+        Synchronous function to fetch the IP address with retries.
+        """
+        for attempt in range(retries):
+            try:
+                response = requests.get(url, timeout=timeout, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.RequestException as e:
+                logging.error(f"Attempt {attempt + 1} failed: {e}")
+                if attempt < retries - 1:
+                    time.sleep(2**attempt) # Exponential backoff
+                else:
+                    logging.error("Max retries reached.")
+                    return {"error": f"Error fetching IP: {e}"}
+            except requests.Timeout:
+                logging.error("Request timed out")
+                time.sleep(2**attempt)
+        return {"error": "Failed to fetch IP after retries"}
+
+    # Function to fetch geolocation synchronously
+    def fetch_geolocation(url, retries, timeout, headers):
+        """
+        Synchronous function to fetch geolocation data by IP address.
+        """
+        for attempt in range(retries):
+            try:
+                response = requests.get(url, timeout=timeout, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.RequestException as e:
+                logging.error(f"Geolocation request attempt {attempt + 1} failed: {e}")
+                if attempt < retries - 1:
+                    time.sleep(2**attempt) # Exponential backoff
+                else:
+                    logging.error("Max retries reached.")
+                    return {"error": f"Error fetching geolocation: {e}"}
+            except requests.Timeout:
+                logging.error("Geolocation request timed out")
+                time.sleep(2**attempt)
+        return {"error": "Failed to fetch geolocation after retries"}
+
+    # Main function to get public IP and geolocation
+    def get_public_ip(
+        ip4=True,
+        ip6=True,
+        verbose=True,
+        retries=3,
+        timeout=5,
+        geolocation=True,
+        headers=None,
+        cache_duration=5,
+    ):
+        """
+        Synchronously fetches public IPv4 and IPv6 addresses, along with optional geolocation info.
+        """
+        # Use the cache if it's still valid
+        cache_key_ip4 = "public_ip4"
+        cache_key_ip6 = "public_ip6"
+        cache_key_geolocation = "geolocation"
+
+        if (
+            cache
+            and cache_key_ip4 in cache
+            and datetime.now() < cache[cache_key_ip4]["expires"]
+        ):
+            logging.info("Cache hit for IPv4, using cached data.")
+            ip4_data = cache[cache_key_ip4]["data"]
+        else:
+            ip4_data = None
+
+        if (
+            cache
+            and cache_key_ip6 in cache
+            and datetime.now() < cache[cache_key_ip6]["expires"]
+        ):
+            logging.info("Cache hit for IPv6, using cached data.")
+            ip6_data = cache[cache_key_ip6]["data"]
+        else:
+            ip6_data = None
+
+        if (
+            cache
+            and cache_key_geolocation in cache
+            and datetime.now() < cache[cache_key_geolocation]["expires"]
+        ):
+            logging.info("Cache hit for Geolocation, using cached data.")
+            geolocation_data = cache[cache_key_geolocation]["data"]
+        else:
+            geolocation_data = None
+
+        # Fetch IPv4 if requested
+        if ip4 and not ip4_data:
+            logging.info("Fetching IPv4...")
+            ip4_data = fetch_ip(
+                "https://api.ipify.org?format=json", retries, timeout, headers
+            )
+            cache[cache_key_ip4] = {
+                "data": ip4_data,
+                "expires": datetime.now() + timedelta(minutes=cache_duration),
+            }
+
+        # Fetch IPv6 if requested
+        if ip6 and not ip6_data:
+            logging.info("Fetching IPv6...")
+            ip6_data = fetch_ip(
+                "https://api6.ipify.org?format=json", retries, timeout, headers
+            )
+            cache[cache_key_ip6] = {
+                "data": ip6_data,
+                "expires": datetime.now() + timedelta(minutes=cache_duration),
+            }
+
+        # Fetch geolocation if requested
+        if geolocation and not geolocation_data:
+            logging.info("Fetching Geolocation...")
+            geolocation_data = fetch_geolocation(
+                "https://ipinfo.io/json", retries, timeout, headers
+            )
+            cache[cache_key_geolocation] = {
+                "data": geolocation_data,
+                "expires": datetime.now() + timedelta(minutes=cache_duration),
+            }
+
+        # Prepare the results
+        ip_info = {
+            "ip4": ip4_data.get("ip") if ip4_data else "N/A",
+            "ip6": ip6_data.get("ip") if ip6_data else "N/A",
+            "geolocation": geolocation_data if geolocation_data else "N/A",
+        }
+
+        # Verbose output if requested
+        if verbose:
+            print(f"Public IPv4: {ip_info['ip4']}")
+            print(f"Public IPv6: {ip_info['ip6']}")
+            print(f"Geolocation: {ip_info['geolocation']}")
+
+        return ip_info
+
+    # Function to get geolocation data by IP
+    def get_geolocation_by_ip(ip, retries=3, timeout=5, headers=None):
+        """
+        Fetches geolocation data for a given IP address.
+        """
+        url = f"https://ipinfo.io/{ip}/json"
+        geolocation_data = fetch_geolocation(url, retries, timeout, headers)
+        return geolocation_data
+    #! here starting get_ip()
+    headers = {"User-Agent": user_agent()}
+    if ip is None:
+        try:
+            ip_data = get_public_ip(headers=headers, verbose=True)
+        except Exception as e:
+            print(e)
+            ip_data = None
+        return ip_data
+    else:
+        geolocation_data = get_geolocation_by_ip(ip, headers=headers)
+        return geolocation_data
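
The new get_ip() covers two paths: called with no argument it reports the caller's own public IPv4/IPv6 (via api.ipify.org and api6.ipify.org) plus geolocation from ipinfo.io; called with an address it returns the ipinfo.io record for that IP. Note that the cache dict is created inside get_ip(), so the five-minute expiry only helps within a single call. For example:

    from py2ls import netfinder as nt

    info = nt.get_ip()          # {"ip4": ..., "ip6": ..., "geolocation": {...}}
    geo = nt.get_ip("8.8.8.8")  # ipinfo.io record; keys such as "city" and "org"
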
py2ls/ocr.py CHANGED
@@ -486,6 +486,18 @@ def preprocess_img(
 
     return img_preprocessed
 
+def convert_image_to_bytes(image):
+    """
+    Convert a CV2 or numpy image to bytes for ddddocr.
+    """
+    import io
+    # Convert OpenCV image (numpy array) to PIL image
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+    # Save PIL image to a byte stream
+    img_byte_arr = io.BytesIO()
+    image.save(img_byte_arr, format='PNG')
+    return img_byte_arr.getvalue()
 
 def text_postprocess(
     text,
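
ddddocr's API takes raw image bytes rather than numpy arrays, which is what the new convert_image_to_bytes() helper provides: it converts OpenCV's BGR array to RGB, wraps it in a PIL Image, and PNG-encodes it into an in-memory buffer. Roughly:

    import cv2

    img = cv2.imread("captcha.png")          # BGR numpy array
    png_bytes = convert_image_to_bytes(img)  # bytes accepted by ddddocr classification/detect
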
@@ -604,10 +616,11 @@ def get_text(
     """
     )
 
-    models = ["easyocr", "paddleocr", "pytesseract"]
+    models = ["easyocr", "paddleocr", "pytesseract","ddddocr"]
     model = strcmp(model, models)[0]
     lang = lang_auto_detect(lang, model)
     if isinstance(image, str):
+        dir_img=image
         image = cv2.imread(image)
 
     # Ensure lang is always a list
@@ -705,9 +718,10 @@ def get_text(
         ) # PaddleOCR supports only one language at a time
         result = ocr.ocr(image_process, **kwargs)
         detections = []
-        for line in result[0]:
-            bbox, (text, score) = line
-            detections.append((bbox, text, score))
+        if result[0] is not None:
+            for line in result[0]:
+                bbox, (text, score) = line
+                detections.append((bbox, text, score))
         if postprocess is None:
             postprocess = dict(
                 spell_check=True,
@@ -787,7 +801,49 @@ def get_text(
         else:
             # by default, return all detection info
            return detections
+    elif "ddddocr" in model.lower():
+        import ddddocr
+
+        ocr = ddddocr.DdddOcr(det=False, ocr=True)
+        image_bytes = convert_image_to_bytes(image_process)
+
+        results = ocr.classification(image_bytes) # Text extraction
+
+        # Optional: Perform detection for bounding boxes
+        detections = []
+        if kwargs.get("det", False):
+            det_ocr = ddddocr.DdddOcr(det=True)
+            det_results = det_ocr.detect(image_bytes)
+            for box in det_results:
+                top_left = (box[0], box[1])
+                bottom_right = (box[2], box[3])
+                detections.append((top_left, bottom_right))
 
+        if postprocess is None:
+            postprocess = dict(
+                spell_check=True,
+                clean=True,
+                filter=dict(min_length=2),
+                pattern=None,
+                merge=True,
+            )
+        text_corr = []
+        [
+            text_corr.extend(text_postprocess(text, **postprocess))
+            for _, text, _ in detections
+        ]
+        # Visualization
+        if show:
+            if ax is None:
+                ax = plt.gca()
+            image_vis = image.copy()
+            if detections:
+                for top_left, bottom_right in detections:
+                    cv2.rectangle(image_vis, top_left, bottom_right, box_color, 2)
+            image_vis = cv2.cvtColor(image_vis, cmap)
+            ax.imshow(image_vis)
+            ax.axis("off")
+        return detections
     else: # "pytesseract"
         if ax is None:
             ax = plt.gca()
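
With this branch in place, ddddocr becomes selectable alongside the other backends. As written, the branch returns detections (corner-pair boxes, only populated when det=True reaches it via kwargs) rather than the classified text held in results, and its (top_left, bottom_right) pairs do not match the (bbox, text, score) triples the shared postprocessing loop unpacks. A minimal call, assuming extra kwargs pass through to the branch as the diff suggests:

    from py2ls import ocr

    dets = ocr.get_text("captcha.png", model="ddddocr", det=True, show=False)
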