py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/usages_sns.json +6 -1
- py2ls/ips.py +1059 -114
- py2ls/ml2ls.py +758 -186
- py2ls/netfinder.py +204 -20
- py2ls/ocr.py +60 -4
- py2ls/plot.py +916 -141
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.27.dist-info}/METADATA +6 -1
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.27.dist-info}/RECORD +16 -14
- py2ls/data/usages_pd copy.json +0 -1105
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.27.dist-info}/WHEEL +0 -0
py2ls/netfinder.py
CHANGED
@@ -626,7 +626,7 @@ def filter_links(links, contains="html", driver="requ", booster=False):
             )
         if condition:
             filtered_links.append(link)
-    return filtered_links
+    return ips.unique(filtered_links)
 
 
 def find_domain(links):
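filter_links() now dedupes its result through ips.unique() instead of returning the raw list, so an href repeated across a page is reported once. A minimal sketch of an order-preserving unique helper, assuming ips.unique behaves this way (the real helper lives in py2ls/ips.py and may differ):

def unique(lst):
    # Keep the first occurrence of each item, preserving order;
    # set(lst) alone would scramble the link ordering.
    seen = set()
    out = []
    for item in lst:
        if item not in seen:
            seen.add(item)
            out.append(item)
    return out

# unique(["a.pdf", "b.pdf", "a.pdf"]) -> ["a.pdf", "b.pdf"]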
@@ -717,7 +717,7 @@ def downloader(
     kind=[".pdf"],
     contains=None,
     rm_folder=False,
-    booster=False,
+    booster=True,  # use find_links
     verbose=True,
     timeout=30,
     n_try=3,
@@ -726,7 +726,7 @@ def downloader(
 
     from requests.exceptions import ChunkedEncodingError, ConnectionError
 
-    if verbose:
+    if verbose and ips.run_once_within():
         print(
             "usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)"
         )
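The usage banner is now gated by ips.run_once_within(), so it prints at most once within a time window instead of on every downloader() call. A plausible sketch of such a guard, assuming a module-level timestamp cache (the actual implementation in py2ls/ips.py may differ in signature and granularity):

import time

_last_fired = {}

def run_once_within(seconds=60, key="default"):
    # True only if this key has not fired within the last `seconds`.
    now = time.time()
    if now - _last_fired.get(key, 0.0) >= seconds:
        _last_fired[key] = now
        return True
    return False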
@@ -734,8 +734,11 @@ def downloader(
     def fname_corrector(fname, ext):
         if not ext.startswith("."):
             ext = "." + ext
-        if not fname.endswith(ext):
+        if not fname.endswith(ext):  # if not ext in fname:
             fname = fname[: -len(ext)] + ext
+        if not any(fname[: -len(ext)]):
+            from datetime import datetime
+            fname = datetime.now().strftime("%H%M%S") + ext
         return fname
 
     def check_and_modify_filename(directory, filename):
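The new any(fname[: -len(ext)]) check catches filenames whose stem is empty after the extension swap (e.g. a bare ".pdf"), substituting an HHMMSS timestamp so the download is not saved under an empty or hidden name. Hypothetical traces, not taken from the source:

fname_corrector("paper.htm", ".pdf")  # -> "paper.pdf" (extension swapped, stem kept)
fname_corrector(".pdf", ".pdf")       # empty stem -> e.g. "143205.pdf"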
@@ -784,8 +787,8 @@ def downloader(
             kind[i] = "." + kind[i]
     file_links_all = []
     for kind_ in kind:
-        if isinstance(contains, str):
-            contains = [contains]
+        # if isinstance(contains, str):
+        #     contains = [contains]
         if isinstance(url, str):
             if any(ext in url for ext in kind):
                 file_links = [url]
@@ -799,7 +802,7 @@ def downloader(
             if contains is not None:
                 file_links = filter_links(links_all, contains=contains + kind_)
             else:
-                file_links = links_all
+                file_links = filter_links(links_all, contains=kind_)  # links_all #
         elif isinstance(url, list):
             links_all = url
             if contains is not None:
@@ -812,6 +815,7 @@ def downloader(
                 file_links = filter_links(links_all, contains=contains + kind_)
             else:
                 file_links = filter_links(links_all, contains=kind_)
+        file_links = ips.unique(file_links)
         if verbose:
             if file_links:
                 from pprint import pp
@@ -825,6 +829,7 @@ def downloader(
             file_links_all = [file_links]
         elif isinstance(file_links, list):
             file_links_all.extend(file_links)
+    file_links_all = ips.unique(file_links_all)
     if dir_save:
         if rm_folder:
             ips.rm_folder(dir_save)
@@ -847,7 +852,7 @@ def downloader(
                 )
             if ext is None:
                 ext = kind_
-
+
             if ext:
                 corrected_fname = fname_corrector(fnames[idx], ext)
                 corrected_fname = check_and_modify_filename(
@@ -860,13 +865,13 @@ def downloader(
                         datetime.now().strftime("%y%m%d_%H%M%S_")
                         + corrected_fname
                     )
-                fpath_tmp = os.path.join(dir_save, corrected_fname)
+                    fpath_tmp = os.path.join(dir_save, corrected_fname)
                 with open(fpath_tmp, "wb") as file:
                     for chunk in response.iter_content(chunk_size=8192):
                         if chunk:  # Filter out keep-alive chunks
                             file.write(chunk)
                 if verbose:
-                    print(f"Done
+                    print(f"Done⤵{fnames[idx]}")
             else:
                 if verbose:
                     print(f"Unknown file type for {file_link}")
@@ -886,16 +891,7 @@ def downloader(
 
             if itry == n_try:
                 print(f"Failed to download {file_link} after {n_try} attempts.")
-
-    # print(f"\n{len(fnames)} files were downloaded:")
-    if verbose:
-        from pprint import pp
-
-        if corrected_fname:
-            pp(corrected_fname)
-            print(f"\n\nsaved @:\n{dir_save}")
-        else:
-            pp(fnames)
+
 
 
 def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=True):
@@ -1612,3 +1608,191 @@ def ai(*args, **kwargs):
     if len(args) == 1 and isinstance(args[0], str):
         kwargs["query"] = args[0]
     return echo(**kwargs)
+
+
+#! get_ip()
+def get_ip(ip=None):
+    """
+    Usage:
+        from py2ls import netfinder as nt
+        ip = nt.get_ip()
+    """
+
+    import requests
+    import time
+    import logging
+    from datetime import datetime, timedelta
+
+    # Set up logging configuration
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        handlers=[
+            logging.StreamHandler(),
+            logging.FileHandler("public_ip_log.log"),  # Log to a file
+        ],
+    )
+
+    cache = {}
+
+    # Function to fetch IP addresses synchronously
+    def fetch_ip(url, retries, timeout, headers):
+        """
+        Synchronous function to fetch the IP address with retries.
+        """
+        for attempt in range(retries):
+            try:
+                response = requests.get(url, timeout=timeout, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.RequestException as e:
+                logging.error(f"Attempt {attempt + 1} failed: {e}")
+                if attempt < retries - 1:
+                    time.sleep(2**attempt)  # Exponential backoff
+                else:
+                    logging.error("Max retries reached.")
+                    return {"error": f"Error fetching IP: {e}"}
+            except requests.Timeout:
+                logging.error("Request timed out")
+                time.sleep(2**attempt)
+        return {"error": "Failed to fetch IP after retries"}
+
+    # Function to fetch geolocation synchronously
+    def fetch_geolocation(url, retries, timeout, headers):
+        """
+        Synchronous function to fetch geolocation data by IP address.
+        """
+        for attempt in range(retries):
+            try:
+                response = requests.get(url, timeout=timeout, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.RequestException as e:
+                logging.error(f"Geolocation request attempt {attempt + 1} failed: {e}")
+                if attempt < retries - 1:
+                    time.sleep(2**attempt)  # Exponential backoff
+                else:
+                    logging.error("Max retries reached.")
+                    return {"error": f"Error fetching geolocation: {e}"}
+            except requests.Timeout:
+                logging.error("Geolocation request timed out")
+                time.sleep(2**attempt)
+        return {"error": "Failed to fetch geolocation after retries"}
+
+    # Main function to get public IP and geolocation
+    def get_public_ip(
+        ip4=True,
+        ip6=True,
+        verbose=True,
+        retries=3,
+        timeout=5,
+        geolocation=True,
+        headers=None,
+        cache_duration=5,
+    ):
+        """
+        Synchronously fetches public IPv4 and IPv6 addresses, along with optional geolocation info.
+        """
+        # Use the cache if it's still valid
+        cache_key_ip4 = "public_ip4"
+        cache_key_ip6 = "public_ip6"
+        cache_key_geolocation = "geolocation"
+
+        if (
+            cache
+            and cache_key_ip4 in cache
+            and datetime.now() < cache[cache_key_ip4]["expires"]
+        ):
+            logging.info("Cache hit for IPv4, using cached data.")
+            ip4_data = cache[cache_key_ip4]["data"]
+        else:
+            ip4_data = None
+
+        if (
+            cache
+            and cache_key_ip6 in cache
+            and datetime.now() < cache[cache_key_ip6]["expires"]
+        ):
+            logging.info("Cache hit for IPv6, using cached data.")
+            ip6_data = cache[cache_key_ip6]["data"]
+        else:
+            ip6_data = None
+
+        if (
+            cache
+            and cache_key_geolocation in cache
+            and datetime.now() < cache[cache_key_geolocation]["expires"]
+        ):
+            logging.info("Cache hit for Geolocation, using cached data.")
+            geolocation_data = cache[cache_key_geolocation]["data"]
+        else:
+            geolocation_data = None
+
+        # Fetch IPv4 if requested
+        if ip4 and not ip4_data:
+            logging.info("Fetching IPv4...")
+            ip4_data = fetch_ip(
+                "https://api.ipify.org?format=json", retries, timeout, headers
+            )
+            cache[cache_key_ip4] = {
+                "data": ip4_data,
+                "expires": datetime.now() + timedelta(minutes=cache_duration),
+            }
+
+        # Fetch IPv6 if requested
+        if ip6 and not ip6_data:
+            logging.info("Fetching IPv6...")
+            ip6_data = fetch_ip(
+                "https://api6.ipify.org?format=json", retries, timeout, headers
+            )
+            cache[cache_key_ip6] = {
+                "data": ip6_data,
+                "expires": datetime.now() + timedelta(minutes=cache_duration),
+            }
+
+        # Fetch geolocation if requested
+        if geolocation and not geolocation_data:
+            logging.info("Fetching Geolocation...")
+            geolocation_data = fetch_geolocation(
+                "https://ipinfo.io/json", retries, timeout, headers
+            )
+            cache[cache_key_geolocation] = {
+                "data": geolocation_data,
+                "expires": datetime.now() + timedelta(minutes=cache_duration),
+            }
+
+        # Prepare the results
+        ip_info = {
+            "ip4": ip4_data.get("ip") if ip4_data else "N/A",
+            "ip6": ip6_data.get("ip") if ip6_data else "N/A",
+            "geolocation": geolocation_data if geolocation_data else "N/A",
+        }
+
+        # Verbose output if requested
+        if verbose:
+            print(f"Public IPv4: {ip_info['ip4']}")
+            print(f"Public IPv6: {ip_info['ip6']}")
+            print(f"Geolocation: {ip_info['geolocation']}")
+
+        return ip_info
+
+    # Function to get geolocation data by IP
+    def get_geolocation_by_ip(ip, retries=3, timeout=5, headers=None):
+        """
+        Fetches geolocation data for a given IP address.
+        """
+        url = f"https://ipinfo.io/{ip}/json"
+        geolocation_data = fetch_geolocation(url, retries, timeout, headers)
+        return geolocation_data
+
+    #! here starting get_ip()
+    headers = {"User-Agent": user_agent()}
+    if ip is None:
+        try:
+            ip_data = get_public_ip(headers=headers, verbose=True)
+        except Exception as e:
+            print(e)
+            ip_data = None
+        return ip_data
+    else:
+        geolocation_data = get_geolocation_by_ip(ip, headers=headers)
+        return geolocation_data
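For reference, a minimal usage sketch of the newly added helper; the return shape is inferred from the code above, and the geolocation payload is whatever ipinfo.io returns:

from py2ls import netfinder as nt

info = nt.get_ip()           # own public IPs: {"ip4": ..., "ip6": ..., "geolocation": {...}}
geo = nt.get_ip("8.8.8.8")   # geolocation dict for a specific IP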
py2ls/ocr.py
CHANGED
@@ -486,6 +486,18 @@ def preprocess_img(
 
     return img_preprocessed
 
+def convert_image_to_bytes(image):
+    """
+    Convert a CV2 or numpy image to bytes for ddddocr.
+    """
+    import io
+    # Convert OpenCV image (numpy array) to PIL image
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+    # Save PIL image to a byte stream
+    img_byte_arr = io.BytesIO()
+    image.save(img_byte_arr, format='PNG')
+    return img_byte_arr.getvalue()
 
 def text_postprocess(
     text,
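convert_image_to_bytes() bridges OpenCV's BGR numpy arrays and ddddocr's bytes-only API by round-tripping through an in-memory PNG. A quick illustrative call (the file name is hypothetical):

import cv2

img = cv2.imread("captcha.png")           # numpy array, BGR channel order
png_bytes = convert_image_to_bytes(img)   # PNG-encoded bytes ready for ddddocr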
@@ -604,10 +616,11 @@ def get_text(
     """
     )
 
-    models = ["easyocr", "paddleocr", "pytesseract"]
+    models = ["easyocr", "paddleocr", "pytesseract", "ddddocr"]
     model = strcmp(model, models)[0]
     lang = lang_auto_detect(lang, model)
     if isinstance(image, str):
+        dir_img = image
         image = cv2.imread(image)
 
     # Ensure lang is always a list
@@ -705,9 +718,10 @@ def get_text(
         )  # PaddleOCR supports only one language at a time
         result = ocr.ocr(image_process, **kwargs)
         detections = []
-
-
-
+        if result[0] is not None:
+            for line in result[0]:
+                bbox, (text, score) = line
+                detections.append((bbox, text, score))
         if postprocess is None:
             postprocess = dict(
                 spell_check=True,
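The new result[0] is not None guard matters because PaddleOCR returns [None] for an image with no detectable text, which an unguarded loop would crash on; each detected line otherwise unpacks into a box plus a (text, confidence) pair. Roughly, as implied by the unpacking above:

# result == [[(bbox, ("hello", 0.98)),
#             (bbox, ("world", 0.95))]]   # one inner list per page
# result == [None]                        # nothing detected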
@@ -787,7 +801,49 @@ def get_text(
         else:
             # return all detection info by default
             return detections
+    elif "ddddocr" in model.lower():
+        import ddddocr
+
+        ocr = ddddocr.DdddOcr(det=False, ocr=True)
+        image_bytes = convert_image_to_bytes(image_process)
+
+        results = ocr.classification(image_bytes)  # Text extraction
+
+        # Optional: Perform detection for bounding boxes
+        detections = []
+        if kwargs.get("det", False):
+            det_ocr = ddddocr.DdddOcr(det=True)
+            det_results = det_ocr.detect(image_bytes)
+            for box in det_results:
+                top_left = (box[0], box[1])
+                bottom_right = (box[2], box[3])
+                detections.append((top_left, bottom_right))
 
+        if postprocess is None:
+            postprocess = dict(
+                spell_check=True,
+                clean=True,
+                filter=dict(min_length=2),
+                pattern=None,
+                merge=True,
+            )
+        text_corr = []
+        [
+            text_corr.extend(text_postprocess(text, **postprocess))
+            for _, text, _ in detections
+        ]
+        # Visualization
+        if show:
+            if ax is None:
+                ax = plt.gca()
+            image_vis = image.copy()
+            if detections:
+                for top_left, bottom_right in detections:
+                    cv2.rectangle(image_vis, top_left, bottom_right, box_color, 2)
+            image_vis = cv2.cvtColor(image_vis, cmap)
+            ax.imshow(image_vis)
+            ax.axis("off")
+        return detections
     else:  # "pytesseract"
         if ax is None:
             ax = plt.gca()
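For context, the ddddocr API used in this new branch operates on raw image bytes; a minimal standalone sketch (the file name is hypothetical):

import ddddocr

ocr = ddddocr.DdddOcr(det=False, ocr=True)   # recognition-only model
with open("captcha.png", "rb") as f:
    text = ocr.classification(f.read())      # returns the recognized string
print(text)

DdddOcr(det=True) loads a separate detection model, which the branch above uses for the optional bounding boxes.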