py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.27__py3-none-any.whl
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/usages_sns.json +6 -1
- py2ls/ips.py +1059 -114
- py2ls/ml2ls.py +758 -186
- py2ls/netfinder.py +204 -20
- py2ls/ocr.py +60 -4
- py2ls/plot.py +916 -141
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.27.dist-info}/METADATA +6 -1
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.27.dist-info}/RECORD +16 -14
- py2ls/data/usages_pd copy.json +0 -1105
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.27.dist-info}/WHEEL +0 -0
py2ls/netfinder.py
CHANGED
@@ -626,7 +626,7 @@ def filter_links(links, contains="html", driver="requ", booster=False):
         )
         if condition:
             filtered_links.append(link)
-    return filtered_links
+    return ips.unique(filtered_links)
 
 
 def find_domain(links):
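Note on the filter_links change: returning ips.unique(filtered_links) deduplicates the collected links. A minimal sketch of the behavior this relies on, assuming ips.unique is an order-preserving deduplicator (the actual helper lives in py2ls/ips.py):

    def unique_sketch(seq):
        # Keep the first occurrence of each item, preserving order.
        seen = set()
        return [x for x in seq if not (x in seen or seen.add(x))]

    links = ["https://a.com/x.pdf", "https://a.com/x.pdf", "https://b.com/y.pdf"]
    assert unique_sketch(links) == ["https://a.com/x.pdf", "https://b.com/y.pdf"]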
@@ -717,7 +717,7 @@ def downloader(
     kind=[".pdf"],
     contains=None,
     rm_folder=False,
-    booster=False,
+    booster=True,  # use find_links
     verbose=True,
     timeout=30,
     n_try=3,
@@ -726,7 +726,7 @@ def downloader(
 
     from requests.exceptions import ChunkedEncodingError, ConnectionError
 
-    if verbose:
+    if verbose and ips.run_once_within():
         print(
             "usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)"
         )
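Note on the verbose change: the usage banner now prints only when ips.run_once_within() also allows it, so repeated downloader calls do not spam the console. A rough sketch of such a guard, assuming run_once_within returns True at most once per time window (the real helper is in py2ls/ips.py and its signature may differ):

    import time

    _last_run = {}

    def run_once_within_sketch(seconds=60, key="default"):
        # True only if `seconds` have elapsed since the last True for this key.
        now = time.time()
        if now - _last_run.get(key, 0) >= seconds:
            _last_run[key] = now
            return True
        return False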
@@ -734,8 +734,11 @@ def downloader(
     def fname_corrector(fname, ext):
         if not ext.startswith("."):
             ext = "." + ext
-        if not fname.endswith(ext):
+        if not fname.endswith(ext):  # if not ext in fname:
             fname = fname[: -len(ext)] + ext
+        if not any(fname[: -len(ext)]):
+            from datetime import datetime
+            fname = datetime.now().strftime("%H%M%S") + ext
         return fname
 
     def check_and_modify_filename(directory, filename):
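Note on the fname_corrector change: when the stem left of the extension is empty (e.g. a bare ".pdf"), the filename now falls back to an HHMMSS timestamp. A standalone sketch of the corrected logic, mirroring the diff above:

    from datetime import datetime

    def fname_corrector(fname, ext):
        if not ext.startswith("."):
            ext = "." + ext
        if not fname.endswith(ext):
            fname = fname[: -len(ext)] + ext
        if not any(fname[: -len(ext)]):  # empty stem, e.g. ".pdf"
            fname = datetime.now().strftime("%H%M%S") + ext
        return fname

    print(fname_corrector("report.pdf", "pdf"))  # "report.pdf"
    print(fname_corrector(".pdf", "pdf"))        # e.g. "142305.pdf"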
@@ -784,8 +787,8 @@ def downloader(
             kind[i] = "." + kind[i]
     file_links_all = []
     for kind_ in kind:
-        if isinstance(contains, str):
-            contains = [contains]
+        # if isinstance(contains, str):
+        #     contains = [contains]
         if isinstance(url, str):
             if any(ext in url for ext in kind):
                 file_links = [url]
@@ -799,7 +802,7 @@ def downloader(
             if contains is not None:
                 file_links = filter_links(links_all, contains=contains + kind_)
             else:
-                file_links = links_all
+                file_links = filter_links(links_all, contains=kind_)  # links_all
         elif isinstance(url, list):
             links_all = url
             if contains is not None:
@@ -812,6 +815,7 @@ def downloader(
                 file_links = filter_links(links_all, contains=contains + kind_)
             else:
                 file_links = filter_links(links_all, contains=kind_)
+            file_links = ips.unique(file_links)
         if verbose:
             if file_links:
                 from pprint import pp
@@ -825,6 +829,7 @@ def downloader(
             file_links_all = [file_links]
         elif isinstance(file_links, list):
             file_links_all.extend(file_links)
+    file_links_all = ips.unique(file_links_all)
     if dir_save:
         if rm_folder:
             ips.rm_folder(dir_save)
@@ -847,7 +852,7 @@ def downloader(
                     )
                     if ext is None:
                         ext = kind_
-
+
                     if ext:
                         corrected_fname = fname_corrector(fnames[idx], ext)
                         corrected_fname = check_and_modify_filename(
@@ -860,13 +865,13 @@ def downloader(
                             datetime.now().strftime("%y%m%d_%H%M%S_")
                             + corrected_fname
                         )
-                    fpath_tmp = os.path.join(dir_save, corrected_fname)
+                    fpath_tmp = os.path.join(dir_save, corrected_fname)
                     with open(fpath_tmp, "wb") as file:
                         for chunk in response.iter_content(chunk_size=8192):
                             if chunk:  # Filter out keep-alive chunks
                                 file.write(chunk)
                     if verbose:
-                        print(f"Done
+                        print(f"Done⤵{fnames[idx]}")
                     else:
                         if verbose:
                             print(f"Unknown file type for {file_link}")
@@ -886,16 +891,7 @@ def downloader(
 
         if itry == n_try:
             print(f"Failed to download {file_link} after {n_try} attempts.")
-
-    # print(f"\n{len(fnames)} files were downloaded:")
-    if verbose:
-        from pprint import pp
-
-        if corrected_fname:
-            pp(corrected_fname)
-            print(f"\n\nsaved @:\n{dir_save}")
-        else:
-            pp(fnames)
+
 
 
 def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=True):
@@ -1612,3 +1608,191 @@ def ai(*args, **kwargs):
     if len(args) == 1 and isinstance(args[0], str):
         kwargs["query"] = args[0]
     return echo(**kwargs)
+
+
+#! get_ip()
+def get_ip(ip=None):
+    """
+    Usage:
+        from py2ls import netfinder as nt
+        ip = nt.get_ip()
+    """
+
+    import requests
+    import time
+    import logging
+    from datetime import datetime, timedelta
+
+    # Set up logging configuration
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        handlers=[
+            logging.StreamHandler(),
+            logging.FileHandler("public_ip_log.log"),  # Log to a file
+        ],
+    )
+
+    cache = {}
+
+    # Function to fetch IP addresses synchronously
+    def fetch_ip(url, retries, timeout, headers):
+        """
+        Synchronous function to fetch the IP address with retries.
+        """
+        for attempt in range(retries):
+            try:
+                response = requests.get(url, timeout=timeout, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.RequestException as e:
+                logging.error(f"Attempt {attempt + 1} failed: {e}")
+                if attempt < retries - 1:
+                    time.sleep(2**attempt)  # Exponential backoff
+                else:
+                    logging.error("Max retries reached.")
+                    return {"error": f"Error fetching IP: {e}"}
+            except requests.Timeout:
+                logging.error("Request timed out")
+                time.sleep(2**attempt)
+        return {"error": "Failed to fetch IP after retries"}
+
+    # Function to fetch geolocation synchronously
+    def fetch_geolocation(url, retries, timeout, headers):
+        """
+        Synchronous function to fetch geolocation data by IP address.
+        """
+        for attempt in range(retries):
+            try:
+                response = requests.get(url, timeout=timeout, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.RequestException as e:
+                logging.error(f"Geolocation request attempt {attempt + 1} failed: {e}")
+                if attempt < retries - 1:
+                    time.sleep(2**attempt)  # Exponential backoff
+                else:
+                    logging.error("Max retries reached.")
+                    return {"error": f"Error fetching geolocation: {e}"}
+            except requests.Timeout:
+                logging.error("Geolocation request timed out")
+                time.sleep(2**attempt)
+        return {"error": "Failed to fetch geolocation after retries"}
+
+    # Main function to get public IP and geolocation
+    def get_public_ip(
+        ip4=True,
+        ip6=True,
+        verbose=True,
+        retries=3,
+        timeout=5,
+        geolocation=True,
+        headers=None,
+        cache_duration=5,
+    ):
+        """
+        Synchronously fetches public IPv4 and IPv6 addresses, along with optional geolocation info.
+        """
+        # Use the cache if it's still valid
+        cache_key_ip4 = "public_ip4"
+        cache_key_ip6 = "public_ip6"
+        cache_key_geolocation = "geolocation"
+
+        if (
+            cache
+            and cache_key_ip4 in cache
+            and datetime.now() < cache[cache_key_ip4]["expires"]
+        ):
+            logging.info("Cache hit for IPv4, using cached data.")
+            ip4_data = cache[cache_key_ip4]["data"]
+        else:
+            ip4_data = None
+
+        if (
+            cache
+            and cache_key_ip6 in cache
+            and datetime.now() < cache[cache_key_ip6]["expires"]
+        ):
+            logging.info("Cache hit for IPv6, using cached data.")
+            ip6_data = cache[cache_key_ip6]["data"]
+        else:
+            ip6_data = None
+
+        if (
+            cache
+            and cache_key_geolocation in cache
+            and datetime.now() < cache[cache_key_geolocation]["expires"]
+        ):
+            logging.info("Cache hit for Geolocation, using cached data.")
+            geolocation_data = cache[cache_key_geolocation]["data"]
+        else:
+            geolocation_data = None
+
+        # Fetch IPv4 if requested
+        if ip4 and not ip4_data:
+            logging.info("Fetching IPv4...")
+            ip4_data = fetch_ip(
+                "https://api.ipify.org?format=json", retries, timeout, headers
+            )
+            cache[cache_key_ip4] = {
+                "data": ip4_data,
+                "expires": datetime.now() + timedelta(minutes=cache_duration),
+            }
+
+        # Fetch IPv6 if requested
+        if ip6 and not ip6_data:
+            logging.info("Fetching IPv6...")
+            ip6_data = fetch_ip(
+                "https://api6.ipify.org?format=json", retries, timeout, headers
+            )
+            cache[cache_key_ip6] = {
+                "data": ip6_data,
+                "expires": datetime.now() + timedelta(minutes=cache_duration),
+            }
+
+        # Fetch geolocation if requested
+        if geolocation and not geolocation_data:
+            logging.info("Fetching Geolocation...")
+            geolocation_data = fetch_geolocation(
+                "https://ipinfo.io/json", retries, timeout, headers
+            )
+            cache[cache_key_geolocation] = {
+                "data": geolocation_data,
+                "expires": datetime.now() + timedelta(minutes=cache_duration),
+            }
+
+        # Prepare the results
+        ip_info = {
+            "ip4": ip4_data.get("ip") if ip4_data else "N/A",
+            "ip6": ip6_data.get("ip") if ip6_data else "N/A",
+            "geolocation": geolocation_data if geolocation_data else "N/A",
+        }
+
+        # Verbose output if requested
+        if verbose:
+            print(f"Public IPv4: {ip_info['ip4']}")
+            print(f"Public IPv6: {ip_info['ip6']}")
+            print(f"Geolocation: {ip_info['geolocation']}")
+
+        return ip_info
+
+    # Function to get geolocation data by IP
+    def get_geolocation_by_ip(ip, retries=3, timeout=5, headers=None):
+        """
+        Fetches geolocation data for a given IP address.
+        """
+        url = f"https://ipinfo.io/{ip}/json"
+        geolocation_data = fetch_geolocation(url, retries, timeout, headers)
+        return geolocation_data
+
+    #! here starting get_ip()
+    headers = {"User-Agent": user_agent()}
+    if ip is None:
+        try:
+            ip_data = get_public_ip(headers=headers, verbose=True)
+        except Exception as e:
+            print(e)
+            ip_data = None
+        return ip_data
+    else:
+        geolocation_data = get_geolocation_by_ip(ip, headers=headers)
+        return geolocation_data
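Usage of the new helper, per its docstring: call it with no arguments to report the machine's own public IPv4/IPv6 and geolocation (cached for cache_duration minutes), or pass an address to look it up via ipinfo.io. The address below is only an example:

    from py2ls import netfinder as nt

    info = nt.get_ip()          # {'ip4': ..., 'ip6': ..., 'geolocation': {...}}
    geo = nt.get_ip("8.8.8.8")  # geolocation JSON from https://ipinfo.io/8.8.8.8/json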
py2ls/ocr.py
CHANGED
@@ -486,6 +486,18 @@ def preprocess_img(
 
     return img_preprocessed
 
+def convert_image_to_bytes(image):
+    """
+    Convert a CV2 or numpy image to bytes for ddddocr.
+    """
+    import io
+    # Convert OpenCV image (numpy array) to PIL image
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+    # Save PIL image to a byte stream
+    img_byte_arr = io.BytesIO()
+    image.save(img_byte_arr, format='PNG')
+    return img_byte_arr.getvalue()
 
 def text_postprocess(
     text,
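Usage sketch for convert_image_to_bytes: it bridges OpenCV's BGR arrays and ddddocr, which expects encoded image bytes. The file name below is hypothetical:

    import cv2
    import ddddocr
    from py2ls.ocr import convert_image_to_bytes

    img = cv2.imread("captcha.png")          # BGR numpy array (hypothetical file)
    img_bytes = convert_image_to_bytes(img)  # PNG-encoded bytes via PIL
    ocr = ddddocr.DdddOcr(det=False, ocr=True)
    print(ocr.classification(img_bytes))     # recognized text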
@@ -604,10 +616,11 @@ def get_text(
     """
     )
 
-    models = ["easyocr", "paddleocr", "pytesseract"]
+    models = ["easyocr", "paddleocr", "pytesseract", "ddddocr"]
     model = strcmp(model, models)[0]
     lang = lang_auto_detect(lang, model)
     if isinstance(image, str):
+        dir_img = image
         image = cv2.imread(image)
 
     # Ensure lang is always a list
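Note: strcmp(model, models)[0] resolves a loose model string to the closest entry in the list, so e.g. "dddd" should land on "ddddocr". A rough sketch of that fuzzy selection, assuming strcmp returns candidates best match first (the real helper lives in py2ls/ips.py):

    import difflib

    def strcmp_sketch(query, candidates):
        # Candidates sorted by similarity, best first.
        return difflib.get_close_matches(query.lower(), candidates, n=len(candidates), cutoff=0)

    models = ["easyocr", "paddleocr", "pytesseract", "ddddocr"]
    assert strcmp_sketch("dddd", models)[0] == "ddddocr"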
@@ -705,9 +718,10 @@ def get_text(
         )  # PaddleOCR supports only one language at a time
         result = ocr.ocr(image_process, **kwargs)
         detections = []
-        for line in result[0]:
-            bbox, (text, score) = line
-            detections.append((bbox, text, score))
+        if result[0] is not None:
+            for line in result[0]:
+                bbox, (text, score) = line
+                detections.append((bbox, text, score))
         if postprocess is None:
             postprocess = dict(
                 spell_check=True,
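The new guard matters because PaddleOCR's ocr() can yield [None] when nothing is detected, in which case iterating result[0] unguarded raises a TypeError (assumed behavior motivating the diff above):

    result = [None]  # what PaddleOCR may return for an empty image
    detections = []
    if result[0] is not None:
        for line in result[0]:
            bbox, (text, score) = line
            detections.append((bbox, text, score))
    print(detections)  # [] instead of a crash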
@@ -787,7 +801,49 @@ def get_text(
         else:
             # By default, return all detection info
             return detections
+    elif "ddddocr" in model.lower():
+        import ddddocr
+
+        ocr = ddddocr.DdddOcr(det=False, ocr=True)
+        image_bytes = convert_image_to_bytes(image_process)
+
+        results = ocr.classification(image_bytes)  # Text extraction
+
+        # Optional: Perform detection for bounding boxes
+        detections = []
+        if kwargs.get("det", False):
+            det_ocr = ddddocr.DdddOcr(det=True)
+            det_results = det_ocr.detect(image_bytes)
+            for box in det_results:
+                top_left = (box[0], box[1])
+                bottom_right = (box[2], box[3])
+                detections.append((top_left, bottom_right))
 
+        if postprocess is None:
+            postprocess = dict(
+                spell_check=True,
+                clean=True,
+                filter=dict(min_length=2),
+                pattern=None,
+                merge=True,
+            )
+        text_corr = []
+        [
+            text_corr.extend(text_postprocess(text, **postprocess))
+            for _, text, _ in detections
+        ]
+        # Visualization
+        if show:
+            if ax is None:
+                ax = plt.gca()
+            image_vis = image.copy()
+            if detections:
+                for top_left, bottom_right in detections:
+                    cv2.rectangle(image_vis, top_left, bottom_right, box_color, 2)
+            image_vis = cv2.cvtColor(image_vis, cmap)
+            ax.imshow(image_vis)
+            ax.axis("off")
+        return detections
     else:  # "pytesseract"
         if ax is None:
             ax = plt.gca()
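With ddddocr registered in the model list, captcha-style recognition is reachable through the same get_text entry point. A usage sketch (the path is hypothetical; note that this branch returns its detections list, and det=True routes through ddddocr's detection model as wired above):

    from py2ls import ocr

    res = ocr.get_text("captcha.png", model="ddddocr", show=False)
    boxes = ocr.get_text("captcha.png", model="ddddocr", det=True, show=False)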