py2ls 0.2.4.31__py3-none-any.whl → 0.2.4.33__py3-none-any.whl
- py2ls/.git/index +0 -0
- py2ls/ips.py +1518 -554
- py2ls/netfinder.py +99 -0
- py2ls/ocr.py +140 -126
- py2ls/plot.py +612 -376
- {py2ls-0.2.4.31.dist-info → py2ls-0.2.4.33.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.31.dist-info → py2ls-0.2.4.33.dist-info}/RECORD +8 -8
- {py2ls-0.2.4.31.dist-info → py2ls-0.2.4.33.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
-import sys
+import sys
+import os
 from IPython.display import display
 from typing import List, Optional, Union
 
@@ -17,13 +18,15 @@ import warnings
 warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
 warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
 warnings.filterwarnings("ignore")
-import os
 import shutil
 import logging
 from pathlib import Path
 from datetime import datetime
+import re
+import stat
+import platform
 
-def run_once_within(duration=60,reverse=False): # default 60s
+def run_once_within(duration=60, reverse=False): # default 60s
     import time
 
     """
@@ -546,6 +549,7 @@ def is_text(s):
 
 from typing import Any, Union
 
+
 def share(*args, strict=True, n_shared=2, verbose=True):
     """
     check the shared elelements in two list.
@@ -591,13 +595,14 @@ def share(*args, strict=True, n_shared=2, verbose=True):
     elements2show = (
         shared_elements if len(shared_elements) < 10 else shared_elements[:5]
     )
-    tail =
+    tail = "" if len(shared_elements) < 10 else "......"
     elements2show.append(tail)
     print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
     print("********* checking shared elements *********")
     return shared_elements
 
-def shared(*args, n_shared=None, verbose=True,**kwargs):
+
+def shared(*args, n_shared=None, verbose=True, **kwargs):
     """
     check the shared elelements in two list.
     usage:
@@ -652,7 +657,8 @@ def shared(*args, n_shared=None, verbose=True,**kwargs):
     print("********* checking shared elements *********")
     return shared_elements
 
-def share_not(*args, n_shared=None, verbose=False):
+
+def share_not(*args, n_shared=None, verbose=False):
     """
     To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
     usage:
@@ -660,10 +666,12 @@ def share_not(*args, n_shared=None, verbose=False):
     list2 = [4, 5, 6, 7, 8]
     not_shared(list1,list2)# output [1,3]
     """
-    _common = shared(*args,
+    _common = shared(*args, n_shared=n_shared, verbose=verbose)
     list1 = flatten(args[0], verbose=verbose)
     _not_shared = [item for item in list1 if item not in _common]
     return _not_shared
+
+
 def not_shared(*args, n_shared=None, verbose=False):
     """
     To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
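The docstring above pins down the intended behavior; a minimal usage sketch, assuming these helpers are importable from py2ls.ips (the module this diff touches):

    from py2ls.ips import shared, not_shared

    list1 = [1, 2, 3]
    list2 = [4, 5, 6, 7, 8]
    common = shared(list1, list2)    # elements found in both lists
    rest = not_shared(list1, list2)  # per the docstring above: [1, 3], list1 order kept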
@@ -779,13 +787,23 @@ def strcmp(
         print(f"建议: {best_match}")
     return candidates[best_match_index], best_match_index
 
-def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
+
+def imgcmp(img: list,
+           method:str ="knn",
+           thr:float =0.75,
+           detector: str = "sift",
+           plot_:bool =True,
+           figsize=[12, 6],
+           grid_size=10,# only for grid detector
+           **kwargs):
     """
     Compare two images using SSIM, Feature Matching (SIFT), or KNN Matching.
 
     Parameters:
-    - img (list): List containing two image file paths [img1, img2].
+    - img (list): List containing two image file paths [img1, img2] or two numpy arrays.
     - method (str): Comparison method ('ssim', 'match', or 'knn').
+    - detector (str): Feature detector ('sift', 'grid', 'pixel').
+    - thr (float): Threshold for filtering matches.
     - plot_ (bool): Whether to display the results visually.
     - figsize (list): Size of the figure for plots.
 
@@ -796,15 +814,21 @@ def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
     import cv2
     import matplotlib.pyplot as plt
     from skimage.metrics import structural_similarity as ssim
+
     # Load images
-
-
+    if isinstance(img, list) and isinstance(img[0],str):
+        image1 = cv2.imread(img[0])
+        image2 = cv2.imread(img[1])
+        bool_cvt=True
+    else:
+        image1, image2 = np.array(img[0]),np.array(img[1])
+        bool_cvt=False
 
     if image1 is None or image2 is None:
         raise ValueError("Could not load one or both images. Check file paths.")
-    methods=[
-    method=strcmp(method, methods)[0]
-    if method ==
+    methods = ["ssim", "match", "knn"]
+    method = strcmp(method, methods)[0]
+    if method == "ssim":
         # Convert images to grayscale
         gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
         gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)
@@ -819,107 +843,187 @@ def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
         # Plot if needed
         if plot_:
             fig, ax = plt.subplots(1, 3, figsize=figsize)
-            ax[0].imshow(gray1, cmap=
+            ax[0].imshow(gray1, cmap="gray")
             ax[0].set_title("Image 1")
-            ax[1].imshow(gray2, cmap=
+            ax[1].imshow(gray2, cmap="gray")
             ax[1].set_title("Image 2")
-            ax[2].imshow(diff, cmap=
+            ax[2].imshow(diff, cmap="gray")
             ax[2].set_title("Difference (SSIM)")
             plt.tight_layout()
             plt.show()
-
+
         return diff, score
 
-    elif method in [
+    elif method in ["match", "knn"]:
         # Convert images to grayscale
         gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
-        gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)
+        gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)
+
+        if detector == "sift":
+            # SIFT detector
+            sift = cv2.SIFT_create()
+            keypoints1, descriptors1 = sift.detectAndCompute(gray1, None)
+            keypoints2, descriptors2 = sift.detectAndCompute(gray2, None)
+
+        elif detector == "grid":
+            # Grid-based detection
+            keypoints1, descriptors1 = [], []
+            keypoints2, descriptors2 = [], []
+
+            for i in range(0, gray1.shape[0], grid_size):
+                for j in range(0, gray1.shape[1], grid_size):
+                    patch1 = gray1[i:i + grid_size, j:j + grid_size]
+                    patch2 = gray2[i:i + grid_size, j:j + grid_size]
+                    if patch1.size > 0 and patch2.size > 0:
+                        keypoints1.append(cv2.KeyPoint(j + grid_size // 2, i + grid_size // 2, grid_size))
+                        keypoints2.append(cv2.KeyPoint(j + grid_size // 2, i + grid_size // 2, grid_size))
+                        descriptors1.append(np.mean(patch1))
+                        descriptors2.append(np.mean(patch2))
+
+            descriptors1 = np.array(descriptors1).reshape(-1, 1)
+            descriptors2 = np.array(descriptors2).reshape(-1, 1)
+
+        elif detector == "pixel":
+            # Pixel-based direct comparison
+            descriptors1 = gray1.flatten()
+            descriptors2 = gray2.flatten()
+            keypoints1 = [cv2.KeyPoint(x, y, 1) for y in range(gray1.shape[0]) for x in range(gray1.shape[1])]
+            keypoints2 = [cv2.KeyPoint(x, y, 1) for y in range(gray2.shape[0]) for x in range(gray2.shape[1])]
 
-
-
-
-        #
-
-
-
-        if
-
+        else:
+            raise ValueError("Invalid detector. Use 'sift', 'grid', or 'pixel'.")
+
+        # Handle missing descriptors
+        if descriptors1 is None or descriptors2 is None:
+            raise ValueError("Failed to compute descriptors for one or both images.")
+        # Ensure descriptors are in the correct data type
+        if descriptors1.dtype != np.float32:
+            descriptors1 = descriptors1.astype(np.float32)
+        if descriptors2.dtype != np.float32:
+            descriptors2 = descriptors2.astype(np.float32)
 
         # BFMatcher initialization
         bf = cv2.BFMatcher()
-
-        if method == 'match': # Cross-check matching
+        if method == "match":  # Cross-check matching
             bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
             matches = bf.match(descriptors1, descriptors2)
             matches = sorted(matches, key=lambda x: x.distance)
 
             # Filter good matches
-            good_matches = [
+            good_matches = [
+                m for m in matches if m.distance < thr * matches[-1].distance
+            ]
 
-        elif method ==
+        elif method == "knn":  # KNN matching with ratio test
+            bf = cv2.BFMatcher()
             matches = bf.knnMatch(descriptors1, descriptors2, k=2)
             # Apply Lowe's ratio test
-            good_matches = [m for m, n in matches if m.distance <
+            good_matches = [m for m, n in matches if m.distance < thr * n.distance]
 
         # Calculate similarity score
         similarity_score = len(good_matches) / min(len(keypoints1), len(keypoints2))
         print(f"Number of good matches: {len(good_matches)}")
         print(f"Similarity Score: {similarity_score:.4f}")
-
+        # Handle case where no good matches are found
        if len(good_matches) == 0:
            print("No good matches found.")
            return good_matches, 0.0, None

        # Identify matched keypoints
-        src_pts = np.float32([keypoints1[m.queryIdx].pt for m in good_matches]).reshape(
-
-
-
-
-
+        src_pts = np.float32([keypoints1[m.queryIdx].pt for m in good_matches]).reshape(
+            -1, 1, 2
+        )
+        dst_pts = np.float32([keypoints2[m.trainIdx].pt for m in good_matches]).reshape(
+            -1, 1, 2
+        )
         # Apply the homography to image2
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            # Calculate Homography using RANSAC
+            homography_matrix, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
+            h, w = image1.shape[:2]
+            warped_image2 = cv2.warpPerspective(image2, homography_matrix, (w, h))
+
+            # Plot result if needed
+            if plot_:
+                fig, ax = plt.subplots(1, 2, figsize=figsize)
+                ax[0].imshow(cv2.cvtColor(image1, cv2.COLOR_BGR2RGB)) if bool_cvt else ax[0].imshow(image1)
+                ax[0].set_title("Image 1")
+                ax[1].imshow(cv2.cvtColor(warped_image2, cv2.COLOR_BGR2RGB)) if bool_cvt else ax[1].imshow(warped_image2)
+                ax[1].set_title("Warped Image 2")
+                plt.tight_layout()
+                plt.show()
+        except Exception as e:
+            print(e)
 
         # Plot matches if needed
         if plot_:
-            result = cv2.drawMatches(
+            result = cv2.drawMatches(
+                image1, keypoints1, image2, keypoints2, good_matches, None, flags=2
+            )
             plt.figure(figsize=figsize)
-            plt.imshow(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))
+            plt.imshow(cv2.cvtColor(result, cv2.COLOR_BGR2RGB)) if bool_cvt else plt.imshow(result)
             plt.title(f"Feature Matches ({len(good_matches)} matches, Score: {similarity_score:.4f})")
-            plt.axis(
+            plt.axis("off")
             plt.show()
         # Identify unmatched keypoints
         matched_idx1 = [m.queryIdx for m in good_matches]
         matched_idx2 = [m.trainIdx for m in good_matches]
-
+        matched_kp1 = [kp for i, kp in enumerate(keypoints1) if i in matched_idx1]
+        matched_kp2 = [kp for i, kp in enumerate(keypoints2) if i in matched_idx2]
         unmatched_kp1 = [kp for i, kp in enumerate(keypoints1) if i not in matched_idx1]
         unmatched_kp2 = [kp for i, kp in enumerate(keypoints2) if i not in matched_idx2]
 
-        # Mark
-
-
+        # Mark keypoints on the images
+        img1_match = cv2.drawKeypoints(
+            image1,
+            matched_kp1,
+            None,
+            color=(0, 0, 255),
+            flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS,
+        )
+        img2_match = cv2.drawKeypoints(
+            image2,
+            matched_kp2,
+            None,
+            color=(0, 0, 255),
+            flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS,
+        )
+        img1_unmatch = cv2.drawKeypoints(
+            image1,
+            unmatched_kp1,
+            None,
+            color=(0, 0, 255),
+            flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS,
+        )
+        img2_unmatch = cv2.drawKeypoints(
+            image2,
+            unmatched_kp2,
+            None,
+            color=(0, 0, 255),
+            flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS,
+        )
 
-        # Display results
         if plot_:
             fig, ax = plt.subplots(1, 2, figsize=figsize)
-            ax[0].imshow(cv2.cvtColor(
+            ax[0].imshow(cv2.cvtColor(img1_unmatch, cv2.COLOR_BGR2RGB)) if bool_cvt else ax[0].imshow(img1_unmatch)
             ax[0].set_title("Unmatched Keypoints (Image 1)")
-            ax[1].imshow(cv2.cvtColor(
+            ax[1].imshow(cv2.cvtColor(img2_unmatch, cv2.COLOR_BGR2RGB)) if bool_cvt else ax[1].imshow(img2_unmatch)
             ax[1].set_title("Unmatched Keypoints (Image 2)")
+            ax[0].axis("off")
+            ax[1].axis("off")
             plt.tight_layout()
             plt.show()
-
+        if plot_:
+            fig, ax = plt.subplots(1, 2, figsize=figsize)
+            ax[0].imshow(cv2.cvtColor(img1_match, cv2.COLOR_BGR2RGB)) if bool_cvt else ax[0].imshow(img1_match)
+            ax[0].set_title("Matched Keypoints (Image 1)")
+            ax[1].imshow(cv2.cvtColor(img2_match, cv2.COLOR_BGR2RGB)) if bool_cvt else ax[1].imshow(img2_match)
+            ax[1].set_title("Matched Keypoints (Image 2)")
+            ax[0].axis("off")
+            ax[1].axis("off")
+            plt.tight_layout()
+            plt.show()
+        return good_matches, similarity_score#, homography_matrix
 
     else:
         raise ValueError("Invalid method. Use 'ssim', 'match', or 'knn'.")
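A hedged usage sketch of the new imgcmp signature (the image paths are placeholders): per the code above, method="ssim" returns the difference image plus a score, while "match"/"knn" return the good matches plus a similarity score.

    from py2ls.ips import imgcmp

    diff, score = imgcmp(["a.png", "b.png"], method="ssim", plot_=False)
    good, score = imgcmp(["a.png", "b.png"], method="knn",
                         detector="sift", thr=0.75, plot_=False)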
@@ -939,9 +1043,7 @@ def cn2pinyin(
     Args:
         cn_str (str): Chinese string to convert.
         sep (str): Separator for the output Pinyin string.
-
-            "finals","finals_tone","finals_tone2","finals_tone3",
-            "initials","bopomofo","bopomofo_first","cyrillic","pl",
+        fmt (Style): "normal","tone", "tone2","tone3","finals","finals_tone","finals_tone2","finals_tone3","initials","bopomofo","bopomofo_first","cyrillic","pl",
     Returns:
         cn_str: The Pinyin representation of the Chinese string.
     """
@@ -1017,6 +1119,7 @@ def counter(list_, verbose=True):
     # print(f"Return a list of the n most common elements:\n{c.most_common()}")
     # print(f"Compute the sum of the counts:\n{c.total()}")
 
+
 def dict2df(dict_, fill=None):
     len_max = 0
     for key, value in dict_.items():
@@ -1031,11 +1134,12 @@ def dict2df(dict_, fill=None):
         dict_[key] = value
     return pd.DataFrame.from_dict(dict_)
 
+
 def text2audio(
     text,
     method=None, # "pyttsx3","gTTS"
     rate=200,
-    slow=False
+    slow=False, # "gTTS"
     volume=1.0,
     voice=None,
     lang=None,
@@ -1056,16 +1160,38 @@ def text2audio(
     # )
     """
     if method is not None:
-        methods=["gTTS","pyttsx3","google"]
-        method=strcmp(method, methods)[0]
+        methods = ["gTTS", "pyttsx3", "google"]
+        method = strcmp(method, methods)[0]
     else:
         try:
-            text2audio(
+            text2audio(
+                text,
+                method="google",
+                rate=rate,
+                slow=slow,
+                volume=volume,
+                voice=voice,
+                lang=lang,
+                gender=gender,
+                age=age,
+                dir_save=dir_save,
+            )
         except Exception as e:
             print(e)
-            text2audio(
-
-
+            text2audio(
+                text,
+                method="pyttsx3",
+                rate=rate,
+                slow=slow,
+                volume=volume,
+                voice=voice,
+                lang=lang,
+                gender=gender,
+                age=age,
+                dir_save=dir_save,
+            )
+
+    if method == "pyttsx3":
         import pyttsx3
 
         try:
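The fallback chain added here means method=None first tries the "google" (gTTS) route and, on failure, retries with pyttsx3. A minimal sketch, assuming py2ls and at least one TTS backend are installed:

    from py2ls.ips import text2audio

    text2audio("hello world", method="google", dir_save="hello")  # ".mp3" appended if missing
    text2audio("hello world")  # method=None: gTTS first, pyttsx3 as fallback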
@@ -1140,27 +1266,29 @@ def text2audio(
             sys.exit()
         except SystemExit:
             pass
-    elif method.lower() in [
+    elif method.lower() in ["google", "gtts"]:
         from gtts import gTTS
+
         try:
             if lang is None:
                 from langdetect import detect
+
                 lang = detect(text)
             # Initialize gTTS with the provided parameters
             tts = gTTS(text=text, lang=lang, slow=slow)
         except Exception as e:
             print(f"An error occurred: {e}")
-
+
         print("not realtime reading...")
         if dir_save:
             if "." not in dir_save:
-                dir_save=dir_save+".mp3"
+                dir_save = dir_save + ".mp3"
             tts.save(dir_save)
             print(f"Audio saved to {dir_save}")
         else:
             dir_save = "temp_audio.mp3"
             if "." not in dir_save:
-                dir_save=dir_save+".mp3"
+                dir_save = dir_save + ".mp3"
             tts.save(dir_save)
         try:
             fopen(dir_save)
@@ -1624,6 +1752,7 @@ def img2pdf(dir_img, kind=None, page=None, dir_save=None, page_size="a4", dpi=30
     def set_dpi(x):
         dpix = dpiy = x
         return image2pdf.get_fixed_dpi_layout_fun((dpix, dpiy))
+
     if kind is None:
         _, kind = os.path.splitext(dir_img)
     if not kind.startswith("."):
@@ -1649,8 +1778,9 @@ def img2pdf(dir_img, kind=None, page=None, dir_save=None, page_size="a4", dpi=30
             imgs.append(path)
     else:
         imgs = [
-            # os.path.isdir(dir_img),
-            dir_img
+            # os.path.isdir(dir_img),
+            dir_img
+        ]
         print(imgs)
     if page_size:
         if isinstance(page_size, str):
@@ -2196,7 +2326,7 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     # Check data types
     data_types = df.dtypes
     # messages.append(f"Data types of columns:\n{data_types}")
-
+
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
         messages.append(
@@ -2347,33 +2477,36 @@ def fload(fpath, kind=None, **kwargs):
 
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        engine = kwargs.pop("engine", "pyarrow") # default: None
+        sep = kwargs.pop("sep", None) # default: ','
+        index_col = kwargs.pop("index_col", None) # default: None
+        memory_map = kwargs.pop("memory_map", False) # default: False
+        skipinitialspace = kwargs.pop("skipinitialspace", False) # default: False
+        encoding = kwargs.pop("encoding", "utf-8") # default: "utf-8"
+        on_bad_lines = kwargs.pop("on_bad_lines", "skip") # default: 'error'
+        comment = kwargs.pop("comment", None) # default: None
+        fmt = kwargs.pop("fmt", False) # default:
+        chunksize = kwargs.pop("chunksize", None) # default: None
+
+        # check filesize
+        f_size = round(os.path.getsize(fpath) / 1024 / 1024, 3)
+        if f_size >= 50: # 50 MB
             if chunksize is None:
-                chunksize
-                print(
+                chunksize = 5000
+                print(
+                    f"file size is {f_size}MB, then set the chunksize with {chunksize}"
+                )
         engine = "c" if chunksize else engine # when chunksize, recommend 'c'
-        low_memory = kwargs.pop("low_memory", True)# default: True
+        low_memory = kwargs.pop("low_memory", True) # default: True
         low_memory = (
             False if chunksize else True
-        ) # when chunksize, recommend low_memory=False # default:
+        ) # when chunksize, recommend low_memory=False # default:
        verbose = kwargs.pop("verbose", False)
        if run_once_within(reverse=True) and verbose:
            use_pd("read_csv", verbose=verbose)

-        if comment is None
+        if comment is None: # default: None
             comment = get_comment(
                 fpath, comment=None, encoding="utf-8", lines_to_check=5
             )
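load_csv now consumes each option with kwargs.pop(key, default) before forwarding the rest, so no keyword reaches pd.read_csv twice. A self-contained sketch of the idiom (names are illustrative, not from the package):

    def fetch(url, **kwargs):
        timeout = kwargs.pop("timeout", 10)  # consumed here with a default...
        retries = kwargs.pop("retries", 3)
        # ...so the remaining kwargs can be forwarded without duplication
        print(timeout, retries, kwargs)

    fetch("https://example.com", retries=5, verbose=True)  # -> 10 5 {'verbose': True}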
@@ -2503,7 +2636,9 @@ def fload(fpath, kind=None, **kwargs):
         try:
             sep2show = sep if sep != "\t" else "\\t"
             if verbose:
-                print(
+                print(
+                    f"trying with: engine={engine}, sep='{sep2show}'"
+                )
             # print(".")
             df = pd.read_csv(
                 fpath,
@@ -2524,12 +2659,12 @@ def fload(fpath, kind=None, **kwargs):
             if verbose:
                 (
                     display(df.head(2))
-                    if isinstance(df, pd.DataFrame)
+                    if isinstance(df, pd.DataFrame)
                     else display("it is not a DataFrame")
                 )
                 (
                     print(f"shape: {df.shape}")
-                    if isinstance(df, pd.DataFrame)
+                    if isinstance(df, pd.DataFrame)
                     else display("it is not a DataFrame")
                 )
             return df
@@ -2663,9 +2798,10 @@ def fload(fpath, kind=None, **kwargs):
         doc = Document(fpath)
         content = [para.text for para in doc.paragraphs]
         return content
-
+
     def load_rtf(file_path):
         from striprtf.striprtf import rtf_to_text
+
         try:
             with open(file_path, "r") as file:
                 rtf_content = file.read()
@@ -2715,7 +2851,7 @@
         "xml",
         "ipynb",
         "mtx",
-        "rtf"
+        "rtf",
     ]
     zip_types = [
         "gz",
@@ -2735,7 +2871,7 @@
     if kind not in supported_types:
         print(
             f'Warning:\n"{kind}" is not in the supported list '
-        ) # {supported_types}')
+        )  # {supported_types}')
 
     if kind == "docx":
         return load_docx(fpath)
@@ -2760,10 +2896,11 @@
         if run_once_within(reverse=True) and verbose:
             use_pd("read_pickle")
         try:
-            res_=pd.read_pickle(fpath, **kwargs)
+            res_ = pd.read_pickle(fpath, **kwargs)
         except Exception as e:
             import pickle
-
+
+            with open("sgd_classifier.pkl", "rb") as f:
                 res_ = pickle.load(f)
         return res_
     elif kind in ["ods", "ods", "odt"]:
@@ -2775,21 +2912,34 @@
         engine = kwargs.get("engine", "xlrd")
         kwargs.pop("engine", None)
         content = load_excel(fpath, engine=engine, **kwargs)
-
+        (
+            print(f"shape: {content.shape}")
+            if isinstance(content, pd.DataFrame) and verbose
+            else None
+        )
         display(content.head(3)) if isinstance(content, pd.DataFrame) else None
         return content
     elif kind == "xlsx":
         verbose = kwargs.pop("verbose", False)
         content = load_excel(fpath, **kwargs)
-
+        (
+            display(content.head(3))
+            if isinstance(content, pd.DataFrame) and verbose
+            else None
+        )
         print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
         return content
     elif kind == "mtx":
         from scipy.io import mmread
+
         verbose = kwargs.pop("verbose", False)
         dat_mtx = mmread(fpath)
         content = pd.DataFrame.sparse.from_spmatrix(dat_mtx, **kwargs)
-
+        (
+            display(content.head(3))
+            if isinstance(content, pd.DataFrame) and verbose
+            else None
+        )
         print(f"shape: {content.shape}")
         return content
     elif kind == "ipynb":
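fload dispatches on the inferred or explicit kind, as the branches above show; a hedged usage sketch with placeholder paths:

    from py2ls.ips import fload

    df = fload("table.csv", verbose=True)  # csv route: chunked when the file is large
    mat = fload("matrix.mtx")              # routed through scipy.io.mmread
    text = fload("notes.rtf")              # routed through striprtf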
@@ -2904,34 +3054,34 @@ def fopen(fpath):
     import os
     import platform
     import sys
+
     try:
         # Check if the file exists
         if not os.path.isfile(fpath):
             print(f"Error: The file does not exist - {fpath}")
             return
-
+
         # Get the system platform
         system = platform.system()
 
         # Platform-specific file opening commands
         if system == "Darwin": # macOS
-            os.system(f
+            os.system(f'open "{fpath}"')
         elif system == "Windows": # Windows
             # Ensure the path is handled correctly in Windows, escape spaces
-            os.system(f
+            os.system(f'start "" "{fpath}"')
         elif system == "Linux": # Linux
-            os.system(f
+            os.system(f'xdg-open "{fpath}"')
         elif system == "Java": # Java (or other unhandled systems)
             print(f"Opening {fpath} on unsupported system.")
         else:
             print(f"Unsupported OS: {system}")
-
+
         print(f"Successfully opened {fpath} with the default application.")
     except Exception as e:
         print(f"Error opening file {fpath}: {e}")
 
 
-
 def fupdate(fpath, content=None, how="head"):
     """
     Update a file by adding new content at the top and moving the old content to the bottom.
@@ -3346,9 +3496,10 @@ def fsave(
         except Exception as e:
             try:
                 import pickle
-
+
+                with open(fpath, "wb") as f:
                     pickle.dump(content, f)
-                print(
+                print("done!", fpath)
             except Exception as e:
                 raise ValueError(
                     f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
@@ -3508,9 +3659,9 @@ def isa(content, kind):
     """
     if "img" in kind.lower() or "image" in kind.lower():
         return is_image(content)
-    elif
+    elif "vid" in kind.lower():
         return is_video(content)
-    elif
+    elif "aud" in kind.lower():
         return is_audio(content)
     elif "doc" in kind.lower():
         return is_document(content)
@@ -3569,8 +3720,8 @@ def get_os(full=False, verbose=False):
     import os
     import subprocess
     from datetime import datetime, timedelta
-    from collections import defaultdict
 
+
     def get_os_type():
         os_name = sys.platform
         if "dar" in os_name:
@@ -3583,7 +3734,8 @@ def get_os(full=False, verbose=False):
         else:
             print(f"{os_name}, returned 'None'")
             return None
-
+    if not full:
+        return get_os_type()
     def get_os_info():
         """Get the detailed OS name, version, and other platform-specific details."""
 
@@ -3755,22 +3907,28 @@ def get_os(full=False, verbose=False):
 
     def get_system_uptime():
         """Returns system uptime as a human-readable string."""
-
-
-
+        try:
+            boot_time = datetime.fromtimestamp(psutil.boot_time())
+            uptime = datetime.now() - boot_time
+            return str(uptime).split(".")[0] # Remove microseconds
+        except:
+            return None
 
     def get_active_processes(limit=10):
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            processes = []
+            for proc in psutil.process_iter(
+                ["pid", "name", "cpu_percent", "memory_percent"]
+            ):
+                try:
+                    processes.append(proc.info)
+                except psutil.NoSuchProcess:
+                    pass
+            # Handle NoneType values by treating them as 0
+            processes.sort(key=lambda x: x["cpu_percent"] or 0, reverse=True)
+            return processes[:limit]
+        except:
+            return None
 
     def get_virtual_environment_info():
         """Checks if the script is running in a virtual environment and returns details."""
@@ -3801,19 +3959,22 @@
 
     def get_battery_status():
         """Returns battery status."""
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            battery = psutil.sensors_battery()
+            if battery:
+                time_left = (
+                    str(timedelta(seconds=battery.secsleft))
+                    if battery.secsleft != psutil.POWER_TIME_UNLIMITED
+                    else "Charging/Unlimited"
+                )
+                return {
+                    "Percentage": battery.percent,
+                    "Plugged In": battery.power_plugged,
+                    "Time Left": time_left,
+                }
+            return {"Status": "No battery detected"}
+        except:
+            return {"Status": "No battery detected"}
 
     def get_disk_io():
         """Returns disk I/O statistics."""
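The uptime, process, and battery helpers above are thin wrappers over psutil; the same calls work standalone (sketch assumes psutil is installed):

    from datetime import datetime, timedelta
    import psutil

    boot = datetime.fromtimestamp(psutil.boot_time())
    print("uptime:", str(datetime.now() - boot).split(".")[0])

    procs = [p.info for p in psutil.process_iter(["pid", "name", "cpu_percent"])]
    procs.sort(key=lambda p: p["cpu_percent"] or 0, reverse=True)  # treat None as 0
    print(procs[:3])

    battery = psutil.sensors_battery()
    if battery:
        left = (str(timedelta(seconds=battery.secsleft))
                if battery.secsleft != psutil.POWER_TIME_UNLIMITED
                else "Charging/Unlimited")
        print(battery.percent, battery.power_plugged, left)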
@@ -3899,8 +4060,8 @@ def get_os(full=False, verbose=False):
         "network": {},
         "network io": get_network_io(),
         "gpu": [],
-        "temperatures": get_temperatures(),
-        "battery": get_battery_status(),
+        # "temperatures": get_temperatures(),
+        # "battery": get_battery_status(),
         "active processes": get_active_processes(),
         "environment": {
             "user": os.getenv("USER", "Unknown"),
@@ -3984,27 +4145,26 @@
         pnrint(e)
     return res
 
-
-import stat
-import platform
+
 def listdir(
     rootdir,
     kind=None,
     sort_by="name",
     ascending=True,
-    contains=None
-    booster=False
-    depth
+    contains=None, # filter filenames using re
+    booster=False, # walk in subfolders
+    depth=0, # 0: no subfolders; None: all subfolders; [int 1,2,3]: levels of subfolders
     hidden=False, # Include hidden files/folders
     orient="list",
     output="df", # "df", 'list','dict','records','index','series'
     verbose=True,
-):
+):
     def is_hidden(filepath):
         """Check if a file or folder is hidden."""
         system = platform.system()
         if system == "Windows":
             import ctypes
+
             attribute = ctypes.windll.kernel32.GetFileAttributesW(filepath)
             if attribute == -1:
                 raise FileNotFoundError(f"File {filepath} not found.")
@@ -4019,6 +4179,7 @@ def listdir(
             return os.environ.get("USERNAME", "Unknown")
         else:
             import pwd
+
             return pwd.getpwuid(os.getuid()).pw_name
 
     if isinstance(kind, list):
@@ -4030,7 +4191,7 @@
             sort_by=sort_by,
             ascending=ascending,
             contains=contains,
-            depth=depth
+            depth=depth, # walk in subfolders
             hidden=hidden,
             orient=orient,
             output=output,
@@ -4046,21 +4207,21 @@
     i = 0
     f = {
         "name": [],
-
+        "kind": [],
         "length": [],
-        "basename":[],
+        "basename": [],
         "path": [],
         "created_time": [],
         "modified_time": [],
         "last_open_time": [],
         "size": [],
-        "permission":[],
-        "owner":[],
-        "rootdir":[],
+        "permission": [],
+        "owner": [],
+        "rootdir": [],
         "fname": [],
         "fpath": [],
-        "num":[],
-        "os":[]
+        "num": [],
+        "os": [],
     }
     root_depth = rootdir.rstrip(os.sep).count(os.sep)
     for dirpath, dirnames, ls in os.walk(rootdir):
@@ -4069,30 +4230,32 @@
         if depth is not None and current_depth > depth:
             dirnames[:] = [] # Prevent further traversal into subfolders
             continue
-
+
         if not hidden:
-            dirnames[:] = [
+            dirnames[:] = [
+                d for d in dirnames if not is_hidden(os.path.join(dirpath, d))
+            ]
             ls = [i for i in ls if not is_hidden(os.path.join(dirpath, i))]
 
         for dirname in dirnames:
-            if kind is not None and kind not in fd:
+            if kind is not None and kind not in fd: # do not check folders
                 continue
             if contains and not re.search(contains, dirname):
                 continue
             dirname_path = os.path.join(dirpath, dirname)
-            fpath = os.path.join(os.path.dirname(dirname_path), dirname)
+            fpath = os.path.join(os.path.dirname(dirname_path), dirname)
             try:
                 stats_file = os.stat(fpath)
             except Exception as e:
                 print(e)
                 continue
             filename, file_extension = os.path.splitext(dirname)
-            file_extension = file_extension if file_extension!=
+            file_extension = file_extension if file_extension != "" else None
             f["name"].append(filename)
-            f[
+            f["kind"].append(file_extension)
             f["length"].append(len(filename))
             f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
-            f[
+            f["basename"].append(os.path.basename(dirname_path))
             f["path"].append(os.path.join(os.path.dirname(dirname_path), dirname))
             f["created_time"].append(
                 pd.to_datetime(int(os.path.getctime(dirname_path)), unit="s")
@@ -4110,7 +4273,7 @@
             f["fpath"].append(fpath) # will be removed
             i += 1
         for item in ls:
-            if kind in fd
+            if kind in fd: # only check folders
                 continue
             if contains and not re.search(contains, item):
                 continue
@@ -4127,7 +4290,16 @@
             is_file = kind.lower() in file_extension.lower() and (
                 os.path.isfile(item_path)
             )
-            if kind in [
+            if kind in [
+                ".doc",
+                ".img",
+                ".zip",
+                ".code",
+                ".file",
+                ".image",
+                ".video",
+                ".audio",
+            ]: # 选择大的类别
                 if kind != ".folder" and not isa(item_path, kind):
                     continue
             elif kind in [".all"]:
@@ -4135,13 +4307,13 @@
             else: # 精确到文件的后缀
                 if not is_folder and not is_file:
                     continue
-            file_extension = file_extension if file_extension!=
+            file_extension = file_extension if file_extension != "" else None
             f["name"].append(filename)
-            f[
+            f["kind"].append(file_extension)
             f["length"].append(len(filename))
             f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
-            f[
-            f["path"].append(os.path.join(os.path.dirname(item_path), item))
+            f["basename"].append(os.path.basename(item_path))
+            f["path"].append(os.path.join(os.path.dirname(item_path), item))
             f["created_time"].append(
                 pd.to_datetime(int(os.path.getctime(item_path)), unit="s")
             )
@@ -4152,7 +4324,9 @@
                 pd.to_datetime(int(os.path.getatime(item_path)), unit="s")
             )
             f["permission"].append(stat.filemode(stats_file.st_mode)),
-            f["owner"].append(
+            f["owner"].append(
+                os.getlogin() if platform.system() != "Windows" else "N/A"
+            ),
             f["fname"].append(filename) # will be removed
             f["fpath"].append(fpath) # will be removed
             f["rootdir"].append(dirpath)
@@ -4162,11 +4336,28 @@
     f["os"] = get_os() # os.uname().machine
     # if not booster: # go deeper subfolders
     # break
-
+    # * convert to pd.DataFrame
     f = pd.DataFrame(f)
-    f=f[
-
-
+    f = f[
+        [
+            "basename",
+            "name",
+            "kind",
+            "length",
+            "size",
+            "num",
+            "path",
+            "created_time",
+            "modified_time",
+            "last_open_time",
+            "rootdir",
+            "permission",
+            "owner",
+            "os",
+            "fname",
+            "fpath",
+        ]
+    ]
     if "nam" in sort_by.lower():
         f = sort_kind(f, by="name", ascending=ascending)
     elif "crea" in sort_by.lower():
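A usage sketch of the extended listdir signature (rootdir and patterns are placeholders):

    from py2ls.ips import listdir

    # DataFrame of .py files, largest first, without entering subfolders
    df = listdir(".", kind=".py", sort_by="size", ascending=False, depth=0)

    # contains= filters names with a regular expression; depth=None walks all levels
    tests = listdir(".", contains=r"test_", depth=None, output="df")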
@@ -4183,6 +4374,7 @@
         return f
     else:
         from box import Box
+
         if "l" in orient.lower(): # list # default
             res_output = Box(f.to_dict(orient="list"))
             return res_output
@@ -4195,6 +4387,7 @@
         if "se" in orient.lower(): # records
             return Box(f.to_dict(orient="series"))
 
+
 def listfunc(lib_name, opt="call"):
     if opt == "call":
         funcs = [func for func in dir(lib_name) if callable(getattr(lib_name, func))]
@@ -4206,6 +4399,7 @@ def listfunc(lib_name, opt="call"):
 def func_list(lib_name, opt="call"):
     return list_func(lib_name, opt=opt)
 
+
 def copy(src, dst, overwrite=False):
     """Copy a file from src to dst."""
     try:
@@ -4223,25 +4417,31 @@ def copy(src, dst, overwrite=False):
                 if overwrite:
                     dst.unlink()
                 else:
-                    dst = dst.with_name(
+                    dst = dst.with_name(
+                        f"{dst.stem}_{datetime.now().strftime('_%H%M%S')}{dst.suffix}"
+                    )
             shutil.copy(src, dst)
             print(f"\n Done! copy to {dst}\n")
         else:
-            dst = dst/src.name
+            dst = dst / src.name
             if dst.exists():
                 if overwrite:
                     shutil.rmtree(dst) # Remove existing directory
                 else:
-                    dst = dst.with_name(
+                    dst = dst.with_name(
+                        f"{dst.stem}_{datetime.now().strftime('%H%M%S')}"
+                    )
             shutil.copytree(src, dst)
             print(f"\n Done! copy to {dst}\n")
 
     except Exception as e:
         logging.error(f"Failed {e}")
-
+
+
 def cut(src, dst, overwrite=False):
     return move(src=src, dst=dst, overwrite=overwrite)
 
+
 def move(src, dst, overwrite=False):
     try:
         dir_par_dst = os.path.dirname(dst)
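Instead of overwriting, the branch above derives a time-stamped sibling name with pathlib; the idiom in isolation:

    from datetime import datetime
    from pathlib import Path

    dst = Path("out/report.pdf")
    if dst.exists():
        # e.g. out/report__214233.pdf (double underscore: stem + '_' + '_%H%M%S')
        dst = dst.with_name(f"{dst.stem}_{datetime.now().strftime('_%H%M%S')}{dst.suffix}")
    print(dst)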
@@ -4256,23 +4456,26 @@ def move(src, dst, overwrite=False):
                 # dst.unlink() # Delete the existing file
                 pass
             else:
-                dst = dst.with_name(
+                dst = dst.with_name(
+                    f"{dst.stem}_{datetime.now().strftime('_%H%M%S')}{dst.suffix}"
+                )
         shutil.move(src, dst)
         print(f"\n Done! moved to {dst}\n")
     except Exception as e:
         logging.error(f"Failed to move file from {src} to {dst}: {e}")
-
+
+
 def delete(fpath):
-    """Delete a file/folder."""
+    """Delete a file/folder."""
     try:
         fpath = Path(fpath)
-        if not fpath.is_dir():
+        if not fpath.is_dir(): # file
             if fpath.exists():
                 fpath.unlink()
                 print(f"\n Done! delete {fpath}\n")
             else:
                 print(f"File '{fpath}' does not exist.")
-        else
+        else: # folder
             if fpath.exists():
                 shutil.rmtree(fpath) # Remove existing directory
                 print(f"\n Done! delete {fpath}\n")
@@ -4280,27 +4483,31 @@ def delete(fpath):
                 print(f"Folder '{fpath}' does not exist.")
     except Exception as e:
         logging.error(f"Failed to delete {fpath}: {e}")
+
+
 def rename(fpath, dst, smart=True):
     """Rename a file or folder."""
     try:
-        src_kind,dst_kind = None,None
+        src_kind, dst_kind = None, None
         if smart:
-            dir_name_src=os.path.dirname(fpath)
-            dir_name_dst=os.path.dirname(dst)
-            src_kind=os.path.splitext(fpath)[1]
-            dst_kind=os.path.splitext(dst)[1]
-            if dir_name_dst!=dir_name_src:
-                dst=os.path.join(dir_name_src,dst)
+            dir_name_src = os.path.dirname(fpath)
+            dir_name_dst = os.path.dirname(dst)
+            src_kind = os.path.splitext(fpath)[1]
+            dst_kind = os.path.splitext(dst)[1]
+            if dir_name_dst != dir_name_src:
+                dst = os.path.join(dir_name_src, dst)
             if dst_kind is not None and src_kind is not None:
-                if dst_kind!=src_kind:
-                    dst=dst + src_kind
+                if dst_kind != src_kind:
+                    dst = dst + src_kind
         if os.path.exists(fpath):
-            os.rename(fpath,dst)
+            os.rename(fpath, dst)
             print(f"Done! rename to {dst}")
         else:
             print(f"Failed: {fpath} does not exist.")
     except Exception as e:
         logging.error(f"Failed to rename {fpath} to {dst}: {e}")
+
+
 def mkdir_nest(fpath: str) -> str:
     """
     Create nested directories based on the provided file path.
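With smart=True, rename fills in whatever the destination omits: the source folder and the source extension. A sketch (paths are placeholders):

    from py2ls.ips import rename

    # no folder and no extension given, so both come from the source:
    rename("data/report.txt", "summary")  # -> data/summary.txt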
@@ -4319,9 +4526,13 @@ def mkdir_nest(fpath: str) -> str:
     dir_parts = fpath.split(f_slash) # Split the path by the OS-specific separator
 
     # Start creating directories from the root to the desired path
-    root_dir = os.path.splitdrive(fpath)[
-
-
+    root_dir = os.path.splitdrive(fpath)[
+        0
+    ] # Get the root drive on Windows (e.g., 'C:')
+    current_path = (
+        root_dir if root_dir else f_slash
+    ) # Start from the root directory or POSIX '/'
+
     for part in dir_parts:
         if part:
             current_path = os.path.join(current_path, part)
@@ -4346,7 +4557,7 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     - str: The path of the created directory or an error message.
     """
     rootdir = []
-    pardir= mkdir_nest(pardir)
+    pardir = mkdir_nest(pardir)
     if chdir is None:
         return pardir
     else:
@@ -4465,6 +4676,7 @@ def figsave(*args, dpi=300):
         img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
     elif isinstance(img, np.ndarray):
         import cv2
+
         # Check the shape of the image to determine color mode
         if img.ndim == 2:
             # Grayscale image
@@ -4496,8 +4708,13 @@ def figsave(*args, dpi=300):
             )
         else:
             plt.savefig(
-                fname,
-
+                fname,
+                format=ftype.lower(),
+                dpi=dpi,
+                bbox_inches="tight",
+                transparent=True,
+                pad_inches=0,
+            )
     elif ftype.lower() == "emf":
         plt.savefig(fname, format="emf", dpi=dpi, bbox_inches="tight", pad_inches=0)
     elif ftype.lower() == "fig":
@@ -4534,6 +4751,7 @@ def is_num(s):
 def isnum(s):
     return is_num(s)
 
+
 def is_image(fpath):
     """
     Determine if a given file is an image based on MIME type and file extension.
@@ -4544,37 +4762,60 @@ def is_image(fpath):
     Returns:
         bool: True if the file is a recognized image, False otherwise.
     """
-    import
-
-
-
-
-
-
-
-
-
-
-
-
-
+    from PIL import Image
+    if isinstance(fpath,str):
+        import mimetypes
+
+        # Known image MIME types
+        image_mime_types = {
+            "image/jpeg",
+            "image/png",
+            "image/gif",
+            "image/bmp",
+            "image/webp",
+            "image/tiff",
+            "image/x-icon",
+            "image/svg+xml",
+            "image/heic",
+            "image/heif",
+        }
 
-
-
-
-
-
+        # Known image file extensions
+        image_extensions = {
+            ".jpg",
+            ".jpeg",
+            ".png",
+            ".gif",
+            ".bmp",
+            ".webp",
+            ".tif",
+            ".tiff",
+            ".ico",
+            ".svg",
+            ".heic",
+            ".heif",
+            ".fig",
+            ".jpg",
+        }
 
-
-
+        # Get MIME type using mimetypes
+        mime_type, _ = mimetypes.guess_type(fpath)
 
-
-
-
+        # Check MIME type
+        if mime_type in image_mime_types:
+            return True
 
-
-
-
+        # Fallback: Check file extension
+        ext = os.path.splitext(fpath)[
+            -1
+        ].lower() # Get the file extension and ensure lowercase
+        if ext in image_extensions:
+            return True
+
+        return False
+
+    elif isinstance(fpath, Image.Image):
+        # If the input is a PIL Image object
         return True
 
     return False
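is_image now also accepts an in-memory PIL image, not just a path; a short sketch:

    from PIL import Image
    from py2ls.ips import is_image

    print(is_image("photo.jpg"))               # True via MIME type or extension
    print(is_image(Image.new("RGB", (4, 4))))  # True: a PIL.Image.Image instance
    print(is_image("notes.txt"))               # False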
@@ -4590,6 +4831,7 @@ def is_video(fpath):
         bool: True if the file is a recognized video, False otherwise.
     """
     import mimetypes
+
     # Known video MIME types
     video_mime_types = {
         "video/mp4",
@@ -4610,8 +4852,22 @@ def is_video(fpath):
 
     # Known video file extensions
     video_extensions = {
-        ".mp4",
-        ".
+        ".mp4",
+        ".mov",
+        ".avi",
+        ".mkv",
+        ".flv",
+        ".webm",
+        ".ogv",
+        ".wmv",
+        ".mpg",
+        ".mpeg",
+        ".3gp",
+        ".mpeg2",
+        ".asf",
+        ".ts",
+        ".m4v",
+        ".divx",
     }
 
     # Get MIME type using mimetypes
@@ -4622,12 +4878,15 @@ def is_video(fpath):
         return True
 
     # Fallback: Check file extension
-    ext = os.path.splitext(fpath)[
+    ext = os.path.splitext(fpath)[
+        -1
+    ].lower() # Get the file extension and ensure lowercase
     if ext in video_extensions:
         return True
 
     return False
 
+
 def is_document(fpath):
     """
     Determine if a given file is a document based on MIME type and file extension.
@@ -4639,6 +4898,7 @@ def is_document(fpath):
         bool: True if the file is a recognized document, False otherwise.
     """
     import mimetypes
+
     # Define known MIME types for documents
     document_mime_types = {
         "text/",
@@ -4679,18 +4939,23 @@ def is_document(fpath):
 
     # Get MIME type
     mime_type, _ = mimetypes.guess_type(fpath)
-
+
     # Check MIME type
-    if mime_type and any(
+    if mime_type and any(
+        mime_type.startswith(doc_type) for doc_type in document_mime_types
+    ):
         return True
 
     # Fallback: Check file extension
-    ext = os.path.splitext(fpath)[
+    ext = os.path.splitext(fpath)[
+        -1
+    ].lower() # Get the extension, ensure it's lowercase
     if ext in document_extensions:
         return True
 
     return False
 
+
 def is_audio(fpath):
     """
     Determine if a given file is an audio file based on MIME type and file extension.
@@ -4702,6 +4967,7 @@ def is_audio(fpath):
         bool: True if the file is a recognized audio file, False otherwise.
     """
     import mimetypes
+
     # Known audio MIME types
     audio_mime_types = {
         "audio/mpeg",
@@ -4720,8 +4986,19 @@ def is_audio(fpath):
 
     # Known audio file extensions
     audio_extensions = {
-        ".mp3",
-        ".
+        ".mp3",
+        ".wav",
+        ".ogg",
+        ".aac",
+        ".flac",
+        ".midi",
+        ".m4a",
+        ".aiff",
+        ".pcm",
+        ".wma",
+        ".ape",
+        ".alac",
+        ".opus",
     }
 
     # Get MIME type using mimetypes
@@ -4732,12 +5009,15 @@ def is_audio(fpath):
         return True
 
     # Fallback: Check file extension
-    ext = os.path.splitext(fpath)[
+    ext = os.path.splitext(fpath)[
+        -1
+    ].lower() # Get the file extension and ensure lowercase
     if ext in audio_extensions:
         return True
 
     return False
 
+
 def is_code(fpath):
     """
     Determine if a given file is a code file based on file extension and optionally MIME type.
@@ -4751,16 +5031,37 @@ def is_code(fpath):
     """
     # Known programming and scripting file extensions
     code_extensions = {
-        ".m",
-        ".
+        ".m",
+        ".py",
+        ".ipynb",
+        ".js",
+        ".html",
+        ".css",
+        ".java",
+        ".cpp",
+        ".h",
+        ".cs",
+        ".go",
+        ".rs",
+        ".sh",
+        ".rb",
+        ".swift",
+        ".ts",
+        ".json",
+        ".xml",
+        ".yaml",
+        ".toml",
+        ".bash",
+        ".r",
     }
 
     # Check file extension
-    ext = os.path.splitext(fpath)[-1].lower()
+    ext = os.path.splitext(fpath)[-1].lower()
     if ext in code_extensions:
-        return True
+        return True
     return False
-
+
+
 def is_zip(fpath):
     import mimetypes
 
@@ -4828,6 +5129,105 @@ def str2list(str_):
     [l.append(x) for x in str_]
     return l
 
+
+def str2words(content, method="combined", custom_dict=None, sym_spell_params=None, use_threading=True):
+    """
+    Ultimate text correction function supporting multiple methods,
+    lists or strings, and domain-specific corrections.
+
+    Parameters:
+        content (str or list): Input text or list of strings to correct.
+        method (str): Correction method ('textblob', 'sym', 'combined').
+        custom_dict (dict): Custom dictionary for domain-specific corrections.
+        sym_spell_params (dict): Parameters for initializing SymSpell.
+
+    Returns:
+        str or list: Corrected text or list of corrected strings.
+    """
+    from textblob import TextBlob
+    from symspellpy import SymSpell, Verbosity
+    from functools import lru_cache
+    import pkg_resources
+    from concurrent.futures import ThreadPoolExecutor
+
+    def initialize_symspell(max_edit_distance=2, prefix_length=7):
+        """Initialize SymSpell for advanced spelling correction."""
+        sym_spell = SymSpell(max_edit_distance, prefix_length)
+        dictionary_path = pkg_resources.resource_filename(
+            "symspellpy",
+            # "frequency_bigramdictionary_en_243_342.txt",
+            "frequency_dictionary_en_82_765.txt",
+        )
+
+        sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
+        return sym_spell
+
+    def segment_words(text, sym_spell):
+        """Segment concatenated words into separate words."""
+        segmented = sym_spell.word_segmentation(text)
+        return segmented.corrected_string
+
+    @lru_cache(maxsize=1000)  # Cache results for repeated corrections
+    def advanced_correction(word, sym_spell):
+        """Correct a single word using SymSpell."""
+        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
+        return suggestions[0].term if suggestions else word
+
+    def apply_custom_corrections(word, custom_dict):
+        """Apply domain-specific corrections using a custom dictionary."""
+        return custom_dict.get(word.lower(), word)
+
+    def preserve_case(original_word, corrected_word):
+        """
+        Preserve the case of the original word in the corrected word.
+        """
+        if original_word.isupper():
+            return corrected_word.upper()
+        elif original_word[0].isupper():
+            return corrected_word.capitalize()
+        else:
+            return corrected_word.lower()
+
+    def process_string(text, method, sym_spell=None, custom_dict=None):
+        """
+        Process a single string for spelling corrections.
+        Handles TextBlob, SymSpell, and custom corrections.
+        """
+        if method in ("sym", "combined") and sym_spell:
+            text = segment_words(text, sym_spell)
+
+        if method in ("textblob", "combined"):
+            text = str(TextBlob(text).correct())
+
+        corrected_words = []
+        for word in text.split():
+            original_word = word
+            if method in ("sym", "combined") and sym_spell:
+                word = advanced_correction(word, sym_spell)
+
+            # Step 3: Apply custom corrections
+            if custom_dict:
+                word = apply_custom_corrections(word, custom_dict)
+            # Preserve original case
+            word = preserve_case(original_word, word)
+            corrected_words.append(word)
+
+        return " ".join(corrected_words)
+
+    # Initialize SymSpell if needed
+    sym_spell = None
+    if method in ("sym", "combined"):
+        if not sym_spell_params:
+            sym_spell_params = {"max_edit_distance": 2, "prefix_length": 7}
+        sym_spell = initialize_symspell(**sym_spell_params)
+
+    # Process lists or strings
+    if isinstance(content, list):
+        if use_threading:
+            with ThreadPoolExecutor() as executor:
+                corrected_content = list(executor.map(lambda x: process_string(x, method, sym_spell, custom_dict), content))
+            return corrected_content
+        else:
+            return [process_string(item, method, sym_spell, custom_dict) for item in content]
+    else:
+        return process_string(content, method, sym_spell, custom_dict)
 
 def load_img(fpath):
     """
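A minimal usage sketch of the new str2words (textblob and symspellpy must be installed; the exact corrections depend on the bundled frequency dictionary, so outputs shown are illustrative):

# Usage sketch for str2words, added in this release.
from py2ls.ips import str2words

print(str2words("Helloworld"))                          # segmented, e.g. "Hello world"
print(str2words(["speling", "korrect"], method="sym"))  # list in, list out
print(str2words("eeg signl", custom_dict={"signl": "signal"}))  # domain fix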
@@ -4851,7 +5251,7 @@ def load_img(fpath):
     raise OSError(f"Unable to open file '{fpath}' or it is not a valid image file.")
 
 
-def apply_filter(img, *args):
+def apply_filter(img, *args,verbose=True):
     # def apply_filter(img, filter_name, filter_value=None):
     """
     Apply the specified filter to the image.
@@ -4865,7 +5265,7 @@ def apply_filter(img, *args):
     from PIL import ImageFilter
 
     def correct_filter_name(filter_name):
-        if "
+        if all(["b" in filter_name.lower(),"ur" in filter_name.lower(), "box" not in filter_name.lower()]):
             return "BLUR"
         elif "cont" in filter_name.lower():
             return "Contour"
@@ -4929,10 +5329,11 @@ def apply_filter(img, *args):
 
     for arg in args:
         if isinstance(arg, str):
-            filter_name = arg
-            filter_name = correct_filter_name(filter_name)
+            filter_name = correct_filter_name(arg)
         else:
             filter_value = arg
+    if verbose:
+        print(f'processing {filter_name}')
     filter_name = filter_name.upper()  # Ensure filter name is uppercase
 
     # Supported filters
@@ -4976,12 +5377,13 @@ def apply_filter(img, *args):
         bands = filter_value if filter_value is not None else None
         return img.filter(supported_filters[filter_name](bands))
     else:
-        if filter_value is not None:
+        if filter_value is not None and verbose:
             print(
                 f"{filter_name} doesn't require a value for {filter_value}, but it remains unaffected"
             )
         return img.filter(supported_filters[filter_name])
 
+
 def detect_angle(image, by="median", template=None):
     """Detect the angle of rotation using various methods."""
     from sklearn.decomposition import PCA
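For orientation, a sketch of how the reworked apply_filter is called; the filter names are fuzzy-matched by correct_filter_name, and the image path here is hypothetical:

# Sketch: apply_filter takes a loosely spelled filter name plus an optional value.
from PIL import Image
from py2ls.ips import apply_filter

img = Image.open("photo.jpg")                            # hypothetical input
blurred = apply_filter(img, "blur")                      # matched to PIL BLUR
boxed = apply_filter(img, "box blur", 4, verbose=False)  # value-taking filter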
@@ -4989,8 +5391,11 @@ def detect_angle(image, by="median", template=None):
     from skimage.color import rgb2gray
     from scipy.fftpack import fftshift, fft2
     import numpy as np
-    import cv2
+    import cv2
+
     # Convert to grayscale
+    if np.array(image).shape[-1]>3:
+        image=np.array(image)[:,:,:3]
     gray_image = rgb2gray(image)
 
     # Detect edges using Canny edge detector
@@ -5002,9 +5407,10 @@ def detect_angle(image, by="median", template=None):
     if not lines and any(["me" in by, "pca" in by]):
         print("No lines detected. Adjust the edge detection parameters.")
         return 0
-
+    methods=['mean','median','pca','gradient orientation','template matching','moments','fft']
+    by=strcmp(by, methods)[0]
     # Hough Transform-based angle detection (Median/Mean)
-    if "me" in by:
+    if "me" in by.lower():
         angles = []
         for line in lines:
             (x0, y0), (x1, y1) = line
@@ -5027,7 +5433,7 @@ def detect_angle(image, by="median", template=None):
         return rotation_angle
 
     # PCA-based angle detection
-    elif "pca" in by:
+    elif "pca" in by.lower():
         y, x = np.nonzero(edges)
         if len(x) == 0:
             return 0
@@ -5037,14 +5443,14 @@ def detect_angle(image, by="median", template=None):
         return angle
 
     # Gradient Orientation-based angle detection
-    elif "gra" in by:
+    elif "gra" in by.lower():
         gx, gy = np.gradient(gray_image)
         angles = np.arctan2(gy, gx) * 180 / np.pi
         hist, bin_edges = np.histogram(angles, bins=360, range=(-180, 180))
         return bin_edges[np.argmax(hist)]
 
     # Template Matching-based angle detection
-    elif "temp" in by:
+    elif "temp" in by.lower():
         if template is None:
             # Automatically extract a template from the center of the image
             height, width = gray_image.shape
@@ -5067,7 +5473,7 @@ def detect_angle(image, by="median", template=None):
         return best_angle
 
     # Image Moments-based angle detection
-    elif "mo" in by:
+    elif "mo" in by.lower():
         moments = measure.moments_central(gray_image)
         angle = (
             0.5
@@ -5078,7 +5484,7 @@ def detect_angle(image, by="median", template=None):
         return angle
 
     # Fourier Transform-based angle detection
-    elif "fft" in by:
+    elif "fft" in by.lower():
         f = fft2(gray_image)
         fshift = fftshift(f)
         magnitude_spectrum = np.log(np.abs(fshift) + 1)
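The PCA branch estimates orientation from the first principal component of the edge-pixel coordinates; a self-contained sketch of that idea on synthetic data (not py2ls itself; the component's sign can flip, so the angle may come back as its 180-degree complement):

# Angle of the dominant direction of point coordinates via PCA.
import numpy as np
from sklearn.decomposition import PCA

t = np.linspace(0, 1, 200)
pts = np.c_[t, 0.3 * t + np.random.normal(0, 0.005, 200)]  # ~16.7-degree line
pca = PCA(n_components=2).fit(pts)
angle = np.degrees(np.arctan2(pca.components_[0, 1], pca.components_[0, 0]))
print(round(angle, 1))  # close to 16.7 (or -163.3 if the sign flips)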
@@ -5088,10 +5494,19 @@ def detect_angle(image, by="median", template=None):
         return angle
 
     else:
-        print(f"Unknown method {by}")
+        print(f"Unknown method {by}: supported methods: {methods}")
         return 0
 
-def imgsets(img, **kwargs):
+
+def imgsets(img,
+            auto:bool=True,
+            size=None,
+            figsize=None,
+            dpi:int=200,
+            show_axis:bool=False,
+            plot_:bool=True,
+            verbose:bool=False,
+            **kwargs):
     """
     Apply various enhancements and filters to an image using PIL's ImageEnhance and ImageFilter modules.
 
@@ -5125,6 +5540,9 @@ def imgsets(img, **kwargs):
     Note:
         The "color" and "enhance" enhancements are not implemented in this function.
     """
+
+    import matplotlib.pyplot as plt
+    from PIL import ImageEnhance, ImageOps,Image
     supported_filters = [
         "BLUR",
         "CONTOUR",
@@ -5144,8 +5562,22 @@ def imgsets(img, **kwargs):
         "BOX_BLUR",
         "MEDIAN_FILTER",
     ]
-
-
+    str_usage="""
+    imgsets(dir_img, auto=1, color=1.5, plot_=0)
+    imgsets(dir_img, color=2)
+    imgsets(dir_img, pad=(300, 300), bgcolor=(73, 162, 127), plot_=0)
+    imgsets(dir_img, contrast=0, color=1.2, plot_=0)
+    imgsets(get_clip(), flip="tb")  # flip top and bottom
+    imgsets(get_clip(), contrast=1, rm=[100, 5, 2])  # 'foreground_threshold', 'background_threshold' and 'erode_structure_size'
+    """
+    if run_once_within():
+        print(str_usage)
+
+    def gamma_correction(image, gamma=1.0, v_max=255):
+        # adjust gamma value
+        inv_gamma = 1.0 / gamma
+        lut = [int((i / float(v_max)) ** inv_gamma * int(v_max)) for i in range(int(v_max))]
+        return lut  # image.point(lut)
 
     def confirm_rembg_models(model_name):
         models_support = [
@@ -5169,37 +5601,52 @@ def imgsets(img, **kwargs):
 
     def auto_enhance(img):
         """
-        Automatically enhances the image based on its characteristics
+        Automatically enhances the image based on its characteristics, including brightness,
+        contrast, color range, sharpness, and gamma correction.
+
         Args:
             img (PIL.Image): The input image.
+
         Returns:
-            dict: A dictionary containing the optimal enhancement values.
+            dict: A dictionary containing the optimal enhancement values applied.
+            PIL.Image: The enhanced image.
         """
+        from PIL import Image, ImageEnhance, ImageOps, ImageFilter
+        import numpy as np
         # Determine the bit depth based on the image mode
-
-
-
-
-
+        try:
+            if img.mode in ["1", "L", "P", "RGB", "YCbCr", "LAB", "HSV"]:
+                bit_depth = 8
+            elif img.mode in ["RGBA", "CMYK"]:
+                bit_depth = 8
+            elif img.mode in ["I", "F"]:
+                bit_depth = 16
+            else:
+                raise ValueError("Unsupported image mode")
+        except:
             bit_depth = 8
-
-
-
-
-
-
+
+        # Initialize enhancement factors
+        enhancements = {
+            "brightness": 1.0,
+            "contrast": 0,# autocontrasted
+            "color": 1.35,
+            "sharpness": 1.0,
+            "gamma": 1.0
+        }
+
+        # Calculate brightness and contrast for each channel
         num_channels = len(img.getbands())
         brightness_factors = []
         contrast_factors = []
         for channel in range(num_channels):
            channel_histogram = img.split()[channel].histogram()
-
-
-            )
+            total_pixels = sum(channel_histogram)
+            brightness = sum(i * w for i, w in enumerate(channel_histogram)) / total_pixels
            channel_min, channel_max = img.split()[channel].getextrema()
            contrast = channel_max - channel_min
            # Adjust calculations based on bit depth
-            normalization_factor = 2**bit_depth - 1
+            normalization_factor = 2**bit_depth - 1
            brightness_factor = (
                1.0 + (brightness - normalization_factor / 2) / normalization_factor
            )
@@ -5208,37 +5655,62 @@ def imgsets(img, **kwargs):
            )
            brightness_factors.append(brightness_factor)
            contrast_factors.append(contrast_factor)
-        # Calculate the average brightness and contrast factors across channels
-        avg_brightness_factor = sum(brightness_factors) / num_channels
-        avg_contrast_factor = sum(contrast_factors) / num_channels
-        return {"brightness": avg_brightness_factor, "contrast": avg_contrast_factor}
 
-
-
+        # Calculate average brightness and contrast factors across channels
+        enhancements["brightness"] = sum(brightness_factors) / num_channels
+        # Adjust brightness and contrast
+        img = ImageEnhance.Brightness(img).enhance(enhancements["brightness"])
+
+        # # Automatic color enhancement (saturation)
+        # if img.mode == "RGB":
+        #     color_enhancer = ImageEnhance.Color(img)
+        #     color_histogram = np.array(img.histogram()).reshape(3, -1)
+        #     avg_saturation = np.mean([np.std(channel) for channel in color_histogram]) / normalization_factor
+        #     print(avg_saturation)
+        #     enhancements["color"] = min(0, max(0.5, 1.0 + avg_saturation))  # Clamp to a reasonable range
+        #     # img = color_enhancer.enhance(enhancements["color"])
+
+        # Adjust sharpness
+        sharpness_enhancer = ImageEnhance.Sharpness(img)
+        # Use edge detection to estimate sharpness need
+        edges = img.filter(ImageFilter.FIND_EDGES).convert("L")
+        avg_edge_intensity = np.mean(np.array(edges))
+        enhancements["sharpness"] = min(2.0, max(0.5, 1.0 + avg_edge_intensity / normalization_factor))
+        # img = sharpness_enhancer.enhance(enhancements["sharpness"])
+
+        # # Apply gamma correction
+        # def gamma_correction(image, gamma):
+        #     inv_gamma = 1.0 / gamma
+        #     lut = [min(255, max(0, int((i / 255.0) ** inv_gamma * 255))) for i in range(256)]
+        #     return image.point(lut)
+
+        # avg_brightness = np.mean(np.array(img.convert("L"))) / 255
+        # enhancements["gamma"] = min(2.0, max(0.5, 1.0 if avg_brightness > 0.5 else 1.2 - avg_brightness))
+        # img = gamma_correction(img, enhancements["gamma"])
+
+        # Return the enhancements and the enhanced image
+        return enhancements
 
     # Load image if input is a file path
     if isinstance(img, str):
         img = load_img(img)
-    img_update = img.copy()
-    # Auto-enhance image if requested
-
-    auto = kwargs.get("auto", False)
-    show = kwargs.get("show", True)
-    show_axis = kwargs.get("show_axis", False)
-    size = kwargs.get("size", None)
-    figsize = kwargs.get("figsize", None)
-    dpi = kwargs.get("dpi", 100)
+    img_update = img.copy()
 
     if auto:
         kwargs = {**auto_enhance(img_update), **kwargs}
-
+    params=["sharp","color","contrast","bright","crop","rotate",'size',"resize",
+            "thumbnail","cover","contain","filter","fit","pad",
+            "rem","rm","back","bg_color","cut",'gamma','flip']
     for k, value in kwargs.items():
+        k = strcmp(k, params)[0]  # correct the param name
        if "shar" in k.lower():
            enhancer = ImageEnhance.Sharpness(img_update)
            img_update = enhancer.enhance(value)
        elif all(
            ["col" in k.lower(), "bg" not in k.lower(), "background" not in k.lower()]
        ):
+            # *color
            enhancer = ImageEnhance.Color(img_update)
            img_update = enhancer.enhance(value)
        elif "contr" in k.lower():
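The per-channel brightness factor above maps a channel's mean intensity onto a multiplier centred at 1.0; a small standalone numeric check of that formula under the 8-bit assumption:

# factor = 1.0 + (mean_brightness - max/2) / max: a mid-grey channel
# (mean 127.5) maps to 1.0, darker ones below 1, brighter ones above.
normalization_factor = 2**8 - 1  # 255
for brightness in (63.75, 127.5, 191.25):
    factor = 1.0 + (brightness - normalization_factor / 2) / normalization_factor
    print(brightness, round(factor, 2))  # 0.75, 1.0, 1.25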
@@ -5246,8 +5718,11 @@ def imgsets(img, **kwargs):
                enhancer = ImageEnhance.Contrast(img_update)
                img_update = enhancer.enhance(value)
            else:
-
-
+                try:
+                    img_update = ImageOps.autocontrast(img_update)
+                    print("autocontrasted")
+                except Exception as e:
+                    print(f"Failed 'autocontrasted':{e}")
        elif "bri" in k.lower():
            enhancer = ImageEnhance.Brightness(img_update)
            img_update = enhancer.enhance(value)
@@ -5258,7 +5733,13 @@ def imgsets(img, **kwargs):
            value = detect_angle(img_update, by=value)
            print(f"rotated by {value}°")
            img_update = img_update.rotate(value)
-
+        elif 'flip' in k.lower():
+            if 'l' in value and 'r' in value:
+                # left/right
+                img_update = img_update.transpose(Image.FLIP_LEFT_RIGHT)
+            elif any(['u' in value and 'd' in value, 't' in value and 'b' in value]):
+                # up/down or top/bottom
+                img_update = img_update.transpose(Image.FLIP_TOP_BOTTOM)
        elif "si" in k.lower():
            if isinstance(value, tuple):
                value = list(value)
@@ -5270,13 +5751,17 @@ def imgsets(img, **kwargs):
            img_update = ImageOps.cover(img_update, size=value)
        elif "contain" in k.lower():
            img_update = ImageOps.contain(img_update, size=value)
-        elif "
+        elif "fi" in k.lower() and "t" in k.lower():  # filter
            if isinstance(value, dict):
+                if verbose:
+                    print(f"supported filter: {supported_filters}")
                for filter_name, filter_value in value.items():
-                    img_update = apply_filter(img_update, filter_name, filter_value)
+                    img_update = apply_filter(img_update, filter_name, filter_value,verbose=verbose)
            else:
                img_update = ImageOps.fit(img_update, size=value)
        elif "pad" in k.lower():
+            # *ImageOps.pad ensures that the resized image has the exact size specified by the size parameter while maintaining the aspect ratio.
+            # size: A tuple specifying the target size (width, height).
            img_update = ImageOps.pad(img_update, size=value)
        elif "rem" in k.lower() or "rm" in k.lower() or "back" in k.lower():
            from rembg import remove, new_session
@@ -5285,7 +5770,9 @@ def imgsets(img, **kwargs):
            session = new_session("isnet-general-use")
            img_update = remove(img_update, session=session)
        elif value and isinstance(value, (int, float, list)):
-
+            if verbose:
+                print("https://github.com/danielgatis/rembg/blob/main/USAGE.md")
+                print(f"rm=True # using default setting;\nrm=(240,10,10)\n'foreground_threshold'(240) and 'background_threshold' (10) values used to determine foreground and background pixels. \nThe 'erode_structure_size'(10) parameter specifies the size of the erosion structure to be applied to the mask.")
            if isinstance(value, int):
                value = [value]
            if len(value) < 2:
@@ -5327,8 +5814,11 @@ def imgsets(img, **kwargs):
            if len(value) == 3:
                value += (255,)
            img_update = remove(img_update, bgcolor=value)
+
+        # elif "ga" in k.lower() and "m" in k.lower():
+        #     img_update = gamma_correction(img_update, gamma=value)
    # Display the image if requested
-    if
+    if plot_:
        if figsize is None:
            plt.figure(dpi=dpi)
        else:
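The str_usage string embedded above doubles as documentation; combining it with the new explicit keyword parameters, typical calls look like this (paths are hypothetical, and rembg is only needed when the rm option is used):

# Usage sketch built from the str_usage examples in this release.
from py2ls.ips import imgsets

imgsets("photo.jpg", auto=1, color=1.5, plot_=0)          # auto-enhance, no preview
imgsets("photo.jpg", flip="lr", rotate="median", plot_=0) # mirror, then deskew
imgsets("photo.jpg", filter={"blur": 2}, pad=(300, 300), plot_=0)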
@@ -6355,13 +6845,13 @@ def _df_outlier(
     from scipy.stats import zscore
     from sklearn.ensemble import IsolationForest
     from sklearn.preprocessing import StandardScaler
-
+
     # Fill completely NaN columns with a default value (e.g., 0)
     data = data.copy()
     data.loc[:, data.isna().all()] = 0
     if columns is not None:
-        if isinstance(columns, (list,pd.core.indexes.base.Index)):
-            data=data[columns]
+        if isinstance(columns, (list, pd.core.indexes.base.Index)):
+            data = data[columns]
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
@@ -6527,6 +7017,7 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
     data = data.explode(column, ignore_index=True)
     return data
 
+
 def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
     """
     Purpose: transforms a datetime feature (like month or day) into a cyclic encoding for use in machine learning models, particularly neural networks.
@@ -6536,24 +7027,30 @@ def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
     data = df_cycle(data, 'month', 12)
     """
     if columns is None:
-        columns = list(
+        columns = list(
+            data.select_dtypes(include=np.number).columns
+        )  # If no columns specified, use all columns
     if max_val is None:
-        max_val = np.max(
+        max_val = np.max(
+            data[columns]
+        )  # If no max_val specified, use the maximum value across all columns
     if isinstance(columns, str):
-        columns = [
-
+        columns = [
+            columns
+        ]  # If a single column name is provided as a string, convert it to a list
+
     # Check if inplace is True, so we modify the original dataframe
     if inplace:
         # Modify the data in place, no return statement needed
         for col in columns:
-            data[col +
-            data[col +
+            data[col + "_sin"] = np.sin(2 * np.pi * data[col] / max_val)
+            data[col + "_cos"] = np.cos(2 * np.pi * data[col] / max_val)
     else:
         # If inplace is False, return the modified dataframe
         new_data = data.copy()
         for col in columns:
-            new_data[col +
-            new_data[col +
+            new_data[col + "_sin"] = np.sin(2 * np.pi * new_data[col] / max_val)
+            new_data[col + "_cos"] = np.cos(2 * np.pi * new_data[col] / max_val)
         return new_data
 
 
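The sin/cos pair makes the encoding continuous at the wrap-around point (month 12 sits next to month 1), which a single integer feature cannot express; a quick standalone check of the same formula:

# Cyclic encoding puts December next to January in (sin, cos) space.
import numpy as np
import pandas as pd

df = pd.DataFrame({"month": [1, 6, 12]})
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)
print(df.round(2))  # months 1 and 12 land at nearby points; month 6 is opposite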
@@ -6561,7 +7058,7 @@ def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
 def df_astype(
     data: pd.DataFrame,
     columns: Optional[Union[str, List[str]]] = None,
-    astype: str = None
+    astype: str = None,  # "datetime",
     skip_row: Union[str, list] = None,
     fmt: Optional[str] = None,
     inplace: bool = False,
@@ -6624,7 +7121,7 @@ def df_astype(
         "day",
         "month",
         "year",
-        "circular"
+        "circular",
     ]
     # If inplace is False, make a copy of the DataFrame
     if not inplace:
@@ -6720,12 +7217,12 @@ def df_astype(
             data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
             # print(f"Successfully converted '{column}' to timedelta.")
         elif astype == "circular":
-            max_val = kwargs.get(
-            data[column]=df_cycle(data=data,columns=column,max_val=max_val)
+            max_val = kwargs.get("max_val", None)
+            data[column] = df_cycle(data=data, columns=column, max_val=max_val)
         else:
             # Convert to other types (e.g., float, int)
-            if astype==
-                data[column] = data[column].astype(
+            if astype == "int":
+                data[column] = data[column].astype("float").astype("int")
             else:
                 data[column] = data[column].astype(astype)
             # print(f"Successfully converted '{column}' to {astype}.")
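A usage sketch of the circular conversion path (the frame and column are hypothetical; the branch delegates to df_cycle, so calling df_cycle directly is equivalent):

# Sketch: circular conversion of a month column, assuming the branch
# behaves like df_cycle with max_val taken from kwargs.
import pandas as pd
from py2ls.ips import df_astype

df = pd.DataFrame({"month": [1, 4, 7, 10, 12]})
out = df_astype(df, columns="month", astype="circular", max_val=12)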
@@ -6775,7 +7272,9 @@ def df_sort_values(data, column, by=None, ascending=True, inplace=True, **kwargs
         ).index.tolist()
 
         # Convert to a categorical type with the new order
-        data[column] = pd.Categorical(
+        data[column] = pd.Categorical(
+            data[column], categories=sorted_counts, ordered=True
+        )
         # Set ascending to count_ascending for sorting
         ascending = count_ascending  # Adjust ascending for the final sort
     elif isinstance(by, list):
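Sorting rows by how often their category occurs, via an ordered Categorical, is the pandas idiom this hunk completes; a standalone sketch of the same idea:

# Standalone: sort rows by category frequency using an ordered Categorical.
import pandas as pd

df = pd.DataFrame({"grp": ["a", "b", "b", "c", "b", "a"]})
order = df["grp"].value_counts().index.tolist()           # ['b', 'a', 'c']
df["grp"] = pd.Categorical(df["grp"], categories=order, ordered=True)
print(df.sort_values("grp"))                              # 'b' rows first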
@@ -6977,7 +7476,7 @@ def df_fillna(
     # Fill completely NaN columns with a default value (e.g., 0)
     data = data.copy()
     data.loc[:, data.isna().all()] = 0
-
+
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
@@ -7034,7 +7533,7 @@ def df_fillna(
         imputed_data = imputer.fit_transform(numeric_data.T)
     else:
         raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
-
+
     imputed_data = pd.DataFrame(
         imputed_data if axis == 0 else imputed_data.T,
         index=numeric_data.index if axis == 0 else numeric_data.columns,
@@ -7179,11 +7678,15 @@ def df_encoder(
 
     encoder = LabelEncoder()
     # Apply LabelEncoder only to non-numeric columns
-    non_numeric_columns = [
+    non_numeric_columns = [
+        col for col in columns if not pd.api.types.is_numeric_dtype(data[col])
+    ]
 
     if not non_numeric_columns:
         return data
-    encoded_data = data[non_numeric_columns].apply(
+    encoded_data = data[non_numeric_columns].apply(
+        lambda col: encoder.fit_transform(col)
+    )
     return pd.concat([data.drop(non_numeric_columns, axis=1), encoded_data], axis=1)
 
     # Target encoding (Mean of the target for each category)
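The label-encoding branch simply maps each category to an integer per column and leaves numeric columns untouched; a minimal standalone equivalent:

# Minimal equivalent of the label-encoding branch above.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({"city": ["Bonn", "Kiel", "Bonn"], "n": [1, 2, 3]})
enc = LabelEncoder()
df["city"] = enc.fit_transform(df["city"])   # Bonn -> 0, Kiel -> 1
print(df, list(enc.classes_))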
@@ -7210,13 +7713,13 @@ def df_scaler(
     scaler=None,
     method="standard",
     columns=None,  # default, select all numeric col/row
-    feature_range=None
+    feature_range=None,  # specific for 'minmax'
     vmin=0,
     vmax=1,
     inplace=False,
     verbose=False,  # show usage
     axis=0,  # defalut column-wise
-    return_scaler:bool=False
+    return_scaler: bool = False,  # True: return both: return df, scaler
     **kwargs,
 ):
     """
@@ -7235,34 +7738,56 @@ def df_scaler(
     if verbose:
         print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
     if scaler is None:
-        methods = ["standard", "minmax", "robust","maxabs"]
+        methods = ["standard", "minmax", "robust", "maxabs"]
         method = strcmp(method, methods)[0]
         if method == "standard":
             from sklearn.preprocessing import StandardScaler
+
             if verbose:
-                print(
-
+                print(
+                    "performs z-score normalization: This will standardize each feature to have a mean of 0 and a standard deviation of 1."
+                )
+                print(
+                    "Use when the data is approximately normally distributed (Gaussian).\nWorks well with algorithms sensitive to feature distribution, such as SVMs, linear regression, logistic regression, and neural networks."
+                )
             scaler = StandardScaler(**kwargs)
         elif method == "minmax":
             from sklearn.preprocessing import MinMaxScaler
+
             if feature_range is None:
-                feature_range=(vmin,vmax)
+                feature_range = (vmin, vmax)
             if verbose:
-                print(
-
-
-
+                print(
+                    "don't forget to define the range: e.g., 'feature_range=(0, 1)'. "
+                )
+                print(
+                    "scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1]."
+                )
+                print(
+                    "Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks."
+                )
+            scaler = MinMaxScaler(feature_range=feature_range, **kwargs)
         elif method == "robust":
             from sklearn.preprocessing import RobustScaler
+
             if verbose:
-                print(
-
+                print(
+                    "scales the data based on the median and interquartile range, which is robust to outliers."
+                )
+                print(
+                    "Use when the dataset contains outliers.\nThis method is useful because it scales based on the median and the interquartile range (IQR), which are more robust to outliers than the mean and standard deviation."
+                )
             scaler = RobustScaler(**kwargs)
-        elif method=="maxabs":
+        elif method == "maxabs":
             from sklearn.preprocessing import MaxAbsScaler
+
             if verbose:
-                print(
-
+                print(
+                    "This scales each feature by its maximum absolute value, resulting in values within the range [-1, 1] for each feature."
+                )
+                print(
+                    "Use for data that is already sparse or when features have positive or negative values that need scaling without shifting the data.\nOften used with sparse data (data with many zeros), where preserving zero entries is essential, such as in text data or recommendation systems."
+                )
             scaler = MaxAbsScaler(**kwargs)
     if axis not in [0, 1]:
         raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
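As a quick reference for the four scalers the verbose mode describes, a standalone comparison on a tiny column with one outlier (scikit-learn only):

# Compare the four scaler options df_scaler wraps.
import numpy as np
from sklearn.preprocessing import (StandardScaler, MinMaxScaler,
                                   RobustScaler, MaxAbsScaler)

X = np.array([[1.0], [2.0], [3.0], [100.0]])  # one outlier
for sc in (StandardScaler(), MinMaxScaler(), RobustScaler(), MaxAbsScaler()):
    print(type(sc).__name__, sc.fit_transform(X).ravel().round(2))
# RobustScaler (median/IQR based) is the least distorted by the outlier.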
@@ -7275,7 +7800,7 @@ def df_scaler(
         non_numeric_columns = data.columns.difference(columns)
 
         # scaled_data = scaler.fit_transform(data[columns])
-        if scaler is None or not hasattr(scaler,
+        if scaler is None or not hasattr(scaler, "mean_"):
             scaled_data = scaler.fit_transform(data[columns])
         else:
             scaled_data = scaler.transform(data[columns])
@@ -7293,7 +7818,7 @@ def df_scaler(
         )
         scaled_df = scaled_df[data.columns]  # Maintain column order
         if return_scaler:
-            return scaled_df,scaler
+            return scaled_df, scaler
         else:
             return scaled_df
 
@@ -7310,7 +7835,11 @@ def df_scaler(
         # scaled_data = scaler.fit_transform(
         #     numeric_rows.T
         # ).T  # Transpose for scaling and then back
-        scaled_data =
+        scaled_data = (
+            scaler.fit_transform(numeric_rows.T).T
+            if scaler is None or not hasattr(scaler, "mean_")
+            else scaler.transform(numeric_rows.T).T
+        )
 
         if inplace:
             data.loc[numeric_rows.index] = scaled_data
@@ -7319,7 +7848,7 @@ def df_scaler(
         scaled_df = data.copy()
         scaled_df.loc[numeric_rows.index] = scaled_data
         if return_scaler:
-            return scaled_df,scaler
+            return scaled_df, scaler
         else:
             return scaled_df
 
@@ -7683,10 +8212,10 @@ def df_reducer(
     hue: str = None,  # lda-specific
     scale: bool = True,
     fill_missing: bool = True,
-    size=2
-    markerscale=4
-    edgecolor=
-    legend_loc=
+    size=2,  # for plot marker size
+    markerscale=4,  # for plot, legend marker size scale
+    edgecolor="none",  # for plot,
+    legend_loc="best",  # for plot,
     bbox_to_anchor=None,
     ncols=1,
     debug: bool = False,
@@ -7719,7 +8248,7 @@ def df_reducer(
         "autoencoder": "Autoencoder:\n\tA neural network-based approach for complex feature learning and non-linear dimensionality reduction. Advantage: Can capture very complex relationships. Limitation: Computationally expensive, requires neural network expertise for effective tuning.",
         "nmf": "Non-negative Matrix Factorization:\n\tEffective for parts-based decomposition, commonly used for sparse and non-negative data, e.g., text data or images. Advantage: Interpretability with non-negativity, efficient with sparse data. Limitation: Less effective for negative or zero-centered data.",
         "umap_hdbscan": "UMAP + HDBSCAN:\n\tCombination of UMAP for dimensionality reduction and HDBSCAN for density-based clustering, suitable for cluster discovery in high-dimensional data. Advantage: Effective in discovering clusters in embeddings. Limitation: Requires careful tuning of both UMAP and HDBSCAN parameters.",
-        "manifold_learning": "Manifold Learning (Isomap, Hessian LLE, etc.):\n\tMethods designed to capture intrinsic geometrical structure. Advantage: Preserves non-linear relationships in low dimensions. Limitation: Computationally expensive and sensitive to noise."
+        "manifold_learning": "Manifold Learning (Isomap, Hessian LLE, etc.):\n\tMethods designed to capture intrinsic geometrical structure. Advantage: Preserves non-linear relationships in low dimensions. Limitation: Computationally expensive and sensitive to noise.",
     }
 
     from sklearn.preprocessing import StandardScaler
@@ -7730,14 +8259,27 @@ def df_reducer(
     import seaborn as sns
     # Check valid method input
     methods = [
-        "pca",
-        "
+        "pca",
+        "umap",
+        "umap_hdbscan",
+        "tsne",
+        "factor",
+        "isolation_forest",
+        "manifold_learning",
+        "lda",
+        "kpca",
+        "ica",
+        "mds",
+        "lle",
+        "svd",
+        "truncated_svd",
+        "spectral_embedding",
        # "autoencoder","nmf",
     ]
    method = strcmp(method, methods)[0]
    if run_once_within(reverse=True):
        print(f"support methods:{methods}")
-
+
    if verbose:
        print(f"\nprocessing with using {dict_methods[method]}:")
    xlabel, ylabel = None, None
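To see what the default path boils down to, a minimal PCA reduction outside py2ls (synthetic data; df_reducer adds scaling options, plotting, and metadata on top of this core):

# Minimal sketch of what df_reducer's default "pca" route reduces to.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(100, 5)), columns=list("abcde"))
X = StandardScaler().fit_transform(df)
pcs = PCA(n_components=2).fit_transform(X)
reduced = pd.DataFrame(pcs, columns=["PC_1", "PC_2"], index=df.index)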
@@ -8050,8 +8592,9 @@ def df_reducer(
             svd_df[hue] = y
         if debug:
             print("Singular Value Decomposition (SVD) completed.")
-    elif method=="truncated_svd":
+    elif method == "truncated_svd":
         from sklearn.decomposition import TruncatedSVD
+
         svd = TruncatedSVD(n_components=n_components, random_state=random_state)
         X_reduced = svd.fit_transform(X)
         reduced_df = pd.DataFrame(
@@ -8070,7 +8613,9 @@ def df_reducer(
     elif method == "spectral_embedding":
         from sklearn.manifold import SpectralEmbedding
 
-        spectral = SpectralEmbedding(
+        spectral = SpectralEmbedding(
+            n_components=n_components, random_state=random_state
+        )
         X_reduced = spectral.fit_transform(X)
         reduced_df = pd.DataFrame(
             X_reduced,
@@ -8168,7 +8713,7 @@ def df_reducer(
             print("Manifold Learning (Isomap) completed.")
         if hue:
             reduced_df[hue] = y
-
+
     #! Return reduced data and info as a new DataFrame with the same index
     if method == "pca":
         reduced_df = pca_df
@@ -8225,7 +8770,8 @@ def df_reducer(
         colname_met = "SVD_"
     # Quick plots
     if plot_ and (not method in ["isolation_forest"]):
-        from .plot import plotxy,figsets,get_color
+        from .plot import plotxy, figsets, get_color
+
         # if ax is None:
         #     if figsize is None:
         #         _, ax = plt.subplots(figsize=cm2inch(8, 8))
@@ -8235,9 +8781,9 @@ def df_reducer(
         #     ax = ax.cla()
         xlabel = f"{colname_met}1" if xlabel is None else xlabel
         ylabel = f"{colname_met}2" if ylabel is None else ylabel
-        palette=get_color(len(flatten(data[hue],verbose=0)))
+        palette = get_color(len(flatten(data[hue], verbose=0)))
 
-        reduced_df=reduced_df.sort_values(by=hue)
+        reduced_df = reduced_df.sort_values(by=hue)
         print(flatten(reduced_df[hue]))
         ax = plotxy(
             data=reduced_df,
@@ -8247,24 +8793,31 @@ def df_reducer(
             palette=palette,
             # size=size,
             edgecolor=edgecolor,
-            kind_=[
-
-
-
+            kind_=[
+                "joint",
+                # "kde",
+                "ell",
+            ],
             kws_kde=dict(
-
-
-
-
-
-
-            kws_joint=dict(kind=
-            kws_ellipse=dict(alpha=0.1,lw=1,label=None),
+                hue=hue,
+                levels=2,
+                common_norm=False,
+                fill=True,
+                alpha=0.05,
+            ),
+            kws_joint=dict(kind="scatter", joint_kws=dict(s=size)),
+            kws_ellipse=dict(alpha=0.1, lw=1, label=None),
             verbose=False,
             **kwargs,
         )
         figsets(
-            legend=dict(
+            legend=dict(
+                loc=legend_loc,
+                markerscale=markerscale,
+                bbox_to_anchor=bbox_to_anchor,
+                ncols=ncols,
+                fontsize=8,
+            ),
             xlabel=xlabel if xlabel else None,
             ylabel=ylabel if ylabel else None,
         )
@@ -8297,6 +8850,7 @@ def df_reducer(
 # example:
 # df_reducer(data=data_log, columns=markers, n_components=2)
 
+
 def get_df_format(data, threshold_unique=0.5, verbose=False):
     """
     Detect the table format: long, wide, or uncertain.
@@ -8396,7 +8950,9 @@ def get_df_format(data, threshold_unique=0.5, verbose=False):
             if cluster_labels.nunique() < len(numeric_cols) * 0.5:
                 wide_score += 2
                 if verbose:
-                    print(
+                    print(
+                        "Clustering on columns shows grouping, suggesting wide format."
+                    )
         except Exception as e:
             print(e) if verbose else None
 
@@ -8487,7 +9043,8 @@ def get_df_format(data, threshold_unique=0.5, verbose=False):
     if verbose:
         print("Final decision: Uncertain format.")
     return "uncertain"
-
+
+
 def plot_cluster(
     data: pd.DataFrame,
     labels: np.ndarray,
@@ -8735,6 +9292,8 @@ def evaluate_cluster(
         metrics["V-Measure"] = np.nan
 
     return metrics
+
+
 def df_qc(
     data: pd.DataFrame,
     columns=None,
@@ -8744,7 +9303,7 @@ def df_qc(
     hue=None,
     output=False,
     verbose=True,
-    dir_save=None
+    dir_save=None,
 ):
     """
     Usage example:
@@ -8752,16 +9311,17 @@ def df_qc(
     """
     from statsmodels.stats.outliers_influence import variance_inflation_factor
     from scipy.stats import skew, kurtosis, entropy
-
+
     pd.options.display.max_seq_items = 10
     #! display(data.select_dtypes(include=[np.number]).describe())
     #!skim
     if columns is not None:
-        if isinstance(columns, (list,pd.core.indexes.base.Index)):
-            data=data[columns]
+        if isinstance(columns, (list, pd.core.indexes.base.Index)):
+            data = data[columns]
     if skim:
         try:
-            import skimpy
+            import skimpy
+
             skimpy.skim(data)
         except:
             numerical_data = data.select_dtypes(include=[np.number])
@@ -8775,13 +9335,19 @@ def df_qc(
 
     # Missing values
     res_qc["missing_values"] = data.isnull().sum()
-    res_qc["missing_percentage"] = round(
+    res_qc["missing_percentage"] = round(
+        (res_qc["missing_values"] / len(data)) * 100, 2
+    )
     res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
 
     # Data types and unique values
     res_qc["data_types"] = data.dtypes
-    res_qc["unique_counts"] =
-
+    res_qc["unique_counts"] = (
+        data.select_dtypes(exclude=np.number).nunique().sort_values()
+    )
+    res_qc["unique_values"] = data.select_dtypes(exclude=np.number).apply(
+        lambda x: x.unique()
+    )
     res_qc["constant_columns"] = [
         col for col in data.columns if data[col].nunique() <= 1
     ]
@@ -8797,8 +9363,8 @@ def df_qc(
     data_outliers = df_outlier(data)
     outlier_num = data_outliers.isna().sum() - data.isnull().sum()
     res_qc["outlier_num"] = outlier_num[outlier_num > 0]
-    outlier_percentage=round((outlier_num / len(data_outliers)) * 100,2)
-    res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
+    outlier_percentage = round((outlier_num / len(data_outliers)) * 100, 2)
+    res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage > 0]
    try:
        # Correlation and multicollinearity (VIF)
        if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
@@ -8816,16 +9382,16 @@ def df_qc(
            numeric_df = data.select_dtypes(include=[np.number]).dropna()
            if isinstance(numeric_df.columns, pd.MultiIndex):
                numeric_df.columns = [
-                    "_".join(col).strip() if isinstance(col, tuple) else col
+                    "_".join(col).strip() if isinstance(col, tuple) else col
+                    for col in numeric_df.columns
                ]
 
-
            vif_data = pd.DataFrame()
-            res_qc["vif"]=vif_data
+            res_qc["vif"] = vif_data
            if numeric_df.shape[1] > 1 and not numeric_df.empty:
                vif_data["feature"] = numeric_df.columns.tolist()
                vif_data["VIF"] = [
-                    round(variance_inflation_factor(numeric_df.values, i),2)
+                    round(variance_inflation_factor(numeric_df.values, i), 2)
                    for i in range(numeric_df.shape[1])
                ]
                res_qc["vif"] = vif_data[
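The VIF loop quantifies multicollinearity per feature; a compact standalone version of the same computation (statsmodels; the collinear pair shows inflated values):

# Standalone VIF computation, as in df_qc's multicollinearity check.
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.default_rng(1)
x1 = rng.normal(size=200)
df = pd.DataFrame({"x1": x1, "x2": x1 * 0.9 + rng.normal(0, 0.1, 200),
                   "x3": rng.normal(size=200)})
vif = [round(variance_inflation_factor(df.values, i), 2)
       for i in range(df.shape[1])]
print(dict(zip(df.columns, vif)))  # x1 and x2 stand out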
@@ -8847,8 +9413,8 @@ def df_qc(
    }
 
    # dtypes counts
-    res_qc[
-
+    res_qc["dtype_counts"] = data.dtypes.value_counts()
+
    # Distribution Analysis (mean, median, mode, std dev, IQR for numeric columns)
    distribution_stats = data.select_dtypes(include=[np.number]).describe().T
    iqr = data.select_dtypes(include=[np.number]).apply(
@@ -8880,7 +9446,6 @@ def df_qc(
        if len(unique_types) > 1:
            inconsistent_types[col] = unique_types
    res_qc["inconsistent_types"] = inconsistent_types
-
 
    # Text length analysis for text fields
    text_lengths = {}
@@ -8892,7 +9457,9 @@ def df_qc(
    res_qc["text_length_analysis"] = text_lengths
 
    # Summary statistics
-    res_qc["summary_statistics"] = data.describe().T.style.background_gradient(
+    res_qc["summary_statistics"] = data.describe().T.style.background_gradient(
+        cmap="coolwarm", axis=0
+    )
 
    # Automated warnings
    warnings = []
@@ -8920,39 +9487,60 @@ def df_qc(
        display(res_qc["data_types"])
    if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
        print(" ⤵ Missing Values Counts:")
-        display(
-
-
-
-
-
-
-
+        display(
+            pd.DataFrame(
+                {
+                    "missing_values": res_qc["missing_values"][
+                        res_qc["missing_values"] > 0
+                    ],
+                    "missing_percent(%)": res_qc["missing_percentage"][
+                        res_qc["missing_percentage"] > 0
+                    ],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+        )
    # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
-    print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
+    print("\n⤵ Rows with Missing Values:", res_qc["rows_with_missing"])
+
+    (
+        print("\n⤵ Constant Columns:", res_qc["constant_columns"])
+        if any(res_qc["constant_columns"])
+        else None
+    )
+    (
+        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"])
+        if res_qc["duplicate_rows"]
+        else None
+    )
+    (
+        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"])
+        if any(res_qc["duplicate_columns"])
+        else None
+    )
 
-    print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
-    print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
-    print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
-
    if any(res_qc["outlier_num"]):
        print("\n⤵ Outlier Report:")
-        display(
-
-
-
-
-
-
-
+        display(
+            pd.DataFrame(
+                {
+                    "outlier_num": res_qc["outlier_num"][res_qc["outlier_num"] > 0],
+                    "outlier_percentage(%)": res_qc["outlier_percentage"][
+                        res_qc["outlier_percentage"] > 0
+                    ],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+        )
 
    if any(res_qc["unique_counts"]):
        print("\n⤵ Unique Values per Column:")
-        display(
-
-
+        display(
+            pd.DataFrame(
+                {
+                    "unique_counts": res_qc["unique_counts"],
+                    "unique_values": res_qc["unique_values"],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+        )
 
    if res_qc["empty_columns"]:
        print("\n⤵ Empty Columns:", res_qc["empty_columns"])
@@ -8971,7 +9559,7 @@ def df_qc(
    print(res_qc["high_cardinality_categoricals"])
    if any(res_qc["inconsistent_types"]):
        print("\n⤵ Inconsistent Data Types:")
-        display(res_qc["inconsistent_types"])
+        display(res_qc["inconsistent_types"])
    if any(res_qc["text_length_analysis"]):
        print("\n⤵ Text Length Analysis:")
        for col, stats in res_qc["text_length_analysis"].items():
|
8986
9574
|
|
8987
9575
|
pd.reset_option("display.max_seq_items")
|
8988
9576
|
if plot_:
|
8989
|
-
df_qc_plots(
|
9577
|
+
df_qc_plots(
|
9578
|
+
data=data, res_qc=res_qc, max_cols=max_cols, hue=hue, dir_save=dir_save
|
9579
|
+
)
|
8990
9580
|
if output or not plot_:
|
8991
9581
|
return res_qc
|
8992
9582
|
return None
|
8993
9583
|
|
8994
9584
|
|
8995
|
-
def df_qc_plots(
|
9585
|
+
def df_qc_plots(
|
9586
|
+
data: pd.DataFrame,
|
9587
|
+
columns=None,
|
9588
|
+
res_qc: dict = None,
|
9589
|
+
max_cols=20,
|
9590
|
+
hue=None,
|
9591
|
+
dir_save=None,
|
9592
|
+
):
|
8996
9593
|
import matplotlib.pyplot as plt
|
8997
9594
|
import seaborn as sns
|
8998
9595
|
from .plot import subplot, figsets, get_color
|
8999
9596
|
from datetime import datetime
|
9597
|
+
|
9000
9598
|
now_ = datetime.now().strftime("%y%m%d_%H%M%S")
|
9001
|
-
|
9599
|
+
|
9002
9600
|
if columns is not None:
|
9003
|
-
if isinstance(columns, (list,pd.core.indexes.base.Index)):
|
9004
|
-
data=data[columns]
|
9601
|
+
if isinstance(columns, (list, pd.core.indexes.base.Index)):
|
9602
|
+
data = data[columns]
|
9005
9603
|
len_total = len(res_qc)
|
9006
9604
|
n_row, n_col = int((len_total + 10)), 3
|
9007
|
-
nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
|
9605
|
+
nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row], verbose=False)
|
9008
9606
|
|
9009
9607
|
missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
|
9010
9608
|
ascending=False
|
9011
9609
|
)
|
9012
9610
|
if len(missing_data) > max_cols:
|
9013
9611
|
missing_data = missing_data[:max_cols]
|
9014
|
-
ax_missing_data=sns.barplot(
|
9612
|
+
ax_missing_data = sns.barplot(
|
9015
9613
|
y=missing_data.index,
|
9016
9614
|
x=missing_data.values,
|
9017
9615
|
hue=missing_data.index,
|
9018
9616
|
palette=get_color(len(missing_data), cmap="coolwarm")[::-1],
|
9019
9617
|
ax=nexttile(),
|
9020
9618
|
)
|
9021
|
-
figsets(
|
9619
|
+
figsets(
|
9620
|
+
title="Missing (#)",
|
9621
|
+
xlabel="#",
|
9622
|
+
ax=ax_missing_data,
|
9623
|
+
ylabel=None,
|
9624
|
+
fontsize=8 if len(missing_data) <= 20 else 6,
|
9625
|
+
)
|
9022
9626
|
|
9023
9627
|
outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
|
9024
9628
|
if len(outlier_num) > max_cols:
|
9025
9629
|
outlier_num = outlier_num[:max_cols]
|
9026
|
-
ax_outlier_num=sns.barplot(
|
9630
|
+
ax_outlier_num = sns.barplot(
|
9027
9631
|
y=outlier_num.index,
|
9028
9632
|
x=outlier_num.values,
|
9029
|
-
hue=outlier_num.index,
|
9633
|
+
hue=outlier_num.index,
|
9030
9634
|
palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
|
9031
9635
|
ax=nexttile(),
|
9032
9636
|
)
|
9033
|
-
figsets(
|
9034
|
-
|
9637
|
+
figsets(
|
9638
|
+
ax=ax_outlier_num,
|
9639
|
+
title="Outliers (#)",
|
9640
|
+
xlabel="#",
|
9641
|
+
ylabel=None,
|
9642
|
+
fontsize=8 if len(outlier_num) <= 20 else 6,
|
9643
|
+
)
|
9644
|
+
|
9035
9645
|
#!
|
9036
9646
|
try:
|
9037
|
-
for col in data.select_dtypes(include=
|
9038
|
-
sns.countplot(
|
9039
|
-
|
9040
|
-
|
9647
|
+
for col in data.select_dtypes(include="category").columns:
|
9648
|
+
sns.countplot(
|
9649
|
+
y=data[col],
|
9650
|
+
palette=get_color(
|
9651
|
+
data.select_dtypes(include="category").shape[1], cmap="coolwarm"
|
9652
|
+
)[::-1],
|
9653
|
+
ax=nexttile(),
|
9654
|
+
)
|
9041
9655
|
figsets(title=f"Count Plot: {col}", xlabel="Count", ylabel=col)
|
9042
9656
|
except Exception as e:
|
9043
|
-
pass
|
9657
|
+
pass
|
9044
9658
|
|
9045
9659
|
# Skewness and Kurtosis Plots
|
9046
9660
|
skewness = res_qc["skewness"].sort_values(ascending=False)
|
9047
9661
|
kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
|
9048
9662
|
if not skewness.empty:
|
9049
|
-
ax_skewness=sns.barplot(
|
9663
|
+
ax_skewness = sns.barplot(
|
9050
9664
|
y=skewness.index,
|
9051
9665
|
x=skewness.values,
|
9052
9666
|
hue=skewness.index,
|
@@ -9055,11 +9669,13 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         )
         figsets(
             title="Highly Skewed Numeric Columns (Skewness > 1)",
-            xlabel="Skewness",
-
+            xlabel="Skewness",
+            ylabel=None,
+            ax=ax_skewness,
+            fontsize=8 if len(skewness) <= 20 else 6,
         )
     if not kurtosis.empty:
-        ax_kurtosis=sns.barplot(
+        ax_kurtosis = sns.barplot(
             y=kurtosis.index,
             x=kurtosis.values,
             hue=kurtosis.index,
@@ -9068,59 +9684,68 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         )
         figsets(
             title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
-            xlabel="Kurtosis",
-
+            xlabel="Kurtosis",
+            ylabel=None,
+            ax=ax_kurtosis,
+            fontsize=8 if len(kurtosis) <= 20 else 6,
         )
 
     # Entropy for Categorical Variables
     entropy_data = pd.Series(res_qc["entropy_categoricals"]).sort_values(
         ascending=False
     )
-    ax_entropy_data=sns.barplot(
-        y=entropy_data.index,
+    ax_entropy_data = sns.barplot(
+        y=entropy_data.index,
+        x=entropy_data.values,
+        hue=entropy_data.index,
         palette=get_color(len(entropy_data), cmap="coolwarm")[::-1],
-        ax=nexttile()
-    )
+        ax=nexttile(),
+    )
     figsets(
-
-
-
-
-
-
+        ylabel="Categorical Columns",
+        title="Entropy of Categorical Variables",
+        xlabel="Entropy (bits)",
+        ax=ax_entropy_data,
+        fontsize=8 if len(entropy_data) <= 20 else 6,
+    )
 
     # unique counts
-    unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
-    ax_unique_counts_=sns.barplot(
-
-
-
-
+    unique_counts = res_qc["unique_counts"].sort_values(ascending=False)
+    ax_unique_counts_ = sns.barplot(
+        y=unique_counts.index,
+        x=unique_counts.values,
+        hue=unique_counts.index,
+        palette=get_color(len(unique_counts), cmap="coolwarm")[::-1],
+        ax=nexttile(),
+    )
     figsets(
-
-
-
-
-
+        title="Unique Counts",
+        ylabel=None,
+        xlabel="#",
+        ax=ax_unique_counts_,
+        fontsize=8 if len(unique_counts) <= 20 else 6,
+    )
     # Binary Checking
-    ax_unique_counts=sns.barplot(
-
-
-
+    ax_unique_counts = sns.barplot(
+        y=unique_counts[unique_counts < 8].index,
+        x=unique_counts[unique_counts < 8].values,
+        hue=unique_counts[unique_counts < 8].index,
+        palette=get_color(len(unique_counts[unique_counts < 8].index), cmap="coolwarm")[
+            ::-1
+        ],
+        ax=nexttile(),
+    )
     plt.axvline(x=2, color="r", linestyle="--", lw=2)
     figsets(
-
-
-
-
-
+        ylabel=None,
+        title="Binary Checking",
+        xlabel="#",
+        ax=ax_unique_counts,
+        fontsize=8 if len(unique_counts[unique_counts < 10].index) <= 20 else 6,
+    )
 
     # dtypes counts
-    dtype_counts = res_qc[
+    dtype_counts = res_qc["dtype_counts"]
     txt = []
     for tp in dtype_counts.index:
         txt.append(list(data.select_dtypes(include=tp).columns))
@@ -9131,9 +9756,9 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         color="#F3C8B2",
         ax=nexttile(),
     )
-    max_columns_per_row = 1
+    max_columns_per_row = 1  # Maximum number of columns per row
     for i, tp in enumerate(dtype_counts.index):
-        if i<=20:
+        if i <= 20:
             column_names = txt[i]
             # Split the column names into multiple lines if too long
             column_name_str = ", ".join(column_names)
@@ -9152,7 +9777,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
                 ha="center",
                 va="top",
                 c="k",
-                fontsize=8
+                fontsize=8 if len(dtype_counts.index) <= 20 else 6,
                 rotation=0,
             )
     figsets(
@@ -9160,7 +9785,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         title="Dtypes",
         ylabel="#",
         ax=ax_dtype_counts,
-        fontsize=8 if len(dtype_counts.index)<=20 else 6,
+        fontsize=8 if len(dtype_counts.index) <= 20 else 6,
     )
     # from .plot import pie
     # pie()
@@ -9175,57 +9800,66 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
     )

     if high_cardinality:
-        ax_high_cardinality=sns.barplot(
+        ax_high_cardinality = sns.barplot(
             y=list(high_cardinality.keys()),
             x=list(high_cardinality.values()),
             hue=list(high_cardinality.keys()),
-            palette=get_color(len(list(high_cardinality.keys())), cmap="coolwarm")[
+            palette=get_color(len(list(high_cardinality.keys())), cmap="coolwarm")[
+                ::-1
+            ],
             ax=nexttile(),
         )
         figsets(
             title="High Cardinality Categorical Columns",
             xlabel="Unique Value Count",
             ax=ax_high_cardinality,
-            fontsize=8 if len(list(high_cardinality.keys()))<=20 else 6
+            fontsize=8 if len(list(high_cardinality.keys())) <= 20 else 6,
         )
     if res_qc["low_variance_features"]:
         low_variance_data = data[res_qc["low_variance_features"]].copy()
         for col in low_variance_data.columns:
-            ax_low_variance_features=sns.histplot(
+            ax_low_variance_features = sns.histplot(
                 low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
             )
-            figsets(
-
+            figsets(
+                title=f"Low Variance Feature: {col}",
+                ax=ax_low_variance_features,
+                fontsize=8 if len(low_variance_data[col]) <= 20 else 6,
+            )

     # VIF plot for multicollinearity detection
     if "vif" in res_qc and not res_qc["vif"].empty:
         vif_data = res_qc["vif"].sort_values(by="VIF", ascending=False)
         if len(vif_data) > max_cols:
             vif_data = vif_data[:max_cols]
-        ax_vif=sns.barplot(
-
-
-
-
-
+        ax_vif = sns.barplot(
+            data=vif_data,
+            x="VIF",
+            y="feature",
+            hue="VIF",
+            palette=get_color(len(vif_data), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
         figsets(
             title="Variance Inflation Factor(VIF)",
             xlabel="VIF",
             ylabel="Features",
             legend=None,
             ax=ax_vif,
-            fontsize=8 if len(vif_data)<=20 else 6
+            fontsize=8 if len(vif_data) <= 20 else 6,
         )

     # Correlation heatmap for numeric columns with high correlation pairs
     if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
         corr = data.select_dtypes(include=[np.number]).corr()
-        if corr.shape[1]<=33:
+        if corr.shape[1] <= 33:
             mask = np.triu(np.ones_like(corr, dtype=bool))
             num_columns = corr.shape[1]
-            fontsize = max(
+            fontsize = max(
+                6, min(12, 12 - (num_columns - 10) * 0.2)
+            )  # Scale between 6 and 12

-            ax_heatmap=sns.heatmap(
+            ax_heatmap = sns.heatmap(
                 corr,
                 mask=mask,
                 annot=True,
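The VIF table consumed here (`res_qc["vif"]`) can be produced with the standard statsmodels route; a minimal sketch under that assumption (py2ls may compute it differently internally, and `X` is an illustrative frame):

    import numpy as np
    import pandas as pd
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    X = pd.DataFrame(np.random.rand(100, 3), columns=["f1", "f2", "f3"])
    vif = pd.DataFrame(
        {
            "feature": X.columns,
            "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
        }
    )
    print(vif.sort_values(by="VIF", ascending=False))  # same shape the barplot expects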
@@ -9233,24 +9867,21 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
                 center=0,
                 fmt=".1f",
                 linewidths=0.5,
-                vmin=-1,
+                vmin=-1,
+                vmax=1,
                 ax=nexttile(2, 2),
-                cbar_kws=dict(shrink=0.2,ticks=np.arange(-1, 2, 1)),
-                annot_kws={"size": fontsize}
-            )
-
-            figsets(
-                xangle=45,
-                title="Correlation Heatmap",
-                ax=ax_heatmap
+                cbar_kws=dict(shrink=0.2, ticks=np.arange(-1, 2, 1)),
+                annot_kws={"size": fontsize},
             )
+
+            figsets(xangle=45, title="Correlation Heatmap", ax=ax_heatmap)
     # # save figure
     # if dir_save:
     #     figsave(dir_save,f"qc_plot_{now_}.pdf")

     if columns is not None:
-        if isinstance(columns, (list,pd.core.indexes.base.Index)):
-            data=data[columns]
+        if isinstance(columns, (list, pd.core.indexes.base.Index)):
+            data = data[columns]

     # len_total = len(res_qc)
     # n_row, n_col = int((len_total + 10) / 3), 3
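The masked-triangle heatmap pattern used above works the same in plain seaborn; a self-contained sketch with an illustrative frame:

    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt

    df = pd.DataFrame(np.random.rand(50, 4), columns=list("abcd"))
    corr = df.corr()
    mask = np.triu(np.ones_like(corr, dtype=bool))  # hide the redundant upper triangle
    sns.heatmap(corr, mask=mask, annot=True, center=0, vmin=-1, vmax=1, fmt=".1f")
    plt.show()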
@@ -9258,30 +9889,36 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
     #! check distribution
     data_num = data.select_dtypes(include=np.number)
     if len(data_num) > max_cols:
-        data_num = data_num.iloc[
+        data_num = data_num.iloc[:, :max_cols]
+
+    data_num = df_scaler(data=data_num, method="standard")

-    data_num = df_scaler(data=data_num, method='standard')
-
     import scipy.stats as stats
+
     for column in data_num.columns:
-
+        # * Shapiro-Wilk test for normality
         stat, p_value = stats.shapiro(data_num[column])
-        normality = "norm" if p_value > 0.05 else "not_norm"
-
-        ax_hist=sns.histplot(data_num[column], kde=True, ax=nexttile())
+        normality = "norm" if p_value > 0.05 else "not_norm"
+        # * Plot histogram
+        ax_hist = sns.histplot(data_num[column], kde=True, ax=nexttile())
         x_min, x_max = ax_hist.get_xlim()
         y_min, y_max = ax_hist.get_ylim()
-        ax_hist.text(
-
-
-
-
-
+        ax_hist.text(
+            x_min + (x_max - x_min) * 0.5,
+            y_min + (y_max - y_min) * 0.75,
+            f"p(Shapiro-Wilk)={p_value:.3f}\n{normality}",
+            ha="center",
+            va="top",
+        )
+        figsets(title=column, ax=ax_hist)
+        ax_twin = ax_hist.twinx()
+        # * Q-Q plot
         stats.probplot(data_num[column], dist="norm", plot=ax_twin)
-        figsets(ylabel=f
+        figsets(ylabel=f"Q-Q Plot:{column}", title=None)
     # save figure
     if dir_save:
-        figsave(dir_save,f"qc_plot_{now_}.pdf")
+        figsave(dir_save, f"qc_plot_{now_}.pdf")
+

 def df_corr(df: pd.DataFrame, method="pearson"):
     """
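The per-column normality check added here is the standard scipy pattern; a self-contained sketch on synthetic data:

    import numpy as np
    import scipy.stats as stats
    import matplotlib.pyplot as plt

    x = np.random.normal(size=200)
    stat, p_value = stats.shapiro(x)  # H0: the sample is normally distributed
    print("norm" if p_value > 0.05 else "not_norm", f"p={p_value:.3f}")

    fig, ax = plt.subplots()
    stats.probplot(x, dist="norm", plot=ax)  # Q-Q plot against the normal
    plt.show()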
@@ -9318,6 +9955,7 @@ def df_corr(df: pd.DataFrame, method="pearson"):

     return corr_matrix, pval_matrix

+
 def use_pd(
     func_name="excel",
     verbose=True,
@@ -9338,7 +9976,8 @@ def use_pd(
         if verbose:
             print(e)

-
+
+def get_phone(phone_number: str, region: str = None, verbose=True):
     """
     usage:
         info = get_phone(15237654321, "DE")
@@ -9426,21 +10065,23 @@ def get_phone(phone_number: str, region: str = None,verbose=True):
        dialing_instructions = f"Dial {formatted_national} within {country_name}. Dial {formatted_e164} from abroad."

        # Advanced Timezone Handling
-        gmt_offsets =
+        gmt_offsets = (
+            pytz.timezone(time_zones).utcoffset(datetime.now()).total_seconds() / 3600
+        )
        # Get the local timezone (current computer's time)
        local_timezone = get_localzone()
-        #local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
+        # local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
        local_offset = local_timezone.utcoffset(datetime.now()).total_seconds() / 3600
        offset_diff = local_offset - gmt_offsets
        head_time = "earlier" if offset_diff < 0 else "later" if offset_diff > 0 else ""
-        res= {
+        res = {
            "valid": True,
            "possible": possible,
            "formatted": {
                "international": formatted_international,
                "national": formatted_national,
                "e164": formatted_e164,
-            },
+            },
            "country_code": country_code,
            "country_name": country_name,
            "region_code": region_code,
@@ -9448,13 +10089,13 @@ def get_phone(phone_number: str, region: str = None,verbose=True):
            "carrier": carrier_name,
            "time_zone": time_zones,
            "current_times": current_times,
-            "local_offset":f"{local_offset} utcoffset",
+            "local_offset": f"{local_offset} utcoffset",
            "time_zone_diff": f"{head_time} {int(np.abs(offset_diff))} h",
            "number_type": number_type_str,
            "is_toll_free": is_toll_free,
-            "is_premium_rate": is_premium_rate,
+            "is_premium_rate": is_premium_rate,
            "dialing_instructions": dialing_instructions,
-            "suggested_fix": None,  # Use phonenumbers.example_number if invalid
+            "suggested_fix": None,  # Use phonenumbers.example_number if invalid
            "logs": {
                "number_analysis_completed": datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S"
@@ -9465,7 +10106,7 @@ def get_phone(phone_number: str, region: str = None,verbose=True):
        }

    except phonenumbers.NumberParseException as e:
-        res= {"valid": False, "error": str(e)}
+        res = {"valid": False, "error": str(e)}
    if verbose:
        preview(res)
    return res
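For orientation, the parsing core that get_phone wraps is the phonenumbers package; a minimal sketch (the number itself is illustrative):

    import phonenumbers
    from phonenumbers import carrier, geocoder, timezone

    num = phonenumbers.parse("15237654321", "DE")  # national number plus region hint
    print(phonenumbers.is_valid_number(num))
    print(phonenumbers.format_number(num, phonenumbers.PhoneNumberFormat.E164))
    print(carrier.name_for_number(num, "en"))
    print(geocoder.description_for_number(num, "en"))
    print(timezone.time_zones_for_number(num))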
@@ -9531,7 +10172,8 @@ def decode_pluscode(

    return latitude, longitude

-
+
+def get_loc(input_data, user_agent="0413@mygmail.com)", verbose=True):
    """
    Determine if the input is a city name, lat/lon, or DMS and perform geocoding or reverse geocoding.
    Usage:
@@ -9562,13 +10204,17 @@ def get_loc(input_data, user_agent="0413@mygmail.com)",verbose=True):
    # Case 1: Input is a city name (string)
    if isinstance(input_data, str) and not re.match(r"^\d+(\.\d+)?$", input_data):
        location = geolocator.geocode(input_data)
-
-
-
-
-
-
-
+        try:
+            if verbose:
+                print(
+                    f"Latitude and Longitude for {input_data}: {location.latitude}, {location.longitude}"
+                )
+            else:
+                print(f"Could not find {input_data}.")
+            return location
+        except Exception as e:
+            print(f'Error: {e}')
+            return

    # Case 2: Input is latitude and longitude (float or tuple)
    elif isinstance(input_data, (float, tuple)):
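The geocoding calls above come from geopy's Nominatim client; a minimal sketch (the user_agent string is illustrative):

    from geopy.geocoders import Nominatim

    geolocator = Nominatim(user_agent="my_app_example")
    location = geolocator.geocode("Berlin")  # forward geocoding
    if location is not None:
        print(location.latitude, location.longitude)
        print(geolocator.reverse((location.latitude, location.longitude)))  # reverse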
@@ -9607,7 +10253,8 @@ def get_loc(input_data, user_agent="0413@mygmail.com)",verbose=True):
            "Invalid input format. Please provide a city name, latitude/longitude, or DMS string."
        )

-
+
+def enpass(code: str, method: str = "AES", key: str = None):
    """
    usage: enpass("admin")
    Master encryption function that supports multiple methods: AES, RSA, and SHA256.
@@ -9617,6 +10264,7 @@ def enpass(code: str, method: str="AES", key: str = None):
    :return: The encrypted data or hashed value.
    """
    import hashlib
+
    # AES Encryption (Advanced)
    def aes_encrypt(data: str, key: str):
        """
@@ -9630,9 +10278,10 @@ def enpass(code: str, method: str="AES", key: str = None):
        from cryptography.hazmat.primitives import padding
        import base64
        import os
+
        # Generate a 256-bit key from the provided password
        key = hashlib.sha256(key.encode()).digest()
-
+
        # Generate a random initialization vector (IV)
        iv = os.urandom(16)  # 16 bytes for AES block size

@@ -9659,10 +10308,12 @@ def enpass(code: str, method: str="AES", key: str = None):
        import base64
        from Crypto.PublicKey import RSA
        from Crypto.Cipher import PKCS1_OAEP
+
        public_key_obj = RSA.import_key(public_key)
        cipher_rsa = PKCS1_OAEP.new(public_key_obj)
        encrypted_data = cipher_rsa.encrypt(data.encode())
        return base64.b64encode(encrypted_data).decode()
+
    # SHA256 Hashing (Non-reversible)
    def sha256_hash(data: str):
        """
@@ -9671,9 +10322,10 @@ def enpass(code: str, method: str="AES", key: str = None):
        :return: The hashed value (hex string).
        """
        return hashlib.sha256(data.encode()).hexdigest()
+
    if key is None:
-        key="worldpeace"
-    method=strcmp(method,["AES","RSA",
+        key = "worldpeace"
+    method = strcmp(method, ["AES", "RSA", "SHA256"])[0]
    if method == "AES":
        return aes_encrypt(code, key)
    elif method == "RSA":
@@ -9685,7 +10337,7 @@ def enpass(code: str, method: str="AES", key: str = None):


# Master Decryption Function (Supports AES, RSA)
-def depass(encrypted_code: str, method: str=
+def depass(encrypted_code: str, method: str = "AES", key: str = None):
    """
    Master decryption function that supports multiple methods: AES and RSA.
    :param encrypted_code: The encrypted data to decrypt.
@@ -9694,6 +10346,7 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
    :return: The decrypted data.
    """
    import hashlib
+
    def aes_decrypt(encrypted_data: str, key: str):
        """
        Decrypts data encrypted using AES in CBC mode.
@@ -9705,12 +10358,13 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
        from cryptography.hazmat.backends import default_backend
        from cryptography.hazmat.primitives import padding
        import base64
+
        # Generate the same 256-bit key from the password
        key = hashlib.sha256(key.encode()).digest()
-
+
        # Decode the encrypted data from base64
        encrypted_data = base64.b64decode(encrypted_data)
-
+
        # Extract the IV and the actual encrypted data
        iv = encrypted_data[:16]  # First 16 bytes are the IV
        encrypted_data = encrypted_data[16:]  # Remaining data is the encrypted message
@@ -9724,7 +10378,8 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
        unpadder = padding.PKCS7(128).unpadder()
        unpadded_data = unpadder.update(decrypted_data) + unpadder.finalize()

-        return unpadded_data.decode()
+        return unpadded_data.decode()
+
    def rsa_decrypt(encrypted_data: str, private_key: str):
        """
        Decrypts RSA-encrypted data using the private key.
@@ -9735,6 +10390,7 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
        from Crypto.PublicKey import RSA
        from Crypto.Cipher import PKCS1_OAEP
        import base64
+
        encrypted_data = base64.b64decode(encrypted_data)
        private_key_obj = RSA.import_key(private_key)
        cipher_rsa = PKCS1_OAEP.new(private_key_obj)
@@ -9742,8 +10398,8 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
        return decrypted_data.decode()

    if key is None:
-        key="worldpeace"
-    method=strcmp(method,["AES","RSA",
+        key = "worldpeace"
+    method = strcmp(method, ["AES", "RSA", "SHA256"])[0]
    if method == "AES":
        return aes_decrypt(encrypted_code, key)
    elif method == "RSA":
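A round-trip usage sketch for the pair, using only what the source above defines (the default key is the one hard-coded in the source):

    token = enpass("admin")  # AES-encrypt with the built-in default key
    plain = depass(token)    # decrypt back with the same default key
    assert plain == "admin"

    digest = enpass("admin", method="SHA256")  # one-way hash; cannot be depass()-ed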
@@ -9752,3 +10408,311 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
        raise ValueError("SHA256 is a hash function and cannot be decrypted.")
    else:
        raise ValueError("Unsupported decryption method")
+
+def get_clip(dir_save=None):
+    """
+    Master function to extract content from the clipboard (text, URL, or image).
+
+    Parameters:
+        dir_save (str, optional): If an image is found, save it to this path.
+
+    Returns:
+        dict: A dictionary with extracted content:
+            {
+                "type": "text" | "url" | "image" | "none",
+                "content": <str|Image|None>,
+                "saved_to": <str|None>  # Path if an image is saved
+            }
+    """
+    result = {"type": "none", "content": None, "saved_to": None}
+
+    try:
+        import pyperclip
+        from PIL import ImageGrab, Image
+        import validators
+        # 1. Check for text in the clipboard
+        clipboard_content = pyperclip.paste()
+        if clipboard_content:
+            if validators.url(clipboard_content.strip()):
+                result["type"] = "url"
+                result["content"] = clipboard_content.strip()
+
+            else:
+                result["type"] = "text"
+                result["content"] = clipboard_content.strip()
+            return clipboard_content.strip()
+
+        # 2. Check for image in the clipboard
+        image = ImageGrab.grabclipboard()
+        if isinstance(image, Image.Image):
+            result["type"] = "image"
+            result["content"] = image
+            if dir_save:
+                image.save(dir_save)
+                result["saved_to"] = dir_save
+                print(f"Image saved to {dir_save}.")
+            else:
+                print("Image detected in clipboard but not saved.")
+            return image
+        print("No valid text, URL, or image found in clipboard.")
+        return result
+
+    except Exception as e:
+        print(f"An error occurred: {e}")
+        return result
+
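A quick usage sketch (the save path is illustrative); note that text/URLs come back as plain strings and clipboard images as PIL objects, with the dict only returned in the empty/error cases:

    content = get_clip()                   # text or URL currently on the clipboard
    img = get_clip(dir_save="./clip.png")  # save a clipboard image if one is present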
+def keyboard(*args, action="press", n_click=1, interval=0, verbose=False, **kwargs):
+    """
+    Simulates keyboard input using pyautogui.
+
+    Parameters:
+        input_key (str): The key to simulate. Check the list of supported keys with verbose=True.
+        action (str): The action to perform. Options are 'press', 'keyDown', or 'keyUp'.
+        n_click (int): Number of times to press the key (only for 'press' action).
+        interval (float): Time interval between key presses for 'press' action.
+        verbose (bool): Print detailed output, including supported keys and debug info.
+        kwargs: Additional arguments (reserved for future extensions).
+
+    keyboard("command", "d", action="shortcut")
+    """
+    import pyautogui
+
+    input_key = args
+
+    actions = ["press", "keyDown", "keyUp", "hold", "release", "hotkey", "shortcut"]
+    action = strcmp(action, actions)[0]
+    keyboard_keys_ = [
+        "\t", "\n", "\r", " ", "!", '"', "#", "$", "%", "&", "'", "(",
+        ")", "*", "+", ",", "-", ".", "/", "0", "1", "2", "3", "4", "5", "6", "7",
+        "8", "9", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`",
+        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o",
+        "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "{", "|", "}", "~",
+        "accept", "add", "alt", "altleft", "altright", "apps", "backspace",
+        "browserback", "browserfavorites", "browserforward", "browserhome",
+        "browserrefresh", "browsersearch", "browserstop", "capslock", "clear",
+        "convert", "ctrl", "ctrlleft", "ctrlright", "decimal", "del", "delete",
+        "divide", "down", "end", "enter", "esc", "escape", "execute", "f1", "f10",
+        "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18", "f19", "f2", "f20",
+        "f21", "f22", "f23", "f24", "f3", "f4", "f5", "f6", "f7", "f8", "f9",
+        "final", "fn", "hanguel", "hangul", "hanja", "help", "home", "insert", "junja",
+        "kana", "kanji", "launchapp1", "launchapp2", "launchmail",
+        "launchmediaselect", "left", "modechange", "multiply", "nexttrack",
+        "nonconvert", "num0", "num1", "num2", "num3", "num4", "num5", "num6",
+        "num7", "num8", "num9", "numlock", "pagedown", "pageup", "pause", "pgdn",
+        "pgup", "playpause", "prevtrack", "print", "printscreen", "prntscrn",
+        "prtsc", "prtscr", "return", "right", "scrolllock", "select", "separator",
+        "shift", "shiftleft", "shiftright", "sleep", "space", "stop", "subtract", "tab",
+        "up", "volumedown", "volumemute", "volumeup", "win", "winleft", "winright", "yen",
+        "command", "option", "optionleft", "optionright",
+    ]
+    if verbose:
+        print(f"supported keys: {keyboard_keys_}")
+
+    if action not in ["hotkey", "shortcut"]:
+        if not isinstance(input_key, list):
+            input_key = list(input_key)
+        input_key = [strcmp(i, keyboard_keys_)[0] for i in input_key]
+
+    # correct action: any modifier key implies a hotkey combination
+    cmd_keys = [
+        "command", "option", "optionleft", "optionright",
+        "win", "winleft", "winright", "ctrl", "ctrlleft", "ctrlright",
+    ]
+    try:
+        if any([i in cmd_keys for i in input_key]):
+            action = "hotkey"
+    except Exception:
+        pass
+
+    print(f"\n{action}: {input_key}")
+    # keyboard
+    if action in ["press"]:
+        # pyautogui.press(input_key, presses=n_click, interval=interval)
+        for _ in range(n_click):
+            for key in input_key:
+                pyautogui.press(key)
+                pyautogui.sleep(interval)
+    elif action in ["keyDown", "hold"]:
+        # pyautogui.keyDown(input_key)
+        for _ in range(n_click):
+            for key in input_key:
+                pyautogui.keyDown(key)
+                pyautogui.sleep(interval)
+
+    elif action in ["keyUp", "release"]:
+        # pyautogui.keyUp(input_key)
+        for _ in range(n_click):
+            for key in input_key:
+                pyautogui.keyUp(key)
+                pyautogui.sleep(interval)
+
+    elif action in ["hotkey", "shortcut"]:
+        pyautogui.hotkey(*input_key)  # unpack so each key is passed as a separate argument
+
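Usage sketches for the new helper; key and action names are resolved fuzzily through strcmp, so close spellings also work:

    keyboard("enter")                             # single key press
    keyboard("tab", n_click=3, interval=0.2)      # repeated presses with a pause
    keyboard("command", "d", action="shortcut")   # modifier combo, routed to hotkey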
+def mouse(
+    *args,  # loc
+    action: str = "move",
+    duration: float = 0.5,
+    loc_type: str = "absolute",  # 'absolute', 'relative'
+    region: tuple = None,  # (tuple, optional): A region (x, y, width, height) to search for the image.
+    image_path: str = None,
+    wait: float = 0,
+    text: str = None,
+    confidence: float = 0.8,
+    button: str = "left",
+    n_click: int = 1,  # number of clicks
+    interval: float = 0.25,  # time between clicks
+    scroll_amount: int = -500,
+    fail_safe: bool = True,
+    grayscale: bool = False,
+    **kwargs,
+):
+    """
+    Master function to handle pyautogui actions.
+
+    Parameters:
+        action (str): The action to perform ('click', 'double_click', 'type', 'drag', 'scroll', 'move', 'locate', etc.).
+        image_path (str, optional): Path to the image for 'locate' or 'click' actions.
+        text (str, optional): Text to type for 'type' action.
+        confidence (float, optional): Confidence level for image recognition (default 0.8).
+        duration (float, optional): Duration for smooth movements in seconds (default 0.5).
+        region (tuple, optional): A region (x, y, width, height) to search for the image.
+        button (str, optional): Mouse button to use ('left', 'right', 'middle').
+        n_click (int, optional): Number of times to click for 'click' actions.
+        interval (float, optional): Interval between clicks for 'click' actions.
+        x_offset, y_offset (int, optional): Horizontal/vertical offset from the located image.
+        scroll_amount (int, optional): Amount to scroll (positive for up, negative for down).
+        fail_safe (bool, optional): Enable/disable pyautogui's fail-safe feature.
+        grayscale (bool, optional): Search for the image in grayscale mode.
+
+    Returns:
+        tuple or None: Returns coordinates for 'locate' actions, otherwise None.
+    """
+    import pyautogui
+    import time
+
+    pyautogui.FAILSAFE = fail_safe  # Enable/disable fail-safe
+    loc_type = "absolute" if "abs" in loc_type else "relative"
+    if len(args) == 1:
+        if isinstance(args[0], str):
+            image_path = args[0]
+            x_offset, y_offset = None, None
+        else:
+            x_offset, y_offset = args
+
+    elif len(args) == 2:
+        x_offset, y_offset = args
+    elif len(args) == 3:
+        x_offset, y_offset, action = args
+    elif len(args) == 4:
+        x_offset, y_offset, action, duration = args
+    else:
+        x_offset, y_offset = None, None
+
+    what_action = [
+        "locate",
+        "click",
+        "double_click",
+        "triple_click",
+        "input",
+        "write",
+        "type",
+        "drag",
+        "move",
+        "scroll",
+        "down",
+        "up",
+        "hold",
+        "press",
+        "release",
+    ]
+    action = strcmp(action, what_action)[0]
+    # get the locations
+    location = None
+    if any([x_offset is None, y_offset is None]):
+        if region is None:
+            w, h = pyautogui.size()
+            region = (0, 0, w, h)
+        print(region)
+        try:
+            print(image_path)
+            location = pyautogui.locateOnScreen(
+                image_path, confidence=confidence, region=region, grayscale=grayscale
+            )
+            print(pyautogui.center(location))
+        except Exception as e:
+            location = None
+
+    # try:
+    if location:
+        x, y = pyautogui.center(location)
+        x += x_offset if x_offset else 0
+        y += y_offset if y_offset else 0
+        x_offset, y_offset = x, y
+    print(action)
+    if action in ["locate"]:
+        x, y = pyautogui.position()
+    elif action in ["click", "double_click", "triple_click"]:
+        # if location:
+        #     x, y = pyautogui.center(location)
+        #     x += x_offset
+        #     y += y_offset
+        #     pyautogui.moveTo(x, y, duration=duration)
+        #     if action == "click":
+        #         pyautogui.click(x=x, y=y, clicks=n_click, interval=interval, button=button)
+        #     elif action == "double_click":
+        #         pyautogui.doubleClick(x=x, y=y, interval=interval, button=button)
+        #     elif action=='triple_click':
+        #         pyautogui.tripleClick(x=x, y=y, interval=interval, button=button)
+        # else:
+        if action == "click":
+            pyautogui.moveTo(x_offset, y_offset, duration=duration)
+            time.sleep(wait)
+            pyautogui.click(x=x_offset, y=y_offset, clicks=n_click, interval=interval, button=button)
+        elif action == "double_click":
+            pyautogui.moveTo(x_offset, y_offset, duration=duration)
+            time.sleep(wait)
+            pyautogui.doubleClick(x=x_offset, y=y_offset, interval=interval, button=button)
+        elif action == "triple_click":
+            pyautogui.moveTo(x_offset, y_offset, duration=duration)
+            time.sleep(wait)
+            pyautogui.tripleClick(x=x_offset, y=y_offset, interval=interval, button=button)
+
+    elif action in ["type", "write", "input"]:
+        pyautogui.moveTo(x_offset, y_offset, duration=duration)
+        time.sleep(wait)
+        if text is not None:
+            pyautogui.typewrite(text, interval=interval)
+        else:
+            raise ValueError("Text must be provided for the 'type' action.")
+
+    elif action == "drag":
+        if loc_type == "absolute":
+            pyautogui.dragTo(x_offset, y_offset, duration=duration, button=button)
+        else:
+            pyautogui.dragRel(x_offset, y_offset, duration=duration, button=button)
+
+    elif action in ["move"]:
+        if loc_type == "absolute":
+            pyautogui.moveTo(x_offset, y_offset, duration=duration)
+        else:
+            pyautogui.moveRel(x_offset, y_offset, duration=duration)
+
+    elif action == "scroll":
+        pyautogui.moveTo(x_offset, y_offset, duration=duration)
+        time.sleep(wait)
+        pyautogui.scroll(scroll_amount)
+
+    elif action in ["down", "hold", "press"]:
+        pyautogui.moveTo(x_offset, y_offset, duration=duration)
+        time.sleep(wait)
+        pyautogui.mouseDown(x_offset, y_offset, button=button, duration=duration)
+
+    elif action in ["up", "release"]:
+        pyautogui.moveTo(x_offset, y_offset, duration=duration)
+        time.sleep(wait)
+        pyautogui.mouseUp(x_offset, y_offset, button=button, duration=duration)
+
+    else:
+        raise ValueError(f"Unsupported action: {action}")
+
+    # except pyautogui.ImageNotFoundException:
+    #     print(
+    #         "Image not found. Ensure the image is visible and parameters are correct."
+    #     )
+    # except Exception as e:
+    #     print(f"An error occurred: {e}")
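Usage sketches for the new mouse helper (coordinates and the image path are illustrative):

    mouse(100, 200, action="move")              # glide to absolute coordinates
    mouse(100, 200, action="click", n_click=2)  # move, then click twice
    mouse("./button.png", action="click")       # locate an on-screen image, then click it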