py2ls 0.2.4.30__py3-none-any.whl → 0.2.4.32__py3-none-any.whl
py2ls/ips.py
CHANGED
@@ -23,7 +23,8 @@ import logging
 from pathlib import Path
 from datetime import datetime
 
-
+
+def run_once_within(duration=60, reverse=False):  # default 60s
     import time
 
     """
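Usage sketch (not part of the diff; semantics assumed from the call sites later in this file, e.g. `if run_once_within(reverse=True) and verbose:` — True at most once per `duration` window, inverted by `reverse`):

from py2ls.ips import run_once_within

for _ in range(3):
    if run_once_within(duration=60):  # assumed: True only on the first call per 60 s
        print("expensive setup runs once per minute at most")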
@@ -546,6 +547,7 @@ def is_text(s):
 
 from typing import Any, Union
 
+
 def share(*args, strict=True, n_shared=2, verbose=True):
     """
     check the shared elelements in two list.
@@ -591,13 +593,14 @@ def share(*args, strict=True, n_shared=2, verbose=True):
     elements2show = (
         shared_elements if len(shared_elements) < 10 else shared_elements[:5]
     )
-    tail =
+    tail = "" if len(shared_elements) < 10 else "......"
     elements2show.append(tail)
     print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
     print("********* checking shared elements *********")
     return shared_elements
 
-
+
+def shared(*args, n_shared=None, verbose=True, **kwargs):
     """
     check the shared elelements in two list.
     usage:
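Usage sketch (not part of the diff; the printed summary and return value are inferred from the `return shared_elements` and print statements above):

from py2ls.ips import share

common = share([1, 2, 3, 4], [3, 4, 5])  # prints "... elements shared: ..."
# expected: [3, 4]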
@@ -652,7 +655,8 @@ def shared(*args, n_shared=None, verbose=True,**kwargs):
     print("********* checking shared elements *********")
     return shared_elements
 
-
+
+def share_not(*args, n_shared=None, verbose=False):
     """
     To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
     usage:
@@ -660,10 +664,12 @@ def share_not(*args, n_shared=None, verbose=False):
     list2 = [4, 5, 6, 7, 8]
     not_shared(list1,list2)# output [1,3]
     """
-    _common = shared(*args,
+    _common = shared(*args, n_shared=n_shared, verbose=verbose)
     list1 = flatten(args[0], verbose=verbose)
     _not_shared = [item for item in list1 if item not in _common]
     return _not_shared
+
+
 def not_shared(*args, n_shared=None, verbose=False):
     """
     To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
@@ -779,7 +785,8 @@ def strcmp(
     print(f"建议: {best_match}")
     return candidates[best_match_index], best_match_index
 
-
+
+def imgcmp(img: list, method="knn", plot_=True, figsize=[12, 6]):
     """
     Compare two images using SSIM, Feature Matching (SIFT), or KNN Matching.
 
@@ -796,15 +803,16 @@ def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
     import cv2
     import matplotlib.pyplot as plt
     from skimage.metrics import structural_similarity as ssim
+
     # Load images
     image1 = cv2.imread(img[0])
     image2 = cv2.imread(img[1])
 
     if image1 is None or image2 is None:
         raise ValueError("Could not load one or both images. Check file paths.")
-    methods=[
-    method=strcmp(method, methods)[0]
-    if method ==
+    methods = ["ssim", "match", "knn"]
+    method = strcmp(method, methods)[0]
+    if method == "ssim":
         # Convert images to grayscale
         gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
         gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)
@@ -819,18 +827,18 @@ def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
         # Plot if needed
         if plot_:
             fig, ax = plt.subplots(1, 3, figsize=figsize)
-            ax[0].imshow(gray1, cmap=
+            ax[0].imshow(gray1, cmap="gray")
             ax[0].set_title("Image 1")
-            ax[1].imshow(gray2, cmap=
+            ax[1].imshow(gray2, cmap="gray")
             ax[1].set_title("Image 2")
-            ax[2].imshow(diff, cmap=
+            ax[2].imshow(diff, cmap="gray")
             ax[2].set_title("Difference (SSIM)")
             plt.tight_layout()
             plt.show()
-
+
         return diff, score
 
-    elif method in [
+    elif method in ["match", "knn"]:
         # Convert images to grayscale
         gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
         gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)
@@ -848,15 +856,17 @@ def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
         # BFMatcher initialization
         bf = cv2.BFMatcher()
 
-        if method ==
+        if method == "match":  # Cross-check matching
             bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
             matches = bf.match(descriptors1, descriptors2)
             matches = sorted(matches, key=lambda x: x.distance)
 
             # Filter good matches
-            good_matches = [
+            good_matches = [
+                m for m in matches if m.distance < 0.75 * matches[-1].distance
+            ]
 
-        elif method ==
+        elif method == "knn":  # KNN matching with ratio test
             matches = bf.knnMatch(descriptors1, descriptors2, k=2)
             # Apply Lowe's ratio test
             good_matches = [m for m, n in matches if m.distance < 0.75 * n.distance]
@@ -865,14 +875,18 @@ def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
         similarity_score = len(good_matches) / min(len(keypoints1), len(keypoints2))
         print(f"Number of good matches: {len(good_matches)}")
         print(f"Similarity Score: {similarity_score:.4f}")
-
+        # Handle case where no good matches are found
         if len(good_matches) == 0:
             print("No good matches found.")
             return good_matches, 0.0, None
 
         # Identify matched keypoints
-        src_pts = np.float32([keypoints1[m.queryIdx].pt for m in good_matches]).reshape(
-
+        src_pts = np.float32([keypoints1[m.queryIdx].pt for m in good_matches]).reshape(
+            -1, 1, 2
+        )
+        dst_pts = np.float32([keypoints2[m.trainIdx].pt for m in good_matches]).reshape(
+            -1, 1, 2
+        )
 
         # Calculate Homography using RANSAC
         homography_matrix, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
@@ -893,11 +907,15 @@ def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
 
         # Plot matches if needed
         if plot_:
-            result = cv2.drawMatches(
+            result = cv2.drawMatches(
+                image1, keypoints1, image2, keypoints2, good_matches, None, flags=2
+            )
             plt.figure(figsize=figsize)
             plt.imshow(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))
-            plt.title(
-
+            plt.title(
+                f"Feature Matches ({len(good_matches)} matches, Score: {similarity_score:.4f})"
+            )
+            plt.axis("off")
             plt.show()
         # Identify unmatched keypoints
         matched_idx1 = [m.queryIdx for m in good_matches]
@@ -907,8 +925,20 @@ def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
         unmatched_kp2 = [kp for i, kp in enumerate(keypoints2) if i not in matched_idx2]
 
         # Mark unmatched keypoints on the images
-        img1_marked = cv2.drawKeypoints(
-
+        img1_marked = cv2.drawKeypoints(
+            image1,
+            unmatched_kp1,
+            None,
+            color=(0, 0, 255),
+            flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS,
+        )
+        img2_marked = cv2.drawKeypoints(
+            image2,
+            unmatched_kp2,
+            None,
+            color=(0, 0, 255),
+            flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS,
+        )
 
         # Display results
         if plot_:
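Usage sketch for `imgcmp` (not part of the diff; file names are placeholders, and the KNN return shape is inferred from the `return good_matches, 0.0, None` fallback above):

from py2ls.ips import imgcmp

# "ssim" branch: difference image plus SSIM score
diff, score = imgcmp(["a.png", "b.png"], method="ssim", plot_=False)

# "knn" branch: SIFT features filtered by Lowe's ratio test (m.distance < 0.75 * n.distance)
good_matches, similarity, extra = imgcmp(["a.png", "b.png"], method="knn", plot_=False)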
@@ -1017,6 +1047,7 @@ def counter(list_, verbose=True):
     # print(f"Return a list of the n most common elements:\n{c.most_common()}")
     # print(f"Compute the sum of the counts:\n{c.total()}")
 
+
 def dict2df(dict_, fill=None):
     len_max = 0
     for key, value in dict_.items():
@@ -1031,11 +1062,12 @@ def dict2df(dict_, fill=None):
         dict_[key] = value
     return pd.DataFrame.from_dict(dict_)
 
+
 def text2audio(
     text,
     method=None,  # "pyttsx3","gTTS"
     rate=200,
-    slow=False
+    slow=False,  # "gTTS"
     volume=1.0,
     voice=None,
     lang=None,
@@ -1056,16 +1088,38 @@ def text2audio(
     # )
     """
     if method is not None:
-        methods=["gTTS","pyttsx3","google"]
-        method=strcmp(method, methods)[0]
+        methods = ["gTTS", "pyttsx3", "google"]
+        method = strcmp(method, methods)[0]
     else:
         try:
-            text2audio(
+            text2audio(
+                text,
+                method="google",
+                rate=rate,
+                slow=slow,
+                volume=volume,
+                voice=voice,
+                lang=lang,
+                gender=gender,
+                age=age,
+                dir_save=dir_save,
+            )
         except Exception as e:
             print(e)
-            text2audio(
-
-
+            text2audio(
+                text,
+                method="pyttsx3",
+                rate=rate,
+                slow=slow,
+                volume=volume,
+                voice=voice,
+                lang=lang,
+                gender=gender,
+                age=age,
+                dir_save=dir_save,
+            )
+
+    if method == "pyttsx3":
         import pyttsx3
 
         try:
@@ -1140,27 +1194,29 @@ def text2audio(
                 sys.exit()
             except SystemExit:
                 pass
-    elif method.lower() in [
+    elif method.lower() in ["google", "gtts"]:
         from gtts import gTTS
+
         try:
             if lang is None:
                 from langdetect import detect
+
                 lang = detect(text)
             # Initialize gTTS with the provided parameters
             tts = gTTS(text=text, lang=lang, slow=slow)
         except Exception as e:
             print(f"An error occurred: {e}")
-
+
         print("not realtime reading...")
         if dir_save:
             if "." not in dir_save:
-                dir_save=dir_save+".mp3"
+                dir_save = dir_save + ".mp3"
             tts.save(dir_save)
             print(f"Audio saved to {dir_save}")
         else:
             dir_save = "temp_audio.mp3"
             if "." not in dir_save:
-                dir_save=dir_save+".mp3"
+                dir_save = dir_save + ".mp3"
             tts.save(dir_save)
         try:
             fopen(dir_save)
@@ -1168,6 +1224,7 @@ def text2audio(
             print(f"Error opening file: {e}")
     print("done")
 
+
 def str2time(time_str, fmt="24"):
     """
     Convert a time string into the specified format.
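Usage sketch for `text2audio` (not part of the diff; per the fallback chain added above, `method=None` tries "google"/gTTS first and falls back to "pyttsx3" for offline synthesis):

from py2ls.ips import text2audio

text2audio("Hello from py2ls", method="gTTS", lang="en", dir_save="hello.mp3")
text2audio("Offline fallback", method="pyttsx3", rate=200, volume=1.0)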
@@ -1624,6 +1681,7 @@ def img2pdf(dir_img, kind=None, page=None, dir_save=None, page_size="a4", dpi=30
     def set_dpi(x):
         dpix = dpiy = x
         return image2pdf.get_fixed_dpi_layout_fun((dpix, dpiy))
+
     if kind is None:
         _, kind = os.path.splitext(dir_img)
     if not kind.startswith("."):
@@ -1649,8 +1707,9 @@ def img2pdf(dir_img, kind=None, page=None, dir_save=None, page_size="a4", dpi=30
             imgs.append(path)
     else:
         imgs = [
-            # os.path.isdir(dir_img),
-            dir_img
+            # os.path.isdir(dir_img),
+            dir_img
+        ]
     print(imgs)
     if page_size:
         if isinstance(page_size, str):
@@ -2196,7 +2255,7 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     # Check data types
     data_types = df.dtypes
     # messages.append(f"Data types of columns:\n{data_types}")
-
+
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
         messages.append(
@@ -2347,33 +2406,36 @@ def fload(fpath, kind=None, **kwargs):
 
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        engine = kwargs.pop("engine", "pyarrow")  # default: None
+        sep = kwargs.pop("sep", None)  # default: ','
+        index_col = kwargs.pop("index_col", None)  # default: None
+        memory_map = kwargs.pop("memory_map", False)  # default: False
+        skipinitialspace = kwargs.pop("skipinitialspace", False)  # default: False
+        encoding = kwargs.pop("encoding", "utf-8")  # default: "utf-8"
+        on_bad_lines = kwargs.pop("on_bad_lines", "skip")  # default: 'error'
+        comment = kwargs.pop("comment", None)  # default: None
+        fmt = kwargs.pop("fmt", False)  # default:
+        chunksize = kwargs.pop("chunksize", None)  # default: None
+
+        # check filesize
+        f_size = round(os.path.getsize(fpath) / 1024 / 1024, 3)
+        if f_size >= 50:  # 50 MB
            if chunksize is None:
-                chunksize
-                print(
+                chunksize = 5000
+                print(
+                    f"file size is {f_size}MB, then set the chunksize with {chunksize}"
+                )
        engine = "c" if chunksize else engine  # when chunksize, recommend 'c'
-        low_memory = kwargs.pop("low_memory", True)# default: True
+        low_memory = kwargs.pop("low_memory", True)  # default: True
        low_memory = (
            False if chunksize else True
-        ) # when chunksize, recommend low_memory=False # default:
+        )  # when chunksize, recommend low_memory=False  # default:
        verbose = kwargs.pop("verbose", False)
        if run_once_within(reverse=True) and verbose:
            use_pd("read_csv", verbose=verbose)
 
-        if comment is None
+        if comment is None:  # default: None
            comment = get_comment(
                fpath, comment=None, encoding="utf-8", lines_to_check=5
            )
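The chunking added to `load_csv` mirrors plain pandas; a self-contained sketch of the same pattern (file name is a placeholder):

import pandas as pd

chunks = pd.read_csv("big.csv", engine="c", chunksize=5000, low_memory=False)
df = pd.concat(chunks, ignore_index=True)  # stream a large file 5000 rows at a time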
@@ -2503,7 +2565,9 @@ def fload(fpath, kind=None, **kwargs):
         try:
             sep2show = sep if sep != "\t" else "\\t"
             if verbose:
-                print(
+                print(
+                    f"trying with: engine={engine}, sep='{sep2show}'"
+                )
             # print(".")
             df = pd.read_csv(
                 fpath,
@@ -2524,12 +2588,12 @@ def fload(fpath, kind=None, **kwargs):
             if verbose:
                 (
                     display(df.head(2))
-                    if isinstance(df, pd.DataFrame)
+                    if isinstance(df, pd.DataFrame)
                     else display("it is not a DataFrame")
                 )
                 (
                     print(f"shape: {df.shape}")
-                    if isinstance(df, pd.DataFrame)
+                    if isinstance(df, pd.DataFrame)
                     else display("it is not a DataFrame")
                 )
             return df
@@ -2663,9 +2727,10 @@ def fload(fpath, kind=None, **kwargs):
         doc = Document(fpath)
         content = [para.text for para in doc.paragraphs]
         return content
-
+
     def load_rtf(file_path):
         from striprtf.striprtf import rtf_to_text
+
         try:
             with open(file_path, "r") as file:
                 rtf_content = file.read()
@@ -2715,7 +2780,7 @@ def fload(fpath, kind=None, **kwargs):
         "xml",
         "ipynb",
         "mtx",
-        "rtf"
+        "rtf",
     ]
     zip_types = [
         "gz",
@@ -2735,7 +2800,7 @@ def fload(fpath, kind=None, **kwargs):
     if kind not in supported_types:
         print(
             f'Warning:\n"{kind}" is not in the supported list '
-        ) # {supported_types}')
+        )  # {supported_types}')
 
     if kind == "docx":
         return load_docx(fpath)
@@ -2760,10 +2825,11 @@ def fload(fpath, kind=None, **kwargs):
         if run_once_within(reverse=True) and verbose:
             use_pd("read_pickle")
         try:
-            res_=pd.read_pickle(fpath, **kwargs)
+            res_ = pd.read_pickle(fpath, **kwargs)
         except Exception as e:
             import pickle
-
+
+            with open("sgd_classifier.pkl", "rb") as f:
                 res_ = pickle.load(f)
         return res_
     elif kind in ["ods", "ods", "odt"]:
@@ -2775,21 +2841,34 @@ def fload(fpath, kind=None, **kwargs):
         engine = kwargs.get("engine", "xlrd")
         kwargs.pop("engine", None)
         content = load_excel(fpath, engine=engine, **kwargs)
-
+        (
+            print(f"shape: {content.shape}")
+            if isinstance(content, pd.DataFrame) and verbose
+            else None
+        )
         display(content.head(3)) if isinstance(content, pd.DataFrame) else None
         return content
     elif kind == "xlsx":
         verbose = kwargs.pop("verbose", False)
         content = load_excel(fpath, **kwargs)
-
+        (
+            display(content.head(3))
+            if isinstance(content, pd.DataFrame) and verbose
+            else None
+        )
         print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
         return content
     elif kind == "mtx":
         from scipy.io import mmread
+
         verbose = kwargs.pop("verbose", False)
         dat_mtx = mmread(fpath)
         content = pd.DataFrame.sparse.from_spmatrix(dat_mtx, **kwargs)
-
+        (
+            display(content.head(3))
+            if isinstance(content, pd.DataFrame) and verbose
+            else None
+        )
        print(f"shape: {content.shape}")
        return content
    elif kind == "ipynb":
@@ -2904,34 +2983,34 @@ def fopen(fpath):
     import os
     import platform
     import sys
+
     try:
         # Check if the file exists
         if not os.path.isfile(fpath):
             print(f"Error: The file does not exist - {fpath}")
             return
-
+
         # Get the system platform
         system = platform.system()
 
         # Platform-specific file opening commands
         if system == "Darwin":  # macOS
-            os.system(f
+            os.system(f'open "{fpath}"')
         elif system == "Windows":  # Windows
             # Ensure the path is handled correctly in Windows, escape spaces
-            os.system(f
+            os.system(f'start "" "{fpath}"')
         elif system == "Linux":  # Linux
-            os.system(f
+            os.system(f'xdg-open "{fpath}"')
         elif system == "Java":  # Java (or other unhandled systems)
             print(f"Opening {fpath} on unsupported system.")
         else:
             print(f"Unsupported OS: {system}")
-
+
         print(f"Successfully opened {fpath} with the default application.")
     except Exception as e:
         print(f"Error opening file {fpath}: {e}")
 
 
-
 def fupdate(fpath, content=None, how="head"):
     """
     Update a file by adding new content at the top and moving the old content to the bottom.
@@ -3346,9 +3425,10 @@ def fsave(
         except Exception as e:
             try:
                 import pickle
-
+
+                with open(fpath, "wb") as f:
                     pickle.dump(content, f)
-                print(
+                print("done!", fpath)
             except Exception as e:
                 raise ValueError(
                     f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
@@ -3508,9 +3588,9 @@ def isa(content, kind):
     """
     if "img" in kind.lower() or "image" in kind.lower():
         return is_image(content)
-    elif
+    elif "vid" in kind.lower():
         return is_video(content)
-    elif
+    elif "aud" in kind.lower():
         return is_audio(content)
     elif "doc" in kind.lower():
         return is_document(content)
@@ -3755,22 +3835,28 @@ def get_os(full=False, verbose=False):
 
     def get_system_uptime():
         """Returns system uptime as a human-readable string."""
-
-
-
+        try:
+            boot_time = datetime.fromtimestamp(psutil.boot_time())
+            uptime = datetime.now() - boot_time
+            return str(uptime).split(".")[0]  # Remove microseconds
+        except:
+            return None
 
     def get_active_processes(limit=10):
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            processes = []
+            for proc in psutil.process_iter(
+                ["pid", "name", "cpu_percent", "memory_percent"]
+            ):
+                try:
+                    processes.append(proc.info)
+                except psutil.NoSuchProcess:
+                    pass
+            # Handle NoneType values by treating them as 0
+            processes.sort(key=lambda x: x["cpu_percent"] or 0, reverse=True)
+            return processes[:limit]
+        except:
+            return None
 
     def get_virtual_environment_info():
         """Checks if the script is running in a virtual environment and returns details."""
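The `psutil.process_iter` pattern above is standard psutil; a self-contained sketch of the same top-N-by-CPU listing:

import psutil

procs = []
for proc in psutil.process_iter(["pid", "name", "cpu_percent"]):
    try:
        procs.append(proc.info)
    except psutil.NoSuchProcess:
        pass  # process exited while iterating
procs.sort(key=lambda p: p["cpu_percent"] or 0, reverse=True)  # None counts as 0
print(procs[:10])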
@@ -3801,19 +3887,22 @@ def get_os(full=False, verbose=False):
 
     def get_battery_status():
         """Returns battery status."""
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            battery = psutil.sensors_battery()
+            if battery:
+                time_left = (
+                    str(timedelta(seconds=battery.secsleft))
+                    if battery.secsleft != psutil.POWER_TIME_UNLIMITED
+                    else "Charging/Unlimited"
+                )
+                return {
+                    "Percentage": battery.percent,
+                    "Plugged In": battery.power_plugged,
+                    "Time Left": time_left,
+                }
+            return {"Status": "No battery detected"}
+        except:
+            return {"Status": "No battery detected"}
 
     def get_disk_io():
         """Returns disk I/O statistics."""
@@ -3899,8 +3988,8 @@ def get_os(full=False, verbose=False):
         "network": {},
         "network io": get_network_io(),
         "gpu": [],
-        "temperatures": get_temperatures(),
-        "battery": get_battery_status(),
+        # "temperatures": get_temperatures(),
+        # "battery": get_battery_status(),
         "active processes": get_active_processes(),
         "environment": {
             "user": os.getenv("USER", "Unknown"),
@@ -3984,27 +4073,31 @@ def get_os(full=False, verbose=False):
         pnrint(e)
         return res
 
+
 import re
 import stat
 import platform
+
+
 def listdir(
     rootdir,
     kind=None,
     sort_by="name",
     ascending=True,
-    contains=None
-    booster=False
-    depth
+    contains=None,  # filter filenames using re
+    booster=False,  # walk in subfolders
+    depth=0,  # 0: no subfolders; None: all subfolders; [int 1,2,3]: levels of subfolders
     hidden=False,  # Include hidden files/folders
     orient="list",
     output="df",  # "df", 'list','dict','records','index','series'
     verbose=True,
-):
+):
     def is_hidden(filepath):
         """Check if a file or folder is hidden."""
         system = platform.system()
         if system == "Windows":
             import ctypes
+
             attribute = ctypes.windll.kernel32.GetFileAttributesW(filepath)
             if attribute == -1:
                 raise FileNotFoundError(f"File {filepath} not found.")
@@ -4019,6 +4112,7 @@ def listdir(
             return os.environ.get("USERNAME", "Unknown")
         else:
             import pwd
+
             return pwd.getpwuid(os.getuid()).pw_name
 
     if isinstance(kind, list):
@@ -4030,7 +4124,7 @@ def listdir(
                 sort_by=sort_by,
                 ascending=ascending,
                 contains=contains,
-                depth=depth
+                depth=depth,  # walk in subfolders
                 hidden=hidden,
                 orient=orient,
                 output=output,
@@ -4046,21 +4140,21 @@ def listdir(
     i = 0
     f = {
         "name": [],
-
+        "kind": [],
         "length": [],
-        "basename":[],
+        "basename": [],
         "path": [],
         "created_time": [],
         "modified_time": [],
         "last_open_time": [],
         "size": [],
-        "permission":[],
-        "owner":[],
-        "rootdir":[],
+        "permission": [],
+        "owner": [],
+        "rootdir": [],
         "fname": [],
         "fpath": [],
-        "num":[],
-        "os":[]
+        "num": [],
+        "os": [],
     }
     root_depth = rootdir.rstrip(os.sep).count(os.sep)
     for dirpath, dirnames, ls in os.walk(rootdir):
@@ -4069,30 +4163,32 @@ def listdir(
         if depth is not None and current_depth > depth:
             dirnames[:] = []  # Prevent further traversal into subfolders
             continue
-
+
         if not hidden:
-            dirnames[:] = [
+            dirnames[:] = [
+                d for d in dirnames if not is_hidden(os.path.join(dirpath, d))
+            ]
             ls = [i for i in ls if not is_hidden(os.path.join(dirpath, i))]
 
         for dirname in dirnames:
-            if kind is not None and kind not in fd:
+            if kind is not None and kind not in fd:  # do not check folders
                 continue
             if contains and not re.search(contains, dirname):
                 continue
             dirname_path = os.path.join(dirpath, dirname)
-            fpath = os.path.join(os.path.dirname(dirname_path), dirname)
+            fpath = os.path.join(os.path.dirname(dirname_path), dirname)
             try:
                 stats_file = os.stat(fpath)
             except Exception as e:
                 print(e)
                 continue
             filename, file_extension = os.path.splitext(dirname)
-            file_extension = file_extension if file_extension!=
+            file_extension = file_extension if file_extension != "" else None
             f["name"].append(filename)
-            f[
+            f["kind"].append(file_extension)
             f["length"].append(len(filename))
             f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
-            f[
+            f["basename"].append(os.path.basename(dirname_path))
             f["path"].append(os.path.join(os.path.dirname(dirname_path), dirname))
             f["created_time"].append(
                 pd.to_datetime(int(os.path.getctime(dirname_path)), unit="s")
@@ -4110,7 +4206,7 @@ def listdir(
             f["fpath"].append(fpath)  # will be removed
             i += 1
         for item in ls:
-            if kind in fd
+            if kind in fd:  # only check folders
                 continue
             if contains and not re.search(contains, item):
                 continue
@@ -4127,7 +4223,16 @@ def listdir(
             is_file = kind.lower() in file_extension.lower() and (
                 os.path.isfile(item_path)
             )
-            if kind in [
+            if kind in [
+                ".doc",
+                ".img",
+                ".zip",
+                ".code",
+                ".file",
+                ".image",
+                ".video",
+                ".audio",
+            ]:  # 选择大的类别
                 if kind != ".folder" and not isa(item_path, kind):
                     continue
             elif kind in [".all"]:
@@ -4135,13 +4240,13 @@ def listdir(
             else:  # 精确到文件的后缀
                 if not is_folder and not is_file:
                     continue
-            file_extension = file_extension if file_extension!=
+            file_extension = file_extension if file_extension != "" else None
             f["name"].append(filename)
-            f[
+            f["kind"].append(file_extension)
             f["length"].append(len(filename))
             f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
-            f[
-            f["path"].append(os.path.join(os.path.dirname(item_path), item))
+            f["basename"].append(os.path.basename(item_path))
+            f["path"].append(os.path.join(os.path.dirname(item_path), item))
             f["created_time"].append(
                 pd.to_datetime(int(os.path.getctime(item_path)), unit="s")
             )
@@ -4152,7 +4257,9 @@ def listdir(
                 pd.to_datetime(int(os.path.getatime(item_path)), unit="s")
             )
             f["permission"].append(stat.filemode(stats_file.st_mode)),
-            f["owner"].append(
+            f["owner"].append(
+                os.getlogin() if platform.system() != "Windows" else "N/A"
+            ),
             f["fname"].append(filename)  # will be removed
             f["fpath"].append(fpath)  # will be removed
         f["rootdir"].append(dirpath)
@@ -4162,11 +4269,28 @@ def listdir(
     f["os"] = get_os()  # os.uname().machine
     # if not booster: # go deeper subfolders
     # break
-
+    # * convert to pd.DataFrame
     f = pd.DataFrame(f)
-    f=f[
-
-
+    f = f[
+        [
+            "basename",
+            "name",
+            "kind",
+            "length",
+            "size",
+            "num",
+            "path",
+            "created_time",
+            "modified_time",
+            "last_open_time",
+            "rootdir",
+            "permission",
+            "owner",
+            "os",
+            "fname",
+            "fpath",
+        ]
+    ]
     if "nam" in sort_by.lower():
         f = sort_kind(f, by="name", ascending=ascending)
     elif "crea" in sort_by.lower():
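Usage sketch for `listdir` (not part of the diff; the DataFrame columns follow the reordered list in this hunk):

from py2ls.ips import listdir

df = listdir(".", kind=".py", sort_by="size", ascending=False, depth=0)
print(df[["name", "kind", "size", "created_time"]].head())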
@@ -4183,6 +4307,7 @@ def listdir(
         return f
     else:
         from box import Box
+
         if "l" in orient.lower():  # list # default
             res_output = Box(f.to_dict(orient="list"))
             return res_output
@@ -4195,6 +4320,7 @@ def listdir(
         if "se" in orient.lower():  # records
             return Box(f.to_dict(orient="series"))
 
+
 def listfunc(lib_name, opt="call"):
     if opt == "call":
         funcs = [func for func in dir(lib_name) if callable(getattr(lib_name, func))]
@@ -4206,6 +4332,7 @@ def listfunc(lib_name, opt="call"):
 def func_list(lib_name, opt="call"):
     return list_func(lib_name, opt=opt)
 
+
 def copy(src, dst, overwrite=False):
     """Copy a file from src to dst."""
     try:
@@ -4223,25 +4350,31 @@ def copy(src, dst, overwrite=False):
                 if overwrite:
                     dst.unlink()
                 else:
-                    dst = dst.with_name(
+                    dst = dst.with_name(
+                        f"{dst.stem}_{datetime.now().strftime('_%H%M%S')}{dst.suffix}"
+                    )
             shutil.copy(src, dst)
             print(f"\n Done! copy to {dst}\n")
         else:
-            dst = dst/src.name
+            dst = dst / src.name
             if dst.exists():
                 if overwrite:
                     shutil.rmtree(dst)  # Remove existing directory
                 else:
-                    dst = dst.with_name(
+                    dst = dst.with_name(
+                        f"{dst.stem}_{datetime.now().strftime('%H%M%S')}"
+                    )
             shutil.copytree(src, dst)
             print(f"\n Done! copy to {dst}\n")
 
     except Exception as e:
         logging.error(f"Failed {e}")
-
+
+
 def cut(src, dst, overwrite=False):
     return move(src=src, dst=dst, overwrite=overwrite)
 
+
 def move(src, dst, overwrite=False):
     try:
         dir_par_dst = os.path.dirname(dst)
@@ -4256,23 +4389,26 @@ def move(src, dst, overwrite=False):
                 # dst.unlink() # Delete the existing file
                 pass
             else:
-                dst = dst.with_name(
+                dst = dst.with_name(
+                    f"{dst.stem}_{datetime.now().strftime('_%H%M%S')}{dst.suffix}"
+                )
         shutil.move(src, dst)
         print(f"\n Done! moved to {dst}\n")
     except Exception as e:
         logging.error(f"Failed to move file from {src} to {dst}: {e}")
-
+
+
 def delete(fpath):
-    """Delete a file/folder."""
+    """Delete a file/folder."""
     try:
         fpath = Path(fpath)
-        if not fpath.is_dir():
+        if not fpath.is_dir():  # file
            if fpath.exists():
                fpath.unlink()
                print(f"\n Done! delete {fpath}\n")
            else:
                print(f"File '{fpath}' does not exist.")
-        else
+        else:  # folder
            if fpath.exists():
                shutil.rmtree(fpath)  # Remove existing directory
                print(f"\n Done! delete {fpath}\n")
@@ -4280,27 +4416,31 @@ def delete(fpath):
             print(f"Folder '{fpath}' does not exist.")
     except Exception as e:
         logging.error(f"Failed to delete {fpath}: {e}")
+
+
 def rename(fpath, dst, smart=True):
     """Rename a file or folder."""
     try:
-        src_kind,dst_kind = None,None
+        src_kind, dst_kind = None, None
         if smart:
-            dir_name_src=os.path.dirname(fpath)
-            dir_name_dst=os.path.dirname(dst)
-            src_kind=os.path.splitext(fpath)[1]
-            dst_kind=os.path.splitext(dst)[1]
-            if dir_name_dst!=dir_name_src:
-                dst=os.path.join(dir_name_src,dst)
+            dir_name_src = os.path.dirname(fpath)
+            dir_name_dst = os.path.dirname(dst)
+            src_kind = os.path.splitext(fpath)[1]
+            dst_kind = os.path.splitext(dst)[1]
+            if dir_name_dst != dir_name_src:
+                dst = os.path.join(dir_name_src, dst)
         if dst_kind is not None and src_kind is not None:
-            if dst_kind!=src_kind:
-                dst=dst + src_kind
+            if dst_kind != src_kind:
+                dst = dst + src_kind
         if os.path.exists(fpath):
-            os.rename(fpath,dst)
+            os.rename(fpath, dst)
             print(f"Done! rename to {dst}")
         else:
             print(f"Failed: {fpath} does not exist.")
     except Exception as e:
         logging.error(f"Failed to rename {fpath} to {dst}: {e}")
+
+
 def mkdir_nest(fpath: str) -> str:
     """
     Create nested directories based on the provided file path.
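Usage sketch for the file helpers reformatted above (not part of the diff; per the `with_name(...)` branches, `overwrite=False` appends a timestamp suffix instead of clobbering):

from py2ls.ips import copy, move, rename, delete

copy("notes.txt", "backup/notes.txt")
move("draft.txt", "archive/draft.txt", overwrite=True)
rename("report.txt", "report_final")  # smart=True re-appends the .txt suffix
delete("old_folder")  # removes a file or a whole directory tree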
@@ -4319,9 +4459,13 @@ def mkdir_nest(fpath: str) -> str:
     dir_parts = fpath.split(f_slash)  # Split the path by the OS-specific separator
 
     # Start creating directories from the root to the desired path
-    root_dir = os.path.splitdrive(fpath)[
-
-
+    root_dir = os.path.splitdrive(fpath)[
+        0
+    ]  # Get the root drive on Windows (e.g., 'C:')
+    current_path = (
+        root_dir if root_dir else f_slash
+    )  # Start from the root directory or POSIX '/'
+
     for part in dir_parts:
         if part:
             current_path = os.path.join(current_path, part)
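For comparison, the stdlib covers the same nested-directory creation in one call (equivalent under the assumption that only directory existence matters):

import os

os.makedirs("/tmp/a/b/c", exist_ok=True)  # creates every missing level; no error if present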
@@ -4346,7 +4490,7 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     - str: The path of the created directory or an error message.
     """
     rootdir = []
-    pardir= mkdir_nest(pardir)
+    pardir = mkdir_nest(pardir)
     if chdir is None:
         return pardir
     else:
@@ -4465,6 +4609,7 @@ def figsave(*args, dpi=300):
         img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
     elif isinstance(img, np.ndarray):
         import cv2
+
         # Check the shape of the image to determine color mode
         if img.ndim == 2:
             # Grayscale image
@@ -4496,8 +4641,13 @@ def figsave(*args, dpi=300):
         )
     else:
         plt.savefig(
-            fname,
-
+            fname,
+            format=ftype.lower(),
+            dpi=dpi,
+            bbox_inches="tight",
+            transparent=True,
+            pad_inches=0,
+        )
     elif ftype.lower() == "emf":
         plt.savefig(fname, format="emf", dpi=dpi, bbox_inches="tight", pad_inches=0)
     elif ftype.lower() == "fig":
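Usage sketch for `figsave` (not part of the diff; the argument split across `*args` is an assumption — it saves the current matplotlib figure with the tight/transparent settings shown above):

import matplotlib.pyplot as plt
from py2ls.ips import figsave

plt.plot([0, 1, 2], [0, 1, 4])
figsave("./out/", "curve.pdf", dpi=300)  # assumed: directory and file name as separate args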
@@ -4534,6 +4684,7 @@ def is_num(s):
 def isnum(s):
     return is_num(s)
 
+
 def is_image(fpath):
     """
     Determine if a given file is an image based on MIME type and file extension.
@@ -4545,6 +4696,7 @@ def is_image(fpath):
         bool: True if the file is a recognized image, False otherwise.
     """
     import mimetypes
+
     # Known image MIME types
     image_mime_types = {
         "image/jpeg",
@@ -4561,8 +4713,20 @@ def is_image(fpath):
 
     # Known image file extensions
     image_extensions = {
-        ".jpg",
-        ".
+        ".jpg",
+        ".jpeg",
+        ".png",
+        ".gif",
+        ".bmp",
+        ".webp",
+        ".tif",
+        ".tiff",
+        ".ico",
+        ".svg",
+        ".heic",
+        ".heif",
+        ".fig",
+        ".jpg",
     }
 
     # Get MIME type using mimetypes
@@ -4573,12 +4737,15 @@ def is_image(fpath):
         return True
 
     # Fallback: Check file extension
-    ext = os.path.splitext(fpath)[
+    ext = os.path.splitext(fpath)[
+        -1
+    ].lower()  # Get the file extension and ensure lowercase
     if ext in image_extensions:
         return True
 
     return False
 
+
 def is_video(fpath):
     """
     Determine if a given file is a video based on MIME type and file extension.
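The MIME-then-extension fallback used by `is_image`/`is_video`/`is_audio` is plain stdlib; a self-contained sketch of the pattern:

import mimetypes
import os

def looks_like_image(fpath):
    mime_type, _ = mimetypes.guess_type(fpath)
    if mime_type and mime_type.startswith("image/"):
        return True
    # Fallback: extension check when the MIME type is unknown
    return os.path.splitext(fpath)[-1].lower() in {".jpg", ".jpeg", ".png", ".gif"}

print(looks_like_image("photo.JPG"))  # True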
@@ -4590,6 +4757,7 @@ def is_video(fpath):
         bool: True if the file is a recognized video, False otherwise.
     """
     import mimetypes
+
     # Known video MIME types
     video_mime_types = {
         "video/mp4",
@@ -4610,8 +4778,22 @@ def is_video(fpath):
 
     # Known video file extensions
     video_extensions = {
-        ".mp4",
-        ".
+        ".mp4",
+        ".mov",
+        ".avi",
+        ".mkv",
+        ".flv",
+        ".webm",
+        ".ogv",
+        ".wmv",
+        ".mpg",
+        ".mpeg",
+        ".3gp",
+        ".mpeg2",
+        ".asf",
+        ".ts",
+        ".m4v",
+        ".divx",
     }
 
     # Get MIME type using mimetypes
@@ -4622,12 +4804,15 @@ def is_video(fpath):
         return True
 
     # Fallback: Check file extension
-    ext = os.path.splitext(fpath)[
+    ext = os.path.splitext(fpath)[
+        -1
+    ].lower()  # Get the file extension and ensure lowercase
     if ext in video_extensions:
         return True
 
     return False
 
+
 def is_document(fpath):
     """
     Determine if a given file is a document based on MIME type and file extension.
@@ -4639,6 +4824,7 @@ def is_document(fpath):
         bool: True if the file is a recognized document, False otherwise.
     """
     import mimetypes
+
     # Define known MIME types for documents
     document_mime_types = {
         "text/",
@@ -4679,18 +4865,23 @@ def is_document(fpath):
 
     # Get MIME type
     mime_type, _ = mimetypes.guess_type(fpath)
-
+
     # Check MIME type
-    if mime_type and any(
+    if mime_type and any(
+        mime_type.startswith(doc_type) for doc_type in document_mime_types
+    ):
         return True
 
     # Fallback: Check file extension
-    ext = os.path.splitext(fpath)[
+    ext = os.path.splitext(fpath)[
+        -1
+    ].lower()  # Get the extension, ensure it's lowercase
     if ext in document_extensions:
         return True
 
     return False
 
+
 def is_audio(fpath):
     """
     Determine if a given file is an audio file based on MIME type and file extension.
@@ -4702,6 +4893,7 @@ def is_audio(fpath):
         bool: True if the file is a recognized audio file, False otherwise.
     """
     import mimetypes
+
     # Known audio MIME types
     audio_mime_types = {
         "audio/mpeg",
@@ -4720,8 +4912,19 @@ def is_audio(fpath):
 
     # Known audio file extensions
     audio_extensions = {
-        ".mp3",
-        ".
+        ".mp3",
+        ".wav",
+        ".ogg",
+        ".aac",
+        ".flac",
+        ".midi",
+        ".m4a",
+        ".aiff",
+        ".pcm",
+        ".wma",
+        ".ape",
+        ".alac",
+        ".opus",
     }
 
     # Get MIME type using mimetypes
@@ -4732,12 +4935,15 @@ def is_audio(fpath):
         return True
 
     # Fallback: Check file extension
-    ext = os.path.splitext(fpath)[
+    ext = os.path.splitext(fpath)[
+        -1
+    ].lower()  # Get the file extension and ensure lowercase
     if ext in audio_extensions:
         return True
 
     return False
 
+
 def is_code(fpath):
     """
     Determine if a given file is a code file based on file extension and optionally MIME type.
@@ -4751,16 +4957,37 @@ def is_code(fpath):
     """
     # Known programming and scripting file extensions
     code_extensions = {
-        ".m",
-        ".
+        ".m",
+        ".py",
+        ".ipynb",
+        ".js",
+        ".html",
+        ".css",
+        ".java",
+        ".cpp",
+        ".h",
+        ".cs",
+        ".go",
+        ".rs",
+        ".sh",
+        ".rb",
+        ".swift",
+        ".ts",
+        ".json",
+        ".xml",
+        ".yaml",
+        ".toml",
+        ".bash",
+        ".r",
     }
 
     # Check file extension
-    ext = os.path.splitext(fpath)[-1].lower()
+    ext = os.path.splitext(fpath)[-1].lower()
     if ext in code_extensions:
-        return True
+        return True
     return False
-
+
+
 def is_zip(fpath):
     import mimetypes
 
@@ -4982,6 +5209,7 @@ def apply_filter(img, *args):
         )
         return img.filter(supported_filters[filter_name])
 
+
 def detect_angle(image, by="median", template=None):
     """Detect the angle of rotation using various methods."""
     from sklearn.decomposition import PCA
@@ -4989,7 +5217,8 @@ def detect_angle(image, by="median", template=None):
     from skimage.color import rgb2gray
     from scipy.fftpack import fftshift, fft2
     import numpy as np
-    import cv2
+    import cv2
+
     # Convert to grayscale
     gray_image = rgb2gray(image)
 
@@ -5091,6 +5320,7 @@ def detect_angle(image, by="median", template=None):
         print(f"Unknown method {by}")
         return 0
 
+
 def imgsets(img, **kwargs):
     """
     Apply various enhancements and filters to an image using PIL's ImageEnhance and ImageFilter modules.
@@ -6355,13 +6585,13 @@ def _df_outlier(
     from scipy.stats import zscore
     from sklearn.ensemble import IsolationForest
     from sklearn.preprocessing import StandardScaler
-
+
     # Fill completely NaN columns with a default value (e.g., 0)
     data = data.copy()
     data.loc[:, data.isna().all()] = 0
     if columns is not None:
-        if isinstance(columns, (list,pd.core.indexes.base.Index)):
-            data=data[columns]
+        if isinstance(columns, (list, pd.core.indexes.base.Index)):
+            data = data[columns]
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
@@ -6527,6 +6757,7 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
     data = data.explode(column, ignore_index=True)
     return data
 
+
 def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
     """
     Purpose: transforms a datetime feature (like month or day) into a cyclic encoding for use in machine learning models, particularly neural networks.
@@ -6536,24 +6767,30 @@ def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
     data = df_cycle(data, 'month', 12)
     """
     if columns is None:
-        columns = list(
+        columns = list(
+            data.select_dtypes(include=np.number).columns
+        )  # If no columns specified, use all columns
     if max_val is None:
-        max_val = np.max(
+        max_val = np.max(
+            data[columns]
+        )  # If no max_val specified, use the maximum value across all columns
     if isinstance(columns, str):
-        columns = [
-
+        columns = [
+            columns
+        ]  # If a single column name is provided as a string, convert it to a list
+
     # Check if inplace is True, so we modify the original dataframe
     if inplace:
         # Modify the data in place, no return statement needed
         for col in columns:
-            data[col +
-            data[col +
+            data[col + "_sin"] = np.sin(2 * np.pi * data[col] / max_val)
+            data[col + "_cos"] = np.cos(2 * np.pi * data[col] / max_val)
     else:
         # If inplace is False, return the modified dataframe
         new_data = data.copy()
         for col in columns:
-            new_data[col +
-            new_data[col +
+            new_data[col + "_sin"] = np.sin(2 * np.pi * new_data[col] / max_val)
+            new_data[col + "_cos"] = np.cos(2 * np.pi * new_data[col] / max_val)
         return new_data
 
 
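A self-contained worked example of the sin/cos cyclic encoding computed above (pure numpy/pandas, mirroring the `df_cycle(data, 'month', 12)` docstring):

import numpy as np
import pandas as pd

df = pd.DataFrame({"month": [1, 6, 12]})
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)
print(df)  # December (12) and January (1) land close together on the unit circle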
@@ -6561,7 +6798,7 @@ def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
 def df_astype(
     data: pd.DataFrame,
     columns: Optional[Union[str, List[str]]] = None,
-    astype: str = None
+    astype: str = None,  # "datetime",
     skip_row: Union[str, list] = None,
     fmt: Optional[str] = None,
     inplace: bool = False,
@@ -6624,7 +6861,7 @@ def df_astype(
         "day",
         "month",
         "year",
-        "circular"
+        "circular",
     ]
     # If inplace is False, make a copy of the DataFrame
     if not inplace:
@@ -6720,12 +6957,12 @@ def df_astype(
                 data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
                 # print(f"Successfully converted '{column}' to timedelta.")
             elif astype == "circular":
-                max_val = kwargs.get(
-                data[column]=df_cycle(data=data,columns=column,max_val=max_val)
+                max_val = kwargs.get("max_val", None)
+                data[column] = df_cycle(data=data, columns=column, max_val=max_val)
             else:
                 # Convert to other types (e.g., float, int)
-                if astype==
-                    data[column] = data[column].astype(
+                if astype == "int":
+                    data[column] = data[column].astype("float").astype("int")
                 else:
                     data[column] = data[column].astype(astype)
                 # print(f"Successfully converted '{column}' to {astype}.")
@@ -6775,7 +7012,9 @@ def df_sort_values(data, column, by=None, ascending=True, inplace=True, **kwargs
         ).index.tolist()
 
         # Convert to a categorical type with the new order
-        data[column] = pd.Categorical(
+        data[column] = pd.Categorical(
+            data[column], categories=sorted_counts, ordered=True
+        )
         # Set ascending to count_ascending for sorting
         ascending = count_ascending  # Adjust ascending for the final sort
     elif isinstance(by, list):
@@ -6977,7 +7216,7 @@ def df_fillna(
     # Fill completely NaN columns with a default value (e.g., 0)
     data = data.copy()
     data.loc[:, data.isna().all()] = 0
-
+
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
@@ -7034,7 +7273,7 @@ def df_fillna(
         imputed_data = imputer.fit_transform(numeric_data.T)
     else:
         raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
-
+
     imputed_data = pd.DataFrame(
         imputed_data if axis == 0 else imputed_data.T,
         index=numeric_data.index if axis == 0 else numeric_data.columns,
@@ -7179,11 +7418,15 @@ def df_encoder(
 
         encoder = LabelEncoder()
         # Apply LabelEncoder only to non-numeric columns
-        non_numeric_columns = [
+        non_numeric_columns = [
+            col for col in columns if not pd.api.types.is_numeric_dtype(data[col])
+        ]
 
         if not non_numeric_columns:
             return data
-        encoded_data = data[non_numeric_columns].apply(
+        encoded_data = data[non_numeric_columns].apply(
+            lambda col: encoder.fit_transform(col)
+        )
         return pd.concat([data.drop(non_numeric_columns, axis=1), encoded_data], axis=1)
 
     # Target encoding (Mean of the target for each category)
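The label-encoding branch above is standard scikit-learn; a minimal self-contained sketch of the same pattern:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

data = pd.DataFrame({"color": ["red", "green", "red"], "size": [1, 2, 3]})
encoder = LabelEncoder()
non_numeric = [c for c in data.columns if not pd.api.types.is_numeric_dtype(data[c])]
encoded = data[non_numeric].apply(lambda col: encoder.fit_transform(col))
print(pd.concat([data.drop(non_numeric, axis=1), encoded], axis=1))  # green -> 0, red -> 1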
@@ -7210,13 +7453,13 @@ def df_scaler(
     scaler=None,
     method="standard",
     columns=None,  # default, select all numeric col/row
-    feature_range=None
+    feature_range=None,  # specific for 'minmax'
     vmin=0,
     vmax=1,
     inplace=False,
     verbose=False,  # show usage
     axis=0,  # defalut column-wise
-    return_scaler:bool=False
+    return_scaler: bool = False,  # True: return both: return df, scaler
     **kwargs,
 ):
     """
@@ -7235,34 +7478,56 @@ def df_scaler(
     if verbose:
         print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
     if scaler is None:
-        methods = ["standard", "minmax", "robust","maxabs"]
+        methods = ["standard", "minmax", "robust", "maxabs"]
         method = strcmp(method, methods)[0]
         if method == "standard":
             from sklearn.preprocessing import StandardScaler
+
             if verbose:
-                print(
-
+                print(
+                    "performs z-score normalization: This will standardize each feature to have a mean of 0 and a standard deviation of 1."
+                )
+                print(
+                    "Use when the data is approximately normally distributed (Gaussian).\nWorks well with algorithms sensitive to feature distribution, such as SVMs, linear regression, logistic regression, and neural networks."
+                )
             scaler = StandardScaler(**kwargs)
         elif method == "minmax":
             from sklearn.preprocessing import MinMaxScaler
+
             if feature_range is None:
-                feature_range=(vmin,vmax)
+                feature_range = (vmin, vmax)
             if verbose:
-                print(
-
-
-
+                print(
+                    "don't forget to define the range: e.g., 'feature_range=(0, 1)'. "
+                )
+                print(
+                    "scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1]."
+                )
+                print(
+                    "Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks."
+                )
+            scaler = MinMaxScaler(feature_range=feature_range, **kwargs)
         elif method == "robust":
             from sklearn.preprocessing import RobustScaler
+
             if verbose:
-                print(
-
+                print(
+                    "scales the data based on the median and interquartile range, which is robust to outliers."
+                )
+                print(
+                    "Use when the dataset contains outliers.\nThis method is useful because it scales based on the median and the interquartile range (IQR), which are more robust to outliers than the mean and standard deviation."
+                )
             scaler = RobustScaler(**kwargs)
-        elif method=="maxabs":
+        elif method == "maxabs":
             from sklearn.preprocessing import MaxAbsScaler
+
             if verbose:
-                print(
-
+                print(
+                    "This scales each feature by its maximum absolute value, resulting in values within the range [-1, 1] for each feature."
+                )
+                print(
+                    "Use for data that is already sparse or when features have positive or negative values that need scaling without shifting the data.\nOften used with sparse data (data with many zeros), where preserving zero entries is essential, such as in text data or recommendation systems."
+                )
            scaler = MaxAbsScaler(**kwargs)
    if axis not in [0, 1]:
        raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
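Usage sketch for `df_scaler` (not part of the diff; `return_scaler=True` hands back the fitted scaler so the `hasattr(scaler, "mean_")` check below can reuse it on new data):

import pandas as pd
from py2ls.ips import df_scaler

train = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
scaled_train, fitted = df_scaler(train, method="standard", return_scaler=True)

test = pd.DataFrame({"a": [4.0], "b": [40.0]})
scaled_test = df_scaler(test, scaler=fitted)  # reuses the fitted mean/std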
@@ -7275,7 +7540,7 @@ def df_scaler(
         non_numeric_columns = data.columns.difference(columns)

         # scaled_data = scaler.fit_transform(data[columns])
-        if scaler is None or not hasattr(scaler,
+        if scaler is None or not hasattr(scaler, "mean_"):
             scaled_data = scaler.fit_transform(data[columns])
         else:
             scaled_data = scaler.transform(data[columns])
@@ -7293,7 +7558,7 @@ def df_scaler(
         )
         scaled_df = scaled_df[data.columns]  # Maintain column order
         if return_scaler:
-            return scaled_df,scaler
+            return scaled_df, scaler
         else:
             return scaled_df
@@ -7310,7 +7575,11 @@ def df_scaler(
         # scaled_data = scaler.fit_transform(
         #     numeric_rows.T
         # ).T  # Transpose for scaling and then back
-        scaled_data =
+        scaled_data = (
+            scaler.fit_transform(numeric_rows.T).T
+            if scaler is None or not hasattr(scaler, "mean_")
+            else scaler.transform(numeric_rows.T).T
+        )

         if inplace:
             data.loc[numeric_rows.index] = scaled_data
@@ -7319,7 +7588,7 @@ def df_scaler(
         scaled_df = data.copy()
         scaled_df.loc[numeric_rows.index] = scaled_data
         if return_scaler:
-            return scaled_df,scaler
+            return scaled_df, scaler
         else:
             return scaled_df

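Note: both the column-wise and row-wise paths reuse an already-fitted scaler via hasattr(scaler, "mean_"). That trailing-underscore attribute only appears on a fitted StandardScaler; a fitted MinMaxScaler exposes min_/scale_ instead, so this check would treat it as unfitted. A small demonstration of the sklearn convention the check relies on:

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0], [2.0], [3.0]])
scaler = StandardScaler()
print(hasattr(scaler, "mean_"))  # False -> df_scaler would take the fit_transform branch
scaler.fit(X)
print(hasattr(scaler, "mean_"))  # True  -> df_scaler would reuse it via transform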
@@ -7683,10 +7952,10 @@ def df_reducer(
     hue: str = None,  # lda-specific
     scale: bool = True,
     fill_missing: bool = True,
-    size=2
-    markerscale=4
-    edgecolor=
-    legend_loc=
+    size=2,  # for plot marker size
+    markerscale=4,  # for plot, legend marker size scale
+    edgecolor="none",  # for plot,
+    legend_loc="best",  # for plot,
     bbox_to_anchor=None,
     ncols=1,
     debug: bool = False,
@@ -7719,7 +7988,7 @@ def df_reducer(
         "autoencoder": "Autoencoder:\n\tA neural network-based approach for complex feature learning and non-linear dimensionality reduction. Advantage: Can capture very complex relationships. Limitation: Computationally expensive, requires neural network expertise for effective tuning.",
         "nmf": "Non-negative Matrix Factorization:\n\tEffective for parts-based decomposition, commonly used for sparse and non-negative data, e.g., text data or images. Advantage: Interpretability with non-negativity, efficient with sparse data. Limitation: Less effective for negative or zero-centered data.",
         "umap_hdbscan": "UMAP + HDBSCAN:\n\tCombination of UMAP for dimensionality reduction and HDBSCAN for density-based clustering, suitable for cluster discovery in high-dimensional data. Advantage: Effective in discovering clusters in embeddings. Limitation: Requires careful tuning of both UMAP and HDBSCAN parameters.",
-        "manifold_learning": "Manifold Learning (Isomap, Hessian LLE, etc.):\n\tMethods designed to capture intrinsic geometrical structure. Advantage: Preserves non-linear relationships in low dimensions. Limitation: Computationally expensive and sensitive to noise."
+        "manifold_learning": "Manifold Learning (Isomap, Hessian LLE, etc.):\n\tMethods designed to capture intrinsic geometrical structure. Advantage: Preserves non-linear relationships in low dimensions. Limitation: Computationally expensive and sensitive to noise.",
     }

     from sklearn.preprocessing import StandardScaler
@@ -7730,14 +7999,27 @@ def df_reducer(
     import seaborn as sns
     # Check valid method input
     methods = [
-        "pca",
-        "
+        "pca",
+        "umap",
+        "umap_hdbscan",
+        "tsne",
+        "factor",
+        "isolation_forest",
+        "manifold_learning",
+        "lda",
+        "kpca",
+        "ica",
+        "mds",
+        "lle",
+        "svd",
+        "truncated_svd",
+        "spectral_embedding",
         # "autoencoder","nmf",
     ]
     method = strcmp(method, methods)[0]
     if run_once_within(reverse=True):
         print(f"support methods:{methods}")
-
+
     if verbose:
         print(f"\nprocessing with using {dict_methods[method]}:")
     xlabel, ylabel = None, None
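Note: the expanded methods list is mostly thin wrappers over scikit-learn estimators. A minimal sketch of the simplest branch, PCA down to two components, assuming a purely numeric DataFrame (the toy df and column names are illustrative):

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"x1": [1, 2, 3, 4], "x2": [2, 4, 6, 8.5], "x3": [5, 3, 2, 1]})
X = StandardScaler().fit_transform(df)  # scale first, as df_reducer does by default
pca = PCA(n_components=2, random_state=1)
reduced = pd.DataFrame(pca.fit_transform(X), columns=["PCA_1", "PCA_2"], index=df.index)
print(pca.explained_variance_ratio_)    # variance captured by each component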
@@ -8050,8 +8332,9 @@ def df_reducer(
             svd_df[hue] = y
         if debug:
             print("Singular Value Decomposition (SVD) completed.")
-    elif method=="truncated_svd":
+    elif method == "truncated_svd":
         from sklearn.decomposition import TruncatedSVD
+
         svd = TruncatedSVD(n_components=n_components, random_state=random_state)
         X_reduced = svd.fit_transform(X)
         reduced_df = pd.DataFrame(
@@ -8070,7 +8353,9 @@ def df_reducer(
     elif method == "spectral_embedding":
         from sklearn.manifold import SpectralEmbedding

-        spectral = SpectralEmbedding(
+        spectral = SpectralEmbedding(
+            n_components=n_components, random_state=random_state
+        )
         X_reduced = spectral.fit_transform(X)
         reduced_df = pd.DataFrame(
             X_reduced,
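Note: the rewrapped call matches the sklearn signature; SpectralEmbedding accepts both n_components and random_state. A hedged usage sketch on random toy data:

import numpy as np
from sklearn.manifold import SpectralEmbedding

X = np.random.default_rng(0).normal(size=(50, 5))
emb = SpectralEmbedding(n_components=2, random_state=1)
X_reduced = emb.fit_transform(X)  # (50, 2) graph-Laplacian embedding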
@@ -8168,7 +8453,7 @@ def df_reducer(
         print("Manifold Learning (Isomap) completed.")
     if hue:
         reduced_df[hue] = y
-
+
     #! Return reduced data and info as a new DataFrame with the same index
     if method == "pca":
         reduced_df = pca_df
@@ -8225,7 +8510,8 @@ def df_reducer(
         colname_met = "SVD_"
     # Quick plots
     if plot_ and (not method in ["isolation_forest"]):
-        from .plot import plotxy,figsets,get_color
+        from .plot import plotxy, figsets, get_color
+
         # if ax is None:
         #     if figsize is None:
         #         _, ax = plt.subplots(figsize=cm2inch(8, 8))
@@ -8235,9 +8521,9 @@ def df_reducer(
         #     ax = ax.cla()
         xlabel = f"{colname_met}1" if xlabel is None else xlabel
         ylabel = f"{colname_met}2" if ylabel is None else ylabel
-        palette=get_color(len(flatten(data[hue],verbose=0)))
+        palette = get_color(len(flatten(data[hue], verbose=0)))

-        reduced_df=reduced_df.sort_values(by=hue)
+        reduced_df = reduced_df.sort_values(by=hue)
         print(flatten(reduced_df[hue]))
         ax = plotxy(
             data=reduced_df,
@@ -8247,24 +8533,31 @@ def df_reducer(
             palette=palette,
             # size=size,
             edgecolor=edgecolor,
-            kind_=[
-
-
-
+            kind_=[
+                "joint",
+                # "kde",
+                "ell",
+            ],
             kws_kde=dict(
-
-
-
-
-
-
-            kws_joint=dict(kind=
-            kws_ellipse=dict(alpha=0.1,lw=1,label=None),
+                hue=hue,
+                levels=2,
+                common_norm=False,
+                fill=True,
+                alpha=0.05,
+            ),
+            kws_joint=dict(kind="scatter", joint_kws=dict(s=size)),
+            kws_ellipse=dict(alpha=0.1, lw=1, label=None),
             verbose=False,
             **kwargs,
         )
         figsets(
-            legend=dict(
+            legend=dict(
+                loc=legend_loc,
+                markerscale=markerscale,
+                bbox_to_anchor=bbox_to_anchor,
+                ncols=ncols,
+                fontsize=8,
+            ),
             xlabel=xlabel if xlabel else None,
             ylabel=ylabel if ylabel else None,
         )
@@ -8297,6 +8590,7 @@ def df_reducer(
 # example:
 # df_reducer(data=data_log, columns=markers, n_components=2)

+
 def get_df_format(data, threshold_unique=0.5, verbose=False):
     """
     Detect the table format: long, wide, or uncertain.
@@ -8396,7 +8690,9 @@ def get_df_format(data, threshold_unique=0.5, verbose=False):
             if cluster_labels.nunique() < len(numeric_cols) * 0.5:
                 wide_score += 2
                 if verbose:
-                    print(
+                    print(
+                        "Clustering on columns shows grouping, suggesting wide format."
+                    )
     except Exception as e:
         print(e) if verbose else None

@@ -8487,7 +8783,8 @@ def get_df_format(data, threshold_unique=0.5, verbose=False):
     if verbose:
         print("Final decision: Uncertain format.")
     return "uncertain"
-
+
+
 def plot_cluster(
     data: pd.DataFrame,
     labels: np.ndarray,
@@ -8735,6 +9032,8 @@ def evaluate_cluster(
         metrics["V-Measure"] = np.nan

     return metrics
+
+
 def df_qc(
     data: pd.DataFrame,
     columns=None,
@@ -8744,7 +9043,7 @@ def df_qc(
     hue=None,
     output=False,
     verbose=True,
-    dir_save=None
+    dir_save=None,
 ):
     """
     Usage example:
@@ -8752,16 +9051,17 @@ def df_qc(
     """
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    from scipy.stats import skew, kurtosis, entropy
-
+
    pd.options.display.max_seq_items = 10
    #! display(data.select_dtypes(include=[np.number]).describe())
    #!skim
    if columns is not None:
-        if isinstance(columns, (list,pd.core.indexes.base.Index)):
-            data=data[columns]
+        if isinstance(columns, (list, pd.core.indexes.base.Index)):
+            data = data[columns]
    if skim:
        try:
-            import skimpy
+            import skimpy
+
            skimpy.skim(data)
        except:
            numerical_data = data.select_dtypes(include=[np.number])
@@ -8775,13 +9075,19 @@ def df_qc(

     # Missing values
     res_qc["missing_values"] = data.isnull().sum()
-    res_qc["missing_percentage"] = round(
+    res_qc["missing_percentage"] = round(
+        (res_qc["missing_values"] / len(data)) * 100, 2
+    )
     res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()

     # Data types and unique values
     res_qc["data_types"] = data.dtypes
-    res_qc["unique_counts"] =
-
+    res_qc["unique_counts"] = (
+        data.select_dtypes(exclude=np.number).nunique().sort_values()
+    )
+    res_qc["unique_values"] = data.select_dtypes(exclude=np.number).apply(
+        lambda x: x.unique()
+    )
     res_qc["constant_columns"] = [
         col for col in data.columns if data[col].nunique() <= 1
     ]
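Note: the reflowed computations are plain pandas arithmetic. Equivalent standalone one-liners, assuming a toy DataFrame df:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, None, 3], "b": ["x", "y", None]})
missing = df.isnull().sum()                         # per-column NaN counts
missing_pct = round((missing / len(df)) * 100, 2)   # per-column percentage
rows_with_missing = df.isnull().any(axis=1).sum()   # rows touched by any NaN
unique_counts = df.select_dtypes(exclude=np.number).nunique().sort_values()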
@@ -8797,8 +9103,8 @@ def df_qc(
     data_outliers = df_outlier(data)
     outlier_num = data_outliers.isna().sum() - data.isnull().sum()
     res_qc["outlier_num"] = outlier_num[outlier_num > 0]
-    outlier_percentage=round((outlier_num / len(data_outliers)) * 100,2)
-    res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
+    outlier_percentage = round((outlier_num / len(data_outliers)) * 100, 2)
+    res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage > 0]
     try:
         # Correlation and multicollinearity (VIF)
         if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
@@ -8816,16 +9122,16 @@ def df_qc(
             numeric_df = data.select_dtypes(include=[np.number]).dropna()
             if isinstance(numeric_df.columns, pd.MultiIndex):
                 numeric_df.columns = [
-                    "_".join(col).strip() if isinstance(col, tuple) else col
+                    "_".join(col).strip() if isinstance(col, tuple) else col
+                    for col in numeric_df.columns
                 ]

-
             vif_data = pd.DataFrame()
-            res_qc["vif"]=vif_data
+            res_qc["vif"] = vif_data
             if numeric_df.shape[1] > 1 and not numeric_df.empty:
                 vif_data["feature"] = numeric_df.columns.tolist()
                 vif_data["VIF"] = [
-                    round(variance_inflation_factor(numeric_df.values, i),2)
+                    round(variance_inflation_factor(numeric_df.values, i), 2)
                     for i in range(numeric_df.shape[1])
                 ]
                 res_qc["vif"] = vif_data[
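Note: the VIF block leans on statsmodels; variance_inflation_factor(exog, i) regresses column i on the remaining columns, and values above roughly 10 are the usual multicollinearity flag. A self-contained sketch with hypothetical data:

import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

df = pd.DataFrame({"x1": [1.0, 2, 3, 4, 5], "x2": [2.0, 4.1, 5.9, 8, 10.2], "x3": [5.0, 1, 4, 2, 3]})
vif = pd.DataFrame({
    "feature": df.columns,
    "VIF": [round(variance_inflation_factor(df.values, i), 2) for i in range(df.shape[1])],
})
print(vif)  # x1 and x2 are nearly collinear, so both get large VIFs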
@@ -8847,8 +9153,8 @@ def df_qc(
     }

     # dtypes counts
-    res_qc[
-
+    res_qc["dtype_counts"] = data.dtypes.value_counts()
+
     # Distribution Analysis (mean, median, mode, std dev, IQR for numeric columns)
     distribution_stats = data.select_dtypes(include=[np.number]).describe().T
     iqr = data.select_dtypes(include=[np.number]).apply(
@@ -8880,7 +9186,6 @@ def df_qc(
         if len(unique_types) > 1:
             inconsistent_types[col] = unique_types
     res_qc["inconsistent_types"] = inconsistent_types
-

     # Text length analysis for text fields
     text_lengths = {}
@@ -8892,7 +9197,9 @@ def df_qc(
     res_qc["text_length_analysis"] = text_lengths

     # Summary statistics
-    res_qc["summary_statistics"] = data.describe().T.style.background_gradient(
+    res_qc["summary_statistics"] = data.describe().T.style.background_gradient(
+        cmap="coolwarm", axis=0
+    )

     # Automated warnings
     warnings = []
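Note: summary_statistics is stored as a pandas Styler, not a DataFrame; background_gradient colors each column (axis=0) when rendered in a notebook. A minimal example:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [30, 20, 10]})
styled = df.describe().T.style.background_gradient(cmap="coolwarm", axis=0)
# styled renders as a colored table in Jupyter; styled.data recovers the plain DataFrame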
@@ -8920,39 +9227,60 @@ def df_qc(
         display(res_qc["data_types"])
         if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
             print(" ⤵ Missing Values Counts:")
-            display(
-
-
-
-
-
-
-
+            display(
+                pd.DataFrame(
+                    {
+                        "missing_values": res_qc["missing_values"][
+                            res_qc["missing_values"] > 0
+                        ],
+                        "missing_percent(%)": res_qc["missing_percentage"][
+                            res_qc["missing_percentage"] > 0
+                        ],
+                    }
+                ).style.background_gradient(cmap="coolwarm", axis=0)
+            )
         # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
-        print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
+        print("\n⤵ Rows with Missing Values:", res_qc["rows_with_missing"])
+
+        (
+            print("\n⤵ Constant Columns:", res_qc["constant_columns"])
+            if any(res_qc["constant_columns"])
+            else None
+        )
+        (
+            print("⤵ Duplicate Rows:", res_qc["duplicate_rows"])
+            if res_qc["duplicate_rows"]
+            else None
+        )
+        (
+            print("⤵ Duplicate Columns:", res_qc["duplicate_columns"])
+            if any(res_qc["duplicate_columns"])
+            else None
+        )

-        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
-        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
-        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
-
         if any(res_qc["outlier_num"]):
             print("\n⤵ Outlier Report:")
-            display(
-
-
-
-
-
-
-
+            display(
+                pd.DataFrame(
+                    {
+                        "outlier_num": res_qc["outlier_num"][res_qc["outlier_num"] > 0],
+                        "outlier_percentage(%)": res_qc["outlier_percentage"][
+                            res_qc["outlier_percentage"] > 0
+                        ],
+                    }
+                ).style.background_gradient(cmap="coolwarm", axis=0)
+            )

         if any(res_qc["unique_counts"]):
             print("\n⤵ Unique Values per Column:")
-            display(
-
-
+            display(
+                pd.DataFrame(
+                    {
+                        "unique_counts": res_qc["unique_counts"],
+                        "unique_values": res_qc["unique_values"],
+                    }
+                ).style.background_gradient(cmap="coolwarm", axis=0)
+            )

         if res_qc["empty_columns"]:
             print("\n⤵ Empty Columns:", res_qc["empty_columns"])
@@ -8971,7 +9299,7 @@ def df_qc(
             print(res_qc["high_cardinality_categoricals"])
         if any(res_qc["inconsistent_types"]):
             print("\n⤵ Inconsistent Data Types:")
-        display(res_qc["inconsistent_types"])
+            display(res_qc["inconsistent_types"])
         if any(res_qc["text_length_analysis"]):
             print("\n⤵ Text Length Analysis:")
             for col, stats in res_qc["text_length_analysis"].items():
@@ -8986,67 +9314,93 @@ def df_qc(

     pd.reset_option("display.max_seq_items")
     if plot_:
-        df_qc_plots(
+        df_qc_plots(
+            data=data, res_qc=res_qc, max_cols=max_cols, hue=hue, dir_save=dir_save
+        )
     if output or not plot_:
         return res_qc
     return None


-def df_qc_plots(
+def df_qc_plots(
+    data: pd.DataFrame,
+    columns=None,
+    res_qc: dict = None,
+    max_cols=20,
+    hue=None,
+    dir_save=None,
+):
     import matplotlib.pyplot as plt
     import seaborn as sns
     from .plot import subplot, figsets, get_color
     from datetime import datetime
+
     now_ = datetime.now().strftime("%y%m%d_%H%M%S")
-
+
     if columns is not None:
-        if isinstance(columns, (list,pd.core.indexes.base.Index)):
-            data=data[columns]
+        if isinstance(columns, (list, pd.core.indexes.base.Index)):
+            data = data[columns]
     len_total = len(res_qc)
     n_row, n_col = int((len_total + 10)), 3
-    nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
+    nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row], verbose=False)

     missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
         ascending=False
     )
     if len(missing_data) > max_cols:
         missing_data = missing_data[:max_cols]
-    ax_missing_data=sns.barplot(
+    ax_missing_data = sns.barplot(
         y=missing_data.index,
         x=missing_data.values,
         hue=missing_data.index,
         palette=get_color(len(missing_data), cmap="coolwarm")[::-1],
         ax=nexttile(),
     )
-    figsets(
+    figsets(
+        title="Missing (#)",
+        xlabel="#",
+        ax=ax_missing_data,
+        ylabel=None,
+        fontsize=8 if len(missing_data) <= 20 else 6,
+    )

     outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
     if len(outlier_num) > max_cols:
         outlier_num = outlier_num[:max_cols]
-    ax_outlier_num=sns.barplot(
+    ax_outlier_num = sns.barplot(
         y=outlier_num.index,
         x=outlier_num.values,
-        hue=outlier_num.index,
+        hue=outlier_num.index,
         palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
         ax=nexttile(),
     )
-    figsets(
-
+    figsets(
+        ax=ax_outlier_num,
+        title="Outliers (#)",
+        xlabel="#",
+        ylabel=None,
+        fontsize=8 if len(outlier_num) <= 20 else 6,
+    )
+
     #!
     try:
-        for col in data.select_dtypes(include=
-            sns.countplot(
-
-
+        for col in data.select_dtypes(include="category").columns:
+            sns.countplot(
+                y=data[col],
+                palette=get_color(
+                    data.select_dtypes(include="category").shape[1], cmap="coolwarm"
+                )[::-1],
+                ax=nexttile(),
+            )
             figsets(title=f"Count Plot: {col}", xlabel="Count", ylabel=col)
     except Exception as e:
-        pass
+        pass

     # Skewness and Kurtosis Plots
     skewness = res_qc["skewness"].sort_values(ascending=False)
     kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
     if not skewness.empty:
-        ax_skewness=sns.barplot(
+        ax_skewness = sns.barplot(
             y=skewness.index,
             x=skewness.values,
             hue=skewness.index,
@@ -9055,11 +9409,13 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         )
         figsets(
             title="Highly Skewed Numeric Columns (Skewness > 1)",
-            xlabel="Skewness",
-
+            xlabel="Skewness",
+            ylabel=None,
+            ax=ax_skewness,
+            fontsize=8 if len(skewness) <= 20 else 6,
         )
     if not kurtosis.empty:
-        ax_kurtosis=sns.barplot(
+        ax_kurtosis = sns.barplot(
             y=kurtosis.index,
             x=kurtosis.values,
             hue=kurtosis.index,
@@ -9068,59 +9424,68 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         )
         figsets(
             title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
-            xlabel="Kurtosis",
-
+            xlabel="Kurtosis",
+            ylabel=None,
+            ax=ax_kurtosis,
+            fontsize=8 if len(kurtosis) <= 20 else 6,
         )

     # Entropy for Categorical Variables
     entropy_data = pd.Series(res_qc["entropy_categoricals"]).sort_values(
         ascending=False
     )
-    ax_entropy_data=sns.barplot(
-        y=entropy_data.index,
+    ax_entropy_data = sns.barplot(
+        y=entropy_data.index,
+        x=entropy_data.values,
+        hue=entropy_data.index,
         palette=get_color(len(entropy_data), cmap="coolwarm")[::-1],
-        ax=nexttile()
-    )
+        ax=nexttile(),
+    )
     figsets(
-
-
-
-
-
+        ylabel="Categorical Columns",
+        title="Entropy of Categorical Variables",
+        xlabel="Entropy (bits)",
+        ax=ax_entropy_data,
+        fontsize=8 if len(entropy_data) <= 20 else 6,
+    )

     # unique counts
-    unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
-    ax_unique_counts_=sns.barplot(
-
-
-
-
+    unique_counts = res_qc["unique_counts"].sort_values(ascending=False)
+    ax_unique_counts_ = sns.barplot(
+        y=unique_counts.index,
+        x=unique_counts.values,
+        hue=unique_counts.index,
+        palette=get_color(len(unique_counts), cmap="coolwarm")[::-1],
+        ax=nexttile(),
+    )
     figsets(
-
-
-
-
-
+        title="Unique Counts",
+        ylabel=None,
+        xlabel="#",
+        ax=ax_unique_counts_,
+        fontsize=8 if len(unique_counts) <= 20 else 6,
+    )
     # Binary Checking
-    ax_unique_counts=sns.barplot(
-
-
-
+    ax_unique_counts = sns.barplot(
+        y=unique_counts[unique_counts < 8].index,
+        x=unique_counts[unique_counts < 8].values,
+        hue=unique_counts[unique_counts < 8].index,
+        palette=get_color(len(unique_counts[unique_counts < 8].index), cmap="coolwarm")[
+            ::-1
+        ],
+        ax=nexttile(),
+    )
     plt.axvline(x=2, color="r", linestyle="--", lw=2)
     figsets(
-
-
-
-
-
+        ylabel=None,
+        title="Binary Checking",
+        xlabel="#",
+        ax=ax_unique_counts,
+        fontsize=8 if len(unique_counts[unique_counts < 10].index) <= 20 else 6,
+    )

     # dtypes counts
-    dtype_counts = res_qc[
+    dtype_counts = res_qc["dtype_counts"]
     txt = []
     for tp in dtype_counts.index:
         txt.append(list(data.select_dtypes(include=tp).columns))
@@ -9131,9 +9496,9 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         color="#F3C8B2",
         ax=nexttile(),
     )
-    max_columns_per_row = 1
+    max_columns_per_row = 1  # Maximum number of columns per row
     for i, tp in enumerate(dtype_counts.index):
-        if i<=20:
+        if i <= 20:
             column_names = txt[i]
             # Split the column names into multiple lines if too long
             column_name_str = ", ".join(column_names)
@@ -9152,7 +9517,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
                 ha="center",
                 va="top",
                 c="k",
-                fontsize=8
+                fontsize=8 if len(dtype_counts.index) <= 20 else 6,
                 rotation=0,
             )
     figsets(
@@ -9160,7 +9525,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         title="Dtypes",
         ylabel="#",
         ax=ax_dtype_counts,
-        fontsize=8 if len(dtype_counts.index)<=20 else 6,
+        fontsize=8 if len(dtype_counts.index) <= 20 else 6,
     )
     # from .plot import pie
     # pie()
@@ -9175,57 +9540,66 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
     )

     if high_cardinality:
-        ax_high_cardinality=sns.barplot(
+        ax_high_cardinality = sns.barplot(
             y=list(high_cardinality.keys()),
             x=list(high_cardinality.values()),
             hue=list(high_cardinality.keys()),
-            palette=get_color(len(list(high_cardinality.keys())), cmap="coolwarm")[
+            palette=get_color(len(list(high_cardinality.keys())), cmap="coolwarm")[
+                ::-1
+            ],
             ax=nexttile(),
         )
         figsets(
             title="High Cardinality Categorical Columns",
             xlabel="Unique Value Count",
             ax=ax_high_cardinality,
-            fontsize=8 if len(list(high_cardinality.keys()))<=20 else 6
+            fontsize=8 if len(list(high_cardinality.keys())) <= 20 else 6,
         )
     if res_qc["low_variance_features"]:
         low_variance_data = data[res_qc["low_variance_features"]].copy()
         for col in low_variance_data.columns:
-            ax_low_variance_features=sns.histplot(
+            ax_low_variance_features = sns.histplot(
                 low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
             )
-            figsets(
-
+            figsets(
+                title=f"Low Variance Feature: {col}",
+                ax=ax_low_variance_features,
+                fontsize=8 if len(low_variance_data[col]) <= 20 else 6,
+            )

     # VIF plot for multicollinearity detection
     if "vif" in res_qc and not res_qc["vif"].empty:
         vif_data = res_qc["vif"].sort_values(by="VIF", ascending=False)
         if len(vif_data) > max_cols:
             vif_data = vif_data[:max_cols]
-        ax_vif=sns.barplot(
-
-
-
-
+        ax_vif = sns.barplot(
+            data=vif_data,
+            x="VIF",
+            y="feature",
+            hue="VIF",
+            palette=get_color(len(vif_data), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
         figsets(
             title="Variance Inflation Factor(VIF)",
             xlabel="VIF",
             ylabel="Features",
             legend=None,
             ax=ax_vif,
-            fontsize=8 if len(vif_data)<=20 else 6
+            fontsize=8 if len(vif_data) <= 20 else 6,
         )

     # Correlation heatmap for numeric columns with high correlation pairs
     if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
         corr = data.select_dtypes(include=[np.number]).corr()
-        if corr.shape[1]<=33:
+        if corr.shape[1] <= 33:
             mask = np.triu(np.ones_like(corr, dtype=bool))
             num_columns = corr.shape[1]
-            fontsize = max(
+            fontsize = max(
+                6, min(12, 12 - (num_columns - 10) * 0.2)
+            )  # Scale between 8 and 12

-            ax_heatmap=sns.heatmap(
+            ax_heatmap = sns.heatmap(
                 corr,
                 mask=mask,
                 annot=True,
@@ -9233,24 +9607,21 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
                 center=0,
                 fmt=".1f",
                 linewidths=0.5,
-                vmin=-1,
+                vmin=-1,
+                vmax=1,
                 ax=nexttile(2, 2),
-                cbar_kws=dict(shrink=0.2,ticks=np.arange(-1, 2, 1)),
-                annot_kws={"size": fontsize}
-            )
-
-            figsets(
-                xangle=45,
-                title="Correlation Heatmap",
-                ax=ax_heatmap
+                cbar_kws=dict(shrink=0.2, ticks=np.arange(-1, 2, 1)),
+                annot_kws={"size": fontsize},
             )
+
+            figsets(xangle=45, title="Correlation Heatmap", ax=ax_heatmap)
     # # save figure
     # if dir_save:
     #     figsave(dir_save,f"qc_plot_{now_}.pdf")

     if columns is not None:
-        if isinstance(columns, (list,pd.core.indexes.base.Index)):
-            data=data[columns]
+        if isinstance(columns, (list, pd.core.indexes.base.Index)):
+            data = data[columns]

     # len_total = len(res_qc)
     # n_row, n_col = int((len_total + 10) / 3), 3
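Note: this hunk only tidies keyword formatting; the np.triu mask set up just above it is what hides the redundant upper triangle of the symmetric correlation matrix. A compact reproduction with random toy data:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.DataFrame(np.random.default_rng(1).normal(size=(100, 4)), columns=list("abcd"))
corr = df.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))  # True = cell is hidden
sns.heatmap(corr, mask=mask, annot=True, fmt=".1f", center=0, vmin=-1, vmax=1,
            cbar_kws=dict(shrink=0.2, ticks=np.arange(-1, 2, 1)))
plt.show()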
@@ -9258,30 +9629,36 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
     #! check distribution
     data_num = data.select_dtypes(include=np.number)
     if len(data_num) > max_cols:
-        data_num = data_num.iloc[
+        data_num = data_num.iloc[:, :max_cols]
+
+    data_num = df_scaler(data=data_num, method="standard")

-    data_num = df_scaler(data=data_num, method='standard')
-
     import scipy.stats as stats
+
     for column in data_num.columns:
-
+        # * Shapiro-Wilk test for normality
         stat, p_value = stats.shapiro(data_num[column])
-        normality = "norm" if p_value > 0.05 else "not_norm"
-
-        ax_hist=sns.histplot(data_num[column], kde=True, ax=nexttile())
+        normality = "norm" if p_value > 0.05 else "not_norm"
+        # * Plot histogram
+        ax_hist = sns.histplot(data_num[column], kde=True, ax=nexttile())
         x_min, x_max = ax_hist.get_xlim()
         y_min, y_max = ax_hist.get_ylim()
-        ax_hist.text(
-
-
-
-
-
+        ax_hist.text(
+            x_min + (x_max - x_min) * 0.5,
+            y_min + (y_max - y_min) * 0.75,
+            f"p(Shapiro-Wilk)={p_value:.3f}\n{normality}",
+            ha="center",
+            va="top",
+        )
+        figsets(title=column, ax=ax_hist)
+        ax_twin = ax_hist.twinx()
+        # * Q-Q plot
         stats.probplot(data_num[column], dist="norm", plot=ax_twin)
-        figsets(ylabel=f
+        figsets(ylabel=f"Q-Q Plot:{column}", title=None)
     # save figure
     if dir_save:
-        figsave(dir_save,f"qc_plot_{now_}.pdf")
+        figsave(dir_save, f"qc_plot_{now_}.pdf")
+

 def df_corr(df: pd.DataFrame, method="pearson"):
     """
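Note: the distribution loop above pairs a Shapiro-Wilk test (H0: the sample is normal, so p > 0.05 reads as "norm") with a Q-Q plot on a twin axis. The same two scipy calls in isolation, on synthetic data:

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

x = np.random.default_rng(0).normal(size=200)
stat, p_value = stats.shapiro(x)
print("norm" if p_value > 0.05 else "not_norm")
fig, ax = plt.subplots()
stats.probplot(x, dist="norm", plot=ax)  # points near the line => close to normal
plt.show()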
@@ -9318,6 +9695,7 @@ def df_corr(df: pd.DataFrame, method="pearson"):

     return corr_matrix, pval_matrix

+
 def use_pd(
     func_name="excel",
     verbose=True,
@@ -9338,7 +9716,8 @@ def use_pd(
         if verbose:
             print(e)

-
+
+def get_phone(phone_number: str, region: str = None, verbose=True):
     """
     usage:
         info = get_phone(15237654321, "DE")
@@ -9426,21 +9805,23 @@ def get_phone(phone_number: str, region: str = None,verbose=True):
         dialing_instructions = f"Dial {formatted_national} within {country_name}. Dial {formatted_e164} from abroad."

         # Advanced Timezone Handling
-        gmt_offsets =
+        gmt_offsets = (
+            pytz.timezone(time_zones).utcoffset(datetime.now()).total_seconds() / 3600
+        )
         # Get the local timezone (current computer's time)
         local_timezone = get_localzone()
-        #local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
+        # local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
         local_offset = local_timezone.utcoffset(datetime.now()).total_seconds() / 3600
         offset_diff = local_offset - gmt_offsets
         head_time = "earlier" if offset_diff < 0 else "later" if offset_diff > 0 else ""
-        res= {
+        res = {
             "valid": True,
             "possible": possible,
             "formatted": {
                 "international": formatted_international,
                 "national": formatted_national,
                 "e164": formatted_e164,
-                },
+            },
             "country_code": country_code,
             "country_name": country_name,
             "region_code": region_code,
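Note: the gmt_offsets wrap is the standard pytz recipe for turning a zone name into an hour offset; utcoffset() on a naive datetime localizes it first. For example, with an illustrative zone name:

from datetime import datetime
import pytz

tz = pytz.timezone("Europe/Berlin")
offset_h = tz.utcoffset(datetime.now()).total_seconds() / 3600
print(f"GMT{offset_h:+.0f}")  # +1 in winter, +2 in summer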
@@ -9448,13 +9829,13 @@ def get_phone(phone_number: str, region: str = None,verbose=True):
             "carrier": carrier_name,
             "time_zone": time_zones,
             "current_times": current_times,
-            "local_offset":f"{local_offset} utcoffset",
+            "local_offset": f"{local_offset} utcoffset",
             "time_zone_diff": f"{head_time} {int(np.abs(offset_diff))} h",
             "number_type": number_type_str,
             "is_toll_free": is_toll_free,
-            "is_premium_rate": is_premium_rate,
+            "is_premium_rate": is_premium_rate,
             "dialing_instructions": dialing_instructions,
-            "suggested_fix": None,  # Use phonenumbers.example_number if invalid
+            "suggested_fix": None,  # Use phonenumbers.example_number if invalid
             "logs": {
                 "number_analysis_completed": datetime.now().strftime(
                     "%Y-%m-%d %H:%M:%S"
@@ -9465,7 +9846,7 @@ def get_phone(phone_number: str, region: str = None,verbose=True):
         }

     except phonenumbers.NumberParseException as e:
-        res= {"valid": False, "error": str(e)}
+        res = {"valid": False, "error": str(e)}
     if verbose:
         preview(res)
     return res
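Note: get_phone is built on the phonenumbers package; the fields collected into res come from calls like these (the number below is illustrative, not real):

import phonenumbers
from phonenumbers import carrier, geocoder, timezone

num = phonenumbers.parse("+4915237654321", None)
print(phonenumbers.is_valid_number(num))
print(phonenumbers.format_number(num, phonenumbers.PhoneNumberFormat.E164))
print(geocoder.description_for_number(num, "en"))  # country/region name
print(carrier.name_for_number(num, "en"))          # mobile carrier, if known
print(timezone.time_zones_for_number(num))         # e.g. ('Europe/Berlin',)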
@@ -9531,7 +9912,8 @@ def decode_pluscode(

     return latitude, longitude

-
+
+def get_loc(input_data, user_agent="0413@mygmail.com)", verbose=True):
     """
     Determine if the input is a city name, lat/lon, or DMS and perform geocoding or reverse geocoding.
     Usage:
@@ -9607,7 +9989,8 @@ def get_loc(input_data, user_agent="0413@mygmail.com)",verbose=True):
         "Invalid input format. Please provide a city name, latitude/longitude, or DMS string."
     )

-
+
+def enpass(code: str, method: str = "AES", key: str = None):
     """
     usage: enpass("admin")
     Master encryption function that supports multiple methods: AES, RSA, and SHA256.
@@ -9617,6 +10000,7 @@ def enpass(code: str, method: str="AES", key: str = None):
     :return: The encrypted data or hashed value.
     """
     import hashlib
+
     # AES Encryption (Advanced)
     def aes_encrypt(data: str, key: str):
         """
@@ -9630,9 +10014,10 @@ def enpass(code: str, method: str="AES", key: str = None):
         from cryptography.hazmat.primitives import padding
         import base64
         import os
+
         # Generate a 256-bit key from the provided password
         key = hashlib.sha256(key.encode()).digest()
-
+
         # Generate a random initialization vector (IV)
         iv = os.urandom(16)  # 16 bytes for AES block size

@@ -9659,10 +10044,12 @@ def enpass(code: str, method: str="AES", key: str = None):
         import base64
         from Crypto.PublicKey import RSA
         from Crypto.Cipher import PKCS1_OAEP
+
         public_key_obj = RSA.import_key(public_key)
         cipher_rsa = PKCS1_OAEP.new(public_key_obj)
         encrypted_data = cipher_rsa.encrypt(data.encode())
         return base64.b64encode(encrypted_data).decode()
+
     # SHA256 Hashing (Non-reversible)
     def sha256_hash(data: str):
         """
@@ -9671,9 +10058,10 @@ def enpass(code: str, method: str="AES", key: str = None):
         :return: The hashed value (hex string).
         """
         return hashlib.sha256(data.encode()).hexdigest()
+
     if key is None:
-        key="worldpeace"
-    method=strcmp(method,["AES","RSA",
+        key = "worldpeace"
+    method = strcmp(method, ["AES", "RSA", "SHA256"])[0]
     if method == "AES":
         return aes_encrypt(code, key)
     elif method == "RSA":
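Note: with the completed method list, enpass normalizes the method name via strcmp and dispatches to one of three branches; SHA256 is one-way, so it has no matching depass branch. The hash branch reduces to:

import hashlib

digest = hashlib.sha256("admin".encode()).hexdigest()
print(digest)  # 64 hex chars; the same input always yields the same digest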
@@ -9685,7 +10073,7 @@ def enpass(code: str, method: str="AES", key: str = None):


 # Master Decryption Function (Supports AES, RSA)
-def depass(encrypted_code: str, method: str=
+def depass(encrypted_code: str, method: str = "AES", key: str = None):
     """
     Master decryption function that supports multiple methods: AES and RSA.
     :param encrypted_code: The encrypted data to decrypt.
@@ -9694,6 +10082,7 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
     :return: The decrypted data.
     """
     import hashlib
+
     def aes_decrypt(encrypted_data: str, key: str):
         """
         Decrypts data encrypted using AES in CBC mode.
@@ -9705,12 +10094,13 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
         from cryptography.hazmat.backends import default_backend
         from cryptography.hazmat.primitives import padding
         import base64
+
         # Generate the same 256-bit key from the password
         key = hashlib.sha256(key.encode()).digest()
-
+
         # Decode the encrypted data from base64
         encrypted_data = base64.b64decode(encrypted_data)
-
+
         # Extract the IV and the actual encrypted data
         iv = encrypted_data[:16]  # First 16 bytes are the IV
         encrypted_data = encrypted_data[16:]  # Remaining data is the encrypted message
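Note: a self-contained roundtrip of the AES-CBC scheme used by aes_encrypt/aes_decrypt (SHA-256-derived key, random IV prepended to the ciphertext, PKCS7 padding), assuming the cryptography package; key string and plaintext are illustrative:

import base64, hashlib, os
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes

key = hashlib.sha256("worldpeace".encode()).digest()
iv = os.urandom(16)
padder = padding.PKCS7(128).padder()
padded = padder.update(b"admin") + padder.finalize()
enc = Cipher(algorithms.AES(key), modes.CBC(iv), backend=default_backend()).encryptor()
token = base64.b64encode(iv + enc.update(padded) + enc.finalize())

raw = base64.b64decode(token)  # decrypt: split off the 16-byte IV, reverse each step
dec = Cipher(algorithms.AES(key), modes.CBC(raw[:16]), backend=default_backend()).decryptor()
unpadder = padding.PKCS7(128).unpadder()
plain = unpadder.update(dec.update(raw[16:]) + dec.finalize()) + unpadder.finalize()
assert plain == b"admin"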
@@ -9724,7 +10114,8 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
         unpadder = padding.PKCS7(128).unpadder()
         unpadded_data = unpadder.update(decrypted_data) + unpadder.finalize()

-        return unpadded_data.decode()
+        return unpadded_data.decode()
+
     def rsa_decrypt(encrypted_data: str, private_key: str):
         """
         Decrypts RSA-encrypted data using the private key.
@@ -9735,6 +10126,7 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
         from Crypto.PublicKey import RSA
         from Crypto.Cipher import PKCS1_OAEP
         import base64
+
         encrypted_data = base64.b64decode(encrypted_data)
         private_key_obj = RSA.import_key(private_key)
         cipher_rsa = PKCS1_OAEP.new(private_key_obj)
@@ -9742,8 +10134,8 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
         return decrypted_data.decode()

     if key is None:
-        key="worldpeace"
-    method=strcmp(method,["AES","RSA",
+        key = "worldpeace"
+    method = strcmp(method, ["AES", "RSA", "SHA256"])[0]
     if method == "AES":
         return aes_decrypt(encrypted_code, key)
     elif method == "RSA":
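Note: the RSA pair relies on PyCryptodome's PKCS1_OAEP; a hedged end-to-end sketch with a throwaway keypair (key generation is not part of enpass/depass themselves, which expect keys to be supplied):

import base64
from Crypto.PublicKey import RSA
from Crypto.Cipher import PKCS1_OAEP

key = RSA.generate(2048)  # throwaway keypair for the demo
token = base64.b64encode(PKCS1_OAEP.new(key.publickey()).encrypt(b"admin"))
plain = PKCS1_OAEP.new(key).decrypt(base64.b64decode(token))
assert plain == b"admin"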