py2ls 0.2.4.31__py3-none-any.whl → 0.2.4.32__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
py2ls/ips.py CHANGED
@@ -23,7 +23,8 @@ import logging
23
23
  from pathlib import Path
24
24
  from datetime import datetime
25
25
 
26
- def run_once_within(duration=60,reverse=False): # default 60s
26
+
27
+ def run_once_within(duration=60, reverse=False): # default 60s
27
28
  import time
28
29
 
29
30
  """
@@ -546,6 +547,7 @@ def is_text(s):
546
547
 
547
548
  from typing import Any, Union
548
549
 
550
+
549
551
  def share(*args, strict=True, n_shared=2, verbose=True):
550
552
  """
551
553
  check the shared elelements in two list.
@@ -591,13 +593,14 @@ def share(*args, strict=True, n_shared=2, verbose=True):
591
593
  elements2show = (
592
594
  shared_elements if len(shared_elements) < 10 else shared_elements[:5]
593
595
  )
594
- tail = '' if len(shared_elements) < 10 else '......'
596
+ tail = "" if len(shared_elements) < 10 else "......"
595
597
  elements2show.append(tail)
596
598
  print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
597
599
  print("********* checking shared elements *********")
598
600
  return shared_elements
599
601
 
600
- def shared(*args, n_shared=None, verbose=True,**kwargs):
602
+
603
+ def shared(*args, n_shared=None, verbose=True, **kwargs):
601
604
  """
602
605
  check the shared elelements in two list.
603
606
  usage:
@@ -652,7 +655,8 @@ def shared(*args, n_shared=None, verbose=True,**kwargs):
652
655
  print("********* checking shared elements *********")
653
656
  return shared_elements
654
657
 
655
- def share_not(*args, n_shared=None, verbose=False):
658
+
659
+ def share_not(*args, n_shared=None, verbose=False):
656
660
  """
657
661
  To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
658
662
  usage:
@@ -660,10 +664,12 @@ def share_not(*args, n_shared=None, verbose=False):
660
664
  list2 = [4, 5, 6, 7, 8]
661
665
  not_shared(list1,list2)# output [1,3]
662
666
  """
663
- _common = shared(*args, n_shared=n_shared, verbose=verbose)
667
+ _common = shared(*args, n_shared=n_shared, verbose=verbose)
664
668
  list1 = flatten(args[0], verbose=verbose)
665
669
  _not_shared = [item for item in list1 if item not in _common]
666
670
  return _not_shared
671
+
672
+
667
673
  def not_shared(*args, n_shared=None, verbose=False):
668
674
  """
669
675
  To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
@@ -779,7 +785,8 @@ def strcmp(
779
785
  print(f"建议: {best_match}")
780
786
  return candidates[best_match_index], best_match_index
781
787
 
782
- def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
788
+
789
+ def imgcmp(img: list, method="knn", plot_=True, figsize=[12, 6]):
783
790
  """
784
791
  Compare two images using SSIM, Feature Matching (SIFT), or KNN Matching.
785
792
 
@@ -796,15 +803,16 @@ def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
796
803
  import cv2
797
804
  import matplotlib.pyplot as plt
798
805
  from skimage.metrics import structural_similarity as ssim
806
+
799
807
  # Load images
800
808
  image1 = cv2.imread(img[0])
801
809
  image2 = cv2.imread(img[1])
802
810
 
803
811
  if image1 is None or image2 is None:
804
812
  raise ValueError("Could not load one or both images. Check file paths.")
805
- methods=['ssim','match','knn']
806
- method=strcmp(method, methods)[0]
807
- if method == 'ssim':
813
+ methods = ["ssim", "match", "knn"]
814
+ method = strcmp(method, methods)[0]
815
+ if method == "ssim":
808
816
  # Convert images to grayscale
809
817
  gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
810
818
  gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)
@@ -819,18 +827,18 @@ def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
819
827
  # Plot if needed
820
828
  if plot_:
821
829
  fig, ax = plt.subplots(1, 3, figsize=figsize)
822
- ax[0].imshow(gray1, cmap='gray')
830
+ ax[0].imshow(gray1, cmap="gray")
823
831
  ax[0].set_title("Image 1")
824
- ax[1].imshow(gray2, cmap='gray')
832
+ ax[1].imshow(gray2, cmap="gray")
825
833
  ax[1].set_title("Image 2")
826
- ax[2].imshow(diff, cmap='gray')
834
+ ax[2].imshow(diff, cmap="gray")
827
835
  ax[2].set_title("Difference (SSIM)")
828
836
  plt.tight_layout()
829
837
  plt.show()
830
-
838
+
831
839
  return diff, score
832
840
 
833
- elif method in ['match', 'knn']:
841
+ elif method in ["match", "knn"]:
834
842
  # Convert images to grayscale
835
843
  gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
836
844
  gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)
@@ -848,15 +856,17 @@ def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
848
856
  # BFMatcher initialization
849
857
  bf = cv2.BFMatcher()
850
858
 
851
- if method == 'match': # Cross-check matching
859
+ if method == "match": # Cross-check matching
852
860
  bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
853
861
  matches = bf.match(descriptors1, descriptors2)
854
862
  matches = sorted(matches, key=lambda x: x.distance)
855
863
 
856
864
  # Filter good matches
857
- good_matches = [m for m in matches if m.distance < 0.75 * matches[-1].distance]
865
+ good_matches = [
866
+ m for m in matches if m.distance < 0.75 * matches[-1].distance
867
+ ]
858
868
 
859
- elif method == 'knn': # KNN matching with ratio test
869
+ elif method == "knn": # KNN matching with ratio test
860
870
  matches = bf.knnMatch(descriptors1, descriptors2, k=2)
861
871
  # Apply Lowe's ratio test
862
872
  good_matches = [m for m, n in matches if m.distance < 0.75 * n.distance]
@@ -865,14 +875,18 @@ def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
865
875
  similarity_score = len(good_matches) / min(len(keypoints1), len(keypoints2))
866
876
  print(f"Number of good matches: {len(good_matches)}")
867
877
  print(f"Similarity Score: {similarity_score:.4f}")
868
- # Handle case where no good matches are found
878
+ # Handle case where no good matches are found
869
879
  if len(good_matches) == 0:
870
880
  print("No good matches found.")
871
881
  return good_matches, 0.0, None
872
882
 
873
883
  # Identify matched keypoints
874
- src_pts = np.float32([keypoints1[m.queryIdx].pt for m in good_matches]).reshape(-1, 1, 2)
875
- dst_pts = np.float32([keypoints2[m.trainIdx].pt for m in good_matches]).reshape(-1, 1, 2)
884
+ src_pts = np.float32([keypoints1[m.queryIdx].pt for m in good_matches]).reshape(
885
+ -1, 1, 2
886
+ )
887
+ dst_pts = np.float32([keypoints2[m.trainIdx].pt for m in good_matches]).reshape(
888
+ -1, 1, 2
889
+ )
876
890
 
877
891
  # Calculate Homography using RANSAC
878
892
  homography_matrix, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
@@ -893,11 +907,15 @@ def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
893
907
 
894
908
  # Plot matches if needed
895
909
  if plot_:
896
- result = cv2.drawMatches(image1, keypoints1, image2, keypoints2, good_matches, None, flags=2)
910
+ result = cv2.drawMatches(
911
+ image1, keypoints1, image2, keypoints2, good_matches, None, flags=2
912
+ )
897
913
  plt.figure(figsize=figsize)
898
914
  plt.imshow(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))
899
- plt.title(f"Feature Matches ({len(good_matches)} matches, Score: {similarity_score:.4f})")
900
- plt.axis('off')
915
+ plt.title(
916
+ f"Feature Matches ({len(good_matches)} matches, Score: {similarity_score:.4f})"
917
+ )
918
+ plt.axis("off")
901
919
  plt.show()
902
920
  # Identify unmatched keypoints
903
921
  matched_idx1 = [m.queryIdx for m in good_matches]
@@ -907,8 +925,20 @@ def imgcmp(img: list, method='knn', plot_=True, figsize=[12, 6]):
907
925
  unmatched_kp2 = [kp for i, kp in enumerate(keypoints2) if i not in matched_idx2]
908
926
 
909
927
  # Mark unmatched keypoints on the images
910
- img1_marked = cv2.drawKeypoints(image1, unmatched_kp1, None, color=(0, 0, 255), flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
911
- img2_marked = cv2.drawKeypoints(image2, unmatched_kp2, None, color=(0, 0, 255), flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
928
+ img1_marked = cv2.drawKeypoints(
929
+ image1,
930
+ unmatched_kp1,
931
+ None,
932
+ color=(0, 0, 255),
933
+ flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS,
934
+ )
935
+ img2_marked = cv2.drawKeypoints(
936
+ image2,
937
+ unmatched_kp2,
938
+ None,
939
+ color=(0, 0, 255),
940
+ flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS,
941
+ )
912
942
 
913
943
  # Display results
914
944
  if plot_:
@@ -1017,6 +1047,7 @@ def counter(list_, verbose=True):
1017
1047
  # print(f"Return a list of the n most common elements:\n{c.most_common()}")
1018
1048
  # print(f"Compute the sum of the counts:\n{c.total()}")
1019
1049
 
1050
+
1020
1051
  def dict2df(dict_, fill=None):
1021
1052
  len_max = 0
1022
1053
  for key, value in dict_.items():
@@ -1031,11 +1062,12 @@ def dict2df(dict_, fill=None):
1031
1062
  dict_[key] = value
1032
1063
  return pd.DataFrame.from_dict(dict_)
1033
1064
 
1065
+
1034
1066
  def text2audio(
1035
1067
  text,
1036
1068
  method=None, # "pyttsx3","gTTS"
1037
1069
  rate=200,
1038
- slow=False,#"gTTS"
1070
+ slow=False, # "gTTS"
1039
1071
  volume=1.0,
1040
1072
  voice=None,
1041
1073
  lang=None,
@@ -1056,16 +1088,38 @@ def text2audio(
1056
1088
  # )
1057
1089
  """
1058
1090
  if method is not None:
1059
- methods=["gTTS","pyttsx3","google"]
1060
- method=strcmp(method, methods)[0]
1091
+ methods = ["gTTS", "pyttsx3", "google"]
1092
+ method = strcmp(method, methods)[0]
1061
1093
  else:
1062
1094
  try:
1063
- text2audio(text,method='google',rate=rate, slow=slow, volume=volume, voice=voice,lang=lang,gender=gender,age=age,dir_save=dir_save)
1095
+ text2audio(
1096
+ text,
1097
+ method="google",
1098
+ rate=rate,
1099
+ slow=slow,
1100
+ volume=volume,
1101
+ voice=voice,
1102
+ lang=lang,
1103
+ gender=gender,
1104
+ age=age,
1105
+ dir_save=dir_save,
1106
+ )
1064
1107
  except Exception as e:
1065
1108
  print(e)
1066
- text2audio(text,method='pyttsx3',rate=rate, slow=slow, volume=volume, voice=voice,lang=lang,gender=gender,age=age,dir_save=dir_save)
1067
-
1068
- if method=="pyttsx3":
1109
+ text2audio(
1110
+ text,
1111
+ method="pyttsx3",
1112
+ rate=rate,
1113
+ slow=slow,
1114
+ volume=volume,
1115
+ voice=voice,
1116
+ lang=lang,
1117
+ gender=gender,
1118
+ age=age,
1119
+ dir_save=dir_save,
1120
+ )
1121
+
1122
+ if method == "pyttsx3":
1069
1123
  import pyttsx3
1070
1124
 
1071
1125
  try:
@@ -1140,27 +1194,29 @@ def text2audio(
1140
1194
  sys.exit()
1141
1195
  except SystemExit:
1142
1196
  pass
1143
- elif method.lower() in ['google','gtts']:
1197
+ elif method.lower() in ["google", "gtts"]:
1144
1198
  from gtts import gTTS
1199
+
1145
1200
  try:
1146
1201
  if lang is None:
1147
1202
  from langdetect import detect
1203
+
1148
1204
  lang = detect(text)
1149
1205
  # Initialize gTTS with the provided parameters
1150
1206
  tts = gTTS(text=text, lang=lang, slow=slow)
1151
1207
  except Exception as e:
1152
1208
  print(f"An error occurred: {e}")
1153
-
1209
+
1154
1210
  print("not realtime reading...")
1155
1211
  if dir_save:
1156
1212
  if "." not in dir_save:
1157
- dir_save=dir_save+".mp3"
1213
+ dir_save = dir_save + ".mp3"
1158
1214
  tts.save(dir_save)
1159
1215
  print(f"Audio saved to {dir_save}")
1160
1216
  else:
1161
1217
  dir_save = "temp_audio.mp3"
1162
1218
  if "." not in dir_save:
1163
- dir_save=dir_save+".mp3"
1219
+ dir_save = dir_save + ".mp3"
1164
1220
  tts.save(dir_save)
1165
1221
  try:
1166
1222
  fopen(dir_save)
@@ -1168,6 +1224,7 @@ def text2audio(
1168
1224
  print(f"Error opening file: {e}")
1169
1225
  print("done")
1170
1226
 
1227
+
1171
1228
  def str2time(time_str, fmt="24"):
1172
1229
  """
1173
1230
  Convert a time string into the specified format.
@@ -1624,6 +1681,7 @@ def img2pdf(dir_img, kind=None, page=None, dir_save=None, page_size="a4", dpi=30
1624
1681
  def set_dpi(x):
1625
1682
  dpix = dpiy = x
1626
1683
  return image2pdf.get_fixed_dpi_layout_fun((dpix, dpiy))
1684
+
1627
1685
  if kind is None:
1628
1686
  _, kind = os.path.splitext(dir_img)
1629
1687
  if not kind.startswith("."):
@@ -1649,8 +1707,9 @@ def img2pdf(dir_img, kind=None, page=None, dir_save=None, page_size="a4", dpi=30
1649
1707
  imgs.append(path)
1650
1708
  else:
1651
1709
  imgs = [
1652
- # os.path.isdir(dir_img),
1653
- dir_img]
1710
+ # os.path.isdir(dir_img),
1711
+ dir_img
1712
+ ]
1654
1713
  print(imgs)
1655
1714
  if page_size:
1656
1715
  if isinstance(page_size, str):
@@ -2196,7 +2255,7 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
2196
2255
  # Check data types
2197
2256
  data_types = df.dtypes
2198
2257
  # messages.append(f"Data types of columns:\n{data_types}")
2199
-
2258
+
2200
2259
  # Check for an unreasonable number of rows or columns
2201
2260
  if actual_shape[0] < 2 or actual_shape[1] < 2:
2202
2261
  messages.append(
@@ -2347,33 +2406,36 @@ def fload(fpath, kind=None, **kwargs):
2347
2406
 
2348
2407
  def load_csv(fpath, **kwargs):
2349
2408
  from pandas.errors import EmptyDataError
2350
- engine = kwargs.pop("engine", "pyarrow")# default: None
2351
- sep = kwargs.pop("sep", None)# default: ','
2352
- index_col = kwargs.pop("index_col", None)# default: None
2353
- memory_map = kwargs.pop("memory_map", False)# default: False
2354
- skipinitialspace = kwargs.pop("skipinitialspace", False)# default: False
2355
- encoding = kwargs.pop("encoding", "utf-8")# default: "utf-8"
2356
- on_bad_lines = kwargs.pop("on_bad_lines", "skip")# default: 'error'
2357
- comment = kwargs.pop("comment", None)# default: None
2358
- fmt = kwargs.pop("fmt", False)# default:
2359
- chunksize = kwargs.pop("chunksize", None)# default: None
2360
-
2361
- #check filesize
2362
- f_size=round(os.path.getsize(fpath) / 1024 / 1024, 3)
2363
- if f_size>=50: #50 MB
2409
+
2410
+ engine = kwargs.pop("engine", "pyarrow") # default: None
2411
+ sep = kwargs.pop("sep", None) # default: ','
2412
+ index_col = kwargs.pop("index_col", None) # default: None
2413
+ memory_map = kwargs.pop("memory_map", False) # default: False
2414
+ skipinitialspace = kwargs.pop("skipinitialspace", False) # default: False
2415
+ encoding = kwargs.pop("encoding", "utf-8") # default: "utf-8"
2416
+ on_bad_lines = kwargs.pop("on_bad_lines", "skip") # default: 'error'
2417
+ comment = kwargs.pop("comment", None) # default: None
2418
+ fmt = kwargs.pop("fmt", False) # default:
2419
+ chunksize = kwargs.pop("chunksize", None) # default: None
2420
+
2421
+ # check filesize
2422
+ f_size = round(os.path.getsize(fpath) / 1024 / 1024, 3)
2423
+ if f_size >= 50: # 50 MB
2364
2424
  if chunksize is None:
2365
- chunksize = 5000
2366
- print(f"file size is {f_size}MB, then set the chunksize with {chunksize}")
2425
+ chunksize = 5000
2426
+ print(
2427
+ f"file size is {f_size}MB, then set the chunksize with {chunksize}"
2428
+ )
2367
2429
  engine = "c" if chunksize else engine # when chunksize, recommend 'c'
2368
- low_memory = kwargs.pop("low_memory", True)# default: True
2430
+ low_memory = kwargs.pop("low_memory", True) # default: True
2369
2431
  low_memory = (
2370
2432
  False if chunksize else True
2371
- ) # when chunksize, recommend low_memory=False # default:
2433
+ ) # when chunksize, recommend low_memory=False # default:
2372
2434
  verbose = kwargs.pop("verbose", False)
2373
2435
  if run_once_within(reverse=True) and verbose:
2374
2436
  use_pd("read_csv", verbose=verbose)
2375
2437
 
2376
- if comment is None:# default: None
2438
+ if comment is None: # default: None
2377
2439
  comment = get_comment(
2378
2440
  fpath, comment=None, encoding="utf-8", lines_to_check=5
2379
2441
  )
@@ -2503,7 +2565,9 @@ def fload(fpath, kind=None, **kwargs):
2503
2565
  try:
2504
2566
  sep2show = sep if sep != "\t" else "\\t"
2505
2567
  if verbose:
2506
- print(f"trying with: engine={engine}, sep='{sep2show}'")
2568
+ print(
2569
+ f"trying with: engine={engine}, sep='{sep2show}'"
2570
+ )
2507
2571
  # print(".")
2508
2572
  df = pd.read_csv(
2509
2573
  fpath,
@@ -2524,12 +2588,12 @@ def fload(fpath, kind=None, **kwargs):
2524
2588
  if verbose:
2525
2589
  (
2526
2590
  display(df.head(2))
2527
- if isinstance(df, pd.DataFrame)
2591
+ if isinstance(df, pd.DataFrame)
2528
2592
  else display("it is not a DataFrame")
2529
2593
  )
2530
2594
  (
2531
2595
  print(f"shape: {df.shape}")
2532
- if isinstance(df, pd.DataFrame)
2596
+ if isinstance(df, pd.DataFrame)
2533
2597
  else display("it is not a DataFrame")
2534
2598
  )
2535
2599
  return df
@@ -2663,9 +2727,10 @@ def fload(fpath, kind=None, **kwargs):
2663
2727
  doc = Document(fpath)
2664
2728
  content = [para.text for para in doc.paragraphs]
2665
2729
  return content
2666
-
2730
+
2667
2731
  def load_rtf(file_path):
2668
2732
  from striprtf.striprtf import rtf_to_text
2733
+
2669
2734
  try:
2670
2735
  with open(file_path, "r") as file:
2671
2736
  rtf_content = file.read()
@@ -2715,7 +2780,7 @@ def fload(fpath, kind=None, **kwargs):
2715
2780
  "xml",
2716
2781
  "ipynb",
2717
2782
  "mtx",
2718
- "rtf"
2783
+ "rtf",
2719
2784
  ]
2720
2785
  zip_types = [
2721
2786
  "gz",
@@ -2735,7 +2800,7 @@ def fload(fpath, kind=None, **kwargs):
2735
2800
  if kind not in supported_types:
2736
2801
  print(
2737
2802
  f'Warning:\n"{kind}" is not in the supported list '
2738
- ) # {supported_types}')
2803
+ ) # {supported_types}')
2739
2804
 
2740
2805
  if kind == "docx":
2741
2806
  return load_docx(fpath)
@@ -2760,10 +2825,11 @@ def fload(fpath, kind=None, **kwargs):
2760
2825
  if run_once_within(reverse=True) and verbose:
2761
2826
  use_pd("read_pickle")
2762
2827
  try:
2763
- res_=pd.read_pickle(fpath, **kwargs)
2828
+ res_ = pd.read_pickle(fpath, **kwargs)
2764
2829
  except Exception as e:
2765
2830
  import pickle
2766
- with open('sgd_classifier.pkl', 'rb') as f:
2831
+
2832
+ with open("sgd_classifier.pkl", "rb") as f:
2767
2833
  res_ = pickle.load(f)
2768
2834
  return res_
2769
2835
  elif kind in ["ods", "ods", "odt"]:
@@ -2775,21 +2841,34 @@ def fload(fpath, kind=None, **kwargs):
2775
2841
  engine = kwargs.get("engine", "xlrd")
2776
2842
  kwargs.pop("engine", None)
2777
2843
  content = load_excel(fpath, engine=engine, **kwargs)
2778
- print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) and verbose else None
2844
+ (
2845
+ print(f"shape: {content.shape}")
2846
+ if isinstance(content, pd.DataFrame) and verbose
2847
+ else None
2848
+ )
2779
2849
  display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2780
2850
  return content
2781
2851
  elif kind == "xlsx":
2782
2852
  verbose = kwargs.pop("verbose", False)
2783
2853
  content = load_excel(fpath, **kwargs)
2784
- display(content.head(3)) if isinstance(content, pd.DataFrame) and verbose else None
2854
+ (
2855
+ display(content.head(3))
2856
+ if isinstance(content, pd.DataFrame) and verbose
2857
+ else None
2858
+ )
2785
2859
  print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
2786
2860
  return content
2787
2861
  elif kind == "mtx":
2788
2862
  from scipy.io import mmread
2863
+
2789
2864
  verbose = kwargs.pop("verbose", False)
2790
2865
  dat_mtx = mmread(fpath)
2791
2866
  content = pd.DataFrame.sparse.from_spmatrix(dat_mtx, **kwargs)
2792
- display(content.head(3)) if isinstance(content, pd.DataFrame) and verbose else None
2867
+ (
2868
+ display(content.head(3))
2869
+ if isinstance(content, pd.DataFrame) and verbose
2870
+ else None
2871
+ )
2793
2872
  print(f"shape: {content.shape}")
2794
2873
  return content
2795
2874
  elif kind == "ipynb":
@@ -2904,34 +2983,34 @@ def fopen(fpath):
2904
2983
  import os
2905
2984
  import platform
2906
2985
  import sys
2986
+
2907
2987
  try:
2908
2988
  # Check if the file exists
2909
2989
  if not os.path.isfile(fpath):
2910
2990
  print(f"Error: The file does not exist - {fpath}")
2911
2991
  return
2912
-
2992
+
2913
2993
  # Get the system platform
2914
2994
  system = platform.system()
2915
2995
 
2916
2996
  # Platform-specific file opening commands
2917
2997
  if system == "Darwin": # macOS
2918
- os.system(f"open \"{fpath}\"")
2998
+ os.system(f'open "{fpath}"')
2919
2999
  elif system == "Windows": # Windows
2920
3000
  # Ensure the path is handled correctly in Windows, escape spaces
2921
- os.system(f"start \"\" \"{fpath}\"")
3001
+ os.system(f'start "" "{fpath}"')
2922
3002
  elif system == "Linux": # Linux
2923
- os.system(f"xdg-open \"{fpath}\"")
3003
+ os.system(f'xdg-open "{fpath}"')
2924
3004
  elif system == "Java": # Java (or other unhandled systems)
2925
3005
  print(f"Opening {fpath} on unsupported system.")
2926
3006
  else:
2927
3007
  print(f"Unsupported OS: {system}")
2928
-
3008
+
2929
3009
  print(f"Successfully opened {fpath} with the default application.")
2930
3010
  except Exception as e:
2931
3011
  print(f"Error opening file {fpath}: {e}")
2932
3012
 
2933
3013
 
2934
-
2935
3014
  def fupdate(fpath, content=None, how="head"):
2936
3015
  """
2937
3016
  Update a file by adding new content at the top and moving the old content to the bottom.
@@ -3346,9 +3425,10 @@ def fsave(
3346
3425
  except Exception as e:
3347
3426
  try:
3348
3427
  import pickle
3349
- with open(fpath, 'wb') as f:
3428
+
3429
+ with open(fpath, "wb") as f:
3350
3430
  pickle.dump(content, f)
3351
- print('done!', fpath)
3431
+ print("done!", fpath)
3352
3432
  except Exception as e:
3353
3433
  raise ValueError(
3354
3434
  f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
@@ -3508,9 +3588,9 @@ def isa(content, kind):
3508
3588
  """
3509
3589
  if "img" in kind.lower() or "image" in kind.lower():
3510
3590
  return is_image(content)
3511
- elif 'vid' in kind.lower():
3591
+ elif "vid" in kind.lower():
3512
3592
  return is_video(content)
3513
- elif 'aud' in kind.lower():
3593
+ elif "aud" in kind.lower():
3514
3594
  return is_audio(content)
3515
3595
  elif "doc" in kind.lower():
3516
3596
  return is_document(content)
@@ -3755,22 +3835,28 @@ def get_os(full=False, verbose=False):
3755
3835
 
3756
3836
  def get_system_uptime():
3757
3837
  """Returns system uptime as a human-readable string."""
3758
- boot_time = datetime.fromtimestamp(psutil.boot_time())
3759
- uptime = datetime.now() - boot_time
3760
- return str(uptime).split(".")[0] # Remove microseconds
3838
+ try:
3839
+ boot_time = datetime.fromtimestamp(psutil.boot_time())
3840
+ uptime = datetime.now() - boot_time
3841
+ return str(uptime).split(".")[0] # Remove microseconds
3842
+ except:
3843
+ return None
3761
3844
 
3762
3845
  def get_active_processes(limit=10):
3763
- processes = []
3764
- for proc in psutil.process_iter(
3765
- ["pid", "name", "cpu_percent", "memory_percent"]
3766
- ):
3767
- try:
3768
- processes.append(proc.info)
3769
- except psutil.NoSuchProcess:
3770
- pass
3771
- # Handle NoneType values by treating them as 0
3772
- processes.sort(key=lambda x: x["cpu_percent"] or 0, reverse=True)
3773
- return processes[:limit]
3846
+ try:
3847
+ processes = []
3848
+ for proc in psutil.process_iter(
3849
+ ["pid", "name", "cpu_percent", "memory_percent"]
3850
+ ):
3851
+ try:
3852
+ processes.append(proc.info)
3853
+ except psutil.NoSuchProcess:
3854
+ pass
3855
+ # Handle NoneType values by treating them as 0
3856
+ processes.sort(key=lambda x: x["cpu_percent"] or 0, reverse=True)
3857
+ return processes[:limit]
3858
+ except:
3859
+ return None
3774
3860
 
3775
3861
  def get_virtual_environment_info():
3776
3862
  """Checks if the script is running in a virtual environment and returns details."""
@@ -3801,19 +3887,22 @@ def get_os(full=False, verbose=False):
3801
3887
 
3802
3888
  def get_battery_status():
3803
3889
  """Returns battery status."""
3804
- battery = psutil.sensors_battery()
3805
- if battery:
3806
- time_left = (
3807
- str(timedelta(seconds=battery.secsleft))
3808
- if battery.secsleft != psutil.POWER_TIME_UNLIMITED
3809
- else "Charging/Unlimited"
3810
- )
3811
- return {
3812
- "Percentage": battery.percent,
3813
- "Plugged In": battery.power_plugged,
3814
- "Time Left": time_left,
3815
- }
3816
- return {"Status": "No battery detected"}
3890
+ try:
3891
+ battery = psutil.sensors_battery()
3892
+ if battery:
3893
+ time_left = (
3894
+ str(timedelta(seconds=battery.secsleft))
3895
+ if battery.secsleft != psutil.POWER_TIME_UNLIMITED
3896
+ else "Charging/Unlimited"
3897
+ )
3898
+ return {
3899
+ "Percentage": battery.percent,
3900
+ "Plugged In": battery.power_plugged,
3901
+ "Time Left": time_left,
3902
+ }
3903
+ return {"Status": "No battery detected"}
3904
+ except:
3905
+ return {"Status": "No battery detected"}
3817
3906
 
3818
3907
  def get_disk_io():
3819
3908
  """Returns disk I/O statistics."""
@@ -3899,8 +3988,8 @@ def get_os(full=False, verbose=False):
3899
3988
  "network": {},
3900
3989
  "network io": get_network_io(),
3901
3990
  "gpu": [],
3902
- "temperatures": get_temperatures(),
3903
- "battery": get_battery_status(),
3991
+ # "temperatures": get_temperatures(),
3992
+ # "battery": get_battery_status(),
3904
3993
  "active processes": get_active_processes(),
3905
3994
  "environment": {
3906
3995
  "user": os.getenv("USER", "Unknown"),
@@ -3984,27 +4073,31 @@ def get_os(full=False, verbose=False):
3984
4073
  pnrint(e)
3985
4074
  return res
3986
4075
 
4076
+
3987
4077
  import re
3988
4078
  import stat
3989
4079
  import platform
4080
+
4081
+
3990
4082
  def listdir(
3991
4083
  rootdir,
3992
4084
  kind=None,
3993
4085
  sort_by="name",
3994
4086
  ascending=True,
3995
- contains=None,# filter filenames using re
3996
- booster=False,# walk in subfolders
3997
- depth = 0, # 0: no subfolders; None: all subfolders; [int 1,2,3]: levels of subfolders
4087
+ contains=None, # filter filenames using re
4088
+ booster=False, # walk in subfolders
4089
+ depth=0, # 0: no subfolders; None: all subfolders; [int 1,2,3]: levels of subfolders
3998
4090
  hidden=False, # Include hidden files/folders
3999
4091
  orient="list",
4000
4092
  output="df", # "df", 'list','dict','records','index','series'
4001
4093
  verbose=True,
4002
- ):
4094
+ ):
4003
4095
  def is_hidden(filepath):
4004
4096
  """Check if a file or folder is hidden."""
4005
4097
  system = platform.system()
4006
4098
  if system == "Windows":
4007
4099
  import ctypes
4100
+
4008
4101
  attribute = ctypes.windll.kernel32.GetFileAttributesW(filepath)
4009
4102
  if attribute == -1:
4010
4103
  raise FileNotFoundError(f"File {filepath} not found.")
@@ -4019,6 +4112,7 @@ def listdir(
4019
4112
  return os.environ.get("USERNAME", "Unknown")
4020
4113
  else:
4021
4114
  import pwd
4115
+
4022
4116
  return pwd.getpwuid(os.getuid()).pw_name
4023
4117
 
4024
4118
  if isinstance(kind, list):
@@ -4030,7 +4124,7 @@ def listdir(
4030
4124
  sort_by=sort_by,
4031
4125
  ascending=ascending,
4032
4126
  contains=contains,
4033
- depth=depth,# walk in subfolders
4127
+ depth=depth, # walk in subfolders
4034
4128
  hidden=hidden,
4035
4129
  orient=orient,
4036
4130
  output=output,
@@ -4046,21 +4140,21 @@ def listdir(
4046
4140
  i = 0
4047
4141
  f = {
4048
4142
  "name": [],
4049
- 'kind':[],
4143
+ "kind": [],
4050
4144
  "length": [],
4051
- "basename":[],
4145
+ "basename": [],
4052
4146
  "path": [],
4053
4147
  "created_time": [],
4054
4148
  "modified_time": [],
4055
4149
  "last_open_time": [],
4056
4150
  "size": [],
4057
- "permission":[],
4058
- "owner":[],
4059
- "rootdir":[],
4151
+ "permission": [],
4152
+ "owner": [],
4153
+ "rootdir": [],
4060
4154
  "fname": [],
4061
4155
  "fpath": [],
4062
- "num":[],
4063
- "os":[]
4156
+ "num": [],
4157
+ "os": [],
4064
4158
  }
4065
4159
  root_depth = rootdir.rstrip(os.sep).count(os.sep)
4066
4160
  for dirpath, dirnames, ls in os.walk(rootdir):
@@ -4069,30 +4163,32 @@ def listdir(
4069
4163
  if depth is not None and current_depth > depth:
4070
4164
  dirnames[:] = [] # Prevent further traversal into subfolders
4071
4165
  continue
4072
-
4166
+
4073
4167
  if not hidden:
4074
- dirnames[:] = [d for d in dirnames if not is_hidden(os.path.join(dirpath, d))]
4168
+ dirnames[:] = [
4169
+ d for d in dirnames if not is_hidden(os.path.join(dirpath, d))
4170
+ ]
4075
4171
  ls = [i for i in ls if not is_hidden(os.path.join(dirpath, i))]
4076
4172
 
4077
4173
  for dirname in dirnames:
4078
- if kind is not None and kind not in fd: # do not check folders
4174
+ if kind is not None and kind not in fd: # do not check folders
4079
4175
  continue
4080
4176
  if contains and not re.search(contains, dirname):
4081
4177
  continue
4082
4178
  dirname_path = os.path.join(dirpath, dirname)
4083
- fpath = os.path.join(os.path.dirname(dirname_path), dirname)
4179
+ fpath = os.path.join(os.path.dirname(dirname_path), dirname)
4084
4180
  try:
4085
4181
  stats_file = os.stat(fpath)
4086
4182
  except Exception as e:
4087
4183
  print(e)
4088
4184
  continue
4089
4185
  filename, file_extension = os.path.splitext(dirname)
4090
- file_extension = file_extension if file_extension!='' else None
4186
+ file_extension = file_extension if file_extension != "" else None
4091
4187
  f["name"].append(filename)
4092
- f['kind'].append(file_extension)
4188
+ f["kind"].append(file_extension)
4093
4189
  f["length"].append(len(filename))
4094
4190
  f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
4095
- f['basename'].append(os.path.basename(dirname_path))
4191
+ f["basename"].append(os.path.basename(dirname_path))
4096
4192
  f["path"].append(os.path.join(os.path.dirname(dirname_path), dirname))
4097
4193
  f["created_time"].append(
4098
4194
  pd.to_datetime(int(os.path.getctime(dirname_path)), unit="s")
@@ -4110,7 +4206,7 @@ def listdir(
4110
4206
  f["fpath"].append(fpath) # will be removed
4111
4207
  i += 1
4112
4208
  for item in ls:
4113
- if kind in fd:# only check folders
4209
+ if kind in fd: # only check folders
4114
4210
  continue
4115
4211
  if contains and not re.search(contains, item):
4116
4212
  continue
@@ -4127,7 +4223,16 @@ def listdir(
4127
4223
  is_file = kind.lower() in file_extension.lower() and (
4128
4224
  os.path.isfile(item_path)
4129
4225
  )
4130
- if kind in [".doc", ".img", ".zip",".code",".file",".image",".video",".audio"]: # 选择大的类别
4226
+ if kind in [
4227
+ ".doc",
4228
+ ".img",
4229
+ ".zip",
4230
+ ".code",
4231
+ ".file",
4232
+ ".image",
4233
+ ".video",
4234
+ ".audio",
4235
+ ]: # 选择大的类别
4131
4236
  if kind != ".folder" and not isa(item_path, kind):
4132
4237
  continue
4133
4238
  elif kind in [".all"]:
@@ -4135,13 +4240,13 @@ def listdir(
4135
4240
  else: # 精确到文件的后缀
4136
4241
  if not is_folder and not is_file:
4137
4242
  continue
4138
- file_extension = file_extension if file_extension!='' else None
4243
+ file_extension = file_extension if file_extension != "" else None
4139
4244
  f["name"].append(filename)
4140
- f['kind'].append(file_extension)
4245
+ f["kind"].append(file_extension)
4141
4246
  f["length"].append(len(filename))
4142
4247
  f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
4143
- f['basename'].append(os.path.basename(item_path))
4144
- f["path"].append(os.path.join(os.path.dirname(item_path), item))
4248
+ f["basename"].append(os.path.basename(item_path))
4249
+ f["path"].append(os.path.join(os.path.dirname(item_path), item))
4145
4250
  f["created_time"].append(
4146
4251
  pd.to_datetime(int(os.path.getctime(item_path)), unit="s")
4147
4252
  )
@@ -4152,7 +4257,9 @@ def listdir(
4152
4257
  pd.to_datetime(int(os.path.getatime(item_path)), unit="s")
4153
4258
  )
4154
4259
  f["permission"].append(stat.filemode(stats_file.st_mode)),
4155
- f["owner"].append(os.getlogin() if platform.system() != "Windows" else "N/A"),
4260
+ f["owner"].append(
4261
+ os.getlogin() if platform.system() != "Windows" else "N/A"
4262
+ ),
4156
4263
  f["fname"].append(filename) # will be removed
4157
4264
  f["fpath"].append(fpath) # will be removed
4158
4265
  f["rootdir"].append(dirpath)
@@ -4162,11 +4269,28 @@ def listdir(
4162
4269
  f["os"] = get_os() # os.uname().machine
4163
4270
  # if not booster: # go deeper subfolders
4164
4271
  # break
4165
- #* convert to pd.DataFrame
4272
+ # * convert to pd.DataFrame
4166
4273
  f = pd.DataFrame(f)
4167
- f=f[["basename","name","kind","length","size","num","path","created_time",
4168
- "modified_time","last_open_time","rootdir",
4169
- "permission","owner","os","fname","fpath",]]
4274
+ f = f[
4275
+ [
4276
+ "basename",
4277
+ "name",
4278
+ "kind",
4279
+ "length",
4280
+ "size",
4281
+ "num",
4282
+ "path",
4283
+ "created_time",
4284
+ "modified_time",
4285
+ "last_open_time",
4286
+ "rootdir",
4287
+ "permission",
4288
+ "owner",
4289
+ "os",
4290
+ "fname",
4291
+ "fpath",
4292
+ ]
4293
+ ]
4170
4294
  if "nam" in sort_by.lower():
4171
4295
  f = sort_kind(f, by="name", ascending=ascending)
4172
4296
  elif "crea" in sort_by.lower():
@@ -4183,6 +4307,7 @@ def listdir(
4183
4307
  return f
4184
4308
  else:
4185
4309
  from box import Box
4310
+
4186
4311
  if "l" in orient.lower(): # list # default
4187
4312
  res_output = Box(f.to_dict(orient="list"))
4188
4313
  return res_output
@@ -4195,6 +4320,7 @@ def listdir(
4195
4320
  if "se" in orient.lower(): # records
4196
4321
  return Box(f.to_dict(orient="series"))
4197
4322
 
4323
+
4198
4324
  def listfunc(lib_name, opt="call"):
4199
4325
  if opt == "call":
4200
4326
  funcs = [func for func in dir(lib_name) if callable(getattr(lib_name, func))]
@@ -4206,6 +4332,7 @@ def listfunc(lib_name, opt="call"):
4206
4332
  def func_list(lib_name, opt="call"):
4207
4333
  return list_func(lib_name, opt=opt)
4208
4334
 
4335
+
4209
4336
  def copy(src, dst, overwrite=False):
4210
4337
  """Copy a file from src to dst."""
4211
4338
  try:
@@ -4223,25 +4350,31 @@ def copy(src, dst, overwrite=False):
4223
4350
  if overwrite:
4224
4351
  dst.unlink()
4225
4352
  else:
4226
- dst = dst.with_name(f"{dst.stem}_{datetime.now().strftime('_%H%M%S')}{dst.suffix}")
4353
+ dst = dst.with_name(
4354
+ f"{dst.stem}_{datetime.now().strftime('_%H%M%S')}{dst.suffix}"
4355
+ )
4227
4356
  shutil.copy(src, dst)
4228
4357
  print(f"\n Done! copy to {dst}\n")
4229
4358
  else:
4230
- dst = dst/src.name
4359
+ dst = dst / src.name
4231
4360
  if dst.exists():
4232
4361
  if overwrite:
4233
4362
  shutil.rmtree(dst) # Remove existing directory
4234
4363
  else:
4235
- dst = dst.with_name(f"{dst.stem}_{datetime.now().strftime('%H%M%S')}")
4364
+ dst = dst.with_name(
4365
+ f"{dst.stem}_{datetime.now().strftime('%H%M%S')}"
4366
+ )
4236
4367
  shutil.copytree(src, dst)
4237
4368
  print(f"\n Done! copy to {dst}\n")
4238
4369
 
4239
4370
  except Exception as e:
4240
4371
  logging.error(f"Failed {e}")
4241
-
4372
+
4373
+
4242
4374
  def cut(src, dst, overwrite=False):
4243
4375
  return move(src=src, dst=dst, overwrite=overwrite)
4244
4376
 
4377
+
4245
4378
  def move(src, dst, overwrite=False):
4246
4379
  try:
4247
4380
  dir_par_dst = os.path.dirname(dst)
@@ -4256,23 +4389,26 @@ def move(src, dst, overwrite=False):
4256
4389
  # dst.unlink() # Delete the existing file
4257
4390
  pass
4258
4391
  else:
4259
- dst = dst.with_name(f"{dst.stem}_{datetime.now().strftime('_%H%M%S')}{dst.suffix}")
4392
+ dst = dst.with_name(
4393
+ f"{dst.stem}_{datetime.now().strftime('_%H%M%S')}{dst.suffix}"
4394
+ )
4260
4395
  shutil.move(src, dst)
4261
4396
  print(f"\n Done! moved to {dst}\n")
4262
4397
  except Exception as e:
4263
4398
  logging.error(f"Failed to move file from {src} to {dst}: {e}")
4264
-
4399
+
4400
+
4265
4401
  def delete(fpath):
4266
- """Delete a file/folder."""
4402
+ """Delete a file/folder."""
4267
4403
  try:
4268
4404
  fpath = Path(fpath)
4269
- if not fpath.is_dir(): # file
4405
+ if not fpath.is_dir(): # file
4270
4406
  if fpath.exists():
4271
4407
  fpath.unlink()
4272
4408
  print(f"\n Done! delete {fpath}\n")
4273
4409
  else:
4274
4410
  print(f"File '{fpath}' does not exist.")
4275
- else:#folder
4411
+ else: # folder
4276
4412
  if fpath.exists():
4277
4413
  shutil.rmtree(fpath) # Remove existing directory
4278
4414
  print(f"\n Done! delete {fpath}\n")
@@ -4280,27 +4416,31 @@ def delete(fpath):
4280
4416
  print(f"Folder '{fpath}' does not exist.")
4281
4417
  except Exception as e:
4282
4418
  logging.error(f"Failed to delete {fpath}: {e}")
4419
+
4420
+
4283
4421
  def rename(fpath, dst, smart=True):
4284
4422
  """Rename a file or folder."""
4285
4423
  try:
4286
- src_kind,dst_kind = None,None
4424
+ src_kind, dst_kind = None, None
4287
4425
  if smart:
4288
- dir_name_src=os.path.dirname(fpath)
4289
- dir_name_dst=os.path.dirname(dst)
4290
- src_kind=os.path.splitext(fpath)[1]
4291
- dst_kind=os.path.splitext(dst)[1]
4292
- if dir_name_dst!=dir_name_src:
4293
- dst=os.path.join(dir_name_src,dst)
4426
+ dir_name_src = os.path.dirname(fpath)
4427
+ dir_name_dst = os.path.dirname(dst)
4428
+ src_kind = os.path.splitext(fpath)[1]
4429
+ dst_kind = os.path.splitext(dst)[1]
4430
+ if dir_name_dst != dir_name_src:
4431
+ dst = os.path.join(dir_name_src, dst)
4294
4432
  if dst_kind is not None and src_kind is not None:
4295
- if dst_kind!=src_kind:
4296
- dst=dst + src_kind
4433
+ if dst_kind != src_kind:
4434
+ dst = dst + src_kind
4297
4435
  if os.path.exists(fpath):
4298
- os.rename(fpath,dst)
4436
+ os.rename(fpath, dst)
4299
4437
  print(f"Done! rename to {dst}")
4300
4438
  else:
4301
4439
  print(f"Failed: {fpath} does not exist.")
4302
4440
  except Exception as e:
4303
4441
  logging.error(f"Failed to rename {fpath} to {dst}: {e}")
4442
+
4443
+
4304
4444
  def mkdir_nest(fpath: str) -> str:
4305
4445
  """
4306
4446
  Create nested directories based on the provided file path.
@@ -4319,9 +4459,13 @@ def mkdir_nest(fpath: str) -> str:
4319
4459
  dir_parts = fpath.split(f_slash) # Split the path by the OS-specific separator
4320
4460
 
4321
4461
  # Start creating directories from the root to the desired path
4322
- root_dir = os.path.splitdrive(fpath)[0] # Get the root drive on Windows (e.g., 'C:')
4323
- current_path = root_dir if root_dir else f_slash # Start from the root directory or POSIX '/'
4324
-
4462
+ root_dir = os.path.splitdrive(fpath)[
4463
+ 0
4464
+ ] # Get the root drive on Windows (e.g., 'C:')
4465
+ current_path = (
4466
+ root_dir if root_dir else f_slash
4467
+ ) # Start from the root directory or POSIX '/'
4468
+
4325
4469
  for part in dir_parts:
4326
4470
  if part:
4327
4471
  current_path = os.path.join(current_path, part)
@@ -4346,7 +4490,7 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
4346
4490
  - str: The path of the created directory or an error message.
4347
4491
  """
4348
4492
  rootdir = []
4349
- pardir= mkdir_nest(pardir)
4493
+ pardir = mkdir_nest(pardir)
4350
4494
  if chdir is None:
4351
4495
  return pardir
4352
4496
  else:
@@ -4465,6 +4609,7 @@ def figsave(*args, dpi=300):
4465
4609
  img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
4466
4610
  elif isinstance(img, np.ndarray):
4467
4611
  import cv2
4612
+
4468
4613
  # Check the shape of the image to determine color mode
4469
4614
  if img.ndim == 2:
4470
4615
  # Grayscale image
@@ -4496,8 +4641,13 @@ def figsave(*args, dpi=300):
4496
4641
  )
4497
4642
  else:
4498
4643
  plt.savefig(
4499
- fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0
4500
- )
4644
+ fname,
4645
+ format=ftype.lower(),
4646
+ dpi=dpi,
4647
+ bbox_inches="tight",
4648
+ transparent=True,
4649
+ pad_inches=0,
4650
+ )
4501
4651
  elif ftype.lower() == "emf":
4502
4652
  plt.savefig(fname, format="emf", dpi=dpi, bbox_inches="tight", pad_inches=0)
4503
4653
  elif ftype.lower() == "fig":
@@ -4534,6 +4684,7 @@ def is_num(s):
4534
4684
  def isnum(s):
4535
4685
  return is_num(s)
4536
4686
 
4687
+
4537
4688
  def is_image(fpath):
4538
4689
  """
4539
4690
  Determine if a given file is an image based on MIME type and file extension.
@@ -4545,6 +4696,7 @@ def is_image(fpath):
4545
4696
  bool: True if the file is a recognized image, False otherwise.
4546
4697
  """
4547
4698
  import mimetypes
4699
+
4548
4700
  # Known image MIME types
4549
4701
  image_mime_types = {
4550
4702
  "image/jpeg",
@@ -4561,8 +4713,20 @@ def is_image(fpath):
4561
4713
 
4562
4714
  # Known image file extensions
4563
4715
  image_extensions = {
4564
- ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tif", ".tiff",
4565
- ".ico", ".svg", ".heic", ".heif",".fig",".jpg"
4716
+ ".jpg",
4717
+ ".jpeg",
4718
+ ".png",
4719
+ ".gif",
4720
+ ".bmp",
4721
+ ".webp",
4722
+ ".tif",
4723
+ ".tiff",
4724
+ ".ico",
4725
+ ".svg",
4726
+ ".heic",
4727
+ ".heif",
4728
+ ".fig",
4729
+ ".jpg",
4566
4730
  }
4567
4731
 
4568
4732
  # Get MIME type using mimetypes
@@ -4573,12 +4737,15 @@ def is_image(fpath):
4573
4737
  return True
4574
4738
 
4575
4739
  # Fallback: Check file extension
4576
- ext = os.path.splitext(fpath)[-1].lower() # Get the file extension and ensure lowercase
4740
+ ext = os.path.splitext(fpath)[
4741
+ -1
4742
+ ].lower() # Get the file extension and ensure lowercase
4577
4743
  if ext in image_extensions:
4578
4744
  return True
4579
4745
 
4580
4746
  return False
4581
4747
 
4748
+
4582
4749
  def is_video(fpath):
4583
4750
  """
4584
4751
  Determine if a given file is a video based on MIME type and file extension.
@@ -4590,6 +4757,7 @@ def is_video(fpath):
4590
4757
  bool: True if the file is a recognized video, False otherwise.
4591
4758
  """
4592
4759
  import mimetypes
4760
+
4593
4761
  # Known video MIME types
4594
4762
  video_mime_types = {
4595
4763
  "video/mp4",
@@ -4610,8 +4778,22 @@ def is_video(fpath):
4610
4778
 
4611
4779
  # Known video file extensions
4612
4780
  video_extensions = {
4613
- ".mp4", ".mov", ".avi", ".mkv", ".flv", ".webm", ".ogv", ".wmv",
4614
- ".mpg", ".mpeg", ".3gp", ".mpeg2", ".asf", ".ts", ".m4v", ".divx",
4781
+ ".mp4",
4782
+ ".mov",
4783
+ ".avi",
4784
+ ".mkv",
4785
+ ".flv",
4786
+ ".webm",
4787
+ ".ogv",
4788
+ ".wmv",
4789
+ ".mpg",
4790
+ ".mpeg",
4791
+ ".3gp",
4792
+ ".mpeg2",
4793
+ ".asf",
4794
+ ".ts",
4795
+ ".m4v",
4796
+ ".divx",
4615
4797
  }
4616
4798
 
4617
4799
  # Get MIME type using mimetypes
@@ -4622,12 +4804,15 @@ def is_video(fpath):
4622
4804
  return True
4623
4805
 
4624
4806
  # Fallback: Check file extension
4625
- ext = os.path.splitext(fpath)[-1].lower() # Get the file extension and ensure lowercase
4807
+ ext = os.path.splitext(fpath)[
4808
+ -1
4809
+ ].lower() # Get the file extension and ensure lowercase
4626
4810
  if ext in video_extensions:
4627
4811
  return True
4628
4812
 
4629
4813
  return False
4630
4814
 
4815
+
4631
4816
  def is_document(fpath):
4632
4817
  """
4633
4818
  Determine if a given file is a document based on MIME type and file extension.
@@ -4639,6 +4824,7 @@ def is_document(fpath):
4639
4824
  bool: True if the file is a recognized document, False otherwise.
4640
4825
  """
4641
4826
  import mimetypes
4827
+
4642
4828
  # Define known MIME types for documents
4643
4829
  document_mime_types = {
4644
4830
  "text/",
@@ -4679,18 +4865,23 @@ def is_document(fpath):
4679
4865
 
4680
4866
  # Get MIME type
4681
4867
  mime_type, _ = mimetypes.guess_type(fpath)
4682
-
4868
+
4683
4869
  # Check MIME type
4684
- if mime_type and any(mime_type.startswith(doc_type) for doc_type in document_mime_types):
4870
+ if mime_type and any(
4871
+ mime_type.startswith(doc_type) for doc_type in document_mime_types
4872
+ ):
4685
4873
  return True
4686
4874
 
4687
4875
  # Fallback: Check file extension
4688
- ext = os.path.splitext(fpath)[-1].lower() # Get the extension, ensure it's lowercase
4876
+ ext = os.path.splitext(fpath)[
4877
+ -1
4878
+ ].lower() # Get the extension, ensure it's lowercase
4689
4879
  if ext in document_extensions:
4690
4880
  return True
4691
4881
 
4692
4882
  return False
4693
4883
 
4884
+
4694
4885
  def is_audio(fpath):
4695
4886
  """
4696
4887
  Determine if a given file is an audio file based on MIME type and file extension.
@@ -4702,6 +4893,7 @@ def is_audio(fpath):
4702
4893
  bool: True if the file is a recognized audio file, False otherwise.
4703
4894
  """
4704
4895
  import mimetypes
4896
+
4705
4897
  # Known audio MIME types
4706
4898
  audio_mime_types = {
4707
4899
  "audio/mpeg",
@@ -4720,8 +4912,19 @@ def is_audio(fpath):
4720
4912
 
4721
4913
  # Known audio file extensions
4722
4914
  audio_extensions = {
4723
- ".mp3", ".wav", ".ogg", ".aac", ".flac", ".midi", ".m4a",
4724
- ".aiff", ".pcm", ".wma", ".ape", ".alac", ".opus",
4915
+ ".mp3",
4916
+ ".wav",
4917
+ ".ogg",
4918
+ ".aac",
4919
+ ".flac",
4920
+ ".midi",
4921
+ ".m4a",
4922
+ ".aiff",
4923
+ ".pcm",
4924
+ ".wma",
4925
+ ".ape",
4926
+ ".alac",
4927
+ ".opus",
4725
4928
  }
4726
4929
 
4727
4930
  # Get MIME type using mimetypes
@@ -4732,12 +4935,15 @@ def is_audio(fpath):
4732
4935
  return True
4733
4936
 
4734
4937
  # Fallback: Check file extension
4735
- ext = os.path.splitext(fpath)[-1].lower() # Get the file extension and ensure lowercase
4938
+ ext = os.path.splitext(fpath)[
4939
+ -1
4940
+ ].lower() # Get the file extension and ensure lowercase
4736
4941
  if ext in audio_extensions:
4737
4942
  return True
4738
4943
 
4739
4944
  return False
4740
4945
 
4946
+
4741
4947
  def is_code(fpath):
4742
4948
  """
4743
4949
  Determine if a given file is a code file based on file extension and optionally MIME type.
@@ -4751,16 +4957,37 @@ def is_code(fpath):
4751
4957
  """
4752
4958
  # Known programming and scripting file extensions
4753
4959
  code_extensions = {
4754
- ".m", ".py", ".ipynb", ".js", ".html", ".css", ".java", ".cpp", ".h", ".cs", ".go",
4755
- ".rs", ".sh", ".rb", ".swift", ".ts", ".json", ".xml", ".yaml", ".toml", ".bash", ".r"
4960
+ ".m",
4961
+ ".py",
4962
+ ".ipynb",
4963
+ ".js",
4964
+ ".html",
4965
+ ".css",
4966
+ ".java",
4967
+ ".cpp",
4968
+ ".h",
4969
+ ".cs",
4970
+ ".go",
4971
+ ".rs",
4972
+ ".sh",
4973
+ ".rb",
4974
+ ".swift",
4975
+ ".ts",
4976
+ ".json",
4977
+ ".xml",
4978
+ ".yaml",
4979
+ ".toml",
4980
+ ".bash",
4981
+ ".r",
4756
4982
  }
4757
4983
 
4758
4984
  # Check file extension
4759
- ext = os.path.splitext(fpath)[-1].lower()
4985
+ ext = os.path.splitext(fpath)[-1].lower()
4760
4986
  if ext in code_extensions:
4761
- return True
4987
+ return True
4762
4988
  return False
4763
-
4989
+
4990
+
4764
4991
  def is_zip(fpath):
4765
4992
  import mimetypes
4766
4993
 
@@ -4982,6 +5209,7 @@ def apply_filter(img, *args):
4982
5209
  )
4983
5210
  return img.filter(supported_filters[filter_name])
4984
5211
 
5212
+
4985
5213
  def detect_angle(image, by="median", template=None):
4986
5214
  """Detect the angle of rotation using various methods."""
4987
5215
  from sklearn.decomposition import PCA
@@ -4989,7 +5217,8 @@ def detect_angle(image, by="median", template=None):
4989
5217
  from skimage.color import rgb2gray
4990
5218
  from scipy.fftpack import fftshift, fft2
4991
5219
  import numpy as np
4992
- import cv2
5220
+ import cv2
5221
+
4993
5222
  # Convert to grayscale
4994
5223
  gray_image = rgb2gray(image)
4995
5224
 
@@ -5091,6 +5320,7 @@ def detect_angle(image, by="median", template=None):
5091
5320
  print(f"Unknown method {by}")
5092
5321
  return 0
5093
5322
 
5323
+
5094
5324
  def imgsets(img, **kwargs):
5095
5325
  """
5096
5326
  Apply various enhancements and filters to an image using PIL's ImageEnhance and ImageFilter modules.
@@ -6355,13 +6585,13 @@ def _df_outlier(
6355
6585
  from scipy.stats import zscore
6356
6586
  from sklearn.ensemble import IsolationForest
6357
6587
  from sklearn.preprocessing import StandardScaler
6358
-
6588
+
6359
6589
  # Fill completely NaN columns with a default value (e.g., 0)
6360
6590
  data = data.copy()
6361
6591
  data.loc[:, data.isna().all()] = 0
6362
6592
  if columns is not None:
6363
- if isinstance(columns, (list,pd.core.indexes.base.Index)):
6364
- data=data[columns]
6593
+ if isinstance(columns, (list, pd.core.indexes.base.Index)):
6594
+ data = data[columns]
6365
6595
  col_names_org = data.columns.tolist()
6366
6596
  index_names_org = data.index.tolist()
6367
6597
  # Separate numeric and non-numeric columns
@@ -6527,6 +6757,7 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
6527
6757
  data = data.explode(column, ignore_index=True)
6528
6758
  return data
6529
6759
 
6760
+
6530
6761
  def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
6531
6762
  """
6532
6763
  Purpose: transforms a datetime feature (like month or day) into a cyclic encoding for use in machine learning models, particularly neural networks.
@@ -6536,24 +6767,30 @@ def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
6536
6767
  data = df_cycle(data, 'month', 12)
6537
6768
  """
6538
6769
  if columns is None:
6539
- columns = list(data.select_dtypes(include=np.number).columns) # If no columns specified, use all columns
6770
+ columns = list(
6771
+ data.select_dtypes(include=np.number).columns
6772
+ ) # If no columns specified, use all columns
6540
6773
  if max_val is None:
6541
- max_val = np.max(data[columns]) # If no max_val specified, use the maximum value across all columns
6774
+ max_val = np.max(
6775
+ data[columns]
6776
+ ) # If no max_val specified, use the maximum value across all columns
6542
6777
  if isinstance(columns, str):
6543
- columns = [columns] # If a single column name is provided as a string, convert it to a list
6544
-
6778
+ columns = [
6779
+ columns
6780
+ ] # If a single column name is provided as a string, convert it to a list
6781
+
6545
6782
  # Check if inplace is True, so we modify the original dataframe
6546
6783
  if inplace:
6547
6784
  # Modify the data in place, no return statement needed
6548
6785
  for col in columns:
6549
- data[col + '_sin'] = np.sin(2 * np.pi * data[col] / max_val)
6550
- data[col + '_cos'] = np.cos(2 * np.pi * data[col] / max_val)
6786
+ data[col + "_sin"] = np.sin(2 * np.pi * data[col] / max_val)
6787
+ data[col + "_cos"] = np.cos(2 * np.pi * data[col] / max_val)
6551
6788
  else:
6552
6789
  # If inplace is False, return the modified dataframe
6553
6790
  new_data = data.copy()
6554
6791
  for col in columns:
6555
- new_data[col + '_sin'] = np.sin(2 * np.pi * new_data[col] / max_val)
6556
- new_data[col + '_cos'] = np.cos(2 * np.pi * new_data[col] / max_val)
6792
+ new_data[col + "_sin"] = np.sin(2 * np.pi * new_data[col] / max_val)
6793
+ new_data[col + "_cos"] = np.cos(2 * np.pi * new_data[col] / max_val)
6557
6794
  return new_data
6558
6795
 
6559
6796
 
@@ -6561,7 +6798,7 @@ def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
6561
6798
  def df_astype(
6562
6799
  data: pd.DataFrame,
6563
6800
  columns: Optional[Union[str, List[str]]] = None,
6564
- astype: str = None,#"datetime",
6801
+ astype: str = None, # "datetime",
6565
6802
  skip_row: Union[str, list] = None,
6566
6803
  fmt: Optional[str] = None,
6567
6804
  inplace: bool = False,
@@ -6624,7 +6861,7 @@ def df_astype(
6624
6861
  "day",
6625
6862
  "month",
6626
6863
  "year",
6627
- "circular"
6864
+ "circular",
6628
6865
  ]
6629
6866
  # If inplace is False, make a copy of the DataFrame
6630
6867
  if not inplace:
@@ -6720,12 +6957,12 @@ def df_astype(
6720
6957
  data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
6721
6958
  # print(f"Successfully converted '{column}' to timedelta.")
6722
6959
  elif astype == "circular":
6723
- max_val = kwargs.get('max_val',None)
6724
- data[column]=df_cycle(data=data,columns=column,max_val=max_val)
6960
+ max_val = kwargs.get("max_val", None)
6961
+ data[column] = df_cycle(data=data, columns=column, max_val=max_val)
6725
6962
  else:
6726
6963
  # Convert to other types (e.g., float, int)
6727
- if astype=='int':
6728
- data[column] = data[column].astype('float').astype('int')
6964
+ if astype == "int":
6965
+ data[column] = data[column].astype("float").astype("int")
6729
6966
  else:
6730
6967
  data[column] = data[column].astype(astype)
6731
6968
  # print(f"Successfully converted '{column}' to {astype}.")
@@ -6775,7 +7012,9 @@ def df_sort_values(data, column, by=None, ascending=True, inplace=True, **kwargs
6775
7012
  ).index.tolist()
6776
7013
 
6777
7014
  # Convert to a categorical type with the new order
6778
- data[column] = pd.Categorical(data[column], categories=sorted_counts, ordered=True)
7015
+ data[column] = pd.Categorical(
7016
+ data[column], categories=sorted_counts, ordered=True
7017
+ )
6779
7018
  # Set ascending to count_ascending for sorting
6780
7019
  ascending = count_ascending # Adjust ascending for the final sort
6781
7020
  elif isinstance(by, list):
@@ -6977,7 +7216,7 @@ def df_fillna(
6977
7216
  # Fill completely NaN columns with a default value (e.g., 0)
6978
7217
  data = data.copy()
6979
7218
  data.loc[:, data.isna().all()] = 0
6980
-
7219
+
6981
7220
  col_names_org = data.columns.tolist()
6982
7221
  index_names_org = data.index.tolist()
6983
7222
  # Separate numeric and non-numeric columns
@@ -7034,7 +7273,7 @@ def df_fillna(
7034
7273
  imputed_data = imputer.fit_transform(numeric_data.T)
7035
7274
  else:
7036
7275
  raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
7037
-
7276
+
7038
7277
  imputed_data = pd.DataFrame(
7039
7278
  imputed_data if axis == 0 else imputed_data.T,
7040
7279
  index=numeric_data.index if axis == 0 else numeric_data.columns,
@@ -7179,11 +7418,15 @@ def df_encoder(
7179
7418
 
7180
7419
  encoder = LabelEncoder()
7181
7420
  # Apply LabelEncoder only to non-numeric columns
7182
- non_numeric_columns = [col for col in columns if not pd.api.types.is_numeric_dtype(data[col])]
7421
+ non_numeric_columns = [
7422
+ col for col in columns if not pd.api.types.is_numeric_dtype(data[col])
7423
+ ]
7183
7424
 
7184
7425
  if not non_numeric_columns:
7185
7426
  return data
7186
- encoded_data = data[non_numeric_columns].apply(lambda col: encoder.fit_transform(col))
7427
+ encoded_data = data[non_numeric_columns].apply(
7428
+ lambda col: encoder.fit_transform(col)
7429
+ )
7187
7430
  return pd.concat([data.drop(non_numeric_columns, axis=1), encoded_data], axis=1)
7188
7431
 
7189
7432
  # Target encoding (Mean of the target for each category)
@@ -7210,13 +7453,13 @@ def df_scaler(
7210
7453
  scaler=None,
7211
7454
  method="standard",
7212
7455
  columns=None, # default, select all numeric col/row
7213
- feature_range=None,# specific for 'minmax'
7456
+ feature_range=None, # specific for 'minmax'
7214
7457
  vmin=0,
7215
7458
  vmax=1,
7216
7459
  inplace=False,
7217
7460
  verbose=False, # show usage
7218
7461
  axis=0, # defalut column-wise
7219
- return_scaler:bool=False,# True: return both: return df, scaler
7462
+ return_scaler: bool = False, # True: return both: return df, scaler
7220
7463
  **kwargs,
7221
7464
  ):
7222
7465
  """
@@ -7235,34 +7478,56 @@ def df_scaler(
7235
7478
  if verbose:
7236
7479
  print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
7237
7480
  if scaler is None:
7238
- methods = ["standard", "minmax", "robust","maxabs"]
7481
+ methods = ["standard", "minmax", "robust", "maxabs"]
7239
7482
  method = strcmp(method, methods)[0]
7240
7483
  if method == "standard":
7241
7484
  from sklearn.preprocessing import StandardScaler
7485
+
7242
7486
  if verbose:
7243
- print("performs z-score normalization: This will standardize each feature to have a mean of 0 and a standard deviation of 1.")
7244
- print("Use when the data is approximately normally distributed (Gaussian).\nWorks well with algorithms sensitive to feature distribution, such as SVMs, linear regression, logistic regression, and neural networks.")
7487
+ print(
7488
+ "performs z-score normalization: This will standardize each feature to have a mean of 0 and a standard deviation of 1."
7489
+ )
7490
+ print(
7491
+ "Use when the data is approximately normally distributed (Gaussian).\nWorks well with algorithms sensitive to feature distribution, such as SVMs, linear regression, logistic regression, and neural networks."
7492
+ )
7245
7493
  scaler = StandardScaler(**kwargs)
7246
7494
  elif method == "minmax":
7247
7495
  from sklearn.preprocessing import MinMaxScaler
7496
+
7248
7497
  if feature_range is None:
7249
- feature_range=(vmin,vmax)
7498
+ feature_range = (vmin, vmax)
7250
7499
  if verbose:
7251
- print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
7252
- print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
7253
- print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
7254
- scaler = MinMaxScaler(feature_range=feature_range,**kwargs)
7500
+ print(
7501
+ "don't forget to define the range: e.g., 'feature_range=(0, 1)'. "
7502
+ )
7503
+ print(
7504
+ "scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1]."
7505
+ )
7506
+ print(
7507
+ "Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks."
7508
+ )
7509
+ scaler = MinMaxScaler(feature_range=feature_range, **kwargs)
7255
7510
  elif method == "robust":
7256
7511
  from sklearn.preprocessing import RobustScaler
7512
+
7257
7513
  if verbose:
7258
- print("scales the data based on the median and interquartile range, which is robust to outliers.")
7259
- print("Use when the dataset contains outliers.\nThis method is useful because it scales based on the median and the interquartile range (IQR), which are more robust to outliers than the mean and standard deviation.")
7514
+ print(
7515
+ "scales the data based on the median and interquartile range, which is robust to outliers."
7516
+ )
7517
+ print(
7518
+ "Use when the dataset contains outliers.\nThis method is useful because it scales based on the median and the interquartile range (IQR), which are more robust to outliers than the mean and standard deviation."
7519
+ )
7260
7520
  scaler = RobustScaler(**kwargs)
7261
- elif method=="maxabs":
7521
+ elif method == "maxabs":
7262
7522
  from sklearn.preprocessing import MaxAbsScaler
7523
+
7263
7524
  if verbose:
7264
- print("This scales each feature by its maximum absolute value, resulting in values within the range [-1, 1] for each feature.")
7265
- print("Use for data that is already sparse or when features have positive or negative values that need scaling without shifting the data.\nOften used with sparse data (data with many zeros), where preserving zero entries is essential, such as in text data or recommendation systems.")
7525
+ print(
7526
+ "This scales each feature by its maximum absolute value, resulting in values within the range [-1, 1] for each feature."
7527
+ )
7528
+ print(
7529
+ "Use for data that is already sparse or when features have positive or negative values that need scaling without shifting the data.\nOften used with sparse data (data with many zeros), where preserving zero entries is essential, such as in text data or recommendation systems."
7530
+ )
7266
7531
  scaler = MaxAbsScaler(**kwargs)
7267
7532
  if axis not in [0, 1]:
7268
7533
  raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
@@ -7275,7 +7540,7 @@ def df_scaler(
7275
7540
  non_numeric_columns = data.columns.difference(columns)
7276
7541
 
7277
7542
  # scaled_data = scaler.fit_transform(data[columns])
7278
- if scaler is None or not hasattr(scaler, 'mean_'):
7543
+ if scaler is None or not hasattr(scaler, "mean_"):
7279
7544
  scaled_data = scaler.fit_transform(data[columns])
7280
7545
  else:
7281
7546
  scaled_data = scaler.transform(data[columns])
@@ -7293,7 +7558,7 @@ def df_scaler(
7293
7558
  )
7294
7559
  scaled_df = scaled_df[data.columns] # Maintain column order
7295
7560
  if return_scaler:
7296
- return scaled_df,scaler
7561
+ return scaled_df, scaler
7297
7562
  else:
7298
7563
  return scaled_df
7299
7564
 
@@ -7310,7 +7575,11 @@ def df_scaler(
7310
7575
  # scaled_data = scaler.fit_transform(
7311
7576
  # numeric_rows.T
7312
7577
  # ).T # Transpose for scaling and then back
7313
- scaled_data = scaler.fit_transform(numeric_rows.T).T if scaler is None or not hasattr(scaler, 'mean_') else scaler.transform(numeric_rows.T).T
7578
+ scaled_data = (
7579
+ scaler.fit_transform(numeric_rows.T).T
7580
+ if scaler is None or not hasattr(scaler, "mean_")
7581
+ else scaler.transform(numeric_rows.T).T
7582
+ )
7314
7583
 
7315
7584
  if inplace:
7316
7585
  data.loc[numeric_rows.index] = scaled_data
@@ -7319,7 +7588,7 @@ def df_scaler(
7319
7588
  scaled_df = data.copy()
7320
7589
  scaled_df.loc[numeric_rows.index] = scaled_data
7321
7590
  if return_scaler:
7322
- return scaled_df,scaler
7591
+ return scaled_df, scaler
7323
7592
  else:
7324
7593
  return scaled_df
7325
7594
 
@@ -7683,10 +7952,10 @@ def df_reducer(
7683
7952
  hue: str = None, # lda-specific
7684
7953
  scale: bool = True,
7685
7954
  fill_missing: bool = True,
7686
- size=2,# for plot marker size
7687
- markerscale=4,# for plot, legend marker size scale
7688
- edgecolor='none',# for plot,
7689
- legend_loc='best',# for plot,
7955
+ size=2, # for plot marker size
7956
+ markerscale=4, # for plot, legend marker size scale
7957
+ edgecolor="none", # for plot,
7958
+ legend_loc="best", # for plot,
7690
7959
  bbox_to_anchor=None,
7691
7960
  ncols=1,
7692
7961
  debug: bool = False,
@@ -7719,7 +7988,7 @@ def df_reducer(
7719
7988
  "autoencoder": "Autoencoder:\n\tA neural network-based approach for complex feature learning and non-linear dimensionality reduction. Advantage: Can capture very complex relationships. Limitation: Computationally expensive, requires neural network expertise for effective tuning.",
7720
7989
  "nmf": "Non-negative Matrix Factorization:\n\tEffective for parts-based decomposition, commonly used for sparse and non-negative data, e.g., text data or images. Advantage: Interpretability with non-negativity, efficient with sparse data. Limitation: Less effective for negative or zero-centered data.",
7721
7990
  "umap_hdbscan": "UMAP + HDBSCAN:\n\tCombination of UMAP for dimensionality reduction and HDBSCAN for density-based clustering, suitable for cluster discovery in high-dimensional data. Advantage: Effective in discovering clusters in embeddings. Limitation: Requires careful tuning of both UMAP and HDBSCAN parameters.",
7722
- "manifold_learning": "Manifold Learning (Isomap, Hessian LLE, etc.):\n\tMethods designed to capture intrinsic geometrical structure. Advantage: Preserves non-linear relationships in low dimensions. Limitation: Computationally expensive and sensitive to noise."
7991
+ "manifold_learning": "Manifold Learning (Isomap, Hessian LLE, etc.):\n\tMethods designed to capture intrinsic geometrical structure. Advantage: Preserves non-linear relationships in low dimensions. Limitation: Computationally expensive and sensitive to noise.",
7723
7992
  }
7724
7993
 
7725
7994
  from sklearn.preprocessing import StandardScaler
@@ -7730,14 +7999,27 @@ def df_reducer(
7730
7999
  import seaborn as sns
7731
8000
  # Check valid method input
7732
8001
  methods = [
7733
- "pca", "umap", "umap_hdbscan", "tsne", "factor", "isolation_forest","manifold_learning", "lda", "kpca", "ica",
7734
- "mds", "lle", "svd", "truncated_svd", "spectral_embedding",
8002
+ "pca",
8003
+ "umap",
8004
+ "umap_hdbscan",
8005
+ "tsne",
8006
+ "factor",
8007
+ "isolation_forest",
8008
+ "manifold_learning",
8009
+ "lda",
8010
+ "kpca",
8011
+ "ica",
8012
+ "mds",
8013
+ "lle",
8014
+ "svd",
8015
+ "truncated_svd",
8016
+ "spectral_embedding",
7735
8017
  # "autoencoder","nmf",
7736
8018
  ]
7737
8019
  method = strcmp(method, methods)[0]
7738
8020
  if run_once_within(reverse=True):
7739
8021
  print(f"support methods:{methods}")
7740
-
8022
+
7741
8023
  if verbose:
7742
8024
  print(f"\nprocessing with using {dict_methods[method]}:")
7743
8025
  xlabel, ylabel = None, None
@@ -8050,8 +8332,9 @@ def df_reducer(
8050
8332
  svd_df[hue] = y
8051
8333
  if debug:
8052
8334
  print("Singular Value Decomposition (SVD) completed.")
8053
- elif method=="truncated_svd":
8335
+ elif method == "truncated_svd":
8054
8336
  from sklearn.decomposition import TruncatedSVD
8337
+
8055
8338
  svd = TruncatedSVD(n_components=n_components, random_state=random_state)
8056
8339
  X_reduced = svd.fit_transform(X)
8057
8340
  reduced_df = pd.DataFrame(
@@ -8070,7 +8353,9 @@ def df_reducer(
8070
8353
  elif method == "spectral_embedding":
8071
8354
  from sklearn.manifold import SpectralEmbedding
8072
8355
 
8073
- spectral = SpectralEmbedding(n_components=n_components, random_state=random_state)
8356
+ spectral = SpectralEmbedding(
8357
+ n_components=n_components, random_state=random_state
8358
+ )
8074
8359
  X_reduced = spectral.fit_transform(X)
8075
8360
  reduced_df = pd.DataFrame(
8076
8361
  X_reduced,
@@ -8168,7 +8453,7 @@ def df_reducer(
8168
8453
  print("Manifold Learning (Isomap) completed.")
8169
8454
  if hue:
8170
8455
  reduced_df[hue] = y
8171
-
8456
+
8172
8457
  #! Return reduced data and info as a new DataFrame with the same index
8173
8458
  if method == "pca":
8174
8459
  reduced_df = pca_df
@@ -8225,7 +8510,8 @@ def df_reducer(
8225
8510
  colname_met = "SVD_"
8226
8511
  # Quick plots
8227
8512
  if plot_ and (not method in ["isolation_forest"]):
8228
- from .plot import plotxy,figsets,get_color
8513
+ from .plot import plotxy, figsets, get_color
8514
+
8229
8515
  # if ax is None:
8230
8516
  # if figsize is None:
8231
8517
  # _, ax = plt.subplots(figsize=cm2inch(8, 8))
@@ -8235,9 +8521,9 @@ def df_reducer(
8235
8521
  # ax = ax.cla()
8236
8522
  xlabel = f"{colname_met}1" if xlabel is None else xlabel
8237
8523
  ylabel = f"{colname_met}2" if ylabel is None else ylabel
8238
- palette=get_color(len(flatten(data[hue],verbose=0)))
8524
+ palette = get_color(len(flatten(data[hue], verbose=0)))
8239
8525
 
8240
- reduced_df=reduced_df.sort_values(by=hue)
8526
+ reduced_df = reduced_df.sort_values(by=hue)
8241
8527
  print(flatten(reduced_df[hue]))
8242
8528
  ax = plotxy(
8243
8529
  data=reduced_df,
@@ -8247,24 +8533,31 @@ def df_reducer(
8247
8533
  palette=palette,
8248
8534
  # size=size,
8249
8535
  edgecolor=edgecolor,
8250
- kind_=["joint",
8251
- # "kde",
8252
- "ell",
8253
- ],
8536
+ kind_=[
8537
+ "joint",
8538
+ # "kde",
8539
+ "ell",
8540
+ ],
8254
8541
  kws_kde=dict(
8255
- hue=hue,
8256
- levels=2,
8257
- common_norm=False,
8258
- fill=True,
8259
- alpha=0.05,
8260
- ),
8261
- kws_joint=dict(kind='scatter',joint_kws=dict(s=size)),
8262
- kws_ellipse=dict(alpha=0.1,lw=1,label=None),
8542
+ hue=hue,
8543
+ levels=2,
8544
+ common_norm=False,
8545
+ fill=True,
8546
+ alpha=0.05,
8547
+ ),
8548
+ kws_joint=dict(kind="scatter", joint_kws=dict(s=size)),
8549
+ kws_ellipse=dict(alpha=0.1, lw=1, label=None),
8263
8550
  verbose=False,
8264
8551
  **kwargs,
8265
8552
  )
8266
8553
  figsets(
8267
- legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
8554
+ legend=dict(
8555
+ loc=legend_loc,
8556
+ markerscale=markerscale,
8557
+ bbox_to_anchor=bbox_to_anchor,
8558
+ ncols=ncols,
8559
+ fontsize=8,
8560
+ ),
8268
8561
  xlabel=xlabel if xlabel else None,
8269
8562
  ylabel=ylabel if ylabel else None,
8270
8563
  )
@@ -8297,6 +8590,7 @@ def df_reducer(
8297
8590
  # example:
8298
8591
  # df_reducer(data=data_log, columns=markers, n_components=2)
8299
8592
 
8593
+
8300
8594
  def get_df_format(data, threshold_unique=0.5, verbose=False):
8301
8595
  """
8302
8596
  检测表格: long, wide or uncertain.
@@ -8396,7 +8690,9 @@ def get_df_format(data, threshold_unique=0.5, verbose=False):
8396
8690
  if cluster_labels.nunique() < len(numeric_cols) * 0.5:
8397
8691
  wide_score += 2
8398
8692
  if verbose:
8399
- print("Clustering on columns shows grouping, suggesting wide format.")
8693
+ print(
8694
+ "Clustering on columns shows grouping, suggesting wide format."
8695
+ )
8400
8696
  except Exception as e:
8401
8697
  print(e) if verbose else None
8402
8698
 
@@ -8487,7 +8783,8 @@ def get_df_format(data, threshold_unique=0.5, verbose=False):
8487
8783
  if verbose:
8488
8784
  print("Final decision: Uncertain format.")
8489
8785
  return "uncertain"
8490
-
8786
+
8787
+
8491
8788
  def plot_cluster(
8492
8789
  data: pd.DataFrame,
8493
8790
  labels: np.ndarray,
@@ -8735,6 +9032,8 @@ def evaluate_cluster(
8735
9032
  metrics["V-Measure"] = np.nan
8736
9033
 
8737
9034
  return metrics
9035
+
9036
+
8738
9037
  def df_qc(
8739
9038
  data: pd.DataFrame,
8740
9039
  columns=None,
@@ -8744,7 +9043,7 @@ def df_qc(
8744
9043
  hue=None,
8745
9044
  output=False,
8746
9045
  verbose=True,
8747
- dir_save=None
9046
+ dir_save=None,
8748
9047
  ):
8749
9048
  """
8750
9049
  Usage example:
@@ -8752,16 +9051,17 @@ def df_qc(
8752
9051
  """
8753
9052
  from statsmodels.stats.outliers_influence import variance_inflation_factor
8754
9053
  from scipy.stats import skew, kurtosis, entropy
8755
-
9054
+
8756
9055
  pd.options.display.max_seq_items = 10
8757
9056
  #! display(data.select_dtypes(include=[np.number]).describe())
8758
9057
  #!skim
8759
9058
  if columns is not None:
8760
- if isinstance(columns, (list,pd.core.indexes.base.Index)):
8761
- data=data[columns]
9059
+ if isinstance(columns, (list, pd.core.indexes.base.Index)):
9060
+ data = data[columns]
8762
9061
  if skim:
8763
9062
  try:
8764
- import skimpy
9063
+ import skimpy
9064
+
8765
9065
  skimpy.skim(data)
8766
9066
  except:
8767
9067
  numerical_data = data.select_dtypes(include=[np.number])
@@ -8775,13 +9075,19 @@ def df_qc(
8775
9075
 
8776
9076
  # Missing values
8777
9077
  res_qc["missing_values"] = data.isnull().sum()
8778
- res_qc["missing_percentage"] = round((res_qc["missing_values"] / len(data)) * 100,2)
9078
+ res_qc["missing_percentage"] = round(
9079
+ (res_qc["missing_values"] / len(data)) * 100, 2
9080
+ )
8779
9081
  res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
8780
9082
 
8781
9083
  # Data types and unique values
8782
9084
  res_qc["data_types"] = data.dtypes
8783
- res_qc["unique_counts"] = data.select_dtypes(exclude=np.number).nunique().sort_values()
8784
- res_qc["unique_values"] = data.select_dtypes(exclude=np.number).apply(lambda x: x.unique())
9085
+ res_qc["unique_counts"] = (
9086
+ data.select_dtypes(exclude=np.number).nunique().sort_values()
9087
+ )
9088
+ res_qc["unique_values"] = data.select_dtypes(exclude=np.number).apply(
9089
+ lambda x: x.unique()
9090
+ )
8785
9091
  res_qc["constant_columns"] = [
8786
9092
  col for col in data.columns if data[col].nunique() <= 1
8787
9093
  ]
@@ -8797,8 +9103,8 @@ def df_qc(
8797
9103
  data_outliers = df_outlier(data)
8798
9104
  outlier_num = data_outliers.isna().sum() - data.isnull().sum()
8799
9105
  res_qc["outlier_num"] = outlier_num[outlier_num > 0]
8800
- outlier_percentage=round((outlier_num / len(data_outliers)) * 100,2)
8801
- res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
9106
+ outlier_percentage = round((outlier_num / len(data_outliers)) * 100, 2)
9107
+ res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage > 0]
8802
9108
  try:
8803
9109
  # Correlation and multicollinearity (VIF)
8804
9110
  if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
@@ -8816,16 +9122,16 @@ def df_qc(
8816
9122
  numeric_df = data.select_dtypes(include=[np.number]).dropna()
8817
9123
  if isinstance(numeric_df.columns, pd.MultiIndex):
8818
9124
  numeric_df.columns = [
8819
- "_".join(col).strip() if isinstance(col, tuple) else col for col in numeric_df.columns
9125
+ "_".join(col).strip() if isinstance(col, tuple) else col
9126
+ for col in numeric_df.columns
8820
9127
  ]
8821
9128
 
8822
-
8823
9129
  vif_data = pd.DataFrame()
8824
- res_qc["vif"]=vif_data
9130
+ res_qc["vif"] = vif_data
8825
9131
  if numeric_df.shape[1] > 1 and not numeric_df.empty:
8826
9132
  vif_data["feature"] = numeric_df.columns.tolist()
8827
9133
  vif_data["VIF"] = [
8828
- round(variance_inflation_factor(numeric_df.values, i),2)
9134
+ round(variance_inflation_factor(numeric_df.values, i), 2)
8829
9135
  for i in range(numeric_df.shape[1])
8830
9136
  ]
8831
9137
  res_qc["vif"] = vif_data[
@@ -8847,8 +9153,8 @@ def df_qc(
8847
9153
  }
8848
9154
 
8849
9155
  # dtypes counts
8850
- res_qc['dtype_counts']=data.dtypes.value_counts()
8851
-
9156
+ res_qc["dtype_counts"] = data.dtypes.value_counts()
9157
+
8852
9158
  # Distribution Analysis (mean, median, mode, std dev, IQR for numeric columns)
8853
9159
  distribution_stats = data.select_dtypes(include=[np.number]).describe().T
8854
9160
  iqr = data.select_dtypes(include=[np.number]).apply(
@@ -8880,7 +9186,6 @@ def df_qc(
8880
9186
  if len(unique_types) > 1:
8881
9187
  inconsistent_types[col] = unique_types
8882
9188
  res_qc["inconsistent_types"] = inconsistent_types
8883
-
8884
9189
 
8885
9190
  # Text length analysis for text fields
8886
9191
  text_lengths = {}
@@ -8892,7 +9197,9 @@ def df_qc(
8892
9197
  res_qc["text_length_analysis"] = text_lengths
8893
9198
 
8894
9199
  # Summary statistics
8895
- res_qc["summary_statistics"] = data.describe().T.style.background_gradient(cmap='coolwarm', axis=0)
9200
+ res_qc["summary_statistics"] = data.describe().T.style.background_gradient(
9201
+ cmap="coolwarm", axis=0
9202
+ )
8896
9203
 
8897
9204
  # Automated warnings
8898
9205
  warnings = []
@@ -8920,39 +9227,60 @@ def df_qc(
8920
9227
  display(res_qc["data_types"])
8921
9228
  if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
8922
9229
  print(" ⤵ Missing Values Counts:")
8923
- display(pd.DataFrame(
8924
- {
8925
- "missing_values": res_qc["missing_values"][res_qc["missing_values"] > 0],
8926
- "missing_percent(%)": res_qc["missing_percentage"][
8927
- res_qc["missing_percentage"] > 0
8928
- ],
8929
- }
8930
- ).style.background_gradient(cmap="coolwarm", axis=0)
8931
- )
9230
+ display(
9231
+ pd.DataFrame(
9232
+ {
9233
+ "missing_values": res_qc["missing_values"][
9234
+ res_qc["missing_values"] > 0
9235
+ ],
9236
+ "missing_percent(%)": res_qc["missing_percentage"][
9237
+ res_qc["missing_percentage"] > 0
9238
+ ],
9239
+ }
9240
+ ).style.background_gradient(cmap="coolwarm", axis=0)
9241
+ )
8932
9242
  # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
8933
- print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
9243
+ print("\n⤵ Rows with Missing Values:", res_qc["rows_with_missing"])
9244
+
9245
+ (
9246
+ print("\n⤵ Constant Columns:", res_qc["constant_columns"])
9247
+ if any(res_qc["constant_columns"])
9248
+ else None
9249
+ )
9250
+ (
9251
+ print("⤵ Duplicate Rows:", res_qc["duplicate_rows"])
9252
+ if res_qc["duplicate_rows"]
9253
+ else None
9254
+ )
9255
+ (
9256
+ print("⤵ Duplicate Columns:", res_qc["duplicate_columns"])
9257
+ if any(res_qc["duplicate_columns"])
9258
+ else None
9259
+ )
8934
9260
 
8935
- print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
8936
- print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
8937
- print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
8938
-
8939
9261
  if any(res_qc["outlier_num"]):
8940
9262
  print("\n⤵ Outlier Report:")
8941
- display(pd.DataFrame(
8942
- {
8943
- "outlier_num": res_qc["outlier_num"][res_qc["outlier_num"] > 0],
8944
- "outlier_percentage(%)": res_qc["outlier_percentage"][
8945
- res_qc["outlier_percentage"] > 0
8946
- ],
8947
- }
8948
- ).style.background_gradient(cmap="coolwarm", axis=0)
8949
- )
9263
+ display(
9264
+ pd.DataFrame(
9265
+ {
9266
+ "outlier_num": res_qc["outlier_num"][res_qc["outlier_num"] > 0],
9267
+ "outlier_percentage(%)": res_qc["outlier_percentage"][
9268
+ res_qc["outlier_percentage"] > 0
9269
+ ],
9270
+ }
9271
+ ).style.background_gradient(cmap="coolwarm", axis=0)
9272
+ )
8950
9273
 
8951
9274
  if any(res_qc["unique_counts"]):
8952
9275
  print("\n⤵ Unique Values per Column:")
8953
- display(pd.DataFrame({"unique_counts":res_qc["unique_counts"],
8954
- "unique_values":res_qc["unique_values"]}).style.background_gradient(cmap="coolwarm", axis=0))
8955
-
9276
+ display(
9277
+ pd.DataFrame(
9278
+ {
9279
+ "unique_counts": res_qc["unique_counts"],
9280
+ "unique_values": res_qc["unique_values"],
9281
+ }
9282
+ ).style.background_gradient(cmap="coolwarm", axis=0)
9283
+ )
8956
9284
 
8957
9285
  if res_qc["empty_columns"]:
8958
9286
  print("\n⤵ Empty Columns:", res_qc["empty_columns"])
@@ -8971,7 +9299,7 @@ def df_qc(
8971
9299
  print(res_qc["high_cardinality_categoricals"])
8972
9300
  if any(res_qc["inconsistent_types"]):
8973
9301
  print("\n⤵ Inconsistent Data Types:")
8974
- display(res_qc["inconsistent_types"])
9302
+ display(res_qc["inconsistent_types"])
8975
9303
  if any(res_qc["text_length_analysis"]):
8976
9304
  print("\n⤵ Text Length Analysis:")
8977
9305
  for col, stats in res_qc["text_length_analysis"].items():
@@ -8986,67 +9314,93 @@ def df_qc(
8986
9314
 
8987
9315
  pd.reset_option("display.max_seq_items")
8988
9316
  if plot_:
8989
- df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue,dir_save=dir_save)
9317
+ df_qc_plots(
9318
+ data=data, res_qc=res_qc, max_cols=max_cols, hue=hue, dir_save=dir_save
9319
+ )
8990
9320
  if output or not plot_:
8991
9321
  return res_qc
8992
9322
  return None
8993
9323
 
8994
9324
 
8995
- def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None,dir_save=None):
9325
+ def df_qc_plots(
9326
+ data: pd.DataFrame,
9327
+ columns=None,
9328
+ res_qc: dict = None,
9329
+ max_cols=20,
9330
+ hue=None,
9331
+ dir_save=None,
9332
+ ):
8996
9333
  import matplotlib.pyplot as plt
8997
9334
  import seaborn as sns
8998
9335
  from .plot import subplot, figsets, get_color
8999
9336
  from datetime import datetime
9337
+
9000
9338
  now_ = datetime.now().strftime("%y%m%d_%H%M%S")
9001
-
9339
+
9002
9340
  if columns is not None:
9003
- if isinstance(columns, (list,pd.core.indexes.base.Index)):
9004
- data=data[columns]
9341
+ if isinstance(columns, (list, pd.core.indexes.base.Index)):
9342
+ data = data[columns]
9005
9343
  len_total = len(res_qc)
9006
9344
  n_row, n_col = int((len_total + 10)), 3
9007
- nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
9345
+ nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row], verbose=False)
9008
9346
 
9009
9347
  missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
9010
9348
  ascending=False
9011
9349
  )
9012
9350
  if len(missing_data) > max_cols:
9013
9351
  missing_data = missing_data[:max_cols]
9014
- ax_missing_data=sns.barplot(
9352
+ ax_missing_data = sns.barplot(
9015
9353
  y=missing_data.index,
9016
9354
  x=missing_data.values,
9017
9355
  hue=missing_data.index,
9018
9356
  palette=get_color(len(missing_data), cmap="coolwarm")[::-1],
9019
9357
  ax=nexttile(),
9020
9358
  )
9021
- figsets(title="Missing (#)", xlabel="#",ax=ax_missing_data,ylabel=None,fontsize=8 if len(missing_data)<=20 else 6)
9359
+ figsets(
9360
+ title="Missing (#)",
9361
+ xlabel="#",
9362
+ ax=ax_missing_data,
9363
+ ylabel=None,
9364
+ fontsize=8 if len(missing_data) <= 20 else 6,
9365
+ )
9022
9366
 
9023
9367
  outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
9024
9368
  if len(outlier_num) > max_cols:
9025
9369
  outlier_num = outlier_num[:max_cols]
9026
- ax_outlier_num=sns.barplot(
9370
+ ax_outlier_num = sns.barplot(
9027
9371
  y=outlier_num.index,
9028
9372
  x=outlier_num.values,
9029
- hue=outlier_num.index,
9373
+ hue=outlier_num.index,
9030
9374
  palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
9031
9375
  ax=nexttile(),
9032
9376
  )
9033
- figsets(ax=ax_outlier_num,title="Outliers (#)", xlabel="#",ylabel=None,fontsize=8 if len(outlier_num)<=20 else 6)
9034
-
9377
+ figsets(
9378
+ ax=ax_outlier_num,
9379
+ title="Outliers (#)",
9380
+ xlabel="#",
9381
+ ylabel=None,
9382
+ fontsize=8 if len(outlier_num) <= 20 else 6,
9383
+ )
9384
+
9035
9385
  #!
9036
9386
  try:
9037
- for col in data.select_dtypes(include='category').columns:
9038
- sns.countplot(y=data[col],
9039
- palette=get_color(data.select_dtypes(include='category').shape[1], cmap="coolwarm")[::-1],
9040
- ax=nexttile())
9387
+ for col in data.select_dtypes(include="category").columns:
9388
+ sns.countplot(
9389
+ y=data[col],
9390
+ palette=get_color(
9391
+ data.select_dtypes(include="category").shape[1], cmap="coolwarm"
9392
+ )[::-1],
9393
+ ax=nexttile(),
9394
+ )
9041
9395
  figsets(title=f"Count Plot: {col}", xlabel="Count", ylabel=col)
9042
9396
  except Exception as e:
9043
- pass
9397
+ pass
9044
9398
 
9045
9399
  # Skewness and Kurtosis Plots
9046
9400
  skewness = res_qc["skewness"].sort_values(ascending=False)
9047
9401
  kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
9048
9402
  if not skewness.empty:
9049
- ax_skewness=sns.barplot(
9403
+ ax_skewness = sns.barplot(
9050
9404
  y=skewness.index,
9051
9405
  x=skewness.values,
9052
9406
  hue=skewness.index,
@@ -9055,11 +9409,13 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
9055
9409
  )
9056
9410
  figsets(
9057
9411
  title="Highly Skewed Numeric Columns (Skewness > 1)",
9058
- xlabel="Skewness",ylabel=None,ax=ax_skewness,
9059
- fontsize=8 if len(skewness)<=20 else 6
9412
+ xlabel="Skewness",
9413
+ ylabel=None,
9414
+ ax=ax_skewness,
9415
+ fontsize=8 if len(skewness) <= 20 else 6,
9060
9416
  )
9061
9417
  if not kurtosis.empty:
9062
- ax_kurtosis=sns.barplot(
9418
+ ax_kurtosis = sns.barplot(
9063
9419
  y=kurtosis.index,
9064
9420
  x=kurtosis.values,
9065
9421
  hue=kurtosis.index,
@@ -9068,59 +9424,68 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
9068
9424
  )
9069
9425
  figsets(
9070
9426
  title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
9071
- xlabel="Kurtosis",ylabel=None,ax=ax_kurtosis,
9072
- fontsize=8 if len(kurtosis)<=20 else 6
9427
+ xlabel="Kurtosis",
9428
+ ylabel=None,
9429
+ ax=ax_kurtosis,
9430
+ fontsize=8 if len(kurtosis) <= 20 else 6,
9073
9431
  )
9074
9432
 
9075
9433
  # Entropy for Categorical Variables
9076
9434
  entropy_data = pd.Series(res_qc["entropy_categoricals"]).sort_values(
9077
9435
  ascending=False
9078
9436
  )
9079
- ax_entropy_data=sns.barplot(
9080
- y=entropy_data.index, x=entropy_data.values,hue=entropy_data.index,
9437
+ ax_entropy_data = sns.barplot(
9438
+ y=entropy_data.index,
9439
+ x=entropy_data.values,
9440
+ hue=entropy_data.index,
9081
9441
  palette=get_color(len(entropy_data), cmap="coolwarm")[::-1],
9082
- ax=nexttile()
9083
- )
9442
+ ax=nexttile(),
9443
+ )
9084
9444
  figsets(
9085
- ylabel="Categorical Columns",
9086
- title="Entropy of Categorical Variables",
9087
- xlabel="Entropy (bits)",
9088
- ax=ax_entropy_data,
9089
- fontsize=8 if len(entropy_data)<=20 else 6
9090
- )
9445
+ ylabel="Categorical Columns",
9446
+ title="Entropy of Categorical Variables",
9447
+ xlabel="Entropy (bits)",
9448
+ ax=ax_entropy_data,
9449
+ fontsize=8 if len(entropy_data) <= 20 else 6,
9450
+ )
9091
9451
 
9092
9452
  # unique counts
9093
- unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
9094
- ax_unique_counts_=sns.barplot(
9095
- y=unique_counts.index,
9096
- x=unique_counts.values,
9097
- hue=unique_counts.index,
9098
- palette=get_color(len(unique_counts), cmap="coolwarm")[::-1],
9099
- ax=nexttile())
9453
+ unique_counts = res_qc["unique_counts"].sort_values(ascending=False)
9454
+ ax_unique_counts_ = sns.barplot(
9455
+ y=unique_counts.index,
9456
+ x=unique_counts.values,
9457
+ hue=unique_counts.index,
9458
+ palette=get_color(len(unique_counts), cmap="coolwarm")[::-1],
9459
+ ax=nexttile(),
9460
+ )
9100
9461
  figsets(
9101
- title="Unique Counts",
9102
- ylabel=None,
9103
- xlabel="#",
9104
- ax=ax_unique_counts_,
9105
- fontsize=8 if len(unique_counts)<=20 else 6
9106
- )
9462
+ title="Unique Counts",
9463
+ ylabel=None,
9464
+ xlabel="#",
9465
+ ax=ax_unique_counts_,
9466
+ fontsize=8 if len(unique_counts) <= 20 else 6,
9467
+ )
9107
9468
  # Binary Checking
9108
- ax_unique_counts=sns.barplot(y=unique_counts[unique_counts<8].index,
9109
- x=unique_counts[unique_counts<8].values,
9110
- hue=unique_counts[unique_counts<8].index,
9111
- palette=get_color(len(unique_counts[unique_counts<8].index), cmap="coolwarm")[::-1],
9112
- ax=nexttile())
9469
+ ax_unique_counts = sns.barplot(
9470
+ y=unique_counts[unique_counts < 8].index,
9471
+ x=unique_counts[unique_counts < 8].values,
9472
+ hue=unique_counts[unique_counts < 8].index,
9473
+ palette=get_color(len(unique_counts[unique_counts < 8].index), cmap="coolwarm")[
9474
+ ::-1
9475
+ ],
9476
+ ax=nexttile(),
9477
+ )
9113
9478
  plt.axvline(x=2, color="r", linestyle="--", lw=2)
9114
9479
  figsets(
9115
- ylabel=None,
9116
- title="Binary Checking",
9117
- xlabel="#",
9118
- ax=ax_unique_counts,
9119
- fontsize=8 if len(unique_counts[unique_counts<10].index)<=20 else 6
9120
- )
9480
+ ylabel=None,
9481
+ title="Binary Checking",
9482
+ xlabel="#",
9483
+ ax=ax_unique_counts,
9484
+ fontsize=8 if len(unique_counts[unique_counts < 10].index) <= 20 else 6,
9485
+ )
9121
9486
 
9122
9487
  # dtypes counts
9123
- dtype_counts = res_qc['dtype_counts']
9488
+ dtype_counts = res_qc["dtype_counts"]
9124
9489
  txt = []
9125
9490
  for tp in dtype_counts.index:
9126
9491
  txt.append(list(data.select_dtypes(include=tp).columns))
@@ -9131,9 +9496,9 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
9131
9496
  color="#F3C8B2",
9132
9497
  ax=nexttile(),
9133
9498
  )
9134
- max_columns_per_row = 1 # Maximum number of columns per row
9499
+ max_columns_per_row = 1 # Maximum number of columns per row
9135
9500
  for i, tp in enumerate(dtype_counts.index):
9136
- if i<=20:
9501
+ if i <= 20:
9137
9502
  column_names = txt[i]
9138
9503
  # Split the column names into multiple lines if too long
9139
9504
  column_name_str = ", ".join(column_names)
@@ -9152,7 +9517,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
9152
9517
  ha="center",
9153
9518
  va="top",
9154
9519
  c="k",
9155
- fontsize=8 if len(dtype_counts.index)<=20 else 6,
9520
+ fontsize=8 if len(dtype_counts.index) <= 20 else 6,
9156
9521
  rotation=0,
9157
9522
  )
9158
9523
  figsets(
@@ -9160,7 +9525,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
9160
9525
  title="Dtypes",
9161
9526
  ylabel="#",
9162
9527
  ax=ax_dtype_counts,
9163
- fontsize=8 if len(dtype_counts.index)<=20 else 6,
9528
+ fontsize=8 if len(dtype_counts.index) <= 20 else 6,
9164
9529
  )
9165
9530
  # from .plot import pie
9166
9531
  # pie()
@@ -9175,57 +9540,66 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
9175
9540
  )
9176
9541
 
9177
9542
  if high_cardinality:
9178
- ax_high_cardinality=sns.barplot(
9543
+ ax_high_cardinality = sns.barplot(
9179
9544
  y=list(high_cardinality.keys()),
9180
9545
  x=list(high_cardinality.values()),
9181
9546
  hue=list(high_cardinality.keys()),
9182
- palette=get_color(len(list(high_cardinality.keys())), cmap="coolwarm")[::-1],
9547
+ palette=get_color(len(list(high_cardinality.keys())), cmap="coolwarm")[
9548
+ ::-1
9549
+ ],
9183
9550
  ax=nexttile(),
9184
9551
  )
9185
9552
  figsets(
9186
9553
  title="High Cardinality Categorical Columns",
9187
9554
  xlabel="Unique Value Count",
9188
9555
  ax=ax_high_cardinality,
9189
- fontsize=8 if len(list(high_cardinality.keys()))<=20 else 6
9556
+ fontsize=8 if len(list(high_cardinality.keys())) <= 20 else 6,
9190
9557
  )
9191
9558
  if res_qc["low_variance_features"]:
9192
9559
  low_variance_data = data[res_qc["low_variance_features"]].copy()
9193
9560
  for col in low_variance_data.columns:
9194
- ax_low_variance_features=sns.histplot(
9561
+ ax_low_variance_features = sns.histplot(
9195
9562
  low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
9196
9563
  )
9197
- figsets(title=f"Low Variance Feature: {col}",ax=ax_low_variance_features,
9198
- fontsize=8 if len(low_variance_data[col])<=20 else 6)
9564
+ figsets(
9565
+ title=f"Low Variance Feature: {col}",
9566
+ ax=ax_low_variance_features,
9567
+ fontsize=8 if len(low_variance_data[col]) <= 20 else 6,
9568
+ )
9199
9569
 
9200
9570
  # VIF plot for multicollinearity detection
9201
9571
  if "vif" in res_qc and not res_qc["vif"].empty:
9202
9572
  vif_data = res_qc["vif"].sort_values(by="VIF", ascending=False)
9203
9573
  if len(vif_data) > max_cols:
9204
9574
  vif_data = vif_data[:max_cols]
9205
- ax_vif=sns.barplot(data=vif_data,
9206
- x="VIF",
9207
- y="feature",
9208
- hue="VIF",
9209
- palette=get_color(len(vif_data), cmap="coolwarm")[::-1],
9210
- ax=nexttile())
9575
+ ax_vif = sns.barplot(
9576
+ data=vif_data,
9577
+ x="VIF",
9578
+ y="feature",
9579
+ hue="VIF",
9580
+ palette=get_color(len(vif_data), cmap="coolwarm")[::-1],
9581
+ ax=nexttile(),
9582
+ )
9211
9583
  figsets(
9212
9584
  title="Variance Inflation Factor(VIF)",
9213
9585
  xlabel="VIF",
9214
9586
  ylabel="Features",
9215
9587
  legend=None,
9216
9588
  ax=ax_vif,
9217
- fontsize=8 if len(vif_data)<=20 else 6
9589
+ fontsize=8 if len(vif_data) <= 20 else 6,
9218
9590
  )
9219
9591
 
9220
9592
  # Correlation heatmap for numeric columns with high correlation pairs
9221
9593
  if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
9222
9594
  corr = data.select_dtypes(include=[np.number]).corr()
9223
- if corr.shape[1]<=33:
9595
+ if corr.shape[1] <= 33:
9224
9596
  mask = np.triu(np.ones_like(corr, dtype=bool))
9225
9597
  num_columns = corr.shape[1]
9226
- fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2)) # Scale between 8 and 12
9598
+ fontsize = max(
9599
+ 6, min(12, 12 - (num_columns - 10) * 0.2)
9600
+ ) # Scale between 8 and 12
9227
9601
 
9228
- ax_heatmap=sns.heatmap(
9602
+ ax_heatmap = sns.heatmap(
9229
9603
  corr,
9230
9604
  mask=mask,
9231
9605
  annot=True,
@@ -9233,24 +9607,21 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
9233
9607
  center=0,
9234
9608
  fmt=".1f",
9235
9609
  linewidths=0.5,
9236
- vmin=-1, vmax=1,
9610
+ vmin=-1,
9611
+ vmax=1,
9237
9612
  ax=nexttile(2, 2),
9238
- cbar_kws=dict(shrink=0.2,ticks=np.arange(-1, 2, 1)),
9239
- annot_kws={"size": fontsize}
9240
- )
9241
-
9242
- figsets(
9243
- xangle=45,
9244
- title="Correlation Heatmap",
9245
- ax=ax_heatmap
9613
+ cbar_kws=dict(shrink=0.2, ticks=np.arange(-1, 2, 1)),
9614
+ annot_kws={"size": fontsize},
9246
9615
  )
9616
+
9617
+ figsets(xangle=45, title="Correlation Heatmap", ax=ax_heatmap)
9247
9618
  # # save figure
9248
9619
  # if dir_save:
9249
9620
  # figsave(dir_save,f"qc_plot_{now_}.pdf")
9250
9621
 
9251
9622
  if columns is not None:
9252
- if isinstance(columns, (list,pd.core.indexes.base.Index)):
9253
- data=data[columns]
9623
+ if isinstance(columns, (list, pd.core.indexes.base.Index)):
9624
+ data = data[columns]
9254
9625
 
9255
9626
  # len_total = len(res_qc)
9256
9627
  # n_row, n_col = int((len_total + 10) / 3), 3
@@ -9258,30 +9629,36 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
9258
9629
  #! check distribution
9259
9630
  data_num = data.select_dtypes(include=np.number)
9260
9631
  if len(data_num) > max_cols:
9261
- data_num = data_num.iloc[:,:max_cols]
9632
+ data_num = data_num.iloc[:, :max_cols]
9633
+
9634
+ data_num = df_scaler(data=data_num, method="standard")
9262
9635
 
9263
- data_num = df_scaler(data=data_num, method='standard')
9264
-
9265
9636
  import scipy.stats as stats
9637
+
9266
9638
  for column in data_num.columns:
9267
- #* Shapiro-Wilk test for normality
9639
+ # * Shapiro-Wilk test for normality
9268
9640
  stat, p_value = stats.shapiro(data_num[column])
9269
- normality = "norm" if p_value > 0.05 else "not_norm"
9270
- #* Plot histogram
9271
- ax_hist=sns.histplot(data_num[column], kde=True, ax=nexttile())
9641
+ normality = "norm" if p_value > 0.05 else "not_norm"
9642
+ # * Plot histogram
9643
+ ax_hist = sns.histplot(data_num[column], kde=True, ax=nexttile())
9272
9644
  x_min, x_max = ax_hist.get_xlim()
9273
9645
  y_min, y_max = ax_hist.get_ylim()
9274
- ax_hist.text(x_min+(x_max-x_min)*0.5, y_min+(y_max-y_min)*0.75,
9275
- f'p(Shapiro-Wilk)={p_value:.3f}\n{normality}',
9276
- ha='center', va='top')
9277
- figsets(title=column,ax=ax_hist)
9278
- ax_twin=ax_hist.twinx()
9279
- #* Q-Q plot
9646
+ ax_hist.text(
9647
+ x_min + (x_max - x_min) * 0.5,
9648
+ y_min + (y_max - y_min) * 0.75,
9649
+ f"p(Shapiro-Wilk)={p_value:.3f}\n{normality}",
9650
+ ha="center",
9651
+ va="top",
9652
+ )
9653
+ figsets(title=column, ax=ax_hist)
9654
+ ax_twin = ax_hist.twinx()
9655
+ # * Q-Q plot
9280
9656
  stats.probplot(data_num[column], dist="norm", plot=ax_twin)
9281
- figsets(ylabel=f'Q-Q Plot:{column}',title=None)
9657
+ figsets(ylabel=f"Q-Q Plot:{column}", title=None)
9282
9658
  # save figure
9283
9659
  if dir_save:
9284
- figsave(dir_save,f"qc_plot_{now_}.pdf")
9660
+ figsave(dir_save, f"qc_plot_{now_}.pdf")
9661
+
9285
9662
 
9286
9663
  def df_corr(df: pd.DataFrame, method="pearson"):
9287
9664
  """
@@ -9318,6 +9695,7 @@ def df_corr(df: pd.DataFrame, method="pearson"):
9318
9695
 
9319
9696
  return corr_matrix, pval_matrix
9320
9697
 
9698
+
9321
9699
  def use_pd(
9322
9700
  func_name="excel",
9323
9701
  verbose=True,
@@ -9338,7 +9716,8 @@ def use_pd(
9338
9716
  if verbose:
9339
9717
  print(e)
9340
9718
 
9341
- def get_phone(phone_number: str, region: str = None,verbose=True):
9719
+
9720
+ def get_phone(phone_number: str, region: str = None, verbose=True):
9342
9721
  """
9343
9722
  usage:
9344
9723
  info = get_phone(15237654321, "DE")
@@ -9426,21 +9805,23 @@ def get_phone(phone_number: str, region: str = None,verbose=True):
9426
9805
  dialing_instructions = f"Dial {formatted_national} within {country_name}. Dial {formatted_e164} from abroad."
9427
9806
 
9428
9807
  # Advanced Timezone Handling
9429
- gmt_offsets = pytz.timezone(time_zones).utcoffset(datetime.now()).total_seconds()/ 3600
9808
+ gmt_offsets = (
9809
+ pytz.timezone(time_zones).utcoffset(datetime.now()).total_seconds() / 3600
9810
+ )
9430
9811
  # Get the local timezone (current computer's time)
9431
9812
  local_timezone = get_localzone()
9432
- #local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
9813
+ # local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
9433
9814
  local_offset = local_timezone.utcoffset(datetime.now()).total_seconds() / 3600
9434
9815
  offset_diff = local_offset - gmt_offsets
9435
9816
  head_time = "earlier" if offset_diff < 0 else "later" if offset_diff > 0 else ""
9436
- res= {
9817
+ res = {
9437
9818
  "valid": True,
9438
9819
  "possible": possible,
9439
9820
  "formatted": {
9440
9821
  "international": formatted_international,
9441
9822
  "national": formatted_national,
9442
9823
  "e164": formatted_e164,
9443
- },
9824
+ },
9444
9825
  "country_code": country_code,
9445
9826
  "country_name": country_name,
9446
9827
  "region_code": region_code,
@@ -9448,13 +9829,13 @@ def get_phone(phone_number: str, region: str = None,verbose=True):
9448
9829
  "carrier": carrier_name,
9449
9830
  "time_zone": time_zones,
9450
9831
  "current_times": current_times,
9451
- "local_offset":f"{local_offset} utcoffset",
9832
+ "local_offset": f"{local_offset} utcoffset",
9452
9833
  "time_zone_diff": f"{head_time} {int(np.abs(offset_diff))} h",
9453
9834
  "number_type": number_type_str,
9454
9835
  "is_toll_free": is_toll_free,
9455
- "is_premium_rate": is_premium_rate,
9836
+ "is_premium_rate": is_premium_rate,
9456
9837
  "dialing_instructions": dialing_instructions,
9457
- "suggested_fix": None, # Use phonenumbers.example_number if invalid
9838
+ "suggested_fix": None, # Use phonenumbers.example_number if invalid
9458
9839
  "logs": {
9459
9840
  "number_analysis_completed": datetime.now().strftime(
9460
9841
  "%Y-%m-%d %H:%M:%S"
@@ -9465,7 +9846,7 @@ def get_phone(phone_number: str, region: str = None,verbose=True):
9465
9846
  }
9466
9847
 
9467
9848
  except phonenumbers.NumberParseException as e:
9468
- res= {"valid": False, "error": str(e)}
9849
+ res = {"valid": False, "error": str(e)}
9469
9850
  if verbose:
9470
9851
  preview(res)
9471
9852
  return res
@@ -9531,7 +9912,8 @@ def decode_pluscode(
9531
9912
 
9532
9913
  return latitude, longitude
9533
9914
 
9534
- def get_loc(input_data, user_agent="0413@mygmail.com)",verbose=True):
9915
+
9916
+ def get_loc(input_data, user_agent="0413@mygmail.com)", verbose=True):
9535
9917
  """
9536
9918
  Determine if the input is a city name, lat/lon, or DMS and perform geocoding or reverse geocoding.
9537
9919
  Usage:
@@ -9607,7 +9989,8 @@ def get_loc(input_data, user_agent="0413@mygmail.com)",verbose=True):
9607
9989
  "Invalid input format. Please provide a city name, latitude/longitude, or DMS string."
9608
9990
  )
9609
9991
 
9610
- def enpass(code: str, method: str="AES", key: str = None):
9992
+
9993
+ def enpass(code: str, method: str = "AES", key: str = None):
9611
9994
  """
9612
9995
  usage: enpass("admin")
9613
9996
  Master encryption function that supports multiple methods: AES, RSA, and SHA256.
@@ -9617,6 +10000,7 @@ def enpass(code: str, method: str="AES", key: str = None):
9617
10000
  :return: The encrypted data or hashed value.
9618
10001
  """
9619
10002
  import hashlib
10003
+
9620
10004
  # AES Encryption (Advanced)
9621
10005
  def aes_encrypt(data: str, key: str):
9622
10006
  """
@@ -9630,9 +10014,10 @@ def enpass(code: str, method: str="AES", key: str = None):
9630
10014
  from cryptography.hazmat.primitives import padding
9631
10015
  import base64
9632
10016
  import os
10017
+
9633
10018
  # Generate a 256-bit key from the provided password
9634
10019
  key = hashlib.sha256(key.encode()).digest()
9635
-
10020
+
9636
10021
  # Generate a random initialization vector (IV)
9637
10022
  iv = os.urandom(16) # 16 bytes for AES block size
9638
10023
 
@@ -9659,10 +10044,12 @@ def enpass(code: str, method: str="AES", key: str = None):
9659
10044
  import base64
9660
10045
  from Crypto.PublicKey import RSA
9661
10046
  from Crypto.Cipher import PKCS1_OAEP
10047
+
9662
10048
  public_key_obj = RSA.import_key(public_key)
9663
10049
  cipher_rsa = PKCS1_OAEP.new(public_key_obj)
9664
10050
  encrypted_data = cipher_rsa.encrypt(data.encode())
9665
10051
  return base64.b64encode(encrypted_data).decode()
10052
+
9666
10053
  # SHA256 Hashing (Non-reversible)
9667
10054
  def sha256_hash(data: str):
9668
10055
  """
@@ -9671,9 +10058,10 @@ def enpass(code: str, method: str="AES", key: str = None):
9671
10058
  :return: The hashed value (hex string).
9672
10059
  """
9673
10060
  return hashlib.sha256(data.encode()).hexdigest()
10061
+
9674
10062
  if key is None:
9675
- key="worldpeace"
9676
- method=strcmp(method,["AES","RSA",'SHA256'])[0]
10063
+ key = "worldpeace"
10064
+ method = strcmp(method, ["AES", "RSA", "SHA256"])[0]
9677
10065
  if method == "AES":
9678
10066
  return aes_encrypt(code, key)
9679
10067
  elif method == "RSA":
@@ -9685,7 +10073,7 @@ def enpass(code: str, method: str="AES", key: str = None):
9685
10073
 
9686
10074
 
9687
10075
  # Master Decryption Function (Supports AES, RSA)
9688
- def depass(encrypted_code: str, method: str='AES', key: str = None):
10076
+ def depass(encrypted_code: str, method: str = "AES", key: str = None):
9689
10077
  """
9690
10078
  Master decryption function that supports multiple methods: AES and RSA.
9691
10079
  :param encrypted_code: The encrypted data to decrypt.
@@ -9694,6 +10082,7 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
9694
10082
  :return: The decrypted data.
9695
10083
  """
9696
10084
  import hashlib
10085
+
9697
10086
  def aes_decrypt(encrypted_data: str, key: str):
9698
10087
  """
9699
10088
  Decrypts data encrypted using AES in CBC mode.
@@ -9705,12 +10094,13 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
9705
10094
  from cryptography.hazmat.backends import default_backend
9706
10095
  from cryptography.hazmat.primitives import padding
9707
10096
  import base64
10097
+
9708
10098
  # Generate the same 256-bit key from the password
9709
10099
  key = hashlib.sha256(key.encode()).digest()
9710
-
10100
+
9711
10101
  # Decode the encrypted data from base64
9712
10102
  encrypted_data = base64.b64decode(encrypted_data)
9713
-
10103
+
9714
10104
  # Extract the IV and the actual encrypted data
9715
10105
  iv = encrypted_data[:16] # First 16 bytes are the IV
9716
10106
  encrypted_data = encrypted_data[16:] # Remaining data is the encrypted message
@@ -9724,7 +10114,8 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
9724
10114
  unpadder = padding.PKCS7(128).unpadder()
9725
10115
  unpadded_data = unpadder.update(decrypted_data) + unpadder.finalize()
9726
10116
 
9727
- return unpadded_data.decode()
10117
+ return unpadded_data.decode()
10118
+
9728
10119
  def rsa_decrypt(encrypted_data: str, private_key: str):
9729
10120
  """
9730
10121
  Decrypts RSA-encrypted data using the private key.
@@ -9735,6 +10126,7 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
9735
10126
  from Crypto.PublicKey import RSA
9736
10127
  from Crypto.Cipher import PKCS1_OAEP
9737
10128
  import base64
10129
+
9738
10130
  encrypted_data = base64.b64decode(encrypted_data)
9739
10131
  private_key_obj = RSA.import_key(private_key)
9740
10132
  cipher_rsa = PKCS1_OAEP.new(private_key_obj)
@@ -9742,8 +10134,8 @@ def depass(encrypted_code: str, method: str='AES', key: str = None):
9742
10134
  return decrypted_data.decode()
9743
10135
 
9744
10136
  if key is None:
9745
- key="worldpeace"
9746
- method=strcmp(method,["AES","RSA",'SHA256'])[0]
10137
+ key = "worldpeace"
10138
+ method = strcmp(method, ["AES", "RSA", "SHA256"])[0]
9747
10139
  if method == "AES":
9748
10140
  return aes_decrypt(encrypted_code, key)
9749
10141
  elif method == "RSA":