py2ls 0.2.4.29__py3-none-any.whl → 0.2.4.31__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
py2ls/ips.py CHANGED
@@ -779,11 +779,150 @@ def strcmp(
779
779
  print(f"建议: {best_match}")
780
780
  return candidates[best_match_index], best_match_index
781
781
 
782
def imgcmp(img: list, method="knn", plot_=True, figsize=(12, 6)):
    """
    Compare two images using SSIM or SIFT feature matching.

    Parameters:
    - img (list): Two image file paths ``[img1, img2]``.
    - method (str): Comparison method: 'ssim', 'match' (brute-force matching
      with cross-check) or 'knn' (k-NN matching with Lowe's ratio test).
      The value is resolved against these names with ``strcmp``.
    - plot_ (bool): Whether to display the results with matplotlib.
    - figsize (sequence): Figure size handed to matplotlib.

    Returns:
    - For 'ssim': ``(diff, score)`` -- the SSIM map scaled to uint8 and the
      mean SSIM score.
    - For 'match' or 'knn': ``(good_matches, similarity_score,
      homography_matrix)``. ``similarity_score`` is the number of good matches
      divided by the smaller keypoint count. ``homography_matrix`` maps
      image-1 coordinates to image-2 coordinates and is ``None`` when fewer
      than four good matches exist or RANSAC finds no model. When there are no
      good matches at all the result is ``(good_matches, 0.0, None)``.

    Raises:
    - ValueError: an image cannot be loaded, or one image has no keypoints.
    """
    import cv2
    import matplotlib.pyplot as plt
    import numpy as np

    image1 = cv2.imread(img[0])
    image2 = cv2.imread(img[1])
    if image1 is None or image2 is None:
        raise ValueError("Could not load one or both images. Check file paths.")

    method = strcmp(method, ["ssim", "match", "knn"])[0]
    if method not in ("ssim", "match", "knn"):
        raise ValueError("Invalid method. Use 'ssim', 'match', or 'knn'.")

    gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
    gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)

    if method == "ssim":
        # Imported lazily so that feature matching does not need scikit-image.
        from skimage.metrics import structural_similarity as ssim

        score, diff = ssim(gray1, gray2, full=True)
        print(f"SSIM Score: {score:.4f}")

        # Local SSIM values can be negative; clip before the uint8 cast so
        # they do not wrap around to bright pixels.
        diff = (np.clip(diff, 0.0, 1.0) * 255).astype("uint8")

        if plot_:
            _, ax = plt.subplots(1, 3, figsize=figsize)
            ax[0].imshow(gray1, cmap="gray")
            ax[0].set_title("Image 1")
            ax[1].imshow(gray2, cmap="gray")
            ax[1].set_title("Image 2")
            ax[2].imshow(diff, cmap="gray")
            ax[2].set_title("Difference (SSIM)")
            plt.tight_layout()
            plt.show()
        return diff, score

    # ---- SIFT feature matching ('match' / 'knn') ----
    sift = cv2.SIFT_create()
    keypoints1, descriptors1 = sift.detectAndCompute(gray1, None)
    keypoints2, descriptors2 = sift.detectAndCompute(gray2, None)
    if len(keypoints1) == 0 or len(keypoints2) == 0:
        raise ValueError("No keypoints found in one or both images.")

    if method == "match":
        matcher = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
        matches = sorted(
            matcher.match(descriptors1, descriptors2), key=lambda m: m.distance
        )
        # Keep matches clearly better than the worst one; cross-checking can
        # leave the list empty, so guard the [-1] lookup.
        worst = matches[-1].distance if matches else 0.0
        good_matches = [m for m in matches if m.distance < 0.75 * worst]
    else:  # 'knn'
        matcher = cv2.BFMatcher()
        neighbours = matcher.knnMatch(descriptors1, descriptors2, k=2)
        # Lowe's ratio test; knnMatch returns fewer than two neighbours when
        # the second image has a single descriptor, so skip those entries.
        good_matches = [
            pair[0]
            for pair in neighbours
            if len(pair) == 2 and pair[0].distance < 0.75 * pair[1].distance
        ]

    similarity_score = len(good_matches) / min(len(keypoints1), len(keypoints2))
    print(f"Number of good matches: {len(good_matches)}")
    print(f"Similarity Score: {similarity_score:.4f}")
    if len(good_matches) == 0:
        print("No good matches found.")
        return good_matches, 0.0, None

    # A homography needs at least four point correspondences.
    homography_matrix = None
    if len(good_matches) >= 4:
        src_pts = np.float32(
            [keypoints1[m.queryIdx].pt for m in good_matches]
        ).reshape(-1, 1, 2)
        dst_pts = np.float32(
            [keypoints2[m.trainIdx].pt for m in good_matches]
        ).reshape(-1, 1, 2)
        homography_matrix, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)

    if plot_:
        if homography_matrix is not None:
            # The matrix maps image-1 coordinates to image-2 coordinates, i.e.
            # output pixel -> source pixel when resampling image 2 into the
            # frame of image 1, which is what WARP_INVERSE_MAP expects.
            h, w = image1.shape[:2]
            warped_image2 = cv2.warpPerspective(
                image2,
                homography_matrix,
                (w, h),
                flags=cv2.INTER_LINEAR | cv2.WARP_INVERSE_MAP,
            )
            _, ax = plt.subplots(1, 2, figsize=figsize)
            ax[0].imshow(cv2.cvtColor(image1, cv2.COLOR_BGR2RGB))
            ax[0].set_title("Image 1")
            ax[1].imshow(cv2.cvtColor(warped_image2, cv2.COLOR_BGR2RGB))
            ax[1].set_title("Warped Image 2")
            plt.tight_layout()
            plt.show()

        # flags=2: do not draw keypoints that have no match.
        result = cv2.drawMatches(
            image1, keypoints1, image2, keypoints2, good_matches, None, flags=2
        )
        plt.figure(figsize=figsize)
        plt.imshow(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))
        plt.title(
            f"Feature Matches ({len(good_matches)} matches, Score: {similarity_score:.4f})"
        )
        plt.axis("off")
        plt.show()

        # Mark the keypoints that took part in no good match (sets give O(1)
        # membership tests instead of scanning a list per keypoint).
        matched_idx1 = {m.queryIdx for m in good_matches}
        matched_idx2 = {m.trainIdx for m in good_matches}
        unmatched_kp1 = [kp for i, kp in enumerate(keypoints1) if i not in matched_idx1]
        unmatched_kp2 = [kp for i, kp in enumerate(keypoints2) if i not in matched_idx2]
        img1_marked = cv2.drawKeypoints(
            image1,
            unmatched_kp1,
            None,
            color=(0, 0, 255),
            flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS,
        )
        img2_marked = cv2.drawKeypoints(
            image2,
            unmatched_kp2,
            None,
            color=(0, 0, 255),
            flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS,
        )
        _, ax = plt.subplots(1, 2, figsize=figsize)
        ax[0].imshow(cv2.cvtColor(img1_marked, cv2.COLOR_BGR2RGB))
        ax[0].set_title("Unmatched Keypoints (Image 1)")
        ax[1].imshow(cv2.cvtColor(img2_marked, cv2.COLOR_BGR2RGB))
        ax[1].set_title("Unmatched Keypoints (Image 2)")
        plt.tight_layout()
        plt.show()

    return good_matches, similarity_score, homography_matrix
787
926
 
788
927
 
789
928
  def cn2pinyin(
@@ -892,6 +1031,143 @@ def dict2df(dict_, fill=None):
892
1031
  dict_[key] = value
893
1032
  return pd.DataFrame.from_dict(dict_)
894
1033
 
1034
def text2audio(
    text,
    method=None,  # "pyttsx3", "gTTS"
    rate=200,
    slow=False,  # "gTTS"
    volume=1.0,
    voice=None,
    lang=None,
    gender=None,
    age=None,
    dir_save=None,
):
    """
    Convert text to speech with gTTS (online) or pyttsx3 (offline).

    Parameters:
    - text: text to be spoken.
    - method: "gTTS"/"google" or "pyttsx3" (resolved with ``strcmp``). When
      None, gTTS is tried first and pyttsx3 is used if gTTS raises.
    - rate: speaking rate passed to pyttsx3.
    - slow: gTTS ``slow`` flag.
    - volume: pyttsx3 volume, must be within 0.0-1.0.
    - voice: pyttsx3 voice name or id; when None a voice is picked by matching
      ``lang`` (or the language detected from ``text``) against voice names.
    - lang: language; for gTTS it is detected with ``langdetect`` when None.
    - gender: "male"/"female", matched against pyttsx3 voice names.
    - age: "child"/"adult"/"senior" or a number (<18 child, >=65 senior,
      otherwise adult), matched against pyttsx3 voice names.
    - dir_save: output file. gTTS always writes a file ("temp_audio.mp3" when
      None) and opens it with ``fopen``; pyttsx3 speaks aloud when None.

    Example:
        text2audio(text="Hello! This is a test.", rate=150, volume=0.9)

    Raises:
    - Errors from gTTS propagate (this is what triggers the pyttsx3 fallback
      in automatic mode); errors inside the pyttsx3 branch are printed.
    """
    if method is None:
        options = dict(
            rate=rate,
            slow=slow,
            volume=volume,
            voice=voice,
            lang=lang,
            gender=gender,
            age=age,
            dir_save=dir_save,
        )
        try:
            text2audio(text, method="google", **options)
        except Exception as e:
            print(e)
            text2audio(text, method="pyttsx3", **options)
        # The recursive call did the work; do not fall through with
        # method=None.
        return

    method = strcmp(method, ["gTTS", "pyttsx3", "google"])[0]

    if method == "pyttsx3":
        import pyttsx3

        engine = None  # stays None if init() itself fails
        try:
            engine = pyttsx3.init()
            engine.setProperty("rate", rate)
            if not 0.0 <= volume <= 1.0:
                raise ValueError("Volume must be between 0.0 and 1.0")
            engine.setProperty("volume", volume)

            if gender is not None:
                gender = strcmp(gender, ["male", "female"])[0]
            if age is not None:
                if isinstance(age, (float, int)):
                    if age < 18:
                        age = "child"
                    elif age >= 65:
                        age = "senior"
                    else:
                        age = "adult"
                elif isinstance(age, str):
                    age = strcmp(age, ["child", "adult", "senior"])[0]
                else:
                    raise ValueError("age: should be in ['child', 'adult', 'senior']")

            voices = engine.getProperty("voices")
            voice_names = [v.name for v in voices]
            if voice is None:
                if lang is None:
                    voice = strcmp(detect_lang(text), voice_names)[0]
                else:
                    if run_once_within():
                        print(voice_names)
                        print(f"lang:{lang}")
                    voice = strcmp(lang, voice_names)[0]

            selected_voice = None
            for v in voices:
                # An explicit voice match wins immediately; otherwise the last
                # voice matching gender or age is kept.
                if voice and (voice.lower() in v.name.lower() or voice in v.id):
                    selected_voice = v
                    break
                if gender and gender.lower() in v.name.lower():
                    selected_voice = v
                if age and age.lower() in v.name.lower():
                    selected_voice = v

            if selected_voice:
                engine.setProperty("voice", selected_voice.id)
            elif voice or gender or age:
                raise ValueError(
                    f"No matching voice found for specified criteria. Available voices: {voice_names}"
                )

            if dir_save:
                engine.save_to_file(text, dir_save)
            else:
                engine.say(text)
            engine.runAndWait()
            if dir_save:
                print(f"Audio saved to {dir_save}")
        except Exception as e:
            print(f"An error occurred: {e}")
        finally:
            # Release the engine; skipped when init() never produced one.
            if engine is not None:
                try:
                    engine.stop()
                except RuntimeError:
                    pass

    elif method.lower() in ["google", "gtts"]:
        import os

        from gtts import gTTS

        try:
            if lang is None:
                from langdetect import detect

                lang = detect(text)
            tts = gTTS(text=text, lang=lang, slow=slow)
        except Exception as e:
            print(f"An error occurred: {e}")
            # Without a gTTS object nothing can be saved; re-raise so the
            # automatic mode can fall back to pyttsx3.
            raise

        print("not realtime reading...")
        target = dir_save if dir_save else "temp_audio.mp3"
        # Look at the file name's extension only, so a dot in a directory name
        # (e.g. "./out/audio") does not suppress the ".mp3" suffix.
        if not os.path.splitext(target)[1]:
            target = target + ".mp3"
        tts.save(target)
        if dir_save:
            print(f"Audio saved to {target}")
        try:
            fopen(target)
        except Exception as e:
            print(f"Error opening file: {e}")

    print("done")
1170
+
895
1171
  def str2time(time_str, fmt="24"):
896
1172
  """
897
1173
  Convert a time string into the specified format.
@@ -2094,7 +2370,7 @@ def fload(fpath, kind=None, **kwargs):
2094
2370
  False if chunksize else True
2095
2371
  ) # when chunksize, recommend low_memory=False # default:
2096
2372
  verbose = kwargs.pop("verbose", False)
2097
- if run_once_within(reverse=True):
2373
+ if run_once_within(reverse=True) and verbose:
2098
2374
  use_pd("read_csv", verbose=verbose)
2099
2375
 
2100
2376
  if comment is None:# default: None
@@ -2212,7 +2488,7 @@ def fload(fpath, kind=None, **kwargs):
2212
2488
  if chunksize:
2213
2489
  df = _get_chunks(df)
2214
2490
  print(df.shape)
2215
- if not is_df_abnormal(df, verbose=0): # normal
2491
+ if not is_df_abnormal(df, verbose=0) and verbose: # normal
2216
2492
  display(df.head(2))
2217
2493
  print(f"shape: {df.shape}")
2218
2494
  return df
@@ -2245,26 +2521,28 @@ def fload(fpath, kind=None, **kwargs):
2245
2521
  df = _get_chunks(df)
2246
2522
  print(df.shape)
2247
2523
  if not is_df_abnormal(df, verbose=0):
2248
- (
2249
- display(df.head(2))
2250
- if isinstance(df, pd.DataFrame)
2251
- else display("it is not a DataFrame")
2252
- )
2253
- (
2254
- print(f"shape: {df.shape}")
2255
- if isinstance(df, pd.DataFrame)
2256
- else display("it is not a DataFrame")
2257
- )
2524
+ if verbose:
2525
+ (
2526
+ display(df.head(2))
2527
+ if isinstance(df, pd.DataFrame)
2528
+ else display("it is not a DataFrame")
2529
+ )
2530
+ (
2531
+ print(f"shape: {df.shape}")
2532
+ if isinstance(df, pd.DataFrame)
2533
+ else display("it is not a DataFrame")
2534
+ )
2258
2535
  return df
2259
2536
  except EmptyDataError as e:
2260
2537
  continue
2261
2538
  else:
2262
2539
  pass
2263
- print(kwargs)
2540
+ # print(kwargs)
2264
2541
  # if is_df_abnormal(df,verbose=verbose):
2265
2542
  # df=pd.read_csv(fpath,**kwargs)
2266
- display(df.head(2))
2267
- print(f"shape: {df.shape}")
2543
+ if verbose:
2544
+ display(df.head(2))
2545
+ print(f"shape: {df.shape}")
2268
2546
  return df
2269
2547
 
2270
2548
  def load_excel(fpath, **kwargs):
@@ -2300,7 +2578,7 @@ def fload(fpath, kind=None, **kwargs):
2300
2578
  engine = kwargs.get("engine", "pyarrow")
2301
2579
  verbose = kwargs.pop("verbose", False)
2302
2580
 
2303
- if run_once_within(reverse=True):
2581
+ if run_once_within(reverse=True) and verbose:
2304
2582
  use_pd("read_parquet", verbose=verbose)
2305
2583
  try:
2306
2584
  df = pd.read_parquet(fpath, engine=engine, **kwargs)
@@ -2385,6 +2663,16 @@ def fload(fpath, kind=None, **kwargs):
2385
2663
  doc = Document(fpath)
2386
2664
  content = [para.text for para in doc.paragraphs]
2387
2665
  return content
2666
+
2667
def load_rtf(file_path):
    """
    Read an RTF file and return its content as plain text.

    The file is read in text mode and converted with striprtf's
    ``rtf_to_text``. Any failure while reading or converting is printed and
    ``None`` is returned.
    """
    from striprtf.striprtf import rtf_to_text

    try:
        with open(file_path, "r") as handle:
            return rtf_to_text(handle.read())
    except Exception as e:
        print(f"Error loading RTF file: {e}")
    return None
2388
2676
 
2389
2677
  if kind is None:
2390
2678
  _, kind = os.path.splitext(fpath)
@@ -2427,6 +2715,7 @@ def fload(fpath, kind=None, **kwargs):
2427
2715
  "xml",
2428
2716
  "ipynb",
2429
2717
  "mtx",
2718
+ "rtf"
2430
2719
  ]
2431
2720
  zip_types = [
2432
2721
  "gz",
@@ -2446,22 +2735,7 @@ def fload(fpath, kind=None, **kwargs):
2446
2735
  if kind not in supported_types:
2447
2736
  print(
2448
2737
  f'Warning:\n"{kind}" is not in the supported list '
2449
- ) # {supported_types}')
2450
- # if os.path.splitext(fpath)[1][1:].lower() in zip_types:
2451
- # keep=kwargs.get("keep", False)
2452
- # ifile=kwargs.get("ifile",(0,0))
2453
- # kwargs.pop("keep",None)
2454
- # kwargs.pop("ifile",None)
2455
- # fpath_unzip=unzip(fpath)
2456
- # if isinstance(fpath_unzip,list):
2457
- # fpath_unzip=fpath_unzip[ifile[0]]
2458
- # if os.path.isdir(fpath_unzip):
2459
- # fpath_selected=listdir(fpath_unzip,kind=kind).fpath[ifile[1]]
2460
- # fpath_unzip=fpath_selected
2461
- # content_unzip=fload(fpath_unzip, **kwargs)
2462
- # if not keep:
2463
- # os.remove(fpath_unzip)
2464
- # return content_unzip
2738
+ ) # {supported_types}')
2465
2739
 
2466
2740
  if kind == "docx":
2467
2741
  return load_docx(fpath)
@@ -2477,37 +2751,45 @@ def fload(fpath, kind=None, **kwargs):
2477
2751
  return load_xml(fpath)
2478
2752
  elif kind in ["csv", "tsv"]:
2479
2753
  # verbose = kwargs.pop("verbose", False)
2480
- if run_once_within(reverse=True):
2481
- use_pd("read_csv")
2754
+ # if run_once_within(reverse=True) and verbose:
2755
+ # use_pd("read_csv")
2482
2756
  content = load_csv(fpath, **kwargs)
2483
2757
  return content
2484
2758
  elif kind == "pkl":
2485
2759
  verbose = kwargs.pop("verbose", False)
2486
- if run_once_within(reverse=True):
2760
+ if run_once_within(reverse=True) and verbose:
2487
2761
  use_pd("read_pickle")
2488
- return pd.read_pickle(fpath, **kwargs)
2762
+ try:
2763
+ res_=pd.read_pickle(fpath, **kwargs)
2764
+ except Exception as e:
2765
+ import pickle
2766
+ with open('sgd_classifier.pkl', 'rb') as f:
2767
+ res_ = pickle.load(f)
2768
+ return res_
2489
2769
  elif kind in ["ods", "ods", "odt"]:
2490
2770
  engine = kwargs.get("engine", "odf")
2491
2771
  kwargs.pop("engine", None)
2492
2772
  return load_excel(fpath, engine=engine, **kwargs)
2493
2773
  elif kind == "xls":
2774
+ verbose = kwargs.pop("verbose", False)
2494
2775
  engine = kwargs.get("engine", "xlrd")
2495
2776
  kwargs.pop("engine", None)
2496
2777
  content = load_excel(fpath, engine=engine, **kwargs)
2497
- print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
2778
+ print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) and verbose else None
2498
2779
  display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2499
2780
  return content
2500
2781
  elif kind == "xlsx":
2782
+ verbose = kwargs.pop("verbose", False)
2501
2783
  content = load_excel(fpath, **kwargs)
2502
- display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2784
+ display(content.head(3)) if isinstance(content, pd.DataFrame) and verbose else None
2503
2785
  print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
2504
2786
  return content
2505
2787
  elif kind == "mtx":
2506
2788
  from scipy.io import mmread
2507
-
2789
+ verbose = kwargs.pop("verbose", False)
2508
2790
  dat_mtx = mmread(fpath)
2509
2791
  content = pd.DataFrame.sparse.from_spmatrix(dat_mtx, **kwargs)
2510
- display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2792
+ display(content.head(3)) if isinstance(content, pd.DataFrame) and verbose else None
2511
2793
  print(f"shape: {content.shape}")
2512
2794
  return content
2513
2795
  elif kind == "ipynb":
@@ -2578,6 +2860,8 @@ def fload(fpath, kind=None, **kwargs):
2578
2860
 
2579
2861
  elif kind == "mplstyle":
2580
2862
  return read_mplstyle(fpath)
2863
+ elif kind == "rtf":
2864
+ return load_rtf(fpath)
2581
2865
 
2582
2866
  else:
2583
2867
  print("direct reading...")
@@ -2616,6 +2900,38 @@ def fload(fpath, kind=None, **kwargs):
2616
2900
  # docx_content = fload('sample.docx')
2617
2901
 
2618
2902
 
2903
def fopen(fpath):
    """
    Open a file with the operating system's default application.

    Parameters:
    - fpath: path of the file to open.

    Returns:
    - None. The outcome is reported with ``print``: an error when the file
      does not exist, the platform is not Windows/macOS/Linux, or the opener
      command fails; a success message otherwise.
    """
    import os
    import platform
    import subprocess

    try:
        if not os.path.isfile(fpath):
            print(f"Error: The file does not exist - {fpath}")
            return

        # An absolute path cannot be mistaken for a command-line option.
        target = os.path.abspath(fpath)
        system = platform.system()

        if system == "Windows":
            os.startfile(target)
        elif system in ("Darwin", "Linux"):
            opener = "open" if system == "Darwin" else "xdg-open"
            # Argument list instead of a shell string: quotes or shell
            # metacharacters in the path are passed through literally.
            completed = subprocess.run([opener, target], check=False)
            if completed.returncode != 0:
                print(
                    f"Error opening file {fpath}: '{opener}' exited with code {completed.returncode}"
                )
                return
        else:
            print(f"Unsupported OS: {system}")
            return

        print(f"Successfully opened {fpath} with the default application.")
    except Exception as e:
        print(f"Error opening file {fpath}: {e}")
2932
+
2933
+
2934
+
2619
2935
  def fupdate(fpath, content=None, how="head"):
2620
2936
  """
2621
2937
  Update a file by adding new content at the top and moving the old content to the bottom.
@@ -3025,13 +3341,18 @@ def fsave(
3025
3341
  content.to_pickle(fpath, **kwargs)
3026
3342
  else:
3027
3343
  try:
3028
- print("trying to convert it as a DataFrame...")
3029
3344
  content = pd.DataFrame(content)
3030
3345
  content.to_pickle(fpath, **kwargs)
3031
3346
  except Exception as e:
3032
- raise ValueError(
3033
- f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
3034
- )
3347
+ try:
3348
+ import pickle
3349
+ with open(fpath, 'wb') as f:
3350
+ pickle.dump(content, f)
3351
+ print('done!', fpath)
3352
+ except Exception as e:
3353
+ raise ValueError(
3354
+ f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
3355
+ )
3035
3356
  elif kind.lower() in ["fea", "feather", "ft", "fe", "feat", "fether"]:
3036
3357
  # Feather: The Feather format, based on Apache Arrow, is designed for fast I/O operations. It's
3037
3358
  # optimized for data analytics tasks and is especially fast when working with Pandas.
@@ -3187,16 +3508,22 @@ def isa(content, kind):
3187
3508
  """
3188
3509
  if "img" in kind.lower() or "image" in kind.lower():
3189
3510
  return is_image(content)
3511
+ elif 'vid' in kind.lower():
3512
+ return is_video(content)
3513
+ elif 'aud' in kind.lower():
3514
+ return is_audio(content)
3190
3515
  elif "doc" in kind.lower():
3191
3516
  return is_document(content)
3192
3517
  elif "zip" in kind.lower():
3193
3518
  return is_zip(content)
3194
3519
  elif "dir" in kind.lower() or ("f" in kind.lower() and "d" in kind.lower()):
3195
3520
  return os.path.isdir(content)
3521
+ elif "code" in kind.lower(): # file
3522
+ return is_code(content)
3196
3523
  elif "fi" in kind.lower(): # file
3197
3524
  return os.path.isfile(content)
3198
3525
  elif "num" in kind.lower(): # file
3199
- return os.path.isfile(content)
3526
+ return isnum(content)
3200
3527
  elif "text" in kind.lower() or "txt" in kind.lower(): # file
3201
3528
  return is_text(content)
3202
3529
  elif "color" in kind.lower(): # file
@@ -3607,7 +3934,7 @@ def get_os(full=False, verbose=False):
3607
3934
  "usage (%)": usage.percent,
3608
3935
  }
3609
3936
  except PermissionError:
3610
- system_info["Disk"][partition.device] = "Permission Denied"
3937
+ system_info["disk"][partition.device] = "Permission Denied"
3611
3938
 
3612
3939
  # Network Information
3613
3940
  if_addrs = psutil.net_if_addrs()
@@ -3667,11 +3994,33 @@ def listdir(
3667
3994
  ascending=True,
3668
3995
  contains=None,# filter filenames using re
3669
3996
  booster=False,# walk in subfolders
3997
+ depth = 0, # 0: no subfolders; None: all subfolders; [int 1,2,3]: levels of subfolders
3670
3998
  hidden=False, # Include hidden files/folders
3671
3999
  orient="list",
3672
4000
  output="df", # "df", 'list','dict','records','index','series'
3673
4001
  verbose=True,
3674
- ):
4002
+ ):
4003
+ def is_hidden(filepath):
4004
+ """Check if a file or folder is hidden."""
4005
+ system = platform.system()
4006
+ if system == "Windows":
4007
+ import ctypes
4008
+ attribute = ctypes.windll.kernel32.GetFileAttributesW(filepath)
4009
+ if attribute == -1:
4010
+ raise FileNotFoundError(f"File {filepath} not found.")
4011
+ return bool(attribute & 2) # FILE_ATTRIBUTE_HIDDEN
4012
+ else: # macOS/Linux: Hidden if the name starts with a dot
4013
+ return os.path.basename(filepath).startswith(".")
4014
+
4015
+ def get_user():
4016
+ """Retrieve the username of the current user."""
4017
+ system = platform.system()
4018
+ if system == "Windows":
4019
+ return os.environ.get("USERNAME", "Unknown")
4020
+ else:
4021
+ import pwd
4022
+ return pwd.getpwuid(os.getuid()).pw_name
4023
+
3675
4024
  if isinstance(kind, list):
3676
4025
  f_ = []
3677
4026
  for kind_ in kind:
@@ -3681,7 +4030,7 @@ def listdir(
3681
4030
  sort_by=sort_by,
3682
4031
  ascending=ascending,
3683
4032
  contains=contains,
3684
- booster=booster,# walk in subfolders
4033
+ depth=depth,# walk in subfolders
3685
4034
  hidden=hidden,
3686
4035
  orient=orient,
3687
4036
  output=output,
@@ -3710,12 +4059,24 @@ def listdir(
3710
4059
  "rootdir":[],
3711
4060
  "fname": [],
3712
4061
  "fpath": [],
4062
+ "num":[],
4063
+ "os":[]
3713
4064
  }
4065
+ root_depth = rootdir.rstrip(os.sep).count(os.sep)
3714
4066
  for dirpath, dirnames, ls in os.walk(rootdir):
4067
+ current_depth = dirpath.rstrip(os.sep).count(os.sep) - root_depth
4068
+ # Check depth limit
4069
+ if depth is not None and current_depth > depth:
4070
+ dirnames[:] = [] # Prevent further traversal into subfolders
4071
+ continue
4072
+
3715
4073
  if not hidden:
3716
- dirnames[:] = [d for d in dirnames if not d.startswith(".")]
3717
- ls = [i for i in ls if not i.startswith(".")]
3718
- for dirname in dirnames:
4074
+ dirnames[:] = [d for d in dirnames if not is_hidden(os.path.join(dirpath, d))]
4075
+ ls = [i for i in ls if not is_hidden(os.path.join(dirpath, i))]
4076
+
4077
+ for dirname in dirnames:
4078
+ if kind is not None and kind not in fd: # do not check folders
4079
+ continue
3719
4080
  if contains and not re.search(contains, dirname):
3720
4081
  continue
3721
4082
  dirname_path = os.path.join(dirpath, dirname)
@@ -3734,21 +4095,23 @@ def listdir(
3734
4095
  f['basename'].append(os.path.basename(dirname_path))
3735
4096
  f["path"].append(os.path.join(os.path.dirname(dirname_path), dirname))
3736
4097
  f["created_time"].append(
3737
- pd.to_datetime(os.path.getctime(dirname_path), unit="s")
4098
+ pd.to_datetime(int(os.path.getctime(dirname_path)), unit="s")
3738
4099
  )
3739
4100
  f["modified_time"].append(
3740
- pd.to_datetime(os.path.getmtime(dirname_path), unit="s")
4101
+ pd.to_datetime(int(os.path.getmtime(dirname_path)), unit="s")
3741
4102
  )
3742
4103
  f["last_open_time"].append(
3743
- pd.to_datetime(os.path.getatime(dirname_path), unit="s")
4104
+ pd.to_datetime(int(os.path.getatime(dirname_path)), unit="s")
3744
4105
  )
3745
4106
  f["permission"].append(stat.filemode(stats_file.st_mode)),
3746
- f["owner"].append(os.getlogin() if platform.system() != "Windows" else "N/A"),
4107
+ f["owner"].append(get_user()),
3747
4108
  f["rootdir"].append(dirpath)
3748
4109
  f["fname"].append(filename) # will be removed
3749
4110
  f["fpath"].append(fpath) # will be removed
3750
4111
  i += 1
3751
- for item in ls:
4112
+ for item in ls:
4113
+ if kind in fd:# only check folders
4114
+ continue
3752
4115
  if contains and not re.search(contains, item):
3753
4116
  continue
3754
4117
  item_path = os.path.join(dirpath, item)
@@ -3760,13 +4123,11 @@ def listdir(
3760
4123
  continue
3761
4124
  filename, file_extension = os.path.splitext(item)
3762
4125
  if kind is not None:
3763
- if not kind.startswith("."):
3764
- kind = "." + kind
3765
4126
  is_folder = kind.lower() in fd and os.path.isdir(item_path)
3766
4127
  is_file = kind.lower() in file_extension.lower() and (
3767
4128
  os.path.isfile(item_path)
3768
4129
  )
3769
- if kind in [".doc", ".img", ".zip"]: # 选择大的类别
4130
+ if kind in [".doc", ".img", ".zip",".code",".file",".image",".video",".audio"]: # 选择大的类别
3770
4131
  if kind != ".folder" and not isa(item_path, kind):
3771
4132
  continue
3772
4133
  elif kind in [".all"]:
@@ -3780,15 +4141,15 @@ def listdir(
3780
4141
  f["length"].append(len(filename))
3781
4142
  f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
3782
4143
  f['basename'].append(os.path.basename(item_path))
3783
- f["path"].append(os.path.join(os.path.dirname(item_path), item))
4144
+ f["path"].append(os.path.join(os.path.dirname(item_path), item))
3784
4145
  f["created_time"].append(
3785
- pd.to_datetime(os.path.getctime(item_path), unit="s")
4146
+ pd.to_datetime(int(os.path.getctime(item_path)), unit="s")
3786
4147
  )
3787
4148
  f["modified_time"].append(
3788
- pd.to_datetime(os.path.getmtime(item_path), unit="s")
4149
+ pd.to_datetime(int(os.path.getmtime(item_path)), unit="s")
3789
4150
  )
3790
4151
  f["last_open_time"].append(
3791
- pd.to_datetime(os.path.getatime(item_path), unit="s")
4152
+ pd.to_datetime(int(os.path.getatime(item_path)), unit="s")
3792
4153
  )
3793
4154
  f["permission"].append(stat.filemode(stats_file.st_mode)),
3794
4155
  f["owner"].append(os.getlogin() if platform.system() != "Windows" else "N/A"),
@@ -3799,13 +4160,13 @@ def listdir(
3799
4160
 
3800
4161
  f["num"] = i
3801
4162
  f["os"] = get_os() # os.uname().machine
3802
- if not booster: # go deeper subfolders
3803
- break
4163
+ # if not booster: # go deeper subfolders
4164
+ # break
3804
4165
  #* convert to pd.DataFrame
3805
4166
  f = pd.DataFrame(f)
3806
4167
  f=f[["basename","name","kind","length","size","num","path","created_time",
3807
4168
  "modified_time","last_open_time","rootdir",
3808
- "fname","fpath","permission","owner","os",]]
4169
+ "permission","owner","os","fname","fpath",]]
3809
4170
  if "nam" in sort_by.lower():
3810
4171
  f = sort_kind(f, by="name", ascending=ascending)
3811
4172
  elif "crea" in sort_by.lower():
@@ -4173,39 +4534,233 @@ def is_num(s):
4173
4534
  def isnum(s):
4174
4535
  return is_num(s)
4175
4536
 
4176
-
4177
4537
def is_image(fpath):
    """
    Determine if a given file is an image based on MIME type and file extension.

    Only the file name is inspected; the file is not opened.

    Args:
        fpath (str): Path to the file.

    Returns:
        bool: True if the guessed MIME type is ``image/*`` or the extension is
        a known image extension, False otherwise.
    """
    import mimetypes

    # Extensions accepted even when mimetypes has no image type for them.
    image_extensions = {
        ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tif", ".tiff",
        ".ico", ".svg", ".heic", ".heif", ".fig",
    }

    # Accept every image/* type rather than a fixed allow-list, so formats
    # such as image/x-portable-pixmap are recognised too.
    mime_type, _ = mimetypes.guess_type(fpath)
    if mime_type and mime_type.startswith("image/"):
        return True

    # Fallback: case-insensitive extension check.
    ext = os.path.splitext(fpath)[-1].lower()
    return ext in image_extensions
4581
+
4582
def is_video(fpath):
    """
    Determine if a given file is a video based on MIME type and file extension.

    Only the file name is inspected; the file is not opened.

    Args:
        fpath (str): Path to the file.

    Returns:
        bool: True if the guessed MIME type is ``video/*`` or the extension is
        a known video extension, False otherwise.
    """
    import mimetypes

    # Extensions accepted even when mimetypes has no video type for them.
    video_extensions = {
        ".mp4", ".mov", ".avi", ".mkv", ".flv", ".webm", ".ogv", ".wmv",
        ".mpg", ".mpeg", ".3gp", ".mpeg2", ".asf", ".ts", ".m4v", ".divx",
    }

    # Accept every video/* type rather than a fixed allow-list.
    mime_type, _ = mimetypes.guess_type(fpath)
    if mime_type and mime_type.startswith("video/"):
        return True

    # Fallback: case-insensitive extension check.
    ext = os.path.splitext(fpath)[-1].lower()
    return ext in video_extensions
4186
4630
 
4187
4631
def is_document(fpath):
    """
    Tell whether a path looks like a document.

    The decision uses the file name only: first the MIME type guessed by
    ``mimetypes``, then the lower-cased file extension.

    Args:
        fpath (str): Path to the file.

    Returns:
        bool: True when the guessed MIME type starts with one of the document
        MIME prefixes or the extension is a known document extension,
        otherwise False.
    """
    import mimetypes

    # MIME prefixes treated as documents ("text/" covers every text subtype).
    mime_prefixes = (
        "text/",
        "application/pdf",
        "application/msword",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/rtf",
        "application/x-latex",
        "application/vnd.oasis.opendocument.text",
        "application/vnd.oasis.opendocument.spreadsheet",
        "application/vnd.oasis.opendocument.presentation",
    )

    # Extensions used when the MIME guess is missing or not in the list above.
    fallback_suffixes = frozenset(
        (
            ".txt", ".log", ".csv", ".json", ".xml", ".pdf",
            ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
            ".odt", ".ods", ".odp", ".rtf", ".tex",
        )
    )

    guessed = mimetypes.guess_type(fpath)[0]
    if guessed and guessed.startswith(mime_prefixes):
        return True

    suffix = os.path.splitext(fpath)[-1].lower()
    return suffix in fallback_suffixes
4693
+
4694
def is_audio(fpath):
    """
    Determine if a given file is an audio file based on MIME type and file extension.

    Only the file name is inspected; the file is not opened.

    Args:
        fpath (str): Path to the file.

    Returns:
        bool: True if the guessed MIME type is ``audio/*`` or the extension is
        a known audio extension, False otherwise.
    """
    import mimetypes

    # Extensions accepted even when mimetypes has no audio type for them.
    audio_extensions = {
        ".mp3", ".wav", ".ogg", ".aac", ".flac", ".midi", ".m4a",
        ".aiff", ".pcm", ".wma", ".ape", ".alac", ".opus",
    }

    # Accept every audio/* type rather than a fixed allow-list, so types such
    # as audio/basic (.au) are recognised too.
    mime_type, _ = mimetypes.guess_type(fpath)
    if mime_type and mime_type.startswith("audio/"):
        return True

    # Fallback: case-insensitive extension check.
    ext = os.path.splitext(fpath)[-1].lower()
    return ext in audio_extensions
4208
4740
 
4741
def is_code(fpath):
    """
    Tell whether a path has a source-code or script file extension.

    Only the lower-cased extension of the file name is checked; no MIME
    lookup is done and the file is not opened.

    Args:
        fpath (str): Path to the file.

    Returns:
        bool: True if the extension is one of the known code extensions,
        False otherwise.
    """
    known_suffixes = frozenset(
        (
            ".m", ".py", ".ipynb", ".js", ".html", ".css", ".java", ".cpp",
            ".h", ".cs", ".go", ".rs", ".sh", ".rb", ".swift", ".ts",
            ".json", ".xml", ".yaml", ".toml", ".bash", ".r",
        )
    )
    suffix = os.path.splitext(fpath)[-1].lower()
    return suffix in known_suffixes
4763
+
4209
4764
  def is_zip(fpath):
4210
4765
  import mimetypes
4211
4766
 
@@ -6190,12 +6745,12 @@ def df_astype(
6190
6745
 
6191
6746
 
6192
6747
  # ! DataFrame
6193
- def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
6748
+ def df_sort_values(data, column, by=None, ascending=True, inplace=True, **kwargs):
6194
6749
  """
6195
6750
  Sort a DataFrame by a specified column based on a custom order or by count.
6196
6751
 
6197
6752
  Parameters:
6198
- - df: DataFrame to be sorted.
6753
+ - data: DataFrame to be sorted.
6199
6754
  - column: The name of the column to sort by.
6200
6755
  - by: List specifying the custom order for sorting or 'count' to sort by frequency.
6201
6756
  - ascending: Boolean or list of booleans, default True.
@@ -6211,7 +6766,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
6211
6766
 
6212
6767
  if isinstance(by, str) and "count" in by.lower():
6213
6768
  # Count occurrences of each value in the specified column
6214
- value_counts = df[column].value_counts()
6769
+ value_counts = data[column].value_counts()
6215
6770
 
6216
6771
  # Determine the order based on counts
6217
6772
  count_ascending = kwargs.pop("count_ascending", ascending)
@@ -6220,12 +6775,12 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
6220
6775
  ).index.tolist()
6221
6776
 
6222
6777
  # Convert to a categorical type with the new order
6223
- df[column] = pd.Categorical(df[column], categories=sorted_counts, ordered=True)
6778
+ data[column] = pd.Categorical(data[column], categories=sorted_counts, ordered=True)
6224
6779
  # Set ascending to count_ascending for sorting
6225
6780
  ascending = count_ascending # Adjust ascending for the final sort
6226
6781
  elif isinstance(by, list):
6227
6782
  # Convert the specified column to a categorical type with the custom order
6228
- df[column] = pd.Categorical(df[column], categories=by, ordered=True)
6783
+ data[column] = pd.Categorical(data[column], categories=by, ordered=True)
6229
6784
  else:
6230
6785
  raise ValueError("Custom order must be a list or 'count'.")
6231
6786
 
@@ -6240,7 +6795,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
6240
6795
  return sorted_df
6241
6796
  except Exception as e:
6242
6797
  print(f"Error sorting DataFrame by '{column}': {e}")
6243
- return df
6798
+ return data
6244
6799
 
6245
6800
 
6246
6801
  # # Example usage:
@@ -7742,7 +8297,7 @@ def df_reducer(
7742
8297
  # example:
7743
8298
  # df_reducer(data=data_log, columns=markers, n_components=2)
7744
8299
 
7745
- def df_format(data, threshold_unique=0.5, verbose=False):
8300
+ def get_df_format(data, threshold_unique=0.5, verbose=False):
7746
8301
  """
7747
8302
  检测表格: long, wide or uncertain.
7748
8303
 
@@ -7834,13 +8389,16 @@ def df_format(data, threshold_unique=0.5, verbose=False):
7834
8389
  # Step 5: Clustering analysis on numerical columns for correlation in wide format
7835
8390
  numeric_cols = data.select_dtypes(include="number").columns
7836
8391
  if len(numeric_cols) > 1:
7837
- scaled_data = StandardScaler().fit_transform(data[numeric_cols].dropna())
7838
- clustering = AgglomerativeClustering(n_clusters=2).fit(scaled_data.T)
7839
- cluster_labels = pd.Series(clustering.labels_)
7840
- if cluster_labels.nunique() < len(numeric_cols) * 0.5:
7841
- wide_score += 2
7842
- if verbose:
7843
- print("Clustering on columns shows grouping, suggesting wide format.")
8392
+ try:
8393
+ scaled_data = StandardScaler().fit_transform(data[numeric_cols].dropna())
8394
+ clustering = AgglomerativeClustering(n_clusters=2).fit(scaled_data.T)
8395
+ cluster_labels = pd.Series(clustering.labels_)
8396
+ if cluster_labels.nunique() < len(numeric_cols) * 0.5:
8397
+ wide_score += 2
8398
+ if verbose:
8399
+ print("Clustering on columns shows grouping, suggesting wide format.")
8400
+ except Exception as e:
8401
+ print(e) if verbose else None
7844
8402
 
7845
8403
  # Step 6: Inter-column correlation analysis
7846
8404
  if len(numeric_cols) > 1:
@@ -7868,11 +8426,14 @@ def df_format(data, threshold_unique=0.5, verbose=False):
7868
8426
 
7869
8427
  # Step 8: Multi-level clustering on rows to detect block structure for wide format
7870
8428
  if len(numeric_cols) > 1 and n_rows > 5:
7871
- clustering_rows = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
7872
- if pd.Series(clustering_rows.labels_).nunique() < 2:
7873
- wide_score += 2
7874
- if verbose:
7875
- print("Row clustering reveals homogeneity, suggesting wide format.")
8429
+ try:
8430
+ clustering_rows = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
8431
+ if pd.Series(clustering_rows.labels_).nunique() < 2:
8432
+ wide_score += 2
8433
+ if verbose:
8434
+ print("Row clustering reveals homogeneity, suggesting wide format.")
8435
+ except Exception as e:
8436
+ print(e) if verbose else None
7876
8437
 
7877
8438
  # Step 9: Sequential name detection for time-series pattern in wide format
7878
8439
  if any(col.isdigit() or col.startswith("T") for col in col_names):
@@ -7881,15 +8442,18 @@ def df_format(data, threshold_unique=0.5, verbose=False):
7881
8442
  print("Detected time-like sequential column names, supporting wide format.")
7882
8443
 
7883
8444
  # Step 10: Entropy of numeric columns
7884
- numeric_entropy = data[numeric_cols].apply(
7885
- lambda x: entropy(pd.cut(x, bins=10).value_counts(normalize=True))
7886
- )
7887
- if numeric_entropy.mean() < 2:
7888
- wide_score += 2
7889
- if verbose:
7890
- print(
7891
- "Low entropy in numeric columns indicates stability across columns, supporting wide format."
7892
- )
8445
+ try:
8446
+ numeric_entropy = data[numeric_cols].apply(
8447
+ lambda x: entropy(pd.cut(x, bins=10).value_counts(normalize=True))
8448
+ )
8449
+ if numeric_entropy.mean() < 2:
8450
+ wide_score += 2
8451
+ if verbose:
8452
+ print(
8453
+ "Low entropy in numeric columns indicates stability across columns, supporting wide format."
8454
+ )
8455
+ except Exception as e:
8456
+ print(e) if verbose else None
7893
8457
 
7894
8458
  # Step 11: Tie-breaking strategy if scores are equal
7895
8459
  if wide_score == long_score:
@@ -8905,3 +9469,286 @@ def get_phone(phone_number: str, region: str = None,verbose=True):
8905
9469
  if verbose:
8906
9470
  preview(res)
8907
9471
  return res
9472
+
9473
+
9474
def decode_pluscode(
    pluscode: str, reference: tuple = (52.5200, 13.4050), return_bbox: bool = False
):
    """
    Decode a Plus Code (Open Location Code) into latitude and longitude.

    Parameters:
        pluscode (str): The Plus Code to decode. Can be full or short.
        reference (tuple, optional): Reference (latitude, longitude) used to recover
            the nearest full code when `pluscode` is short. Defaults to
            (52.5200, 13.4050). Short codes are resolved relative to this point,
            so pass the location the short code actually belongs to. Passing
            None makes short codes raise ValueError.
        return_bbox (bool): If True, also return the bounding box of the decoded
            code area. Default is False.

    Returns:
        tuple: (latitude, longitude) if `return_bbox` is False.
               (latitude, longitude, bbox) if `return_bbox` is True, where
               bbox = (latitudeLo, latitudeHi, longitudeLo, longitudeHi).
               latitude/longitude are the midpoint of that bounding box.

    Raises:
        ValueError: If the Plus Code is invalid, or if it is short and
            `reference` is None.

    Usage:
        # Short code: resolved against `reference`
        lat, lon, bbox = decode_pluscode("7FG6+89", reference=(52.52, 13.405), return_bbox=True)
        print(f"Decoded Short Plus Code: Latitude: {lat}, Longitude: {lon}, Bounding Box: {bbox}")

        # Full code: `reference` is not used
        lat, lon = decode_pluscode("9F4M7FG6+89")
        print(f"Decoded Full Plus Code: Latitude: {lat}, Longitude: {lon}")
    """
    # Imported lazily so the dependency is only needed when this function is used
    from openlocationcode import openlocationcode as olc

    # Validate Plus Code
    if not olc.isValid(pluscode):
        raise ValueError(f"Invalid Plus Code: {pluscode}")

    # Short codes omit the leading digits; recover them from the reference point
    if olc.isShort(pluscode):
        if reference is None:
            raise ValueError(
                "Reference location (latitude, longitude) is required for decoding short Plus Codes."
            )
        pluscode = olc.recoverNearest(pluscode, reference[0], reference[1])

    # Decode the (now full) Plus Code into its code area
    decoded = olc.decode(pluscode)

    # The reported point is the center of the decoded bounding box
    latitude = (decoded.latitudeLo + decoded.latitudeHi) / 2
    longitude = (decoded.longitudeLo + decoded.longitudeHi) / 2

    if not return_bbox:
        return latitude, longitude

    bbox = (
        decoded.latitudeLo,
        decoded.latitudeHi,
        decoded.longitudeLo,
        decoded.longitudeHi,
    )
    return latitude, longitude, bbox
9533
+
9534
def get_loc(input_data, user_agent="0413@mygmail.com)", verbose=True):
    """
    Geocode a place name or reverse-geocode coordinates using Nominatim.

    Dispatch on `input_data`:
        - str that is not a bare number: forward geocoding of the text.
        - float or tuple: reverse geocoding. A tuple must be
          (latitude, longitude); a single float is passed on as given.
        - str that is a bare number: parsed with the DMS pattern and then
          reverse geocoded.

    NOTE(review): because every non-numeric string is geocoded as free text
    first, a DMS string such as "48 51 24.3 N" is sent to the geocoder as a
    query and never reaches the DMS branch - confirm whether that is intended.

    Parameters:
        input_data (str | float | tuple): Place name, coordinates, or numeric string.
        user_agent (str): User-agent string passed to the Nominatim geocoder.
        verbose (bool): If True, print the result (or a not-found message).

    Returns:
        The location object returned by the geocoder, or None when nothing was
        found or the input could not be interpreted.

    Usage:
        get_loc("Berlin, Germany")  # Example city
        # get_loc((48.8566, 2.3522))  # Example latitude and longitude
        # get_loc("48 51 24.3 N")  # Example DMS input
    """
    from geopy.geocoders import Nominatim
    import re

    invalid_input_msg = (
        "Invalid input format. Please provide a city name, latitude/longitude, or DMS string."
    )

    def dms_to_decimal(dms):
        """
        Convert DMS (Degrees, Minutes, Seconds) to Decimal format.
        Input should be in the format of "DD MM SS" or "D M S".

        Raises ValueError when the text does not match the pattern.
        """
        # Three integer groups separated by any run of non-digit characters
        pattern = r"(\d{1,3})[^\d]*?(\d{1,2})[^\d]*?(\d{1,2})"
        match = re.match(pattern, dms)
        if not match:
            raise ValueError("Invalid DMS format")
        degrees, minutes, seconds = map(float, match.groups())
        return degrees + (minutes / 60) + (seconds / 3600)

    # Honor the caller-supplied user agent instead of a hard-coded one
    geolocator = Nominatim(user_agent=user_agent)

    # Case 1: Input is a place name (any string that is not a bare number)
    if isinstance(input_data, str) and not re.match(r"^\d+(\.\d+)?$", input_data):
        location = geolocator.geocode(input_data)
        if verbose:
            # The geocoder returns None when there is no match
            if location is not None:
                print(
                    f"Latitude and Longitude for {input_data}: {location.latitude}, {location.longitude}"
                )
            else:
                print(f"Could not find {input_data}.")
        return location

    # Case 2: Input is latitude and longitude (float or tuple)
    if isinstance(input_data, (float, tuple)):
        if isinstance(input_data, float):
            latitude = input_data
            longitude = None  # No longitude provided for a single float
        elif len(input_data) == 2:
            latitude, longitude = input_data
        else:
            # A tuple of any other length cannot be read as coordinates
            print(invalid_input_msg)
            return None

        # Compare against None so a longitude of 0.0 is not discarded
        location_reversed = geolocator.reverse(
            (latitude, longitude) if longitude is not None else latitude
        )
        if verbose:
            if location_reversed is not None:
                print(
                    f"Address from coordinates ({latitude}, {longitude if longitude is not None else ''}): {location_reversed.address}"
                )
            else:
                print("Could not reverse geocode the coordinates.")
        return location_reversed

    # Case 3: Remaining strings (bare numbers) are treated as DMS text
    if isinstance(input_data, str):
        try:
            decimal_lat = dms_to_decimal(input_data)
        except ValueError:
            print(invalid_input_msg)
            return None
        if verbose:
            print(f"Converted DMS to decimal latitude: {decimal_lat}")

        location_reversed = geolocator.reverse(decimal_lat)
        if verbose:
            if location_reversed is not None:
                print(f"Address from coordinates: {location_reversed.address}")
            else:
                print("Could not reverse geocode the coordinates.")
        return location_reversed

    # Any other input type is not handled
    return None
9609
+
9610
def enpass(code: str, method: str = "AES", key: str = None):
    """
    Master encryption function that supports multiple methods: AES, RSA, and SHA256.

    usage: enpass("admin")

    :param code: The input data to encrypt or hash.
    :param method: The encryption or hashing method ('AES', 'RSA', or 'SHA256').
                   The name is resolved through `strcmp`, so close spellings are accepted.
    :param key: For AES, a password (a fixed fallback password is used when omitted).
                For RSA, the public key in PEM format (required).
                Ignored for SHA256.
    :return: For AES/RSA, the base64-encoded ciphertext; for SHA256, the hex digest.
    :raises ValueError: If method is RSA and no key is given, or the method is unsupported.
    """
    import hashlib

    # AES Encryption (Advanced)
    def aes_encrypt(data: str, key: str):
        """
        Encrypts data using AES algorithm in CBC mode.
        :param data: The data to encrypt.
        :param key: The password from which the AES key is derived.
        :return: base64 of (IV + ciphertext).
        """
        from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
        from cryptography.hazmat.backends import default_backend
        from cryptography.hazmat.primitives import padding
        import base64
        import os

        # Derive a 256-bit key from the provided password
        key_bytes = hashlib.sha256(key.encode()).digest()

        # Fresh random IV per message; 16 bytes for the AES block size
        iv = os.urandom(16)

        # Pad the data to a multiple of 16 bytes using PKCS7
        padder = padding.PKCS7(128).padder()
        padded_data = padder.update(data.encode()) + padder.finalize()

        # NOTE(review): CBC gives confidentiality only; the output carries no
        # integrity check. Kept as-is so existing ciphertexts stay decryptable.
        cipher = Cipher(algorithms.AES(key_bytes), modes.CBC(iv), backend=default_backend())
        encryptor = cipher.encryptor()
        encrypted_data = encryptor.update(padded_data) + encryptor.finalize()

        # The IV is prepended so the decryptor can recover it
        return base64.b64encode(iv + encrypted_data).decode()

    # RSA Encryption (Advanced)
    def rsa_encrypt(data: str, public_key: str):
        """
        Encrypts data using RSA encryption with OAEP padding.
        :param data: The data to encrypt.
        :param public_key: The public key in PEM format.
        :return: The encrypted data, base64 encoded.
        """
        import base64
        from Crypto.PublicKey import RSA
        from Crypto.Cipher import PKCS1_OAEP

        public_key_obj = RSA.import_key(public_key)
        cipher_rsa = PKCS1_OAEP.new(public_key_obj)
        encrypted_data = cipher_rsa.encrypt(data.encode())
        return base64.b64encode(encrypted_data).decode()

    # SHA256 Hashing (Non-reversible)
    def sha256_hash(data: str):
        """
        Generates a SHA256 hash of the data.
        :param data: The data to hash.
        :return: The hashed value (hex string).
        """
        return hashlib.sha256(data.encode()).hexdigest()

    # Resolve the requested method name to one of the supported names
    method = strcmp(method, ["AES", "RSA", "SHA256"])[0]

    if method == "AES":
        if key is None:
            # NOTE(review): fixed fallback password; pass an explicit key for
            # anything that actually needs to stay secret.
            key = "worldpeace"
        return aes_encrypt(code, key)
    if method == "RSA":
        if key is None:
            # The AES fallback password is not a PEM key; fail with a clear message
            raise ValueError("RSA encryption requires a public key in PEM format.")
        return rsa_encrypt(code, key)
    if method == "SHA256":
        return sha256_hash(code)
    raise ValueError("Unsupported encryption method")
9685
+
9686
+
9687
+ # Master Decryption Function (Supports AES, RSA)
9688
def depass(encrypted_code: str, method: str = "AES", key: str = None):
    """
    Master decryption function that supports multiple methods: AES and RSA.

    :param encrypted_code: The encrypted data to decrypt (base64 encoded).
    :param method: The encryption method ('AES' or 'RSA'). The name is resolved
                   through `strcmp`, so close spellings are accepted.
    :param key: For AES, the password used for encryption (a fixed fallback
                password is used when omitted). For RSA, the private key in
                PEM format (required).
    :return: The decrypted data.
    :raises ValueError: If method is SHA256 (a hash cannot be decrypted), if
                        method is RSA and no key is given, or the method is unsupported.
    """
    import hashlib

    def aes_decrypt(encrypted_data: str, key: str):
        """
        Decrypts data encrypted using AES in CBC mode.
        :param encrypted_data: base64 of (IV + ciphertext).
        :param key: The password from which the AES key is derived.
        :return: The decrypted data (string).
        """
        from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
        from cryptography.hazmat.backends import default_backend
        from cryptography.hazmat.primitives import padding
        import base64

        # Derive the same 256-bit key from the password
        key_bytes = hashlib.sha256(key.encode()).digest()

        # Decode from base64, then split off the IV
        raw = base64.b64decode(encrypted_data)
        iv = raw[:16]  # First 16 bytes are the IV
        ciphertext = raw[16:]  # Remaining data is the encrypted message

        # Create AES cipher object using CBC mode
        cipher = Cipher(algorithms.AES(key_bytes), modes.CBC(iv), backend=default_backend())
        decryptor = cipher.decryptor()
        decrypted_data = decryptor.update(ciphertext) + decryptor.finalize()

        # Remove the PKCS7 padding
        unpadder = padding.PKCS7(128).unpadder()
        unpadded_data = unpadder.update(decrypted_data) + unpadder.finalize()

        return unpadded_data.decode()

    def rsa_decrypt(encrypted_data: str, private_key: str):
        """
        Decrypts RSA-encrypted data using the private key.
        :param encrypted_data: The encrypted data, base64 encoded.
        :param private_key: The private key in PEM format.
        :return: The decrypted data (string).
        """
        from Crypto.PublicKey import RSA
        from Crypto.Cipher import PKCS1_OAEP
        import base64

        ciphertext = base64.b64decode(encrypted_data)
        private_key_obj = RSA.import_key(private_key)
        cipher_rsa = PKCS1_OAEP.new(private_key_obj)
        decrypted_data = cipher_rsa.decrypt(ciphertext)
        return decrypted_data.decode()

    # Resolve the requested method name to one of the known names
    method = strcmp(method, ["AES", "RSA", "SHA256"])[0]

    if method == "AES":
        if key is None:
            # Same fixed fallback password the encryption side uses when no key is given
            key = "worldpeace"
        return aes_decrypt(encrypted_code, key)
    if method == "RSA":
        if key is None:
            # The AES fallback password is not a PEM key; fail with a clear message
            raise ValueError("RSA decryption requires a private key in PEM format.")
        return rsa_decrypt(encrypted_code, key)
    if method == "SHA256":
        raise ValueError("SHA256 is a hash function and cannot be decrypted.")
    raise ValueError("Unsupported decryption method")