py2ls 0.2.4.29__py3-none-any.whl → 0.2.4.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -779,11 +779,150 @@ def strcmp(
779
779
  print(f"建议: {best_match}")
780
780
  return candidates[best_match_index], best_match_index
781
781
 
782
def imgcmp(img: list, method='knn', plot_=True, figsize=(12, 6)):
    """
    Compare two images using SSIM, cross-checked SIFT matching, or KNN SIFT matching.

    Parameters:
    - img (list): List containing two image file paths [img1, img2].
    - method (str): Comparison method ('ssim', 'match', or 'knn'). The value is
      resolved against these three names with strcmp.
    - plot_ (bool): Whether to display the results visually.
    - figsize (sequence): Size of the figure for plots.

    Returns:
    - For 'ssim': (diff, score): SSIM map scaled to a uint8 image, and the SSIM score.
    - For 'match' or 'knn': (good_matches, similarity_score, homography_matrix).
      similarity_score is len(good_matches) divided by the smaller of the two
      keypoint counts. When there are no good matches the result is
      (good_matches, 0.0, None). homography_matrix is also None when fewer than
      four good matches exist or RANSAC returns no model.

    Raises:
    - ValueError: if img does not hold exactly two paths, if an image cannot be
      loaded, or if no keypoints are found in one of the images.
    """
    # Validate before the heavy imports so a bad call fails fast and clearly.
    if img is None or len(img) != 2:
        raise ValueError("img must contain exactly two image paths: [img1, img2].")

    import cv2
    import numpy as np
    import matplotlib.pyplot as plt
    from skimage.metrics import structural_similarity as ssim

    # Load images
    image1 = cv2.imread(img[0])
    image2 = cv2.imread(img[1])
    if image1 is None or image2 is None:
        raise ValueError("Could not load one or both images. Check file paths.")

    methods = ['ssim', 'match', 'knn']
    method = strcmp(method, methods)[0]

    # Both families of methods work on grayscale versions of the inputs.
    gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
    gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)

    if method == 'ssim':
        score, diff = ssim(gray1, gray2, full=True)
        print(f"SSIM Score: {score:.4f}")

        # Local SSIM values can be negative; clip before scaling so they do
        # not wrap around when cast to uint8.
        diff = (np.clip(diff, 0, 1) * 255).astype("uint8")

        if plot_:
            fig, ax = plt.subplots(1, 3, figsize=figsize)
            ax[0].imshow(gray1, cmap='gray')
            ax[0].set_title("Image 1")
            ax[1].imshow(gray2, cmap='gray')
            ax[1].set_title("Image 2")
            ax[2].imshow(diff, cmap='gray')
            ax[2].set_title("Difference (SSIM)")
            plt.tight_layout()
            plt.show()

        return diff, score

    if method in ('match', 'knn'):
        # Detect SIFT keypoints and compute their descriptors.
        sift = cv2.SIFT_create()
        keypoints1, descriptors1 = sift.detectAndCompute(gray1, None)
        keypoints2, descriptors2 = sift.detectAndCompute(gray2, None)

        if len(keypoints1) == 0 or len(keypoints2) == 0:
            raise ValueError("No keypoints found in one or both images.")

        if method == 'match':  # Cross-check matching
            bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
            matches = sorted(
                bf.match(descriptors1, descriptors2), key=lambda m: m.distance
            )
            # Keep matches clearly better than the worst one. Cross-checking
            # may return nothing, so guard the matches[-1] lookup.
            good_matches = []
            if matches:
                cutoff = 0.75 * matches[-1].distance
                good_matches = [m for m in matches if m.distance < cutoff]
        else:  # KNN matching with Lowe's ratio test
            bf = cv2.BFMatcher()
            matches = bf.knnMatch(descriptors1, descriptors2, k=2)
            # A query can come back with fewer than two neighbours (e.g. when
            # the second image has a single descriptor); skip those entries.
            good_matches = [
                pair[0]
                for pair in matches
                if len(pair) == 2 and pair[0].distance < 0.75 * pair[1].distance
            ]

        # Calculate similarity score
        similarity_score = len(good_matches) / min(len(keypoints1), len(keypoints2))
        print(f"Number of good matches: {len(good_matches)}")
        print(f"Similarity Score: {similarity_score:.4f}")

        # Handle case where no good matches are found
        if len(good_matches) == 0:
            print("No good matches found.")
            return good_matches, 0.0, None

        # A homography needs at least four point correspondences.
        homography_matrix = None
        if len(good_matches) >= 4:
            src_pts = np.float32(
                [keypoints1[m.queryIdx].pt for m in good_matches]
            ).reshape(-1, 1, 2)
            dst_pts = np.float32(
                [keypoints2[m.trainIdx].pt for m in good_matches]
            ).reshape(-1, 1, 2)
            homography_matrix, _ = cv2.findHomography(
                src_pts, dst_pts, cv2.RANSAC, 5.0
            )
        else:
            print("Fewer than 4 good matches; homography not estimated.")

        if plot_:
            if homography_matrix is not None:
                # The matrix maps image1 coordinates to image2 coordinates, so
                # it is applied as an inverse map to bring image2 into the
                # frame of image1.
                h, w = image1.shape[:2]
                warped_image2 = cv2.warpPerspective(
                    image2,
                    homography_matrix,
                    (w, h),
                    flags=cv2.INTER_LINEAR | cv2.WARP_INVERSE_MAP,
                )
                fig, ax = plt.subplots(1, 2, figsize=figsize)
                ax[0].imshow(cv2.cvtColor(image1, cv2.COLOR_BGR2RGB))
                ax[0].set_title("Image 1")
                ax[1].imshow(cv2.cvtColor(warped_image2, cv2.COLOR_BGR2RGB))
                ax[1].set_title("Warped Image 2")
                plt.tight_layout()
                plt.show()

            # Draw the retained matches (flags=2: do not draw unmatched points).
            result = cv2.drawMatches(
                image1, keypoints1, image2, keypoints2, good_matches, None, flags=2
            )
            plt.figure(figsize=figsize)
            plt.imshow(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))
            plt.title(
                f"Feature Matches ({len(good_matches)} matches, Score: {similarity_score:.4f})"
            )
            plt.axis('off')
            plt.show()

            # Mark the keypoints that took part in no good match. Sets keep
            # the membership tests cheap for images with many keypoints.
            matched_idx1 = {m.queryIdx for m in good_matches}
            matched_idx2 = {m.trainIdx for m in good_matches}
            unmatched_kp1 = [
                kp for i, kp in enumerate(keypoints1) if i not in matched_idx1
            ]
            unmatched_kp2 = [
                kp for i, kp in enumerate(keypoints2) if i not in matched_idx2
            ]
            img1_marked = cv2.drawKeypoints(
                image1,
                unmatched_kp1,
                None,
                color=(0, 0, 255),
                flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS,
            )
            img2_marked = cv2.drawKeypoints(
                image2,
                unmatched_kp2,
                None,
                color=(0, 0, 255),
                flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS,
            )
            fig, ax = plt.subplots(1, 2, figsize=figsize)
            ax[0].imshow(cv2.cvtColor(img1_marked, cv2.COLOR_BGR2RGB))
            ax[0].set_title("Unmatched Keypoints (Image 1)")
            ax[1].imshow(cv2.cvtColor(img2_marked, cv2.COLOR_BGR2RGB))
            ax[1].set_title("Unmatched Keypoints (Image 2)")
            plt.tight_layout()
            plt.show()

        return good_matches, similarity_score, homography_matrix

    # Defensive: only reached if strcmp returns a name outside `methods`.
    raise ValueError("Invalid method. Use 'ssim', 'match', or 'knn'.")
787
926
 
788
927
 
789
928
  def cn2pinyin(
@@ -892,6 +1031,143 @@ def dict2df(dict_, fill=None):
892
1031
  dict_[key] = value
893
1032
  return pd.DataFrame.from_dict(dict_)
894
1033
 
1034
def text2audio(
    text,
    method=None,  # "pyttsx3","gTTS"
    rate=200,
    slow=False,  # "gTTS"
    volume=1.0,
    voice=None,
    lang=None,
    gender=None,
    age=None,
    dir_save=None,
):
    """
    Speak `text` aloud or save it as an audio file.

    Parameters:
    - text (str): Text to convert to speech.
    - method (str | None): "pyttsx3", "gTTS" or "google" (resolved with
      strcmp). When None, the Google backend is tried first and pyttsx3 is
      used if that raises.
    - rate: Speech rate passed to pyttsx3.
    - slow (bool): Passed to gTTS as its `slow` option.
    - volume (float): pyttsx3 volume; must be within 0.0-1.0.
    - voice (str | None): Voice name or id to select in pyttsx3. When None a
      voice is chosen by matching `lang` (or the detected language of the
      text) against the installed voice names.
    - lang (str | None): Language; detected from the text when None.
    - gender (str | None): "male" or "female" (pyttsx3 voice-name filter).
    - age (int | float | str | None): "child", "adult" or "senior", or a
      number that is mapped onto those labels (pyttsx3 voice-name filter).
    - dir_save (str | None): Output path. When None, pyttsx3 speaks directly
      and the Google backend writes "temp_audio.mp3" and opens it.

    Example:
        text2audio(text="Hello! This is a test.", rate=150, volume=0.9)

    Notes:
    - Errors inside the pyttsx3 branch are printed, not raised.
    - Errors while building the gTTS request are printed and re-raised so the
      automatic fallback (method=None) can switch to pyttsx3.
    """
    if method is None:
        # No backend requested: try Google first, then the offline engine.
        options = dict(
            rate=rate,
            slow=slow,
            volume=volume,
            voice=voice,
            lang=lang,
            gender=gender,
            age=age,
            dir_save=dir_save,
        )
        try:
            text2audio(text, method='google', **options)
        except Exception as e:
            print(e)
            text2audio(text, method='pyttsx3', **options)
        # The recursive call did all the work; without this return the code
        # below would run with method still None and fail on method.lower().
        return

    methods = ["gTTS", "pyttsx3", "google"]
    method = strcmp(method, methods)[0]

    if method == "pyttsx3":
        import pyttsx3

        engine = None  # stays None if init() itself fails
        try:
            engine = pyttsx3.init()
            engine.setProperty("rate", rate)
            if not 0.0 <= volume <= 1.0:
                raise ValueError("Volume must be between 0.0 and 1.0")
            engine.setProperty("volume", volume)

            if gender is not None:
                gender = strcmp(gender, ["male", "female"])[0]
            if age is not None:
                if isinstance(age, (float, int)):
                    # Map a numeric age to a label: under 18 is "child",
                    # 65 and over is "senior", everything else "adult".
                    if age < 18:
                        age = "child"
                    elif age >= 65:
                        age = "senior"
                    else:
                        age = "adult"
                elif isinstance(age, str):
                    age = strcmp(age, ["child", "adult", "senior"])[0]
                else:
                    raise ValueError("age: should be in ['child', 'adult', 'senior']")

            voices = engine.getProperty("voices")
            voice_names = [v.name for v in voices]
            if voice is None:
                if lang is None:
                    voice = strcmp(detect_lang(text), voice_names)[0]
                else:
                    if run_once_within():
                        print(voice_names)
                    print(f"lang:{lang}")
                    voice = strcmp(lang, voice_names)[0]

            selected_voice = None
            for v in voices:
                # An explicit voice match wins immediately; otherwise remember
                # the latest voice whose name mentions the gender or age.
                if voice and (voice.lower() in v.name.lower() or voice in v.id):
                    selected_voice = v
                    break
                if gender and gender.lower() in v.name.lower():
                    selected_voice = v
                if age and age.lower() in v.name.lower():
                    selected_voice = v

            if selected_voice:
                engine.setProperty("voice", selected_voice.id)
            elif voice or gender or age:
                raise ValueError(
                    f"No matching voice found for specified criteria. Available voices: {voice_names}"
                )

            # Generate audio
            if dir_save:
                engine.save_to_file(text, dir_save)
            else:
                engine.say(text)
            engine.runAndWait()
            if dir_save:
                # Report only after the queued command has actually run.
                print(f"Audio saved to {dir_save}")
        except Exception as e:
            print(f"An error occurred: {e}")
            # Stop the engine to release resources, if it was created at all.
            if engine is not None:
                try:
                    engine.stop()
                except RuntimeError:
                    pass
    elif method.lower() in ['google', 'gtts']:
        import os

        from gtts import gTTS

        try:
            if lang is None:
                from langdetect import detect

                lang = detect(text)
            # Initialize gTTS with the provided parameters
            tts = gTTS(text=text, lang=lang, slow=slow)
        except Exception as e:
            print(f"An error occurred: {e}")
            # Without a tts object nothing below can work; re-raise instead
            # of failing later with an unrelated NameError.
            raise

        print("not realtime reading...")
        if dir_save:
            # Append ".mp3" only when the file name itself has no extension;
            # dots in directory names do not count.
            if not os.path.splitext(dir_save)[1]:
                dir_save = dir_save + ".mp3"
            tts.save(dir_save)
            print(f"Audio saved to {dir_save}")
        else:
            dir_save = "temp_audio.mp3"
            tts.save(dir_save)
            try:
                fopen(dir_save)
            except Exception as e:
                print(f"Error opening file: {e}")
        print("done")
1170
+
895
1171
  def str2time(time_str, fmt="24"):
896
1172
  """
897
1173
  Convert a time string into the specified format.
@@ -2094,7 +2370,7 @@ def fload(fpath, kind=None, **kwargs):
2094
2370
  False if chunksize else True
2095
2371
  ) # when chunksize, recommend low_memory=False # default:
2096
2372
  verbose = kwargs.pop("verbose", False)
2097
- if run_once_within(reverse=True):
2373
+ if run_once_within(reverse=True) and verbose:
2098
2374
  use_pd("read_csv", verbose=verbose)
2099
2375
 
2100
2376
  if comment is None:# default: None
@@ -2212,7 +2488,7 @@ def fload(fpath, kind=None, **kwargs):
2212
2488
  if chunksize:
2213
2489
  df = _get_chunks(df)
2214
2490
  print(df.shape)
2215
- if not is_df_abnormal(df, verbose=0): # normal
2491
+ if not is_df_abnormal(df, verbose=0) and verbose: # normal
2216
2492
  display(df.head(2))
2217
2493
  print(f"shape: {df.shape}")
2218
2494
  return df
@@ -2245,26 +2521,28 @@ def fload(fpath, kind=None, **kwargs):
2245
2521
  df = _get_chunks(df)
2246
2522
  print(df.shape)
2247
2523
  if not is_df_abnormal(df, verbose=0):
2248
- (
2249
- display(df.head(2))
2250
- if isinstance(df, pd.DataFrame)
2251
- else display("it is not a DataFrame")
2252
- )
2253
- (
2254
- print(f"shape: {df.shape}")
2255
- if isinstance(df, pd.DataFrame)
2256
- else display("it is not a DataFrame")
2257
- )
2524
+ if verbose:
2525
+ (
2526
+ display(df.head(2))
2527
+ if isinstance(df, pd.DataFrame)
2528
+ else display("it is not a DataFrame")
2529
+ )
2530
+ (
2531
+ print(f"shape: {df.shape}")
2532
+ if isinstance(df, pd.DataFrame)
2533
+ else display("it is not a DataFrame")
2534
+ )
2258
2535
  return df
2259
2536
  except EmptyDataError as e:
2260
2537
  continue
2261
2538
  else:
2262
2539
  pass
2263
- print(kwargs)
2540
+ # print(kwargs)
2264
2541
  # if is_df_abnormal(df,verbose=verbose):
2265
2542
  # df=pd.read_csv(fpath,**kwargs)
2266
- display(df.head(2))
2267
- print(f"shape: {df.shape}")
2543
+ if verbose:
2544
+ display(df.head(2))
2545
+ print(f"shape: {df.shape}")
2268
2546
  return df
2269
2547
 
2270
2548
  def load_excel(fpath, **kwargs):
@@ -2300,7 +2578,7 @@ def fload(fpath, kind=None, **kwargs):
2300
2578
  engine = kwargs.get("engine", "pyarrow")
2301
2579
  verbose = kwargs.pop("verbose", False)
2302
2580
 
2303
- if run_once_within(reverse=True):
2581
+ if run_once_within(reverse=True) and verbose:
2304
2582
  use_pd("read_parquet", verbose=verbose)
2305
2583
  try:
2306
2584
  df = pd.read_parquet(fpath, engine=engine, **kwargs)
@@ -2385,6 +2663,16 @@ def fload(fpath, kind=None, **kwargs):
2385
2663
  doc = Document(fpath)
2386
2664
  content = [para.text for para in doc.paragraphs]
2387
2665
  return content
2666
+
2667
def load_rtf(file_path, encoding=None):
    """
    Read an RTF file and return its content as plain text.

    Parameters:
    - file_path (str): Path to the .rtf file.
    - encoding (str | None): Text encoding used to read the file. None keeps
      the previous behaviour (the platform default encoding).

    Returns:
    - str: The text produced by striprtf's rtf_to_text, or None when reading
      or converting fails (the error is printed, not raised).
    """
    from striprtf.striprtf import rtf_to_text

    try:
        with open(file_path, "r", encoding=encoding) as file:
            return rtf_to_text(file.read())
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading RTF file: {e}")
        return None
2388
2676
 
2389
2677
  if kind is None:
2390
2678
  _, kind = os.path.splitext(fpath)
@@ -2427,6 +2715,7 @@ def fload(fpath, kind=None, **kwargs):
2427
2715
  "xml",
2428
2716
  "ipynb",
2429
2717
  "mtx",
2718
+ "rtf"
2430
2719
  ]
2431
2720
  zip_types = [
2432
2721
  "gz",
@@ -2446,22 +2735,7 @@ def fload(fpath, kind=None, **kwargs):
2446
2735
  if kind not in supported_types:
2447
2736
  print(
2448
2737
  f'Warning:\n"{kind}" is not in the supported list '
2449
- ) # {supported_types}')
2450
- # if os.path.splitext(fpath)[1][1:].lower() in zip_types:
2451
- # keep=kwargs.get("keep", False)
2452
- # ifile=kwargs.get("ifile",(0,0))
2453
- # kwargs.pop("keep",None)
2454
- # kwargs.pop("ifile",None)
2455
- # fpath_unzip=unzip(fpath)
2456
- # if isinstance(fpath_unzip,list):
2457
- # fpath_unzip=fpath_unzip[ifile[0]]
2458
- # if os.path.isdir(fpath_unzip):
2459
- # fpath_selected=listdir(fpath_unzip,kind=kind).fpath[ifile[1]]
2460
- # fpath_unzip=fpath_selected
2461
- # content_unzip=fload(fpath_unzip, **kwargs)
2462
- # if not keep:
2463
- # os.remove(fpath_unzip)
2464
- # return content_unzip
2738
+ ) # {supported_types}')
2465
2739
 
2466
2740
  if kind == "docx":
2467
2741
  return load_docx(fpath)
@@ -2477,37 +2751,45 @@ def fload(fpath, kind=None, **kwargs):
2477
2751
  return load_xml(fpath)
2478
2752
  elif kind in ["csv", "tsv"]:
2479
2753
  # verbose = kwargs.pop("verbose", False)
2480
- if run_once_within(reverse=True):
2481
- use_pd("read_csv")
2754
+ # if run_once_within(reverse=True) and verbose:
2755
+ # use_pd("read_csv")
2482
2756
  content = load_csv(fpath, **kwargs)
2483
2757
  return content
2484
2758
  elif kind == "pkl":
2485
2759
  verbose = kwargs.pop("verbose", False)
2486
- if run_once_within(reverse=True):
2760
+ if run_once_within(reverse=True) and verbose:
2487
2761
  use_pd("read_pickle")
2488
- return pd.read_pickle(fpath, **kwargs)
2762
+ try:
2763
+ res_=pd.read_pickle(fpath, **kwargs)
2764
+ except Exception as e:
2765
+ import pickle
2766
+ with open('sgd_classifier.pkl', 'rb') as f:
2767
+ res_ = pickle.load(f)
2768
+ return res_
2489
2769
  elif kind in ["ods", "ods", "odt"]:
2490
2770
  engine = kwargs.get("engine", "odf")
2491
2771
  kwargs.pop("engine", None)
2492
2772
  return load_excel(fpath, engine=engine, **kwargs)
2493
2773
  elif kind == "xls":
2774
+ verbose = kwargs.pop("verbose", False)
2494
2775
  engine = kwargs.get("engine", "xlrd")
2495
2776
  kwargs.pop("engine", None)
2496
2777
  content = load_excel(fpath, engine=engine, **kwargs)
2497
- print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
2778
+ print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) and verbose else None
2498
2779
  display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2499
2780
  return content
2500
2781
  elif kind == "xlsx":
2782
+ verbose = kwargs.pop("verbose", False)
2501
2783
  content = load_excel(fpath, **kwargs)
2502
- display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2784
+ display(content.head(3)) if isinstance(content, pd.DataFrame) and verbose else None
2503
2785
  print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
2504
2786
  return content
2505
2787
  elif kind == "mtx":
2506
2788
  from scipy.io import mmread
2507
-
2789
+ verbose = kwargs.pop("verbose", False)
2508
2790
  dat_mtx = mmread(fpath)
2509
2791
  content = pd.DataFrame.sparse.from_spmatrix(dat_mtx, **kwargs)
2510
- display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2792
+ display(content.head(3)) if isinstance(content, pd.DataFrame) and verbose else None
2511
2793
  print(f"shape: {content.shape}")
2512
2794
  return content
2513
2795
  elif kind == "ipynb":
@@ -2578,6 +2860,8 @@ def fload(fpath, kind=None, **kwargs):
2578
2860
 
2579
2861
  elif kind == "mplstyle":
2580
2862
  return read_mplstyle(fpath)
2863
+ elif kind == "rtf":
2864
+ return load_rtf(fpath)
2581
2865
 
2582
2866
  else:
2583
2867
  print("direct reading...")
@@ -2616,6 +2900,38 @@ def fload(fpath, kind=None, **kwargs):
2616
2900
  # docx_content = fload('sample.docx')
2617
2901
 
2618
2902
 
2903
def fopen(fpath):
    """
    Open a file with the operating system's default application.

    Parameters:
    - fpath (str): Path of the file to open.

    Returns:
    - None. Progress and errors are reported with print(); nothing is raised.

    Notes:
    - The path is passed to the launcher as a single argument (no shell), so
      quotes, spaces and shell metacharacters in the path are harmless.
    - The success message is printed only when a launcher was actually
      started without error.
    """
    import os
    import platform
    import subprocess

    # Check if the file exists
    if not os.path.isfile(fpath):
        print(f"Error: The file does not exist - {fpath}")
        return

    system = platform.system()
    try:
        if system == "Darwin":  # macOS
            subprocess.run(["open", fpath], check=True)
        elif system == "Windows":
            # os.startfile exists only on Windows; it opens the file with
            # the associated application.
            os.startfile(fpath)
        elif system == "Linux":
            subprocess.run(["xdg-open", fpath], check=True)
        elif system == "Java":  # Jython: no default-application launcher here
            print(f"Opening {fpath} on unsupported system.")
            return
        else:
            print(f"Unsupported OS: {system}")
            return
    except Exception as e:
        print(f"Error opening file {fpath}: {e}")
        return

    print(f"Successfully opened {fpath} with the default application.")
2932
+
2933
+
2934
+
2619
2935
  def fupdate(fpath, content=None, how="head"):
2620
2936
  """
2621
2937
  Update a file by adding new content at the top and moving the old content to the bottom.
@@ -3025,13 +3341,18 @@ def fsave(
3025
3341
  content.to_pickle(fpath, **kwargs)
3026
3342
  else:
3027
3343
  try:
3028
- print("trying to convert it as a DataFrame...")
3029
3344
  content = pd.DataFrame(content)
3030
3345
  content.to_pickle(fpath, **kwargs)
3031
3346
  except Exception as e:
3032
- raise ValueError(
3033
- f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
3034
- )
3347
+ try:
3348
+ import pickle
3349
+ with open(fpath, 'wb') as f:
3350
+ pickle.dump(content, f)
3351
+ print('done!', fpath)
3352
+ except Exception as e:
3353
+ raise ValueError(
3354
+ f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
3355
+ )
3035
3356
  elif kind.lower() in ["fea", "feather", "ft", "fe", "feat", "fether"]:
3036
3357
  # Feather: The Feather format, based on Apache Arrow, is designed for fast I/O operations. It's
3037
3358
  # optimized for data analytics tasks and is especially fast when working with Pandas.
@@ -3187,16 +3508,22 @@ def isa(content, kind):
3187
3508
  """
3188
3509
  if "img" in kind.lower() or "image" in kind.lower():
3189
3510
  return is_image(content)
3511
+ elif 'vid' in kind.lower():
3512
+ return is_video(content)
3513
+ elif 'aud' in kind.lower():
3514
+ return is_audio(content)
3190
3515
  elif "doc" in kind.lower():
3191
3516
  return is_document(content)
3192
3517
  elif "zip" in kind.lower():
3193
3518
  return is_zip(content)
3194
3519
  elif "dir" in kind.lower() or ("f" in kind.lower() and "d" in kind.lower()):
3195
3520
  return os.path.isdir(content)
3521
+ elif "code" in kind.lower(): # file
3522
+ return is_code(content)
3196
3523
  elif "fi" in kind.lower(): # file
3197
3524
  return os.path.isfile(content)
3198
3525
  elif "num" in kind.lower(): # file
3199
- return os.path.isfile(content)
3526
+ return isnum(content)
3200
3527
  elif "text" in kind.lower() or "txt" in kind.lower(): # file
3201
3528
  return is_text(content)
3202
3529
  elif "color" in kind.lower(): # file
@@ -3607,7 +3934,7 @@ def get_os(full=False, verbose=False):
3607
3934
  "usage (%)": usage.percent,
3608
3935
  }
3609
3936
  except PermissionError:
3610
- system_info["Disk"][partition.device] = "Permission Denied"
3937
+ system_info["disk"][partition.device] = "Permission Denied"
3611
3938
 
3612
3939
  # Network Information
3613
3940
  if_addrs = psutil.net_if_addrs()
@@ -3667,11 +3994,33 @@ def listdir(
3667
3994
  ascending=True,
3668
3995
  contains=None,# filter filenames using re
3669
3996
  booster=False,# walk in subfolders
3997
+ depth = 0, # 0: no subfolders; None: all subfolders; [int 1,2,3]: levels of subfolders
3670
3998
  hidden=False, # Include hidden files/folders
3671
3999
  orient="list",
3672
4000
  output="df", # "df", 'list','dict','records','index','series'
3673
4001
  verbose=True,
3674
- ):
4002
+ ):
4003
+ def is_hidden(filepath):
4004
+ """Check if a file or folder is hidden."""
4005
+ system = platform.system()
4006
+ if system == "Windows":
4007
+ import ctypes
4008
+ attribute = ctypes.windll.kernel32.GetFileAttributesW(filepath)
4009
+ if attribute == -1:
4010
+ raise FileNotFoundError(f"File {filepath} not found.")
4011
+ return bool(attribute & 2) # FILE_ATTRIBUTE_HIDDEN
4012
+ else: # macOS/Linux: Hidden if the name starts with a dot
4013
+ return os.path.basename(filepath).startswith(".")
4014
+
4015
+ def get_user():
4016
+ """Retrieve the username of the current user."""
4017
+ system = platform.system()
4018
+ if system == "Windows":
4019
+ return os.environ.get("USERNAME", "Unknown")
4020
+ else:
4021
+ import pwd
4022
+ return pwd.getpwuid(os.getuid()).pw_name
4023
+
3675
4024
  if isinstance(kind, list):
3676
4025
  f_ = []
3677
4026
  for kind_ in kind:
@@ -3681,7 +4030,7 @@ def listdir(
3681
4030
  sort_by=sort_by,
3682
4031
  ascending=ascending,
3683
4032
  contains=contains,
3684
- booster=booster,# walk in subfolders
4033
+ depth=depth,# walk in subfolders
3685
4034
  hidden=hidden,
3686
4035
  orient=orient,
3687
4036
  output=output,
@@ -3710,12 +4059,24 @@ def listdir(
3710
4059
  "rootdir":[],
3711
4060
  "fname": [],
3712
4061
  "fpath": [],
4062
+ "num":[],
4063
+ "os":[]
3713
4064
  }
4065
+ root_depth = rootdir.rstrip(os.sep).count(os.sep)
3714
4066
  for dirpath, dirnames, ls in os.walk(rootdir):
4067
+ current_depth = dirpath.rstrip(os.sep).count(os.sep) - root_depth
4068
+ # Check depth limit
4069
+ if depth is not None and current_depth > depth:
4070
+ dirnames[:] = [] # Prevent further traversal into subfolders
4071
+ continue
4072
+
3715
4073
  if not hidden:
3716
- dirnames[:] = [d for d in dirnames if not d.startswith(".")]
3717
- ls = [i for i in ls if not i.startswith(".")]
3718
- for dirname in dirnames:
4074
+ dirnames[:] = [d for d in dirnames if not is_hidden(os.path.join(dirpath, d))]
4075
+ ls = [i for i in ls if not is_hidden(os.path.join(dirpath, i))]
4076
+
4077
+ for dirname in dirnames:
4078
+ if kind is not None and kind not in fd: # do not check folders
4079
+ continue
3719
4080
  if contains and not re.search(contains, dirname):
3720
4081
  continue
3721
4082
  dirname_path = os.path.join(dirpath, dirname)
@@ -3734,21 +4095,23 @@ def listdir(
3734
4095
  f['basename'].append(os.path.basename(dirname_path))
3735
4096
  f["path"].append(os.path.join(os.path.dirname(dirname_path), dirname))
3736
4097
  f["created_time"].append(
3737
- pd.to_datetime(os.path.getctime(dirname_path), unit="s")
4098
+ pd.to_datetime(int(os.path.getctime(dirname_path)), unit="s")
3738
4099
  )
3739
4100
  f["modified_time"].append(
3740
- pd.to_datetime(os.path.getmtime(dirname_path), unit="s")
4101
+ pd.to_datetime(int(os.path.getmtime(dirname_path)), unit="s")
3741
4102
  )
3742
4103
  f["last_open_time"].append(
3743
- pd.to_datetime(os.path.getatime(dirname_path), unit="s")
4104
+ pd.to_datetime(int(os.path.getatime(dirname_path)), unit="s")
3744
4105
  )
3745
4106
  f["permission"].append(stat.filemode(stats_file.st_mode)),
3746
- f["owner"].append(os.getlogin() if platform.system() != "Windows" else "N/A"),
4107
+ f["owner"].append(get_user()),
3747
4108
  f["rootdir"].append(dirpath)
3748
4109
  f["fname"].append(filename) # will be removed
3749
4110
  f["fpath"].append(fpath) # will be removed
3750
4111
  i += 1
3751
- for item in ls:
4112
+ for item in ls:
4113
+ if kind in fd:# only check folders
4114
+ continue
3752
4115
  if contains and not re.search(contains, item):
3753
4116
  continue
3754
4117
  item_path = os.path.join(dirpath, item)
@@ -3760,13 +4123,11 @@ def listdir(
3760
4123
  continue
3761
4124
  filename, file_extension = os.path.splitext(item)
3762
4125
  if kind is not None:
3763
- if not kind.startswith("."):
3764
- kind = "." + kind
3765
4126
  is_folder = kind.lower() in fd and os.path.isdir(item_path)
3766
4127
  is_file = kind.lower() in file_extension.lower() and (
3767
4128
  os.path.isfile(item_path)
3768
4129
  )
3769
- if kind in [".doc", ".img", ".zip"]: # 选择大的类别
4130
+ if kind in [".doc", ".img", ".zip",".code",".file",".image",".video",".audio"]: # 选择大的类别
3770
4131
  if kind != ".folder" and not isa(item_path, kind):
3771
4132
  continue
3772
4133
  elif kind in [".all"]:
@@ -3780,15 +4141,15 @@ def listdir(
3780
4141
  f["length"].append(len(filename))
3781
4142
  f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
3782
4143
  f['basename'].append(os.path.basename(item_path))
3783
- f["path"].append(os.path.join(os.path.dirname(item_path), item))
4144
+ f["path"].append(os.path.join(os.path.dirname(item_path), item))
3784
4145
  f["created_time"].append(
3785
- pd.to_datetime(os.path.getctime(item_path), unit="s")
4146
+ pd.to_datetime(int(os.path.getctime(item_path)), unit="s")
3786
4147
  )
3787
4148
  f["modified_time"].append(
3788
- pd.to_datetime(os.path.getmtime(item_path), unit="s")
4149
+ pd.to_datetime(int(os.path.getmtime(item_path)), unit="s")
3789
4150
  )
3790
4151
  f["last_open_time"].append(
3791
- pd.to_datetime(os.path.getatime(item_path), unit="s")
4152
+ pd.to_datetime(int(os.path.getatime(item_path)), unit="s")
3792
4153
  )
3793
4154
  f["permission"].append(stat.filemode(stats_file.st_mode)),
3794
4155
  f["owner"].append(os.getlogin() if platform.system() != "Windows" else "N/A"),
@@ -3799,13 +4160,13 @@ def listdir(
3799
4160
 
3800
4161
  f["num"] = i
3801
4162
  f["os"] = get_os() # os.uname().machine
3802
- if not booster: # go deeper subfolders
3803
- break
4163
+ # if not booster: # go deeper subfolders
4164
+ # break
3804
4165
  #* convert to pd.DataFrame
3805
4166
  f = pd.DataFrame(f)
3806
4167
  f=f[["basename","name","kind","length","size","num","path","created_time",
3807
4168
  "modified_time","last_open_time","rootdir",
3808
- "fname","fpath","permission","owner","os",]]
4169
+ "permission","owner","os","fname","fpath",]]
3809
4170
  if "nam" in sort_by.lower():
3810
4171
  f = sort_kind(f, by="name", ascending=ascending)
3811
4172
  elif "crea" in sort_by.lower():
@@ -4173,39 +4534,233 @@ def is_num(s):
4173
4534
  def isnum(s):
4174
4535
  return is_num(s)
4175
4536
 
4176
-
4177
4537
  def is_image(fpath):
4538
+ """
4539
+ Determine if a given file is an image based on MIME type and file extension.
4540
+
4541
+ Args:
4542
+ fpath (str): Path to the file.
4543
+
4544
+ Returns:
4545
+ bool: True if the file is a recognized image, False otherwise.
4546
+ """
4178
4547
  import mimetypes
4548
+ # Known image MIME types
4549
+ image_mime_types = {
4550
+ "image/jpeg",
4551
+ "image/png",
4552
+ "image/gif",
4553
+ "image/bmp",
4554
+ "image/webp",
4555
+ "image/tiff",
4556
+ "image/x-icon",
4557
+ "image/svg+xml",
4558
+ "image/heic",
4559
+ "image/heif",
4560
+ }
4179
4561
 
4562
+ # Known image file extensions
4563
+ image_extensions = {
4564
+ ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tif", ".tiff",
4565
+ ".ico", ".svg", ".heic", ".heif",".fig",".jpg"
4566
+ }
4567
+
4568
+ # Get MIME type using mimetypes
4180
4569
  mime_type, _ = mimetypes.guess_type(fpath)
4181
- if mime_type and mime_type.startswith("image"):
4570
+
4571
+ # Check MIME type
4572
+ if mime_type in image_mime_types:
4573
+ return True
4574
+
4575
+ # Fallback: Check file extension
4576
+ ext = os.path.splitext(fpath)[-1].lower() # Get the file extension and ensure lowercase
4577
+ if ext in image_extensions:
4578
+ return True
4579
+
4580
+ return False
4581
+
4582
+ def is_video(fpath):
4583
+ """
4584
+ Determine if a given file is a video based on MIME type and file extension.
4585
+
4586
+ Args:
4587
+ fpath (str): Path to the file.
4588
+
4589
+ Returns:
4590
+ bool: True if the file is a recognized video, False otherwise.
4591
+ """
4592
+ import mimetypes
4593
+ # Known video MIME types
4594
+ video_mime_types = {
4595
+ "video/mp4",
4596
+ "video/quicktime",
4597
+ "video/x-msvideo",
4598
+ "video/x-matroska",
4599
+ "video/x-flv",
4600
+ "video/webm",
4601
+ "video/ogg",
4602
+ "video/x-ms-wmv",
4603
+ "video/x-mpeg",
4604
+ "video/3gpp",
4605
+ "video/avi",
4606
+ "video/mpeg",
4607
+ "video/x-mpeg2",
4608
+ "video/x-ms-asf",
4609
+ }
4610
+
4611
+ # Known video file extensions
4612
+ video_extensions = {
4613
+ ".mp4", ".mov", ".avi", ".mkv", ".flv", ".webm", ".ogv", ".wmv",
4614
+ ".mpg", ".mpeg", ".3gp", ".mpeg2", ".asf", ".ts", ".m4v", ".divx",
4615
+ }
4616
+
4617
+ # Get MIME type using mimetypes
4618
+ mime_type, _ = mimetypes.guess_type(fpath)
4619
+
4620
+ # Check MIME type
4621
+ if mime_type in video_mime_types:
4622
+ return True
4623
+
4624
+ # Fallback: Check file extension
4625
+ ext = os.path.splitext(fpath)[-1].lower() # Get the file extension and ensure lowercase
4626
+ if ext in video_extensions:
4182
4627
  return True
4183
- else:
4184
- return False
4185
4628
 
4629
+ return False
4186
4630
 
4187
4631
  def is_document(fpath):
4632
+ """
4633
+ Determine if a given file is a document based on MIME type and file extension.
4634
+
4635
+ Args:
4636
+ fpath (str): Path to the file.
4637
+
4638
+ Returns:
4639
+ bool: True if the file is a recognized document, False otherwise.
4640
+ """
4188
4641
  import mimetypes
4642
+ # Define known MIME types for documents
4643
+ document_mime_types = {
4644
+ "text/",
4645
+ "application/pdf",
4646
+ "application/msword",
4647
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
4648
+ "application/vnd.ms-excel",
4649
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
4650
+ "application/vnd.ms-powerpoint",
4651
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
4652
+ "application/rtf",
4653
+ "application/x-latex",
4654
+ "application/vnd.oasis.opendocument.text",
4655
+ "application/vnd.oasis.opendocument.spreadsheet",
4656
+ "application/vnd.oasis.opendocument.presentation",
4657
+ }
4658
+
4659
+ # Define extensions for fallback
4660
+ document_extensions = {
4661
+ ".txt",
4662
+ ".log",
4663
+ ".csv",
4664
+ ".json",
4665
+ ".xml",
4666
+ ".pdf",
4667
+ ".doc",
4668
+ ".docx",
4669
+ ".xls",
4670
+ ".xlsx",
4671
+ ".ppt",
4672
+ ".pptx",
4673
+ ".odt",
4674
+ ".ods",
4675
+ ".odp",
4676
+ ".rtf",
4677
+ ".tex",
4678
+ }
4189
4679
 
4680
+ # Get MIME type
4190
4681
  mime_type, _ = mimetypes.guess_type(fpath)
4191
- if mime_type and (
4192
- mime_type.startswith("text/")
4193
- or mime_type == "application/pdf"
4194
- or mime_type == "application/msword"
4195
- or mime_type
4196
- == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
4197
- or mime_type == "application/vnd.ms-excel"
4198
- or mime_type
4199
- == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
4200
- or mime_type == "application/vnd.ms-powerpoint"
4201
- or mime_type
4202
- == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
4203
- ):
4682
+
4683
+ # Check MIME type
4684
+ if mime_type and any(mime_type.startswith(doc_type) for doc_type in document_mime_types):
4685
+ return True
4686
+
4687
+ # Fallback: Check file extension
4688
+ ext = os.path.splitext(fpath)[-1].lower() # Get the extension, ensure it's lowercase
4689
+ if ext in document_extensions:
4690
+ return True
4691
+
4692
+ return False
4693
+
4694
+ def is_audio(fpath):
4695
+ """
4696
+ Determine if a given file is an audio file based on MIME type and file extension.
4697
+
4698
+ Args:
4699
+ fpath (str): Path to the file.
4700
+
4701
+ Returns:
4702
+ bool: True if the file is a recognized audio file, False otherwise.
4703
+ """
4704
+ import mimetypes
4705
+ # Known audio MIME types
4706
+ audio_mime_types = {
4707
+ "audio/mpeg",
4708
+ "audio/wav",
4709
+ "audio/ogg",
4710
+ "audio/aac",
4711
+ "audio/flac",
4712
+ "audio/midi",
4713
+ "audio/x-midi",
4714
+ "audio/x-wav",
4715
+ "audio/x-flac",
4716
+ "audio/pcm",
4717
+ "audio/x-aiff",
4718
+ "audio/x-m4a",
4719
+ }
4720
+
4721
+ # Known audio file extensions
4722
+ audio_extensions = {
4723
+ ".mp3", ".wav", ".ogg", ".aac", ".flac", ".midi", ".m4a",
4724
+ ".aiff", ".pcm", ".wma", ".ape", ".alac", ".opus",
4725
+ }
4726
+
4727
+ # Get MIME type using mimetypes
4728
+ mime_type, _ = mimetypes.guess_type(fpath)
4729
+
4730
+ # Check MIME type
4731
+ if mime_type in audio_mime_types:
4732
+ return True
4733
+
4734
+ # Fallback: Check file extension
4735
+ ext = os.path.splitext(fpath)[-1].lower() # Get the file extension and ensure lowercase
4736
+ if ext in audio_extensions:
4204
4737
  return True
4205
- else:
4206
- return False
4207
4738
 
4739
+ return False
4208
4740
 
4741
+ def is_code(fpath):
4742
+ """
4743
+ Determine if a given file is a code file based on its file extension.
4744
+
4745
+ Args:
4746
+ fpath (str): Path to the file.
4747
+ Note: there is no MIME type check; only the lowercased file extension is examined.
4748
+
4749
+ Returns:
4750
+ bool: True if the file is a recognized code file, False otherwise.
4751
+ """
4752
+ # Known programming and scripting file extensions
4753
+ code_extensions = {
4754
+ ".m", ".py", ".ipynb", ".js", ".html", ".css", ".java", ".cpp", ".h", ".cs", ".go",
4755
+ ".rs", ".sh", ".rb", ".swift", ".ts", ".json", ".xml", ".yaml", ".toml", ".bash", ".r"
4756
+ }
4757
+
4758
+ # Check file extension
4759
+ ext = os.path.splitext(fpath)[-1].lower()
4760
+ if ext in code_extensions:
4761
+ return True
4762
+ return False
4763
+
4209
4764
  def is_zip(fpath):
4210
4765
  import mimetypes
4211
4766
 
@@ -6190,12 +6745,12 @@ def df_astype(
6190
6745
 
6191
6746
 
6192
6747
  # ! DataFrame
6193
- def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
6748
+ def df_sort_values(data, column, by=None, ascending=True, inplace=True, **kwargs):
6194
6749
  """
6195
6750
  Sort a DataFrame by a specified column based on a custom order or by count.
6196
6751
 
6197
6752
  Parameters:
6198
- - df: DataFrame to be sorted.
6753
+ - data: DataFrame to be sorted.
6199
6754
  - column: The name of the column to sort by.
6200
6755
  - by: List specifying the custom order for sorting or 'count' to sort by frequency.
6201
6756
  - ascending: Boolean or list of booleans, default True.
@@ -6211,7 +6766,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
6211
6766
 
6212
6767
  if isinstance(by, str) and "count" in by.lower():
6213
6768
  # Count occurrences of each value in the specified column
6214
- value_counts = df[column].value_counts()
6769
+ value_counts = data[column].value_counts()
6215
6770
 
6216
6771
  # Determine the order based on counts
6217
6772
  count_ascending = kwargs.pop("count_ascending", ascending)
@@ -6220,12 +6775,12 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
6220
6775
  ).index.tolist()
6221
6776
 
6222
6777
  # Convert to a categorical type with the new order
6223
- df[column] = pd.Categorical(df[column], categories=sorted_counts, ordered=True)
6778
+ data[column] = pd.Categorical(data[column], categories=sorted_counts, ordered=True)
6224
6779
  # Set ascending to count_ascending for sorting
6225
6780
  ascending = count_ascending # Adjust ascending for the final sort
6226
6781
  elif isinstance(by, list):
6227
6782
  # Convert the specified column to a categorical type with the custom order
6228
- df[column] = pd.Categorical(df[column], categories=by, ordered=True)
6783
+ data[column] = pd.Categorical(data[column], categories=by, ordered=True)
6229
6784
  else:
6230
6785
  raise ValueError("Custom order must be a list or 'count'.")
6231
6786
 
@@ -6240,7 +6795,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
6240
6795
  return sorted_df
6241
6796
  except Exception as e:
6242
6797
  print(f"Error sorting DataFrame by '{column}': {e}")
6243
- return df
6798
+ return data
6244
6799
 
6245
6800
 
6246
6801
  # # Example usage:
@@ -7742,7 +8297,7 @@ def df_reducer(
7742
8297
  # example:
7743
8298
  # df_reducer(data=data_log, columns=markers, n_components=2)
7744
8299
 
7745
- def df_format(data, threshold_unique=0.5, verbose=False):
8300
+ def get_df_format(data, threshold_unique=0.5, verbose=False):
7746
8301
  """
7747
8302
  Detect the table format: long, wide or uncertain.
7748
8303
 
@@ -7834,13 +8389,16 @@ def df_format(data, threshold_unique=0.5, verbose=False):
7834
8389
  # Step 5: Clustering analysis on numerical columns for correlation in wide format
7835
8390
  numeric_cols = data.select_dtypes(include="number").columns
7836
8391
  if len(numeric_cols) > 1:
7837
- scaled_data = StandardScaler().fit_transform(data[numeric_cols].dropna())
7838
- clustering = AgglomerativeClustering(n_clusters=2).fit(scaled_data.T)
7839
- cluster_labels = pd.Series(clustering.labels_)
7840
- if cluster_labels.nunique() < len(numeric_cols) * 0.5:
7841
- wide_score += 2
7842
- if verbose:
7843
- print("Clustering on columns shows grouping, suggesting wide format.")
8392
+ try:
8393
+ scaled_data = StandardScaler().fit_transform(data[numeric_cols].dropna())
8394
+ clustering = AgglomerativeClustering(n_clusters=2).fit(scaled_data.T)
8395
+ cluster_labels = pd.Series(clustering.labels_)
8396
+ if cluster_labels.nunique() < len(numeric_cols) * 0.5:
8397
+ wide_score += 2
8398
+ if verbose:
8399
+ print("Clustering on columns shows grouping, suggesting wide format.")
8400
+ except Exception as e:
8401
+ print(e) if verbose else None
7844
8402
 
7845
8403
  # Step 6: Inter-column correlation analysis
7846
8404
  if len(numeric_cols) > 1:
@@ -7868,11 +8426,14 @@ def df_format(data, threshold_unique=0.5, verbose=False):
7868
8426
 
7869
8427
  # Step 8: Multi-level clustering on rows to detect block structure for wide format
7870
8428
  if len(numeric_cols) > 1 and n_rows > 5:
7871
- clustering_rows = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
7872
- if pd.Series(clustering_rows.labels_).nunique() < 2:
7873
- wide_score += 2
7874
- if verbose:
7875
- print("Row clustering reveals homogeneity, suggesting wide format.")
8429
+ try:
8430
+ clustering_rows = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
8431
+ if pd.Series(clustering_rows.labels_).nunique() < 2:
8432
+ wide_score += 2
8433
+ if verbose:
8434
+ print("Row clustering reveals homogeneity, suggesting wide format.")
8435
+ except Exception as e:
8436
+ print(e) if verbose else None
7876
8437
 
7877
8438
  # Step 9: Sequential name detection for time-series pattern in wide format
7878
8439
  if any(col.isdigit() or col.startswith("T") for col in col_names):
@@ -7881,15 +8442,18 @@ def df_format(data, threshold_unique=0.5, verbose=False):
7881
8442
  print("Detected time-like sequential column names, supporting wide format.")
7882
8443
 
7883
8444
  # Step 10: Entropy of numeric columns
7884
- numeric_entropy = data[numeric_cols].apply(
7885
- lambda x: entropy(pd.cut(x, bins=10).value_counts(normalize=True))
7886
- )
7887
- if numeric_entropy.mean() < 2:
7888
- wide_score += 2
7889
- if verbose:
7890
- print(
7891
- "Low entropy in numeric columns indicates stability across columns, supporting wide format."
7892
- )
8445
+ try:
8446
+ numeric_entropy = data[numeric_cols].apply(
8447
+ lambda x: entropy(pd.cut(x, bins=10).value_counts(normalize=True))
8448
+ )
8449
+ if numeric_entropy.mean() < 2:
8450
+ wide_score += 2
8451
+ if verbose:
8452
+ print(
8453
+ "Low entropy in numeric columns indicates stability across columns, supporting wide format."
8454
+ )
8455
+ except Exception as e:
8456
+ print(e) if verbose else None
7893
8457
 
7894
8458
  # Step 11: Tie-breaking strategy if scores are equal
7895
8459
  if wide_score == long_score:
@@ -8905,3 +9469,286 @@ def get_phone(phone_number: str, region: str = None,verbose=True):
8905
9469
  if verbose:
8906
9470
  preview(res)
8907
9471
  return res
9472
+
9473
+
9474
+ def decode_pluscode(
9475
+ pluscode: str, reference: tuple = (52.5200, 13.4050), return_bbox: bool = False
9476
+ ):
9477
+ """
9478
+ Decodes a Plus Code into latitude and longitude (and optionally returns a bounding box).
9479
+
9480
+ Parameters:
9481
+ pluscode (str): The Plus Code to decode. Can be full or short.
9482
+ reference (tuple, optional): Reference latitude and longitude for decoding short Plus Codes.
9483
+ Defaults to (52.5200, 13.4050); only consulted when the Plus Code is short.
9484
+ return_bbox (bool): If True, returns the bounding box coordinates (latitude/longitude bounds).
9485
+ Default is False.
9486
+
9487
+ Returns:
9488
+ tuple: (latitude, longitude) if `return_bbox` is False.
9489
+ (latitude, longitude, bbox) if `return_bbox` is True.
9490
+ bbox = (latitudeLo, latitudeHi, longitudeLo, longitudeHi)
9491
+ Raises:
9492
+ ValueError: If the Plus Code is invalid or reference is missing for a short code.
9493
+
9494
+ Usage:
9495
+ lat, lon, bbox = decode_pluscode("7FG6+89", return_bbox=True)
9496
+ print(f"Decoded Short Plus Code: Latitude: {lat}, Longitude: {lon}, Bounding Box: {bbox}")
9497
+
9498
+ lat, lon = decode_pluscode("9F4M7FG6+89")
9499
+ print(f"Decoded Full Plus Code: Latitude: {lat}, Longitude: {lon}")
9500
+ """
9501
+ from openlocationcode import openlocationcode as olc
9502
+
9503
+ # Validate Plus Code
9504
+ if not olc.isValid(pluscode):
9505
+ raise ValueError(f"Invalid Plus Code: {pluscode}")
9506
+
9507
+ # Handle Short Plus Codes
9508
+ if olc.isShort(pluscode):
9509
+ if reference is None:
9510
+ raise ValueError(
9511
+ "Reference location (latitude, longitude) is required for decoding short Plus Codes."
9512
+ )
9513
+ # Recover the full Plus Code using the reference location
9514
+ pluscode = olc.recoverNearest(pluscode, reference[0], reference[1])
9515
+
9516
+ # Decode the Plus Code
9517
+ decoded = olc.decode(pluscode)
9518
+
9519
+ # Calculate the center point of the bounding box
9520
+ latitude = (decoded.latitudeLo + decoded.latitudeHi) / 2
9521
+ longitude = (decoded.longitudeLo + decoded.longitudeHi) / 2
9522
+
9523
+ if return_bbox:
9524
+ bbox = (
9525
+ decoded.latitudeLo,
9526
+ decoded.latitudeHi,
9527
+ decoded.longitudeLo,
9528
+ decoded.longitudeHi,
9529
+ )
9530
+ return latitude, longitude, bbox
9531
+
9532
+ return latitude, longitude
9533
+
9534
+ def get_loc(input_data, user_agent="0413@mygmail.com)",verbose=True):
9535
+ """
9536
+ Determine if the input is a city name, lat/lon, or DMS and perform geocoding or reverse geocoding.
9537
+ Usage:
9538
+ get_loc("Berlin, Germany") # Example city
9539
+ # get_loc((48.8566, 2.3522)) # Example latitude and longitude
9540
+ # get_loc("48 51 24.3 N") # Example DMS input
9541
+ """
9542
+ from geopy.geocoders import Nominatim
9543
+ import re
9544
+
9545
+ def dms_to_decimal(dms):
9546
+ """
9547
+ Convert DMS (Degrees, Minutes, Seconds) to Decimal format.
9548
+ Input should be in the format of "DD MM SS" or "D M S".
9549
+ """
9550
+ # Regex pattern for DMS input
9551
+ pattern = r"(\d{1,3})[^\d]*?(\d{1,2})[^\d]*?(\d{1,2})"
9552
+ match = re.match(pattern, dms)
9553
+
9554
+ if match:
9555
+ degrees, minutes, seconds = map(float, match.groups())
9556
+ decimal = degrees + (minutes / 60) + (seconds / 3600)
9557
+ return decimal
9558
+ else:
9559
+ raise ValueError("Invalid DMS format")
9560
+
9561
+ geolocator = Nominatim(user_agent="0413@mygmail.com)")
9562
+ # Case 1: Input is a city name (string)
9563
+ if isinstance(input_data, str) and not re.match(r"^\d+(\.\d+)?$", input_data):
9564
+ location = geolocator.geocode(input_data)
9565
+ if verbose:
9566
+ print(
9567
+ f"Latitude and Longitude for {input_data}: {location.latitude}, {location.longitude}"
9568
+ )
9569
+ else:
9570
+ print(f"Could not find {input_data}.")
9571
+ return location
9572
+
9573
+ # Case 2: Input is latitude and longitude (float or tuple)
9574
+ elif isinstance(input_data, (float, tuple)):
9575
+ if isinstance(input_data, tuple) and len(input_data) == 2:
9576
+ latitude, longitude = input_data
9577
+ elif isinstance(input_data, float):
9578
+ latitude = input_data
9579
+ longitude = None # No longitude provided for a single float
9580
+
9581
+ # Reverse geocoding
9582
+ location_reversed = geolocator.reverse(
9583
+ (latitude, longitude) if longitude else latitude
9584
+ )
9585
+ if verbose:
9586
+ print(
9587
+ f"Address from coordinates ({latitude}, {longitude if longitude else ''}): {location_reversed.address}"
9588
+ )
9589
+ else:
9590
+ print("Could not reverse geocode the coordinates.")
9591
+ return location_reversed
9592
+
9593
+ # Case 3: Input is a DMS string
9594
+ elif isinstance(input_data, str):
9595
+ try:
9596
+ decimal_lat = dms_to_decimal(input_data)
9597
+ print(f"Converted DMS to decimal latitude: {decimal_lat}")
9598
+
9599
+ location_reversed = geolocator.reverse(decimal_lat)
9600
+ if verbose:
9601
+ print(f"Address from coordinates: {location_reversed.address}")
9602
+ else:
9603
+ print("Could not reverse geocode the coordinates.")
9604
+ return location_reversed
9605
+ except ValueError:
9606
+ print(
9607
+ "Invalid input format. Please provide a city name, latitude/longitude, or DMS string."
9608
+ )
9609
+
9610
+ def enpass(code: str, method: str="AES", key: str = None):
9611
+ """
9612
+ usage: enpass("admin")
9613
+ Master encryption function that supports multiple methods: AES, RSA, and SHA256.
9614
+ :param code: The input data to encrypt or hash.
9615
+ :param method: The encryption or hashing method ('AES', 'RSA', or 'SHA256').
9616
+ :param key: For AES, a password (hashed with SHA-256 to derive the cipher key); for RSA, the public key in PEM format. Defaults to "worldpeace" when None; unused for SHA256.
9617
+ :return: The encrypted data or hashed value.
9618
+ """
9619
+ import hashlib
9620
+ # AES Encryption (Advanced)
9621
+ def aes_encrypt(data: str, key: str):
9622
+ """
9623
+ Encrypts data using AES algorithm in CBC mode.
9624
+ :param data: The data to encrypt.
9625
+ :param key: The key to use for AES encryption.
9626
+ :return: The encrypted data, base64 encoded.
9627
+ """
9628
+ from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
9629
+ from cryptography.hazmat.backends import default_backend
9630
+ from cryptography.hazmat.primitives import padding
9631
+ import base64
9632
+ import os
9633
+ # Generate a 256-bit key from the provided password
9634
+ key = hashlib.sha256(key.encode()).digest()
9635
+
9636
+ # Generate a random initialization vector (IV)
9637
+ iv = os.urandom(16) # 16 bytes for AES block size
9638
+
9639
+ # Pad the data to be a multiple of 16 bytes using PKCS7
9640
+ padder = padding.PKCS7(128).padder()
9641
+ padded_data = padder.update(data.encode()) + padder.finalize()
9642
+
9643
+ # Create AES cipher object using CBC mode
9644
+ cipher = Cipher(algorithms.AES(key), modes.CBC(iv), backend=default_backend())
9645
+ encryptor = cipher.encryptor()
9646
+ encrypted_data = encryptor.update(padded_data) + encryptor.finalize()
9647
+
9648
+ # Return the base64 encoded result (IV + encrypted data)
9649
+ return base64.b64encode(iv + encrypted_data).decode()
9650
+
9651
+ # RSA Encryption (Advanced)
9652
+ def rsa_encrypt(data: str, public_key: str):
9653
+ """
9654
+ Encrypts data using RSA encryption with OAEP padding.
9655
+ :param data: The data to encrypt.
9656
+ :param public_key: The public key in PEM format.
9657
+ :return: The encrypted data, base64 encoded.
9658
+ """
9659
+ import base64
9660
+ from Crypto.PublicKey import RSA
9661
+ from Crypto.Cipher import PKCS1_OAEP
9662
+ public_key_obj = RSA.import_key(public_key)
9663
+ cipher_rsa = PKCS1_OAEP.new(public_key_obj)
9664
+ encrypted_data = cipher_rsa.encrypt(data.encode())
9665
+ return base64.b64encode(encrypted_data).decode()
9666
+ # SHA256 Hashing (Non-reversible)
9667
+ def sha256_hash(data: str):
9668
+ """
9669
+ Generates a SHA256 hash of the data.
9670
+ :param data: The data to hash.
9671
+ :return: The hashed value (hex string).
9672
+ """
9673
+ return hashlib.sha256(data.encode()).hexdigest()
9674
+ if key is None:
9675
+ key="worldpeace"
9676
+ method=strcmp(method,["AES","RSA",'SHA256'])[0]
9677
+ if method == "AES":
9678
+ return aes_encrypt(code, key)
9679
+ elif method == "RSA":
9680
+ return rsa_encrypt(code, key)
9681
+ elif method == "SHA256":
9682
+ return sha256_hash(code)
9683
+ else:
9684
+ raise ValueError("Unsupported encryption method")
9685
+
9686
+
9687
+ # Master Decryption Function (Supports AES, RSA)
9688
+ def depass(encrypted_code: str, method: str='AES', key: str = None):
9689
+ """
9690
+ Master decryption function that supports multiple methods: AES and RSA.
9691
+ :param encrypted_code: The encrypted data to decrypt.
9692
+ :param method: The encryption method ('AES' or 'RSA').
9693
+ :param key: For AES, the password that was used for encryption; for RSA, the private key in PEM format. Defaults to "worldpeace" when None.
9694
+ :return: The decrypted data.
9695
+ """
9696
+ import hashlib
9697
+ def aes_decrypt(encrypted_data: str, key: str):
9698
+ """
9699
+ Decrypts data encrypted using AES in CBC mode.
9700
+ :param encrypted_data: The encrypted data, base64 encoded.
9701
+ :param key: The key to use for AES decryption.
9702
+ :return: The decrypted data (string).
9703
+ """
9704
+ from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
9705
+ from cryptography.hazmat.backends import default_backend
9706
+ from cryptography.hazmat.primitives import padding
9707
+ import base64
9708
+ # Generate the same 256-bit key from the password
9709
+ key = hashlib.sha256(key.encode()).digest()
9710
+
9711
+ # Decode the encrypted data from base64
9712
+ encrypted_data = base64.b64decode(encrypted_data)
9713
+
9714
+ # Extract the IV and the actual encrypted data
9715
+ iv = encrypted_data[:16] # First 16 bytes are the IV
9716
+ encrypted_data = encrypted_data[16:] # Remaining data is the encrypted message
9717
+
9718
+ # Create AES cipher object using CBC mode
9719
+ cipher = Cipher(algorithms.AES(key), modes.CBC(iv), backend=default_backend())
9720
+ decryptor = cipher.decryptor()
9721
+ decrypted_data = decryptor.update(encrypted_data) + decryptor.finalize()
9722
+
9723
+ # Unpad the decrypted data using PKCS7
9724
+ unpadder = padding.PKCS7(128).unpadder()
9725
+ unpadded_data = unpadder.update(decrypted_data) + unpadder.finalize()
9726
+
9727
+ return unpadded_data.decode()
9728
+ def rsa_decrypt(encrypted_data: str, private_key: str):
9729
+ """
9730
+ Decrypts RSA-encrypted data using the private key.
9731
+ :param encrypted_data: The encrypted data, base64 encoded.
9732
+ :param private_key: The private key in PEM format.
9733
+ :return: The decrypted data (string).
9734
+ """
9735
+ from Crypto.PublicKey import RSA
9736
+ from Crypto.Cipher import PKCS1_OAEP
9737
+ import base64
9738
+ encrypted_data = base64.b64decode(encrypted_data)
9739
+ private_key_obj = RSA.import_key(private_key)
9740
+ cipher_rsa = PKCS1_OAEP.new(private_key_obj)
9741
+ decrypted_data = cipher_rsa.decrypt(encrypted_data)
9742
+ return decrypted_data.decode()
9743
+
9744
+ if key is None:
9745
+ key="worldpeace"
9746
+ method=strcmp(method,["AES","RSA",'SHA256'])[0]
9747
+ if method == "AES":
9748
+ return aes_decrypt(encrypted_code, key)
9749
+ elif method == "RSA":
9750
+ return rsa_decrypt(encrypted_code, key)
9751
+ elif method == "SHA256":
9752
+ raise ValueError("SHA256 is a hash function and cannot be decrypted.")
9753
+ else:
9754
+ raise ValueError("Unsupported decryption method")