py2ls 0.2.4.1__py3-none-any.whl → 0.2.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -51,8 +51,6 @@ from bs4 import BeautifulSoup
 
 from . import netfinder
 
- # from .plot import get_color
-
 try:
 get_ipython().run_line_magic("load_ext", "autoreload")
 get_ipython().run_line_magic("autoreload", "2")
@@ -61,19 +59,31 @@ except NameError:
 
 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
 """
- Add the Chinese font to the font manager
+ Add the Chinese (default) font to the font manager
 Args:
 dir_font (str, optional): _description_. Defaults to "/System/Library/Fonts/Hiragino Sans GB.ttc".
 """
 import matplotlib.pyplot as plt
 from matplotlib import font_manager
+ slashtype = "/" if 'mac' in get_os() else "\\"
+ if slashtype in dir_font:
+ font_manager.fontManager.addfont(dir_font)
+ fontname = os.path.basename(dir_font).split(".")[0]
+ else:
+ if "cn" in dir_font.lower() or "ch" in dir_font.lower():
+ fontname = "Hiragino Sans GB" # default Chinese font
+ else:
+ fontname = dir_font
 
- font_manager.fontManager.addfont(dir_font)
- fontname_chinese = os.path.basename(dir_font).split(".")[0]
-
- plt.rcParams["font.sans-serif"] = [fontname_chinese]
- plt.rcParams["font.family"] = "sans-serif"
+ plt.rcParams["font.sans-serif"] = [fontname]
+ # plt.rcParams["font.family"] = "sans-serif"
 plt.rcParams["axes.unicode_minus"] = False
+ fonts_in_system = font_manager.findSystemFonts(fontpaths=None, fontext="ttf")
+ fontname_in_system = [os.path.basename(i).split(".")[0] for i in fonts_in_system]
+ if fontname not in fontname_in_system:
+ print(f"Font '{fontname}' not found. Falling back to default.")
+ plt.rcParams["font.sans-serif"] = ["Arial"]
+ return fontname
 
 # set 'dir_save'
 if "dar" in sys.platform:
@@ -506,6 +516,59 @@ def is_text(s):
 return has_alpha and has_non_alpha
 
 
+ from typing import Any, Union
+
+ def shared(lst1:Any, lst2:Any,*args, verbose=True):
+ """
+ check the shared elelements in two list.
+ usage:
+ list1 = [1, 2, 3, 4, 5]
+ list2 = [4, 5, 6, 7, 8]
+ list3 = [5, 6, 9, 10]
+ a = shared(list1, list2,list3)
+ """
+ if verbose:
+ print("\n********* checking shared elements *********")
+ if any([not isinstance(lst1,list),not isinstance(lst1,list)]):
+ print(f"{' '*2}type(list1):\t{type(lst1)},\n{' '*2}type(list2):\t{type(lst2)}>")
+ shared_elements=set(flatten(lst1,verbose=verbose)).intersection(flatten(lst2,verbose=verbose))
+ # support more lists
+ if args:
+ for arg in args:
+ shared_elements=shared_elements.intersection(set(flatten(arg,verbose=verbose)))
+ shared_elements = list(shared_elements)
+ if verbose:
+ elements2show = shared_elements if len(shared_elements)<10 else shared_elements[:5]
+ print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
+ print("********* checking shared elements *********")
+ return shared_elements
+
+ def flatten(nested: Any, unique_list=True,verbose=True):
+ """
+ Recursively flattens a nested structure (lists, tuples, dictionaries, sets) into a single list.
+ Parameters:
+ nested : Any, Can be a list, tuple, dictionary, or set.
+ Returns: list, A flattened list.
+ """
+ flattened_list = []
+ stack = [nested]
+ while stack:
+ current = stack.pop()
+ if isinstance(current, dict):
+ stack.extend(current.values())
+ elif isinstance(current, (list, tuple, set)):
+ stack.extend(current)
+ elif isinstance(current, pd.Series):
+ stack.extend(current)
+ else:
+ flattened_list.append(current)
+ if verbose:
+ print(f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>")
+ if unique_list:
+ return unique(flattened_list)
+ else:
+ return flattened_list
+
 def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"):
 """
 Compares a search term with a list of candidate strings and finds the best match based on similarity score.
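A short usage sketch of the shared()/flatten() helpers added above (illustrative only; it assumes they are exposed from py2ls.ips and that the companion unique() helper deduplicates the flattened list):

    from py2ls.ips import flatten, shared

    list1 = [1, 2, 3, 4, 5]
    list2 = [4, 5, 6, 7, 8]
    list3 = [5, 6, 9, 10]

    # Elements present in all three lists (the docstring's own example).
    common = shared(list1, list2, list3)  # -> [5]

    # flatten() walks lists/tuples/sets/dicts/Series with a stack and returns a flat list.
    flat = flatten([[1, 2], (3, 4), {"k": 5}], verbose=False)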
@@ -526,6 +589,7 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
 if isinstance(s, str):
 return s.lower()
 elif isinstance(s, list):
+ s=[str(i) for i in s]# convert all to str
 return [elem.lower() for elem in s]
 return s
 
@@ -535,7 +599,7 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
 similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
 elif "W" in scorer.lower():
 similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
- elif "ratio" in scorer.lower():#Ratio (Strictest)
+ elif "ratio" in scorer.lower() or "stri" in scorer.lower():#Ratio (Strictest)
 similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
 else:
 similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
@@ -1697,26 +1761,18 @@ def fload(fpath, kind=None, **kwargs):
 def load_csv(fpath, **kwargs):
 from pandas.errors import EmptyDataError
 
- engine = kwargs.get("engine", "pyarrow")
- kwargs.pop("engine", None)
- sep = kwargs.get("sep", "\t")
- kwargs.pop("sep", None)
- index_col = kwargs.get("index_col", None)
- kwargs.pop("index_col", None)
- memory_map = kwargs.get("memory_map", True)
- kwargs.pop("memory_map", None)
- skipinitialspace = kwargs.get("skipinitialspace", True)
- kwargs.pop("skipinitialspace", None)
- encoding = kwargs.get("encoding", "utf-8")
- kwargs.pop("encoding", None)
- on_bad_lines = kwargs.get("on_bad_lines", "skip")
- kwargs.pop("on_bad_lines", None)
- comment = kwargs.get("comment", None)
- kwargs.pop("comment", None)
-
+ engine = kwargs.pop("engine", "pyarrow")
+ sep = kwargs.pop("sep", "\t")
+ index_col = kwargs.pop("index_col", None)
+ memory_map = kwargs.pop("memory_map", False)
+ skipinitialspace = kwargs.pop("skipinitialspace", False)
+ encoding = kwargs.pop("encoding", "utf-8")
+ on_bad_lines = kwargs.pop("on_bad_lines", "skip")
+ comment = kwargs.pop("comment", None)
 fmt=kwargs.pop("fmt",False)
+ verbose=kwargs.pop("verbose",False)
 if verbose:
- print_pd_usage("read_csv", verbose=verbose)
+ use_pd("read_csv", verbose=verbose)
 return
 
 if comment is None:
@@ -1800,7 +1856,7 @@ def fload(fpath, kind=None, **kwargs):
 separators = [",", "\t", ";", "|", " "]
 for sep in separators:
 sep2show = sep if sep != "\t" else "\\t"
- # print(f'trying with: engine=pyarrow, sep="{sep2show}"')
+ print(f'trying with: engine=pyarrow, sep="{sep2show}"')
 try:
 df = pd.read_csv(
 fpath,
@@ -1819,13 +1875,13 @@ def fload(fpath, kind=None, **kwargs):
 except:
 pass
 else:
- engines = ["c", "python"]
+ engines = [None,"c", "python"]
 for engine in engines:
- # separators = [",", "\t", ";", "|", " "]
+ separators = [",", "\t", ";", "|", " "]
 for sep in separators:
 try:
 sep2show = sep if sep != "\t" else "\\t"
- # print(f"trying with: engine={engine}, sep='{sep2show}'")
+ print(f"trying with: engine={engine}, sep='{sep2show}'")
 df = pd.read_csv(
 fpath,
 engine=engine,
@@ -1848,7 +1904,7 @@ def fload(fpath, kind=None, **kwargs):
 engine = kwargs.get("engine", "openpyxl")
 verbose=kwargs.pop("verbose",False)
 if verbose:
- print_pd_usage("read_excel", verbose=verbose)
+ use_pd("read_excel", verbose=verbose)
 df = pd.read_excel(fpath, engine=engine, **kwargs)
 try:
 meata=pd.ExcelFile(fpath)
@@ -2031,6 +2087,9 @@ def fload(fpath, kind=None, **kwargs):
 elif kind.lower() in img_types:
 print(f'Image ".{kind}" is loaded.')
 return load_img(fpath)
+ elif kind=="gz" and fpath.endswith(".soft.gz"):
+ import GEOparse
+ return GEOparse.get_GEO(filepath=fpath)
 elif kind.lower() in zip_types:
 keep = kwargs.get("keep", False)
 fpath_unzip = unzip(fpath)
@@ -2105,30 +2164,51 @@ def fload(fpath, kind=None, **kwargs):
 # docx_content = fload('sample.docx')
 
 
- def fupdate(fpath, content=None):
+ def fupdate(fpath, content=None, how="head"):
 """
 Update a file by adding new content at the top and moving the old content to the bottom.
+ If the file is a JSON file, merge the new content with the old content.
+
 Parameters
 ----------
 fpath : str
 The file path where the content should be updated.
- content : str, optional
- The new content to add at the top of the file. If not provided, the function will not add any new content.
+ content : str or dict, optional
+ The new content to add at the top of the file (for text) or merge (for JSON).
+ If not provided, the function will not add any new content.
+
 Notes
 -----
 - If the file at `fpath` does not exist, it will be created.
- - The new content will be added at the top, followed by the old content of the file.
+ - For text files, the new content will be added at the top, followed by the old content.
+ - For JSON files, the new content will be merged with the existing JSON content.
 """
 content = content or ""
- if os.path.exists(fpath):
- with open(fpath, "r") as file:
- old_content = file.read()
+ file_ext = os.path.splitext(fpath)[1]
+ how_s=["head", "tail","start","end","beginning", "stop",'last',"before"]
+ how = strcmp(how, how_s)[0]
+ print(how)
+ add_where = 'head' if how in ["head", "start","beginning", "before"] else "tail"
+ if "json" in file_ext.lower():
+ old_content=fload(fpath,kind='json') if os.path.exists(fpath) else {}
+ updated_content = {**content,**old_content} if add_where=="head" else {**old_content, **content} if isinstance(content, dict) else old_content
+ fsave(fpath,updated_content)
 else:
- old_content = ""
+ # Handle text file
+ if os.path.exists(fpath):
+ with open(fpath, "r") as file:
+ old_content = file.read()
+ else:
+ old_content = ""
 
- with open(fpath, "w") as file:
- file.write(content)
- file.write(old_content)
+ # Write new content at the top followed by old content
+ with open(fpath, "w") as file:
+ if add_where=="head":
+ file.write(content + "\n")
+ file.write(old_content)
+ else:
+ file.write(old_content)
+ file.write(content + "\n")
 
 
 def fappend(fpath, content=None):
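A rough usage sketch of the reworked fupdate() (illustrative, with made-up file names; it assumes the function is importable from py2ls.ips):

    from py2ls.ips import fupdate

    # Text files: prepend by default ("head"), or append with how="tail".
    fupdate("notes.txt", "newest entry")
    fupdate("notes.txt", "oldest entry", how="tail")

    # JSON files: the new dict is merged with the existing content and saved back.
    fupdate("settings.json", {"version": "0.2.4.3"})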
@@ -2234,7 +2314,7 @@ def fsave(
 
 verbose=kwargs.pop("verbose",False)
 if verbose:
- print_pd_usage("to_csv", verbose=verbose)
+ use_pd("to_csv", verbose=verbose)
 kwargs_csv = dict(
 path_or_buf=None,
 sep=",",
@@ -2266,7 +2346,7 @@ def fsave(
 verbose=kwargs.pop("verbose",False)
 sheet_name = kwargs.pop("sheet_name", "Sheet1")
 if verbose:
- print_pd_usage("to_excel", verbose=verbose)
+ use_pd("to_excel", verbose=verbose)
 if any(kwargs):
 format_excel(df=data, filename=fpath, **kwargs)
 else:
@@ -2710,12 +2790,14 @@ def mkdir_nest(fpath: str) -> str:
 Returns:
 - str: The path of the created directory.
 """
- # Check if the directory already exists
- if os.path.isdir(fpath):
- return fpath
+
 
 # Split the full path into directories
 f_slash = "/" if "mac" in get_os().lower() else "\\"
+ if os.path.isdir(fpath):
+ fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
+ print(fpath)
+ return fpath
 dir_parts = fpath.split(f_slash) # Split the path by the OS-specific separator
 
 # Start creating directories from the root to the desired path
@@ -2744,34 +2826,27 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
 - str: The path of the created directory or an error message.
 """
 
- rootdir = []
- # Convert string to list
+ rootdir = []
 if chdir is None:
 return mkdir_nest(pardir)
 if isinstance(chdir, str):
- chdir = [chdir]
- # Subfoldername should be unique
+ chdir = [chdir]
 chdir = list(set(chdir))
 if isinstance(pardir, str): # Dir_parents should be 'str' type
- pardir = os.path.normpath(pardir)
- # Get the slash type: "/" or "\"
- stype = "/" if "/" in pardir else "\\"
+ pardir = os.path.normpath(pardir)
 if "mac" in get_os().lower() or "lin" in get_os().lower():
 stype = "/"
 elif "win" in get_os().lower():
 stype = "\\"
 else:
 stype = "/"
-
- # Check if the parent directory exists and is a directory path
+
 if os.path.isdir(pardir):
 os.chdir(pardir) # Set current path
 # Check if subdirectories are not empty
 if chdir:
- chdir.sort()
- # Create multiple subdirectories at once
- for folder in chdir:
- # Check if the subfolder already exists
+ chdir.sort()
+ for folder in chdir:
 child_tmp = os.path.join(pardir, folder)
 if not os.path.isdir(child_tmp):
 os.mkdir("./" + folder)
@@ -2791,6 +2866,8 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
 # Dir is the main output, if only one dir, then str type is inconvenient
 if len(rootdir) == 1:
 rootdir = rootdir[0]
+ rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
+ print(rootdir)
 return rootdir
 
 
@@ -2805,22 +2882,25 @@ def figsave(*args, dpi=300):
 dir_save = None
 fname = None
 img = None
+ f_slash = "/" if "mac" in get_os().lower() else "\\"
 for arg in args:
 if isinstance(arg, str):
- if "/" in arg or "\\" in arg:
+ if f_slash in arg:
 dir_save = arg
- elif "/" not in arg and "\\" not in arg:
+ else:
 fname = arg
 elif isinstance(arg, (Image.Image, np.ndarray)):
 img = arg # Store the PIL image if provided
 
- f_slash = "/" if "mac" in get_os().lower() else "\\"
 if dir_save is None:
 dir_save="./"
+ print(dir_save)
+ # dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
 dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
 dir_ch = "".join(dir_save.split(f_slash)[-1:])
 if not dir_par.endswith(f_slash):
 dir_par += f_slash
+ print(dir_par)
 if fname is None:
 fname = dir_ch
 mkdir(dir_par)
@@ -4415,12 +4495,48 @@ def preview(var):
 # preview("# This is a Markdown header")
 # preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
 # preview({"key": "value", "numbers": [1, 2, 3]})
-
+ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
+ """
+ Extend a DataFrame by the list elecments in the column.
+
+ Parameters:
+ ----------
+ data : pd.DataFrame
+ The input DataFrame to be extended.
+
+ column : str
+ The name of the column to be split.
+
+ axis : int, optional
+ The axis along which to expand the DataFrame.
+ - 0 (default): Expand the specified column into multiple rows.
+ - 1: Expand the specified column into multiple columns.
+
+ sep : str, optional
+ The separator used to split the values in the specified column.
+ Must be provided for the function to work correctly.
+ """
+
+ data = data.copy()
+ mask = data[column].str.contains(sep, na=False)
+ data = data.copy()
+ if mask.any():
+ data[column] = (
+ data[column]
+ .apply(lambda x: x.split(sep) if isinstance(x, str) else x) # Only split if x is a string
+ )
+
+ # Strip spaces from each item in the lists
+ data[column] = data[column].apply(lambda x: [item.strip() for item in x] if isinstance(x, list) else x)
+
+ data = data.explode(column, ignore_index=True)
+ return data
 # ! DataFrame
 def df_astype(
- df: pd.DataFrame,
+ data: pd.DataFrame,
 columns: Optional[Union[str, List[str]]] = None,
 astype: str = "datetime",
+ skip_row:Union[str,list]=None,
 fmt: Optional[str] = None,
 inplace: bool = True,
 errors: str = "coerce", # Can be "ignore", "raise", or "coerce"
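For reference, a small sketch of how the new df_extend() is meant to be used (invented data; it assumes df_extend is importable from py2ls.ips):

    import pandas as pd
    from py2ls.ips import df_extend

    df = pd.DataFrame({"genes": ["TP53; EGFR; MYC", "BRCA1"], "score": [1, 2]})

    # Split the "genes" column on ";" and explode each entry onto its own row.
    df_long = df_extend(df, column="genes", sep=";")
    print(df_long)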
@@ -4484,22 +4600,24 @@ def df_astype(
 ]
 # If inplace is False, make a copy of the DataFrame
 if not inplace:
- df = df.copy()
+ data = data.copy()
+ if skip_row is not None:
+ data = data.drop(index=skip_row, errors='ignore')
+ # If columns is None, apply to all columns
+ if columns is None:
+ columns = data.columns.tolist()
 # correct the astype input
 if isinstance(astype,str):
 astype = strcmp(astype, astypes)[0]
- print(f"converting {columns} as type: {astype}")
+ print(f"converting as type: {astype}")
 elif isinstance(astype,dict):
 for col, dtype in astype.items():
 dtype='date' if dtype=="day" else dtype
- df["col"]=df["col"].adtype(strcmp(dtype, astypes)[0])
- return
- # If columns is None, apply to all columns
- if columns is None:
- columns = df.columns
+ data["col"]=data["col"].adtype(strcmp(dtype, astypes)[0])
+ return data if not inplace else None
 
 # Ensure columns is a list
- if isinstance(columns, (str, int)):
+ if isinstance(columns, str):
 columns = [columns]
 
 # Convert specified columns
@@ -4519,72 +4637,74 @@ def df_astype(
 kwargs.pop("errors", None)
 # convert it as type: datetime
 if isinstance(column, int):
- df.iloc[:, column] = pd.to_datetime(
- df.iloc[:, column], format=fmt, errors=errors, **kwargs
+ data.iloc[:, column] = pd.to_datetime(
+ data.iloc[:, column], format=fmt, errors=errors, **kwargs
 )
 # further convert:
 if astype == "time":
- df.iloc[:, column] = df.iloc[:, column].dt.time
+ data.iloc[:, column] = data.iloc[:, column].dt.time
 elif astype == "month":
- df.iloc[:, column] = df.iloc[:, column].dt.month
+ data.iloc[:, column] = data.iloc[:, column].dt.month
 elif astype == "year":
- df.iloc[:, column] = df.iloc[:, column].dt.year
+ data.iloc[:, column] = data.iloc[:, column].dt.year
 elif astype == "date" or astype == "day":
- df.iloc[:, column] = df.iloc[:, column].dt.date
+ data.iloc[:, column] = data.iloc[:, column].dt.date
 elif astype == "hour":
- df.iloc[:, column] = df.iloc[:, column].dt.hour
+ data.iloc[:, column] = data.iloc[:, column].dt.hour
 elif astype == "minute":
- df.iloc[:, column] = df.iloc[:, column].dt.minute
+ data.iloc[:, column] = data.iloc[:, column].dt.minute
 elif astype == "second":
- df.iloc[:, column] = df.iloc[:, column].dt.second
+ data.iloc[:, column] = data.iloc[:, column].dt.second
 elif astype == "week":
- df.iloc[:, column] = df.iloc[:, column].dt.day_name()
+ data.iloc[:, column] = data.iloc[:, column].dt.day_name()
 else:
- df[column] = (
+ data[column] = (
 pd.to_datetime(
- df[column], format=fmt, errors=errors, **kwargs
+ data[column], format=fmt, errors=errors, **kwargs
 )
 if fmt
- else pd.to_datetime(df[column], errors=errors, **kwargs)
+ else pd.to_datetime(data[column], errors=errors, **kwargs)
 )
 # further convert:
 if astype == "time":
- df[column] = df[column].dt.time
+ data[column] = data[column].dt.time
 elif astype == "month":
- df[column] = df[column].dt.month
+ data[column] = data[column].dt.month
 elif astype == "year":
- df[column] = df[column].dt.year
+ data[column] = data[column].dt.year
 elif astype == "date":
- df[column] = df[column].dt.date
+ data[column] = data[column].dt.date
 elif astype == "hour":
- df[column] = df[column].dt.hour
+ data[column] = data[column].dt.hour
 elif astype == "minute":
- df[column] = df[column].dt.minute
+ data[column] = data[column].dt.minute
 elif astype == "second":
- df[column] = df[column].dt.second
+ data[column] = data[column].dt.second
 elif astype == "week":
- df[column] = df[column].dt.day_name()
+ data[column] = data[column].dt.day_name()
 
 elif astype == "numeric":
 kwargs.pop("errors", None)
- df[column] = pd.to_numeric(df[column], errors=errors, **kwargs)
+ data[column] = pd.to_numeric(data[column], errors=errors, **kwargs)
 # print(f"Successfully converted '{column}' to numeric.")
 elif astype == "timedelta":
 kwargs.pop("errors", None)
- df[column] = pd.to_timedelta(df[column], errors=errors, **kwargs)
+ data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
 # print(f"Successfully converted '{column}' to timedelta.")
 else:
 # Convert to other types (e.g., float, int)
- df[column] = df[column].astype(astype)
+ data[column] = data[column].astype(astype)
 # print(f"Successfully converted '{column}' to {astype}.")
 except Exception as e:
 print(f"Error converting '{column}' to {astype}: {e}")
-
- # Return the modified DataFrame if inplace is False
- return df
+ try:
+ display(data.info()[:10])
+ except:
+ pass
+ return data
 
 
- # ! DataFrame
+ # ! DataFrame
 def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 """
 Sort a DataFrame by a specified column based on a custom order or by count.
@@ -4601,7 +4721,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 Returns:
 - Sorted DataFrame if inplace is False, otherwise None.
 """
- if column not in df.columns:
+ if column not in data.columns:
 raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
 
 if isinstance(by, str) and 'count' in by.lower():
@@ -4624,11 +4744,11 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 
 try:
 if inplace: # replace the original
- df.sort_values(column, ascending=ascending, inplace=True, **kwargs)
+ data.sort_values(column, ascending=ascending, inplace=True, **kwargs)
 print(f"Successfully sorted DataFrame by '{column}'")
 return None
 else:
- sorted_df = df.sort_values(column, ascending=ascending, **kwargs)
+ sorted_df = data.sort_values(column, ascending=ascending, **kwargs)
 print(f"Successfully sorted DataFrame by '{column}' using custom order.")
 return sorted_df
 except Exception as e:
@@ -4636,7 +4756,6 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 return df
 
 
-
 # # Example usage:
 # # Sample DataFrame
 # data = {
@@ -4667,6 +4786,236 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 # display(df_month)
 
 
+ def df_merge(
+ df1: pd.DataFrame,
+ df2: pd.DataFrame,
+ use_index: bool = True,
+ columns: list = ["col_left", "col_right"],
+ how: str = "left",
+ ) -> pd.DataFrame:
+ """
+ Merges two DataFrames based on either the index or shared columns with matching data types.
+ usage:
+ #(1) if the index are the same
+ df_merged = df_merge(df1, df2, use_index=True(defalut), how='outer')
+ #(2) if there are shaed columns, then based on shared columns
+ df_merged = df_merge(df1, df2, how='outer')
+ #(3) if columns: then based on the specific columns
+ df_merged = df_merge(df1, df2, columns=["col_left", "col_right"],how='outer')
+ Parameters:
+ - df1 (pd.DataFrame): The first DataFrame.
+ - df2 (pd.DataFrame): The second DataFrame.
+ - use_index (bool): If True, first try to merge by index if they are comparable; otherwise, fall back to column-based merge.
+ - how (str): Type of merge to perform: 'inner', 'outer', 'left', or 'right'. Default is 'inner'.
+ 'inner': only the rows that have matching values in both DataFrames (intersection)
+ 'outer': keeps all rows from both DataFrames and fills in missing values with NaN
+ 'left': keeps all rows from the left DataFrame and matches rows from the right DataFrame
+ 'right': keeps all rows from the right DataFrame and matches rows from the left DataFrame, filling with NaN if there is no match.
+
+ Returns:
+ - pd.DataFrame: The merged DataFrame.
+ """
+
+ # 1. Check if indices are comparable (same length and types)
+ if use_index:
+ print(f"Merging based on index using '{how}' join...")
+ df_merged = pd.merge(df1, df2, left_index=True, right_index=True, how=how)
+ return df_merged
+
+ # 2. Find common columns with the same dtype
+ common_columns = df1.columns.intersection(df2.columns)
+ shared_columns = []
+ for col in common_columns:
+ if df1[col].dtype == df2[col].dtype:
+ shared_columns.append(col)
+ if not isinstance(columns, list):
+ columns = [columns]
+ if len(columns) != 2:
+ raise ValueError(
+ "'columns':list shoule be a list: columns=['col_left','col_right']"
+ )
+ if all(columns):
+ print(f"Merging based on columns: {columns} using '{how}' join...")
+ df_merged = pd.merge(df1, df2, left_on=columns[0], right_on=columns[1], how=how)
+ elif shared_columns:
+ print(
+ f"Merging based on shared columns: {shared_columns} using '{how}' join..."
+ )
+ df_merged = pd.merge(df1, df2, on=shared_columns, how=how)
+ else:
+ raise ValueError(
+ "No common columns with matching data types to merge on, and indices are not comparable."
+ )
+ return df_merged
+
+ def df_fillna(
+ data: pd.DataFrame,
+ method: str = "mean",
+ axis: int = 0,# column-wise
+ constant: float = None,
+ inplace: bool = True,
+ ) -> pd.DataFrame:
+ """
+ Fill missing values in a DataFrame using specified imputation method.
+
+ Parameters:
+ data (pd.DataFrame): The DataFrame to fill missing values.
+ method (str): The imputation method to use. Options are:
+ - 'mean': Replace missing values with the mean of the column.
+ - 'median': Replace missing values with the median of the column.
+ - 'most_frequent': Replace missing values with the most frequent value in the column.
+ - 'constant': Replace missing values with a constant value provided by the `constant` parameter.
+ - 'knn': Use K-Nearest Neighbors imputation.
+ - 'iterative': Use Iterative imputation.
+ axis (int): The axis along which to impute:
+ - 0: Impute column-wise (default).
+ - 1: Impute row-wise.
+ constant (float, optional): Constant value to use for filling NaNs if method is 'constant'.
+ inplace (bool): If True, modify the original DataFrame. If False, return a new DataFrame.
+
+ """
+
+ if data.empty:
+ raise ValueError("Input DataFrame is empty.")
+
+ # Validate method
+ methods = ["mean", "median", "most_frequent", "constant", "knn", "iterative"]
+ method = strcmp(method, methods)[0]
+
+ # If using constant method, ask for a constant value
+ if constant is not None:
+ method = "constant"
+ try:
+ constant = float(constant)
+ except ValueError:
+ raise ValueError("Constant value must be a number.")
+
+ # Initialize SimpleImputer with the chosen method
+ if method == "constant":
+ imputer = SimpleImputer(strategy=method, fill_value=constant)
+ elif method == "knn":
+ from sklearn.impute import KNNImputer
+
+ imputer = KNNImputer(n_neighbors=n_neighbors)
+ elif method == "iterative":
+ from sklearn.impute import IterativeImputer
+
+ imputer = IterativeImputer(max_iter=max_iter)
+ else:
+ from sklearn.impute import SimpleImputer
+
+ imputer = SimpleImputer(strategy=method)
+
+ # Fit and transform the data
+ if axis == 0:
+ # Impute column-wise
+ imputed_data = imputer.fit_transform(data)
+ imputed_data.shape
+ elif axis == 1:
+ # Impute row-wise
+ imputed_data = imputer.fit_transform(data.T)
+ imputed_data.shape
+ else:
+ raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
+
+ df_filled = pd.DataFrame(
+ imputed_data if axis == 0 else imputed_data.T,
+ index=data.index,# if axis == 0 else data.columns,
+ columns=data.columns,# if axis == 0 else data.index,
+ )
+
+ if inplace:
+ data.update(df_filled)
+ return None # replace original
+ else:
+ return df_filled
+ def df_scaler(
+ data: pd.DataFrame,
+ method="standard",
+ columns=None, # default, select all numeric col/row
+ inplace=False,
+ verbose=False, # show usage
+ axis=0, # defalut column-wise
+ **kwargs,
+ ):
+ """
+ df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)
+
+ Parameters:
+ - data: pandas DataFrame to be scaled.
+ - method: Scaler type ('standard', 'minmax', 'robust'). Default is 'standard'.
+ - columns: List of columns (for axis=0) or rows (for axis=1) to scale.
+ If None, all numeric columns/rows will be scaled.
+ - inplace: If True, modify the DataFrame in place. Otherwise, return a new DataFrame.
+ - axis: Axis along which to scale. 0 for column-wise, 1 for row-wise. Default is 0.
+ - verbose: If True, prints logs of the process.
+ - kwargs: Additional arguments to be passed to the scaler.
+ """
+ if verbose:
+ print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
+
+ methods = ["standard", "minmax", "robust"]
+ method = strcmp(method, methods)[0]
+ if method == "standard":
+ from sklearn.preprocessing import StandardScaler
+
+ scaler = StandardScaler(**kwargs)
+ elif method == "minmax":
+ from sklearn.preprocessing import MinMaxScaler
+
+ scaler = MinMaxScaler(**kwargs)
+ elif method == "robust":
+ from sklearn.preprocessing import RobustScaler
+
+ scaler = RobustScaler(**kwargs)
+ if axis not in [0, 1]:
+ raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
+
+ if axis == 0:
+ # Column-wise scaling (default)
+ if columns is None:
+ columns = data.select_dtypes(include=["float64", "int64"]).columns.tolist()
+ non_numeric_columns = data.columns.difference(columns)
+ print(f"Scaling columns")
+
+ scaled_data = scaler.fit_transform(data[columns])
+
+ if inplace:
+ data[columns] = scaled_data
+ print("Original DataFrame modified in place (column-wise).")
+ else:
+ scaled_df = pd.concat(
+ [
+ pd.DataFrame(scaled_data, columns=columns, index=data.index),
+ data[non_numeric_columns],
+ ],
+ axis=1,
+ )
+ scaled_df = scaled_df[data.columns] # Maintain column order
+ return scaled_df
+
+ elif axis == 1:
+ # Row-wise scaling
+ if columns is None:
+ columns = data.index.tolist()
+ numeric_rows = data.loc[columns].select_dtypes(include=["float64", "int64"])
+ if numeric_rows.empty:
+ raise ValueError("No numeric rows to scale.")
+
+ print(f"Scaling rows")
+
+ scaled_data = scaler.fit_transform(
+ numeric_rows.T
+ ).T # Transpose for scaling and then back
+
+ if inplace:
+ data.loc[numeric_rows.index] = scaled_data
+ print("Original DataFrame modified in place (row-wise).")
+ else:
+ scaled_df = data.copy()
+ scaled_df.loc[numeric_rows.index] = scaled_data
+ return scaled_df
+
 def df_cluster(
 data: pd.DataFrame,
 columns: Optional[list] = None,
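A compact usage sketch of the three DataFrame helpers introduced in this hunk (df_merge, df_fillna, df_scaler); the frames are invented and the functions are assumed to be importable from py2ls.ips:

    import pandas as pd
    from py2ls.ips import df_merge, df_fillna, df_scaler

    df1 = pd.DataFrame({"x": [1.0, None, 3.0]}, index=["a", "b", "c"])
    df2 = pd.DataFrame({"y": [10.0, 20.0, 30.0]}, index=["a", "b", "c"])

    merged = df_merge(df1, df2, use_index=True, how="outer")  # join on the shared index
    filled = df_fillna(merged, method="mean", inplace=False)  # impute the missing "x" with the column mean
    scaled = df_scaler(filled, method="minmax")               # column-wise min-max scaling, returns a new frame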
@@ -4721,7 +5070,7 @@ def df_cluster(
 X = scaler.fit_transform(X)
 
 for n_cluster in range_n_clusters:
- kmeans = KMeans(n_clusters=n_cluster, random_state=42)
+ kmeans = KMeans(n_clusters=n_cluster, random_state=1)
 cluster_labels = kmeans.fit_predict(X)
 
 silhouette_avg = silhouette_score(X, cluster_labels)
@@ -4737,7 +5086,7 @@ def df_cluster(
 print(f"n_clusters = {n_clusters}")
 
 # Apply K-Means Clustering with Optimal Number of Clusters
- kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+ kmeans = KMeans(n_clusters=n_clusters, random_state=1)
 cluster_labels = kmeans.fit_predict(X)
 
 if plot:
@@ -4838,7 +5187,7 @@ def df_cluster(
 # n_clusters = (
 # np.argmax(silhouette_avg_scores) + 2
 # ) # Optimal clusters based on max silhouette score
- # kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+ # kmeans = KMeans(n_clusters=n_clusters, random_state=1)
 # cluster_labels = kmeans.fit_predict(X)
 silhouette_vals = silhouette_samples(X, cluster_labels)
 
@@ -4989,12 +5338,14 @@ def df_reducer(
 columns: Optional[List[str]] = None,
 method: str = "umap", # 'pca', 'umap'
 n_components: int = 2, # Default for umap, but 50 for PCA
- umap_neighbors: int = 15, # Default
- umap_min_dist: float = 0.1, # Default
+ umap_neighbors: int = 15, # UMAP-specific
+ umap_min_dist: float = 0.1, # UMAP-specific
+ tsne_perplexity: int = 30, # t-SNE-specific
 scale: bool = True,
 fill_missing: bool = True,
 debug: bool = False,
 inplace: bool = True, # replace the oringinal data
+ plot_:bool = False,# plot scatterplot, but no 'hue',so it is meaningless
 ) -> pd.DataFrame:
 """
 Reduces the dimensionality of the selected DataFrame using PCA or UMAP.
@@ -5030,9 +5381,35 @@ def df_reducer(
 reduced_df : pd.DataFrame
 DataFrame with the reduced dimensions.
 """
- from sklearn.decomposition import PCA
+
+ """
+ PCA: explained_variance:
+ indicates the proportion of the dataset's total variance that each principal
+ component (PC) explains. It gives you a sense of how much information
+ (or variance) is captured by each PC
+ Interpretation:
+ - Higher values indicate that the corresponding PC captures more variance.
+ - The sum of the explained variances for all PCs equals 1 (or 100%).
+ - If the first few components explain a high percentage (e.g., 90%),
+ it means you can reduce the dimensionality of the data significantly without losing much information.
+ Use case:
+ You may plot a scree plot, which shows the explained variance for each PC, to help decide
+ how many components to keep for analysis.
+
+ PCA: Singular values:
+ represent the magnitude of variance along each principal component. Mathematically,
+ they are the square roots of the eigenvalues of the covariance matrix.
+ Interpretation:
+ Larger singular values indicate that the associated PC captures more variance.
+ Singular values are related to the scale of the data. If the data are scaled
+ before PCA (e.g., standardized), then the singular values will provide a measure
+ of the spread of data along each PC.
+ Use case:
+ Singular values help quantify the contribution of each principal component in a
+ similar way to the explained variance. They are useful in understanding the overall
+ structure of the data.
+ """
 from sklearn.preprocessing import StandardScaler
- import umap
 from sklearn.impute import SimpleImputer
 
 # Select columns if specified, else use all columns
@@ -5049,76 +5426,211 @@ def df_reducer(
 X = scaler.fit_transform(X)
 
 # Check valid method input
- if method not in ["pca", "umap"]:
- raise ValueError(f"Invalid method '{method}'. Choose 'pca' or 'umap'.")
-
+ methods=["pca", "umap","tsne","factor","isolation_forest"]
+ method=strcmp(method, methods)[0]
 # Apply PCA if selected
- if method == "pca":
- if n_components is None:
- # to get the n_components with threshold method:
- pca = PCA()
- pca_result = pca.fit_transform(X)
-
- # Calculate explained variance
- explained_variance = pca.explained_variance_ratio_
- # Cumulative explained variance
- cumulative_variance = np.cumsum(explained_variance)
- # Set a threshold for cumulative variance
- threshold = 0.95 # Example threshold
- n_components = (
- np.argmax(cumulative_variance >= threshold) + 1
- ) # Number of components to retain
- if debug:
- # debug:
- # Plot the cumulative explained variance
- plt.figure(figsize=(8, 5))
- plt.plot(
- range(1, len(cumulative_variance) + 1),
- cumulative_variance,
- marker="o",
- linestyle="-",
- )
- plt.title("Cumulative Explained Variance by Principal Components")
- plt.xlabel("Number of Principal Components")
- plt.ylabel("Cumulative Explained Variance")
- plt.xticks(range(1, len(cumulative_variance) + 1))
- # Add horizontal line for the threshold
- plt.axhline(
- y=threshold, color="r", linestyle="--", label="Threshold (95%)"
- )
- # Add vertical line for n_components
- plt.axvline(
- x=n_components,
- color="g",
- linestyle="--",
- label=f"n_components = {n_components}",
- )
- plt.legend()
- plt.grid()
+ if method == "pca":
+ from sklearn.decomposition import PCA
 pca = PCA(n_components=n_components)
 X_reduced = pca.fit_transform(X)
- print(f"PCA completed: Reduced to {n_components} components.")
+
+ # Additional PCA information
+ explained_variance = pca.explained_variance_ratio_
+ singular_values = pca.singular_values_
+ loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
+
+ if debug:
+ print(f"PCA completed: Reduced to {n_components} components.")
+ print(f"Explained Variance: {explained_variance}")
+ print(f"Singular Values: {singular_values}")
+
+ # Plot explained variance if debug=True
+ if debug:
+ # Plot explained variance
+ cumulative_variance = np.cumsum(explained_variance)
+ plt.figure(figsize=(8, 5))
+ plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o")
+ plt.title("Cumulative Explained Variance by Principal Components")
+ plt.xlabel("Number of Principal Components")
+ plt.ylabel("Cumulative Explained Variance")
+ plt.axhline(y=0.95, color="r", linestyle="--", label="Threshold (95%)")
+ plt.axvline(x=n_components, color="g", linestyle="--", label=f"n_components = {n_components}")
+ plt.legend()
+ plt.grid()
+ plt.show()
+
+ # Prepare reduced DataFrame with additional PCA info
+ pca_df = pd.DataFrame(
+ X_reduced, index=data.index,
+ columns=[f"PC_{i+1}" for i in range(n_components)]
+ )
+ # pca_df["Explained Variance"] = np.tile(explained_variance[:n_components], (pca_df.shape[0], 1))
+ # pca_df["Singular Values"] = np.tile(singular_values[:n_components], (pca_df.shape[0], 1))
+ # Expand explained variance to multiple columns if needed
+ for i in range(n_components):
+ pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
+ for i in range(n_components):
+ pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
 
 # Apply UMAP if selected
 elif method == "umap":
+ import umap
 umap_reducer = umap.UMAP(
 n_neighbors=umap_neighbors,
 min_dist=umap_min_dist,
- n_components=n_components,
+ n_components=n_components
 )
 X_reduced = umap_reducer.fit_transform(X)
- print(f"UMAP completed: Reduced to {n_components} components.")
 
- # Return reduced data as a new DataFrame with the same index
- reduced_df = pd.DataFrame(X_reduced, index=data.index)
+ # Additional UMAP information
+ embedding = umap_reducer.embedding_
+ trustworthiness = umap_reducer._raw_data[:, :n_components]
+
+ if debug:
+ print(f"UMAP completed: Reduced to {n_components} components.")
+ print(f"Embedding Shape: {embedding.shape}")
+ print(f"Trustworthiness: {trustworthiness}")
+
+ # Prepare reduced DataFrame with additional UMAP info
+ umap_df = pd.DataFrame(
+ X_reduced, index=data.index,
+ columns=[f"UMAP_{i+1}" for i in range(n_components)]
+ )
+ umap_df["Embedding"] = embedding[:, 0] # Example of embedding data
+ umap_df["Trustworthiness"] = trustworthiness[:, 0] # Trustworthiness metric
+ elif method == "tsne":
+ from sklearn.manifold import TSNE
+ tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=1)
+ X_reduced = tsne.fit_transform(X)
+
+ # Prepare reduced DataFrame with additional t-SNE info
+ tsne_df = pd.DataFrame(
+ X_reduced, index=data.index,
+ columns=[f"tSNE_{i+1}" for i in range(n_components)]
+ )
+ tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
+
+ # Apply Factor Analysis if selected
+ elif method == "factor":
+ from sklearn.decomposition import FactorAnalysis
+ factor = FactorAnalysis(n_components=n_components, random_state=1)
+ X_reduced = factor.fit_transform(X)
+ # Factor Analysis does not directly provide explained variance, but we can approximate it
+ fa_variance = factor.noise_variance_
+ # Prepare reduced DataFrame with additional Factor Analysis info
+ factor_df = pd.DataFrame(
+ X_reduced, index=data.index,
+ columns=[f"Factor_{i+1}" for i in range(n_components)]
+ )
+ factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
+
+ # Apply Isolation Forest for outlier detection if selected
+ elif method == "isolation_forest":
+ from sklearn.decomposition import PCA
+ from sklearn.ensemble import IsolationForest
+ # Step 1: Apply PCA for dimensionality reduction to 2 components
+ pca = PCA(n_components=n_components)
+ X_pca = pca.fit_transform(X)
+
+ explained_variance = pca.explained_variance_ratio_
+ singular_values = pca.singular_values_
+
+ # Prepare reduced DataFrame with additional PCA info
+ iso_forest_df = pd.DataFrame(
+ X_pca, index=data.index,
+ columns=[f"PC_{i+1}" for i in range(n_components)]
+ )
+
+ isolation_forest = IsolationForest(n_estimators=100, contamination='auto',random_state=1)
+ isolation_forest.fit(X)
+ anomaly_scores = isolation_forest.decision_function(X) # Anomaly score: larger is less anomalous
+ # Predict labels: 1 (normal), -1 (anomaly)
+ anomaly_labels = isolation_forest.fit_predict(X)
+ # Add anomaly scores and labels to the DataFrame
+ iso_forest_df["Anomaly Score"] = anomaly_scores
+ iso_forest_df["Anomaly Label"] = anomaly_labels
+ # add info from pca
+ for i in range(n_components):
+ iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
+ for i in range(n_components):
+ iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
+
+ # Return reduced data and info as a new DataFrame with the same index
+ if method == "pca":
+ reduced_df = pca_df
+ colname_met = "PC_"
+ if plot_:
+ sns.scatterplot(
+ data=pca_df,
+ x="PC_1",
+ y="PC_2",
+ # hue="condition",
+ )
+ elif method == "umap":
+ reduced_df = umap_df
+ colname_met = "UMAP_"
+ if plot_:
+ sns.scatterplot(
+ data=umap_df,
+ x="UMAP_1",
+ y="UMAP_2",
+ # hue="condition",
+ )
+ elif method == "tsne":
+ reduced_df = tsne_df
+ colname_met = "t-SNE_"
+ if plot_:
+ sns.scatterplot(
+ data=tsne_df,
+ x="tSNE_1",
+ y="tSNE_2",
+ # hue="batch",
+ )
+ elif method == "factor":
+ reduced_df = factor_df
+ colname_met = "Factor_"
+ if plot_:
+ sns.scatterplot(
+ data=factor_df,
+ x="Factor_1",
+ y="Factor_2",
+ # hue="batch",
+ )
+ elif method == "isolation_forest":
+ reduced_df = iso_forest_df # Already a DataFrame for outliers
+ colname_met = "PC_"
+ if plot_:
+ ax = sns.scatterplot(
+ data=iso_forest_df[iso_forest_df["Anomaly Label"] == 1],
+ x="PC_1",
+ y="PC_2",
+ label="normal", c="b",
+ )
+ ax = sns.scatterplot(
+ ax=ax,
+ data=iso_forest_df[iso_forest_df["Anomaly Label"] == -1],
+ x="PC_1",
+ y="PC_2",
+ c="r",
+ label="outlier", marker="+", s=30,
+ )
+
 
 if inplace:
- # Replace or add new columns based on n_components
+ # If inplace=True, add components back into the original data
 for col_idx in range(n_components):
- data[f"Component_{col_idx+1}"] = reduced_df.iloc[:, col_idx]
+ data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
+
+ # Add extra info for PCA/UMAP
+ if method == "pca":
+ data["Explained Variance"] = reduced_df["Explained Variance"]
+ data["Singular Values"] = reduced_df["Singular Values"]
+ elif method == "umap":
+ data["Embedding"] = reduced_df["Embedding"]
+ data["Trustworthiness"] = reduced_df["Trustworthiness"]
 return None # No return when inplace=True
 
- return reduced_df
+ return reduced_df
 
 
 # example:
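And a usage sketch for the extended df_reducer() (illustrative only; random data, the function assumed importable from py2ls.ips, and the "umap" branch additionally requires the umap-learn package):

    import numpy as np
    import pandas as pd
    from py2ls.ips import df_reducer

    rng = np.random.default_rng(0)
    data = pd.DataFrame(rng.normal(size=(100, 6)), columns=list("abcdef"))

    # PCA down to 2 components; with inplace=False the reduced frame is returned,
    # including the extra "Explained Variance PC_*" / "Singular Values PC_*" columns.
    pca_df = df_reducer(data.copy(), method="pca", n_components=2, inplace=False)

    # The other reducers added in this version are selected the same way, e.g.:
    tsne_df = df_reducer(data.copy(), method="tsne", n_components=2, inplace=False)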
@@ -5373,7 +5885,7 @@ def evaluate_cluster(
 return metrics
 
 
- def print_pd_usage(
+ def use_pd(
 func_name="excel",
 verbose=True,
 dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",
@@ -5387,4 +5899,4 @@ def print_pd_usage(
 i_ = i_.replace("=", "\t= ") + ","
 print(i_) if i == 0 else print("\t", i_)
 else:
- print(usage)
+ print(usage)