py2ls 0.2.4__py3-none-any.whl → 0.2.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -61,19 +61,31 @@ except NameError:
 
 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     """
-    Add the Chinese font to the font manager
+    Add the Chinese (default) font to the font manager
     Args:
         dir_font (str, optional): _description_. Defaults to "/System/Library/Fonts/Hiragino Sans GB.ttc".
     """
     import matplotlib.pyplot as plt
     from matplotlib import font_manager
+    slashtype = "/" if 'mac' in get_os() else "\\"
+    if slashtype in dir_font:
+        font_manager.fontManager.addfont(dir_font)
+        fontname = os.path.basename(dir_font).split(".")[0]
+    else:
+        if "cn" in dir_font.lower() or "ch" in dir_font.lower():
+            fontname = "Hiragino Sans GB" # default Chinese font
+        else:
+            fontname = dir_font
 
-    font_manager.fontManager.addfont(dir_font)
-    fontname_chinese = os.path.basename(dir_font).split(".")[0]
-
-    plt.rcParams["font.sans-serif"] = [fontname_chinese]
-    plt.rcParams["font.family"] = "sans-serif"
+    plt.rcParams["font.sans-serif"] = [fontname]
+    # plt.rcParams["font.family"] = "sans-serif"
     plt.rcParams["axes.unicode_minus"] = False
+    fonts_in_system = font_manager.findSystemFonts(fontpaths=None, fontext="ttf")
+    fontname_in_system = [os.path.basename(i).split(".")[0] for i in fonts_in_system]
+    if fontname not in fontname_in_system:
+        print(f"Font '{fontname}' not found. Falling back to default.")
+        plt.rcParams["font.sans-serif"] = ["Arial"]
+    return fontname
 
 # set 'dir_save'
 if "dar" in sys.platform:
@@ -526,6 +538,7 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
         if isinstance(s, str):
             return s.lower()
         elif isinstance(s, list):
+            s=[str(i) for i in s]# convert all to str
             return [elem.lower() for elem in s]
         return s
 
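The added str() coercion means the lowercasing step no longer fails on candidate lists that mix strings with numbers. A sketch of what the change enables (hypothetical values):

    # Previously, lowering a mixed list such as [2024, "Apple"] raised
    # AttributeError on the int; now every element is stringified first.
    best = strcmp("apple", [2024, "Apple", "BANANA"])[0]  # fuzzy match after coercion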
@@ -1697,24 +1710,16 @@ def fload(fpath, kind=None, **kwargs):
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
 
-        engine = kwargs.get("engine", "pyarrow")
-        kwargs.pop("engine", None)
-        sep = kwargs.get("sep", "\t")
-        kwargs.pop("sep", None)
-        index_col = kwargs.get("index_col", None)
-        kwargs.pop("index_col", None)
-        memory_map = kwargs.get("memory_map", True)
-        kwargs.pop("memory_map", None)
-        skipinitialspace = kwargs.get("skipinitialspace", True)
-        kwargs.pop("skipinitialspace", None)
-        encoding = kwargs.get("encoding", "utf-8")
-        kwargs.pop("encoding", None)
-        on_bad_lines = kwargs.get("on_bad_lines", "skip")
-        kwargs.pop("on_bad_lines", None)
-        comment = kwargs.get("comment", None)
-        kwargs.pop("comment", None)
-
+        engine = kwargs.pop("engine", "pyarrow")
+        sep = kwargs.pop("sep", "\t")
+        index_col = kwargs.pop("index_col", None)
+        memory_map = kwargs.pop("memory_map", False)
+        skipinitialspace = kwargs.pop("skipinitialspace", False)
+        encoding = kwargs.pop("encoding", "utf-8")
+        on_bad_lines = kwargs.pop("on_bad_lines", "skip")
+        comment = kwargs.pop("comment", None)
         fmt=kwargs.pop("fmt",False)
+        verbose=kwargs.pop("verbose",False)
         if verbose:
             print_pd_usage("read_csv", verbose=verbose)
             return
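
The rewrite collapses every get-then-pop pair into a single kwargs.pop(key, default), which reads the value and removes the key in one step, so the remaining kwargs can be forwarded safely. A generic sketch of the pattern:

    import pandas as pd

    def load(fpath, **kwargs):
        # pop() returns the caller's value (or the default) AND removes the key,
        # so **kwargs can be passed straight to pd.read_csv without duplication.
        sep = kwargs.pop("sep", "\t")
        encoding = kwargs.pop("encoding", "utf-8")
        return pd.read_csv(fpath, sep=sep, encoding=encoding, **kwargs)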
@@ -1800,7 +1805,7 @@ def fload(fpath, kind=None, **kwargs):
             separators = [",", "\t", ";", "|", " "]
             for sep in separators:
                 sep2show = sep if sep != "\t" else "\\t"
-                # print(f'trying with: engine=pyarrow, sep="{sep2show}"')
+                print(f'trying with: engine=pyarrow, sep="{sep2show}"')
                 try:
                     df = pd.read_csv(
                         fpath,
@@ -1819,13 +1824,13 @@ def fload(fpath, kind=None, **kwargs):
                 except:
                     pass
         else:
-            engines = ["c", "python"]
+            engines = [None,"c", "python"]
             for engine in engines:
-                # separators = [",", "\t", ";", "|", " "]
+                separators = [",", "\t", ";", "|", " "]
                 for sep in separators:
                     try:
                         sep2show = sep if sep != "\t" else "\\t"
-                        # print(f"trying with: engine={engine}, sep='{sep2show}'")
+                        print(f"trying with: engine={engine}, sep='{sep2show}'")
                         df = pd.read_csv(
                             fpath,
                             engine=engine,
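
Restoring the separators list inside the fallback branch matters: without it the inner loop silently reused whichever sep the pyarrow branch last tried. The overall strategy is a brute-force sweep over (engine, separator) pairs; a self-contained sketch of the same idea:

    import pandas as pd

    def read_csv_any(fpath):
        """Try engine/separator combinations until one parses into more than one column."""
        for engine in (None, "c", "python"):
            for sep in (",", "\t", ";", "|", " "):
                try:
                    df = pd.read_csv(fpath, engine=engine, sep=sep)
                except Exception:
                    continue
                if df.shape[1] > 1:  # heuristic: a single column usually means the wrong sep
                    return df
        raise ValueError(f"could not parse {fpath}")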
@@ -2031,6 +2036,9 @@ def fload(fpath, kind=None, **kwargs):
     elif kind.lower() in img_types:
         print(f'Image ".{kind}" is loaded.')
         return load_img(fpath)
+    elif kind=="gz" and fpath.endswith(".soft.gz"):
+        import GEOparse
+        return GEOparse.get_GEO(filepath=fpath)
     elif kind.lower() in zip_types:
         keep = kwargs.get("keep", False)
         fpath_unzip = unzip(fpath)
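
GEOparse.get_GEO(filepath=...) parses a local SOFT file and returns a GEO object, so fload can now hand .soft.gz files to it directly. A hedged usage sketch (hypothetical accession; the attributes available depend on whether the record is a GSE, GSM, or GPL):

    import GEOparse

    gse = GEOparse.get_GEO(filepath="./GSE12345_family.soft.gz")
    print(gse.metadata.get("title"))
    for name, gsm in gse.gsms.items():  # per-sample tables for a series record
        print(name, gsm.table.shape)
        break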
@@ -2105,30 +2113,51 @@ def fload(fpath, kind=None, **kwargs):
 # docx_content = fload('sample.docx')
 
 
-def fupdate(fpath, content=None):
+def fupdate(fpath, content=None, how="head"):
     """
     Update a file by adding new content at the top and moving the old content to the bottom.
+    If the file is a JSON file, merge the new content with the old content.
+
     Parameters
     ----------
     fpath : str
         The file path where the content should be updated.
-    content : str, optional
-        The new content to add at the top of the file. If not provided, the function will not add any new content.
+    content : str or dict, optional
+        The new content to add at the top of the file (for text) or merge (for JSON).
+        If not provided, the function will not add any new content.
+
     Notes
     -----
     - If the file at `fpath` does not exist, it will be created.
-    - The new content will be added at the top, followed by the old content of the file.
+    - For text files, the new content will be added at the top, followed by the old content.
+    - For JSON files, the new content will be merged with the existing JSON content.
     """
     content = content or ""
-    if os.path.exists(fpath):
-        with open(fpath, "r") as file:
-            old_content = file.read()
+    file_ext = os.path.splitext(fpath)[1]
+    how_s=["head", "tail","start","end","beginning", "stop",'last',"before"]
+    how = strcmp(how, how_s)[0]
+    print(how)
+    add_where = 'head' if how in ["head", "start","beginning", "before"] else "tail"
+    if "json" in file_ext.lower():
+        old_content=fload(fpath,kind='json') if os.path.exists(fpath) else {}
+        updated_content = {**content,**old_content} if add_where=="head" else {**old_content, **content} if isinstance(content, dict) else old_content
+        fsave(fpath,updated_content)
     else:
-        old_content = ""
+        # Handle text file
+        if os.path.exists(fpath):
+            with open(fpath, "r") as file:
+                old_content = file.read()
+        else:
+            old_content = ""
 
-    with open(fpath, "w") as file:
-        file.write(content)
-        file.write(old_content)
+        # Write new content at the top followed by old content
+        with open(fpath, "w") as file:
+            if add_where=="head":
+                file.write(content + "\n")
+                file.write(old_content)
+            else:
+                file.write(old_content)
+                file.write(content + "\n")
 
 
 def fappend(fpath, content=None):
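
fupdate now branches on the file extension: .json files get a dict merge, everything else gets a text insert whose position is controlled by how (fuzzy-matched through strcmp). A usage sketch:

    fupdate("notes.txt", "newest entry")              # prepended (default how="head")
    fupdate("notes.txt", "oldest entry", how="tail")  # appended instead

    # JSON: merged as dicts. Note the head order {**content, **old_content}
    # lets existing keys win on conflict; the tail order lets the new ones win.
    fupdate("config.json", {"theme": "dark"})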
@@ -2710,12 +2739,14 @@ def mkdir_nest(fpath: str) -> str:
     Returns:
     - str: The path of the created directory.
     """
-    # Check if the directory already exists
-    if os.path.isdir(fpath):
-        return fpath
+
 
     # Split the full path into directories
     f_slash = "/" if "mac" in get_os().lower() else "\\"
+    if os.path.isdir(fpath):
+        fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
+        print(fpath)
+        return fpath
     dir_parts = fpath.split(f_slash) # Split the path by the OS-specific separator
 
     # Start creating directories from the root to the desired path
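
The early return for an already-existing directory now appends the OS separator first, so the result can always be concatenated with a filename. A sketch of the invariant (assuming a macOS/Linux "/" separator):

    path = mkdir_nest("/tmp/experiment/results")
    assert path.endswith("/")   # guaranteed by the new normalization
    fname = path + "data.csv"   # safe without os.path.join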
@@ -2744,34 +2775,27 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     - str: The path of the created directory or an error message.
     """
 
-    rootdir = []
-    # Convert string to list
+    rootdir = []
     if chdir is None:
         return mkdir_nest(pardir)
     if isinstance(chdir, str):
-        chdir = [chdir]
-    # Subfoldername should be unique
+        chdir = [chdir]
     chdir = list(set(chdir))
     if isinstance(pardir, str): # Dir_parents should be 'str' type
-        pardir = os.path.normpath(pardir)
-    # Get the slash type: "/" or "\"
-    stype = "/" if "/" in pardir else "\\"
+        pardir = os.path.normpath(pardir)
     if "mac" in get_os().lower() or "lin" in get_os().lower():
         stype = "/"
     elif "win" in get_os().lower():
         stype = "\\"
     else:
         stype = "/"
-
-    # Check if the parent directory exists and is a directory path
+
     if os.path.isdir(pardir):
         os.chdir(pardir) # Set current path
         # Check if subdirectories are not empty
         if chdir:
-            chdir.sort()
-            # Create multiple subdirectories at once
-            for folder in chdir:
-                # Check if the subfolder already exists
+            chdir.sort()
+            for folder in chdir:
                 child_tmp = os.path.join(pardir, folder)
                 if not os.path.isdir(child_tmp):
                     os.mkdir("./" + folder)
@@ -2791,6 +2815,8 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     # Dir is the main output, if only one dir, then str type is inconvenient
     if len(rootdir) == 1:
         rootdir = rootdir[0]
+        rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
+        print(rootdir)
     return rootdir
 
 
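mkdir gains the same trailing-separator guarantee for its single-directory return value. A usage sketch (hypothetical paths):

    fig_dir = mkdir("/tmp/project", "figures")  # one subfolder -> str return
    assert fig_dir.endswith("/") or fig_dir.endswith("\\")
    figsave(fig_dir, "overview.pdf")            # concatenation-safe downstream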
@@ -2805,22 +2831,25 @@ def figsave(*args, dpi=300):
     dir_save = None
     fname = None
     img = None
+    f_slash = "/" if "mac" in get_os().lower() else "\\"
     for arg in args:
         if isinstance(arg, str):
-            if "/" in arg or "\\" in arg:
+            if f_slash in arg:
                 dir_save = arg
-            elif "/" not in arg and "\\" not in arg:
+            else:
                 fname = arg
         elif isinstance(arg, (Image.Image, np.ndarray)):
             img = arg # Store the PIL image if provided
 
-    f_slash = "/" if "mac" in get_os().lower() else "\\"
     if dir_save is None:
         dir_save="./"
+    print(dir_save)
+    # dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
     dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
     dir_ch = "".join(dir_save.split(f_slash)[-1:])
     if not dir_par.endswith(f_slash):
         dir_par += f_slash
+    print(dir_par)
     if fname is None:
         fname = dir_ch
     mkdir(dir_par)
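
figsave now classifies its string arguments with the single OS-appropriate separator instead of checking both slash styles. A call sketch (hypothetical filenames):

    figsave("/tmp/plots/", "scatter.pdf", dpi=300)  # directory + filename as separate args
    figsave("scatter.png")                          # filename only -> saved under "./"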
@@ -4418,9 +4447,10 @@ def preview(var):
 
 # ! DataFrame
 def df_astype(
-    df: pd.DataFrame,
+    data: pd.DataFrame,
     columns: Optional[Union[str, List[str]]] = None,
     astype: str = "datetime",
+    skip_row:Union[str,list]=None,
     fmt: Optional[str] = None,
     inplace: bool = True,
     errors: str = "coerce", # Can be "ignore", "raise", or "coerce"
@@ -4478,22 +4508,30 @@ def df_astype(
         "second",
         "time",
         "week",
-        "date",
+        "date","day",
         "month",
         "year",
     ]
-    # correct the astype input
-    astype = strcmp(astype, astypes)[0]
-    print(f"converting {columns} as type: {astype}")
     # If inplace is False, make a copy of the DataFrame
     if not inplace:
-        df = df.copy()
+        data = data.copy()
+    if skip_row is not None:
+        data = data.drop(index=skip_row, errors='ignore')
     # If columns is None, apply to all columns
     if columns is None:
-        columns = df.columns
+        columns = data.columns.tolist()
+    # correct the astype input
+    if isinstance(astype,str):
+        astype = strcmp(astype, astypes)[0]
+        print(f"converting as type: {astype}")
+    elif isinstance(astype,dict):
+        for col, dtype in astype.items():
+            dtype='date' if dtype=="day" else dtype
+            data[col]=data[col].astype(strcmp(dtype, astypes)[0])
+        return data if not inplace else None
 
     # Ensure columns is a list
-    if isinstance(columns, (str, int)):
+    if isinstance(columns, str):
         columns = [columns]
 
     # Convert specified columns
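
Besides the df -> data rename, df_astype now accepts skip_row to drop rows before converting and a dict form of astype that maps columns to dtypes ("day" is treated as an alias of "date"). A sketch of the new call shapes (hypothetical frame):

    import pandas as pd

    data = pd.DataFrame({"when": ["2024-01-02", "2024-02-03"], "n": ["1", "2"]})

    df_astype(data, columns="when", astype="datetime")       # single dtype, fuzzy-corrected
    df_astype(data, astype={"when": "day", "n": "numeric"})  # per-column dict form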
@@ -4513,72 +4551,74 @@ def df_astype(
                 kwargs.pop("errors", None)
             # convert it as type: datetime
             if isinstance(column, int):
-                df.iloc[:, column] = pd.to_datetime(
-                    df.iloc[:, column], format=fmt, errors=errors, **kwargs
+                data.iloc[:, column] = pd.to_datetime(
+                    data.iloc[:, column], format=fmt, errors=errors, **kwargs
                 )
                 # further convert:
                 if astype == "time":
-                    df.iloc[:, column] = df.iloc[:, column].dt.time
+                    data.iloc[:, column] = data.iloc[:, column].dt.time
                 elif astype == "month":
-                    df.iloc[:, column] = df.iloc[:, column].dt.month
+                    data.iloc[:, column] = data.iloc[:, column].dt.month
                 elif astype == "year":
-                    df.iloc[:, column] = df.iloc[:, column].dt.year
-                elif astype == "date":
-                    df.iloc[:, column] = df.iloc[:, column].dt.date
+                    data.iloc[:, column] = data.iloc[:, column].dt.year
+                elif astype == "date" or astype == "day":
+                    data.iloc[:, column] = data.iloc[:, column].dt.date
                 elif astype == "hour":
-                    df.iloc[:, column] = df.iloc[:, column].dt.hour
+                    data.iloc[:, column] = data.iloc[:, column].dt.hour
                 elif astype == "minute":
-                    df.iloc[:, column] = df.iloc[:, column].dt.minute
+                    data.iloc[:, column] = data.iloc[:, column].dt.minute
                 elif astype == "second":
-                    df.iloc[:, column] = df.iloc[:, column].dt.second
+                    data.iloc[:, column] = data.iloc[:, column].dt.second
                 elif astype == "week":
-                    df.iloc[:, column] = df.iloc[:, column].dt.day_name()
+                    data.iloc[:, column] = data.iloc[:, column].dt.day_name()
             else:
-                df[column] = (
+                data[column] = (
                     pd.to_datetime(
-                        df[column], format=fmt, errors=errors, **kwargs
+                        data[column], format=fmt, errors=errors, **kwargs
                     )
                     if fmt
-                    else pd.to_datetime(df[column], errors=errors, **kwargs)
+                    else pd.to_datetime(data[column], errors=errors, **kwargs)
                 )
                 # further convert:
                 if astype == "time":
-                    df[column] = df[column].dt.time
+                    data[column] = data[column].dt.time
                 elif astype == "month":
-                    df[column] = df[column].dt.month
+                    data[column] = data[column].dt.month
                 elif astype == "year":
-                    df[column] = df[column].dt.year
+                    data[column] = data[column].dt.year
                 elif astype == "date":
-                    df[column] = df[column].dt.date
+                    data[column] = data[column].dt.date
                 elif astype == "hour":
-                    df[column] = df[column].dt.hour
+                    data[column] = data[column].dt.hour
                 elif astype == "minute":
-                    df[column] = df[column].dt.minute
+                    data[column] = data[column].dt.minute
                 elif astype == "second":
-                    df[column] = df[column].dt.second
+                    data[column] = data[column].dt.second
                 elif astype == "week":
-                    df[column] = df[column].dt.day_name()
+                    data[column] = data[column].dt.day_name()
 
         elif astype == "numeric":
             kwargs.pop("errors", None)
-            df[column] = pd.to_numeric(df[column], errors=errors, **kwargs)
+            data[column] = pd.to_numeric(data[column], errors=errors, **kwargs)
             # print(f"Successfully converted '{column}' to numeric.")
         elif astype == "timedelta":
             kwargs.pop("errors", None)
-            df[column] = pd.to_timedelta(df[column], errors=errors, **kwargs)
+            data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
             # print(f"Successfully converted '{column}' to timedelta.")
         else:
             # Convert to other types (e.g., float, int)
-            df[column] = df[column].astype(astype)
+            data[column] = data[column].astype(astype)
             # print(f"Successfully converted '{column}' to {astype}.")
     except Exception as e:
         print(f"Error converting '{column}' to {astype}: {e}")
-
-    # Return the modified DataFrame if inplace is False
-    return df
+    try:
+        display(data.info()[:10])
+    except:
+        pass
+    return data
 
 
-# ! DataFrame
+# ! DataFrame
 def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
     """
     Sort a DataFrame by a specified column based on a custom order or by count.
@@ -4595,7 +4635,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
     Returns:
     - Sorted DataFrame if inplace is False, otherwise None.
     """
-    if column not in df.columns:
+    if column not in data.columns:
         raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
 
     if isinstance(by, str) and 'count' in by.lower():
@@ -4618,11 +4658,11 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 
     try:
         if inplace: # replace the original
-            df.sort_values(column, ascending=ascending, inplace=True, **kwargs)
+            data.sort_values(column, ascending=ascending, inplace=True, **kwargs)
             print(f"Successfully sorted DataFrame by '{column}'")
             return None
         else:
-            sorted_df = df.sort_values(column, ascending=ascending, **kwargs)
+            sorted_df = data.sort_values(column, ascending=ascending, **kwargs)
             print(f"Successfully sorted DataFrame by '{column}' using custom order.")
             return sorted_df
     except Exception as e:
@@ -4630,7 +4670,6 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
         return df
 
 
-
 # # Example usage:
 # # Sample DataFrame
 # data = {
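
Note that the body now references data while the signature still takes df, so these calls raise NameError until the rename is completed; the intent is a count-based or custom-order sort. An equivalent plain-pandas sketch of the by="count" path:

    import pandas as pd

    df = pd.DataFrame({"month": ["Mar", "Jan", "Jan", "Feb"]})
    order = df["month"].value_counts(ascending=True).index.tolist()
    rank = {m: i for i, m in enumerate(order)}
    df_sorted = df.sort_values("month", key=lambda s: s.map(rank))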
@@ -4661,6 +4700,236 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 # display(df_month)
 
 
+def df_merge(
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    use_index: bool = True,
+    columns: list = ["col_left", "col_right"],
+    how: str = "left",
+) -> pd.DataFrame:
+    """
+    Merges two DataFrames based on either the index or shared columns with matching data types.
+    usage:
+    #(1) if the indexes are the same
+    df_merged = df_merge(df1, df2, use_index=True (default), how='outer')
+    #(2) if there are shared columns, then based on shared columns
+    df_merged = df_merge(df1, df2, how='outer')
+    #(3) if columns: then based on the specific columns
+    df_merged = df_merge(df1, df2, columns=["col_left", "col_right"], how='outer')
+    Parameters:
+    - df1 (pd.DataFrame): The first DataFrame.
+    - df2 (pd.DataFrame): The second DataFrame.
+    - use_index (bool): If True, first try to merge by index if they are comparable; otherwise, fall back to column-based merge.
+    - how (str): Type of merge to perform: 'inner', 'outer', 'left', or 'right'. Default is 'left'.
+        'inner': only the rows that have matching values in both DataFrames (intersection)
+        'outer': keeps all rows from both DataFrames and fills in missing values with NaN
+        'left': keeps all rows from the left DataFrame and matches rows from the right DataFrame
+        'right': keeps all rows from the right DataFrame and matches rows from the left DataFrame, filling with NaN if there is no match.
+
+    Returns:
+    - pd.DataFrame: The merged DataFrame.
+    """
+
+    # 1. Check if indices are comparable (same length and types)
+    if use_index or df1.index.equals(df2.index):
+        print(f"Merging based on index using '{how}' join...")
+        df_merged = pd.merge(df1, df2, left_index=True, right_index=True, how=how)
+        return df_merged
+
+    # 2. Find common columns with the same dtype
+    common_columns = df1.columns.intersection(df2.columns)
+    shared_columns = []
+    for col in common_columns:
+        if df1[col].dtype == df2[col].dtype:
+            shared_columns.append(col)
+    if not isinstance(columns, list):
+        columns = [columns]
+    if len(columns) != 2:
+        raise ValueError(
+            "'columns' should be a list: columns=['col_left','col_right']"
+        )
+    if all(columns):
+        print(f"Merging based on columns: {columns} using '{how}' join...")
+        df_merged = pd.merge(df1, df2, left_on=columns[0], right_on=columns[1], how=how)
+    elif shared_columns:
+        print(
+            f"Merging based on shared columns: {shared_columns} using '{how}' join..."
+        )
+        df_merged = pd.merge(df1, df2, on=shared_columns, how=how)
+    else:
+        raise ValueError(
+            "No common columns with matching data types to merge on, and indices are not comparable."
+        )
+    return df_merged
+
+def df_fillna(
+    data: pd.DataFrame,
+    method: str = "mean",
+    axis: int = 0,  # column-wise
+    constant: float = None,
+    inplace: bool = True,
+) -> pd.DataFrame:
+    """
+    Fill missing values in a DataFrame using the specified imputation method.
+
+    Parameters:
+    data (pd.DataFrame): The DataFrame to fill missing values.
+    method (str): The imputation method to use. Options are:
+        - 'mean': Replace missing values with the mean of the column.
+        - 'median': Replace missing values with the median of the column.
+        - 'most_frequent': Replace missing values with the most frequent value in the column.
+        - 'constant': Replace missing values with a constant value provided by the `constant` parameter.
+        - 'knn': Use K-Nearest Neighbors imputation.
+        - 'iterative': Use Iterative imputation.
+    axis (int): The axis along which to impute:
+        - 0: Impute column-wise (default).
+        - 1: Impute row-wise.
+    constant (float, optional): Constant value to use for filling NaNs if method is 'constant'.
+    inplace (bool): If True, modify the original DataFrame. If False, return a new DataFrame.
+
+    """
+
+    if data.empty:
+        raise ValueError("Input DataFrame is empty.")
+
+    # Validate method
+    methods = ["mean", "median", "most_frequent", "constant", "knn", "iterative"]
+    method = strcmp(method, methods)[0]
+
+    # If a constant value is given, switch to the constant method
+    if constant is not None:
+        method = "constant"
+        try:
+            constant = float(constant)
+        except ValueError:
+            raise ValueError("Constant value must be a number.")
+
+    # Initialize the imputer with the chosen method
+    from sklearn.impute import SimpleImputer
+    if method == "constant":
+        imputer = SimpleImputer(strategy=method, fill_value=constant)
+    elif method == "knn":
+        from sklearn.impute import KNNImputer
+
+        imputer = KNNImputer()  # sklearn default: n_neighbors=5
+    elif method == "iterative":
+        from sklearn.experimental import enable_iterative_imputer  # noqa: F401
+        from sklearn.impute import IterativeImputer
+
+        imputer = IterativeImputer()  # sklearn default: max_iter=10
+    else:
+        imputer = SimpleImputer(strategy=method)
+
+    # Fit and transform the data
+    if axis == 0:
+        # Impute column-wise
+        imputed_data = imputer.fit_transform(data)
+    elif axis == 1:
+        # Impute row-wise
+        imputed_data = imputer.fit_transform(data.T)
+    else:
+        raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
+
+    df_filled = pd.DataFrame(
+        imputed_data if axis == 0 else imputed_data.T,
+        index=data.index,  # if axis == 0 else data.columns,
+        columns=data.columns,  # if axis == 0 else data.index,
+    )
+
+    if inplace:
+        data.update(df_filled)
+        return None  # replace original
+    else:
+        return df_filled
+
+def df_scaler(
+    data: pd.DataFrame,
+    method="standard",
+    columns=None,  # default, select all numeric col/row
+    inplace=False,
+    verbose=False,  # show usage
+    axis=0,  # default column-wise
+    **kwargs,
+):
+    """
+    df_scaler(data, method="standard", inplace=False, axis=0, verbose=True)
+
+    Parameters:
+    - data: pandas DataFrame to be scaled.
+    - method: Scaler type ('standard', 'minmax', 'robust'). Default is 'standard'.
+    - columns: List of columns (for axis=0) or rows (for axis=1) to scale.
+      If None, all numeric columns/rows will be scaled.
+    - inplace: If True, modify the DataFrame in place. Otherwise, return a new DataFrame.
+    - axis: Axis along which to scale. 0 for column-wise, 1 for row-wise. Default is 0.
+    - verbose: If True, prints logs of the process.
+    - kwargs: Additional arguments to be passed to the scaler.
+    """
+    if verbose:
+        print('df_scaler(data, method="standard", inplace=False, axis=0, verbose=True)')
+
+    methods = ["standard", "minmax", "robust"]
+    method = strcmp(method, methods)[0]
+    if method == "standard":
+        from sklearn.preprocessing import StandardScaler
+
+        scaler = StandardScaler(**kwargs)
+    elif method == "minmax":
+        from sklearn.preprocessing import MinMaxScaler
+
+        scaler = MinMaxScaler(**kwargs)
+    elif method == "robust":
+        from sklearn.preprocessing import RobustScaler
+
+        scaler = RobustScaler(**kwargs)
+    if axis not in [0, 1]:
+        raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
+
+    if axis == 0:
+        # Column-wise scaling (default)
+        if columns is None:
+            columns = data.select_dtypes(include=["float64", "int64"]).columns.tolist()
+        non_numeric_columns = data.columns.difference(columns)
+        print(f"Scaling columns")
+
+        scaled_data = scaler.fit_transform(data[columns])
+
+        if inplace:
+            data[columns] = scaled_data
+            print("Original DataFrame modified in place (column-wise).")
+        else:
+            scaled_df = pd.concat(
+                [
+                    pd.DataFrame(scaled_data, columns=columns, index=data.index),
+                    data[non_numeric_columns],
+                ],
+                axis=1,
+            )
+            scaled_df = scaled_df[data.columns]  # Maintain column order
+            return scaled_df
+
+    elif axis == 1:
+        # Row-wise scaling
+        if columns is None:
+            columns = data.index.tolist()
+        numeric_rows = data.loc[columns].select_dtypes(include=["float64", "int64"])
+        if numeric_rows.empty:
+            raise ValueError("No numeric rows to scale.")
+
+        print(f"Scaling rows")
+
+        scaled_data = scaler.fit_transform(
+            numeric_rows.T
+        ).T  # Transpose for scaling and then back
+
+        if inplace:
+            data.loc[numeric_rows.index] = scaled_data
+            print("Original DataFrame modified in place (row-wise).")
+        else:
+            scaled_df = data.copy()
+            scaled_df.loc[numeric_rows.index] = scaled_data
+            return scaled_df
+
 def df_cluster(
     data: pd.DataFrame,
     columns: Optional[list] = None,
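
The three new helpers compose into a small preprocessing pipeline: merge, impute, scale. A hedged end-to-end sketch (hypothetical data; df_fillna with inplace=True returns None, hence inplace=False here):

    import numpy as np
    import pandas as pd

    df1 = pd.DataFrame({"a": [1.0, np.nan, 3.0]})
    df2 = pd.DataFrame({"b": [10.0, 20.0, np.nan]})

    merged = df_merge(df1, df2, use_index=True, how="outer")  # index-based join
    filled = df_fillna(merged, method="mean", inplace=False)  # column-wise mean imputation
    scaled = df_scaler(filled, method="standard")             # z-score numeric columns
    print(scaled)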
@@ -5381,4 +5650,4 @@ def print_pd_usage(
             i_ = i_.replace("=", "\t= ") + ","
             print(i_) if i == 0 else print("\t", i_)
         else:
-            print(usage)
+            print(usage)