py2ls 0.2.4.1__py3-none-any.whl → 0.2.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -61,19 +61,31 @@ except NameError:
 
 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     """
-    Add the Chinese font to the font manager
+    Add the Chinese (default) font to the font manager
     Args:
        dir_font (str, optional): _description_. Defaults to "/System/Library/Fonts/Hiragino Sans GB.ttc".
    """
    import matplotlib.pyplot as plt
    from matplotlib import font_manager
+    slashtype = "/" if 'mac' in get_os() else "\\"
+    if slashtype in dir_font:
+        font_manager.fontManager.addfont(dir_font)
+        fontname = os.path.basename(dir_font).split(".")[0]
+    else:
+        if "cn" in dir_font.lower() or "ch" in dir_font.lower():
+            fontname = "Hiragino Sans GB"  # default Chinese font
+        else:
+            fontname = dir_font
 
-    font_manager.fontManager.addfont(dir_font)
-    fontname_chinese = os.path.basename(dir_font).split(".")[0]
-
-    plt.rcParams["font.sans-serif"] = [fontname_chinese]
-    plt.rcParams["font.family"] = "sans-serif"
+    plt.rcParams["font.sans-serif"] = [fontname]
+    # plt.rcParams["font.family"] = "sans-serif"
    plt.rcParams["axes.unicode_minus"] = False
+    fonts_in_system = font_manager.findSystemFonts(fontpaths=None, fontext="ttf")
+    fontname_in_system = [os.path.basename(i).split(".")[0] for i in fonts_in_system]
+    if fontname not in fontname_in_system:
+        print(f"Font '{fontname}' not found. Falling back to default.")
+        plt.rcParams["font.sans-serif"] = ["Arial"]
+    return fontname
 
 
 # set 'dir_save'
 if "dar" in sys.platform:
@@ -526,6 +538,7 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
         if isinstance(s, str):
             return s.lower()
         elif isinstance(s, list):
+            s=[str(i) for i in s]# convert all to str
             return [elem.lower() for elem in s]
         return s
 
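The added str() coercion lets strcmp lower-case candidate lists that mix in non-string items. A one-line sketch (values invented for illustration):

    strcmp("2024", [2022, 2023, 2024])  # ints are coerced to str before matching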
@@ -1697,24 +1710,16 @@ def fload(fpath, kind=None, **kwargs):
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
 
-        engine = kwargs.get("engine", "pyarrow")
-        kwargs.pop("engine", None)
-        sep = kwargs.get("sep", "\t")
-        kwargs.pop("sep", None)
-        index_col = kwargs.get("index_col", None)
-        kwargs.pop("index_col", None)
-        memory_map = kwargs.get("memory_map", True)
-        kwargs.pop("memory_map", None)
-        skipinitialspace = kwargs.get("skipinitialspace", True)
-        kwargs.pop("skipinitialspace", None)
-        encoding = kwargs.get("encoding", "utf-8")
-        kwargs.pop("encoding", None)
-        on_bad_lines = kwargs.get("on_bad_lines", "skip")
-        kwargs.pop("on_bad_lines", None)
-        comment = kwargs.get("comment", None)
-        kwargs.pop("comment", None)
-
+        engine = kwargs.pop("engine", "pyarrow")
+        sep = kwargs.pop("sep", "\t")
+        index_col = kwargs.pop("index_col", None)
+        memory_map = kwargs.pop("memory_map", False)
+        skipinitialspace = kwargs.pop("skipinitialspace", False)
+        encoding = kwargs.pop("encoding", "utf-8")
+        on_bad_lines = kwargs.pop("on_bad_lines", "skip")
+        comment = kwargs.pop("comment", None)
         fmt=kwargs.pop("fmt",False)
+        verbose=kwargs.pop("verbose",False)
         if verbose:
             print_pd_usage("read_csv", verbose=verbose)
             return
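The refactor collapses each kwargs.get(...)/kwargs.pop(...) pair into a single kwargs.pop(key, default), which reads the value and removes the key in one step so the leftover **kwargs can be forwarded to pd.read_csv without duplicate-keyword errors; it also flips the memory_map and skipinitialspace defaults to False, and pops verbose so the pre-existing "if verbose:" check has a bound name. The pattern in isolation (a hedged sketch, not the full load_csv):

    import pandas as pd

    def load_csv_like(fpath, **kwargs):
        # pop() returns the default when the key is absent and removes the
        # key when present, so **kwargs stays safe to forward
        engine = kwargs.pop("engine", "pyarrow")
        sep = kwargs.pop("sep", "\t")
        return pd.read_csv(fpath, engine=engine, sep=sep, **kwargs)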
@@ -1800,7 +1805,7 @@ def fload(fpath, kind=None, **kwargs):
         separators = [",", "\t", ";", "|", " "]
         for sep in separators:
             sep2show = sep if sep != "\t" else "\\t"
-            # print(f'trying with: engine=pyarrow, sep="{sep2show}"')
+            print(f'trying with: engine=pyarrow, sep="{sep2show}"')
             try:
                 df = pd.read_csv(
                     fpath,
@@ -1819,13 +1824,13 @@ def fload(fpath, kind=None, **kwargs):
             except:
                 pass
         else:
-            engines = ["c", "python"]
+            engines = [None,"c", "python"]
             for engine in engines:
-                # separators = [",", "\t", ";", "|", " "]
+                separators = [",", "\t", ";", "|", " "]
                 for sep in separators:
                     try:
                         sep2show = sep if sep != "\t" else "\\t"
-                        # print(f"trying with: engine={engine}, sep='{sep2show}'")
+                        print(f"trying with: engine={engine}, sep='{sep2show}'")
                         df = pd.read_csv(
                             fpath,
                             engine=engine,
@@ -2031,6 +2036,9 @@ def fload(fpath, kind=None, **kwargs):
     elif kind.lower() in img_types:
         print(f'Image ".{kind}" is loaded.')
         return load_img(fpath)
+    elif kind=="gz" and fpath.endswith(".soft.gz"):
+        import GEOparse
+        return GEOparse.get_GEO(filepath=fpath)
     elif kind.lower() in zip_types:
         keep = kwargs.get("keep", False)
         fpath_unzip = unzip(fpath)
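The new ".soft.gz" branch routes NCBI GEO SOFT archives to GEOparse. A usage sketch, assuming GEOparse is installed (the accession file name is illustrative):

    gse = fload("./GSE4567_family.soft.gz")  # dispatches to GEOparse
    # equivalent direct call:
    import GEOparse
    gse = GEOparse.get_GEO(filepath="./GSE4567_family.soft.gz")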
@@ -2105,30 +2113,51 @@ def fload(fpath, kind=None, **kwargs):
 # docx_content = fload('sample.docx')
 
 
-def fupdate(fpath, content=None):
+def fupdate(fpath, content=None, how="head"):
     """
     Update a file by adding new content at the top and moving the old content to the bottom.
+    If the file is a JSON file, merge the new content with the old content.
+
     Parameters
     ----------
     fpath : str
         The file path where the content should be updated.
-    content : str, optional
-        The new content to add at the top of the file. If not provided, the function will not add any new content.
+    content : str or dict, optional
+        The new content to add at the top of the file (for text) or merge (for JSON).
+        If not provided, the function will not add any new content.
+
     Notes
     -----
     - If the file at `fpath` does not exist, it will be created.
-    - The new content will be added at the top, followed by the old content of the file.
+    - For text files, the new content will be added at the top, followed by the old content.
+    - For JSON files, the new content will be merged with the existing JSON content.
     """
     content = content or ""
-    if os.path.exists(fpath):
-        with open(fpath, "r") as file:
-            old_content = file.read()
+    file_ext = os.path.splitext(fpath)[1]
+    how_s=["head", "tail","start","end","beginning", "stop",'last',"before"]
+    how = strcmp(how, how_s)[0]
+    print(how)
+    add_where = 'head' if how in ["head", "start","beginning", "before"] else "tail"
+    if "json" in file_ext.lower():
+        old_content=fload(fpath,kind='json') if os.path.exists(fpath) else {}
+        updated_content = {**content,**old_content} if add_where=="head" else {**old_content, **content} if isinstance(content, dict) else old_content
+        fsave(fpath,updated_content)
     else:
-        old_content = ""
+        # Handle text file
+        if os.path.exists(fpath):
+            with open(fpath, "r") as file:
+                old_content = file.read()
+        else:
+            old_content = ""
 
-    with open(fpath, "w") as file:
-        file.write(content)
-        file.write(old_content)
+        # Write new content at the top followed by old content
+        with open(fpath, "w") as file:
+            if add_where=="head":
+                file.write(content + "\n")
+                file.write(old_content)
+            else:
+                file.write(old_content)
+                file.write(content + "\n")
 
 
 def fappend(fpath, content=None):
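fupdate now dispatches on the file extension: JSON files are dict-merged (with `how` deciding which side wins on key collisions, since the right operand of {**a, **b} overwrites), while text files get the new content prepended (how="head", the default) or appended (how="tail"). Illustrative calls with invented file names:

    fupdate("notes.txt", "entry at the top")                 # prepend (default)
    fupdate("notes.txt", "entry at the bottom", how="tail")  # append
    # JSON: existing keys win with how="head"; the new dict wins with how="tail"
    fupdate("config.json", {"theme": "dark"}, how="tail")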
@@ -2710,12 +2739,14 @@ def mkdir_nest(fpath: str) -> str:
     Returns:
     - str: The path of the created directory.
     """
-    # Check if the directory already exists
-    if os.path.isdir(fpath):
-        return fpath
+
 
     # Split the full path into directories
     f_slash = "/" if "mac" in get_os().lower() else "\\"
+    if os.path.isdir(fpath):
+        fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
+        print(fpath)
+        return fpath
     dir_parts = fpath.split(f_slash)  # Split the path by the OS-specific separator
 
     # Start creating directories from the root to the desired path
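mkdir_nest no longer returns an existing directory untouched; it normalizes the result to carry a trailing separator. A sketch (path invented; note the separator choice keys on get_os() reporting macOS):

    p = mkdir_nest("/tmp/a/b")   # creates the nested folders if needed
    p = mkdir_nest("/tmp/a/b")   # an existing dir now comes back as "/tmp/a/b/"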
@@ -2744,34 +2775,27 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     - str: The path of the created directory or an error message.
     """
 
-    rootdir = []
-    # Convert string to list
+    rootdir = []
     if chdir is None:
         return mkdir_nest(pardir)
     if isinstance(chdir, str):
-        chdir = [chdir]
-    # Subfoldername should be unique
+        chdir = [chdir]
     chdir = list(set(chdir))
     if isinstance(pardir, str):  # Dir_parents should be 'str' type
-        pardir = os.path.normpath(pardir)
-    # Get the slash type: "/" or "\"
-    stype = "/" if "/" in pardir else "\\"
+        pardir = os.path.normpath(pardir)
     if "mac" in get_os().lower() or "lin" in get_os().lower():
         stype = "/"
     elif "win" in get_os().lower():
         stype = "\\"
     else:
         stype = "/"
-
-    # Check if the parent directory exists and is a directory path
+
     if os.path.isdir(pardir):
         os.chdir(pardir)  # Set current path
         # Check if subdirectories are not empty
         if chdir:
-            chdir.sort()
-            # Create multiple subdirectories at once
-            for folder in chdir:
-                # Check if the subfolder already exists
+            chdir.sort()
+            for folder in chdir:
                 child_tmp = os.path.join(pardir, folder)
                 if not os.path.isdir(child_tmp):
                     os.mkdir("./" + folder)
@@ -2791,6 +2815,8 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     # Dir is the main output, if only one dir, then str type is inconvenient
     if len(rootdir) == 1:
         rootdir = rootdir[0]
+        rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
+        print(rootdir)
     return rootdir
 
 
@@ -2805,22 +2831,25 @@ def figsave(*args, dpi=300):
     dir_save = None
     fname = None
     img = None
+    f_slash = "/" if "mac" in get_os().lower() else "\\"
     for arg in args:
         if isinstance(arg, str):
-            if "/" in arg or "\\" in arg:
+            if f_slash in arg:
                 dir_save = arg
-            elif "/" not in arg and "\\" not in arg:
+            else:
                 fname = arg
         elif isinstance(arg, (Image.Image, np.ndarray)):
             img = arg  # Store the PIL image if provided
 
-    f_slash = "/" if "mac" in get_os().lower() else "\\"
     if dir_save is None:
         dir_save="./"
+    print(dir_save)
+    # dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
     dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
     dir_ch = "".join(dir_save.split(f_slash)[-1:])
     if not dir_par.endswith(f_slash):
         dir_par += f_slash
+    print(dir_par)
     if fname is None:
         fname = dir_ch
     mkdir(dir_par)
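figsave's argument parsing now computes the OS separator first and uses it to decide which string is the directory and which is the file name. A hedged sketch of a call (assuming the rest of figsave, not shown in this hunk, performs the actual save):

    import matplotlib.pyplot as plt
    plt.plot([1, 2, 3])
    # any string containing the separator is the directory; a bare string is the name
    figsave("./output/", "curve.pdf", dpi=300)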
@@ -4418,9 +4447,10 @@ def preview(var):
 
 # ! DataFrame
 def df_astype(
-    df: pd.DataFrame,
+    data: pd.DataFrame,
     columns: Optional[Union[str, List[str]]] = None,
     astype: str = "datetime",
+    skip_row:Union[str,list]=None,
     fmt: Optional[str] = None,
     inplace: bool = True,
     errors: str = "coerce",  # Can be "ignore", "raise", or "coerce"
@@ -4484,22 +4514,24 @@ def df_astype(
     ]
     # If inplace is False, make a copy of the DataFrame
     if not inplace:
-        df = df.copy()
+        data = data.copy()
+    if skip_row is not None:
+        data = data.drop(index=skip_row, errors='ignore')
+    # If columns is None, apply to all columns
+    if columns is None:
+        columns = data.columns.tolist()
     # correct the astype input
     if isinstance(astype,str):
         astype = strcmp(astype, astypes)[0]
-        print(f"converting {columns} as type: {astype}")
+        print(f"converting as type: {astype}")
     elif isinstance(astype,dict):
         for col, dtype in astype.items():
             dtype='date' if dtype=="day" else dtype
-            df["col"]=df["col"].adtype(strcmp(dtype, astypes)[0])
-        return
-    # If columns is None, apply to all columns
-    if columns is None:
-        columns = df.columns
+            data["col"]=data["col"].adtype(strcmp(dtype, astypes)[0])
+        return data if not inplace else None
 
     # Ensure columns is a list
-    if isinstance(columns, (str, int)):
+    if isinstance(columns, str):
         columns = [columns]
 
     # Convert specified columns
@@ -4519,72 +4551,74 @@ def df_astype(
                 kwargs.pop("errors", None)
                 # convert it as type: datetime
                 if isinstance(column, int):
-                    df.iloc[:, column] = pd.to_datetime(
-                        df.iloc[:, column], format=fmt, errors=errors, **kwargs
+                    data.iloc[:, column] = pd.to_datetime(
+                        data.iloc[:, column], format=fmt, errors=errors, **kwargs
                     )
                     # further convert:
                     if astype == "time":
-                        df.iloc[:, column] = df.iloc[:, column].dt.time
+                        data.iloc[:, column] = data.iloc[:, column].dt.time
                     elif astype == "month":
-                        df.iloc[:, column] = df.iloc[:, column].dt.month
+                        data.iloc[:, column] = data.iloc[:, column].dt.month
                     elif astype == "year":
-                        df.iloc[:, column] = df.iloc[:, column].dt.year
+                        data.iloc[:, column] = data.iloc[:, column].dt.year
                     elif astype == "date" or astype == "day":
-                        df.iloc[:, column] = df.iloc[:, column].dt.date
+                        data.iloc[:, column] = data.iloc[:, column].dt.date
                     elif astype == "hour":
-                        df.iloc[:, column] = df.iloc[:, column].dt.hour
+                        data.iloc[:, column] = data.iloc[:, column].dt.hour
                     elif astype == "minute":
-                        df.iloc[:, column] = df.iloc[:, column].dt.minute
+                        data.iloc[:, column] = data.iloc[:, column].dt.minute
                     elif astype == "second":
-                        df.iloc[:, column] = df.iloc[:, column].dt.second
+                        data.iloc[:, column] = data.iloc[:, column].dt.second
                     elif astype == "week":
-                        df.iloc[:, column] = df.iloc[:, column].dt.day_name()
+                        data.iloc[:, column] = data.iloc[:, column].dt.day_name()
                 else:
-                    df[column] = (
+                    data[column] = (
                         pd.to_datetime(
-                            df[column], format=fmt, errors=errors, **kwargs
+                            data[column], format=fmt, errors=errors, **kwargs
                         )
                         if fmt
-                        else pd.to_datetime(df[column], errors=errors, **kwargs)
+                        else pd.to_datetime(data[column], errors=errors, **kwargs)
                     )
                     # further convert:
                     if astype == "time":
-                        df[column] = df[column].dt.time
+                        data[column] = data[column].dt.time
                     elif astype == "month":
-                        df[column] = df[column].dt.month
+                        data[column] = data[column].dt.month
                     elif astype == "year":
-                        df[column] = df[column].dt.year
+                        data[column] = data[column].dt.year
                     elif astype == "date":
-                        df[column] = df[column].dt.date
+                        data[column] = data[column].dt.date
                     elif astype == "hour":
-                        df[column] = df[column].dt.hour
+                        data[column] = data[column].dt.hour
                    elif astype == "minute":
-                        df[column] = df[column].dt.minute
+                        data[column] = data[column].dt.minute
                    elif astype == "second":
-                        df[column] = df[column].dt.second
+                        data[column] = data[column].dt.second
                    elif astype == "week":
-                        df[column] = df[column].dt.day_name()
+                        data[column] = data[column].dt.day_name()
 
             elif astype == "numeric":
                 kwargs.pop("errors", None)
-                df[column] = pd.to_numeric(df[column], errors=errors, **kwargs)
+                data[column] = pd.to_numeric(data[column], errors=errors, **kwargs)
                 # print(f"Successfully converted '{column}' to numeric.")
             elif astype == "timedelta":
                 kwargs.pop("errors", None)
-                df[column] = pd.to_timedelta(df[column], errors=errors, **kwargs)
+                data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
                 # print(f"Successfully converted '{column}' to timedelta.")
             else:
                 # Convert to other types (e.g., float, int)
-                df[column] = df[column].astype(astype)
+                data[column] = data[column].astype(astype)
                 # print(f"Successfully converted '{column}' to {astype}.")
         except Exception as e:
             print(f"Error converting '{column}' to {astype}: {e}")
-
-    # Return the modified DataFrame if inplace is False
-    return df
+    try:
+        display(data.info()[:10])
+    except:
+        pass
+    return data
 
 
-    # ! DataFrame
+# ! DataFrame
 def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
     """
     Sort a DataFrame by a specified column based on a custom order or by count.
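df_astype now operates on a parameter named `data`, can drop rows up front via `skip_row`, and returns the frame (after a best-effort `data.info()` display) instead of the old trailing `return df`. A minimal sketch of df_astype with an invented frame:

    import pandas as pd
    df = pd.DataFrame({"when": ["2024-01-02", "2024-02-03"], "n": ["1", "2"]})
    df_astype(df, columns="when", astype="datetime")                   # in place by default
    df_astype(df, columns="n", astype="numeric")
    out = df_astype(df, columns="when", astype="year", inplace=False)  # works on a copy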
@@ -4601,7 +4635,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
     Returns:
     - Sorted DataFrame if inplace is False, otherwise None.
     """
-    if column not in df.columns:
+    if column not in data.columns:
         raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
 
     if isinstance(by, str) and 'count' in by.lower():
@@ -4624,11 +4658,11 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 
     try:
         if inplace:  # replace the original
-            df.sort_values(column, ascending=ascending, inplace=True, **kwargs)
+            data.sort_values(column, ascending=ascending, inplace=True, **kwargs)
             print(f"Successfully sorted DataFrame by '{column}'")
             return None
         else:
-            sorted_df = df.sort_values(column, ascending=ascending, **kwargs)
+            sorted_df = data.sort_values(column, ascending=ascending, **kwargs)
             print(f"Successfully sorted DataFrame by '{column}' using custom order.")
             return sorted_df
     except Exception as e:
@@ -4636,7 +4670,6 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
         return df
 
 
-
 # # Example usage:
 # # Sample DataFrame
 # data = {
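Note that the rename to `data` is only partial in df_sort_values: the signature still binds `df`, so the new `data.sort_values(...)` references would raise NameError as released. Intended usage per the docstring, assuming the parameter naming is reconciled:

    custom_order = ["Mon", "Tue", "Wed"]
    df_sort_values(df, "day", by=custom_order, inplace=True)               # custom category order
    df_sort_values(df, "day", by="count", ascending=False, inplace=True)  # sort by frequency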
@@ -4667,6 +4700,236 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 # display(df_month)
 
 
+def df_merge(
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    use_index: bool = True,
+    columns: list = ["col_left", "col_right"],
+    how: str = "left",
+) -> pd.DataFrame:
+    """
+    Merges two DataFrames based on either the index or shared columns with matching data types.
+    usage:
+        # (1) if the indices are the same
+        df_merged = df_merge(df1, df2, use_index=True (default), how='outer')
+        # (2) if there are shared columns, merge based on the shared columns
+        df_merged = df_merge(df1, df2, how='outer')
+        # (3) if columns is given, merge based on the specified columns
+        df_merged = df_merge(df1, df2, columns=["col_left", "col_right"], how='outer')
+    Parameters:
+    - df1 (pd.DataFrame): The first DataFrame.
+    - df2 (pd.DataFrame): The second DataFrame.
+    - use_index (bool): If True, first try to merge by index if they are comparable; otherwise, fall back to column-based merge.
+    - how (str): Type of merge to perform: 'inner', 'outer', 'left', or 'right'. Default is 'left'.
+        'inner': only the rows that have matching values in both DataFrames (intersection)
+        'outer': keeps all rows from both DataFrames and fills in missing values with NaN
+        'left': keeps all rows from the left DataFrame and matches rows from the right DataFrame
+        'right': keeps all rows from the right DataFrame and matches rows from the left DataFrame, filling with NaN if there is no match.
+
+    Returns:
+    - pd.DataFrame: The merged DataFrame.
+    """
+
+    # 1. Check if indices are comparable (same length and types)
+    if use_index or df1.index.equals(df2.index):
+        print(f"Merging based on index using '{how}' join...")
+        df_merged = pd.merge(df1, df2, left_index=True, right_index=True, how=how)
+        return df_merged
+
+    # 2. Find common columns with the same dtype
+    common_columns = df1.columns.intersection(df2.columns)
+    shared_columns = []
+    for col in common_columns:
+        if df1[col].dtype == df2[col].dtype:
+            shared_columns.append(col)
+    if not isinstance(columns, list):
+        columns = [columns]
+    if len(columns) != 2:
+        raise ValueError(
+            "'columns' should be a list: columns=['col_left', 'col_right']"
+        )
+    if all(columns):
+        print(f"Merging based on columns: {columns} using '{how}' join...")
+        df_merged = pd.merge(df1, df2, left_on=columns[0], right_on=columns[1], how=how)
+    elif shared_columns:
+        print(
+            f"Merging based on shared columns: {shared_columns} using '{how}' join..."
+        )
+        df_merged = pd.merge(df1, df2, on=shared_columns, how=how)
+    else:
+        raise ValueError(
+            "No common columns with matching data types to merge on, and indices are not comparable."
+        )
+    return df_merged
+
+def df_fillna(
+    data: pd.DataFrame,
+    method: str = "mean",
+    axis: int = 0,  # column-wise
+    constant: float = None,
+    inplace: bool = True,
+) -> pd.DataFrame:
+    """
+    Fill missing values in a DataFrame using the specified imputation method.
+
+    Parameters:
+    data (pd.DataFrame): The DataFrame to fill missing values.
+    method (str): The imputation method to use. Options are:
+        - 'mean': Replace missing values with the mean of the column.
+        - 'median': Replace missing values with the median of the column.
+        - 'most_frequent': Replace missing values with the most frequent value in the column.
+        - 'constant': Replace missing values with a constant value provided by the `constant` parameter.
+        - 'knn': Use K-Nearest Neighbors imputation.
+        - 'iterative': Use Iterative imputation.
+    axis (int): The axis along which to impute:
+        - 0: Impute column-wise (default).
+        - 1: Impute row-wise.
+    constant (float, optional): Constant value to use for filling NaNs if method is 'constant'.
+    inplace (bool): If True, modify the original DataFrame. If False, return a new DataFrame.
+
+    """
+
+    if data.empty:
+        raise ValueError("Input DataFrame is empty.")
+
+    # Validate method
+    methods = ["mean", "median", "most_frequent", "constant", "knn", "iterative"]
+    method = strcmp(method, methods)[0]
+
+    # If using constant method, ask for a constant value
+    if constant is not None:
+        method = "constant"
+        try:
+            constant = float(constant)
+        except ValueError:
+            raise ValueError("Constant value must be a number.")
+
+    # Initialize SimpleImputer with the chosen method
+    if method == "constant":
+        imputer = SimpleImputer(strategy=method, fill_value=constant)
+    elif method == "knn":
+        from sklearn.impute import KNNImputer
+
+        imputer = KNNImputer(n_neighbors=n_neighbors)
+    elif method == "iterative":
+        from sklearn.impute import IterativeImputer
+
+        imputer = IterativeImputer(max_iter=max_iter)
+    else:
+        from sklearn.impute import SimpleImputer
+
+        imputer = SimpleImputer(strategy=method)
+
+    # Fit and transform the data
+    if axis == 0:
+        # Impute column-wise
+        imputed_data = imputer.fit_transform(data)
+        imputed_data.shape
+    elif axis == 1:
+        # Impute row-wise
+        imputed_data = imputer.fit_transform(data.T)
+        imputed_data.shape
+    else:
+        raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
+
+    df_filled = pd.DataFrame(
+        imputed_data if axis == 0 else imputed_data.T,
+        index=data.index,  # if axis == 0 else data.columns,
+        columns=data.columns,  # if axis == 0 else data.index,
+    )
+
+    if inplace:
+        data.update(df_filled)
+        return None  # replace original
+    else:
+        return df_filled
+def df_scaler(
+    data: pd.DataFrame,
+    method="standard",
+    columns=None,  # default: select all numeric col/row
+    inplace=False,
+    verbose=False,  # show usage
+    axis=0,  # default column-wise
+    **kwargs,
+):
+    """
+    df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)
+
+    Parameters:
+    - data: pandas DataFrame to be scaled.
+    - method: Scaler type ('standard', 'minmax', 'robust'). Default is 'standard'.
+    - columns: List of columns (for axis=0) or rows (for axis=1) to scale.
+        If None, all numeric columns/rows will be scaled.
+    - inplace: If True, modify the DataFrame in place. Otherwise, return a new DataFrame.
+    - axis: Axis along which to scale. 0 for column-wise, 1 for row-wise. Default is 0.
+    - verbose: If True, prints logs of the process.
+    - kwargs: Additional arguments to be passed to the scaler.
+    """
+    if verbose:
+        print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
+
+    methods = ["standard", "minmax", "robust"]
+    method = strcmp(method, methods)[0]
+    if method == "standard":
+        from sklearn.preprocessing import StandardScaler
+
+        scaler = StandardScaler(**kwargs)
+    elif method == "minmax":
+        from sklearn.preprocessing import MinMaxScaler
+
+        scaler = MinMaxScaler(**kwargs)
+    elif method == "robust":
+        from sklearn.preprocessing import RobustScaler
+
+        scaler = RobustScaler(**kwargs)
+    if axis not in [0, 1]:
+        raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
+
+    if axis == 0:
+        # Column-wise scaling (default)
+        if columns is None:
+            columns = data.select_dtypes(include=["float64", "int64"]).columns.tolist()
+        non_numeric_columns = data.columns.difference(columns)
+        print(f"Scaling columns")
+
+        scaled_data = scaler.fit_transform(data[columns])
+
+        if inplace:
+            data[columns] = scaled_data
+            print("Original DataFrame modified in place (column-wise).")
+        else:
+            scaled_df = pd.concat(
+                [
+                    pd.DataFrame(scaled_data, columns=columns, index=data.index),
+                    data[non_numeric_columns],
+                ],
+                axis=1,
+            )
+            scaled_df = scaled_df[data.columns]  # Maintain column order
+            return scaled_df
+
+    elif axis == 1:
+        # Row-wise scaling
+        if columns is None:
+            columns = data.index.tolist()
+        numeric_rows = data.loc[columns].select_dtypes(include=["float64", "int64"])
+        if numeric_rows.empty:
+            raise ValueError("No numeric rows to scale.")
+
+        print(f"Scaling rows")
+
+        scaled_data = scaler.fit_transform(
+            numeric_rows.T
+        ).T  # Transpose for scaling and then back
+
+        if inplace:
+            data.loc[numeric_rows.index] = scaled_data
+            print("Original DataFrame modified in place (row-wise).")
+        else:
+            scaled_df = data.copy()
+            scaled_df.loc[numeric_rows.index] = scaled_data
+            return scaled_df
+
 def df_cluster(
     data: pd.DataFrame,
     columns: Optional[list] = None,
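The three new helpers compose naturally: merge, impute, then scale. Sketches with invented toy frames; note that df_fillna's 'knn' and 'iterative' branches reference `n_neighbors` and `max_iter`, which are not parameters of the function, so the sketch sticks to 'mean':

    import pandas as pd
    df1 = pd.DataFrame({"id": [1, 2], "x": [0.1, None]})
    df2 = pd.DataFrame({"id": [1, 2], "y": [10.0, 20.0]})

    merged = df_merge(df1, df2, use_index=True, how="outer")            # index-based join
    merged = df_merge(df1, df2, use_index=False, columns=["id", "id"])  # explicit column join
    filled = df_fillna(merged, method="mean", inplace=False)            # column-wise mean imputation
    scaled = df_scaler(filled, method="minmax")                         # new frame; inplace=False is the default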
@@ -5387,4 +5650,4 @@ def print_pd_usage(
             i_ = i_.replace("=", "\t= ") + ","
             print(i_) if i == 0 else print("\t", i_)
     else:
-        print(usage)
+        print(usage)