py2ls 0.2.4.1__py3-none-any.whl → 0.2.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/bio.py +513 -0
- py2ls/data/usages_pd copy.json +1105 -0
- py2ls/data/usages_pd.json +1413 -52
- py2ls/fetch_update.py +45 -27
- py2ls/ips.py +680 -168
- py2ls/plot.py +104 -77
- {py2ls-0.2.4.1.dist-info → py2ls-0.2.4.3.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.1.dist-info → py2ls-0.2.4.3.dist-info}/RECORD +9 -7
- {py2ls-0.2.4.1.dist-info → py2ls-0.2.4.3.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -51,8 +51,6 @@ from bs4 import BeautifulSoup
 
 from . import netfinder
 
-# from .plot import get_color
-
 try:
     get_ipython().run_line_magic("load_ext", "autoreload")
     get_ipython().run_line_magic("autoreload", "2")
@@ -61,19 +59,31 @@ except NameError:
 
 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     """
-    Add the Chinese font to the font manager
+    Add the Chinese (default) font to the font manager
     Args:
         dir_font (str, optional): _description_. Defaults to "/System/Library/Fonts/Hiragino Sans GB.ttc".
     """
     import matplotlib.pyplot as plt
     from matplotlib import font_manager
+    slashtype = "/" if 'mac' in get_os() else "\\"
+    if slashtype in dir_font:
+        font_manager.fontManager.addfont(dir_font)
+        fontname = os.path.basename(dir_font).split(".")[0]
+    else:
+        if "cn" in dir_font.lower() or "ch" in dir_font.lower():
+            fontname = "Hiragino Sans GB"  # default Chinese font
+        else:
+            fontname = dir_font
 
-
-
-
-    plt.rcParams["font.sans-serif"] = [fontname_chinese]
-    plt.rcParams["font.family"] = "sans-serif"
+    plt.rcParams["font.sans-serif"] = [fontname]
+    # plt.rcParams["font.family"] = "sans-serif"
     plt.rcParams["axes.unicode_minus"] = False
+    fonts_in_system = font_manager.findSystemFonts(fontpaths=None, fontext="ttf")
+    fontname_in_system = [os.path.basename(i).split(".")[0] for i in fonts_in_system]
+    if fontname not in fontname_in_system:
+        print(f"Font '{fontname}' not found. Falling back to default.")
+        plt.rcParams["font.sans-serif"] = ["Arial"]
+    return fontname
 
 # set 'dir_save'
 if "dar" in sys.platform:
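The reworked plt_font() now resolves either a font file path or a short name hint and returns the font name it registered. A minimal usage sketch (assumes py2ls is installed and plt_font is imported from py2ls.ips; the font path below is the function's own default):

from py2ls.ips import plt_font

# Passing a font file path registers the font and returns its basename.
fontname = plt_font("/System/Library/Fonts/Hiragino Sans GB.ttc")

# Passing a "cn"/"ch" hint instead of a path falls back to "Hiragino Sans GB".
fontname = plt_font("cn")
print(fontname)  # the name now set in plt.rcParams["font.sans-serif"]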
@@ -506,6 +516,59 @@ def is_text(s):
     return has_alpha and has_non_alpha
 
 
+from typing import Any, Union
+
+def shared(lst1:Any, lst2:Any,*args, verbose=True):
+    """
+    check the shared elelements in two list.
+    usage:
+        list1 = [1, 2, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        list3 = [5, 6, 9, 10]
+        a = shared(list1, list2,list3)
+    """
+    if verbose:
+        print("\n********* checking shared elements *********")
+    if any([not isinstance(lst1,list),not isinstance(lst1,list)]):
+        print(f"{' '*2}type(list1):\t{type(lst1)},\n{' '*2}type(list2):\t{type(lst2)}>")
+    shared_elements=set(flatten(lst1,verbose=verbose)).intersection(flatten(lst2,verbose=verbose))
+    # support more lists
+    if args:
+        for arg in args:
+            shared_elements=shared_elements.intersection(set(flatten(arg,verbose=verbose)))
+    shared_elements = list(shared_elements)
+    if verbose:
+        elements2show = shared_elements if len(shared_elements)<10 else shared_elements[:5]
+        print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
+        print("********* checking shared elements *********")
+    return shared_elements
+
+def flatten(nested: Any, unique_list=True,verbose=True):
+    """
+    Recursively flattens a nested structure (lists, tuples, dictionaries, sets) into a single list.
+    Parameters:
+        nested : Any, Can be a list, tuple, dictionary, or set.
+    Returns: list, A flattened list.
+    """
+    flattened_list = []
+    stack = [nested]
+    while stack:
+        current = stack.pop()
+        if isinstance(current, dict):
+            stack.extend(current.values())
+        elif isinstance(current, (list, tuple, set)):
+            stack.extend(current)
+        elif isinstance(current, pd.Series):
+            stack.extend(current)
+        else:
+            flattened_list.append(current)
+    if verbose:
+        print(f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>")
+    if unique_list:
+        return unique(flattened_list)
+    else:
+        return flattened_list
+
 def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"):
     """
     Compares a search term with a list of candidate strings and finds the best match based on similarity score.
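The new shared() and flatten() helpers are plain utilities, and the docstring's own usage carries over directly. A short sketch, assuming py2ls is installed:

from py2ls.ips import shared, flatten

list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
list3 = [5, 6, 9, 10]

# Elements present in every input list (inputs are flattened first): [5]
common = shared(list1, list2, list3, verbose=False)

# flatten() walks lists, tuples, sets, dicts and pd.Series iteratively.
flat = flatten({"a": [1, 2, (3, 4)], "b": {5}}, unique_list=True, verbose=False)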
@@ -526,6 +589,7 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
         if isinstance(s, str):
             return s.lower()
         elif isinstance(s, list):
+            s=[str(i) for i in s]# convert all to str
             return [elem.lower() for elem in s]
         return s
 
@@ -535,7 +599,7 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
         similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
     elif "W" in scorer.lower():
         similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
-    elif "ratio" in scorer.lower():#Ratio (Strictest)
+    elif "ratio" in scorer.lower() or "stri" in scorer.lower():#Ratio (Strictest)
         similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
     else:
         similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
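With this change the scorer argument also accepts a "strict"-style spelling for the plain ratio scorer. A small sketch (the return value is indexed the same way the package itself does elsewhere in this diff):

from py2ls.ips import strcmp

# "strict" now selects fuzz.ratio, the strictest scorer, just like "ratio".
best = strcmp("read_xlsx", ["read_excel", "read_csv", "to_excel"], scorer="strict")[0]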
@@ -1697,26 +1761,18 @@ def fload(fpath, kind=None, **kwargs):
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
 
-        engine = kwargs.
-        kwargs.pop("
-
-        kwargs.pop("
-
-        kwargs.pop("
-
-        kwargs.pop("
-        skipinitialspace = kwargs.get("skipinitialspace", True)
-        kwargs.pop("skipinitialspace", None)
-        encoding = kwargs.get("encoding", "utf-8")
-        kwargs.pop("encoding", None)
-        on_bad_lines = kwargs.get("on_bad_lines", "skip")
-        kwargs.pop("on_bad_lines", None)
-        comment = kwargs.get("comment", None)
-        kwargs.pop("comment", None)
-
+        engine = kwargs.pop("engine", "pyarrow")
+        sep = kwargs.pop("sep", "\t")
+        index_col = kwargs.pop("index_col", None)
+        memory_map = kwargs.pop("memory_map", False)
+        skipinitialspace = kwargs.pop("skipinitialspace", False)
+        encoding = kwargs.pop("encoding", "utf-8")
+        on_bad_lines = kwargs.pop("on_bad_lines", "skip")
+        comment = kwargs.pop("comment", None)
         fmt=kwargs.pop("fmt",False)
+        verbose=kwargs.pop("verbose",False)
         if verbose:
-
+            use_pd("read_csv", verbose=verbose)
             return
 
         if comment is None:
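The load_csv refactor collapses each get()/pop() pair into a single kwargs.pop(key, default), so every option is read and removed in one step before the remaining kwargs are forwarded to pandas. A standalone sketch of the pattern (simplified, not the full loader):

import pandas as pd

def load_csv_sketch(fpath, **kwargs):
    # pop() reads each option with a default and removes it from kwargs,
    # so the leftover kwargs can be forwarded without duplicate-argument errors.
    sep = kwargs.pop("sep", ",")
    encoding = kwargs.pop("encoding", "utf-8")
    comment = kwargs.pop("comment", None)
    return pd.read_csv(fpath, sep=sep, encoding=encoding, comment=comment, **kwargs)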
@@ -1800,7 +1856,7 @@ def fload(fpath, kind=None, **kwargs):
         separators = [",", "\t", ";", "|", " "]
         for sep in separators:
             sep2show = sep if sep != "\t" else "\\t"
-
+            print(f'trying with: engine=pyarrow, sep="{sep2show}"')
             try:
                 df = pd.read_csv(
                     fpath,
|
             except:
                 pass
     else:
-        engines = ["c", "python"]
+        engines = [None,"c", "python"]
         for engine in engines:
-
+            separators = [",", "\t", ";", "|", " "]
             for sep in separators:
                 try:
                     sep2show = sep if sep != "\t" else "\\t"
-
+                    print(f"trying with: engine={engine}, sep='{sep2show}'")
                     df = pd.read_csv(
                         fpath,
                         engine=engine,
|
         engine = kwargs.get("engine", "openpyxl")
         verbose=kwargs.pop("verbose",False)
         if verbose:
-
+            use_pd("read_excel", verbose=verbose)
         df = pd.read_excel(fpath, engine=engine, **kwargs)
         try:
             meata=pd.ExcelFile(fpath)
|
     elif kind.lower() in img_types:
         print(f'Image ".{kind}" is loaded.')
         return load_img(fpath)
+    elif kind=="gz" and fpath.endswith(".soft.gz"):
+        import GEOparse
+        return GEOparse.get_GEO(filepath=fpath)
     elif kind.lower() in zip_types:
         keep = kwargs.get("keep", False)
         fpath_unzip = unzip(fpath)
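fload() now short-circuits GEO SOFT archives to GEOparse before the generic gz handling. A hedged sketch (the file name is hypothetical; GEOparse must be installed):

from py2ls.ips import fload

# Files ending in ".soft.gz" are handed to GEOparse.get_GEO(filepath=...).
gse = fload("./GSE00000_family.soft.gz")  # hypothetical GEO series file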
@@ -2105,30 +2164,51 @@ def fload(fpath, kind=None, **kwargs):
 # docx_content = fload('sample.docx')
 
 
-def fupdate(fpath, content=None):
+def fupdate(fpath, content=None, how="head"):
     """
     Update a file by adding new content at the top and moving the old content to the bottom.
+    If the file is a JSON file, merge the new content with the old content.
+
     Parameters
     ----------
     fpath : str
         The file path where the content should be updated.
-    content : str, optional
-        The new content to add at the top of the file
+    content : str or dict, optional
+        The new content to add at the top of the file (for text) or merge (for JSON).
+        If not provided, the function will not add any new content.
+
     Notes
     -----
     - If the file at `fpath` does not exist, it will be created.
-    -
+    - For text files, the new content will be added at the top, followed by the old content.
+    - For JSON files, the new content will be merged with the existing JSON content.
     """
     content = content or ""
-
-
-
+    file_ext = os.path.splitext(fpath)[1]
+    how_s=["head", "tail","start","end","beginning", "stop",'last',"before"]
+    how = strcmp(how, how_s)[0]
+    print(how)
+    add_where = 'head' if how in ["head", "start","beginning", "before"] else "tail"
+    if "json" in file_ext.lower():
+        old_content=fload(fpath,kind='json') if os.path.exists(fpath) else {}
+        updated_content = {**content,**old_content} if add_where=="head" else {**old_content, **content} if isinstance(content, dict) else old_content
+        fsave(fpath,updated_content)
     else:
-
+        # Handle text file
+        if os.path.exists(fpath):
+            with open(fpath, "r") as file:
+                old_content = file.read()
+        else:
+            old_content = ""
 
-
-
-
+        # Write new content at the top followed by old content
+        with open(fpath, "w") as file:
+            if add_where=="head":
+                file.write(content + "\n")
+                file.write(old_content)
+            else:
+                file.write(old_content)
+                file.write(content + "\n")
 
 
 def fappend(fpath, content=None):
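fupdate() now fuzzy-matches a how= hint via strcmp() and merges dictionaries when the target is a JSON file. A minimal sketch (file names are hypothetical):

from py2ls.ips import fupdate

# Text file: new content goes before ("head") or after ("tail") the old text.
fupdate("notes.txt", content="## update", how="head")

# JSON file: a dict is merged with the existing content instead of prepended.
fupdate("config.json", content={"version": "0.2.4.3"}, how="tail")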
@@ -2234,7 +2314,7 @@ def fsave(
 
         verbose=kwargs.pop("verbose",False)
         if verbose:
-
+            use_pd("to_csv", verbose=verbose)
         kwargs_csv = dict(
             path_or_buf=None,
             sep=",",
|
         verbose=kwargs.pop("verbose",False)
         sheet_name = kwargs.pop("sheet_name", "Sheet1")
         if verbose:
-
+            use_pd("to_excel", verbose=verbose)
         if any(kwargs):
             format_excel(df=data, filename=fpath, **kwargs)
         else:
|
     Returns:
     - str: The path of the created directory.
     """
-
-    if os.path.isdir(fpath):
-        return fpath
+
 
     # Split the full path into directories
     f_slash = "/" if "mac" in get_os().lower() else "\\"
+    if os.path.isdir(fpath):
+        fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
+        print(fpath)
+        return fpath
     dir_parts = fpath.split(f_slash) # Split the path by the OS-specific separator
 
     # Start creating directories from the root to the desired path
@@ -2744,34 +2826,27 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     - str: The path of the created directory or an error message.
     """
 
-    rootdir = []
-    # Convert string to list
+    rootdir = []
     if chdir is None:
         return mkdir_nest(pardir)
     if isinstance(chdir, str):
-        chdir = [chdir]
-    # Subfoldername should be unique
+        chdir = [chdir]
     chdir = list(set(chdir))
     if isinstance(pardir, str): # Dir_parents should be 'str' type
-        pardir = os.path.normpath(pardir)
-    # Get the slash type: "/" or "\"
-    stype = "/" if "/" in pardir else "\\"
+        pardir = os.path.normpath(pardir)
     if "mac" in get_os().lower() or "lin" in get_os().lower():
         stype = "/"
     elif "win" in get_os().lower():
         stype = "\\"
     else:
         stype = "/"
-
-    # Check if the parent directory exists and is a directory path
+
     if os.path.isdir(pardir):
         os.chdir(pardir) # Set current path
         # Check if subdirectories are not empty
         if chdir:
-            chdir.sort()
-
-            for folder in chdir:
-                # Check if the subfolder already exists
+            chdir.sort()
+            for folder in chdir:
                 child_tmp = os.path.join(pardir, folder)
                 if not os.path.isdir(child_tmp):
                     os.mkdir("./" + folder)
|
     # Dir is the main output, if only one dir, then str type is inconvenient
     if len(rootdir) == 1:
         rootdir = rootdir[0]
+    rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
+    print(rootdir)
     return rootdir
 
 
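Both mkdir_nest() and mkdir() now return the directory path with a trailing OS separator, so a file name can be appended by plain concatenation. A hedged sketch (path is hypothetical):

from py2ls.ips import mkdir

dir_save = mkdir("./output/figures/")  # returned path ends with the OS separator
fpath = dir_save + "result.csv"        # safe to concatenate directly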
@@ -2805,22 +2882,25 @@ def figsave(*args, dpi=300):
     dir_save = None
     fname = None
     img = None
+    f_slash = "/" if "mac" in get_os().lower() else "\\"
     for arg in args:
         if isinstance(arg, str):
-            if
+            if f_slash in arg:
                 dir_save = arg
-
+            else:
                 fname = arg
         elif isinstance(arg, (Image.Image, np.ndarray)):
             img = arg # Store the PIL image if provided
 
-    f_slash = "/" if "mac" in get_os().lower() else "\\"
     if dir_save is None:
         dir_save="./"
+    print(dir_save)
+    # dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
     dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
     dir_ch = "".join(dir_save.split(f_slash)[-1:])
     if not dir_par.endswith(f_slash):
         dir_par += f_slash
+    print(dir_par)
     if fname is None:
         fname = dir_ch
     mkdir(dir_par)
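figsave() now derives the path separator before scanning its arguments, so the directory (any string containing the separator) and the bare file name can be passed in either order. A sketch, assuming figsave falls back to the currently active matplotlib figure when no image is passed (paths hypothetical):

import matplotlib.pyplot as plt
from py2ls.ips import figsave

plt.plot([0, 1], [0, 1])
# The argument containing the separator is taken as dir_save, the other as fname.
figsave("./output/figures/", "line.pdf")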
@@ -4415,12 +4495,48 @@ def preview(var):
 # preview("# This is a Markdown header")
 # preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
 # preview({"key": "value", "numbers": [1, 2, 3]})
-
+def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
+    """
+    Extend a DataFrame by the list elecments in the column.
+
+    Parameters:
+    ----------
+    data : pd.DataFrame
+        The input DataFrame to be extended.
+
+    column : str
+        The name of the column to be split.
+
+    axis : int, optional
+        The axis along which to expand the DataFrame.
+        - 0 (default): Expand the specified column into multiple rows.
+        - 1: Expand the specified column into multiple columns.
+
+    sep : str, optional
+        The separator used to split the values in the specified column.
+        Must be provided for the function to work correctly.
+    """
+
+    data = data.copy()
+    mask = data[column].str.contains(sep, na=False)
+    data = data.copy()
+    if mask.any():
+        data[column] = (
+            data[column]
+            .apply(lambda x: x.split(sep) if isinstance(x, str) else x) # Only split if x is a string
+        )
+
+        # Strip spaces from each item in the lists
+        data[column] = data[column].apply(lambda x: [item.strip() for item in x] if isinstance(x, list) else x)
+
+    data = data.explode(column, ignore_index=True)
+    return data
 # ! DataFrame
 def df_astype(
-
+    data: pd.DataFrame,
     columns: Optional[Union[str, List[str]]] = None,
     astype: str = "datetime",
+    skip_row:Union[str,list]=None,
     fmt: Optional[str] = None,
     inplace: bool = True,
     errors: str = "coerce", # Can be "ignore", "raise", or "coerce"
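The new df_extend() splits a delimiter-joined column and explodes it into one row per item. A sketch with made-up data:

import pandas as pd
from py2ls.ips import df_extend

df = pd.DataFrame({"gene": ["TP53; EGFR", "BRCA1"], "score": [1, 2]})
# Each ";"-separated entry becomes its own row; other columns are repeated.
df_long = df_extend(df, column="gene", sep="; ")
# df_long["gene"] -> TP53, EGFR, BRCA1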
@@ -4484,22 +4600,24 @@ def df_astype(
     ]
     # If inplace is False, make a copy of the DataFrame
     if not inplace:
-
+        data = data.copy()
+    if skip_row is not None:
+        data = data.drop(index=skip_row, errors='ignore')
+    # If columns is None, apply to all columns
+    if columns is None:
+        columns = data.columns.tolist()
     # correct the astype input
     if isinstance(astype,str):
         astype = strcmp(astype, astypes)[0]
-        print(f"converting
+        print(f"converting as type: {astype}")
     elif isinstance(astype,dict):
         for col, dtype in astype.items():
             dtype='date' if dtype=="day" else dtype
-
-        return
-    # If columns is None, apply to all columns
-    if columns is None:
-        columns = df.columns
+            data["col"]=data["col"].adtype(strcmp(dtype, astypes)[0])
+        return data if not inplace else None
 
     # Ensure columns is a list
-    if isinstance(columns,
+    if isinstance(columns, str):
         columns = [columns]
 
     # Convert specified columns
|
             kwargs.pop("errors", None)
             # convert it as type: datetime
             if isinstance(column, int):
-
-
+                data.iloc[:, column] = pd.to_datetime(
+                    data.iloc[:, column], format=fmt, errors=errors, **kwargs
                 )
                 # further convert:
                 if astype == "time":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.time
                 elif astype == "month":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.month
                 elif astype == "year":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.year
                 elif astype == "date" or astype == "day":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.date
                 elif astype == "hour":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.hour
                 elif astype == "minute":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.minute
                 elif astype == "second":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.second
                 elif astype == "week":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.day_name()
             else:
-
+                data[column] = (
                     pd.to_datetime(
-
+                        data[column], format=fmt, errors=errors, **kwargs
                     )
                     if fmt
-                    else pd.to_datetime(
+                    else pd.to_datetime(data[column], errors=errors, **kwargs)
                 )
                 # further convert:
                 if astype == "time":
-
+                    data[column] = data[column].dt.time
                 elif astype == "month":
-
+                    data[column] = data[column].dt.month
                 elif astype == "year":
-
+                    data[column] = data[column].dt.year
                 elif astype == "date":
-
+                    data[column] = data[column].dt.date
                 elif astype == "hour":
-
+                    data[column] = data[column].dt.hour
                 elif astype == "minute":
-
+                    data[column] = data[column].dt.minute
                 elif astype == "second":
-
+                    data[column] = data[column].dt.second
                 elif astype == "week":
-
+                    data[column] = data[column].dt.day_name()
 
         elif astype == "numeric":
             kwargs.pop("errors", None)
-
+            data[column] = pd.to_numeric(data[column], errors=errors, **kwargs)
             # print(f"Successfully converted '{column}' to numeric.")
         elif astype == "timedelta":
             kwargs.pop("errors", None)
-
+            data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
            # print(f"Successfully converted '{column}' to timedelta.")
         else:
             # Convert to other types (e.g., float, int)
-
+            data[column] = data[column].astype(astype)
             # print(f"Successfully converted '{column}' to {astype}.")
     except Exception as e:
         print(f"Error converting '{column}' to {astype}: {e}")
-
-
-
+    try:
+        display(data.info()[:10])
+    except:
+        pass
+    return data
 
 
-# ! DataFrame
+# ! DataFrame
 def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
     """
     Sort a DataFrame by a specified column based on a custom order or by count.
|
     Returns:
     - Sorted DataFrame if inplace is False, otherwise None.
     """
-    if column not in
+    if column not in data.columns:
         raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
 
     if isinstance(by, str) and 'count' in by.lower():
|
 
     try:
         if inplace: # replace the original
-
+            data.sort_values(column, ascending=ascending, inplace=True, **kwargs)
             print(f"Successfully sorted DataFrame by '{column}'")
             return None
         else:
-            sorted_df =
+            sorted_df = data.sort_values(column, ascending=ascending, **kwargs)
             print(f"Successfully sorted DataFrame by '{column}' using custom order.")
             return sorted_df
     except Exception as e:
|
         return df
 
 
-
 # # Example usage:
 # # Sample DataFrame
 # data = {
@@ -4667,6 +4786,236 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 # display(df_month)
 
 
+def df_merge(
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    use_index: bool = True,
+    columns: list = ["col_left", "col_right"],
+    how: str = "left",
+) -> pd.DataFrame:
+    """
+    Merges two DataFrames based on either the index or shared columns with matching data types.
+    usage:
+        #(1) if the index are the same
+        df_merged = df_merge(df1, df2, use_index=True(defalut), how='outer')
+        #(2) if there are shaed columns, then based on shared columns
+        df_merged = df_merge(df1, df2, how='outer')
+        #(3) if columns: then based on the specific columns
+        df_merged = df_merge(df1, df2, columns=["col_left", "col_right"],how='outer')
+    Parameters:
+    - df1 (pd.DataFrame): The first DataFrame.
+    - df2 (pd.DataFrame): The second DataFrame.
+    - use_index (bool): If True, first try to merge by index if they are comparable; otherwise, fall back to column-based merge.
+    - how (str): Type of merge to perform: 'inner', 'outer', 'left', or 'right'. Default is 'inner'.
+        'inner': only the rows that have matching values in both DataFrames (intersection)
+        'outer': keeps all rows from both DataFrames and fills in missing values with NaN
+        'left': keeps all rows from the left DataFrame and matches rows from the right DataFrame
+        'right': keeps all rows from the right DataFrame and matches rows from the left DataFrame, filling with NaN if there is no match.
+
+    Returns:
+    - pd.DataFrame: The merged DataFrame.
+    """
+
+    # 1. Check if indices are comparable (same length and types)
+    if use_index:
+        print(f"Merging based on index using '{how}' join...")
+        df_merged = pd.merge(df1, df2, left_index=True, right_index=True, how=how)
+        return df_merged
+
+    # 2. Find common columns with the same dtype
+    common_columns = df1.columns.intersection(df2.columns)
+    shared_columns = []
+    for col in common_columns:
+        if df1[col].dtype == df2[col].dtype:
+            shared_columns.append(col)
+    if not isinstance(columns, list):
+        columns = [columns]
+    if len(columns) != 2:
+        raise ValueError(
+            "'columns':list shoule be a list: columns=['col_left','col_right']"
+        )
+    if all(columns):
+        print(f"Merging based on columns: {columns} using '{how}' join...")
+        df_merged = pd.merge(df1, df2, left_on=columns[0], right_on=columns[1], how=how)
+    elif shared_columns:
+        print(
+            f"Merging based on shared columns: {shared_columns} using '{how}' join..."
+        )
+        df_merged = pd.merge(df1, df2, on=shared_columns, how=how)
+    else:
+        raise ValueError(
+            "No common columns with matching data types to merge on, and indices are not comparable."
+        )
+    return df_merged
+
+def df_fillna(
+    data: pd.DataFrame,
+    method: str = "mean",
+    axis: int = 0,# column-wise
+    constant: float = None,
+    inplace: bool = True,
+) -> pd.DataFrame:
+    """
+    Fill missing values in a DataFrame using specified imputation method.
+
+    Parameters:
+    data (pd.DataFrame): The DataFrame to fill missing values.
+    method (str): The imputation method to use. Options are:
+        - 'mean': Replace missing values with the mean of the column.
+        - 'median': Replace missing values with the median of the column.
+        - 'most_frequent': Replace missing values with the most frequent value in the column.
+        - 'constant': Replace missing values with a constant value provided by the `constant` parameter.
+        - 'knn': Use K-Nearest Neighbors imputation.
+        - 'iterative': Use Iterative imputation.
+    axis (int): The axis along which to impute:
+        - 0: Impute column-wise (default).
+        - 1: Impute row-wise.
+    constant (float, optional): Constant value to use for filling NaNs if method is 'constant'.
+    inplace (bool): If True, modify the original DataFrame. If False, return a new DataFrame.
+
+    """
+
+    if data.empty:
+        raise ValueError("Input DataFrame is empty.")
+
+    # Validate method
+    methods = ["mean", "median", "most_frequent", "constant", "knn", "iterative"]
+    method = strcmp(method, methods)[0]
+
+    # If using constant method, ask for a constant value
+    if constant is not None:
+        method = "constant"
+        try:
+            constant = float(constant)
+        except ValueError:
+            raise ValueError("Constant value must be a number.")
+
+    # Initialize SimpleImputer with the chosen method
+    if method == "constant":
+        imputer = SimpleImputer(strategy=method, fill_value=constant)
+    elif method == "knn":
+        from sklearn.impute import KNNImputer
+
+        imputer = KNNImputer(n_neighbors=n_neighbors)
+    elif method == "iterative":
+        from sklearn.impute import IterativeImputer
+
+        imputer = IterativeImputer(max_iter=max_iter)
+    else:
+        from sklearn.impute import SimpleImputer
+
+        imputer = SimpleImputer(strategy=method)
+
+    # Fit and transform the data
+    if axis == 0:
+        # Impute column-wise
+        imputed_data = imputer.fit_transform(data)
+        imputed_data.shape
+    elif axis == 1:
+        # Impute row-wise
+        imputed_data = imputer.fit_transform(data.T)
+        imputed_data.shape
+    else:
+        raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
+
+    df_filled = pd.DataFrame(
+        imputed_data if axis == 0 else imputed_data.T,
+        index=data.index,# if axis == 0 else data.columns,
+        columns=data.columns,# if axis == 0 else data.index,
+    )
+
+    if inplace:
+        data.update(df_filled)
+        return None # replace original
+    else:
+        return df_filled
+def df_scaler(
+    data: pd.DataFrame,
+    method="standard",
+    columns=None, # default, select all numeric col/row
+    inplace=False,
+    verbose=False, # show usage
+    axis=0, # defalut column-wise
+    **kwargs,
+):
+    """
+    df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)
+
+    Parameters:
+    - data: pandas DataFrame to be scaled.
+    - method: Scaler type ('standard', 'minmax', 'robust'). Default is 'standard'.
+    - columns: List of columns (for axis=0) or rows (for axis=1) to scale.
+      If None, all numeric columns/rows will be scaled.
+    - inplace: If True, modify the DataFrame in place. Otherwise, return a new DataFrame.
+    - axis: Axis along which to scale. 0 for column-wise, 1 for row-wise. Default is 0.
+    - verbose: If True, prints logs of the process.
+    - kwargs: Additional arguments to be passed to the scaler.
+    """
+    if verbose:
+        print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
+
+    methods = ["standard", "minmax", "robust"]
+    method = strcmp(method, methods)[0]
+    if method == "standard":
+        from sklearn.preprocessing import StandardScaler
+
+        scaler = StandardScaler(**kwargs)
+    elif method == "minmax":
+        from sklearn.preprocessing import MinMaxScaler
+
+        scaler = MinMaxScaler(**kwargs)
+    elif method == "robust":
+        from sklearn.preprocessing import RobustScaler
+
+        scaler = RobustScaler(**kwargs)
+    if axis not in [0, 1]:
+        raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
+
+    if axis == 0:
+        # Column-wise scaling (default)
+        if columns is None:
+            columns = data.select_dtypes(include=["float64", "int64"]).columns.tolist()
+        non_numeric_columns = data.columns.difference(columns)
+        print(f"Scaling columns")
+
+        scaled_data = scaler.fit_transform(data[columns])
+
+        if inplace:
+            data[columns] = scaled_data
+            print("Original DataFrame modified in place (column-wise).")
+        else:
+            scaled_df = pd.concat(
+                [
+                    pd.DataFrame(scaled_data, columns=columns, index=data.index),
+                    data[non_numeric_columns],
+                ],
+                axis=1,
+            )
+            scaled_df = scaled_df[data.columns] # Maintain column order
+            return scaled_df
+
+    elif axis == 1:
+        # Row-wise scaling
+        if columns is None:
+            columns = data.index.tolist()
+        numeric_rows = data.loc[columns].select_dtypes(include=["float64", "int64"])
+        if numeric_rows.empty:
+            raise ValueError("No numeric rows to scale.")
+
+        print(f"Scaling rows")
+
+        scaled_data = scaler.fit_transform(
+            numeric_rows.T
+        ).T # Transpose for scaling and then back
+
+        if inplace:
+            data.loc[numeric_rows.index] = scaled_data
+            print("Original DataFrame modified in place (row-wise).")
+        else:
+            scaled_df = data.copy()
+            scaled_df.loc[numeric_rows.index] = scaled_data
+            return scaled_df
+
 def df_cluster(
     data: pd.DataFrame,
     columns: Optional[list] = None,
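The three new frame helpers chain naturally; a combined sketch with made-up frames (each call follows the docstrings above):

import numpy as np
import pandas as pd
from py2ls.ips import df_merge, df_fillna, df_scaler

df1 = pd.DataFrame({"a": [1.0, np.nan, 3.0]}, index=["s1", "s2", "s3"])
df2 = pd.DataFrame({"b": [10.0, 20.0, 30.0]}, index=["s1", "s2", "s3"])

merged = df_merge(df1, df2, use_index=True, how="outer")   # index-based join
filled = df_fillna(merged, method="mean", inplace=False)   # impute column means
scaled = df_scaler(filled, method="standard")              # z-score numeric columns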
@@ -4721,7 +5070,7 @@ def df_cluster(
         X = scaler.fit_transform(X)
 
     for n_cluster in range_n_clusters:
-        kmeans = KMeans(n_clusters=n_cluster, random_state=
+        kmeans = KMeans(n_clusters=n_cluster, random_state=1)
         cluster_labels = kmeans.fit_predict(X)
 
         silhouette_avg = silhouette_score(X, cluster_labels)
|
         print(f"n_clusters = {n_clusters}")
 
     # Apply K-Means Clustering with Optimal Number of Clusters
-    kmeans = KMeans(n_clusters=n_clusters, random_state=
+    kmeans = KMeans(n_clusters=n_clusters, random_state=1)
     cluster_labels = kmeans.fit_predict(X)
 
     if plot:
|
     # n_clusters = (
     #     np.argmax(silhouette_avg_scores) + 2
     # ) # Optimal clusters based on max silhouette score
-    # kmeans = KMeans(n_clusters=n_clusters, random_state=
+    # kmeans = KMeans(n_clusters=n_clusters, random_state=1)
     # cluster_labels = kmeans.fit_predict(X)
     silhouette_vals = silhouette_samples(X, cluster_labels)
 
@@ -4989,12 +5338,14 @@ def df_reducer(
     columns: Optional[List[str]] = None,
     method: str = "umap", # 'pca', 'umap'
     n_components: int = 2, # Default for umap, but 50 for PCA
-    umap_neighbors: int = 15, #
-    umap_min_dist: float = 0.1, #
+    umap_neighbors: int = 15, # UMAP-specific
+    umap_min_dist: float = 0.1, # UMAP-specific
+    tsne_perplexity: int = 30, # t-SNE-specific
     scale: bool = True,
     fill_missing: bool = True,
     debug: bool = False,
     inplace: bool = True, # replace the oringinal data
+    plot_:bool = False,# plot scatterplot, but no 'hue',so it is meaningless
 ) -> pd.DataFrame:
     """
     Reduces the dimensionality of the selected DataFrame using PCA or UMAP.
@@ -5030,9 +5381,35 @@ def df_reducer(
     reduced_df : pd.DataFrame
         DataFrame with the reduced dimensions.
     """
-
+
+    """
+    PCA: explained_variance:
+        indicates the proportion of the dataset's total variance that each principal
+        component (PC) explains. It gives you a sense of how much information
+        (or variance) is captured by each PC
+        Interpretation:
+            - Higher values indicate that the corresponding PC captures more variance.
+            - The sum of the explained variances for all PCs equals 1 (or 100%).
+            - If the first few components explain a high percentage (e.g., 90%),
+              it means you can reduce the dimensionality of the data significantly without losing much information.
+        Use case:
+            You may plot a scree plot, which shows the explained variance for each PC, to help decide
+            how many components to keep for analysis.
+
+    PCA: Singular values:
+        represent the magnitude of variance along each principal component. Mathematically,
+        they are the square roots of the eigenvalues of the covariance matrix.
+        Interpretation:
+            Larger singular values indicate that the associated PC captures more variance.
+            Singular values are related to the scale of the data. If the data are scaled
+            before PCA (e.g., standardized), then the singular values will provide a measure
+            of the spread of data along each PC.
+        Use case:
+            Singular values help quantify the contribution of each principal component in a
+            similar way to the explained variance. They are useful in understanding the overall
+            structure of the data.
+    """
     from sklearn.preprocessing import StandardScaler
-    import umap
     from sklearn.impute import SimpleImputer
 
     # Select columns if specified, else use all columns
|
     X = scaler.fit_transform(X)
 
     # Check valid method input
-
-
-
+    methods=["pca", "umap","tsne","factor","isolation_forest"]
+    method=strcmp(method, methods)[0]
     # Apply PCA if selected
-    if method == "pca":
-
-        # to get the n_components with threshold method:
-        pca = PCA()
-        pca_result = pca.fit_transform(X)
-
-        # Calculate explained variance
-        explained_variance = pca.explained_variance_ratio_
-        # Cumulative explained variance
-        cumulative_variance = np.cumsum(explained_variance)
-        # Set a threshold for cumulative variance
-        threshold = 0.95 # Example threshold
-        n_components = (
-            np.argmax(cumulative_variance >= threshold) + 1
-        ) # Number of components to retain
-        if debug:
-            # debug:
-            # Plot the cumulative explained variance
-            plt.figure(figsize=(8, 5))
-            plt.plot(
-                range(1, len(cumulative_variance) + 1),
-                cumulative_variance,
-                marker="o",
-                linestyle="-",
-            )
-            plt.title("Cumulative Explained Variance by Principal Components")
-            plt.xlabel("Number of Principal Components")
-            plt.ylabel("Cumulative Explained Variance")
-            plt.xticks(range(1, len(cumulative_variance) + 1))
-            # Add horizontal line for the threshold
-            plt.axhline(
-                y=threshold, color="r", linestyle="--", label="Threshold (95%)"
-            )
-            # Add vertical line for n_components
-            plt.axvline(
-                x=n_components,
-                color="g",
-                linestyle="--",
-                label=f"n_components = {n_components}",
-            )
-            plt.legend()
-            plt.grid()
+    if method == "pca":
+        from sklearn.decomposition import PCA
         pca = PCA(n_components=n_components)
         X_reduced = pca.fit_transform(X)
-
+
+        # Additional PCA information
+        explained_variance = pca.explained_variance_ratio_
+        singular_values = pca.singular_values_
+        loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
+
+        if debug:
+            print(f"PCA completed: Reduced to {n_components} components.")
+            print(f"Explained Variance: {explained_variance}")
+            print(f"Singular Values: {singular_values}")
+
+        # Plot explained variance if debug=True
+        if debug:
+            # Plot explained variance
+            cumulative_variance = np.cumsum(explained_variance)
+            plt.figure(figsize=(8, 5))
+            plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o")
+            plt.title("Cumulative Explained Variance by Principal Components")
+            plt.xlabel("Number of Principal Components")
+            plt.ylabel("Cumulative Explained Variance")
+            plt.axhline(y=0.95, color="r", linestyle="--", label="Threshold (95%)")
+            plt.axvline(x=n_components, color="g", linestyle="--", label=f"n_components = {n_components}")
+            plt.legend()
+            plt.grid()
+            plt.show()
+
+        # Prepare reduced DataFrame with additional PCA info
+        pca_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"PC_{i+1}" for i in range(n_components)]
+        )
+        # pca_df["Explained Variance"] = np.tile(explained_variance[:n_components], (pca_df.shape[0], 1))
+        # pca_df["Singular Values"] = np.tile(singular_values[:n_components], (pca_df.shape[0], 1))
+        # Expand explained variance to multiple columns if needed
+        for i in range(n_components):
+            pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
+        for i in range(n_components):
+            pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
 
     # Apply UMAP if selected
     elif method == "umap":
+        import umap
         umap_reducer = umap.UMAP(
             n_neighbors=umap_neighbors,
             min_dist=umap_min_dist,
-            n_components=n_components
+            n_components=n_components
         )
         X_reduced = umap_reducer.fit_transform(X)
-        print(f"UMAP completed: Reduced to {n_components} components.")
 
-
-
+        # Additional UMAP information
+        embedding = umap_reducer.embedding_
+        trustworthiness = umap_reducer._raw_data[:, :n_components]
+
+        if debug:
+            print(f"UMAP completed: Reduced to {n_components} components.")
+            print(f"Embedding Shape: {embedding.shape}")
+            print(f"Trustworthiness: {trustworthiness}")
+
+        # Prepare reduced DataFrame with additional UMAP info
+        umap_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"UMAP_{i+1}" for i in range(n_components)]
+        )
+        umap_df["Embedding"] = embedding[:, 0] # Example of embedding data
+        umap_df["Trustworthiness"] = trustworthiness[:, 0] # Trustworthiness metric
+    elif method == "tsne":
+        from sklearn.manifold import TSNE
+        tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=1)
+        X_reduced = tsne.fit_transform(X)
+
+        # Prepare reduced DataFrame with additional t-SNE info
+        tsne_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"tSNE_{i+1}" for i in range(n_components)]
+        )
+        tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
+
+    # Apply Factor Analysis if selected
+    elif method == "factor":
+        from sklearn.decomposition import FactorAnalysis
+        factor = FactorAnalysis(n_components=n_components, random_state=1)
+        X_reduced = factor.fit_transform(X)
+        # Factor Analysis does not directly provide explained variance, but we can approximate it
+        fa_variance = factor.noise_variance_
+        # Prepare reduced DataFrame with additional Factor Analysis info
+        factor_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"Factor_{i+1}" for i in range(n_components)]
+        )
+        factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
+
+    # Apply Isolation Forest for outlier detection if selected
+    elif method == "isolation_forest":
+        from sklearn.decomposition import PCA
+        from sklearn.ensemble import IsolationForest
+        # Step 1: Apply PCA for dimensionality reduction to 2 components
+        pca = PCA(n_components=n_components)
+        X_pca = pca.fit_transform(X)
+
+        explained_variance = pca.explained_variance_ratio_
+        singular_values = pca.singular_values_
+
+        # Prepare reduced DataFrame with additional PCA info
+        iso_forest_df = pd.DataFrame(
+            X_pca, index=data.index,
+            columns=[f"PC_{i+1}" for i in range(n_components)]
+        )
+
+        isolation_forest = IsolationForest(n_estimators=100, contamination='auto',random_state=1)
+        isolation_forest.fit(X)
+        anomaly_scores = isolation_forest.decision_function(X) # Anomaly score: larger is less anomalous
+        # Predict labels: 1 (normal), -1 (anomaly)
+        anomaly_labels = isolation_forest.fit_predict(X)
+        # Add anomaly scores and labels to the DataFrame
+        iso_forest_df["Anomaly Score"] = anomaly_scores
+        iso_forest_df["Anomaly Label"] = anomaly_labels
+        # add info from pca
+        for i in range(n_components):
+            iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
+        for i in range(n_components):
+            iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
+
+    # Return reduced data and info as a new DataFrame with the same index
+    if method == "pca":
+        reduced_df = pca_df
+        colname_met = "PC_"
+        if plot_:
+            sns.scatterplot(
+                data=pca_df,
+                x="PC_1",
+                y="PC_2",
+                # hue="condition",
+            )
+    elif method == "umap":
+        reduced_df = umap_df
+        colname_met = "UMAP_"
+        if plot_:
+            sns.scatterplot(
+                data=umap_df,
+                x="UMAP_1",
+                y="UMAP_2",
+                # hue="condition",
+            )
+    elif method == "tsne":
+        reduced_df = tsne_df
+        colname_met = "t-SNE_"
+        if plot_:
+            sns.scatterplot(
+                data=tsne_df,
+                x="tSNE_1",
+                y="tSNE_2",
+                # hue="batch",
+            )
+    elif method == "factor":
+        reduced_df = factor_df
+        colname_met = "Factor_"
+        if plot_:
+            sns.scatterplot(
+                data=factor_df,
+                x="Factor_1",
+                y="Factor_2",
+                # hue="batch",
+            )
+    elif method == "isolation_forest":
+        reduced_df = iso_forest_df # Already a DataFrame for outliers
+        colname_met = "PC_"
+        if plot_:
+            ax = sns.scatterplot(
+                data=iso_forest_df[iso_forest_df["Anomaly Label"] == 1],
+                x="PC_1",
+                y="PC_2",
+                label="normal", c="b",
+            )
+            ax = sns.scatterplot(
+                ax=ax,
+                data=iso_forest_df[iso_forest_df["Anomaly Label"] == -1],
+                x="PC_1",
+                y="PC_2",
+                c="r",
+                label="outlier", marker="+", s=30,
+            )
+
 
     if inplace:
-        #
+        # If inplace=True, add components back into the original data
         for col_idx in range(n_components):
-            data[f"
+            data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
+
+        # Add extra info for PCA/UMAP
+        if method == "pca":
+            data["Explained Variance"] = reduced_df["Explained Variance"]
+            data["Singular Values"] = reduced_df["Singular Values"]
+        elif method == "umap":
+            data["Embedding"] = reduced_df["Embedding"]
+            data["Trustworthiness"] = reduced_df["Trustworthiness"]
         return None # No return when inplace=True
 
-    return reduced_df
+    return reduced_df
 
 
 # example:
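df_reducer() now fuzzy-matches the method name and adds t-SNE, factor analysis, and an Isolation-Forest outlier view on top of PCA and UMAP. A sketch with random data (inplace=False so the reduced frame is returned):

import numpy as np
import pandas as pd
from py2ls.ips import df_reducer

rng = np.random.default_rng(1)
df = pd.DataFrame(rng.normal(size=(100, 10)), columns=[f"f{i}" for i in range(10)])

# PCA: PC_1/PC_2 plus per-component explained-variance and singular-value columns.
pcs = df_reducer(df, method="pca", n_components=2, inplace=False)

# Isolation Forest: PCA coordinates plus "Anomaly Score" and "Anomaly Label".
outliers = df_reducer(df, method="isolation_forest", n_components=2, inplace=False)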
@@ -5373,7 +5885,7 @@ def evaluate_cluster(
     return metrics
 
 
-def
+def use_pd(
     func_name="excel",
     verbose=True,
     dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",
@@ -5387,4 +5899,4 @@ def print_pd_usage(
             i_ = i_.replace("=", "\t= ") + ","
             print(i_) if i == 0 else print("\t", i_)
     else:
-        print(usage)
+        print(usage)