py2ls 0.2.4.2__py3-none-any.whl → 0.2.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -51,8 +51,6 @@ from bs4 import BeautifulSoup
51
51
 
52
52
  from . import netfinder
53
53
 
54
- # from .plot import get_color
55
-
56
54
  try:
57
55
  get_ipython().run_line_magic("load_ext", "autoreload")
58
56
  get_ipython().run_line_magic("autoreload", "2")
@@ -108,6 +106,8 @@ def unique(lst, ascending=None):
108
106
  返回:
109
107
  list: 一个列表,其中的元素是唯一的,顺序根据参数 `ascending` 进行排序。
110
108
  """
109
+ if not lst:
110
+ return []
111
111
  if ascending is not None:
112
112
  # 移除重复项
113
113
  unique_items = list(set(lst))
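
A quick check of the new empty-input guard in unique() (a minimal sketch; assumes the py2ls.ips import path and that the rest of the function is unchanged):

from py2ls.ips import unique  # assumed import path

print(unique([]))                            # [] -- the new guard returns early on empty input
print(unique([3, 1, 2, 3], ascending=True))  # duplicates removed, sorted ascending per the docstring
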
@@ -518,6 +518,77 @@ def is_text(s):
518
518
  return has_alpha and has_non_alpha
519
519
 
520
520
 
521
+ from typing import Any, Union
522
+
523
+ def shared(*args, strict=True, n_shared=2, verbose=True):
524
+ """
525
+ check the shared elements across two or more lists.
526
+ usage:
527
+ list1 = [1, 2, 3, 4, 5]
528
+ list2 = [4, 5, 6, 7, 8]
529
+ list3 = [5, 6, 9, 10]
530
+ a = shared(list1, list2,list3)
531
+ """
532
+ if verbose:
533
+ print("\n********* checking shared elements *********")
534
+
535
+ if len(args) == 1 and isinstance(args[0], list):
536
+ lists = args[0] # Unpack the single list
537
+ else:
538
+ lists = args # Use the provided arguments as lists
539
+ flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
540
+ # Ensure all arguments are lists
541
+ if any(not isinstance(lst, list) for lst in flattened_lists):
542
+ print(f"{' ' * 2}All inputs must be lists.")
543
+ return []
544
+ first_list = flattened_lists[0]
545
+ shared_elements = [item for item in first_list if all(item in lst for lst in flattened_lists)]
546
+ if strict:
547
+ # Strict mode: require elements to be in all lists
548
+ shared_elements = set(flattened_lists[0])
549
+ for lst in flattened_lists[1:]:
550
+ shared_elements.intersection_update(lst)
551
+ else:
552
+ all_elements = [item for sublist in flattened_lists for item in sublist]
553
+ element_count = Counter(all_elements)
554
+ # Get elements that appear in at least n_shared lists
555
+ shared_elements = [item for item, count in element_count.items() if count >= n_shared]
556
+
557
+ shared_elements = flatten(shared_elements)
558
+ if verbose:
559
+ elements2show = shared_elements if len(shared_elements)<10 else shared_elements[:5]
560
+ print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
561
+ print("********* checking shared elements *********")
562
+ return shared_elements
563
+
564
+ def flatten(nested: Any, unique_list=True,verbose=True):
565
+ """
566
+ Recursively flattens a nested structure (lists, tuples, dictionaries, sets) into a single list.
567
+ Parameters:
568
+ nested : Any, Can be a list, tuple, dictionary, or set.
569
+ Returns: list, A flattened list.
570
+ """
571
+ flattened_list = []
572
+ stack = [nested]
573
+ while stack:
574
+ current = stack.pop()
575
+ if isinstance(current, dict):
576
+ stack.extend(current.values())
577
+ elif isinstance(current, (list, tuple, set)):
578
+ stack.extend(current)
579
+ elif isinstance(current, pd.Series):
580
+ stack.extend(current)
581
+ elif isinstance(current, (pd.Index,np.ndarray)): # df.columns df.index are object of type pd.Index
582
+ stack.extend(current.tolist())
583
+ else:
584
+ flattened_list.append(current)
585
+ if verbose:
586
+ print(f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>")
587
+ if unique_list:
588
+ return unique(flattened_list)[::-1]
589
+ else:
590
+ return flattened_list
591
+
521
592
  def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"):
522
593
  """
523
594
  Compares a search term with a list of candidate strings and finds the best match based on similarity score.
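
A usage sketch for the shared() and flatten() helpers added in the hunk above (hedged; assumes both are exposed at module level in py2ls.ips):

from py2ls.ips import shared, flatten  # assumed import path

list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
list3 = [5, 6, 9, 10]

# strict=True (default): keep only elements present in every list
print(shared(list1, list2, list3))

# strict=False: keep elements appearing in at least n_shared of the lists
print(shared(list1, list2, list3, strict=False, n_shared=2))

# flatten() collapses nested lists/tuples/sets/dicts (dict values) into one list;
# unique_list=True (default) also de-duplicates
print(flatten([[1, 2], (3, 4), {"k": 5}], unique_list=False))
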
@@ -548,7 +619,7 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
548
619
  similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
549
620
  elif "W" in scorer.lower():
550
621
  similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
551
- elif "ratio" in scorer.lower():#Ratio (Strictest)
622
+ elif "ratio" in scorer.lower() or "stri" in scorer.lower():#Ratio (Strictest)
552
623
  similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
553
624
  else:
554
625
  similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
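
With the widened condition above, any scorer string containing "stri" (e.g. "strict") now selects the plain fuzz.ratio scorer alongside "ratio"; a hedged usage sketch (the indexed return follows how strcmp is used elsewhere in this file):

from py2ls.ips import strcmp  # assumed import path

candidates = ["read_csv", "read_excel", "to_csv", "to_excel"]
best_strict = strcmp("read csv", candidates, scorer="strict")[0]  # strictest scorer: fuzz.ratio
best_wr = strcmp("read csv", candidates, scorer="WR")[0]          # default WRatio scorer
print(best_strict, best_wr)
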
@@ -1567,6 +1638,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1567
1638
  """
1568
1639
  Usage
1569
1640
  is_abnormal = is_df_abnormal(df, verbose=1)
1641
+ True: abnormal
1642
+ False: normal
1570
1643
 
1571
1644
  """
1572
1645
  # Initialize a list to hold messages about abnormalities
@@ -1594,25 +1667,34 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1594
1667
  if len(column_names) == 1 and delimiter_counts["\t"] > 1:
1595
1668
  messages.append("Abnormal: Column names are not split correctly.")
1596
1669
  is_abnormal = True
1670
+ if verbose:
1671
+ print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
1597
1672
 
1598
1673
  if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
1599
1674
  messages.append("Abnormal: Too many delimiters in column names.")
1600
1675
  is_abnormal = True
1676
+ if verbose:
1677
+ print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
1601
1678
 
1602
1679
  if delimiter_counts[""] > 3:
1603
1680
  messages.append("Abnormal: There are empty column names.")
1604
1681
  is_abnormal = True
1682
+ if verbose:
1683
+ print(f'delimiter_counts[""] > 3')
1605
1684
 
1606
1685
  if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
1607
1686
  messages.append("Abnormal: Some column names contain unexpected characters.")
1608
1687
  is_abnormal = True
1688
+ if verbose:
1689
+ print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
1609
1690
 
1610
- # Check for missing values
1611
- missing_values = df.isnull().sum()
1612
- if missing_values.any():
1613
- messages.append("Missing values in columns:")
1614
- messages.append(missing_values[missing_values > 0].to_string())
1615
- is_abnormal = True
1691
+ # # Check for missing values
1692
+ # missing_values = df.isnull().sum()
1693
+ # if missing_values.any():
1694
+ # messages.append("Missing values in columns:")
1695
+ # messages.append(missing_values[missing_values > 0].to_string())
1696
+ # is_abnormal = True
1697
+ # print(f'missing_values.any()')
1616
1698
 
1617
1699
  # Check data types
1618
1700
  data_types = df.dtypes
@@ -1623,6 +1705,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1623
1705
  if constant_columns:
1624
1706
  messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
1625
1707
  is_abnormal = True
1708
+ if verbose:
1709
+ print(f'df.columns[df.nunique() == 1].tolist()')
1626
1710
 
1627
1711
  # Check for an unreasonable number of rows or columns
1628
1712
  if actual_shape[0] < 2 or actual_shape[1] < 2:
@@ -1630,6 +1714,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1630
1714
  "Abnormal: DataFrame is too small (less than 2 rows or columns)."
1631
1715
  )
1632
1716
  is_abnormal = True
1717
+ if verbose:
1718
+ print(f'actual_shape[0] < 2 or actual_shape[1] < 2')
1633
1719
 
1634
1720
  # Compile results
1635
1721
  if verbose:
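
With the verbose branches added above, is_df_abnormal() now also prints which check fired; a hedged usage sketch (True means abnormal):

import pandas as pd
from py2ls.ips import is_df_abnormal  # assumed import path

# a frame whose single column name still contains raw tab delimiters
df_bad = pd.DataFrame({"a\tb\tc\td\te": ["1\t2\t3\t4\t5"]})
print(is_df_abnormal(df_bad, verbose=True))   # expected True (delimiters left in the column name)

df_ok = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
print(is_df_abnormal(df_ok))                  # expected False
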
@@ -1672,10 +1758,36 @@ def fload(fpath, kind=None, **kwargs):
1672
1758
  content = yaml.safe_load(file)
1673
1759
  return content
1674
1760
 
1675
- def load_xml(fpath):
1676
- tree = etree.parse(fpath)
1677
- root = tree.getroot()
1678
- return etree.tostring(root, pretty_print=True).decode()
1761
+
1762
+ def load_xml(fpath, fsize_thr: int = 100):
1763
+ def load_small_xml(fpath):
1764
+ tree = etree.parse(fpath)
1765
+ root = tree.getroot()
1766
+ return etree.tostring(root, pretty_print=True).decode()
1767
+
1768
+ def load_large_xml(fpath):
1769
+ xml_parts = []
1770
+ context = etree.iterparse(
1771
+ fpath, events=("start", "end"), recover=True, huge_tree=True
1772
+ )
1773
+
1774
+ for event, elem in context:
1775
+ if event == "end":
1776
+ xml_parts.append(etree.tostring(elem, pretty_print=True).decode())
1777
+ elem.clear()
1778
+ while elem.getprevious() is not None:
1779
+ del elem.getparent()[0]
1780
+ del context
1781
+ return "".join(xml_parts)
1782
+
1783
+ file_size = os.path.getsize(fpath) / 1024 / 1024 # in MB
1784
+
1785
+ if file_size > fsize_thr:
1786
+ print(f"reading a small file:{file_size} Mb")
1787
+ return load_large_xml(fpath)
1788
+ else:
1789
+ print(f"reading a big file:{file_size} Mb")
1790
+ return load_small_xml(fpath)
1679
1791
 
1680
1792
  def get_comment(fpath, comment=None, encoding="utf-8", lines_to_check=5):
1681
1793
  """
@@ -1721,7 +1833,7 @@ def fload(fpath, kind=None, **kwargs):
1721
1833
  fmt=kwargs.pop("fmt",False)
1722
1834
  verbose=kwargs.pop("verbose",False)
1723
1835
  if verbose:
1724
- print_pd_usage("read_csv", verbose=verbose)
1836
+ use_pd("read_csv", verbose=verbose)
1725
1837
  return
1726
1838
 
1727
1839
  if comment is None:
@@ -1742,6 +1854,8 @@ def fload(fpath, kind=None, **kwargs):
1742
1854
  on_bad_lines=on_bad_lines,
1743
1855
  **kwargs,
1744
1856
  )
1857
+ if is_df_abnormal(df, verbose=0):
1858
+ raise ValueError("the df is abnormal")
1745
1859
  except:
1746
1860
  try:
1747
1861
  try:
@@ -1769,7 +1883,6 @@ def fload(fpath, kind=None, **kwargs):
1769
1883
  comment=comment,
1770
1884
  **kwargs,
1771
1885
  )
1772
-
1773
1886
  if is_df_abnormal(df, verbose=0):
1774
1887
  raise ValueError("the df is abnormal")
1775
1888
  except (UnicodeDecodeError, ValueError):
@@ -1805,7 +1918,8 @@ def fload(fpath, kind=None, **kwargs):
1805
1918
  separators = [",", "\t", ";", "|", " "]
1806
1919
  for sep in separators:
1807
1920
  sep2show = sep if sep != "\t" else "\\t"
1808
- print(f'trying with: engine=pyarrow, sep="{sep2show}"')
1921
+ # print(f'trying with: engine=pyarrow, sep="{sep2show}"')
1922
+ # print(".")
1809
1923
  try:
1810
1924
  df = pd.read_csv(
1811
1925
  fpath,
@@ -1817,10 +1931,9 @@ def fload(fpath, kind=None, **kwargs):
1817
1931
  **kwargs,
1818
1932
  )
1819
1933
  if not is_df_abnormal(df, verbose=0): # normal
1820
- break
1821
- else:
1822
- if is_df_abnormal(df, verbose=0):
1823
- pass
1934
+ display(df.head(2))
1935
+ print(f"shape: {df.shape}")
1936
+ return df
1824
1937
  except:
1825
1938
  pass
1826
1939
  else:
@@ -1829,8 +1942,9 @@ def fload(fpath, kind=None, **kwargs):
1829
1942
  separators = [",", "\t", ";", "|", " "]
1830
1943
  for sep in separators:
1831
1944
  try:
1832
- sep2show = sep if sep != "\t" else "\\t"
1833
- print(f"trying with: engine={engine}, sep='{sep2show}'")
1945
+ # sep2show = sep if sep != "\t" else "\\t"
1946
+ # print(f"trying with: engine={engine}, sep='{sep2show}'")
1947
+ # print(".")
1834
1948
  df = pd.read_csv(
1835
1949
  fpath,
1836
1950
  engine=engine,
@@ -1839,8 +1953,12 @@ def fload(fpath, kind=None, **kwargs):
1839
1953
  comment=comment,
1840
1954
  **kwargs,
1841
1955
  )
1956
+ # display(df.head(2))
1957
+ # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
1842
1958
  if not is_df_abnormal(df, verbose=0):
1843
- break
1959
+ display(df.head(2))
1960
+ print(f"shape: {df.shape}")
1961
+ return df
1844
1962
  except EmptyDataError as e:
1845
1963
  continue
1846
1964
  else:
@@ -1853,7 +1971,7 @@ def fload(fpath, kind=None, **kwargs):
1853
1971
  engine = kwargs.get("engine", "openpyxl")
1854
1972
  verbose=kwargs.pop("verbose",False)
1855
1973
  if verbose:
1856
- print_pd_usage("read_excel", verbose=verbose)
1974
+ use_pd("read_excel", verbose=verbose)
1857
1975
  df = pd.read_excel(fpath, engine=engine, **kwargs)
1858
1976
  try:
1859
1977
  meata=pd.ExcelFile(fpath)
@@ -2263,7 +2381,7 @@ def fsave(
2263
2381
 
2264
2382
  verbose=kwargs.pop("verbose",False)
2265
2383
  if verbose:
2266
- print_pd_usage("to_csv", verbose=verbose)
2384
+ use_pd("to_csv", verbose=verbose)
2267
2385
  kwargs_csv = dict(
2268
2386
  path_or_buf=None,
2269
2387
  sep=",",
@@ -2295,7 +2413,7 @@ def fsave(
2295
2413
  verbose=kwargs.pop("verbose",False)
2296
2414
  sheet_name = kwargs.pop("sheet_name", "Sheet1")
2297
2415
  if verbose:
2298
- print_pd_usage("to_excel", verbose=verbose)
2416
+ use_pd("to_excel", verbose=verbose)
2299
2417
  if any(kwargs):
2300
2418
  format_excel(df=data, filename=fpath, **kwargs)
2301
2419
  else:
@@ -2342,15 +2460,20 @@ def fsave(
2342
2460
  # json.dump(data, file, **kwargs)
2343
2461
 
2344
2462
  def save_json(fpath_fname, var_dict_or_df):
2463
+ def _convert_js(data):
2464
+ if isinstance(data, pd.DataFrame):
2465
+ return data.to_dict(orient="list")
2466
+ elif isinstance(data, np.ndarray):
2467
+ return data.tolist()
2468
+ elif isinstance(data, dict):
2469
+ return {key: _convert_js(value) for key, value in data.items()}
2470
+ return data
2471
+
2472
+ serializable_data = _convert_js(var_dict_or_df)
2473
+
2474
+ # Save the serializable data to the JSON file
2345
2475
  with open(fpath_fname, "w") as f_json:
2346
- if isinstance(var_dict_or_df, pd.DataFrame):
2347
- var_dict_or_df = var_dict_or_df.to_dict(orient="dict")
2348
- if isinstance(var_dict_or_df, dict):
2349
- for key, value in var_dict_or_df.items():
2350
- if isinstance(value, np.ndarray):
2351
- var_dict_or_df[key] = value.tolist()
2352
- # Save the dictionary or list of dictionaries to a JSON file
2353
- json.dump(var_dict_or_df, f_json, indent=4)
2476
+ json.dump(serializable_data, f_json, indent=4)
2354
2477
 
2355
2478
  # # Example usage:
2356
2479
  # sets = {"title": "mse_path_ MSE"}
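
The refactored save_json() above now converts nested structures recursively before dumping; a minimal sketch of the same conversion (hedged; it mirrors the _convert_js helper rather than calling fsave itself):

import json
import numpy as np
import pandas as pd

def _convert_js(data):
    # DataFrame -> dict of column lists, ndarray -> list, dicts handled recursively
    if isinstance(data, pd.DataFrame):
        return data.to_dict(orient="list")
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, dict):
        return {key: _convert_js(value) for key, value in data.items()}
    return data

payload = {"scores": np.array([1.5, 2.5]),
           "table": pd.DataFrame({"a": [1, 2], "b": [3, 4]})}
print(json.dumps(_convert_js(payload), indent=4))
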
@@ -2594,7 +2717,7 @@ def listdir(
2594
2717
  print(ls)
2595
2718
  df_all = pd.DataFrame(
2596
2719
  {
2597
- "fname": all_files,
2720
+ "fname": ls,
2598
2721
  "fpath": [os.path.join(rootdir, i) for i in ls],
2599
2722
  }
2600
2723
  )
@@ -4444,7 +4567,42 @@ def preview(var):
4444
4567
  # preview("# This is a Markdown header")
4445
4568
  # preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
4446
4569
  # preview({"key": "value", "numbers": [1, 2, 3]})
4447
-
4570
+ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
4571
+ """
4572
+ Extend a DataFrame by splitting list-like elements in the specified column.
4573
+
4574
+ Parameters:
4575
+ ----------
4576
+ data : pd.DataFrame
4577
+ The input DataFrame to be extended.
4578
+
4579
+ column : str
4580
+ The name of the column to be split.
4581
+
4582
+ axis : int, optional
4583
+ The axis along which to expand the DataFrame.
4584
+ - 0 (default): Expand the specified column into multiple rows.
4585
+ - 1: Expand the specified column into multiple columns.
4586
+
4587
+ sep : str, optional
4588
+ The separator used to split the values in the specified column.
4589
+ Must be provided for the function to work correctly.
4590
+ """
4591
+
4592
+ data = data.copy()
4593
+ mask = data[column].str.contains(sep, na=False)
4594
+ data = data.copy()
4595
+ if mask.any():
4596
+ data[column] = (
4597
+ data[column]
4598
+ .apply(lambda x: x.split(sep) if isinstance(x, str) else x) # Only split if x is a string
4599
+ )
4600
+
4601
+ # Strip spaces from each item in the lists
4602
+ data[column] = data[column].apply(lambda x: [item.strip() for item in x] if isinstance(x, list) else x)
4603
+
4604
+ data = data.explode(column, ignore_index=True)
4605
+ return data
4448
4606
  # ! DataFrame
4449
4607
  def df_astype(
4450
4608
  data: pd.DataFrame,
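
A usage sketch for the new df_extend() helper (hedged; assumes it is exposed from py2ls.ips, and that sep is always passed since the function splits on it unconditionally):

import pandas as pd
from py2ls.ips import df_extend  # assumed import path

df = pd.DataFrame({"gene": ["g1", "g2"],
                   "go_terms": ["GO:0001; GO:0002", "GO:0003"]})
# axis=0 (default): each ';'-separated value becomes its own row, whitespace stripped
long_df = df_extend(df, column="go_terms", sep=";")
print(long_df)   # expected 3 rows: g1/GO:0001, g1/GO:0002, g2/GO:0003
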
@@ -4703,7 +4861,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
4703
4861
  def df_merge(
4704
4862
  df1: pd.DataFrame,
4705
4863
  df2: pd.DataFrame,
4706
- use_index: bool = True,
4864
+ use_index: bool = False,
4707
4865
  columns: list = ["col_left", "col_right"],
4708
4866
  how: str = "left",
4709
4867
  ) -> pd.DataFrame:
@@ -4731,7 +4889,7 @@ def df_merge(
4731
4889
  """
4732
4890
 
4733
4891
  # 1. Check if indices are comparable (same length and types)
4734
- if use_index or df1.index.equals(df2.index):
4892
+ if use_index:
4735
4893
  print(f"Merging based on index using '{how}' join...")
4736
4894
  df_merged = pd.merge(df1, df2, left_index=True, right_index=True, how=how)
4737
4895
  return df_merged
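
With use_index now defaulting to False, df_merge() falls back to column-based merging unless index merging is requested explicitly; a hedged sketch (the assumption that columns names the left and right key columns comes from the default ["col_left", "col_right"], not from code shown in this hunk):

import pandas as pd
from py2ls.ips import df_merge  # assumed import path

df1 = pd.DataFrame({"col_left": ["a", "b", "c"], "x": [1, 2, 3]})
df2 = pd.DataFrame({"col_right": ["a", "b", "d"], "y": [10, 20, 30]})

merged_cols = df_merge(df1, df2, columns=["col_left", "col_right"], how="left")
merged_idx = df_merge(df1, df2, use_index=True, how="left")  # index merge must now be opted into
print(merged_cols)
print(merged_idx)
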
@@ -4762,12 +4920,53 @@ def df_merge(
4762
4920
  )
4763
4921
  return df_merged
4764
4922
 
4923
+ def df_drop_duplicates(
4924
+ data: pd.DataFrame,
4925
+ by: Union[
4926
+ str, List[str]
4927
+ ] = "index", # Options: 'index', or column name(s) for 'rows'
4928
+ keep="first", # Options: 'first', 'last', or False (drop all duplicates)
4929
+ ignore_index=True,
4930
+ inplace: bool = False,
4931
+ verbose=True
4932
+ ):
4933
+ """
4934
+ data (pd.DataFrame): DataFrame to drop duplicates from.
4935
+ by (str or list): how duplicates are identified:
4936
+ - 'index': Drop duplicates based on the DataFrame index.
4937
+ - Column name(s) for row-wise duplicate checking.
4938
+ keep (str): Which duplicates to keep:
4939
+ 'first',
4940
+ 'last',
4941
+ False (drop all duplicates).
4942
+ inplace (bool): Whether to modify the original DataFrame in place.
4943
+ """
4944
+ original_shape = data.shape
4945
+ if by == "index":
4946
+ # Drop duplicates in the index
4947
+ result = data[~data.index.duplicated(keep=keep)]
4948
+ else:
4949
+ # Drop duplicates row-wise based on column(s)
4950
+ result = data.drop_duplicates(subset=by, keep=keep,ignore_index=ignore_index)
4951
+ if original_shape!=result.shape or verbose:
4952
+ print(f"\nshape:{original_shape} (before drop_duplicates)")
4953
+ print(f"shape:{result.shape} (after drop_duplicates)")
4954
+ if inplace:
4955
+ # Modify the original DataFrame in place
4956
+ data.drop(data.index, inplace=True) # Drop all rows first
4957
+ data[data.columns] = result # Refill the DataFrame
4958
+ return None
4959
+ else:
4960
+ return result
4765
4961
  def df_fillna(
4766
4962
  data: pd.DataFrame,
4767
- method: str = "mean",
4963
+ method: str = "knn",
4768
4964
  axis: int = 0,# column-wise
4769
4965
  constant: float = None,
4966
+ n_neighbors: int = 5, # KNN-specific
4967
+ max_iter: int = 10, # Iterative methods specific
4770
4968
  inplace: bool = True,
4969
+ random_state:int = None
4771
4970
  ) -> pd.DataFrame:
4772
4971
  """
4773
4972
  Fill missing values in a DataFrame using specified imputation method.
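
A usage sketch for the new df_drop_duplicates() helper (hedged; assumes the py2ls.ips import path):

import pandas as pd
from py2ls.ips import df_drop_duplicates  # assumed import path

df = pd.DataFrame({"id": [1, 1, 2], "val": ["a", "a", "b"]}, index=[0, 0, 1])

# by="index" (default): drop rows whose index label repeats
print(df_drop_duplicates(df, by="index", keep="first"))

# by=<column(s)>: row-wise duplicates judged on those columns
print(df_drop_duplicates(df, by=["id", "val"], keep="last"))
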
@@ -4779,8 +4978,15 @@ def df_fillna(
4779
4978
  - 'median': Replace missing values with the median of the column.
4780
4979
  - 'most_frequent': Replace missing values with the most frequent value in the column.
4781
4980
  - 'constant': Replace missing values with a constant value provided by the `constant` parameter.
4782
- - 'knn': Use K-Nearest Neighbors imputation.
4783
- - 'iterative': Use Iterative imputation.
4981
+ - 'knn': Use K-Nearest Neighbors imputation; replaces missing values based on the values of the nearest neighbors
4982
+ - 'iterative': Use Iterative imputation; models each feature with missing values as a function of the other features and estimates them iteratively
4983
+ - 'mice' (Multivariate Imputation by Chained Equations): A special case of iterative imputation.
4984
+ # - 'missforest': A random forest-based imputation method. Uses a random forest model to predict and fill missing values
4985
+ # - 'softimpute': Matrix factorization imputation.A matrix factorization technique where missing values are imputed by
4986
+ # reconstructing the data matrix using low-rank approximation
4987
+ # - EM (Expectation-Maximization): Often used in advanced statistics to estimate missing values in a probabilistic framework.
4988
+ # - 'svd': Use IterativeSVD (matrix factorization via Singular Value Decomposition).
4989
+
4784
4990
  axis (int): The axis along which to impute:
4785
4991
  - 0: Impute column-wise (default).
4786
4992
  - 1: Impute row-wise.
@@ -4793,7 +4999,8 @@ def df_fillna(
4793
4999
  raise ValueError("Input DataFrame is empty.")
4794
5000
 
4795
5001
  # Validate method
4796
- methods = ["mean", "median", "most_frequent", "constant", "knn", "iterative"]
5002
+ methods = ["mean", "median", "most_frequent",
5003
+ "constant", "knn", "iterative"]#,"missforest","softimpute","svd"]
4797
5004
  method = strcmp(method, methods)[0]
4798
5005
 
4799
5006
  # If using constant method, ask for a constant value
@@ -4806,18 +5013,27 @@ def df_fillna(
4806
5013
 
4807
5014
  # Initialize SimpleImputer with the chosen method
4808
5015
  if method == "constant":
5016
+ from sklearn.impute import SimpleImputer
4809
5017
  imputer = SimpleImputer(strategy=method, fill_value=constant)
4810
5018
  elif method == "knn":
4811
5019
  from sklearn.impute import KNNImputer
4812
-
4813
5020
  imputer = KNNImputer(n_neighbors=n_neighbors)
4814
- elif method == "iterative":
5021
+ elif method == "iterative" or method == "mice":
5022
+ from sklearn.experimental import enable_iterative_imputer
4815
5023
  from sklearn.impute import IterativeImputer
4816
5024
 
4817
- imputer = IterativeImputer(max_iter=max_iter)
4818
- else:
5025
+ imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
5026
+ # elif method == "missforest":
5027
+ # from missingpy import MissForest
5028
+ # imputer = MissForest(max_iter=max_iter, random_state=random_state)
5029
+ # elif method == "softimpute":
5030
+ # from fancyimpute import SoftImpute
5031
+ # imputer = SoftImpute()
5032
+ # elif method == "svd":
5033
+ # from fancyimpute import IterativeSVD
5034
+ # imputer = IterativeSVD(max_iters=max_iter)
5035
+ else: # mean, median, most_frequent
4819
5036
  from sklearn.impute import SimpleImputer
4820
-
4821
5037
  imputer = SimpleImputer(strategy=method)
4822
5038
 
4823
5039
  # Fit and transform the data
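
A usage sketch for df_fillna() with the new defaults and parameters (hedged; scikit-learn is required since 'knn' and 'iterative'/'mice' use KNNImputer and IterativeImputer):

import numpy as np
import pandas as pd
from py2ls.ips import df_fillna  # assumed import path

df = pd.DataFrame({"A": [1, 2, np.nan, 4, 5],
                   "B": [np.nan, 2, 3, 4, np.nan]})

# new default method: KNN imputation; n_neighbors is the new KNN-specific knob
print(df_fillna(df, method="knn", n_neighbors=3, inplace=False))

# iterative (MICE-style) imputation; max_iter and random_state are passed through
print(df_fillna(df, method="iterative", max_iter=10, random_state=1, inplace=False))
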
@@ -4843,8 +5059,38 @@ def df_fillna(
4843
5059
  return None # replace original
4844
5060
  else:
4845
5061
  return df_filled
5062
+ # # example
5063
+ # data = {
5064
+ # "A": [1, 2, np.nan, 4, 5],
5065
+ # "B": [np.nan, 2, 3, 4, np.nan],
5066
+ # "C": [1, np.nan, 3, 4, 5],
5067
+ # "D": [1, 2, 3, 4, np.nan],
5068
+ # }
5069
+
5070
+ # # Define a function to test each imputation method
5071
+ # methods = [
5072
+ # "mean",
5073
+ # "median",
5074
+ # "most_frequent",
5075
+ # "constant",
5076
+ # "knn",
5077
+ # "iterative",
5078
+ # # "missforest",
5079
+ # # "softimpute",
5080
+ # # "svd",
5081
+ # ]
5082
+
5083
+ # # Create a dictionary to hold results
5084
+ # results = {}
5085
+
5086
+ # for method_name in methods:
5087
+ # print(method_name)
5088
+ # display(df)
5089
+ # display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
5090
+
5091
+
4846
5092
  def df_scaler(
4847
- data: pd.DataFrame,
5093
+ data: pd.DataFrame, # should be numeric dtype
4848
5094
  method="standard",
4849
5095
  columns=None, # default, select all numeric col/row
4850
5096
  inplace=False,
@@ -4984,7 +5230,7 @@ def df_cluster(
4984
5230
  X = scaler.fit_transform(X)
4985
5231
 
4986
5232
  for n_cluster in range_n_clusters:
4987
- kmeans = KMeans(n_clusters=n_cluster, random_state=42)
5233
+ kmeans = KMeans(n_clusters=n_cluster, random_state=1)
4988
5234
  cluster_labels = kmeans.fit_predict(X)
4989
5235
 
4990
5236
  silhouette_avg = silhouette_score(X, cluster_labels)
@@ -5000,7 +5246,7 @@ def df_cluster(
5000
5246
  print(f"n_clusters = {n_clusters}")
5001
5247
 
5002
5248
  # Apply K-Means Clustering with Optimal Number of Clusters
5003
- kmeans = KMeans(n_clusters=n_clusters, random_state=42)
5249
+ kmeans = KMeans(n_clusters=n_clusters, random_state=1)
5004
5250
  cluster_labels = kmeans.fit_predict(X)
5005
5251
 
5006
5252
  if plot:
@@ -5101,7 +5347,7 @@ def df_cluster(
5101
5347
  # n_clusters = (
5102
5348
  # np.argmax(silhouette_avg_scores) + 2
5103
5349
  # ) # Optimal clusters based on max silhouette score
5104
- # kmeans = KMeans(n_clusters=n_clusters, random_state=42)
5350
+ # kmeans = KMeans(n_clusters=n_clusters, random_state=1)
5105
5351
  # cluster_labels = kmeans.fit_predict(X)
5106
5352
  silhouette_vals = silhouette_samples(X, cluster_labels)
5107
5353
 
@@ -5252,12 +5498,14 @@ def df_reducer(
5252
5498
  columns: Optional[List[str]] = None,
5253
5499
  method: str = "umap", # 'pca', 'umap'
5254
5500
  n_components: int = 2, # Default for umap, but 50 for PCA
5255
- umap_neighbors: int = 15, # Default
5256
- umap_min_dist: float = 0.1, # Default
5501
+ umap_neighbors: int = 15, # UMAP-specific
5502
+ umap_min_dist: float = 0.1, # UMAP-specific
5503
+ tsne_perplexity: int = 30, # t-SNE-specific
5257
5504
  scale: bool = True,
5258
5505
  fill_missing: bool = True,
5259
5506
  debug: bool = False,
5260
5507
  inplace: bool = True, # replace the oringinal data
5508
+ plot_:bool = False,# plot scatterplot, but no 'hue',so it is meaningless
5261
5509
  ) -> pd.DataFrame:
5262
5510
  """
5263
5511
  Reduces the dimensionality of the selected DataFrame using PCA or UMAP.
@@ -5293,14 +5541,40 @@ def df_reducer(
5293
5541
  reduced_df : pd.DataFrame
5294
5542
  DataFrame with the reduced dimensions.
5295
5543
  """
5296
- from sklearn.decomposition import PCA
5544
+
5545
+ """
5546
+ PCA: explained_variance:
5547
+ indicates the proportion of the dataset's total variance that each principal
5548
+ component (PC) explains. It gives you a sense of how much information
5549
+ (or variance) is captured by each PC
5550
+ Interpretation:
5551
+ - Higher values indicate that the corresponding PC captures more variance.
5552
+ - The sum of the explained variances for all PCs equals 1 (or 100%).
5553
+ - If the first few components explain a high percentage (e.g., 90%),
5554
+ it means you can reduce the dimensionality of the data significantly without losing much information.
5555
+ Use case:
5556
+ You may plot a scree plot, which shows the explained variance for each PC, to help decide
5557
+ how many components to keep for analysis.
5558
+
5559
+ PCA: Singular values:
5560
+ represent the magnitude of variance along each principal component. Mathematically,
5561
+ they are the square roots of the eigenvalues of the covariance matrix.
5562
+ Interpretation:
5563
+ Larger singular values indicate that the associated PC captures more variance.
5564
+ Singular values are related to the scale of the data. If the data are scaled
5565
+ before PCA (e.g., standardized), then the singular values will provide a measure
5566
+ of the spread of data along each PC.
5567
+ Use case:
5568
+ Singular values help quantify the contribution of each principal component in a
5569
+ similar way to the explained variance. They are useful in understanding the overall
5570
+ structure of the data.
5571
+ """
5297
5572
  from sklearn.preprocessing import StandardScaler
5298
- import umap
5299
5573
  from sklearn.impute import SimpleImputer
5300
5574
 
5301
5575
  # Select columns if specified, else use all columns
5302
5576
  X = data[columns].values if columns else data.values
5303
-
5577
+ print(X.shape,type(X))
5304
5578
  # Handle missing values
5305
5579
  if fill_missing:
5306
5580
  imputer = SimpleImputer(strategy="mean")
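
A small standalone sketch of the two quantities described in the docstring above, using scikit-learn directly (hedged; not tied to df_reducer itself):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
X[:, 1] = 2 * X[:, 0] + rng.normal(scale=0.1, size=200)   # two strongly correlated columns

X_std = StandardScaler().fit_transform(X)
pca = PCA().fit(X_std)

print(pca.explained_variance_ratio_)             # proportions of total variance, summing to 1
print(np.cumsum(pca.explained_variance_ratio_))  # cumulative curve used for a 95% threshold rule
print(pca.singular_values_)
# relation to the eigenvalues of the covariance matrix of the centered data:
print(pca.singular_values_ ** 2 / (X_std.shape[0] - 1))   # equals pca.explained_variance_
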
@@ -5312,76 +5586,215 @@ def df_reducer(
5312
5586
  X = scaler.fit_transform(X)
5313
5587
 
5314
5588
  # Check valid method input
5315
- if method not in ["pca", "umap"]:
5316
- raise ValueError(f"Invalid method '{method}'. Choose 'pca' or 'umap'.")
5317
-
5589
+ methods=["pca", "umap","tsne","factor","isolation_forest"]
5590
+ method=strcmp(method, methods)[0]
5318
5591
  # Apply PCA if selected
5319
- if method == "pca":
5320
- if n_components is None:
5321
- # to get the n_components with threshold method:
5322
- pca = PCA()
5323
- pca_result = pca.fit_transform(X)
5324
-
5325
- # Calculate explained variance
5326
- explained_variance = pca.explained_variance_ratio_
5327
- # Cumulative explained variance
5328
- cumulative_variance = np.cumsum(explained_variance)
5329
- # Set a threshold for cumulative variance
5330
- threshold = 0.95 # Example threshold
5331
- n_components = (
5332
- np.argmax(cumulative_variance >= threshold) + 1
5333
- ) # Number of components to retain
5334
- if debug:
5335
- # debug:
5336
- # Plot the cumulative explained variance
5337
- plt.figure(figsize=(8, 5))
5338
- plt.plot(
5339
- range(1, len(cumulative_variance) + 1),
5340
- cumulative_variance,
5341
- marker="o",
5342
- linestyle="-",
5343
- )
5344
- plt.title("Cumulative Explained Variance by Principal Components")
5345
- plt.xlabel("Number of Principal Components")
5346
- plt.ylabel("Cumulative Explained Variance")
5347
- plt.xticks(range(1, len(cumulative_variance) + 1))
5348
- # Add horizontal line for the threshold
5349
- plt.axhline(
5350
- y=threshold, color="r", linestyle="--", label="Threshold (95%)"
5351
- )
5352
- # Add vertical line for n_components
5353
- plt.axvline(
5354
- x=n_components,
5355
- color="g",
5356
- linestyle="--",
5357
- label=f"n_components = {n_components}",
5358
- )
5359
- plt.legend()
5360
- plt.grid()
5592
+ if method == "pca":
5593
+ from sklearn.decomposition import PCA
5361
5594
  pca = PCA(n_components=n_components)
5362
5595
  X_reduced = pca.fit_transform(X)
5363
- print(f"PCA completed: Reduced to {n_components} components.")
5596
+
5597
+ # Additional PCA information
5598
+ explained_variance = pca.explained_variance_ratio_
5599
+ singular_values = pca.singular_values_
5600
+ loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
5601
+
5602
+ if debug:
5603
+ print(f"PCA completed: Reduced to {n_components} components.")
5604
+ print(f"Explained Variance: {explained_variance}")
5605
+ print(f"Singular Values: {singular_values}")
5606
+
5607
+ # Plot explained variance if debug=True
5608
+ if debug:
5609
+ # Plot explained variance
5610
+ cumulative_variance = np.cumsum(explained_variance)
5611
+ plt.figure(figsize=(8, 5))
5612
+ plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o")
5613
+ plt.title("Cumulative Explained Variance by Principal Components")
5614
+ plt.xlabel("Number of Principal Components")
5615
+ plt.ylabel("Cumulative Explained Variance")
5616
+ plt.axhline(y=0.95, color="r", linestyle="--", label="Threshold (95%)")
5617
+ plt.axvline(x=n_components, color="g", linestyle="--", label=f"n_components = {n_components}")
5618
+ plt.legend()
5619
+ plt.grid()
5620
+ plt.show()
5621
+
5622
+ # Prepare reduced DataFrame with additional PCA info
5623
+ pca_df = pd.DataFrame(
5624
+ X_reduced, index=data.index,
5625
+ columns=[f"PC_{i+1}" for i in range(n_components)]
5626
+ )
5627
+ # pca_df["Explained Variance"] = np.tile(explained_variance[:n_components], (pca_df.shape[0], 1))
5628
+ # pca_df["Singular Values"] = np.tile(singular_values[:n_components], (pca_df.shape[0], 1))
5629
+ # Expand explained variance to multiple columns if needed
5630
+ for i in range(n_components):
5631
+ pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
5632
+ for i in range(n_components):
5633
+ pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
5364
5634
 
5365
5635
  # Apply UMAP if selected
5366
5636
  elif method == "umap":
5637
+ import umap
5367
5638
  umap_reducer = umap.UMAP(
5368
5639
  n_neighbors=umap_neighbors,
5369
5640
  min_dist=umap_min_dist,
5370
- n_components=n_components,
5641
+ n_components=n_components
5371
5642
  )
5372
5643
  X_reduced = umap_reducer.fit_transform(X)
5373
- print(f"UMAP completed: Reduced to {n_components} components.")
5374
5644
 
5375
- # Return reduced data as a new DataFrame with the same index
5376
- reduced_df = pd.DataFrame(X_reduced, index=data.index)
5645
+ # Additional UMAP information
5646
+ embedding = umap_reducer.embedding_
5647
+ trustworthiness = umap_reducer._raw_data[:, :n_components]
5648
+
5649
+ if debug:
5650
+ print(f"UMAP completed: Reduced to {n_components} components.")
5651
+ print(f"Embedding Shape: {embedding.shape}")
5652
+ print(f"Trustworthiness: {trustworthiness}")
5653
+
5654
+ # Prepare reduced DataFrame with additional UMAP info
5655
+ umap_df = pd.DataFrame(
5656
+ X_reduced, index=data.index,
5657
+ columns=[f"UMAP_{i+1}" for i in range(n_components)]
5658
+ )
5659
+ umap_df["Embedding"] = embedding[:, 0] # Example of embedding data
5660
+ umap_df["Trustworthiness"] = trustworthiness[:, 0] # Trustworthiness metric
5661
+ elif method == "tsne":
5662
+ from sklearn.manifold import TSNE
5663
+ tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=1)
5664
+ X_reduced = tsne.fit_transform(X)
5665
+
5666
+ # Prepare reduced DataFrame with additional t-SNE info
5667
+ tsne_df = pd.DataFrame(
5668
+ X_reduced, index=data.index,
5669
+ columns=[f"tSNE_{i+1}" for i in range(n_components)]
5670
+ )
5671
+ tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
5672
+
5673
+ # Apply Factor Analysis if selected
5674
+ elif method == "factor":
5675
+ from sklearn.decomposition import FactorAnalysis
5676
+ factor = FactorAnalysis(n_components=n_components, random_state=1)
5677
+ X_reduced = factor.fit_transform(X)
5678
+ # Factor Analysis does not directly provide explained variance, but we can approximate it
5679
+ fa_variance = factor.noise_variance_
5680
+ # Prepare reduced DataFrame with additional Factor Analysis info
5681
+ factor_df = pd.DataFrame(
5682
+ X_reduced, index=data.index,
5683
+ columns=[f"Factor_{i+1}" for i in range(n_components)]
5684
+ )
5685
+ factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
5686
+
5687
+ # Apply Isolation Forest for outlier detection if selected
5688
+ elif method == "isolation_forest":
5689
+ from sklearn.decomposition import PCA
5690
+ from sklearn.ensemble import IsolationForest
5691
+ # Step 1: Apply PCA for dimensionality reduction to n_components
5692
+ pca = PCA(n_components=n_components)
5693
+ X_pca = pca.fit_transform(X)
5694
+
5695
+ explained_variance = pca.explained_variance_ratio_
5696
+ singular_values = pca.singular_values_
5697
+
5698
+ # Prepare reduced DataFrame with additional PCA info
5699
+ iso_forest_df = pd.DataFrame(
5700
+ X_pca, index=data.index,
5701
+ columns=[f"PC_{i+1}" for i in range(n_components)]
5702
+ )
5703
+
5704
+ isolation_forest = IsolationForest(n_estimators=100, contamination='auto',random_state=1)
5705
+ isolation_forest.fit(X)
5706
+ anomaly_scores = isolation_forest.decision_function(X) # Anomaly score: larger is less anomalous
5707
+ # Predict labels: 1 (normal), -1 (anomaly)
5708
+ anomaly_labels = isolation_forest.fit_predict(X)
5709
+ # Add anomaly scores and labels to the DataFrame
5710
+ iso_forest_df["Anomaly Score"] = anomaly_scores
5711
+ iso_forest_df["Anomaly Label"] = anomaly_labels
5712
+ # add info from pca
5713
+ for i in range(n_components):
5714
+ iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
5715
+ for i in range(n_components):
5716
+ iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
5717
+
5718
+ # Return reduced data and info as a new DataFrame with the same index
5719
+ if method == "pca":
5720
+ reduced_df = pca_df
5721
+ colname_met = "PC_"
5722
+ if plot_:
5723
+ sns.scatterplot(
5724
+ data=pca_df,
5725
+ x="PC_1",
5726
+ y="PC_2",
5727
+ # hue="condition",
5728
+ )
5729
+ elif method == "umap":
5730
+ reduced_df = umap_df
5731
+ colname_met = "UMAP_"
5732
+ if plot_:
5733
+ sns.scatterplot(
5734
+ data=umap_df,
5735
+ x="UMAP_1",
5736
+ y="UMAP_2",
5737
+ # hue="condition",
5738
+ )
5739
+ elif method == "tsne":
5740
+ reduced_df = tsne_df
5741
+ colname_met = "t-SNE_"
5742
+ if plot_:
5743
+ sns.scatterplot(
5744
+ data=tsne_df,
5745
+ x="tSNE_1",
5746
+ y="tSNE_2",
5747
+ # hue="batch",
5748
+ )
5749
+ elif method == "factor":
5750
+ reduced_df = factor_df
5751
+ colname_met = "Factor_"
5752
+ if plot_:
5753
+ sns.scatterplot(
5754
+ data=factor_df,
5755
+ x="Factor_1",
5756
+ y="Factor_2",
5757
+ # hue="batch",
5758
+ )
5759
+ elif method == "isolation_forest":
5760
+ reduced_df = iso_forest_df # Already a DataFrame for outliers
5761
+ colname_met = "PC_"
5762
+ if plot_:
5763
+ ax = sns.scatterplot(
5764
+ data=iso_forest_df[iso_forest_df["Anomaly Label"] == 1],
5765
+ x="PC_1",
5766
+ y="PC_2",
5767
+ label="normal", c="b",
5768
+ )
5769
+ ax = sns.scatterplot(
5770
+ ax=ax,
5771
+ data=iso_forest_df[iso_forest_df["Anomaly Label"] == -1],
5772
+ x="PC_1",
5773
+ y="PC_2",
5774
+ c="r",
5775
+ label="outlier", marker="+", s=30,
5776
+ )
5777
+
5377
5778
 
5378
5779
  if inplace:
5379
- # Replace or add new columns based on n_components
5780
+ # If inplace=True, add components back into the original data
5380
5781
  for col_idx in range(n_components):
5381
- data[f"Component_{col_idx+1}"] = reduced_df.iloc[:, col_idx]
5782
+ data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
5783
+ # Add extra info for PCA/UMAP
5784
+ if method == "pca":
5785
+ for i in range(n_components):
5786
+ data[f"Explained Variance PC_{i+1}"] = reduced_df[f"Explained Variance PC_{i+1}"]
5787
+ for i in range(n_components):
5788
+ data[f"Singular Values PC_{i+1}"] = reduced_df[f"Singular Values PC_{i+1}"]
5789
+ elif method == "umap":
5790
+ for i in range(n_components):
5791
+ data[f"UMAP_{i+1}"]=reduced_df[f"UMAP_{i+1}"]
5792
+ data["Embedding"] = reduced_df["Embedding"]
5793
+ data["Trustworthiness"] = reduced_df["Trustworthiness"]
5382
5794
  return None # No return when inplace=True
5795
+
5383
5796
 
5384
- return reduced_df
5797
+ return reduced_df
5385
5798
 
5386
5799
 
5387
5800
  # example:
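
A hedged usage sketch for the extended df_reducer() (assumes the py2ls.ips import path; umap-learn is only needed for method="umap"):

import numpy as np
import pandas as pd
from py2ls.ips import df_reducer  # assumed import path

df = pd.DataFrame(np.random.default_rng(1).normal(size=(60, 6)),
                  columns=[f"feat_{i}" for i in range(6)])

# PCA: returns PC_1..PC_n plus per-component explained-variance / singular-value columns
pca_out = df_reducer(df.copy(), method="pca", n_components=2, inplace=False)

# t-SNE: perplexity is exposed via the new tsne_perplexity parameter
tsne_out = df_reducer(df.copy(), method="tsne", n_components=2, tsne_perplexity=20, inplace=False)

# Isolation Forest: PCA coordinates plus "Anomaly Score" / "Anomaly Label" columns
iso_out = df_reducer(df.copy(), method="isolation_forest", n_components=2, inplace=False)
print(pca_out.head(2), tsne_out.head(2), iso_out.head(2), sep="\n")
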
@@ -5636,7 +6049,7 @@ def evaluate_cluster(
5636
6049
  return metrics
5637
6050
 
5638
6051
 
5639
- def print_pd_usage(
6052
+ def use_pd(
5640
6053
  func_name="excel",
5641
6054
  verbose=True,
5642
6055
  dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",