py2ls 0.2.4.3__py3-none-any.whl → 0.2.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -106,6 +106,8 @@ def unique(lst, ascending=None):
     Returns:
     list: a list whose elements are unique, ordered according to the `ascending` parameter.
     """
+    if not lst:
+        return []
     if ascending is not None:
         # remove duplicates
         unique_items = list(set(lst))
@@ -518,7 +520,7 @@ def is_text(s):
 
 from typing import Any, Union
 
-def shared(lst1:Any, lst2:Any,*args, verbose=True):
+def shared(*args, strict=True, n_shared=2, verbose=True):
     """
     check the shared elements in the given lists.
     usage:
@@ -529,14 +531,30 @@ def shared(lst1:Any, lst2:Any,*args, verbose=True):
     """
     if verbose:
         print("\n********* checking shared elements *********")
-    if any([not isinstance(lst1,list),not isinstance(lst1,list)]):
-        print(f"{' '*2}type(list1):\t{type(lst1)},\n{' '*2}type(list2):\t{type(lst2)}>")
-    shared_elements=set(flatten(lst1,verbose=verbose)).intersection(flatten(lst2,verbose=verbose))
-    # support more lists
-    if args:
-        for arg in args:
-            shared_elements=shared_elements.intersection(set(flatten(arg,verbose=verbose)))
-    shared_elements = list(shared_elements)
+
+    if len(args) == 1 and isinstance(args[0], list):
+        lists = args[0]  # Unpack the single list
+    else:
+        lists = args  # Use the provided arguments as lists
+    flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
+    # Ensure all arguments are lists
+    if any(not isinstance(lst, list) for lst in flattened_lists):
+        print(f"{' ' * 2}All inputs must be lists.")
+        return []
+    first_list = flattened_lists[0]
+    shared_elements = [item for item in first_list if all(item in lst for lst in flattened_lists)]
+    if strict:
+        # Strict mode: require elements to be in all lists
+        shared_elements = set(flattened_lists[0])
+        for lst in flattened_lists[1:]:
+            shared_elements.intersection_update(lst)
+    else:
+        all_elements = [item for sublist in flattened_lists for item in sublist]
+        element_count = Counter(all_elements)
+        # Get elements that appear in at least n_shared lists
+        shared_elements = [item for item, count in element_count.items() if count >= n_shared]
+
+    shared_elements = flatten(shared_elements)
     if verbose:
         elements2show = shared_elements if len(shared_elements) < 10 else shared_elements[:5]
         print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
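
The reworked shared() drops the fixed lst1/lst2 parameters: it now accepts any number of lists (or a single list of lists) and either strictly intersects them or, with strict=False, keeps items occurring at least n_shared times across the inputs. A minimal standalone sketch of that logic using only the standard library (shared_sketch is an illustrative stand-in, not the py2ls function, and it omits flatten() and the verbose printing):

    from collections import Counter

    def shared_sketch(*lists, strict=True, n_shared=2):
        # Mirror the new signature: accept shared_sketch(a, b, c) or shared_sketch([a, b, c])
        if len(lists) == 1 and isinstance(lists[0], (list, tuple)):
            lists = lists[0]
        lists = [list(lst) for lst in lists]
        if strict:
            # strict=True: keep only elements present in every list
            result = set(lists[0])
            for lst in lists[1:]:
                result.intersection_update(lst)
            return list(result)
        # strict=False: keep elements appearing at least n_shared times across all lists
        counts = Counter(item for lst in lists for item in lst)
        return [item for item, count in counts.items() if count >= n_shared]

    print(shared_sketch([1, 2, 3], [2, 3, 4], [3, 4, 5]))                # [3]
    print(shared_sketch([1, 2, 3], [2, 3, 4], [3, 4, 5], strict=False))  # [2, 3, 4]
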
@@ -555,17 +573,19 @@ def flatten(nested: Any, unique_list=True,verbose=True):
     while stack:
         current = stack.pop()
         if isinstance(current, dict):
-            stack.extend(current.values())
+            stack.extend(current.values())
         elif isinstance(current, (list, tuple, set)):
             stack.extend(current)
         elif isinstance(current, pd.Series):
             stack.extend(current)
+        elif isinstance(current, (pd.Index, np.ndarray)):  # df.columns df.index are object of type pd.Index
+            stack.extend(current.tolist())
         else:
             flattened_list.append(current)
     if verbose:
         print(f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>")
     if unique_list:
-        return unique(flattened_list)
+        return unique(flattened_list)[::-1]
     else:
         return flattened_list
 
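
With the new pd.Index / np.ndarray branch, flatten() now walks df.columns, df.index and NumPy arrays element by element instead of appending them as single objects, and the unique() result is reversed before being returned. A stripped-down sketch of the traversal only (flatten_sketch is illustrative and skips the unique/reverse and verbose steps):

    import numpy as np
    import pandas as pd

    def flatten_sketch(nested):
        # Iterative flattening mirroring the diff: dicts contribute their values,
        # containers/Series/Index/ndarray are expanded, scalars are collected.
        flattened, stack = [], [nested]
        while stack:
            current = stack.pop()
            if isinstance(current, dict):
                stack.extend(current.values())
            elif isinstance(current, (list, tuple, set)):
                stack.extend(current)
            elif isinstance(current, pd.Series):
                stack.extend(current)
            elif isinstance(current, (pd.Index, np.ndarray)):
                stack.extend(current.tolist())
            else:
                flattened.append(current)
        return flattened

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    print(flatten_sketch([df.columns, np.array([5, 6])]))  # [6, 5, 'b', 'a'] (LIFO stack order)
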
@@ -1618,6 +1638,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     """
     Usage
     is_abnormal = is_df_abnormal(df, verbose=1)
+    True: abnormal
+    False: normal
 
     """
     # Initialize a list to hold messages about abnormalities
@@ -1645,25 +1667,34 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     if len(column_names) == 1 and delimiter_counts["\t"] > 1:
         messages.append("Abnormal: Column names are not split correctly.")
         is_abnormal = True
+        if verbose:
+            print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
 
     if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
         messages.append("Abnormal: Too many delimiters in column names.")
         is_abnormal = True
+        if verbose:
+            print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
 
     if delimiter_counts[""] > 3:
         messages.append("Abnormal: There are empty column names.")
         is_abnormal = True
+        if verbose:
+            print(f'delimiter_counts[""] > 3')
 
     if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
         messages.append("Abnormal: Some column names contain unexpected characters.")
         is_abnormal = True
+        if verbose:
+            print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
 
-    # Check for missing values
-    missing_values = df.isnull().sum()
-    if missing_values.any():
-        messages.append("Missing values in columns:")
-        messages.append(missing_values[missing_values > 0].to_string())
-        is_abnormal = True
+    # # Check for missing values
+    # missing_values = df.isnull().sum()
+    # if missing_values.any():
+    #     messages.append("Missing values in columns:")
+    #     messages.append(missing_values[missing_values > 0].to_string())
+    #     is_abnormal = True
+    #     print(f'missing_values.any()')
 
     # Check data types
     data_types = df.dtypes
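
The new verbose branches print which delimiter-count rule fired. The underlying idea is that a CSV read with the wrong separator leaves the separator characters embedded in the column names, which the function detects by counting them. A rough standalone illustration of that idea (count_delims is a hypothetical helper, not part of py2ls):

    import pandas as pd
    from io import StringIO

    def count_delims(columns, delimiters=("\t", ",", ";", "|", " ")):
        # Count how often each candidate delimiter appears inside the column names;
        # many hits suggest the header was not split on the right separator.
        return {d: sum(str(col).count(d) for col in columns) for d in delimiters}

    # Tab-separated data read with the default comma separator -> one mangled column name
    raw = "name\tage\tcity\nalice\t30\tberlin\n"
    df_bad = pd.read_csv(StringIO(raw))          # sep="," by default
    print(df_bad.columns.tolist())               # ['name\tage\tcity']
    print(count_delims(df_bad.columns))          # {'\t': 2, ...} -> looks abnormal

    df_ok = pd.read_csv(StringIO(raw), sep="\t")
    print(count_delims(df_ok.columns))           # all zeros -> looks normal
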
@@ -1674,6 +1705,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     if constant_columns:
         messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
         is_abnormal = True
+        if verbose:
+            print(f'df.columns[df.nunique() == 1].tolist()')
 
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
@@ -1681,6 +1714,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
             "Abnormal: DataFrame is too small (less than 2 rows or columns)."
         )
         is_abnormal = True
+        if verbose:
+            print(f'actual_shape[0] < 2 or actual_shape[1] < 2')
 
     # Compile results
     if verbose:
@@ -1723,10 +1758,36 @@ def fload(fpath, kind=None, **kwargs):
             content = yaml.safe_load(file)
         return content
 
-    def load_xml(fpath):
-        tree = etree.parse(fpath)
-        root = tree.getroot()
-        return etree.tostring(root, pretty_print=True).decode()
+
+    def load_xml(fpath, fsize_thr: int = 100):
+        def load_small_xml(fpath):
+            tree = etree.parse(fpath)
+            root = tree.getroot()
+            return etree.tostring(root, pretty_print=True).decode()
+
+        def load_large_xml(fpath):
+            xml_parts = []
+            context = etree.iterparse(
+                fpath, events=("start", "end"), recover=True, huge_tree=True
+            )
+
+            for event, elem in context:
+                if event == "end":
+                    xml_parts.append(etree.tostring(elem, pretty_print=True).decode())
+                    elem.clear()
+                    while elem.getprevious() is not None:
+                        del elem.getparent()[0]
+            del context
+            return "".join(xml_parts)
+
+        file_size = os.path.getsize(fpath) / 1024 / 1024  # in MB
+
+        if file_size > fsize_thr:
+            print(f"reading a big file:{file_size} Mb")
+            return load_large_xml(fpath)
+        else:
+            print(f"reading a small file:{file_size} Mb")
+            return load_small_xml(fpath)
 
     def get_comment(fpath, comment=None, encoding="utf-8", lines_to_check=5):
         """
@@ -1793,6 +1854,8 @@ def fload(fpath, kind=None, **kwargs):
                 on_bad_lines=on_bad_lines,
                 **kwargs,
             )
+            if is_df_abnormal(df, verbose=0):
+                raise ValueError("the df is abnormal")
         except:
             try:
                 try:
@@ -1820,7 +1883,6 @@ def fload(fpath, kind=None, **kwargs):
                         comment=comment,
                         **kwargs,
                     )
-
                     if is_df_abnormal(df, verbose=0):
                         raise ValueError("the df is abnormal")
                 except (UnicodeDecodeError, ValueError):
@@ -1856,7 +1918,8 @@ def fload(fpath, kind=None, **kwargs):
                 separators = [",", "\t", ";", "|", " "]
                 for sep in separators:
                     sep2show = sep if sep != "\t" else "\\t"
-                    print(f'trying with: engine=pyarrow, sep="{sep2show}"')
+                    # print(f'trying with: engine=pyarrow, sep="{sep2show}"')
+                    # print(".")
                     try:
                         df = pd.read_csv(
                             fpath,
@@ -1868,10 +1931,9 @@ def fload(fpath, kind=None, **kwargs):
                             **kwargs,
                         )
                         if not is_df_abnormal(df, verbose=0):  # normal
-                            break
-                        else:
-                            if is_df_abnormal(df, verbose=0):
-                                pass
+                            display(df.head(2))
+                            print(f"shape: {df.shape}")
+                            return df
                     except:
                         pass
                 else:
@@ -1880,8 +1942,9 @@ def fload(fpath, kind=None, **kwargs):
                 separators = [",", "\t", ";", "|", " "]
                 for sep in separators:
                     try:
-                        sep2show = sep if sep != "\t" else "\\t"
-                        print(f"trying with: engine={engine}, sep='{sep2show}'")
+                        # sep2show = sep if sep != "\t" else "\\t"
+                        # print(f"trying with: engine={engine}, sep='{sep2show}'")
+                        # print(".")
                         df = pd.read_csv(
                             fpath,
                             engine=engine,
@@ -1890,8 +1953,12 @@ def fload(fpath, kind=None, **kwargs):
                             comment=comment,
                             **kwargs,
                         )
+                        # display(df.head(2))
+                        # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
                         if not is_df_abnormal(df, verbose=0):
-                            break
+                            display(df.head(2))
+                            print(f"shape: {df.shape}")
+                            return df
                     except EmptyDataError as e:
                         continue
                 else:
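
Both separator-retry loops now return the first DataFrame that passes the is_df_abnormal check (after displaying its head and shape) instead of breaking out and falling through. The pattern, sketched independently of fload with a crude stand-in for the abnormality check ("data.txt" is a placeholder path):

    import pandas as pd

    def looks_normal(df):
        # Crude stand-in for is_df_abnormal: header split into several columns,
        # and no leftover delimiters inside the column names.
        return df.shape[1] > 1 and not any("\t" in str(c) or "," in str(c) for c in df.columns)

    def read_with_separator_retry(fpath, separators=(",", "\t", ";", "|", " ")):
        # Try each candidate separator and return the first parse that looks sane.
        for sep in separators:
            try:
                df = pd.read_csv(fpath, sep=sep, engine="python")
            except Exception:
                continue
            if looks_normal(df):
                return df
        raise ValueError(f"could not parse {fpath} with any of {separators}")

    # df = read_with_separator_retry("data.txt")  # placeholder path
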
@@ -2393,15 +2460,20 @@ def fsave(
     # json.dump(data, file, **kwargs)
 
     def save_json(fpath_fname, var_dict_or_df):
+        def _convert_js(data):
+            if isinstance(data, pd.DataFrame):
+                return data.to_dict(orient="list")
+            elif isinstance(data, np.ndarray):
+                return data.tolist()
+            elif isinstance(data, dict):
+                return {key: _convert_js(value) for key, value in data.items()}
+            return data
+
+        serializable_data = _convert_js(var_dict_or_df)
+
+        # Save the serializable data to the JSON file
         with open(fpath_fname, "w") as f_json:
-            if isinstance(var_dict_or_df, pd.DataFrame):
-                var_dict_or_df = var_dict_or_df.to_dict(orient="dict")
-            if isinstance(var_dict_or_df, dict):
-                for key, value in var_dict_or_df.items():
-                    if isinstance(value, np.ndarray):
-                        var_dict_or_df[key] = value.tolist()
-            # Save the dictionary or list of dictionaries to a JSON file
-            json.dump(var_dict_or_df, f_json, indent=4)
+            json.dump(serializable_data, f_json, indent=4)
 
     # # Example usage:
     # sets = {"title": "mse_path_ MSE"}
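
save_json now routes everything through the recursive _convert_js helper, so nested DataFrames become column-oriented dicts and NumPy arrays become lists before json.dump runs. A self-contained sketch of the same idea (to_serializable is an illustrative name, not the library helper):

    import json
    import numpy as np
    import pandas as pd

    def to_serializable(data):
        # Recursively rewrite pandas/NumPy containers into plain Python objects
        # that the json module can encode.
        if isinstance(data, pd.DataFrame):
            return data.to_dict(orient="list")
        if isinstance(data, np.ndarray):
            return data.tolist()
        if isinstance(data, dict):
            return {key: to_serializable(value) for key, value in data.items()}
        return data

    payload = {"df": pd.DataFrame({"a": [1, 2]}), "arr": np.arange(3), "note": "ok"}
    print(json.dumps(to_serializable(payload), indent=2))
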
@@ -2645,7 +2717,7 @@ def listdir(
         print(ls)
     df_all = pd.DataFrame(
         {
-            "fname": all_files,
+            "fname": ls,
             "fpath": [os.path.join(rootdir, i) for i in ls],
         }
     )
@@ -4789,7 +4861,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 def df_merge(
     df1: pd.DataFrame,
     df2: pd.DataFrame,
-    use_index: bool = True,
+    use_index: bool = False,
     columns: list = ["col_left", "col_right"],
     how: str = "left",
 ) -> pd.DataFrame:
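
Flipping the use_index default to False means df_merge now joins on the named key columns unless index-based merging is requested explicitly. For comparison, the two underlying pandas calls look roughly like this (col_left/col_right mirror the placeholder column names in the signature):

    import pandas as pd

    left = pd.DataFrame({"col_left": ["a", "b"], "x": [1, 2]})
    right = pd.DataFrame({"col_right": ["a", "b"], "y": [3, 4]})

    # use_index=False (new default): merge on the given key columns
    by_columns = pd.merge(left, right, left_on="col_left", right_on="col_right", how="left")

    # use_index=True (old default): merge on the row index instead
    by_index = pd.merge(left, right, left_index=True, right_index=True, how="left")

    print(by_columns)
    print(by_index)
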
@@ -4848,12 +4920,53 @@ def df_merge(
     )
     return df_merged
 
+def df_drop_duplicates(
+    data: pd.DataFrame,
+    by: Union[
+        str, List[str]
+    ] = "index",  # Options: 'index', or column name(s) for 'rows'
+    keep="first",  # Options: 'first', 'last', or False (drop all duplicates)
+    ignore_index=True,
+    inplace: bool = False,
+    verbose=True
+):
+    """
+    data (pd.DataFrame): DataFrame to drop duplicates from.
+    by (str): Specify by to drop duplicates:
+        - 'index': Drop duplicates based on the DataFrame index.
+        - Column name(s) for row-wise duplicate checking.
+    keep (str): Which duplicates to keep:
+        'first',
+        'last',
+        False (drop all duplicates).
+    inplace (bool): Whether to modify the original DataFrame in place.
+    """
+    original_shape = data.shape
+    if by == "index":
+        # Drop duplicates in the index
+        result = data[~data.index.duplicated(keep=keep)]
+    else:
+        # Drop duplicates row-wise based on column(s)
+        result = data.drop_duplicates(subset=by, keep=keep, ignore_index=ignore_index)
+    if original_shape != result.shape or verbose:
+        print(f"\nshape:{original_shape} (before drop_duplicates)")
+        print(f"shape:{result.shape} (after drop_duplicates)")
+    if inplace:
+        # Modify the original DataFrame in place
+        data.drop(data.index, inplace=True)  # Drop all rows first
+        data[data.columns] = result  # Refill the DataFrame
+        return None
+    else:
+        return result
 def df_fillna(
     data: pd.DataFrame,
-    method: str = "mean",
+    method: str = "knn",
     axis: int = 0,  # column-wise
     constant: float = None,
+    n_neighbors: int = 5,  # KNN-specific
+    max_iter: int = 10,  # Iterative methods specific
     inplace: bool = True,
+    random_state: int = None
 ) -> pd.DataFrame:
     """
     Fill missing values in a DataFrame using specified imputation method.
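
df_drop_duplicates is new in this release: by='index' filters duplicated index labels, while a column name (or list of names) defers to DataFrame.drop_duplicates; inplace=True rewrites the caller's frame and returns None. A hedged usage sketch against a toy frame (the import path is assumed from the diffed file being py2ls/ips.py):

    import pandas as pd
    from py2ls.ips import df_drop_duplicates  # assumed import path

    df = pd.DataFrame(
        {"name": ["a", "a", "b"], "score": [1, 2, 3]},
        index=["r1", "r1", "r2"],
    )

    # Keep the first row for each duplicated index label
    print(df_drop_duplicates(df, by="index", keep="first", inplace=False))

    # Drop rows that repeat a value in the "name" column, keeping the last one
    print(df_drop_duplicates(df, by="name", keep="last", inplace=False))
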
@@ -4865,8 +4978,15 @@ def df_fillna(
         - 'median': Replace missing values with the median of the column.
         - 'most_frequent': Replace missing values with the most frequent value in the column.
         - 'constant': Replace missing values with a constant value provided by the `constant` parameter.
-        - 'knn': Use K-Nearest Neighbors imputation.
-        - 'iterative': Use Iterative imputation.
+        - 'knn': Use K-Nearest Neighbors imputation; replaces missing values based on the values of the nearest neighbors.
+        - 'iterative': Use Iterative imputation; models each feature with missing values as a function of the other features and estimates them iteratively.
+        - 'mice' (Multivariate Imputation by Chained Equations): a special case of iterative imputation.
+        # - 'missforest': A random forest-based imputation method; uses a random forest model to predict and fill missing values.
+        # - 'softimpute': Matrix factorization imputation; missing values are imputed by
+        #   reconstructing the data matrix using a low-rank approximation.
+        # - EM (Expectation-Maximization): Often used in advanced statistics to estimate missing values in a probabilistic framework.
+        # - 'svd': Use IterativeSVD (matrix factorization via Singular Value Decomposition).
+
     axis (int): The axis along which to impute:
         - 0: Impute column-wise (default).
         - 1: Impute row-wise.
@@ -4879,7 +4999,8 @@ def df_fillna(
         raise ValueError("Input DataFrame is empty.")
 
     # Validate method
-    methods = ["mean", "median", "most_frequent", "constant", "knn", "iterative"]
+    methods = ["mean", "median", "most_frequent",
+               "constant", "knn", "iterative"]  # ,"missforest","softimpute","svd"]
     method = strcmp(method, methods)[0]
 
     # If using constant method, ask for a constant value
@@ -4892,18 +5013,27 @@ def df_fillna(
 
     # Initialize SimpleImputer with the chosen method
     if method == "constant":
+        from sklearn.impute import SimpleImputer
         imputer = SimpleImputer(strategy=method, fill_value=constant)
     elif method == "knn":
         from sklearn.impute import KNNImputer
-
         imputer = KNNImputer(n_neighbors=n_neighbors)
-    elif method == "iterative":
+    elif method == "iterative" or method == "mice":
+        from sklearn.experimental import enable_iterative_imputer
         from sklearn.impute import IterativeImputer
 
-        imputer = IterativeImputer(max_iter=max_iter)
-    else:
+        imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
+    # elif method == "missforest":
+    #     from missingpy import MissForest
+    #     imputer = MissForest(max_iter=max_iter, random_state=random_state)
+    # elif method == "softimpute":
+    #     from fancyimpute import SoftImpute
+    #     imputer = SoftImpute()
+    # elif method == "svd":
+    #     from fancyimpute import IterativeSVD
+    #     imputer = IterativeSVD(max_iters=max_iter)
+    else:  # mean, median, most_frequent
         from sklearn.impute import SimpleImputer
-
         imputer = SimpleImputer(strategy=method)
 
     # Fit and transform the data
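
With the default now method='knn' and the new 'mice'/'iterative' branch, the imputation itself is delegated to scikit-learn. A minimal sketch of the two estimators those branches construct (IterativeImputer still requires the experimental enable_iterative_imputer import, exactly as the diff adds it):

    import numpy as np
    import pandas as pd
    from sklearn.impute import KNNImputer
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    df = pd.DataFrame({"A": [1.0, 2.0, np.nan, 4.0], "B": [np.nan, 2.0, 3.0, 4.0]})

    # method="knn": fill each gap from the n_neighbors closest rows
    knn_filled = pd.DataFrame(KNNImputer(n_neighbors=2).fit_transform(df), columns=df.columns)

    # method="iterative"/"mice": model each column from the others, iterating up to max_iter times
    mice_filled = pd.DataFrame(
        IterativeImputer(max_iter=10, random_state=0).fit_transform(df), columns=df.columns
    )

    print(knn_filled)
    print(mice_filled)
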
@@ -4929,8 +5059,38 @@ def df_fillna(
         return None  # replace original
     else:
         return df_filled
+    # # example
+    # data = {
+    #     "A": [1, 2, np.nan, 4, 5],
+    #     "B": [np.nan, 2, 3, 4, np.nan],
+    #     "C": [1, np.nan, 3, 4, 5],
+    #     "D": [1, 2, 3, 4, np.nan],
+    # }
+
+    # # Define a function to test each imputation method
+    # methods = [
+    #     "mean",
+    #     "median",
+    #     "most_frequent",
+    #     "constant",
+    #     "knn",
+    #     "iterative",
+    #     # "missforest",
+    #     # "softimpute",
+    #     # "svd",
+    # ]
+
+    # # Create a dictionary to hold results
+    # results = {}
+
+    # for method_name in methods:
+    #     print(method_name)
+    #     display(df)
+    #     display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
+
+
 def df_scaler(
-    data: pd.DataFrame,
+    data: pd.DataFrame,  # should be numeric dtype
     method="standard",
     columns=None,  # default, select all numeric col/row
     inplace=False,
@@ -5414,7 +5574,7 @@ def df_reducer(
 
     # Select columns if specified, else use all columns
     X = data[columns].values if columns else data.values
-
+    print(X.shape, type(X))
     # Handle missing values
     if fill_missing:
         imputer = SimpleImputer(strategy="mean")
@@ -5620,15 +5780,19 @@ def df_reducer(
         # If inplace=True, add components back into the original data
         for col_idx in range(n_components):
             data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
-
         # Add extra info for PCA/UMAP
         if method == "pca":
-            data["Explained Variance"] = reduced_df["Explained Variance"]
-            data["Singular Values"] = reduced_df["Singular Values"]
-        elif method == "umap":
+            for i in range(n_components):
+                data[f"Explained Variance PC_{i+1}"] = reduced_df[f"Explained Variance PC_{i+1}"]
+            for i in range(n_components):
+                data[f"Singular Values PC_{i+1}"] = reduced_df[f"Singular Values PC_{i+1}"]
+        elif method == "umap":
+            for i in range(n_components):
+                data[f"UMAP_{i+1}"] = reduced_df[f"UMAP_{i+1}"]
             data["Embedding"] = reduced_df["Embedding"]
             data["Trustworthiness"] = reduced_df["Trustworthiness"]
         return None  # No return when inplace=True
+
 
     return reduced_df
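
The inplace branch of df_reducer now writes one 'Explained Variance PC_i' and 'Singular Values PC_i' column per component (and per-component UMAP_i columns for UMAP) instead of a single shared column. For reference, a sketch of where such per-component numbers come from with scikit-learn's PCA on random data; this is an illustration, not py2ls's df_reducer, and it uses explained_variance_ratio_ as the "explained variance" quantity:

    import numpy as np
    import pandas as pd
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(0)
    data = pd.DataFrame(rng.normal(size=(100, 5)), columns=list("abcde"))

    n_components = 2
    pca = PCA(n_components=n_components)
    scores = pca.fit_transform(data)

    reduced = pd.DataFrame(
        scores, columns=[f"PCA_{i+1}" for i in range(n_components)], index=data.index
    )
    for i in range(n_components):
        # One scalar per component, broadcast into its own column as the diff does
        reduced[f"Explained Variance PC_{i+1}"] = pca.explained_variance_ratio_[i]
        reduced[f"Singular Values PC_{i+1}"] = pca.singular_values_[i]

    print(reduced.head())
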