py2ls 0.2.4.5__py3-none-any.whl → 0.2.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/index +0 -0
- py2ls/bio.py +562 -52
- py2ls/ips.py +161 -63
- py2ls/mol.py +289 -0
- py2ls/plot.py +274 -132
- {py2ls-0.2.4.5.dist-info → py2ls-0.2.4.7.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.5.dist-info → py2ls-0.2.4.7.dist-info}/RECORD +8 -7
- {py2ls-0.2.4.5.dist-info → py2ls-0.2.4.7.dist-info}/WHEEL +1 -1
    
        py2ls/ips.py
    CHANGED
    
    | @@ -60,6 +60,7 @@ except NameError: | |
| 60 60 | 
             
            def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
         | 
| 61 61 | 
             
                """
         | 
| 62 62 | 
             
                Add the Chinese (default) font to the font manager
         | 
| 63 | 
            +
                show chinese
         | 
| 63 64 | 
             
                Args:
         | 
| 64 65 | 
             
                    dir_font (str, optional): _description_. Defaults to "/System/Library/Fonts/Hiragino Sans GB.ttc".
         | 
| 65 66 | 
             
                """
         | 
| @@ -554,14 +555,28 @@ def shared(*args, strict=True, n_shared=2, verbose=True): | |
| 554 555 | 
             
                    # Get elements that appear in at least n_shared lists
         | 
| 555 556 | 
             
                    shared_elements = [item for item, count in element_count.items() if count >= n_shared]
         | 
| 556 557 |  | 
| 557 | 
            -
                shared_elements = flatten(shared_elements, verbose=verbose) | 
| 558 | 
            +
                shared_elements = flatten(shared_elements, verbose=verbose)
         | 
| 558 559 | 
             
                if verbose:
         | 
| 559 560 | 
             
                    elements2show = shared_elements if len(shared_elements)<10 else shared_elements[:5]
         | 
| 560 561 | 
             
                    print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
         | 
| 561 562 | 
             
                    print("********* checking shared elements *********")
         | 
| 562 563 | 
             
                return shared_elements
         | 
| 563 564 |  | 
| 564 | 
            -
            def  | 
| 565 | 
            +
            def not_shared(*args, strict=True, n_shared=2, verbose=False):
         | 
| 566 | 
            +
                """
         | 
| 567 | 
            +
                To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
         | 
| 568 | 
            +
                usage:
         | 
| 569 | 
            +
                    list1 = [1, 8, 3, 3, 4, 5]
         | 
| 570 | 
            +
                    list2 = [4, 5, 6, 7, 8]
         | 
| 571 | 
            +
                    not_shared(list1,list2)# output [1,3]
         | 
| 572 | 
            +
                """
         | 
| 573 | 
            +
                _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
         | 
| 574 | 
            +
                list1 = args[0]
         | 
| 575 | 
            +
                _not_shared=[item for item in list1 if item not in _common]
         | 
| 576 | 
            +
                return flatten(_not_shared, verbose=verbose)
         | 
| 577 | 
            +
             | 
| 578 | 
            +
             | 
| 579 | 
            +
            def flatten(nested: Any, unique_list=True, verbose=False):
         | 
| 565 580 | 
             
                """
         | 
| 566 581 | 
             
                Recursively flattens a nested structure (lists, tuples, dictionaries, sets) into a single list.
         | 
| 567 582 | 
             
                Parameters:
         | 
| @@ -589,7 +604,7 @@ def flatten(nested: Any, unique_list=True, verbose=True): | |
| 589 604 | 
             
                else:
         | 
| 590 605 | 
             
                    return flattened_list
         | 
| 591 606 |  | 
| 592 | 
            -
            def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"):
         | 
| 607 | 
            +
            def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=False, scorer="WR"):
         | 
| 593 608 | 
             
                """
         | 
| 594 609 | 
             
                Compares a search term with a list of candidate strings and finds the best match based on similarity score.
         | 
| 595 610 |  | 
| @@ -623,6 +638,11 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR" | |
| 623 638 | 
             
                        similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
         | 
| 624 639 | 
             
                    else:
         | 
| 625 640 | 
             
                        similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
         | 
| 641 | 
            +
                    if get_rank:
         | 
| 642 | 
            +
                        idx = [similarity_scores.index(i) for i in sorted(similarity_scores,reverse=True)]
         | 
| 643 | 
            +
                        if verbose:
         | 
| 644 | 
            +
                            display([candidates[ii] for ii in idx])
         | 
| 645 | 
            +
                        return [candidates[ii] for ii in idx]
         | 
| 626 646 | 
             
                    best_match_index = similarity_scores.index(max(similarity_scores))
         | 
| 627 647 | 
             
                    best_match_score = similarity_scores[best_match_index]
         | 
| 628 648 | 
             
                else:
         | 
| @@ -1554,13 +1574,26 @@ def unzip(dir_path, output_dir=None): | |
| 1554 1574 | 
             
                        tar_ref.extractall(output_dir)
         | 
| 1555 1575 | 
             
                    return output_dir
         | 
| 1556 1576 | 
             
                # Handle .gz files
         | 
| 1557 | 
            -
                if dir_path.endswith(".gz"):
         | 
| 1577 | 
            +
                if dir_path.endswith(".gz") or dir_path.endswith(".gzip"):
         | 
| 1558 1578 | 
             
                    import gzip
         | 
| 1559 1579 |  | 
| 1560 1580 | 
             
                    output_file = os.path.splitext(dir_path)[0]  # remove the .gz extension
         | 
| 1561 | 
            -
                     | 
| 1562 | 
            -
                        with open( | 
| 1563 | 
            -
                             | 
| 1581 | 
            +
                    try:
         | 
| 1582 | 
            +
                        with gzip.open(dir_path, "rb") as gz_file:
         | 
| 1583 | 
            +
                            with open(output_file, "wb") as out_file:
         | 
| 1584 | 
            +
                                shutil.copyfileobj(gz_file, out_file)
         | 
| 1585 | 
            +
                        print(f"unzipped '{dir_path}' to '{output_file}'")
         | 
| 1586 | 
            +
                    except FileNotFoundError:
         | 
| 1587 | 
            +
                        print(f"Error: The file '{dir_path}' was not found.")
         | 
| 1588 | 
            +
                    except PermissionError:
         | 
| 1589 | 
            +
                        print(f"Error: Permission denied when accessing '{dir_path}' or writing to '{output_file}'.")
         | 
| 1590 | 
            +
                    except Exception as e:
         | 
| 1591 | 
            +
                        try:
         | 
| 1592 | 
            +
                            import tarfile
         | 
| 1593 | 
            +
                            with tarfile.open(dir_path, 'r:gz') as tar:
         | 
| 1594 | 
            +
                                tar.extractall(path=output_file)
         | 
| 1595 | 
            +
                        except Exception as final_e:
         | 
| 1596 | 
            +
                            print(f"An final unexpected error occurred: {final_e}")
         | 
| 1564 1597 | 
             
                    return output_file
         | 
| 1565 1598 |  | 
| 1566 1599 | 
             
                # Handle .zip files
         | 
| @@ -1642,9 +1675,12 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool: | |
| 1642 1675 | 
             
                False: normal
         | 
| 1643 1676 |  | 
| 1644 1677 | 
             
                """
         | 
| 1678 | 
            +
                if not isinstance(df, pd.DataFrame):
         | 
| 1679 | 
            +
                    return False
         | 
| 1680 | 
            +
                df.columns = df.columns.astype(str)# convert the column names to str so that count operations can be run on them
         | 
| 1645 1681 | 
             
                # Initialize a list to hold messages about abnormalities
         | 
| 1646 1682 | 
             
                messages = []
         | 
| 1647 | 
            -
                is_abnormal =  | 
| 1683 | 
            +
                is_abnormal = True
         | 
| 1648 1684 | 
             
                # Check the shape of the DataFrame
         | 
| 1649 1685 | 
             
                actual_shape = df.shape
         | 
| 1650 1686 | 
             
                messages.append(f"Shape of DataFrame: {actual_shape}")
         | 
| @@ -1739,10 +1775,12 @@ def fload(fpath, kind=None, **kwargs): | |
| 1739 1775 | 
             
                        content = file.read()
         | 
| 1740 1776 | 
             
                    return content
         | 
| 1741 1777 |  | 
| 1742 | 
            -
                def load_html(fpath):
         | 
| 1743 | 
            -
             | 
| 1744 | 
            -
             | 
| 1745 | 
            -
             | 
| 1778 | 
            +
                # def load_html(fpath):
         | 
| 1779 | 
            +
                #     with open(fpath, "r") as file:
         | 
| 1780 | 
            +
                #         content = file.read()
         | 
| 1781 | 
            +
                #     return content
         | 
| 1782 | 
            +
                def load_html(fpath,**kwargs):
         | 
| 1783 | 
            +
                    return pd.read_html(fpath,**kwargs)
         | 
| 1746 1784 |  | 
| 1747 1785 | 
             
                def load_json(fpath, **kwargs):
         | 
| 1748 1786 | 
             
                    output=kwargs.pop("output","json")
         | 
| @@ -1956,8 +1994,8 @@ def fload(fpath, kind=None, **kwargs): | |
| 1956 1994 | 
             
                                            # display(df.head(2))
         | 
| 1957 1995 | 
             
                                            # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
         | 
| 1958 1996 | 
             
                                            if not is_df_abnormal(df, verbose=0):
         | 
| 1959 | 
            -
                                                display(df.head(2))
         | 
| 1960 | 
            -
                                                print(f"shape: {df.shape}")
         | 
| 1997 | 
            +
                                                display(df.head(2)) if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
         | 
| 1998 | 
            +
                                                print(f"shape: {df.shape}") if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
         | 
| 1961 1999 | 
             
                                                return df
         | 
| 1962 2000 | 
             
                                        except EmptyDataError as e:
         | 
| 1963 2001 | 
             
                                            continue
         | 
| @@ -1981,6 +2019,42 @@ def fload(fpath, kind=None, **kwargs): | |
| 1981 2019 | 
             
                        pass
         | 
| 1982 2020 | 
             
                    return df
         | 
| 1983 2021 |  | 
| 2022 | 
            +
             | 
| 2023 | 
            +
                def load_parquet(fpath, **kwargs):
         | 
| 2024 | 
            +
                    """
         | 
| 2025 | 
            +
                    Load a Parquet file into a Pandas DataFrame with advanced options.
         | 
| 2026 | 
            +
             | 
| 2027 | 
            +
                    Parameters:
         | 
| 2028 | 
            +
                    - fpath (str): The file path to the Parquet file.
         | 
| 2029 | 
            +
                    - engine (str): The engine to use for reading the Parquet file (default is 'pyarrow').
         | 
| 2030 | 
            +
                    - columns (list): List of columns to load. If None, loads all columns.
         | 
| 2031 | 
            +
                    - verbose (bool): If True, prints additional information about the loading process.
         | 
| 2032 | 
            +
                    - filters (list): List of filter conditions for predicate pushdown.
         | 
| 2033 | 
            +
                    - **kwargs: Additional keyword arguments for `pd.read_parquet`.
         | 
| 2034 | 
            +
             | 
| 2035 | 
            +
                    Returns:
         | 
| 2036 | 
            +
                    - df (DataFrame): The loaded DataFrame.
         | 
| 2037 | 
            +
                    """
         | 
| 2038 | 
            +
                    
         | 
| 2039 | 
            +
                    engine = kwargs.get("engine", "pyarrow")
         | 
| 2040 | 
            +
                    verbose = kwargs.pop("verbose", False)
         | 
| 2041 | 
            +
                    
         | 
| 2042 | 
            +
                    if verbose:
         | 
| 2043 | 
            +
                        use_pd("read_parquet", verbose=verbose)
         | 
| 2044 | 
            +
                    try:
         | 
| 2045 | 
            +
                        df = pd.read_parquet(fpath, engine=engine, **kwargs)
         | 
| 2046 | 
            +
                        if verbose:
         | 
| 2047 | 
            +
                            if 'columns' in kwargs:
         | 
| 2048 | 
            +
                                print(f"Loaded columns: {kwargs['columns']}")
         | 
| 2049 | 
            +
                            else:
         | 
| 2050 | 
            +
                                print("Loaded all columns.")
         | 
| 2051 | 
            +
                        print(f"shape: {df.shape}")
         | 
| 2052 | 
            +
                    except Exception as e:
         | 
| 2053 | 
            +
                        print(f"An error occurred while loading the Parquet file: {e}")
         | 
| 2054 | 
            +
                        df = None
         | 
| 2055 | 
            +
             | 
| 2056 | 
            +
                    return df 
         | 
| 2057 | 
            +
             | 
| 1984 2058 | 
             
                def load_ipynb(fpath, **kwargs):
         | 
| 1985 2059 | 
             
                    as_version = kwargs.get("as_version", 4)
         | 
| 1986 2060 | 
             
                    with open(fpath, "r") as file:
         | 
| @@ -2049,51 +2123,21 @@ def fload(fpath, kind=None, **kwargs): | |
| 2049 2123 | 
             
                    kind = kind.lower()
         | 
| 2050 2124 | 
             
                kind = kind.lstrip(".").lower()
         | 
| 2051 2125 | 
             
                img_types = [
         | 
| 2052 | 
            -
                    "bmp",
         | 
| 2053 | 
            -
                    " | 
| 2054 | 
            -
                    "gif",
         | 
| 2055 | 
            -
                    "icns",
         | 
| 2056 | 
            -
                    "ico",
         | 
| 2057 | 
            -
                    "im",
         | 
| 2058 | 
            -
                    "jpg",
         | 
| 2059 | 
            -
                    "jpeg",
         | 
| 2060 | 
            -
                    "jpeg2000",
         | 
| 2061 | 
            -
                    "msp",
         | 
| 2062 | 
            -
                    "pcx",
         | 
| 2063 | 
            -
                    "png",
         | 
| 2064 | 
            -
                    "ppm",
         | 
| 2065 | 
            -
                    "sgi",
         | 
| 2066 | 
            -
                    "spider",
         | 
| 2067 | 
            -
                    "tga",
         | 
| 2068 | 
            -
                    "tiff",
         | 
| 2069 | 
            -
                    "tif",
         | 
| 2070 | 
            -
                    "webp",
         | 
| 2071 | 
            -
                    "json",
         | 
| 2126 | 
            +
                    "bmp","eps","gif","png","jpg","jpeg","jpeg2000","tiff","tif",
         | 
| 2127 | 
            +
                    "icns","ico","im","msp","pcx","ppm","sgi","spider","tga","webp",
         | 
| 2072 2128 | 
             
                ]
         | 
| 2073 2129 | 
             
                doc_types = [
         | 
| 2074 | 
            -
                    "docx",
         | 
| 2075 | 
            -
                    "txt",
         | 
| 2076 | 
            -
                    "md",
         | 
| 2077 | 
            -
                    " | 
| 2078 | 
            -
                    "json",
         | 
| 2079 | 
            -
                    "yaml",
         | 
| 2080 | 
            -
                    "xml",
         | 
| 2081 | 
            -
                    "csv",
         | 
| 2082 | 
            -
                    "xlsx",
         | 
| 2083 | 
            -
                    "pdf",
         | 
| 2130 | 
            +
                    "docx","pdf",
         | 
| 2131 | 
            +
                    "txt","csv","xlsx","tsv","parquet","snappy",
         | 
| 2132 | 
            +
                    "md","html",
         | 
| 2133 | 
            +
                    "json","yaml","xml",
         | 
| 2084 2134 | 
             
                    "ipynb",
         | 
| 2135 | 
            +
                    "mtx"
         | 
| 2085 2136 | 
             
                ]
         | 
| 2086 2137 | 
             
                zip_types = [
         | 
| 2087 | 
            -
                    "gz",
         | 
| 2088 | 
            -
                    " | 
| 2089 | 
            -
                    " | 
| 2090 | 
            -
                    "tar",
         | 
| 2091 | 
            -
                    "tar.gz",
         | 
| 2092 | 
            -
                    "tar.bz2",
         | 
| 2093 | 
            -
                    "bz2",
         | 
| 2094 | 
            -
                    "xz",
         | 
| 2095 | 
            -
                    "rar",
         | 
| 2096 | 
            -
                    "tgz",
         | 
| 2138 | 
            +
                    "gz","zip","7z","rar","tgz",
         | 
| 2139 | 
            +
                    "tar","tar.gz","tar.bz2",
         | 
| 2140 | 
            +
                    "bz2","xz","gzip"
         | 
| 2097 2141 | 
             
                ]
         | 
| 2098 2142 | 
             
                other_types = ["fcs"]
         | 
| 2099 2143 | 
             
                supported_types = [*doc_types, *img_types, *zip_types, *other_types]
         | 
| @@ -2122,14 +2166,14 @@ def fload(fpath, kind=None, **kwargs): | |
| 2122 2166 | 
             
                elif kind == "txt" or kind == "md":
         | 
| 2123 2167 | 
             
                    return load_txt_md(fpath)
         | 
| 2124 2168 | 
             
                elif kind == "html":
         | 
| 2125 | 
            -
                    return load_html(fpath)
         | 
| 2169 | 
            +
                    return load_html(fpath, **kwargs)
         | 
| 2126 2170 | 
             
                elif kind == "json":
         | 
| 2127 | 
            -
                    return load_json(fpath)
         | 
| 2171 | 
            +
                    return load_json(fpath, **kwargs)
         | 
| 2128 2172 | 
             
                elif kind == "yaml":
         | 
| 2129 2173 | 
             
                    return load_yaml(fpath)
         | 
| 2130 2174 | 
             
                elif kind == "xml":
         | 
| 2131 2175 | 
             
                    return load_xml(fpath)
         | 
| 2132 | 
            -
                elif kind  | 
| 2176 | 
            +
                elif kind in ["csv","tsv"]:
         | 
| 2133 2177 | 
             
                    content = load_csv(fpath, **kwargs)
         | 
| 2134 2178 | 
             
                    return content
         | 
| 2135 2179 | 
             
                elif kind in ["ods", "ods", "odt"]:
         | 
| @@ -2140,14 +2184,25 @@ def fload(fpath, kind=None, **kwargs): | |
| 2140 2184 | 
             
                    engine = kwargs.get("engine", "xlrd")
         | 
| 2141 2185 | 
             
                    kwargs.pop("engine", None)
         | 
| 2142 2186 | 
             
                    content = load_excel(fpath, engine=engine, **kwargs)
         | 
| 2187 | 
            +
                    print(f"shape: {content.shape}")
         | 
| 2143 2188 | 
             
                    display(content.head(3))
         | 
| 2144 2189 | 
             
                    return content
         | 
| 2145 2190 | 
             
                elif kind == "xlsx":
         | 
| 2146 2191 | 
             
                    content = load_excel(fpath, **kwargs)
         | 
| 2147 2192 | 
             
                    display(content.head(3))
         | 
| 2193 | 
            +
                    print(f"shape: {content.shape}")
         | 
| 2194 | 
            +
                    return content
         | 
| 2195 | 
            +
                elif kind=='mtx':
         | 
| 2196 | 
            +
                    from scipy.io import mmread
         | 
| 2197 | 
            +
                    dat_mtx=mmread(fpath)
         | 
| 2198 | 
            +
                    content=pd.DataFrame.sparse.from_spmatrix(dat_mtx,**kwargs)
         | 
| 2199 | 
            +
                    display(content.head(3))
         | 
| 2200 | 
            +
                    print(f"shape: {content.shape}")
         | 
| 2148 2201 | 
             
                    return content
         | 
| 2149 2202 | 
             
                elif kind == "ipynb":
         | 
| 2150 2203 | 
             
                    return load_ipynb(fpath, **kwargs)
         | 
| 2204 | 
            +
                elif kind in ['parquet','snappy']:
         | 
| 2205 | 
            +
                    return load_parquet(fpath,**kwargs)
         | 
| 2151 2206 | 
             
                elif kind == "pdf":
         | 
| 2152 2207 | 
             
                    # print('usage:load_pdf(fpath, page="all", verbose=False)')
         | 
| 2153 2208 | 
             
                    return load_pdf(fpath, **kwargs)
         | 
| @@ -2193,9 +2248,7 @@ def fload(fpath, kind=None, **kwargs): | |
| 2193 2248 | 
             
                    return meta, data
         | 
| 2194 2249 |  | 
| 2195 2250 | 
             
                else:
         | 
| 2196 | 
            -
                     | 
| 2197 | 
            -
                    #     content = load_csv(fpath, **kwargs)
         | 
| 2198 | 
            -
                    # except:
         | 
| 2251 | 
            +
                    print("direct reading...")
         | 
| 2199 2252 | 
             
                    try:
         | 
| 2200 2253 | 
             
                        try:
         | 
| 2201 2254 | 
             
                            with open(fpath, "r", encoding="utf-8") as f:
         | 
| @@ -2495,6 +2548,25 @@ def fsave( | |
| 2495 2548 | 
             
                    tree = etree.ElementTree(root)
         | 
| 2496 2549 | 
             
                    tree.write(fpath, pretty_print=True, xml_declaration=True, encoding="UTF-8")
         | 
| 2497 2550 |  | 
| 2551 | 
            +
                def save_parquet(fpath:str, data:pd.DataFrame, **kwargs): 
         | 
| 2552 | 
            +
                    engine = kwargs.pop("engine","auto") # "auto" tries pyarrow first and falls back to fastparquet if that fails; options: {‘auto’, ‘pyarrow’, ‘fastparquet’}
         | 
| 2553 | 
            +
                    compression=kwargs.pop("compression",None) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
         | 
| 2554 | 
            +
                    try:
         | 
| 2555 | 
            +
                        # Attempt to save with "pyarrow" if engine is set to "auto"
         | 
| 2556 | 
            +
                            data.to_parquet(fpath, engine=engine, compression=compression, **kwargs)
         | 
| 2557 | 
            +
                            print(f"DataFrame successfully saved to {fpath} with engine '{engine}' and {compression} compression.") 
         | 
| 2558 | 
            +
                    except Exception as e:
         | 
| 2559 | 
            +
                        print(f"Error using with engine '{engine}' and {compression} compression: {e}")
         | 
| 2560 | 
            +
                        if "Sparse" in str(e):
         | 
| 2561 | 
            +
                            try:
         | 
| 2562 | 
            +
                                # Handle sparse data by converting columns to dense
         | 
| 2563 | 
            +
                                print("Attempting to convert sparse columns to dense format...")
         | 
| 2564 | 
            +
                                data = data.apply(lambda x: x.sparse.to_dense() if pd.api.types.is_sparse(x) else x)
         | 
| 2565 | 
            +
                                save_parquet(fpath, data=data,**kwargs)
         | 
| 2566 | 
            +
                            except Exception as last_e:
         | 
| 2567 | 
            +
                                print(f"After converted sparse columns to dense format, Error using with engine '{engine}' and {compression} compression: {last_e}")
         | 
| 2568 | 
            +
                                
         | 
| 2569 | 
            +
             | 
| 2498 2570 | 
             
                if kind is None:
         | 
| 2499 2571 | 
             
                    _, kind = os.path.splitext(fpath)
         | 
| 2500 2572 | 
             
                    kind = kind.lower()
         | 
| @@ -2540,6 +2612,15 @@ def fsave( | |
| 2540 2612 | 
             
                    save_yaml(fpath, content, **kwargs)
         | 
| 2541 2613 | 
             
                elif kind == "ipynb":
         | 
| 2542 2614 | 
             
                    save_ipynb(fpath, content, **kwargs)
         | 
| 2615 | 
            +
                elif kind.lower() in ["parquet","pq","big","par"]:
         | 
| 2616 | 
            +
                    compression=kwargs.pop("compression",None) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
         | 
| 2617 | 
            +
                    # fix the fpath ends
         | 
| 2618 | 
            +
                    if not '.parquet' in fpath:
         | 
| 2619 | 
            +
                        fpath=fpath.replace(kind, 'parquet')
         | 
| 2620 | 
            +
                    if compression is not None:
         | 
| 2621 | 
            +
                        if not fpath.endswith(compression):
         | 
| 2622 | 
            +
                            fpath=fpath+f".{compression}"
         | 
| 2623 | 
            +
                    save_parquet(fpath=fpath, data=content,compression=compression,**kwargs)
         | 
| 2543 2624 | 
             
                else:
         | 
| 2544 2625 | 
             
                    try:
         | 
| 2545 2626 | 
             
                        netfinder.downloader(url=content, dir_save=dirname(fpath), kind=kind)
         | 
| @@ -3058,8 +3139,11 @@ def figsave(*args, dpi=300): | |
| 3058 3139 |  | 
| 3059 3140 | 
             
            def is_str_color(s):
         | 
| 3060 3141 | 
             
                # Regular expression pattern for hexadecimal color codes
         | 
| 3061 | 
            -
                 | 
| 3062 | 
            -
             | 
| 3142 | 
            +
                if isinstance(s,str):
         | 
| 3143 | 
            +
                    color_code_pattern = r"^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{8})$"
         | 
| 3144 | 
            +
                    return re.match(color_code_pattern, s) is not None
         | 
| 3145 | 
            +
                else:
         | 
| 3146 | 
            +
                    return True
         | 
| 3063 3147 |  | 
| 3064 3148 |  | 
| 3065 3149 | 
             
            def is_num(s):
         | 
| @@ -5509,7 +5593,21 @@ def df_reducer( | |
| 5509 5593 | 
             
            ) -> pd.DataFrame:
         | 
| 5510 5594 | 
             
                """
         | 
| 5511 5595 | 
             
                Reduces the dimensionality of the selected DataFrame using PCA or UMAP.
         | 
| 5512 | 
            -
             | 
| 5596 | 
            +
                method: 
         | 
| 5597 | 
            +
                    1. 'umap': 
         | 
| 5598 | 
            +
                        - big dataset and global structure, often preferred in large-scale datasets for 
         | 
| 5599 | 
            +
                        visualization and dimensionality reduction, balancing speed and quality of visualization.
         | 
| 5600 | 
            +
                        - t-SNE excels at preserving local structure (i.e., clusters), but it often loses global 
         | 
| 5601 | 
            +
                        relationships, causing clusters to appear in arbitrary proximities to each other.
         | 
| 5602 | 
            +
                    2. 'pca': 
         | 
| 5603 | 
            +
                        - linear projection that preserves global variance structure; unlike t-SNE, it does not 
         | 
| 5604 | 
            +
                            emphasize local cluster structure and cannot capture non-linear relationships.
         | 
| 5605 | 
            +
                        - useful as a preprocessing step and in datasets where linear relationships dominate.
         | 
| 5606 | 
            +
                    3. 't-SNE': 
         | 
| 5607 | 
            +
                        a. t-SNE excels at preserving local structure (i.e., clusters), but it often loses global 
         | 
| 5608 | 
            +
                            relationships, causing clusters to appear in arbitrary proximities to each other.
         | 
| 5609 | 
            +
                        b. computationally expensive on large-scale datasets, where UMAP is usually the better 
         | 
| 5610 | 
            +
                            choice for balancing speed and quality of visualization.
         | 
| 5513 5611 | 
             
                Parameters:
         | 
| 5514 5612 | 
             
                -----------
         | 
| 5515 5613 | 
             
                data : pd.DataFrame
         | 
    
        py2ls/mol.py
    ADDED
    
    | @@ -0,0 +1,289 @@ | |
| 1 | 
            +
            import os
         | 
| 2 | 
            +
            import subprocess
         | 
| 3 | 
            +
            from rdkit import Chem
         | 
| 4 | 
            +
            from rdkit.Chem import AllChem,Draw
         | 
| 5 | 
            +
            from openbabel import openbabel
         | 
| 6 | 
            +
            import matplotlib.pyplot as plt
         | 
| 7 | 
            +
            # import pymol2  # 使用 PyMOL API 进行分子展示
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            from typing import Any, Dict, Union, List
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            def load_mol(fpath: str) -> Union[Dict[str, Any], None]:
                """
                Read a molecular structure file and return a molecule dictionary.

                The backend is selected from the lower-cased file extension:

                - ``.mol`` / ``.sdf``: RDKit (for ``.sdf`` only the first record is read)
                - ``.pdb`` / ``.xyz``: Open Babel (pybel)
                - ``.gro`` / ``.xtc`` / ``.dcd`` / ``.trr``: MDAnalysis
                - ``.vasp`` / ``.cif``: ASE (atoms only; bonds stay empty)

                Any exception raised while loading (missing backend, unreadable file,
                unsupported extension, ...) is printed and turned into a ``None`` return.

                Parameters:
                - fpath (str): Path to the molecular file

                Returns:
                - mol_dict (Dict[str, Any]) on success, ``None`` on failure:
                    - 'atoms': List of dicts with keys 'element', 'coords', 'index', 'charge'
                    - 'bonds': List of dicts with keys 'start_atom_idx', 'end_atom_idx', 'bond_type'
                    - 'metadata': Dict with 'file_name' and 'format' (the extension)

                NOTE(review): 'coords', 'index' and 'charge' are taken as-is from each
                backend, so their types/index base may differ between backends -- confirm
                before comparing molecules loaded through different branches.
                """
                ext = os.path.splitext(fpath)[-1].lower()  # Get the file extension

                mol_dict = {
                    "atoms": [],
                    "bonds": [],
                    "metadata": {
                        "file_name": os.path.basename(fpath),
                        "format": ext,
                    },
                }

                try:
                    # Handling with RDKit (for .mol and .sdf)
                    if ext in ('.mol', '.sdf'):
                        from rdkit import Chem

                        if ext == '.mol':
                            mol = Chem.MolFromMolFile(fpath)
                        else:
                            # Only the first molecule of a multi-record SDF is used.
                            mol = next(Chem.SDMolSupplier(fpath), None)
                        if mol is None:
                            raise ValueError(f"RDKit failed to parse the {ext} file.")

                        # Look the conformer up once instead of once per atom.
                        conformer = mol.GetConformer()
                        mol_dict["atoms"] = [
                            {
                                "element": atom.GetSymbol(),
                                "coords": conformer.GetAtomPosition(atom.GetIdx()),
                                "index": atom.GetIdx(),
                                "charge": atom.GetFormalCharge(),
                            }
                            for atom in mol.GetAtoms()
                        ]
                        mol_dict["bonds"] = [
                            {
                                "start_atom_idx": bond.GetBeginAtomIdx(),
                                "end_atom_idx": bond.GetEndAtomIdx(),
                                "bond_type": bond.GetBondTypeAsDouble(),
                            }
                            for bond in mol.GetBonds()
                        ]

                    # Handling with Pybel (.mol/.sdf never reach this branch: RDKit takes them)
                    elif ext in ('.pdb', '.xyz'):
                        from openbabel import openbabel, pybel

                        mol = next(pybel.readfile(ext[1:], fpath), None)
                        if mol is None:
                            raise ValueError("Pybel failed to parse the file.")
                        mol_dict["atoms"] = [
                            {
                                # atom.type is the Open Babel atom *type* (e.g. "C3"), not
                                # the element; derive the symbol from the atomic number.
                                "element": openbabel.GetSymbol(atom.atomicnum),
                                "coords": atom.coords,
                                "index": atom.idx,
                                "charge": atom.partialcharge,
                            }
                            for atom in mol.atoms
                        ]
                        mol_dict["bonds"] = [
                            {
                                "start_atom_idx": bond.GetBeginAtomIdx(),
                                "end_atom_idx": bond.GetEndAtomIdx(),
                                "bond_type": bond.GetBondOrder(),
                            }
                            for bond in openbabel.OBMolBondIter(mol.OBMol)
                        ]

                    # Handling with MDAnalysis (.pdb/.xyz are already claimed by Pybel above)
                    elif ext in ('.gro', '.xtc', '.dcd', '.trr'):
                        import MDAnalysis as mda

                        # NOTE(review): trajectory-only files presumably need a topology
                        # as well; a failure here ends up in the except below -- confirm.
                        u = mda.Universe(fpath)
                        mol_dict["atoms"] = [
                            {
                                "element": atom.name,
                                "coords": atom.position,
                                "index": atom.id,
                                "charge": atom.charge if hasattr(atom, 'charge') else None,
                            }
                            for atom in u.atoms
                        ]
                        # Bond order is not read from the topology; every bond is reported as 1.
                        mol_dict["bonds"] = [
                            {"start_atom_idx": bond[0], "end_atom_idx": bond[1], "bond_type": 1}
                            for bond in u.bonds.indices
                        ]

                    # Handling with ASE (.xyz/.pdb are already claimed by Pybel above)
                    elif ext in ('.vasp', '.cif'):
                        from ase.io import read as ase_read

                        mol_dict["atoms"] = [
                            {
                                "element": atom.symbol,
                                "coords": atom.position,
                                "index": i,
                                "charge": None,
                            }
                            for i, atom in enumerate(ase_read(fpath))
                        ]
                        # Bonds are not populated in this branch.

                    else:
                        raise ValueError(f"Unsupported file extension: {ext}")

                except Exception as e:
                    # Deliberate best-effort: report the problem and signal failure with None.
                    print(f"Error loading molecule from {fpath}: {e}")
                    return None

                return mol_dict
         | 
| 154 | 
            +
             
         | 
| 155 | 
            +
            class DockingConfig:
                """Bundle of settings for a docking run.

                Stores the receptor path, the ligand SMILES list, the search-box
                ``center`` and ``size`` and the output directory. The output directory
                is created on construction if it does not exist yet.
                """

                def __init__(self, receptor_file, ligand_smiles_list, center=(0, 0, 0), size=(20, 20, 20), output_dir="docking_results"):
                    # Create the results directory up front so later writes cannot fail on it.
                    os.makedirs(output_dir, exist_ok=True)
                    self.output_dir = output_dir

                    # Docking inputs.
                    self.receptor_file = receptor_file
                    self.ligand_smiles_list = ligand_smiles_list

                    # Search-box geometry.
                    self.center = center
                    self.size = size
         | 
| 163 | 
            +
             | 
| 164 | 
            +
            def mol_to_pdbqt(mol, output_file):
                """Converts an RDKit Mol object to PDBQT format.

                The molecule is serialised to a MOL block with RDKit, parsed by
                Open Babel and written to ``output_file`` as PDBQT.

                Parameters:
                - mol: RDKit Mol object to convert
                - output_file (str): Path of the PDBQT file to write

                Raises:
                - RuntimeError: if Open Babel rejects the formats, cannot parse the
                  MOL block, or cannot write the output file.
                """
                conversion = openbabel.OBConversion()
                # Each OBConversion step reports success as a boolean; check it instead of
                # silently leaving a missing or empty file behind for the docking step.
                if not conversion.SetInAndOutFormats("mol", "pdbqt"):
                    raise RuntimeError("Open Babel does not support the 'mol' -> 'pdbqt' conversion.")

                ob_mol = openbabel.OBMol()
                if not conversion.ReadString(ob_mol, Chem.MolToMolBlock(mol)):
                    raise RuntimeError("Open Babel failed to parse the MOL block generated by RDKit.")

                if not conversion.WriteFile(ob_mol, output_file):
                    raise RuntimeError(f"Open Babel failed to write PDBQT file: {output_file}")
         | 
| 171 | 
            +
             | 
| 172 | 
            +
            def prepare_ligand(smiles, ligand_id):
                """Prepare the ligand file in PDBQT format.

                Builds a molecule from ``smiles``, adds hydrogens, embeds 3D coordinates,
                optimises them with UFF and writes ``ligand_<ligand_id>.pdbqt`` into the
                current working directory.

                Parameters:
                - smiles (str): SMILES string of the ligand
                - ligand_id: Identifier used in the output file name

                Returns:
                - ligand_file (str): Name of the PDBQT file that was written

                Raises:
                - ValueError: if the SMILES cannot be parsed or no 3D conformer can be
                  generated.
                """
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    # MolFromSmiles signals a parse failure with None, not an exception.
                    raise ValueError(f"Invalid SMILES string: {smiles!r}")

                mol = Chem.AddHs(mol)
                # EmbedMolecule returns the new conformer id, or -1 when embedding failed;
                # optimising without a conformer would only fail with an unrelated message.
                if AllChem.EmbedMolecule(mol) < 0:
                    raise ValueError(f"Could not generate 3D coordinates for SMILES: {smiles!r}")
                AllChem.UFFOptimizeMolecule(mol)

                ligand_file = f"ligand_{ligand_id}.pdbqt"
                mol_to_pdbqt(mol, ligand_file)
                return ligand_file
         | 
| 181 | 
            +
             | 
| 182 | 
            +
            def run_docking(receptor_file, ligand_file, output_file, center, size):
                """Runs Vina docking using the receptor and ligand files.

                Invokes the ``vina`` executable (looked up on PATH) with the given search
                box, writing poses to ``output_file`` and a log next to it (``.pdbqt``
                replaced by ``.log``).

                Parameters:
                - receptor_file (str): Receptor file passed to ``--receptor``
                - ligand_file (str): Ligand file passed to ``--ligand``
                - output_file (str): Path passed to ``--out``
                - center: Sequence of three values for ``--center_x/y/z``
                - size: Sequence of three values for ``--size_x/y/z``

                Raises:
                - RuntimeError: if ``vina`` exits with a non-zero status.
                """
                # NOTE(review): newer AutoDock Vina releases reportedly dropped the --log
                # option -- confirm against the Vina version this package targets.
                vina_command = [
                    "vina",
                    "--receptor", receptor_file,
                    "--ligand", ligand_file,
                    "--center_x", str(center[0]),
                    "--center_y", str(center[1]),
                    "--center_z", str(center[2]),
                    "--size_x", str(size[0]),
                    "--size_y", str(size[1]),
                    "--size_z", str(size[2]),
                    "--out", output_file,
                    "--log", output_file.replace(".pdbqt", ".log"),
                ]
                completed = subprocess.run(vina_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                if completed.returncode != 0:
                    # Surface Vina's own error text instead of failing later on a missing file.
                    stderr_text = completed.stderr.decode(errors="replace").strip()
                    raise RuntimeError(
                        f"vina exited with status {completed.returncode} for ligand {ligand_file!r}: {stderr_text}"
                    )
         | 
| 198 | 
            +
             | 
| 199 | 
            +
            def parse_vina_output(output_file):
                """Parses Vina output to extract docking scores.

                Scores are taken from lines starting with ``REMARK VINA RESULT`` (fourth
                whitespace-separated token). The docked PDBQT file ``output_file`` is
                read first; if it is missing or holds no such lines, the sibling log file
                (``.pdbqt`` replaced by ``.log``) is tried, which keeps the previous
                log-based behaviour working.

                Parameters:
                - output_file (str): Path of the docked PDBQT file

                Returns:
                - scores (list of float): Scores in file order; empty if none were found

                Raises:
                - FileNotFoundError: if neither the PDBQT file nor the log file exists.
                """
                marker = "REMARK VINA RESULT"
                log_file = output_file.replace(".pdbqt", ".log")
                candidates = [output_file]
                if log_file != output_file:
                    candidates.append(log_file)

                missing = []
                for path in candidates:
                    try:
                        with open(path, 'r') as handle:
                            scores = [float(line.split()[3]) for line in handle if line.startswith(marker)]
                    except FileNotFoundError:
                        missing.append(path)
                        continue
                    if scores:
                        return scores

                if len(missing) == len(candidates):
                    raise FileNotFoundError(f"No Vina output found; tried: {', '.join(candidates)}")
                return []
         | 
| 208 | 
            +
             | 
| 209 | 
            +
            def docking_master_function(config: DockingConfig):
                """Master function to run molecular docking for multiple ligands.

                For every SMILES in ``config.ligand_smiles_list`` this prepares a ligand
                file, runs the docking, parses the scores, prints them and saves a 2D
                visualization. Finally a binding-affinity plot for all ligands is saved in
                ``config.output_dir``.

                Parameters:
                - config (DockingConfig): Receptor, ligands, search box and output directory

                Returns:
                - results (dict): Maps each SMILES string to its list of docking scores.
                  A SMILES that occurs more than once keeps only its last result.
                """
                receptor_pdbqt = config.receptor_file
                results = {}

                for i, smiles in enumerate(config.ligand_smiles_list):
                    ligand_file = prepare_ligand(smiles, ligand_id=i)
                    output_file = os.path.join(config.output_dir, f"docked_ligand_{i}.pdbqt")

                    try:
                        # Run docking for each ligand
                        run_docking(
                            receptor_file=receptor_pdbqt,
                            ligand_file=ligand_file,
                            output_file=output_file,
                            center=config.center,
                            size=config.size,
                        )

                        # Parse docking results and store them
                        scores = parse_vina_output(output_file)
                        results[smiles] = scores
                        print(f"Ligand {i} (SMILES: {smiles}) docking scores: {scores}")

                        # Visualize individual docking result
                        visualize_docking(config.receptor_file, output_file, f"{config.output_dir}/ligand_{i}_visualization.png")
                    finally:
                        # Clean up the intermediate ligand file even when a step above raised,
                        # so failed runs do not litter the working directory.
                        try:
                            os.remove(ligand_file)
                        except FileNotFoundError:
                            pass

                # Plot binding affinity distribution
                plot_binding_affinities(results, f"{config.output_dir}/binding_affinities.png")
                return results
         | 
| 241 | 
            +
             | 
| 242 | 
            +
            def visualize_docking(receptor_file, ligand_file, dir_save):
                """Save 2D depictions of the receptor and the ligand as PNG images.

                Both files are read with RDKit's PDB reader (hydrogens kept) and drawn at
                300x300 pixels. The two output names are derived from ``dir_save`` by
                replacing '.png' with '_receptor.png' and '_ligand.png' respectively.

                Parameters:
                - receptor_file (str): Path of the receptor structure file
                - ligand_file (str): Path of the ligand structure file
                - dir_save (str): Base PNG path the two output names are derived from
                """
                image_size = (300, 300)

                # Read both structures first, then render them.
                receptor_mol = Chem.MolFromPDBFile(receptor_file, removeHs=False)
                ligand_mol = Chem.MolFromPDBFile(ligand_file, removeHs=False)
                receptor_img = Draw.MolToImage(receptor_mol, size=image_size)
                ligand_img = Draw.MolToImage(ligand_mol, size=image_size)

                # Derive the two target paths once and reuse them for saving and reporting.
                receptor_png = dir_save.replace('.png', '_receptor.png')
                ligand_png = dir_save.replace('.png', '_ligand.png')
                receptor_img.save(receptor_png)
                ligand_img.save(ligand_png)

                print(f"Saved 2D visualizations to {receptor_png} and {ligand_png}")
         | 
| 257 | 
            +
             | 
| 258 | 
            +
             | 
| 259 | 
            +
            def plot_binding_affinities(results, dir_save):
                """Plots binding affinities for all ligands.

                Draws a horizontal bar chart with one bar per ligand showing its minimum
                score, saves it to ``dir_save`` and shows it. Ligands without any score
                are reported and left out of the chart instead of aborting the plot.

                Parameters:
                - results (dict): Maps a ligand label (SMILES) to its list of scores
                - dir_save (str): Path of the image file to write
                """
                # Minimum binding affinity per ligand; min() would raise on an empty list.
                best_scores = {ligand: min(scores) for ligand, scores in results.items() if scores}
                skipped = [ligand for ligand, scores in results.items() if not scores]
                if skipped:
                    print(f"No docking scores for {len(skipped)} ligand(s), skipped in plot: {skipped}")

                ligands = list(best_scores)
                affinities = list(best_scores.values())

                fig = plt.figure(figsize=(10, 6))
                plt.barh(ligands, affinities, color="skyblue")
                plt.xlabel("Binding Affinity (kcal/mol)")
                plt.ylabel("Ligands (SMILES)")
                plt.title("Binding Affinities of Different Ligands")
                plt.gca().invert_yaxis()
                plt.tight_layout()
                plt.savefig(dir_save)
                plt.show()
                # Release the figure so repeated calls do not accumulate open figures.
                plt.close(fig)
                print(f"Saved binding affinity plot to {dir_save}")
         | 
| 274 | 
            +
                
         | 
| 275 | 
            +
            # Example usage
            if __name__ == "__main__":
                # Configuration: one receptor and a few example ligands given as SMILES.
                docking_config = DockingConfig(
                    receptor_file="receptor.pdbqt",
                    ligand_smiles_list=["CCO", "CCC", "CCN"],  # example ligand SMILES list
                    center=(10, 10, 10),  # assumed docking centre
                    size=(20, 20, 20),  # assumed docking box size
                )

                # Run the master function and report what it returned.
                docking_results = docking_master_function(docking_config)
                print("Final docking results:", docking_results)
         |