py2ls 0.2.4.3__py3-none-any.whl → 0.2.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/bio.py +1384 -19
- py2ls/data/mygenes_fields_241022.txt +355 -0
- py2ls/ips.py +221 -57
- py2ls/ml2ls.py +1094 -0
- py2ls/netfinder.py +12 -1
- py2ls/plot.py +308 -80
- {py2ls-0.2.4.3.dist-info → py2ls-0.2.4.5.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.3.dist-info → py2ls-0.2.4.5.dist-info}/RECORD +11 -9
- {py2ls-0.2.4.3.dist-info → py2ls-0.2.4.5.dist-info}/WHEEL +0 -0
    
        py2ls/ips.py
    CHANGED
    
    | @@ -106,6 +106,8 @@ def unique(lst, ascending=None): | |
| 106 106 | 
             
                返回:
         | 
| 107 107 | 
             
                list: 一个列表,其中的元素是唯一的,顺序根据参数 `ascending` 进行排序。
         | 
| 108 108 | 
             
                """
         | 
| 109 | 
            +
                if not lst:
         | 
| 110 | 
            +
                    return []
         | 
| 109 111 | 
             
                if ascending is not None:
         | 
| 110 112 | 
             
                    # 移除重复项
         | 
| 111 113 | 
             
                    unique_items = list(set(lst))
         | 
| @@ -518,7 +520,7 @@ def is_text(s): | |
| 518 520 |  | 
| 519 521 | 
             
            from typing import Any, Union
         | 
| 520 522 |  | 
| 521 | 
            -
            def shared( | 
| 523 | 
            +
            def shared(*args, strict=True, n_shared=2, verbose=True):
         | 
| 522 524 | 
             
                """
         | 
| 523 525 | 
             
                check the shared elelements in two list.
         | 
| 524 526 | 
             
                usage:
         | 
| @@ -529,21 +531,37 @@ def shared(lst1:Any, lst2:Any,*args, verbose=True): | |
| 529 531 | 
             
                """
         | 
| 530 532 | 
             
                if verbose:
         | 
| 531 533 | 
             
                    print("\n********* checking shared elements *********")
         | 
| 532 | 
            -
             | 
| 533 | 
            -
             | 
| 534 | 
            -
             | 
| 535 | 
            -
                 | 
| 536 | 
            -
             | 
| 537 | 
            -
             | 
| 538 | 
            -
             | 
| 539 | 
            -
                 | 
| 534 | 
            +
             | 
| 535 | 
            +
                if len(args) == 1 and isinstance(args[0], list):
         | 
| 536 | 
            +
                    lists = args[0]  # Unpack the single list
         | 
| 537 | 
            +
                else:
         | 
| 538 | 
            +
                    lists = args  # Use the provided arguments as lists
         | 
| 539 | 
            +
                flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
         | 
| 540 | 
            +
                # Ensure all arguments are lists
         | 
| 541 | 
            +
                if any(not isinstance(lst, list) for lst in flattened_lists):
         | 
| 542 | 
            +
                    print(f"{' ' * 2}All inputs must be lists.")
         | 
| 543 | 
            +
                    return []
         | 
| 544 | 
            +
                first_list = flattened_lists[0]
         | 
| 545 | 
            +
                shared_elements = [item for item in first_list if all(item in lst for lst in flattened_lists)]
         | 
| 546 | 
            +
                if strict:
         | 
| 547 | 
            +
                        # Strict mode: require elements to be in all lists
         | 
| 548 | 
            +
                        shared_elements = set(flattened_lists[0])
         | 
| 549 | 
            +
                        for lst in flattened_lists[1:]:
         | 
| 550 | 
            +
                            shared_elements.intersection_update(lst)
         | 
| 551 | 
            +
                else:
         | 
| 552 | 
            +
                    all_elements = [item for sublist in flattened_lists for item in sublist]
         | 
| 553 | 
            +
                    element_count = Counter(all_elements)
         | 
| 554 | 
            +
                    # Get elements that appear in at least n_shared lists
         | 
| 555 | 
            +
                    shared_elements = [item for item, count in element_count.items() if count >= n_shared]
         | 
| 556 | 
            +
             | 
| 557 | 
            +
                shared_elements = flatten(shared_elements, verbose=verbose) 
         | 
| 540 558 | 
             
                if verbose:
         | 
| 541 559 | 
             
                    elements2show = shared_elements if len(shared_elements)<10 else shared_elements[:5]
         | 
| 542 560 | 
             
                    print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
         | 
| 543 561 | 
             
                    print("********* checking shared elements *********")
         | 
| 544 562 | 
             
                return shared_elements
         | 
| 545 563 |  | 
| 546 | 
            -
            def flatten(nested: Any, unique_list=True,verbose=True):
         | 
| 564 | 
            +
            def flatten(nested: Any, unique_list=True, verbose=True):
         | 
| 547 565 | 
             
                """
         | 
| 548 566 | 
             
                Recursively flattens a nested structure (lists, tuples, dictionaries, sets) into a single list.
         | 
| 549 567 | 
             
                Parameters:
         | 
| @@ -555,17 +573,19 @@ def flatten(nested: Any, unique_list=True,verbose=True): | |
| 555 573 | 
             
                while stack:
         | 
| 556 574 | 
             
                    current = stack.pop()
         | 
| 557 575 | 
             
                    if isinstance(current, dict):
         | 
| 558 | 
            -
                        stack.extend(current.values())
         | 
| 576 | 
            +
                        stack.extend(current.values()) 
         | 
| 559 577 | 
             
                    elif isinstance(current, (list, tuple, set)):
         | 
| 560 578 | 
             
                        stack.extend(current)
         | 
| 561 579 | 
             
                    elif isinstance(current, pd.Series):
         | 
| 562 580 | 
             
                        stack.extend(current)
         | 
| 581 | 
            +
                    elif isinstance(current, (pd.Index,np.ndarray)): # df.columns df.index are object of type pd.Index
         | 
| 582 | 
            +
                        stack.extend(current.tolist())
         | 
| 563 583 | 
             
                    else:
         | 
| 564 584 | 
             
                        flattened_list.append(current)
         | 
| 565 585 | 
             
                if verbose:
         | 
| 566 586 | 
             
                    print(f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>")
         | 
| 567 587 | 
             
                if unique_list:
         | 
| 568 | 
            -
                    return unique(flattened_list)
         | 
| 588 | 
            +
                    return unique(flattened_list)[::-1]
         | 
| 569 589 | 
             
                else:
         | 
| 570 590 | 
             
                    return flattened_list
         | 
| 571 591 |  | 
| @@ -1618,6 +1638,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool: | |
| 1618 1638 | 
             
                """
         | 
| 1619 1639 | 
             
                Usage
         | 
| 1620 1640 | 
             
                is_abnormal = is_df_abnormal(df, verbose=1)
         | 
| 1641 | 
            +
                True: abnormal
         | 
| 1642 | 
            +
                False: normal
         | 
| 1621 1643 |  | 
| 1622 1644 | 
             
                """
         | 
| 1623 1645 | 
             
                # Initialize a list to hold messages about abnormalities
         | 
| @@ -1645,25 +1667,34 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool: | |
| 1645 1667 | 
             
                if len(column_names) == 1 and delimiter_counts["\t"] > 1:
         | 
| 1646 1668 | 
             
                    messages.append("Abnormal: Column names are not split correctly.")
         | 
| 1647 1669 | 
             
                    is_abnormal = True
         | 
| 1670 | 
            +
                    if verbose:
         | 
| 1671 | 
            +
                        print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
         | 
| 1648 1672 |  | 
| 1649 1673 | 
             
                if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
         | 
| 1650 1674 | 
             
                    messages.append("Abnormal: Too many delimiters in column names.")
         | 
| 1651 1675 | 
             
                    is_abnormal = True
         | 
| 1676 | 
            +
                    if verbose:
         | 
| 1677 | 
            +
                        print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
         | 
| 1652 1678 |  | 
| 1653 1679 | 
             
                if delimiter_counts[""] > 3:
         | 
| 1654 1680 | 
             
                    messages.append("Abnormal: There are empty column names.")
         | 
| 1655 1681 | 
             
                    is_abnormal = True
         | 
| 1682 | 
            +
                    if verbose:
         | 
| 1683 | 
            +
                        print(f'delimiter_counts[""] > 3')
         | 
| 1656 1684 |  | 
| 1657 1685 | 
             
                if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
         | 
| 1658 1686 | 
             
                    messages.append("Abnormal: Some column names contain unexpected characters.")
         | 
| 1659 1687 | 
             
                    is_abnormal = True
         | 
| 1688 | 
            +
                    if verbose:
         | 
| 1689 | 
            +
                        print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
         | 
| 1660 1690 |  | 
| 1661 | 
            -
                # Check for missing values
         | 
| 1662 | 
            -
                missing_values = df.isnull().sum()
         | 
| 1663 | 
            -
                if missing_values.any():
         | 
| 1664 | 
            -
             | 
| 1665 | 
            -
             | 
| 1666 | 
            -
             | 
| 1691 | 
            +
                # # Check for missing values
         | 
| 1692 | 
            +
                # missing_values = df.isnull().sum()
         | 
| 1693 | 
            +
                # if missing_values.any():
         | 
| 1694 | 
            +
                #     messages.append("Missing values in columns:")
         | 
| 1695 | 
            +
                #     messages.append(missing_values[missing_values > 0].to_string())
         | 
| 1696 | 
            +
                #     is_abnormal = True
         | 
| 1697 | 
            +
                #     print(f'missing_values.any()')
         | 
| 1667 1698 |  | 
| 1668 1699 | 
             
                # Check data types
         | 
| 1669 1700 | 
             
                data_types = df.dtypes
         | 
| @@ -1674,6 +1705,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool: | |
| 1674 1705 | 
             
                if constant_columns:
         | 
| 1675 1706 | 
             
                    messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
         | 
| 1676 1707 | 
             
                    is_abnormal = True
         | 
| 1708 | 
            +
                    if verbose:
         | 
| 1709 | 
            +
                        print(f'df.columns[df.nunique() == 1].tolist()')
         | 
| 1677 1710 |  | 
| 1678 1711 | 
             
                # Check for an unreasonable number of rows or columns
         | 
| 1679 1712 | 
             
                if actual_shape[0] < 2 or actual_shape[1] < 2:
         | 
| @@ -1681,6 +1714,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool: | |
| 1681 1714 | 
             
                        "Abnormal: DataFrame is too small (less than 2 rows or columns)."
         | 
| 1682 1715 | 
             
                    )
         | 
| 1683 1716 | 
             
                    is_abnormal = True
         | 
| 1717 | 
            +
                    if verbose:
         | 
| 1718 | 
            +
                        print(f'actual_shape[0] < 2 or actual_shape[1] < 2')
         | 
| 1684 1719 |  | 
| 1685 1720 | 
             
                # Compile results
         | 
| 1686 1721 | 
             
                if verbose:
         | 
| @@ -1723,10 +1758,36 @@ def fload(fpath, kind=None, **kwargs): | |
| 1723 1758 | 
             
                        content = yaml.safe_load(file)
         | 
| 1724 1759 | 
             
                    return content
         | 
| 1725 1760 |  | 
| 1726 | 
            -
             | 
| 1727 | 
            -
             | 
| 1728 | 
            -
                     | 
| 1729 | 
            -
             | 
| 1761 | 
            +
             | 
| 1762 | 
            +
                def load_xml(fpath, fsize_thr: int = 100):
                    """Read an XML file and return its content as a string.

                    Files larger than ``fsize_thr`` megabytes are parsed
                    incrementally; smaller files are parsed in one go.

                    Parameters:
                        fpath: Path of the XML file to read.
                        fsize_thr (int): Size threshold in megabytes above which
                            the incremental parser is used.

                    Returns:
                        str: The serialized XML text.

                    NOTE(review): ``etree`` comes from the enclosing module; the
                    ``pretty_print``/``recover``/``huge_tree`` keywords suggest it
                    is ``lxml.etree`` -- confirm.
                    """

                    def load_small_xml(fpath):
                        # Parse the whole document and serialize it from the root.
                        root = etree.parse(fpath).getroot()
                        return etree.tostring(root, pretty_print=True).decode()

                    def load_large_xml(fpath):
                        # NOTE(review): the result is a concatenation of the
                        # fragments serialized at every "end" event, which is not
                        # necessarily a well-formed document -- confirm callers
                        # expect this.
                        xml_parts = []
                        context = etree.iterparse(
                            fpath, events=("start", "end"), recover=True, huge_tree=True
                        )

                        for event, elem in context:
                            if event != "end":
                                continue
                            xml_parts.append(etree.tostring(elem, pretty_print=True).decode())
                            # Release the element and its already-processed
                            # preceding siblings to keep memory bounded.
                            elem.clear()
                            while elem.getprevious() is not None:
                                del elem.getparent()[0]
                        del context
                        return "".join(xml_parts)

                    file_size = os.path.getsize(fpath) / 1024 / 1024  # in MB

                    if file_size > fsize_thr:
                        # The original printed "small" here and "big" below;
                        # the two messages were swapped.
                        print(f"reading a big file:{file_size} Mb")
                        return load_large_xml(fpath)
                    else:
                        print(f"reading a small file:{file_size} Mb")
                        return load_small_xml(fpath)
         | 
| 1730 1791 |  | 
| 1731 1792 | 
             
                def get_comment(fpath, comment=None, encoding="utf-8", lines_to_check=5):
         | 
| 1732 1793 | 
             
                    """
         | 
| @@ -1793,6 +1854,8 @@ def fload(fpath, kind=None, **kwargs): | |
| 1793 1854 | 
             
                            on_bad_lines=on_bad_lines,
         | 
| 1794 1855 | 
             
                            **kwargs,
         | 
| 1795 1856 | 
             
                        )
         | 
| 1857 | 
            +
                        if is_df_abnormal(df, verbose=0):
         | 
| 1858 | 
            +
                            raise ValueError("the df is abnormal")
         | 
| 1796 1859 | 
             
                    except:
         | 
| 1797 1860 | 
             
                        try:
         | 
| 1798 1861 | 
             
                            try:
         | 
| @@ -1820,7 +1883,6 @@ def fload(fpath, kind=None, **kwargs): | |
| 1820 1883 | 
             
                                        comment=comment,
         | 
| 1821 1884 | 
             
                                        **kwargs,
         | 
| 1822 1885 | 
             
                                    )
         | 
| 1823 | 
            -
             | 
| 1824 1886 | 
             
                                if is_df_abnormal(df, verbose=0):
         | 
| 1825 1887 | 
             
                                    raise ValueError("the df is abnormal")
         | 
| 1826 1888 | 
             
                            except (UnicodeDecodeError, ValueError):
         | 
| @@ -1856,7 +1918,8 @@ def fload(fpath, kind=None, **kwargs): | |
| 1856 1918 | 
             
                            separators = [",", "\t", ";", "|", " "]
         | 
| 1857 1919 | 
             
                            for sep in separators:
         | 
| 1858 1920 | 
             
                                sep2show = sep if sep != "\t" else "\\t"
         | 
| 1859 | 
            -
                                print(f'trying with: engine=pyarrow, sep="{sep2show}"')
         | 
| 1921 | 
            +
                                # print(f'trying with: engine=pyarrow, sep="{sep2show}"')
         | 
| 1922 | 
            +
                                # print(".")
         | 
| 1860 1923 | 
             
                                try:
         | 
| 1861 1924 | 
             
                                    df = pd.read_csv(
         | 
| 1862 1925 | 
             
                                        fpath,
         | 
| @@ -1868,10 +1931,9 @@ def fload(fpath, kind=None, **kwargs): | |
| 1868 1931 | 
             
                                        **kwargs,
         | 
| 1869 1932 | 
             
                                    )
         | 
| 1870 1933 | 
             
                                    if not is_df_abnormal(df, verbose=0):  # normal
         | 
| 1871 | 
            -
                                         | 
| 1872 | 
            -
             | 
| 1873 | 
            -
                                         | 
| 1874 | 
            -
                                            pass
         | 
| 1934 | 
            +
                                        display(df.head(2))
         | 
| 1935 | 
            +
                                        print(f"shape: {df.shape}")
         | 
| 1936 | 
            +
                                        return df
         | 
| 1875 1937 | 
             
                                except:
         | 
| 1876 1938 | 
             
                                    pass
         | 
| 1877 1939 | 
             
                            else:
         | 
| @@ -1880,8 +1942,9 @@ def fload(fpath, kind=None, **kwargs): | |
| 1880 1942 | 
             
                                    separators = [",", "\t", ";", "|", " "]
         | 
| 1881 1943 | 
             
                                    for sep in separators:
         | 
| 1882 1944 | 
             
                                        try:
         | 
| 1883 | 
            -
                                            sep2show = sep if sep != "\t" else "\\t"
         | 
| 1884 | 
            -
                                            print(f"trying with: engine={engine}, sep='{sep2show}'")
         | 
| 1945 | 
            +
                                            # sep2show = sep if sep != "\t" else "\\t"
         | 
| 1946 | 
            +
                                            # print(f"trying with: engine={engine}, sep='{sep2show}'")
         | 
| 1947 | 
            +
                                            # print(".")
         | 
| 1885 1948 | 
             
                                            df = pd.read_csv(
         | 
| 1886 1949 | 
             
                                                fpath,
         | 
| 1887 1950 | 
             
                                                engine=engine,
         | 
| @@ -1890,8 +1953,12 @@ def fload(fpath, kind=None, **kwargs): | |
| 1890 1953 | 
             
                                                comment=comment,
         | 
| 1891 1954 | 
             
                                                **kwargs,
         | 
| 1892 1955 | 
             
                                            )
         | 
| 1956 | 
            +
                                            # display(df.head(2))
         | 
| 1957 | 
            +
                                            # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
         | 
| 1893 1958 | 
             
                                            if not is_df_abnormal(df, verbose=0):
         | 
| 1894 | 
            -
                                                 | 
| 1959 | 
            +
                                                display(df.head(2))
         | 
| 1960 | 
            +
                                                print(f"shape: {df.shape}")
         | 
| 1961 | 
            +
                                                return df
         | 
| 1895 1962 | 
             
                                        except EmptyDataError as e:
         | 
| 1896 1963 | 
             
                                            continue
         | 
| 1897 1964 | 
             
                                    else:
         | 
| @@ -2393,15 +2460,20 @@ def fsave( | |
| 2393 2460 | 
             
                #         json.dump(data, file, **kwargs)
         | 
| 2394 2461 |  | 
| 2395 2462 | 
             
                def save_json(fpath_fname, var_dict_or_df):
                    """Write ``var_dict_or_df`` to ``fpath_fname`` as indented JSON.

                    Before dumping, pandas and NumPy containers are converted to
                    plain Python structures, recursively through dicts, lists and
                    tuples.

                    Parameters:
                        fpath_fname: Path of the JSON file to write.
                        var_dict_or_df: A dict, DataFrame, ndarray, or any value
                            that ``json.dump`` can handle after conversion.
                    """

                    def _convert_js(data):
                        # DataFrame -> {column: [values]}; run the result through
                        # the converter again in case cells hold NumPy objects.
                        if isinstance(data, pd.DataFrame):
                            return _convert_js(data.to_dict(orient="list"))
                        if isinstance(data, (pd.Series, np.ndarray)):
                            return data.tolist()
                        # NumPy scalars (e.g. np.int64) are not JSON serializable.
                        if isinstance(data, np.generic):
                            return data.item()
                        if isinstance(data, dict):
                            return {key: _convert_js(value) for key, value in data.items()}
                        # Recurse into sequences so nested arrays are converted too.
                        if isinstance(data, (list, tuple)):
                            return [_convert_js(item) for item in data]
                        return data

                    serializable_data = _convert_js(var_dict_or_df)

                    # Save the serializable data to the JSON file
                    with open(fpath_fname, "w") as f_json:
                        json.dump(serializable_data, f_json, indent=4)
         | 
| 2405 2477 |  | 
| 2406 2478 | 
             
                # # Example usage:
         | 
| 2407 2479 | 
             
                # sets = {"title": "mse_path_ MSE"}
         | 
| @@ -2645,7 +2717,7 @@ def listdir( | |
| 2645 2717 | 
             
                    print(ls)
         | 
| 2646 2718 | 
             
                    df_all = pd.DataFrame(
         | 
| 2647 2719 | 
             
                        {
         | 
| 2648 | 
            -
                            "fname":  | 
| 2720 | 
            +
                            "fname": ls,
         | 
| 2649 2721 | 
             
                            "fpath": [os.path.join(rootdir, i) for i in ls],
         | 
| 2650 2722 | 
             
                        }
         | 
| 2651 2723 | 
             
                    )
         | 
| @@ -2768,7 +2840,7 @@ def listdir( | |
| 2768 2840 | 
             
            # print(result)
         | 
| 2769 2841 | 
             
            # df=listdir("/", contains='sss',sort_by='name',ascending=False)
         | 
| 2770 2842 | 
             
            # print(df.fname.to_list(),"\n",df.fpath.to_list())
         | 
| 2771 | 
            -
            def  | 
| 2843 | 
            +
            def listfunc(lib_name, opt="call"):
         | 
| 2772 2844 | 
             
                if opt == "call":
         | 
| 2773 2845 | 
             
                    funcs = [func for func in dir(lib_name) if callable(getattr(lib_name, func))]
         | 
| 2774 2846 | 
             
                else:
         | 
| @@ -4789,7 +4861,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs): | |
| 4789 4861 | 
             
            def df_merge(
         | 
| 4790 4862 | 
             
                df1: pd.DataFrame,
         | 
| 4791 4863 | 
             
                df2: pd.DataFrame,
         | 
| 4792 | 
            -
                use_index: bool =  | 
| 4864 | 
            +
                use_index: bool = False,
         | 
| 4793 4865 | 
             
                columns: list = ["col_left", "col_right"],
         | 
| 4794 4866 | 
             
                how: str = "left",
         | 
| 4795 4867 | 
             
            ) -> pd.DataFrame:
         | 
| @@ -4848,12 +4920,53 @@ def df_merge( | |
| 4848 4920 | 
             
                    )
         | 
| 4849 4921 | 
             
                return df_merged
         | 
| 4850 4922 |  | 
| 4923 | 
            +
            def df_drop_duplicates(
                data: pd.DataFrame,
                by: Union[
                    str, List[str]
                ] = "index",  # Options: 'index', or column name(s) for 'rows'
                keep="first",  # Options: 'first', 'last', or False (drop all duplicates)
                ignore_index=True,
                inplace: bool = False,
                verbose=True
            ):
                """
                Drop duplicated rows from a DataFrame, by index label or by column(s).

                Parameters:
                    data (pd.DataFrame): DataFrame to drop duplicates from.
                    by (str or list of str): What defines a duplicate:
                        - 'index': rows whose index label is duplicated.
                        - Column name(s): rows duplicated in those column(s).
                    keep: Which duplicates to keep:
                        'first', 'last', or False (drop all duplicates).
                    ignore_index (bool): Reset the index to 0..n-1 afterwards.
                        Only applied when dropping by column(s); in 'index' mode
                        the surviving labels are kept.
                    inplace (bool): Whether to modify the original DataFrame in place.
                    verbose (bool): Print the shape before and after.

                Returns:
                    pd.DataFrame with the duplicates removed, or None when
                    ``inplace`` is True.
                """
                original_shape = data.shape
                # The isinstance guard keeps array-like `by` values from being
                # compared element-wise against the string.
                on_index = isinstance(by, str) and by == "index"

                if inplace:
                    if on_index:
                        dup_mask = data.index.duplicated(keep=keep)
                        kept_labels = data.index[~dup_mask]
                        # Labels are duplicated by definition, so dropping by
                        # label would also remove the rows to keep. Switch to
                        # positional labels, drop, then restore the survivors.
                        data.index = pd.RangeIndex(len(data))
                        data.drop(index=np.flatnonzero(dup_mask), inplace=True)
                        data.index = kept_labels
                    else:
                        data.drop_duplicates(
                            subset=by, keep=keep, ignore_index=ignore_index, inplace=True
                        )
                    result = data
                elif on_index:
                    # Drop duplicates in the index
                    result = data[~data.index.duplicated(keep=keep)]
                else:
                    # Drop duplicates row-wise based on column(s)
                    result = data.drop_duplicates(subset=by, keep=keep, ignore_index=ignore_index)

                # Report only on request, so verbose=False is really silent.
                if verbose:
                    print(f"\nshape:{original_shape} (before drop_duplicates)")
                    print(f"shape:{result.shape} (after drop_duplicates)")

                return None if inplace else result
         | 
| 4851 4961 | 
             
            def df_fillna(
         | 
| 4852 4962 | 
             
                data: pd.DataFrame,
         | 
| 4853 | 
            -
                method: str = " | 
| 4963 | 
            +
                method: str = "knn",
         | 
| 4854 4964 | 
             
                axis: int = 0,# column-wise
         | 
| 4855 4965 | 
             
                constant: float = None,
         | 
| 4966 | 
            +
                n_neighbors: int = 5,  # KNN-specific
         | 
| 4967 | 
            +
                max_iter: int = 10, # Iterative methods specific
         | 
| 4856 4968 | 
             
                inplace: bool = True,
         | 
| 4969 | 
            +
                random_state:int = None
         | 
| 4857 4970 | 
             
            ) -> pd.DataFrame:
         | 
| 4858 4971 | 
             
                """
         | 
| 4859 4972 | 
             
                Fill missing values in a DataFrame using specified imputation method.
         | 
| @@ -4865,8 +4978,15 @@ def df_fillna( | |
| 4865 4978 | 
             
                    - 'median': Replace missing values with the median of the column.
         | 
| 4866 4979 | 
             
                    - 'most_frequent': Replace missing values with the most frequent value in the column.
         | 
| 4867 4980 | 
             
                    - 'constant': Replace missing values with a constant value provided by the `constant` parameter.
         | 
| 4868 | 
            -
                    - 'knn': Use K-Nearest Neighbors imputation | 
| 4869 | 
            -
                    - 'iterative': Use Iterative imputation | 
| 4981 | 
            +
                    - 'knn': Use K-Nearest Neighbors imputation; replaces missing values based on the values of the nearest neighbors
         | 
| 4982 | 
            +
                    - 'iterative': Use Iterative imputation; models each feature with missing values as a function of the other features and estimates the missing values iteratively
         | 
| 4983 | 
            +
                    - 'mice' (Multivariate Imputation by Chained Equations): A special case of iterative imputation.
         | 
| 4984 | 
            +
                    # - 'missforest': A random forest-based imputation method. Uses a random forest model to predict and fill missing values
         | 
| 4985 | 
            +
                    # - 'softimpute': Matrix factorization imputation. A matrix factorization technique where missing values are imputed by
         | 
| 4986 | 
            +
                    #       reconstructing the data matrix using low-rank approximation
         | 
| 4987 | 
            +
                    # - EM (Expectation-Maximization): Often used in advanced statistics to estimate missing values in a probabilistic framework.
         | 
| 4988 | 
            +
                    # - 'svd': Use IterativeSVD (matrix factorization via Singular Value Decomposition).
         | 
| 4989 | 
            +
                
         | 
| 4870 4990 | 
             
                axis (int): The axis along which to impute:
         | 
| 4871 4991 | 
             
                    - 0: Impute column-wise (default).
         | 
| 4872 4992 | 
             
                    - 1: Impute row-wise.
         | 
| @@ -4879,7 +4999,8 @@ def df_fillna( | |
| 4879 4999 | 
             
                    raise ValueError("Input DataFrame is empty.")
         | 
| 4880 5000 |  | 
| 4881 5001 | 
             
                # Validate method
         | 
| 4882 | 
            -
                methods = ["mean", "median", "most_frequent",  | 
| 5002 | 
            +
                methods = ["mean", "median", "most_frequent", 
         | 
| 5003 | 
            +
                           "constant", "knn", "iterative"]#,"missforest","softimpute","svd"]
         | 
| 4883 5004 | 
             
                method = strcmp(method, methods)[0]
         | 
| 4884 5005 |  | 
| 4885 5006 | 
             
                # If using constant method, ask for a constant value
         | 
| @@ -4892,18 +5013,27 @@ def df_fillna( | |
| 4892 5013 |  | 
| 4893 5014 | 
             
                # Initialize SimpleImputer with the chosen method
         | 
| 4894 5015 | 
             
                if method == "constant":
         | 
| 5016 | 
            +
                    from sklearn.impute import SimpleImputer
         | 
| 4895 5017 | 
             
                    imputer = SimpleImputer(strategy=method, fill_value=constant)
         | 
| 4896 5018 | 
             
                elif method == "knn":
         | 
| 4897 5019 | 
             
                    from sklearn.impute import KNNImputer
         | 
| 4898 | 
            -
             | 
| 4899 5020 | 
             
                    imputer = KNNImputer(n_neighbors=n_neighbors)
         | 
| 4900 | 
            -
                elif method == "iterative":
         | 
| 5021 | 
            +
                elif method == "iterative" or method == "mice":
         | 
| 5022 | 
            +
                    from sklearn.experimental import enable_iterative_imputer
         | 
| 4901 5023 | 
             
                    from sklearn.impute import IterativeImputer
         | 
| 4902 5024 |  | 
| 4903 | 
            -
                    imputer = IterativeImputer(max_iter=max_iter)
         | 
| 4904 | 
            -
                 | 
| 5025 | 
            +
                    imputer = IterativeImputer(max_iter=max_iter, random_state=random_state) 
         | 
| 5026 | 
            +
                # elif method == "missforest":
         | 
| 5027 | 
            +
                #     from missingpy import MissForest
         | 
| 5028 | 
            +
                #     imputer = MissForest(max_iter=max_iter, random_state=random_state)
         | 
| 5029 | 
            +
                # elif method == "softimpute":
         | 
| 5030 | 
            +
                #     from fancyimpute import SoftImpute
         | 
| 5031 | 
            +
                #     imputer = SoftImpute()
         | 
| 5032 | 
            +
                # elif method == "svd":
         | 
| 5033 | 
            +
                #     from fancyimpute import IterativeSVD
         | 
| 5034 | 
            +
                #     imputer = IterativeSVD(max_iters=max_iter)
         | 
| 5035 | 
            +
                else: # mean, median, most_frequent
         | 
| 4905 5036 | 
             
                    from sklearn.impute import SimpleImputer
         | 
| 4906 | 
            -
             | 
| 4907 5037 | 
             
                    imputer = SimpleImputer(strategy=method)
         | 
| 4908 5038 |  | 
| 4909 5039 | 
             
                # Fit and transform the data
         | 
| @@ -4929,8 +5059,38 @@ def df_fillna( | |
| 4929 5059 | 
             
                    return None  # replace original
         | 
| 4930 5060 | 
             
                else:
         | 
| 4931 5061 | 
             
                    return df_filled
         | 
| 5062 | 
            +
            # # example
         | 
| 5063 | 
            +
            # data = {
         | 
| 5064 | 
            +
            #     "A": [1, 2, np.nan, 4, 5],
         | 
| 5065 | 
            +
            #     "B": [np.nan, 2, 3, 4, np.nan],
         | 
| 5066 | 
            +
            #     "C": [1, np.nan, 3, 4, 5],
         | 
| 5067 | 
            +
            #     "D": [1, 2, 3, 4, np.nan],
         | 
| 5068 | 
            +
            # }
         | 
| 5069 | 
            +
             | 
| 5070 | 
            +
            # # List the imputation methods to test
         | 
| 5071 | 
            +
            # methods = [
         | 
| 5072 | 
            +
            #     "mean",
         | 
| 5073 | 
            +
            #     "median",
         | 
| 5074 | 
            +
            #     "most_frequent",
         | 
| 5075 | 
            +
            #     "constant",
         | 
| 5076 | 
            +
            #     "knn",
         | 
| 5077 | 
            +
            #     "iterative",
         | 
| 5078 | 
            +
            #     # "missforest",
         | 
| 5079 | 
            +
            #     # "softimpute",
         | 
| 5080 | 
            +
            #     # "svd",
         | 
| 5081 | 
            +
            # ]
         | 
| 5082 | 
            +
             | 
| 5083 | 
            +
            # # Create a dictionary to hold results
         | 
| 5084 | 
            +
            # results = {}
         | 
| 5085 | 
            +
             | 
| 5086 | 
            +
            # for method_name in methods:
         | 
| 5087 | 
            +
            #     print(method_name)
         | 
| 5088 | 
            +
            #     display(df)
         | 
| 5089 | 
            +
            #     display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
         | 
| 5090 | 
            +
             | 
| 5091 | 
            +
                
         | 
| 4932 5092 | 
             
            def df_scaler(
         | 
| 4933 | 
            -
                data: pd.DataFrame,
         | 
| 5093 | 
            +
                data: pd.DataFrame, # should be numeric dtype
         | 
| 4934 5094 | 
             
                method="standard",
         | 
| 4935 5095 | 
             
                columns=None,  # default, select all numeric col/row
         | 
| 4936 5096 | 
             
                inplace=False,
         | 
| @@ -5414,7 +5574,7 @@ def df_reducer( | |
| 5414 5574 |  | 
| 5415 5575 | 
             
                # Select columns if specified, else use all columns
         | 
| 5416 5576 | 
             
                X = data[columns].values if columns else data.values
         | 
| 5417 | 
            -
             | 
| 5577 | 
            +
                print(X.shape,type(X))
         | 
| 5418 5578 | 
             
                # Handle missing values
         | 
| 5419 5579 | 
             
                if fill_missing:
         | 
| 5420 5580 | 
             
                    imputer = SimpleImputer(strategy="mean")
         | 
| @@ -5620,15 +5780,19 @@ def df_reducer( | |
| 5620 5780 | 
             
                    # If inplace=True, add components back into the original data
         | 
| 5621 5781 | 
             
                    for col_idx in range(n_components):
         | 
| 5622 5782 | 
             
                        data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
         | 
| 5623 | 
            -
                    
         | 
| 5624 5783 | 
             
                    # Add extra info for PCA/UMAP
         | 
| 5625 5784 | 
             
                    if method == "pca":
         | 
| 5626 | 
            -
                         | 
| 5627 | 
            -
             | 
| 5628 | 
            -
             | 
| 5785 | 
            +
                        for i in range(n_components):
         | 
| 5786 | 
            +
                            data[f"Explained Variance PC_{i+1}"] = reduced_df[f"Explained Variance PC_{i+1}"]
         | 
| 5787 | 
            +
                        for i in range(n_components):
         | 
| 5788 | 
            +
                            data[f"Singular Values PC_{i+1}"] = reduced_df[f"Singular Values PC_{i+1}"]
         | 
| 5789 | 
            +
                    elif method == "umap": 
         | 
| 5790 | 
            +
                        for i in range(n_components):
         | 
| 5791 | 
            +
                            data[f"UMAP_{i+1}"]=reduced_df[f"UMAP_{i+1}"]
         | 
| 5629 5792 | 
             
                        data["Embedding"] = reduced_df["Embedding"]
         | 
| 5630 5793 | 
             
                        data["Trustworthiness"] = reduced_df["Trustworthiness"]
         | 
| 5631 5794 | 
             
                    return None  # No return when inplace=True
         | 
| 5795 | 
            +
                
         | 
| 5632 5796 |  | 
| 5633 5797 | 
             
                return reduced_df 
         | 
| 5634 5798 |  |