PyPI - multipers - Versions diffs - 2.2.3__cp310-cp310-win_amd64.whl → 2.3.0__cp310-cp310-win_amd64.whl - Mend

multipers 2.2.3__cp310-cp310-win_amd64.whl → 2.3.0__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of multipers might be problematic. Click here for more details.

Files changed (182) hide show

multipers/__init__.py +33 -31
multipers/_signed_measure_meta.py +430 -430
multipers/_slicer_meta.py +211 -212
multipers/data/MOL2.py +458 -458
multipers/data/UCR.py +18 -18
multipers/data/graphs.py +466 -466
multipers/data/immuno_regions.py +27 -27
multipers/data/pytorch2simplextree.py +90 -90
multipers/data/shape3d.py +101 -101
multipers/data/synthetic.py +113 -111
multipers/distances.py +198 -198
multipers/filtration_conversions.pxd.tp +84 -84
multipers/filtrations/__init__.py +18 -0
multipers/filtrations/filtrations.py +289 -0
multipers/filtrations.pxd +224 -224
multipers/function_rips.cp310-win_amd64.pyd +0 -0
multipers/function_rips.pyx +105 -105
multipers/grids.cp310-win_amd64.pyd +0 -0
multipers/grids.pyx +350 -350
multipers/gudhi/Persistence_slices_interface.h +132 -132
multipers/gudhi/Simplex_tree_interface.h +239 -245
multipers/gudhi/Simplex_tree_multi_interface.h +516 -561
multipers/gudhi/cubical_to_boundary.h +59 -59
multipers/gudhi/gudhi/Bitmap_cubical_complex.h +450 -450
multipers/gudhi/gudhi/Bitmap_cubical_complex_base.h +1070 -1070
multipers/gudhi/gudhi/Bitmap_cubical_complex_periodic_boundary_conditions_base.h +579 -579
multipers/gudhi/gudhi/Debug_utils.h +45 -45
multipers/gudhi/gudhi/Fields/Multi_field.h +484 -484
multipers/gudhi/gudhi/Fields/Multi_field_operators.h +455 -455
multipers/gudhi/gudhi/Fields/Multi_field_shared.h +450 -450
multipers/gudhi/gudhi/Fields/Multi_field_small.h +531 -531
multipers/gudhi/gudhi/Fields/Multi_field_small_operators.h +507 -507
multipers/gudhi/gudhi/Fields/Multi_field_small_shared.h +531 -531
multipers/gudhi/gudhi/Fields/Z2_field.h +355 -355
multipers/gudhi/gudhi/Fields/Z2_field_operators.h +376 -376
multipers/gudhi/gudhi/Fields/Zp_field.h +420 -420
multipers/gudhi/gudhi/Fields/Zp_field_operators.h +400 -400
multipers/gudhi/gudhi/Fields/Zp_field_shared.h +418 -418
multipers/gudhi/gudhi/Flag_complex_edge_collapser.h +337 -337
multipers/gudhi/gudhi/Matrix.h +2107 -2107
multipers/gudhi/gudhi/Multi_critical_filtration.h +1038 -1038
multipers/gudhi/gudhi/Multi_persistence/Box.h +171 -171
multipers/gudhi/gudhi/Multi_persistence/Line.h +282 -282
multipers/gudhi/gudhi/Off_reader.h +173 -173
multipers/gudhi/gudhi/One_critical_filtration.h +1432 -1431
multipers/gudhi/gudhi/Persistence_matrix/Base_matrix.h +769 -769
multipers/gudhi/gudhi/Persistence_matrix/Base_matrix_with_column_compression.h +686 -686
multipers/gudhi/gudhi/Persistence_matrix/Boundary_matrix.h +842 -842
multipers/gudhi/gudhi/Persistence_matrix/Chain_matrix.h +1350 -1350
multipers/gudhi/gudhi/Persistence_matrix/Id_to_index_overlay.h +1105 -1105
multipers/gudhi/gudhi/Persistence_matrix/Position_to_index_overlay.h +859 -859
multipers/gudhi/gudhi/Persistence_matrix/RU_matrix.h +910 -910
multipers/gudhi/gudhi/Persistence_matrix/allocators/entry_constructors.h +139 -139
multipers/gudhi/gudhi/Persistence_matrix/base_pairing.h +230 -230
multipers/gudhi/gudhi/Persistence_matrix/base_swap.h +211 -211
multipers/gudhi/gudhi/Persistence_matrix/boundary_cell_position_to_id_mapper.h +60 -60
multipers/gudhi/gudhi/Persistence_matrix/boundary_face_position_to_id_mapper.h +60 -60
multipers/gudhi/gudhi/Persistence_matrix/chain_pairing.h +136 -136
multipers/gudhi/gudhi/Persistence_matrix/chain_rep_cycles.h +190 -190
multipers/gudhi/gudhi/Persistence_matrix/chain_vine_swap.h +616 -616
multipers/gudhi/gudhi/Persistence_matrix/columns/chain_column_extra_properties.h +150 -150
multipers/gudhi/gudhi/Persistence_matrix/columns/column_dimension_holder.h +106 -106
multipers/gudhi/gudhi/Persistence_matrix/columns/column_utilities.h +219 -219
multipers/gudhi/gudhi/Persistence_matrix/columns/entry_types.h +327 -327
multipers/gudhi/gudhi/Persistence_matrix/columns/heap_column.h +1140 -1140
multipers/gudhi/gudhi/Persistence_matrix/columns/intrusive_list_column.h +934 -934
multipers/gudhi/gudhi/Persistence_matrix/columns/intrusive_set_column.h +934 -934
multipers/gudhi/gudhi/Persistence_matrix/columns/list_column.h +980 -980
multipers/gudhi/gudhi/Persistence_matrix/columns/naive_vector_column.h +1092 -1092
multipers/gudhi/gudhi/Persistence_matrix/columns/row_access.h +192 -192
multipers/gudhi/gudhi/Persistence_matrix/columns/set_column.h +921 -921
multipers/gudhi/gudhi/Persistence_matrix/columns/small_vector_column.h +1093 -1093
multipers/gudhi/gudhi/Persistence_matrix/columns/unordered_set_column.h +1012 -1012
multipers/gudhi/gudhi/Persistence_matrix/columns/vector_column.h +1244 -1244
multipers/gudhi/gudhi/Persistence_matrix/matrix_dimension_holders.h +186 -186
multipers/gudhi/gudhi/Persistence_matrix/matrix_row_access.h +164 -164
multipers/gudhi/gudhi/Persistence_matrix/ru_pairing.h +156 -156
multipers/gudhi/gudhi/Persistence_matrix/ru_rep_cycles.h +376 -376
multipers/gudhi/gudhi/Persistence_matrix/ru_vine_swap.h +540 -540
multipers/gudhi/gudhi/Persistent_cohomology/Field_Zp.h +118 -118
multipers/gudhi/gudhi/Persistent_cohomology/Multi_field.h +173 -173
multipers/gudhi/gudhi/Persistent_cohomology/Persistent_cohomology_column.h +128 -128
multipers/gudhi/gudhi/Persistent_cohomology.h +745 -745
multipers/gudhi/gudhi/Points_off_io.h +171 -171
multipers/gudhi/gudhi/Simple_object_pool.h +69 -69
multipers/gudhi/gudhi/Simplex_tree/Simplex_tree_iterators.h +463 -463
multipers/gudhi/gudhi/Simplex_tree/Simplex_tree_node_explicit_storage.h +83 -83
multipers/gudhi/gudhi/Simplex_tree/Simplex_tree_siblings.h +106 -106
multipers/gudhi/gudhi/Simplex_tree/Simplex_tree_star_simplex_iterators.h +277 -277
multipers/gudhi/gudhi/Simplex_tree/hooks_simplex_base.h +62 -62
multipers/gudhi/gudhi/Simplex_tree/indexing_tag.h +27 -27
multipers/gudhi/gudhi/Simplex_tree/serialization_utils.h +62 -62
multipers/gudhi/gudhi/Simplex_tree/simplex_tree_options.h +157 -157
multipers/gudhi/gudhi/Simplex_tree.h +2794 -2794
multipers/gudhi/gudhi/Simplex_tree_multi.h +152 -163
multipers/gudhi/gudhi/distance_functions.h +62 -62
multipers/gudhi/gudhi/graph_simplicial_complex.h +104 -104
multipers/gudhi/gudhi/persistence_interval.h +253 -253
multipers/gudhi/gudhi/persistence_matrix_options.h +170 -170
multipers/gudhi/gudhi/reader_utils.h +367 -367
multipers/gudhi/mma_interface_coh.h +256 -255
multipers/gudhi/mma_interface_h0.h +223 -231
multipers/gudhi/mma_interface_matrix.h +284 -282
multipers/gudhi/naive_merge_tree.h +536 -575
multipers/gudhi/scc_io.h +310 -289
multipers/gudhi/truc.h +890 -888
multipers/io.cp310-win_amd64.pyd +0 -0
multipers/io.pyx +711 -711
multipers/ml/accuracies.py +90 -90
multipers/ml/convolutions.py +520 -520
multipers/ml/invariants_with_persistable.py +79 -79
multipers/ml/kernels.py +176 -176
multipers/ml/mma.py +713 -714
multipers/ml/one.py +472 -472
multipers/ml/point_clouds.py +352 -346
multipers/ml/signed_measures.py +1589 -1589
multipers/ml/sliced_wasserstein.py +461 -461
multipers/ml/tools.py +113 -113
multipers/mma_structures.cp310-win_amd64.pyd +0 -0
multipers/mma_structures.pxd +127 -127
multipers/mma_structures.pyx +4 -4
multipers/mma_structures.pyx.tp +1085 -1085
multipers/multi_parameter_rank_invariant/diff_helpers.h +84 -93
multipers/multi_parameter_rank_invariant/euler_characteristic.h +97 -97
multipers/multi_parameter_rank_invariant/function_rips.h +322 -322
multipers/multi_parameter_rank_invariant/hilbert_function.h +769 -769
multipers/multi_parameter_rank_invariant/persistence_slices.h +148 -148
multipers/multi_parameter_rank_invariant/rank_invariant.h +369 -369
multipers/multiparameter_edge_collapse.py +41 -41
multipers/multiparameter_module_approximation/approximation.h +2296 -2295
multipers/multiparameter_module_approximation/combinatory.h +129 -129
multipers/multiparameter_module_approximation/debug.h +107 -107
multipers/multiparameter_module_approximation/format_python-cpp.h +286 -286
multipers/multiparameter_module_approximation/heap_column.h +238 -238
multipers/multiparameter_module_approximation/images.h +79 -79
multipers/multiparameter_module_approximation/list_column.h +174 -174
multipers/multiparameter_module_approximation/list_column_2.h +232 -232
multipers/multiparameter_module_approximation/ru_matrix.h +347 -347
multipers/multiparameter_module_approximation/set_column.h +135 -135
multipers/multiparameter_module_approximation/structure_higher_dim_barcode.h +36 -36
multipers/multiparameter_module_approximation/unordered_set_column.h +166 -166
multipers/multiparameter_module_approximation/utilities.h +403 -419
multipers/multiparameter_module_approximation/vector_column.h +223 -223
multipers/multiparameter_module_approximation/vector_matrix.h +331 -331
multipers/multiparameter_module_approximation/vineyards.h +464 -464
multipers/multiparameter_module_approximation/vineyards_trajectories.h +649 -649
multipers/multiparameter_module_approximation.cp310-win_amd64.pyd +0 -0
multipers/multiparameter_module_approximation.pyx +216 -217
multipers/pickle.py +90 -53
multipers/plots.py +342 -334
multipers/point_measure.cp310-win_amd64.pyd +0 -0
multipers/point_measure.pyx +322 -320
multipers/simplex_tree_multi.cp310-win_amd64.pyd +0 -0
multipers/simplex_tree_multi.pxd +133 -133
multipers/simplex_tree_multi.pyx +18 -15
multipers/simplex_tree_multi.pyx.tp +1939 -1935
multipers/slicer.cp310-win_amd64.pyd +0 -0
multipers/slicer.pxd +81 -20
multipers/slicer.pxd.tp +215 -214
multipers/slicer.pyx +1091 -308
multipers/slicer.pyx.tp +924 -914
multipers/tensor/tensor.h +672 -672
multipers/tensor.pxd +13 -13
multipers/test.pyx +44 -44
multipers/tests/__init__.py +57 -57
multipers/torch/diff_grids.py +217 -217
multipers/torch/rips_density.py +310 -304
{multipers-2.2.3.dist-info → multipers-2.3.0.dist-info}/LICENSE +21 -21
{multipers-2.2.3.dist-info → multipers-2.3.0.dist-info}/METADATA +21 -11
multipers-2.3.0.dist-info/RECORD +182 -0
multipers/tests/test_diff_helper.py +0 -73
multipers/tests/test_hilbert_function.py +0 -82
multipers/tests/test_mma.py +0 -83
multipers/tests/test_point_clouds.py +0 -49
multipers/tests/test_python-cpp_conversion.py +0 -82
multipers/tests/test_signed_betti.py +0 -181
multipers/tests/test_signed_measure.py +0 -89
multipers/tests/test_simplextreemulti.py +0 -221
multipers/tests/test_slicer.py +0 -221
multipers-2.2.3.dist-info/RECORD +0 -189
{multipers-2.2.3.dist-info → multipers-2.3.0.dist-info}/WHEEL +0 -0
{multipers-2.2.3.dist-info → multipers-2.3.0.dist-info}/top_level.txt +0 -0

multipers/data/MOL2.py CHANGED Viewed

@@ -1,458 +1,458 @@
-import os
-from os import listdir
-from os.path import expanduser
-from typing import Iterable
-import matplotlib.pyplot as plt
-import MDAnalysis as mda
-import numpy as np
-import pandas as pd
-from joblib import Parallel, delayed
-from MDAnalysis.topology.guessers import guess_masses
-from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.preprocessing import LabelEncoder
-# from numba import njit
-from tqdm import tqdm
-import multipers as mp
-DATASET_PATH = expanduser("~/Datasets/")
-JC_path = DATASET_PATH + "Cleves-Jain/"
-DUDE_path = DATASET_PATH + "DUD-E/"
-# pathes = get_data_path()
-# imgs = apply_pipeline(pathes=pathes, pipeline=pipeline_img)
-# distances_to_letter, ytest = img_distances(imgs)
-def _get_mols_in_path(folder):
-    with open(folder + "/TargetList", "r") as f:
-        train_data = [folder + "/" + mol.strip() for mol in f.readlines()]
-    criterion = (
-        lambda dataset: dataset.endswith(".mol2")
-        and not dataset.startswith("final")
-        and dataset not in train_data
-    )
-    test_data = [
-        folder + "/" + dataset
-        for dataset in listdir(folder)
-        if criterion(folder + "/" + dataset)
-    ]
-    return train_data, test_data
-def get_data_path_JC(type="dict"):
-    if type == "dict":
-        out = {}
-    elif type == "list":
-        out = []
-    else:
-        raise TypeError(f"Type {out} not supported")
-    for stuff in listdir(JC_path):
-        if stuff.startswith("target_"):
-            current_letter = stuff[-1]
-            to_add = _get_mols_in_path(JC_path + stuff)
-            if type == "dict":
-                out[current_letter] = to_add
-            elif type == "list":
-                out.append(to_add)
-    decoy_folder = JC_path + "RognanRing850/"
-    to_add = [
-        decoy_folder + mol for mol in listdir(decoy_folder) if mol.endswith(".mol2")
-    ]
-    if type == "dict":
-        out["decoy"] = to_add
-    elif type == "list":
-        out.append(to_add)
-    return out
-def get_all_JC_path():
-    out = []
-    for stuff in listdir(JC_path):
-        if stuff.startswith("target_"):
-            train_data, test_data = _get_mols_in_path(JC_path + stuff)
-            out += train_data
-            out += test_data
-    decoy_folder = JC_path + "RognanRing850/"
-    out += [
-        decoy_folder + mol for mol in listdir(decoy_folder) if mol.endswith(".mol2")
-    ]
-    return out
-def split_multimol(
-    path: str,
-    mol_name: str,
-    out_folder_name: str = "splitted",
-    enforce_charges: bool = False,
-):
-    with open(path + mol_name, "r") as f:
-        lines = f.readlines()
-    splitted_mols = []
-    index = 0
-    for i, line in enumerate(lines):
-        is_last = i == len(lines) - 1
-        if line.strip() == "@<TRIPOS>MOLECULE" or is_last:
-            if i != index:
-                molecule = "".join(lines[index : i + is_last])
-                if enforce_charges:
-                    # print(f"Replaced molecule {i}")
-                    molecule = molecule.replace("NO_CHARGES", "USER_CHARGES")
-                    # print(molecule)
-                    # return
-                index = i
-                splitted_mols.append(molecule)
-    if not os.path.exists(path + out_folder_name):
-        os.mkdir(path + out_folder_name)
-    for i, mol in enumerate(splitted_mols):
-        with open(path + out_folder_name + f"/{i}.mol2", "w") as f:
-            f.write(mol)
-    return [path + out_folder_name + f"/{i}.mol2" for i in range(len(splitted_mols))]
-# @njit(parallel=True)
-def apply_pipeline(pathes: dict, pipeline):
-    img_dict = {}
-    for key, value in tqdm(pathes.items(), desc="Applying pipeline"):
-        if len(key) == 1:
-            train_paths, test_paths = value
-            train_imgs = pipeline.transform(train_paths)
-            test_imgs = pipeline.transform(test_paths)
-            img_dict[key] = (train_imgs, test_imgs)
-        else:
-            assert key == "decoy"
-            img_dict[key] = pipeline.transform(value)
-    return img_dict
-from sklearn.metrics import pairwise_distances
-def img_distances(img_dict: dict):
-    distances_to_anchors = []
-    ytest = []
-    decoy_list = img_dict["decoy"]
-    for letter, imgs in img_dict.items():
-        if len(letter) != 1:
-            continue  # decoy
-        xtrain, xtest = imgs
-        assert len(xtest) > 0
-        train_data, test_data = xtrain, np.concatenate([xtest, decoy_list])
-        D = pairwise_distances(train_data, test_data)
-        distances_to_anchors.append(D)
-        letter_ytest = np.array(
-            [letter] * len(xtest) + ["0"] * len(decoy_list), dtype="<U1"
-        )
-        ytest.append(letter_ytest)
-    return distances_to_anchors, ytest
-def get_EF_vector_from_distances(distances, ytest, alpha=0.05):
-    EF = []
-    for distance_to_anchors, letter_ytest in zip(distances, ytest):
-        indices = np.argsort(distance_to_anchors, axis=1)
-        n = indices.shape[1]
-        n_max = int(alpha * n)
-        good_indices = (
-            letter_ytest[indices[:, :n_max]] == letter_ytest[0]
-        )  ## assumes that ytest[:,0] are the good letters
-        EF_letter = good_indices.sum(axis=1) / (letter_ytest == letter_ytest[0]).sum()
-        EF_letter /= alpha
-        EF.append(EF_letter.mean())
-    return np.mean(EF)
-def EF_from_distance_matrix(
-    distances: np.ndarray, labels: list | np.ndarray, alpha: float, anchors_in_test=True
-):
-    """
-    Computes the Enrichment Factor from a distance matrix, and its labels.
-     - First axis of the distance matrix is the anchors on which to compute the EF
-     - Second axis is the test. For convenience, anchors can be put in test, if the flag anchors_in_test is set to true.
-     - labels is a table of bools, representing the the labels of the test axis of the distance matrix.
-     - alpha : the EF alpha parameter.
-    """
-    n = len(labels)
-    n_max = int(alpha * n)
-    indices = np.argsort(distances, axis=1)
-    EF_ = [
-        ((labels[idx[:n_max]]).sum() - anchors_in_test)
-        / (labels.sum() - anchors_in_test)
-        for idx in indices
-    ]
-    return np.mean(EF_) / alpha
-def EF_AUC(distances: np.ndarray, labels: np.ndarray, anchors_in_test=0):
-    if distances.ndim == 1:
-        distances = distances[None, :]
-    assert distances.ndim == 2
-    indices = np.argsort(distances, axis=1)
-    out = []
-    for i in range(1, distances.size):
-        proportion_of_good_indices = (
-            labels[indices[:, :i]].sum(axis=1).mean() - anchors_in_test
-        ) / min(i, labels.sum() - anchors_in_test)
-        out.append(proportion_of_good_indices)
-    # print(out)
-    return np.mean(out)
-def theorical_max_EF(distances, labels, alpha):
-    n = len(labels)
-    n_max = int(alpha * n)
-    num_true_labels = np.sum(
-        labels == labels[0]
-    )  ## if labels are not True / False, assumes that the first one is a good one
-    return min(n_max, num_true_labels) / alpha
-def theorical_max_EF_from_distances(list_of_distances, list_of_labels, alpha):
-    return np.mean(
-        [
-            theorical_max_EF(distances, labels, alpha)
-            for distances, labels in zip(list_of_distances, list_of_labels)
-        ]
-    )
-def plot_EF_from_distances(
-    alphas=[0.01, 0.02, 0.05, 0.1], EF=EF_from_distance_matrix, plot: bool = True
-):
-    y = np.round([EF(alpha=alpha) for alpha in alphas], decimals=2)
-    if plot:
-        _alphas = np.linspace(0.01, 1.0, 100)
-        plt.figure()
-        plt.plot(_alphas, [EF(alpha=alpha) for alpha in _alphas])
-        plt.scatter(alphas, y, c="r")
-        plt.title("Enrichment Factor")
-        plt.xlabel(r"$\alpha$" + f" = {alphas}")
-        plt.ylabel(r"$\mathrm{EF}_\alpha$" + f" = {y}")
-    return y
-def lines2bonds(
-    mol: mda.Universe, bond_types=["ar", "am", 3, 2, 1, 0], molecule_format=None
-):
-    extension = (
-        mol.filename.split(".")[-1].lower()
-        if molecule_format is None
-        else molecule_format
-    )
-    match extension:
-        case "mol2":
-            out = lines2bonds_MOL2(mol)["bond_type"]
-        case "pdb":
-            out = lines2bonds_PDB(mol)
-        case _:
-            raise Exception("Invalid, or not supported molecule format.")
-    return LabelEncoder().fit(bond_types).transform(out)
-def lines2bonds_MOL2(mol: mda.Universe):
-    _lines = open(mol.filename, "r").readlines()
-    out = []
-    index = 0
-    while index < len(_lines) and _lines[index].strip() != "@<TRIPOS>BOND":
-        index += 1
-    index += 1
-    while index < len(_lines) and _lines[index].strip()[0] != "@":
-        line = _lines[index].strip().split(" ")
-        for j, truc in enumerate(line):
-            line[j] = truc.strip()
-        # try:
-        out.append([stuff for stuff in line if len(stuff) > 0])
-        # except:
-        # 	print_lin
-        index += 1
-    out = pd.DataFrame(out, columns=["bond_id", "atom1", "atom2", "bond_type"])
-    out.set_index(["bond_id"], inplace=True)
-    return out
-def lines2bonds_PDB(mol: mda.Universe):
-    raise Exception("Not yet implemented.")
-    return
-def _mol2graphst(
-    path: str | mda.Universe, filtrations: Iterable[str], molecule_format=None
-):
-    molecule = path if isinstance(path, mda.Universe) else mda.Universe(path)
-    num_filtrations = len(filtrations)
-    nodes = molecule.atoms.indices.reshape(1, -1)
-    edges = molecule.bonds.dump_contents().T
-    num_vertices = nodes.shape[1]
-    num_edges = edges.shape[1]
-    st = mp.SimplexTreeMulti(num_parameters=num_filtrations)
-    ## Edges filtration
-    # edges = np.array(bonds_df[["atom1", "atom2"]]).T
-    edges_filtration = np.zeros((num_edges, num_filtrations), dtype=np.float32) - np.inf
-    for i, filtration in enumerate(filtrations):
-        match filtration:
-            case "bond_length":
-                bond_lengths = molecule.bonds.bonds()
-                edges_filtration[:, i] = bond_lengths
-            case "bond_type":
-                bond_types = lines2bonds(mol=molecule, molecule_format=molecule_format)
-                edges_filtration[:, i] = bond_types
-            case _:
-                pass
-    ## Nodes filtration
-    nodes_filtrations = np.zeros(
-        (num_vertices, num_filtrations), dtype=np.float32
-    ) + np.min(
-        edges_filtration, axis=0
-    )  # better than - np.inf
-    st.insert_batch(nodes, nodes_filtrations)
-    st.insert_batch(edges, edges_filtration)
-    for i, filtration in enumerate(filtrations):
-        match filtration:
-            case "charge":
-                charges = molecule.atoms.charges
-                st.fill_lowerstar(charges, parameter=i)
-            case "atomic_mass":
-                masses = molecule.atoms.masses
-                null_indices = masses == 0
-                if np.any(null_indices):  # guess if necessary
-                    masses[null_indices] = guess_masses(molecule.atoms.types)[
-                        null_indices
-                    ]
-                st.fill_lowerstar(-masses, parameter=i)
-            case _:
-                pass
-    st.make_filtration_non_decreasing()  # Necessary ?
-    return st
-def _mol2ripsst(
-    path: str,
-    filtrations: Iterable[str],
-    threshold=np.inf,
-    bond_types: list = ["ar", "am", 3, 2, 1, 0],
-):
-    import gudhi as gd
-    assert "bond_length" == filtrations[0], "Bond length has to be first for rips."
-    molecule = path if isinstance(path, mda.Universe) else mda.Universe(path)
-    num_parameters = len(filtrations)
-    st_rips = gd.RipsComplex(
-        points=molecule.atoms.positions, max_edge_length=threshold
-    ).create_simplex_tree()
-    st = mp.SimplexTreeMulti(
-        st_rips,
-        num_parameters=num_parameters,
-        default_values=[
-            bond_types.index(0) if f == "bond_type" else -np.inf
-            for f in filtrations[1:]
-        ],  # the 0 index is the label of 'no bond' in bond_types
-    )
-    ## Edges filtration
-    mol_bonds = molecule.bonds.indices.T
-    edges_filtration = (
-        np.zeros((mol_bonds.shape[1], num_parameters), dtype=np.float32) - np.inf
-    )
-    for i, filtration in enumerate(filtrations):
-        match filtration:
-            case "bond_type":
-                edges_filtration[:, i] = lines2bonds(
-                    mol=molecule, bond_types=bond_types
-                )
-            case "atomic_mass":
-                continue
-            case "charge":
-                continue
-            case "bond_length":
-                edges_filtration[:, i] = [st_rips.filtration(s) for s in mol_bonds.T]
-            case _:
-                raise Exception(
-                    f"Invalid filtration {filtration}. Available ones : bond_type, atomic_mass, charge, bond_length."
-                )
-    st.assign_batch_filtration(mol_bonds, edges_filtration, propagate=False)
-    min_filtration = edges_filtration.min(axis=0)
-    st.assign_batch_filtration(
-        np.asarray([list(range(st.num_vertices))], dtype=int),
-        np.asarray([min_filtration] * st.num_vertices, dtype=np.float32),
-        propagate=False,
-    )
-    ## Nodes filtration
-    for i, filtration in enumerate(filtrations):
-        match filtration:
-            case "charge":
-                charges = molecule.atoms.charges
-                st.fill_lowerstar(charges, parameter=i)
-            case "atomic_mass":
-                masses = molecule.atoms.masses
-                null_indices = masses == 0
-                if np.any(null_indices):  # guess if necessary
-                    masses[null_indices] = guess_masses(molecule.atoms.types)[
-                        null_indices
-                    ]
-                # print(masses)
-                st.fill_lowerstar(-masses, parameter=i)
-            case _:
-                pass
-    st.make_filtration_non_decreasing()  # Necessary ?
-    return st
-class Molecule2SimplexTree(BaseEstimator, TransformerMixin):
-    """
-    Transforms a list of MDA-compatible files into a list of mulitparameter simplextrees
-    Input
-    -----
-     X: Iterable[path_to_files:str]
-    Output
-    ------
-     Iterable[multipers.SimplexTreeMulti]
-    Parameters
-    ----------
-     - filtrations : list of filtration names. Available ones : 'charge', 'atomic_mass', 'bond_length', 'bond_type'. Others are ignored.
-     - graph : bool. If true, will use the graph given by the molecule, otherwise, a Rips Complex Based on the distance. '
-     In that case bond_length is ignored (it's the 1rst parameter).
-    """
-    def __init__(
-        self,
-        delayed: bool = False,
-        filtrations: Iterable[str] = [],
-        graph: bool = True,
-        n_jobs: int = 1,
-    ) -> None:
-        super().__init__()
-        self.delayed = delayed
-        self.n_jobs = n_jobs
-        self.filtrations = filtrations
-        self.graph = graph
-        self._molecule_format = None
-        return
-    def fit(self, X: Iterable[str], y=None):
-        if len(X) == 0:
-            return self
-        test_mol = mda.Universe(X[0])
-        self._molecule_format = test_mol.filename.split(".")[-1].lower()
-        return self
-    def transform(self, X: Iterable[str]):
-        _to_simplextree = _mol2graphst if self.graph else _mol2ripsst
-        to_simplex_tree = lambda path_to_mol2_file: [
-            _to_simplextree(path=path_to_mol2_file, filtrations=self.filtrations)
-        ]
-        if self.delayed:
-            return [delayed(to_simplex_tree)(path) for path in X]
-        return Parallel(n_jobs=self.n_jobs, prefer="threads")(
-            delayed(to_simplex_tree)(path) for path in X
-        )
+import os
+from os import listdir
+from os.path import expanduser
+from typing import Iterable
+import matplotlib.pyplot as plt
+import MDAnalysis as mda
+import numpy as np
+import pandas as pd
+from joblib import Parallel, delayed
+from MDAnalysis.topology.guessers import guess_masses
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing import LabelEncoder
+# from numba import njit
+from tqdm import tqdm
+import multipers as mp
+DATASET_PATH = expanduser("~/Datasets/")
+JC_path = DATASET_PATH + "Cleves-Jain/"
+DUDE_path = DATASET_PATH + "DUD-E/"
+# pathes = get_data_path()
+# imgs = apply_pipeline(pathes=pathes, pipeline=pipeline_img)
+# distances_to_letter, ytest = img_distances(imgs)
+def _get_mols_in_path(folder):
+    with open(folder + "/TargetList", "r") as f:
+        train_data = [folder + "/" + mol.strip() for mol in f.readlines()]
+    criterion = (
+        lambda dataset: dataset.endswith(".mol2")
+        and not dataset.startswith("final")
+        and dataset not in train_data
+    )
+    test_data = [
+        folder + "/" + dataset
+        for dataset in listdir(folder)
+        if criterion(folder + "/" + dataset)
+    ]
+    return train_data, test_data
+def get_data_path_JC(type="dict"):
+    if type == "dict":
+        out = {}
+    elif type == "list":
+        out = []
+    else:
+        raise TypeError(f"Type {out} not supported")
+    for stuff in listdir(JC_path):
+        if stuff.startswith("target_"):
+            current_letter = stuff[-1]
+            to_add = _get_mols_in_path(JC_path + stuff)
+            if type == "dict":
+                out[current_letter] = to_add
+            elif type == "list":
+                out.append(to_add)
+    decoy_folder = JC_path + "RognanRing850/"
+    to_add = [
+        decoy_folder + mol for mol in listdir(decoy_folder) if mol.endswith(".mol2")
+    ]
+    if type == "dict":
+        out["decoy"] = to_add
+    elif type == "list":
+        out.append(to_add)
+    return out
+def get_all_JC_path():
+    out = []
+    for stuff in listdir(JC_path):
+        if stuff.startswith("target_"):
+            train_data, test_data = _get_mols_in_path(JC_path + stuff)
+            out += train_data
+            out += test_data
+    decoy_folder = JC_path + "RognanRing850/"
+    out += [
+        decoy_folder + mol for mol in listdir(decoy_folder) if mol.endswith(".mol2")
+    ]
+    return out
+def split_multimol(
+    path: str,
+    mol_name: str,
+    out_folder_name: str = "splitted",
+    enforce_charges: bool = False,
+):
+    with open(path + mol_name, "r") as f:
+        lines = f.readlines()
+    splitted_mols = []
+    index = 0
+    for i, line in enumerate(lines):
+        is_last = i == len(lines) - 1
+        if line.strip() == "@<TRIPOS>MOLECULE" or is_last:
+            if i != index:
+                molecule = "".join(lines[index : i + is_last])
+                if enforce_charges:
+                    # print(f"Replaced molecule {i}")
+                    molecule = molecule.replace("NO_CHARGES", "USER_CHARGES")
+                    # print(molecule)
+                    # return
+                index = i
+                splitted_mols.append(molecule)
+    if not os.path.exists(path + out_folder_name):
+        os.mkdir(path + out_folder_name)
+    for i, mol in enumerate(splitted_mols):
+        with open(path + out_folder_name + f"/{i}.mol2", "w") as f:
+            f.write(mol)
+    return [path + out_folder_name + f"/{i}.mol2" for i in range(len(splitted_mols))]
+# @njit(parallel=True)
+def apply_pipeline(pathes: dict, pipeline):
+    img_dict = {}
+    for key, value in tqdm(pathes.items(), desc="Applying pipeline"):
+        if len(key) == 1:
+            train_paths, test_paths = value
+            train_imgs = pipeline.transform(train_paths)
+            test_imgs = pipeline.transform(test_paths)
+            img_dict[key] = (train_imgs, test_imgs)
+        else:
+            assert key == "decoy"
+            img_dict[key] = pipeline.transform(value)
+    return img_dict
+from sklearn.metrics import pairwise_distances
+def img_distances(img_dict: dict):
+    distances_to_anchors = []
+    ytest = []
+    decoy_list = img_dict["decoy"]
+    for letter, imgs in img_dict.items():
+        if len(letter) != 1:
+            continue  # decoy
+        xtrain, xtest = imgs
+        assert len(xtest) > 0
+        train_data, test_data = xtrain, np.concatenate([xtest, decoy_list])
+        D = pairwise_distances(train_data, test_data)
+        distances_to_anchors.append(D)
+        letter_ytest = np.array(
+            [letter] * len(xtest) + ["0"] * len(decoy_list), dtype="<U1"
+        )
+        ytest.append(letter_ytest)
+    return distances_to_anchors, ytest
+def get_EF_vector_from_distances(distances, ytest, alpha=0.05):
+    """
+    Mean Enrichment Factor over several (distance matrix, labels) pairs.
+     - distances : list of (n_anchors, n_test) distance matrices.
+     - ytest : list of label arrays over the test axis; the first label of each array is taken as the positive one.
+     - alpha : fraction of the test axis that is retrieved for each anchor.
+    """
+    def _class_EF(distance_to_anchors, labels):
+        ranking = np.argsort(distance_to_anchors, axis=1)
+        cutoff = int(alpha * ranking.shape[1])
+        positive = labels[0]  ## assumes that the first label is a good one
+        hits = (labels[ranking[:, :cutoff]] == positive).sum(axis=1)
+        ratio = hits / (labels == positive).sum()
+        ratio /= alpha
+        return ratio.mean()
+    return np.mean(
+        [_class_EF(matrix, labels) for matrix, labels in zip(distances, ytest)]
+    )
+def EF_from_distance_matrix(
+    distances: np.ndarray, labels: list | np.ndarray, alpha: float, anchors_in_test=True
+):
+    """
+    Computes the Enrichment Factor from a distance matrix, and its labels.
+     - First axis of the distance matrix is the anchors on which to compute the EF
+     - Second axis is the test. For convenience, anchors can be put in test, if the flag anchors_in_test is set to true; one hit is then discounted per anchor.
+     - labels is a table of bools (list or array), representing the labels of the test axis of the distance matrix.
+     - alpha : the EF alpha parameter, i.e., the fraction of the test axis retrieved for each anchor.
+    The result is inf / nan (numpy warning) when no positive is left after discounting the anchors.
+    """
+    # a plain list supports neither fancy indexing nor `.sum()`
+    labels = np.asarray(labels)
+    n_max = int(alpha * len(labels))
+    indices = np.argsort(distances, axis=1)
+    num_positives = labels.sum() - anchors_in_test
+    EF_ = [
+        (labels[idx[:n_max]].sum() - anchors_in_test) / num_positives
+        for idx in indices
+    ]
+    return np.mean(EF_) / alpha
+def EF_AUC(distances: np.ndarray, labels: np.ndarray, anchors_in_test=0):
+    """
+    Averages, over every cut-off i = 1, ..., n_test - 1, the fraction of positives found among the i nearest test points.
+     - distances : (n_anchors, n_test) distance matrix; a 1d array is treated as a single anchor.
+     - labels : table of bools over the test axis.
+     - anchors_in_test : number of hits to discount per anchor, for when the anchors are themselves in the test axis.
+    For each cut-off, the hits are averaged over the anchors, and normalized by min(i, number of positives - anchors_in_test).
+    Returns nan (with a numpy warning) when there are fewer than two test points.
+    """
+    labels = np.asarray(labels)
+    distances = np.asarray(distances)
+    if distances.ndim == 1:
+        distances = distances[None, :]
+    assert distances.ndim == 2
+    num_test = distances.shape[1]
+    # The cut-offs run over the test axis only. Bounding them by `distances.size` made every
+    # cut-off beyond n_test saturate, pushing the score towards 1 with more than one anchor.
+    cutoffs = np.arange(1, num_test)
+    indices = np.argsort(distances, axis=1)
+    # hits[a, i - 1] = number of positives among the i nearest test points of anchor a
+    hits = np.cumsum(labels[indices], axis=1)[:, : num_test - 1]
+    mean_hits = hits.mean(axis=0) - anchors_in_test
+    out = mean_hits / np.minimum(cutoffs, labels.sum() - anchors_in_test)
+    return np.mean(out)
+def theorical_max_EF(distances, labels, alpha):
+    """
+    Largest Enrichment Factor reachable for the given labels, i.e., the EF of a perfect ranking.
+     - distances : unused; kept so that the signature matches the other EF helpers.
+     - labels : labels of the test axis (list or array).
+     - alpha : the EF alpha parameter.
+    At most min(int(alpha * n), number of positives) positives can be retrieved; as for the EF itself, this count is normalized by the number of positives, and by alpha.
+    """
+    labels = np.asarray(labels)
+    n_max = int(alpha * len(labels))
+    num_true_labels = np.sum(
+        labels == labels[0]
+    )  ## if labels are not True / False, assumes that the first one is a good one
+    # Without the division by num_true_labels, the bound was not on the same scale as the EF.
+    return min(n_max, num_true_labels) / num_true_labels / alpha
+def theorical_max_EF_from_distances(list_of_distances, list_of_labels, alpha):
+    """
+    Mean of `theorical_max_EF` over paired lists of distance matrices and labels.
+     - list_of_distances, list_of_labels : iterated in lockstep, one pair per class.
+     - alpha : the EF alpha parameter, shared by every pair.
+    """
+    per_class_max = []
+    for distances, labels in zip(list_of_distances, list_of_labels):
+        per_class_max.append(theorical_max_EF(distances, labels, alpha))
+    return np.mean(per_class_max)
+def plot_EF_from_distances(
+    alphas=[0.01, 0.02, 0.05, 0.1], EF=EF_from_distance_matrix, plot: bool = True
+):
+    """
+    Evaluates an Enrichment Factor function at the given alphas, and optionally plots it.
+     - alphas : values at which the EF is reported.
+     - EF : callable taking `alpha` as its only keyword argument (the distances and labels have to be bound beforehand).
+     - plot : if true, draws the EF on 100 alphas between 0.01 and 1 in a new figure, with the reported values in red.
+    Returns the EF values at `alphas`, rounded to 2 decimals.
+    """
+    y = np.round([EF(alpha=a) for a in alphas], decimals=2)
+    if not plot:
+        return y
+    grid = np.linspace(0.01, 1.0, 100)
+    plt.figure()
+    curve = [EF(alpha=a) for a in grid]
+    plt.plot(grid, curve)
+    plt.scatter(alphas, y, c="r")
+    plt.title("Enrichment Factor")
+    # the reported values are written in the axis labels
+    plt.xlabel(r"$\alpha$" + f" = {alphas}")
+    plt.ylabel(r"$\mathrm{EF}_\alpha$" + f" = {y}")
+    return y
+def lines2bonds(
+    mol: mda.Universe, bond_types=["ar", "am", 3, 2, 1, 0], molecule_format=None
+):
+    """
+    Reads the bond types of a molecule from its file, and encodes them as integers.
+     - mol : the molecule; its `filename` is the file that is read.
+     - bond_types : the values on which the LabelEncoder is fitted.
+     - molecule_format : "mol2" or "pdb". If None, the lowercased extension of `mol.filename` is used.
+    Raises an Exception for any other format.
+    """
+    if molecule_format is None:
+        extension = mol.filename.split(".")[-1].lower()
+    else:
+        extension = molecule_format
+    if extension == "mol2":
+        raw_types = lines2bonds_MOL2(mol)["bond_type"]
+    elif extension == "pdb":
+        raw_types = lines2bonds_PDB(mol)
+    else:
+        raise Exception("Invalid, or not supported molecule format.")
+    encoder = LabelEncoder()
+    encoder.fit(bond_types)
+    return encoder.transform(raw_types)
+def lines2bonds_MOL2(mol: mda.Universe):
+    """
+    Parses the @<TRIPOS>BOND section of the MOL2 file `mol.filename`.
+    Returns a DataFrame indexed by "bond_id", with columns "atom1", "atom2", "bond_type"; every value is kept as a string.
+    The DataFrame is empty if the file has no bond section.
+     - The section ends at the next line starting with "@", or at the end of the file.
+     - Blank lines are skipped.
+     - Fields can be separated by any whitespace, and only the first four are kept (a bond record can carry optional status bits afterwards).
+    """
+    # the context manager closes the file, which a bare open(...).readlines() does not
+    with open(mol.filename, "r") as mol2_file:
+        _lines = mol2_file.readlines()
+    out = []
+    in_bond_section = False
+    for raw_line in _lines:
+        line = raw_line.strip()
+        if not in_bond_section:
+            in_bond_section = line == "@<TRIPOS>BOND"
+            continue
+        if not line:
+            continue  # indexing the first character of an empty line raises
+        if line[0] == "@":
+            break  # next record type
+        out.append(line.split()[:4])
+    out = pd.DataFrame(out, columns=["bond_id", "atom1", "atom2", "bond_type"])
+    out.set_index(["bond_id"], inplace=True)
+    return out
+def lines2bonds_PDB(mol: mda.Universe):
+    """
+    Placeholder for reading the bond types of a PDB file; this is not supported yet.
+    Always raises NotImplementedError. It is a subclass of Exception, so handlers catching Exception keep working.
+    """
+    raise NotImplementedError("Not yet implemented.")
+def _mol2graphst(
+    path: str | mda.Universe, filtrations: Iterable[str], molecule_format=None
+):
+    """
+    Builds a multiparameter simplex tree from the bond graph of a molecule.
+     - path : anything accepted by `mda.Universe`, or an already loaded `mda.Universe`.
+     - filtrations : names of the filtrations, one per parameter. 'bond_length' and 'bond_type' are written on the edges; 'charge' and 'atomic_mass' are filled as lower-star filtrations from the atoms (the masses with a minus sign). Other names are ignored.
+     - molecule_format : forwarded to `lines2bonds` when 'bond_type' is requested.
+    Returns the `mp.SimplexTreeMulti`, after `make_filtration_non_decreasing`.
+    """
+    if isinstance(path, mda.Universe):
+        molecule = path
+    else:
+        molecule = mda.Universe(path)
+    num_filtrations = len(filtrations)
+    vertex_ids = molecule.atoms.indices.reshape(1, -1)
+    bond_ids = molecule.bonds.dump_contents().T
+    st = mp.SimplexTreeMulti(num_parameters=num_filtrations)
+    ## Edges filtration: stays at -inf on the parameters that are not edge based
+    edge_values = (
+        np.zeros((bond_ids.shape[1], num_filtrations), dtype=np.float32) - np.inf
+    )
+    for column, name in enumerate(filtrations):
+        if name == "bond_length":
+            edge_values[:, column] = molecule.bonds.bonds()
+        elif name == "bond_type":
+            edge_values[:, column] = lines2bonds(
+                mol=molecule, molecule_format=molecule_format
+            )
+    ## Nodes filtration: every vertex starts at the smallest edge value of each parameter (better than - np.inf)
+    vertex_values = np.zeros(
+        (vertex_ids.shape[1], num_filtrations), dtype=np.float32
+    ) + edge_values.min(axis=0)
+    st.insert_batch(vertex_ids, vertex_values)
+    st.insert_batch(bond_ids, edge_values)
+    for column, name in enumerate(filtrations):
+        if name == "charge":
+            st.fill_lowerstar(molecule.atoms.charges, parameter=column)
+        elif name == "atomic_mass":
+            masses = molecule.atoms.masses
+            missing = masses == 0
+            if np.any(missing):  # guess if necessary
+                masses[missing] = guess_masses(molecule.atoms.types)[missing]
+            st.fill_lowerstar(-masses, parameter=column)
+    st.make_filtration_non_decreasing()  # Necessary ?
+    return st
+def _mol2ripsst(
+    path: str,
+    filtrations: Iterable[str],
+    threshold=np.inf,
+    bond_types: list = ["ar", "am", 3, 2, 1, 0],
+):
+    """
+    Builds a multiparameter simplex tree from the Rips complex of the atom positions of a molecule.
+     - path : anything accepted by `mda.Universe`, or an already loaded `mda.Universe`.
+     - filtrations : names of the filtrations; 'bond_length' has to be the first one. Available ones : 'bond_type', 'atomic_mass', 'charge', 'bond_length'; any other name raises an Exception.
+     - threshold : max edge length of the Rips complex.
+     - bond_types : forwarded to `lines2bonds`; has to contain 0 (the 'no bond' label) when 'bond_type' is requested.
+    Returns the `mp.SimplexTreeMulti`, after `make_filtration_non_decreasing`.
+    """
+    import gudhi as gd
+    assert "bond_length" == filtrations[0], "Bond length has to be first for rips."
+    if isinstance(path, mda.Universe):
+        molecule = path
+    else:
+        molecule = mda.Universe(path)
+    num_parameters = len(filtrations)
+    rips = gd.RipsComplex(points=molecule.atoms.positions, max_edge_length=threshold)
+    st_rips = rips.create_simplex_tree()
+    # Default values of the parameters that follow the Rips one.
+    # NOTE(review): the 'no bond' default is the position of 0 in bond_types, while lines2bonds encodes with a LabelEncoder; confirm that both agree.
+    default_values = []
+    for name in filtrations[1:]:
+        if name == "bond_type":
+            default_values.append(bond_types.index(0))
+        else:
+            default_values.append(-np.inf)
+    st = mp.SimplexTreeMulti(
+        st_rips,
+        num_parameters=num_parameters,
+        default_values=default_values,
+    )
+    ## Edges filtration, on the bonds of the molecule only
+    mol_bonds = molecule.bonds.indices.T
+    bond_values = (
+        np.zeros((mol_bonds.shape[1], num_parameters), dtype=np.float32) - np.inf
+    )
+    for i, filtration in enumerate(filtrations):
+        if filtration == "bond_type":
+            bond_values[:, i] = lines2bonds(mol=molecule, bond_types=bond_types)
+        elif filtration == "bond_length":
+            bond_values[:, i] = [st_rips.filtration(s) for s in mol_bonds.T]
+        elif filtration not in ("atomic_mass", "charge"):  # those two are vertex based
+            raise Exception(
+                f"Invalid filtration {filtration}. Available ones : bond_type, atomic_mass, charge, bond_length."
+            )
+    st.assign_batch_filtration(mol_bonds, bond_values, propagate=False)
+    # every vertex gets the smallest bond value of each parameter
+    min_filtration = bond_values.min(axis=0)
+    vertex_ids = np.asarray([list(range(st.num_vertices))], dtype=int)
+    vertex_values = np.asarray([min_filtration] * st.num_vertices, dtype=np.float32)
+    st.assign_batch_filtration(vertex_ids, vertex_values, propagate=False)
+    ## Nodes filtration
+    for i, filtration in enumerate(filtrations):
+        if filtration == "charge":
+            st.fill_lowerstar(molecule.atoms.charges, parameter=i)
+        elif filtration == "atomic_mass":
+            masses = molecule.atoms.masses
+            missing = masses == 0
+            if np.any(missing):  # guess if necessary
+                masses[missing] = guess_masses(molecule.atoms.types)[missing]
+            st.fill_lowerstar(-masses, parameter=i)
+    st.make_filtration_non_decreasing()  # Necessary ?
+    return st
+class Molecule2SimplexTree(BaseEstimator, TransformerMixin):
+    """
+    Transforms a list of MDA-compatible files into a list of multiparameter simplextrees
+    Input
+    -----
+     X: Iterable[path_to_files:str]
+    Output
+    ------
+     One single-element list [multipers.SimplexTreeMulti] per molecule, or the corresponding joblib delayed tasks if delayed is true.
+    Parameters
+    ----------
+     - delayed : bool. If true, transform returns the joblib delayed tasks instead of running them.
+     - filtrations : list of filtration names. Available ones : 'charge', 'atomic_mass', 'bond_length', 'bond_type'. With graph=True, other names are ignored.
+     - graph : bool. If true, will use the graph given by the molecule, otherwise, a Rips Complex based on the distance.
+     In that case 'bond_length' has to be the first filtration (it is the Rips parameter).
+     - n_jobs : number of (thread) jobs given to joblib.
+    """
+    def __init__(
+        self,
+        delayed: bool = False,
+        filtrations: Iterable[str] = [],
+        graph: bool = True,
+        n_jobs: int = 1,
+    ) -> None:
+        super().__init__()
+        self.delayed = delayed
+        self.n_jobs = n_jobs
+        self.filtrations = filtrations
+        self.graph = graph
+        # lowercased file extension of the molecules, detected by fit; None until then
+        self._molecule_format = None
+    def fit(self, X: Iterable[str], y=None):
+        """
+        Detects the molecule format from the first element of X; does nothing if X is empty.
+        """
+        if len(X) == 0:
+            return self
+        test_mol = mda.Universe(X[0])
+        self._molecule_format = test_mol.filename.split(".")[-1].lower()
+        return self
+    def transform(self, X: Iterable[str]):
+        """
+        Turns each molecule of X into a single-element list holding its simplextree.
+        The format detected by fit (if any) is forwarded to the graph builder; the Rips builder does not take one.
+        """
+        if self.graph:
+            # fit used to compute the format without it ever being used; None falls back to each file's extension
+            def to_simplex_tree(path_to_mol2_file):
+                return [
+                    _mol2graphst(
+                        path=path_to_mol2_file,
+                        filtrations=self.filtrations,
+                        molecule_format=self._molecule_format,
+                    )
+                ]
+        else:
+            def to_simplex_tree(path_to_mol2_file):
+                return [
+                    _mol2ripsst(path=path_to_mol2_file, filtrations=self.filtrations)
+                ]
+        if self.delayed:
+            return [delayed(to_simplex_tree)(path) for path in X]
+        return Parallel(n_jobs=self.n_jobs, prefer="threads")(
+            delayed(to_simplex_tree)(path) for path in X
+        )