multipers 2.0.0__cp312-cp312-macosx_13_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of multipers might be problematic.

Files changed (78)
  1. multipers/.dylibs/libc++.1.0.dylib +0 -0
  2. multipers/.dylibs/libtbb.12.12.dylib +0 -0
  3. multipers/.dylibs/libtbbmalloc.2.12.dylib +0 -0
  4. multipers/__init__.py +11 -0
  5. multipers/_signed_measure_meta.py +268 -0
  6. multipers/_slicer_meta.py +171 -0
  7. multipers/data/MOL2.py +350 -0
  8. multipers/data/UCR.py +18 -0
  9. multipers/data/__init__.py +1 -0
  10. multipers/data/graphs.py +466 -0
  11. multipers/data/immuno_regions.py +27 -0
  12. multipers/data/minimal_presentation_to_st_bf.py +0 -0
  13. multipers/data/pytorch2simplextree.py +91 -0
  14. multipers/data/shape3d.py +101 -0
  15. multipers/data/synthetic.py +68 -0
  16. multipers/distances.py +198 -0
  17. multipers/euler_characteristic.pyx +132 -0
  18. multipers/filtration_conversions.pxd +229 -0
  19. multipers/filtrations.pxd +225 -0
  20. multipers/function_rips.cpython-312-darwin.so +0 -0
  21. multipers/function_rips.pyx +105 -0
  22. multipers/grids.cpython-312-darwin.so +0 -0
  23. multipers/grids.pyx +281 -0
  24. multipers/hilbert_function.pyi +46 -0
  25. multipers/hilbert_function.pyx +153 -0
  26. multipers/io.cpython-312-darwin.so +0 -0
  27. multipers/io.pyx +571 -0
  28. multipers/ml/__init__.py +0 -0
  29. multipers/ml/accuracies.py +90 -0
  30. multipers/ml/convolutions.py +532 -0
  31. multipers/ml/invariants_with_persistable.py +79 -0
  32. multipers/ml/kernels.py +176 -0
  33. multipers/ml/mma.py +659 -0
  34. multipers/ml/one.py +472 -0
  35. multipers/ml/point_clouds.py +238 -0
  36. multipers/ml/signed_betti.py +50 -0
  37. multipers/ml/signed_measures.py +1542 -0
  38. multipers/ml/sliced_wasserstein.py +461 -0
  39. multipers/ml/tools.py +113 -0
  40. multipers/mma_structures.cpython-312-darwin.so +0 -0
  41. multipers/mma_structures.pxd +127 -0
  42. multipers/mma_structures.pyx +2433 -0
  43. multipers/multiparameter_edge_collapse.py +41 -0
  44. multipers/multiparameter_module_approximation.cpython-312-darwin.so +0 -0
  45. multipers/multiparameter_module_approximation.pyx +211 -0
  46. multipers/pickle.py +53 -0
  47. multipers/plots.py +326 -0
  48. multipers/point_measure_integration.cpython-312-darwin.so +0 -0
  49. multipers/point_measure_integration.pyx +139 -0
  50. multipers/rank_invariant.cpython-312-darwin.so +0 -0
  51. multipers/rank_invariant.pyx +229 -0
  52. multipers/simplex_tree_multi.cpython-312-darwin.so +0 -0
  53. multipers/simplex_tree_multi.pxd +129 -0
  54. multipers/simplex_tree_multi.pyi +715 -0
  55. multipers/simplex_tree_multi.pyx +4655 -0
  56. multipers/slicer.cpython-312-darwin.so +0 -0
  57. multipers/slicer.pxd +781 -0
  58. multipers/slicer.pyx +3393 -0
  59. multipers/tensor.pxd +13 -0
  60. multipers/test.pyx +44 -0
  61. multipers/tests/__init__.py +40 -0
  62. multipers/tests/old_test_rank_invariant.py +91 -0
  63. multipers/tests/test_diff_helper.py +74 -0
  64. multipers/tests/test_hilbert_function.py +82 -0
  65. multipers/tests/test_mma.py +51 -0
  66. multipers/tests/test_point_clouds.py +59 -0
  67. multipers/tests/test_python-cpp_conversion.py +82 -0
  68. multipers/tests/test_signed_betti.py +181 -0
  69. multipers/tests/test_simplextreemulti.py +98 -0
  70. multipers/tests/test_slicer.py +63 -0
  71. multipers/torch/__init__.py +1 -0
  72. multipers/torch/diff_grids.py +217 -0
  73. multipers/torch/rips_density.py +257 -0
  74. multipers-2.0.0.dist-info/LICENSE +21 -0
  75. multipers-2.0.0.dist-info/METADATA +29 -0
  76. multipers-2.0.0.dist-info/RECORD +78 -0
  77. multipers-2.0.0.dist-info/WHEEL +5 -0
  78. multipers-2.0.0.dist-info/top_level.txt +1 -0
multipers/data/MOL2.py ADDED
@@ -0,0 +1,350 @@
+ import numpy as np
+ from os.path import expanduser
+ import pandas as pd
+ from sklearn.preprocessing import LabelEncoder
+ from os import listdir
+ import os
+ import MDAnalysis as mda
+ import matplotlib.pyplot as plt
+ from MDAnalysis.topology.guessers import guess_masses
+ import multipers as mp
+ # from numba import njit
+ from tqdm import tqdm
+ from typing import Iterable
+ from joblib import Parallel, delayed
+ from sklearn.base import BaseEstimator, TransformerMixin
+
+
+ DATASET_PATH = expanduser("~/Datasets/")
+ JC_path = DATASET_PATH + "Cleves-Jain/"
+ DUDE_path = DATASET_PATH + "DUD-E/"
+
+
+ # pathes = get_data_path()
+ # imgs = apply_pipeline(pathes=pathes, pipeline=pipeline_img)
+ # distances_to_letter, ytest = img_distances(imgs)
+
+
+ def _get_mols_in_path(folder):
+     with open(folder + "/TargetList", "r") as f:
+         train_data = [folder + "/" + mol.strip() for mol in f.readlines()]
+     criterion = lambda dataset: dataset.endswith(".mol2") and not dataset.startswith("final") and dataset not in train_data
+     test_data = [folder + "/" + dataset for dataset in listdir(folder) if criterion(folder + "/" + dataset)]
+     return train_data, test_data
+ def get_data_path_JC(type="dict"):
+     if type == "dict": out = {}
+     elif type == "list": out = []
+     else: raise TypeError(f"Type {type} not supported")
+     for stuff in listdir(JC_path):
+         if stuff.startswith("target_"):
+             current_letter = stuff[-1]
+             to_add = _get_mols_in_path(JC_path + stuff)
+             if type == "dict": out[current_letter] = to_add
+             elif type == "list": out.append(to_add)
+     decoy_folder = JC_path + "RognanRing850/"
+     to_add = [decoy_folder + mol for mol in listdir(decoy_folder) if mol.endswith(".mol2")]
+     if type == "dict": out["decoy"] = to_add
+     elif type == "list": out.append(to_add)
+     return out
+ def get_all_JC_path():
+     out = []
+     for stuff in listdir(JC_path):
+         if stuff.startswith("target_"):
+             train_data, test_data = _get_mols_in_path(JC_path + stuff)
+             out += train_data
+             out += test_data
+     decoy_folder = JC_path + "RognanRing850/"
+     out += [decoy_folder + mol for mol in listdir(decoy_folder) if mol.endswith(".mol2")]
+     return out
+
+
+ def split_multimol(path: str, mol_name: str, out_folder_name: str = "splitted", enforce_charges: bool = False):
+     with open(path + mol_name, "r") as f:
+         lines = f.readlines()
+     splitted_mols = []
+     index = 0
+     for i, line in enumerate(lines):
+         is_last = i == len(lines) - 1
+         if line.strip() == "@<TRIPOS>MOLECULE" or is_last:
+             if i != index:
+                 molecule = "".join(lines[index:i + is_last])
+                 if enforce_charges:
+                     # print(f"Replaced molecule {i}")
+                     molecule = molecule.replace("NO_CHARGES", "USER_CHARGES")
+                     # print(molecule)
+                     # return
+                 index = i
+                 splitted_mols.append(molecule)
+     if not os.path.exists(path + out_folder_name):
+         os.mkdir(path + out_folder_name)
+     for i, mol in enumerate(splitted_mols):
+         with open(path + out_folder_name + f"/{i}.mol2", "w") as f:
+             f.write(mol)
+     return [path + out_folder_name + f"/{i}.mol2" for i in range(len(splitted_mols))]
+
+ # @njit(parallel=True)
+ def apply_pipeline(pathes: dict, pipeline):
+     img_dict = {}
+     for key, value in tqdm(pathes.items(), desc="Applying pipeline"):
+         if len(key) == 1:
+             train_paths, test_paths = value
+             train_imgs = pipeline.transform(train_paths)
+             test_imgs = pipeline.transform(test_paths)
+             img_dict[key] = (train_imgs, test_imgs)
+         else:
+             assert key == "decoy"
+             img_dict[key] = pipeline.transform(value)
+     return img_dict
+
+ from sklearn.metrics import pairwise_distances
+ def img_distances(img_dict: dict):
+     distances_to_anchors = []
+     ytest = []
+     decoy_list = img_dict["decoy"]
+     for letter, imgs in img_dict.items():
+         if len(letter) != 1: continue  # decoy
+         xtrain, xtest = imgs
+         assert len(xtest) > 0
+         train_data, test_data = xtrain, np.concatenate([xtest, decoy_list])
+         D = pairwise_distances(train_data, test_data)
+         distances_to_anchors.append(D)
+         letter_ytest = np.array([letter]*len(xtest) + ['0']*len(decoy_list), dtype="<U1")
+         ytest.append(letter_ytest)
+     return distances_to_anchors, ytest
+
+ def get_EF_vector_from_distances(distances, ytest, alpha=0.05):
+     EF = []
+     for distance_to_anchors, letter_ytest in zip(distances, ytest):
+         indices = np.argsort(distance_to_anchors, axis=1)
+         n = indices.shape[1]
+         n_max = int(alpha*n)
+         good_indices = (letter_ytest[indices[:, :n_max]] == letter_ytest[0])  ## assumes that letter_ytest[0] is a good letter
+         EF_letter = good_indices.sum(axis=1) / (letter_ytest == letter_ytest[0]).sum()
+         EF_letter /= alpha
+         EF.append(EF_letter.mean())
+     return np.mean(EF)
+
+ def EF_from_distance_matrix(distances: np.ndarray, labels: list | np.ndarray, alpha: float, anchors_in_test=True):
+     """
+     Computes the Enrichment Factor from a distance matrix and its labels.
+     - The first axis of the distance matrix indexes the anchors on which to compute the EF.
+     - The second axis indexes the test set. For convenience, anchors can be put in the test set if the flag anchors_in_test is set to True.
+     - labels is an array of bools giving the labels of the test axis of the distance matrix.
+     - alpha: the EF alpha parameter.
+     """
+     n = len(labels)
+     n_max = int(alpha*n)
+     indices = np.argsort(distances, axis=1)
+     EF_ = [((labels[idx[:n_max]]).sum() - anchors_in_test) / (labels.sum() - anchors_in_test) for idx in indices]
+     return np.mean(EF_) / alpha
+
+ def EF_AUC(distances: np.ndarray, labels: np.ndarray, anchors_in_test=0):
+     if distances.ndim == 1:
+         distances = distances[None, :]
+     assert distances.ndim == 2
+     indices = np.argsort(distances, axis=1)
+     out = []
+     for i in range(1, distances.size):
+         proportion_of_good_indices = (labels[indices[:, :i]].sum(axis=1).mean() - anchors_in_test) / min(i, labels.sum() - anchors_in_test)
+         out.append(proportion_of_good_indices)
+     # print(out)
+     return np.mean(out)
+
+
+ def theorical_max_EF(distances, labels, alpha):
+     n = len(labels)
+     n_max = int(alpha*n)
+     num_true_labels = np.sum(labels == labels[0])  ## if labels are not True/False, assumes that the first one is a good one
+     return min(n_max, num_true_labels) / alpha
+
+
+ def theorical_max_EF_from_distances(list_of_distances, list_of_labels, alpha):
+     return np.mean([theorical_max_EF(distances, labels, alpha) for distances, labels in zip(list_of_distances, list_of_labels)])
+
+ def plot_EF_from_distances(alphas=[0.01, 0.02, 0.05, 0.1], EF=EF_from_distance_matrix, plot: bool = True):
+     y = np.round([EF(alpha=alpha) for alpha in alphas], decimals=2)
+     if plot:
+         _alphas = np.linspace(0.01, 1., 100)
+         plt.figure()
+         plt.plot(_alphas, [EF(alpha=alpha) for alpha in _alphas])
+         plt.scatter(alphas, y, c='r')
+         plt.title("Enrichment Factor")
+         plt.xlabel(r"$\alpha$" + f" = {alphas}")
+         plt.ylabel(r"$\mathrm{EF}_\alpha$" + f" = {y}")
+     return y
+
+
+ def lines2bonds(mol: mda.Universe, bond_types=['ar', 'am', 3, 2, 1, 0], molecule_format=None):
+     extension = mol.filename.split('.')[-1].lower() if molecule_format is None else molecule_format
+     match extension:
+         case 'mol2':
+             out = lines2bonds_MOL2(mol)['bond_type']
+         case 'pdb':
+             out = lines2bonds_PDB(mol)
+         case _:
+             raise Exception('Invalid or unsupported molecule format.')
+     return LabelEncoder().fit(bond_types).transform(out)
+
+
+ def lines2bonds_MOL2(mol: mda.Universe):
+     with open(mol.filename, "r") as f:
+         _lines = f.readlines()
+     out = []
+     index = 0
+     while index < len(_lines) and _lines[index].strip() != "@<TRIPOS>BOND":
+         index += 1
+     index += 1
+     while index < len(_lines) and _lines[index].strip()[0] != "@":
+         line = _lines[index].strip().split(" ")
+         for j, truc in enumerate(line):
+             line[j] = truc.strip()
+         out.append([stuff for stuff in line if len(stuff) > 0])
+         index += 1
+     out = pd.DataFrame(out, columns=["bond_id", "atom1", "atom2", "bond_type"])
+     out.set_index(["bond_id"], inplace=True)
+     return out
+
+
+ def lines2bonds_PDB(mol: mda.Universe):
+     raise Exception('Not yet implemented.')
+
+ def _mol2graphst(path: str | mda.Universe, filtrations: Iterable[str], molecule_format=None):
+     molecule = path if isinstance(path, mda.Universe) else mda.Universe(path)
+
+     num_filtrations = len(filtrations)
+     nodes = molecule.atoms.indices.reshape(1, -1)
+     edges = molecule.bonds.dump_contents().T
+     num_vertices = nodes.shape[1]
+     num_edges = edges.shape[1]
+
+     st = mp.SimplexTreeMulti(num_parameters=num_filtrations)
+
+     ## Edges filtration
+     # edges = np.array(bonds_df[["atom1", "atom2"]]).T
+     edges_filtration = np.zeros((num_edges, num_filtrations), dtype=np.float32) - np.inf
+     for i, filtration in enumerate(filtrations):
+         match filtration:
+             case "bond_length":
+                 bond_lengths = molecule.bonds.bonds()
+                 edges_filtration[:, i] = bond_lengths
+             case "bond_type":
+                 bond_types = lines2bonds(mol=molecule, molecule_format=molecule_format)
+                 edges_filtration[:, i] = bond_types
+             case _:
+                 pass
+
+     ## Nodes filtration
+     nodes_filtrations = np.zeros((num_vertices, num_filtrations), dtype=np.float32) + np.min(edges_filtration, axis=0)  # better than -np.inf
+     st.insert_batch(nodes, nodes_filtrations)
+
+     st.insert_batch(edges, edges_filtration)
+     for i, filtration in enumerate(filtrations):
+         match filtration:
+             case "charge":
+                 charges = molecule.atoms.charges
+                 st.fill_lowerstar(charges, parameter=i)
+             case "atomic_mass":
+                 masses = molecule.atoms.masses
+                 null_indices = masses == 0
+                 if np.any(null_indices):  # guess if necessary
+                     masses[null_indices] = guess_masses(molecule.atoms.types)[null_indices]
+                 st.fill_lowerstar(-masses, parameter=i)
+             case _:
+                 pass
+     st.make_filtration_non_decreasing()  # Necessary?
+     return st
+
+
+ def _mol2ripsst(path: str, filtrations: Iterable[str], threshold=np.inf, bond_types: list = ['ar', 'am', 3, 2, 1, 0]):
+     import gudhi as gd
+     assert 'bond_length' == filtrations[0], "Bond length has to be first for rips."
+     molecule = path if isinstance(path, mda.Universe) else mda.Universe(path)
+     num_parameters = len(filtrations)
+     st_rips = gd.RipsComplex(points=molecule.atoms.positions, max_edge_length=threshold).create_simplex_tree()
+     st = mp.SimplexTreeMulti(st_rips, num_parameters=num_parameters,
+         default_values=[bond_types.index(0) if f == "bond_type" else -np.inf for f in filtrations[1:]]  # the 0 index is the label of 'no bond' in bond_types
+     )
+
+     ## Edges filtration
+     mol_bonds = molecule.bonds.indices.T
+     edges_filtration = np.zeros((mol_bonds.shape[1], num_parameters), dtype=np.float32) - np.inf
+     for i, filtration in enumerate(filtrations):
+         match filtration:
+             case "bond_type":
+                 edges_filtration[:, i] = lines2bonds(mol=molecule, bond_types=bond_types)
+             case "atomic_mass":
+                 continue
+             case "charge":
+                 continue
+             case 'bond_length':
+                 edges_filtration[:, i] = [st_rips.filtration(s) for s in mol_bonds.T]
+             case _:
+                 raise Exception(f"Invalid filtration {filtration}. Available ones: bond_type, atomic_mass, charge, bond_length.")
+     st.assign_batch_filtration(mol_bonds, edges_filtration, propagate=False)
+     min_filtration = edges_filtration.min(axis=0)
+     st.assign_batch_filtration(np.asarray([list(range(st.num_vertices))], dtype=int), np.asarray([min_filtration]*st.num_vertices, dtype=np.float32), propagate=False)
+     ## Nodes filtration
+     for i, filtration in enumerate(filtrations):
+         match filtration:
+             case "charge":
+                 charges = molecule.atoms.charges
+                 st.fill_lowerstar(charges, parameter=i)
+             case "atomic_mass":
+                 masses = molecule.atoms.masses
+                 null_indices = masses == 0
+                 if np.any(null_indices):  # guess if necessary
+                     masses[null_indices] = guess_masses(molecule.atoms.types)[null_indices]
+                 # print(masses)
+                 st.fill_lowerstar(-masses, parameter=i)
+             case _:
+                 pass
+     st.make_filtration_non_decreasing()  # Necessary?
+     return st
+
+
+ class Molecule2SimplexTree(BaseEstimator, TransformerMixin):
+     """
+     Transforms a list of MDAnalysis-compatible files into a list of multiparameter simplex trees.
+
+     Input
+     -----
+     X: Iterable[path_to_files:str]
+
+     Output
+     ------
+     Iterable[multipers.SimplexTreeMulti]
+
+     Parameters
+     ----------
+     - filtrations : list of filtration names. Available ones: 'charge', 'atomic_mass', 'bond_length', 'bond_type'. Others are ignored.
+     - graph : bool. If True, uses the graph given by the molecule; otherwise, a Rips complex based on the interatomic distances.
+       In that case bond_length is ignored (it is the first parameter).
+     """
+     def __init__(self,
+                  delayed: bool = False,
+                  filtrations: Iterable[str] = [],
+                  graph: bool = True,
+                  n_jobs: int = 1) -> None:
+         super().__init__()
+         self.delayed = delayed
+         self.n_jobs = n_jobs
+         self.filtrations = filtrations
+         self.graph = graph
+         self._molecule_format = None
+         return
+     def fit(self, X: Iterable[str], y=None):
+         if len(X) == 0: return self
+         test_mol = mda.Universe(X[0])
+         self._molecule_format = test_mol.filename.split('.')[-1].lower()
+         return self
+     def transform(self, X: Iterable[str]):
+         _to_simplextree = _mol2graphst if self.graph else _mol2ripsst
+         to_simplex_tree = lambda path_to_mol2_file: _to_simplextree(path=path_to_mol2_file, filtrations=self.filtrations)
+         if self.delayed:
+             return [delayed(to_simplex_tree)(path) for path in X]
+         return Parallel(n_jobs=self.n_jobs, prefer="threads")(delayed(to_simplex_tree)(path) for path in X)
+
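For readers skimming the diff, here is a minimal usage sketch combining the Molecule2SimplexTree transformer and the EF_from_distance_matrix helper defined in this file. The file names, filtration choices, and the synthetic distance matrix below are illustrative assumptions, not part of the package:

import numpy as np
from multipers.data.MOL2 import Molecule2SimplexTree, EF_from_distance_matrix

# One multi-filtered simplex tree per molecule, built from its bond graph and
# filtered by bond length and atomic charge.
m2st = Molecule2SimplexTree(filtrations=["bond_length", "charge"], graph=True)
# simplex_trees = m2st.fit_transform(["ligand_0.mol2", "ligand_1.mol2"])  # hypothetical .mol2 files

# Enrichment factor at alpha = 0.05 from an (anchors x test) distance matrix
# and boolean labels marking the actives (synthetic data for illustration).
rng = np.random.default_rng(0)
distances = rng.random((3, 100))
labels = np.zeros(100, dtype=bool)
labels[:10] = True
print(EF_from_distance_matrix(distances, labels, alpha=0.05, anchors_in_test=False))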
multipers/data/UCR.py ADDED
@@ -0,0 +1,18 @@
+ import numpy as np
+ from os.path import expanduser
+ import pandas as pd
+ from sklearn.preprocessing import LabelEncoder
+
+ def get(dataset: str = "UCR/Coffee", test: bool = False, DATASET_PATH: str = expanduser("~/Datasets/"), dim=3, delay=1, skip=1):
+     from gudhi.point_cloud.timedelay import TimeDelayEmbedding
+     dataset_path = DATASET_PATH + dataset + "/" + dataset[4:]
+     dataset_path += "_TEST.tsv" if test else "_TRAIN.tsv"
+     data = np.array(pd.read_csv(dataset_path, delimiter='\t', header=None, index_col=None))
+     Y = LabelEncoder().fit_transform(data[:, 0])
+     data = data[:, 1:]
+     tde = TimeDelayEmbedding(dim=dim, delay=delay, skip=skip).transform(data)
+     return tde, Y
+ def get_train(*args, **kwargs):
+     return get(*args, **kwargs, test=False)
+ def get_test(*args, **kwargs):
+     return get(*args, **kwargs, test=True)
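A short, hypothetical usage sketch of the loader above; it assumes the UCR archive is unpacked under ~/Datasets/UCR/ with the standard _TRAIN.tsv/_TEST.tsv layout:

from multipers.data.UCR import get_train, get_test

# Each series becomes a 3-dimensional time-delay embedding (a point cloud),
# and class labels are integer-encoded.
Xtrain, ytrain = get_train(dataset="UCR/Coffee", dim=3, delay=1, skip=1)
Xtest, ytest = get_test(dataset="UCR/Coffee", dim=3, delay=1, skip=1)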
multipers/data/__init__.py ADDED
@@ -0,0 +1 @@
+ from .synthetic import *