pico-ml 2.0.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. pico/__init__.py +3 -0
  2. pico/__main__.py +3 -0
  3. pico/cli/__init__.py +2 -0
  4. pico/cli/main.py +117 -0
  5. pico/conf/SupportedCV.py +17 -0
  6. pico/conf/SupportedModels.py +73 -0
  7. pico/conf/algo_sklearn.json +51 -0
  8. pico/conf/parameters.py +14 -0
  9. pico/domain/ClassificationDesign.py +107 -0
  10. pico/domain/Controller.py +397 -0
  11. pico/domain/DataMatrix.py +147 -0
  12. pico/domain/ExperimentDTO.py +17 -0
  13. pico/domain/MetaData.py +229 -0
  14. pico/domain/MetaboExperiment.py +696 -0
  15. pico/domain/MetaboModel.py +53 -0
  16. pico/domain/ModelFactory.py +45 -0
  17. pico/domain/Results.py +602 -0
  18. pico/domain/SplitGroup.py +202 -0
  19. pico/domain/__init__.py +9 -0
  20. pico/domain/dumps/metadata/.gitkeep +0 -0
  21. pico/domain/dumps/splits/.gitkeep +0 -0
  22. pico/service/DataFormat.py +180 -0
  23. pico/service/ExperimentDesign.py +30 -0
  24. pico/service/LoggerConfig.py +150 -0
  25. pico/service/Plots.py +472 -0
  26. pico/service/RunMLalgo.py +93 -0
  27. pico/service/SamplesPairing.py +390 -0
  28. pico/service/Utils.py +497 -0
  29. pico/service/__init__.py +7 -0
  30. pico/ui/__init__.py +1 -0
  31. pico/ui/app.py +145 -0
  32. pico/ui/assets/000_Stylesheet.css +464 -0
  33. pico/ui/assets/DecisionTree.png +0 -0
  34. pico/ui/assets/Figure_home_wider.png +0 -0
  35. pico/ui/assets/favicon.ico +0 -0
  36. pico/ui/assets/help_icon.png +0 -0
  37. pico/ui/assets/help_icon.svg +15 -0
  38. pico/ui/assets/update_figure_steps_MeDIC_4.svg +1 -0
  39. pico/ui/tabs/AggregatedResultsTab.py +394 -0
  40. pico/ui/tabs/InfoTab.py +440 -0
  41. pico/ui/tabs/InterpretTab.py +21 -0
  42. pico/ui/tabs/MLTab.py +487 -0
  43. pico/ui/tabs/MetaTab.py +23 -0
  44. pico/ui/tabs/ResultsTab.py +1062 -0
  45. pico/ui/tabs/SplitsTab.py +1227 -0
  46. pico/ui/tabs/__init__.py +6 -0
  47. pico/ui/tabs/utils.py +101 -0
  48. pico_ml-2.0.0.dist-info/METADATA +86 -0
  49. pico_ml-2.0.0.dist-info/RECORD +52 -0
  50. pico_ml-2.0.0.dist-info/WHEEL +4 -0
  51. pico_ml-2.0.0.dist-info/entry_points.txt +2 -0
  52. pico_ml-2.0.0.dist-info/licenses/LICENSE +437 -0
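Since a wheel is a plain zip archive, the 52-file listing above can be reproduced with the Python standard library. A minimal sketch, assuming the wheel has already been downloaded into the current directory:

    import zipfile

    # Print every file shipped in the wheel, mirroring the "Files changed (52)" list.
    with zipfile.ZipFile("pico_ml-2.0.0-py3-none-any.whl") as whl:
        for info in whl.infolist():
            print(info.filename, info.file_size)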
pico/service/SamplesPairing.py
@@ -0,0 +1,390 @@
+ from sklearn.model_selection import train_test_split
+ import glob, random
+ import numpy as np
+
+
+ class SamplesPairing:
+     def __init__(
+         self, pairings, sample_names, targets, IDs, proportion_in_test, nbr_splits
+     ):
+         self.pairings = pairings
+         self.sample_names = sample_names
+         self.targets = targets
+         self.ids = IDs
+         self.proportion = float(proportion_in_test)
+         self.nbr_splits = int(nbr_splits)
+         self.dict_splits = {}
+
+         self.names_dict = {n: idx for idx, n in enumerate(self.sample_names)}
+
+     def split(self):
+         """
+         Build the train/test splits. We assume there can be at most two types
+         of pairing applied simultaneously; each pairing is either the string
+         "no" or a list of filename patterns, the first pattern marking the
+         base/reference group.
+         :return: Nothing
+         """
+         if not self._is_there_pairing_to_do():
+
+             ###### Build X and y ######
+
+             X = []
+             y = []
+             for s in self.sample_names:  # iterate over each sample name
+                 for i, id in enumerate(self.ids):  # iterate over each unique ID
+                     if id in s:  # check whether the ID appears in the sample name
+                         X.append(s)
+                         y.append(
+                             self.targets[i]
+                         )  # append the label matching this ID (and hence this sample)
+
+             ###### Create the splits ######
+
+             for i in range(self.nbr_splits):
+                 X_train, X_test, y_train, y_test = train_test_split(
+                     X, y, test_size=self.proportion, random_state=i, stratify=y
+                 )
+                 # convert sample names to indices so it's easier to reconstruct later
+                 # X_train = [self.names_dict[i] for i in X_train]
+                 # X_test = [self.names_dict[i] for i in X_test]
+
+                 ###### Save the splits in the dict ######
+
+                 self.dict_splits["split{}".format(i)] = [
+                     X_train,
+                     X_test,
+                     y_train,
+                     y_test,
+                 ]
+         else:
+             self._test_pairing_patterns_spelling()
+             # select base files to do the splitting on
+
+             ###### Build X and y ######
+
+             X = []
+             y = []
+             first_patterns = [i[0] for i in self.pairings]
+             for s in self.sample_names:  # iterate over each sample name
+                 ok = True
+                 for p in first_patterns:  # iterate over the first pattern of each pairing
+                     if (
+                         p not in s
+                     ):  # check whether the pattern (or one of the two) is missing
+                         ok = False
+                 # if it is present, check its ID to match its target
+                 if (
+                     ok
+                 ):  # if the pattern(s) are present, add the sample to the base/reference group
+                     for i, id in enumerate(self.ids):
+                         if id in s:
+                             X.append(s)
+                             y.append(self.targets[i])
+
+             ###### Create the splits ######
+
+             for i in range(self.nbr_splits):
+                 X_train, X_test, y_train, y_test = train_test_split(
+                     X, y, test_size=self.proportion, random_state=i, stratify=y
+                 )
+
+                 # replace patterns to match the splitting
+                 # create the target lists accordingly
+
+                 # copy first_patterns: the helper below mutates this list in
+                 # place, and first_patterns must stay intact for the next split
+                 pattern_list = list(first_patterns)
+                 idx = len(self.pairings) - 1
+                 new_xy = []
+                 if idx + 1 == 1:  # if there is one pairing to do
+                     new_xy = self._iterate_on_pattern_to_get_paired_files(
+                         pattern_list, idx, self.pairings[idx], X_train, X_test
+                     )
+                 elif idx + 1 == 2:  # if there are two pairings to do
+                     X1 = X_train
+                     X2 = X_test
+                     for p in self.pairings[0]:
+                         pattern_list[0] = p
+                         new_sub_xy = self._iterate_on_pattern_to_get_paired_files(
+                             pattern_list, idx, self.pairings[idx], X1, X2
+                         )
+                         X1 = new_sub_xy[0][0]
+                         X2 = new_sub_xy[0][1]
+
+                         new_xy.extend(
+                             new_sub_xy
+                         )  # a list containing all [xtrain, xtest, ytrain, ytest] groups
+
+                 ###### Save the splits in the dict ######
+
+                 # shuffle
+                 new_xy = np.swapaxes(
+                     new_xy, 0, 1
+                 )  # now in the shape of 4 big lists (xtrain, xtest, ytrain, ytest)
+
+                 # flatten the per-pattern groups, then shuffle sample-wise
+                 # (the commented-out legacy code below concatenates the groups the same way)
+                 new_X_train = np.concatenate(new_xy[0])
+                 new_y_train = np.concatenate(new_xy[2])
+                 train_zip = list(zip(new_X_train, new_y_train))
+                 random.Random(13).shuffle(train_zip)
+                 X_train, y_train = zip(*train_zip)
+
+                 new_X_test = np.concatenate(new_xy[1])
+                 new_y_test = np.concatenate(new_xy[3])
+                 test_zip = list(zip(new_X_test, new_y_test))
+                 random.Random(13).shuffle(test_zip)
+                 X_test, y_test = zip(*test_zip)
+
+                 # save to dict
+                 X_train = [self.names_dict[i] for i in X_train]
+                 X_test = [self.names_dict[i] for i in X_test]
+
+                 self.dict_splits["split{}".format(i)] = [
+                     X_train,
+                     X_test,
+                     y_train,
+                     y_test,
+                 ]
+
+     def _is_there_pairing_to_do(self):
+         no_pairing = 0
+         for p in self.pairings:
+             if p == "no":
+                 no_pairing += 1
+         if no_pairing == len(self.pairings):
+             return False
+         else:
+             return True
+
+     def _iterate_on_pattern_to_get_paired_files(self, base, idx, pairing, X1, X2):
+         """
+         :param base: list of patterns, updated in place
+         :param idx: index of the pairing, to fit the shape of base
+         :param pairing: list of patterns for a single pairing
+         :param X1: Xtrain data to replace the pattern in
+         :param X2: Xtest data to replace the pattern in
+         :return: lists of new xtrain, xtest, ytrain and ytest
+         """
+         p1 = base[idx]
+         new_xy = []
+         for pattern in pairing:
+             base[idx] = pattern
+             X1_2, X2_2, y1_2, y2_2 = self._replace_pattern_get_matching_files(
+                 p1, base[idx], X1, X2
+             )
+             new_xy.append([X1_2, X2_2, y1_2, y2_2])
+
+         return new_xy
+
+     def _replace_pattern_get_matching_files(self, p1, p2, Xtrain, Xtest):
+         # swap pattern p1 for p2 in every file name, then rebuild the matching targets
+         Xtrain_2 = [i.replace(p1, p2) for i in Xtrain]
+         Xtest_2 = [i.replace(p1, p2) for i in Xtest]
+         ytrain_2 = []
+         ytest_2 = []
+
+         for file in Xtrain_2:
+             for i, id in enumerate(self.ids):
+                 if id in file.split("/")[-1]:
+                     ytrain_2.append(self.targets[i])
+         for file in Xtest_2:
+             for i, id in enumerate(self.ids):
+                 if id in file.split("/")[-1]:
+                     ytest_2.append(self.targets[i])
+
+         return Xtrain_2, Xtest_2, ytrain_2, ytest_2
+
+     def _test_pairing_patterns_spelling(self):
+         """
+         If the list obtained directly from the directory with glob.glob differs from the list obtained by replacing
+         the pattern in the "base list" (the files containing the first pattern of the pairing), there is a
+         pattern-spelling problem that will cause further trouble in the rest of the processing.
+         :return: nothing
+         """
+         # sample names are expected to be file paths; the directory is taken from the first one
+         path_of_files = self.sample_names[0].split("/")[:-1]
+         path_of_files = "/".join(path_of_files)
+
+         prob_patterns = []
+
+         # TODO
+         # not sure if all the cases are covered, especially the case where the pattern is not found and the
+         # same list is returned unchanged
+         for pairing in self.pairings:
+             if pairing == "no":  # skip disabled pairings
+                 continue
+             p1 = glob.glob(path_of_files + "/*" + pairing[0] + "*")
+             for pattern in pairing:
+                 p = glob.glob(path_of_files + "/*" + pattern + "*")
+                 p_by_replace = [f.replace(pairing[0], pattern) for f in p1]
+
+                 # list.sort() returns None, so compare sorted copies instead
+                 if sorted(p) != sorted(p_by_replace):
+                     prob_patterns.append(pattern)
+
+         if len(prob_patterns) > 0:
+             raise ValueError(
+                 "There is a pattern error, here are the problematic patterns: {}".format(
+                     prob_patterns
+                 )
+             )
+
+
+ # # Conditional statement to handle split creation with pairing(s)
+ # if pairing_pn == "no" and pairing_12 == "no":  # no pairing
+ #     X = []
+ #     y = []
+ #     for file in files_list:
+ #         for i, id in enumerate(uniq_ID):
+ #             if id in file.split("/")[-1]:
+ #                 X.append(file)
+ #                 y.append(targets[i])
+ #
+ #     # Create the splits
+ #     for i in range(nbr_splits):
+ #         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=percent_in_test, random_state=i)
+ #         splits_dict["split{}".format(i)] = [X_train, X_test, y_train, y_test]
+ #
+ # elif pairing_pn != "no" and pairing_12 == "no":  # pairing of positive and negative files
+ #     X_pos = []
+ #     y_pos = []
+ #     for file in files_list:
+ #         if pair_id_pos in file.split("/")[-1]:
+ #             for i, id in enumerate(uniq_ID):
+ #                 if id in file.split("/")[-1]:
+ #                     X_pos.append(file)
+ #                     y_pos.append(targets[i])
+ #
+ #     # Create the splits
+ #     for i in range(nbr_splits):
+ #         X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
+ #             X_pos, y_pos, test_size=percent_in_test, random_state=i)
+ #         X_train_n = [i.replace(pair_id_pos, pair_id_neg) for i in X_train_p]
+ #         X_test_n = [i.replace(pair_id_pos, pair_id_neg) for i in X_test_p]
+ #         y_train_n = []
+ #         y_test_n = []
+ #         for file in X_train_n:
+ #             for i, id in enumerate(uniq_ID):
+ #                 if id in file.split("/")[-1]:
+ #                     y_train_n.append(targets[i])
+ #         for file in X_test_n:
+ #             for i, id in enumerate(uniq_ID):
+ #                 if id in file.split("/")[-1]:
+ #                     y_test_n.append(targets[i])
+ #
+ #         # shuffle datasets to make sure pos and neg files are seen in a random order by the algorithms
+ #         X_train = X_train_p + X_train_n
+ #         y_train = y_train_p + y_train_n
+ #         train_zip = list(zip(X_train, y_train))
+ #         random.Random(13).shuffle(train_zip)
+ #         X_train, y_train = zip(*train_zip)
+ #
+ #         X_test = X_test_p + X_test_n
+ #         y_test = y_test_p + y_test_n
+ #         test_zip = list(zip(X_test, y_test))
+ #         random.Random(13).shuffle(test_zip)
+ #         X_test, y_test = zip(*test_zip)
+ #
+ #         splits_dict["split{}".format(i)] = [X_train, X_test, y_train, y_test]
+ #
+ # elif pairing_pn == "no" and pairing_12 != "no":  # pairing over another condition
+ #     X_1 = []
+ #     y_1 = []
+ #     for file in files_list:
+ #         if pair_id_1 in file.split("/")[-1]:
+ #             for i, id in enumerate(uniq_ID):
+ #                 if id in file.split("/")[-1]:
+ #                     X_1.append(file)
+ #                     y_1.append(targets[i])
+ #
+ #     # Create the splits
+ #     for i in range(nbr_splits):
+ #         X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
+ #             X_1, y_1, test_size=percent_in_test, random_state=i)
+ #         X_train_2 = [i.replace(pair_id_1, pair_id_2) for i in X_train_1]
+ #         X_test_2 = [i.replace(pair_id_1, pair_id_2) for i in X_test_1]
+ #         y_train_2 = []
+ #         y_test_2 = []
+ #         for file in X_train_2:
+ #             for i, id in enumerate(uniq_ID):
+ #                 if id in file.split("/")[-1]:
+ #                     y_train_2.append(targets[i])
+ #         for file in X_test_2:
+ #             for i, id in enumerate(uniq_ID):
+ #                 if id in file.split("/")[-1]:
+ #                     y_test_2.append(targets[i])
+ #
+ #         # shuffle datasets to make sure pos and neg files are seen in a random order by the algorithms
+ #         X_train = X_train_1 + X_train_2
+ #         y_train = y_train_1 + y_train_2
+ #         train_zip = list(zip(X_train, y_train))
+ #         random.Random(13).shuffle(train_zip)
+ #         X_train, y_train = zip(*train_zip)
+ #
+ #         X_test = X_test_1 + X_test_2
+ #         y_test = y_test_1 + y_test_2
+ #         test_zip = list(zip(X_test, y_test))
+ #         random.Random(13).shuffle(test_zip)
+ #         X_test, y_test = zip(*test_zip)
+ #
+ #         splits_dict["split{}".format(i)] = [X_train, X_test, y_train, y_test]
+ #
+ # elif pairing_pn != "no" and pairing_12 != "no":  # pairing pos/neg AND another condition
+ #     X_pos_1 = []
+ #     y_pos_1 = []
+ #     for file in files_list:
+ #         if pair_id_pos in file.split("/")[-1] and pair_id_1 in file.split("/")[-1]:
+ #             for i, id in enumerate(uniq_ID):
+ #                 if id in file.split("/")[-1]:
+ #                     X_pos_1.append(file)
+ #                     y_pos_1.append(targets[i])
+ #
+ #     # Create the splits
+ #     for i in range(nbr_splits):
+ #         X_train_pos_1, X_test_pos_1, y_train_pos_1, y_test_pos_1 = train_test_split(
+ #             X_pos_1, y_pos_1, test_size=percent_in_test, random_state=i)
+ #         X_train_pos_2 = [i.replace(pair_id_1, pair_id_2) for i in X_train_pos_1]
+ #         X_test_pos_2 = [i.replace(pair_id_1, pair_id_2) for i in X_test_pos_1]
+ #         y_train_pos_2 = []
+ #         y_test_pos_2 = []
+ #         for file in X_train_pos_2:
+ #             for i, id in enumerate(uniq_ID):
+ #                 if id in file.split("/")[-1]:
+ #                     y_train_pos_2.append(targets[i])
+ #         for file in X_test_pos_2:
+ #             for i, id in enumerate(uniq_ID):
+ #                 if id in file.split("/")[-1]:
+ #                     y_test_pos_2.append(targets[i])
+ #
+ #         X_train_neg_1 = [i.replace(pair_id_pos, pair_id_neg) for i in X_train_pos_1]
+ #         X_test_neg_1 = [i.replace(pair_id_pos, pair_id_neg) for i in X_test_pos_1]
+ #         y_train_neg_1 = []
+ #         y_test_neg_1 = []
+ #         for file in X_train_neg_1:
+ #             for i, id in enumerate(uniq_ID):
+ #                 if id in file.split("/")[-1]:
+ #                     y_train_neg_1.append(targets[i])
+ #         for file in X_test_neg_1:
+ #             for i, id in enumerate(uniq_ID):
+ #                 if id in file.split("/")[-1]:
+ #                     y_test_neg_1.append(targets[i])
+ #
+ #         X_train_neg_2 = [i.replace(pair_id_1, pair_id_2) for i in X_train_neg_1]
+ #         X_test_neg_2 = [i.replace(pair_id_1, pair_id_2) for i in X_test_neg_1]
+ #         y_train_neg_2 = []
+ #         y_test_neg_2 = []
+ #         for file in X_train_neg_2:
+ #             for i, id in enumerate(uniq_ID):
+ #                 if id in file.split("/")[-1]:
+ #                     y_train_neg_2.append(targets[i])
+ #         for file in X_test_neg_2:
+ #             for i, id in enumerate(uniq_ID):
+ #                 if id in file.split("/")[-1]:
+ #                     y_test_neg_2.append(targets[i])
+ #
+ #         # shuffle datasets to make sure pos and neg files are seen in a random order by the algorithms
+ #         X_train = X_train_pos_1 + X_train_pos_2 + X_train_neg_1 + X_train_neg_2
+ #         y_train = y_train_pos_1 + y_train_pos_2 + y_train_neg_1 + y_train_neg_2
+ #         train_zip = list(zip(X_train, y_train))
+ #         random.Random(13).shuffle(train_zip)
+ #         X_train, y_train = zip(*train_zip)
+ #
+ #         X_test = X_test_pos_1 + X_test_pos_2 + X_test_neg_1 + X_test_neg_2
+ #         y_test = y_test_pos_1 + y_test_pos_2 + y_test_neg_1 + y_test_neg_2
+ #         test_zip = list(zip(X_test, y_test))
+ #         random.Random(13).shuffle(test_zip)
+ #         X_test, y_test = zip(*test_zip)
+ #
+ #         splits_dict["split{}".format(i)] = [X_train, X_test, y_train, y_test]
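For context, here is a minimal, hypothetical driver for the class above. The constructor signature, the "no" pairing convention, and the dict_splits layout come straight from the diff; the file names, subject IDs, labels, and the POS/NEG patterns are invented for illustration.

    from pico.service.SamplesPairing import SamplesPairing

    # Eight hypothetical samples: four subjects, each acquired in POS and NEG mode.
    sample_names = [
        "data/id1_POS.mzML", "data/id1_NEG.mzML",
        "data/id2_POS.mzML", "data/id2_NEG.mzML",
        "data/id3_POS.mzML", "data/id3_NEG.mzML",
        "data/id4_POS.mzML", "data/id4_NEG.mzML",
    ]
    ids = ["id1", "id2", "id3", "id4"]                # unique IDs, substrings of the names
    targets = ["healthy", "healthy", "sick", "sick"]  # one label per unique ID

    # ["no"] disables pairing; [["POS", "NEG"]] would instead route through the
    # pairing branch, keeping a subject's POS and NEG files on the same side.
    sp = SamplesPairing(["no"], sample_names, targets, ids,
                        proportion_in_test=0.5, nbr_splits=2)
    sp.split()

    # Each entry holds [X_train, X_test, y_train, y_test] for one stratified split.
    print(sp.dict_splits["split0"])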