pico-ml 2.0.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pico/__init__.py +3 -0
- pico/__main__.py +3 -0
- pico/cli/__init__.py +2 -0
- pico/cli/main.py +117 -0
- pico/conf/SupportedCV.py +17 -0
- pico/conf/SupportedModels.py +73 -0
- pico/conf/algo_sklearn.json +51 -0
- pico/conf/parameters.py +14 -0
- pico/domain/ClassificationDesign.py +107 -0
- pico/domain/Controller.py +397 -0
- pico/domain/DataMatrix.py +147 -0
- pico/domain/ExperimentDTO.py +17 -0
- pico/domain/MetaData.py +229 -0
- pico/domain/MetaboExperiment.py +696 -0
- pico/domain/MetaboModel.py +53 -0
- pico/domain/ModelFactory.py +45 -0
- pico/domain/Results.py +602 -0
- pico/domain/SplitGroup.py +202 -0
- pico/domain/__init__.py +9 -0
- pico/domain/dumps/metadata/.gitkeep +0 -0
- pico/domain/dumps/splits/.gitkeep +0 -0
- pico/service/DataFormat.py +180 -0
- pico/service/ExperimentDesign.py +30 -0
- pico/service/LoggerConfig.py +150 -0
- pico/service/Plots.py +472 -0
- pico/service/RunMLalgo.py +93 -0
- pico/service/SamplesPairing.py +390 -0
- pico/service/Utils.py +497 -0
- pico/service/__init__.py +7 -0
- pico/ui/__init__.py +1 -0
- pico/ui/app.py +145 -0
- pico/ui/assets/000_Stylesheet.css +464 -0
- pico/ui/assets/DecisionTree.png +0 -0
- pico/ui/assets/Figure_home_wider.png +0 -0
- pico/ui/assets/favicon.ico +0 -0
- pico/ui/assets/help_icon.png +0 -0
- pico/ui/assets/help_icon.svg +15 -0
- pico/ui/assets/update_figure_steps_MeDIC_4.svg +1 -0
- pico/ui/tabs/AggregatedResultsTab.py +394 -0
- pico/ui/tabs/InfoTab.py +440 -0
- pico/ui/tabs/InterpretTab.py +21 -0
- pico/ui/tabs/MLTab.py +487 -0
- pico/ui/tabs/MetaTab.py +23 -0
- pico/ui/tabs/ResultsTab.py +1062 -0
- pico/ui/tabs/SplitsTab.py +1227 -0
- pico/ui/tabs/__init__.py +6 -0
- pico/ui/tabs/utils.py +101 -0
- pico_ml-2.0.0.dist-info/METADATA +86 -0
- pico_ml-2.0.0.dist-info/RECORD +52 -0
- pico_ml-2.0.0.dist-info/WHEEL +4 -0
- pico_ml-2.0.0.dist-info/entry_points.txt +2 -0
- pico_ml-2.0.0.dist-info/licenses/LICENSE +437 -0

pico/domain/SplitGroup.py
ADDED
@@ -0,0 +1,202 @@
from typing import List, Tuple, Union

import pandas as pd
from sklearn.model_selection import train_test_split

from . import MetaData
from ..service import Utils, init_logger
import numpy as np


class SplitGroup:
    def __init__(self, metadata: MetaData, selected_targets: List[str], train_test_proportion: float,
                 number_of_splits: int, classes_design: dict, pairing_column: str,
                 uniq_sample_id: List[str], balance_correction: int = 0,
                 classes_repartition: Union[dict, None] = None,
                 test_split_seed: Union[int, None] = None):
        self._logger = init_logger()
        self._metadata = metadata
        self._number_of_split = number_of_splits
        self._classes_design = classes_design
        self._splits = []
        self._compute_splits(train_test_proportion, number_of_splits, pairing_column, selected_targets,
                             uniq_sample_id, balance_correction, classes_repartition, test_split_seed)

    def _compute_splits(self, train_test_proportion: float, number_of_splits: int, pairing_column: str,
                        selected_targets: List[str], uniq_sample_id: List[str],
                        balance_correction: int = 0,
                        classes_repartition: Union[dict, None] = None,
                        test_split_seed: Union[int, None] = None) -> None:
        """
        Create the desired number of splits for the experiment. It handles the train-test repartition, the class
        balancing, the pairing of samples, the classes design, etc.

        Args:
            train_test_proportion (float): Proportion of the dataset to include in the test split.
            number_of_splits (int): Total number of splits.
            pairing_column (str): Column to use for the pairing. When empty (""), no pairing is done.
            selected_targets (List[str]): The selection of classes done with the interface or automate.py
                (the names of the selected classes/targets). When selected_targets contains targets coming
                from multiple columns, they are expected to be separated by "__", i.e. "ali__A", "med__B", etc.
            balance_correction (int, optional): Balance correction to adjust the proportion between classes.
                Defaults to 0 (no balancing).
            classes_repartition (Union[dict, None], optional): Defaults to None.
            test_split_seed (int | None, optional): Split seed number. For test purposes only,
                to be used from automate.py to test one specific split. Defaults to None.
        """
        self._logger.info("_compute_split function beginning")

        # 1 - filter out the samples having a target not included in the classification design
        # retrieve metadata dataframe
        df_filter = self._metadata.get_metadata()
        # keep only the lines for which the value in the final_targets column is in selected_targets
        df_filter = df_filter[df_filter[self._metadata.get_target_column()].isin(selected_targets)]
        # keep only the lines that correspond to samples in the data file
        # (handles the case of one metadata file for multiple data files: samples in
        # the metadata having corresponding targets but absent from the provided data file)
        df_filter = df_filter[df_filter[self._metadata.get_id_column()].isin(uniq_sample_id)]

        # 2 - select only one sample per entity
        if pairing_column != "":
            # sort the dataframe by the pairing_column values
            df_entity = df_filter.sort_values(pairing_column)
            # group samples by the pairing column and keep only the first row of each group (.nth(0) is more stable
            # than .first())
            # Careful: the groupby function changes the index of the dataframe to the column it groups by
            df_entity = df_entity.groupby(pairing_column).nth(0)
        else:
            df_entity = df_filter

        # 2.5 - extract ids and targets, transform targets to labels
        ids = df_entity[self._metadata.get_id_column()]
        targets = df_entity[self._metadata.get_target_column()]
        labels = Utils.load_classes_from_targets(self._classes_design, targets)

        # 3 - proceed with the train-test division on the selected samples
        if test_split_seed is not None:
            self._logger.debug(f"Testing split seed #{test_split_seed}")
            split_indexes: list[int] = [test_split_seed]  # Test only one split seed
        else:
            split_indexes = list(range(number_of_splits))  # All split indexes

        for split_index in split_indexes:
            if pairing_column == "":
                X_train, X_test, y_train, y_test = train_test_split(ids, labels, test_size=train_test_proportion,
                                                                    random_state=split_index, stratify=labels)

            # 4 - retrieve the paired samples corresponding to the ones in the train or test set
            else:
                # random shuffle initialisation for the second shuffle of samples
                rng = np.random.default_rng(seed=split_index)
                # define the ids column as the index of the dataframe, so it can be extracted with groupby().groups
                df = df_filter.set_index(self._metadata.get_id_column())
                # groups is a dictionary with 'keys' as the pairing value and 'values' as the index of the lines
                # corresponding to the pairing
                groups = df.groupby(pairing_column).groups
                # apply the train-test division on the pairing values / the entities
                # TODO : careful check if labels is in the right order with the data
                X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(df_entity.index, labels,
                                                                                        test_size=train_test_proportion,
                                                                                        random_state=split_index,
                                                                                        stratify=labels)
                # retrieve the ids corresponding to the entities in train
                X_train = []
                for representative in X_train_temp:
                    represented_pairing_value = df_filter.loc[representative][pairing_column]
                    X_train.extend(groups[represented_pairing_value])
                # retrieve targets corresponding to ids and then convert to labels
                X_train = pd.Series(X_train)
                targets = df.loc[X_train][self._metadata.get_target_column()]
                y_train = Utils.load_classes_from_targets(self._classes_design, targets)

                training_data = list(zip(X_train, y_train))
                rng.shuffle(training_data)
                X_train, y_train = zip(*training_data)

                # retrieve the ids corresponding to the entities in test
                X_test = []
                for representative in X_test_temp:
                    represented_pairing_value = df_filter.loc[representative][pairing_column]
                    X_test.extend(groups[represented_pairing_value])
                # retrieve targets corresponding to ids and then convert to labels
                X_test = pd.Series(X_test)
                targets = df.loc[X_test][self._metadata.get_target_column()]
                y_test = Utils.load_classes_from_targets(self._classes_design, targets)

                testing_data = list(zip(X_test, y_test))
                rng.shuffle(testing_data)
                X_test, y_test = zip(*testing_data)

            if balance_correction > 0:
                X_train, y_train = Utils.remove_random_samples_from_class(X_train,
                                                                          y_train,
                                                                          balance_correction,
                                                                          classes_repartition)
            X_train = list(X_train)
            y_train = list(y_train)
            X_test = list(X_test)
            y_test = list(y_test)

            if not self._validate_split(y_train, y_test):
                raise RuntimeError(f"_compute_split step #4 aborted for the invalid split #{split_index}.")

            self._splits.append([X_train, X_test, y_train, y_test])

        self._number_of_split = len(self._splits)  # Update the number of splits if some have been removed

        self._logger.info("_compute_split function done")

    def load_split_with_index(self, split_index: int) -> list:
        return self._splits[split_index]

    def get_number_of_splits(self):
        """
        Return the number-of-splits attribute.
        """
        return self._number_of_split

    def filter_sample_with_pairing_group(self, pairing_column: str) -> Tuple[List[str], List[str]]:
        """
        Only needs the name of the column used to pair samples together.
        It retrieves the other information from the attributes (the MetaData object).
        It then iterates over the whole metadata dataframe to keep only one sample per entity (an entity stands for
        a biological source, like an individual). Multiple samples can originate from one entity.
        """
        metadata_dataframe = self._metadata.get_metadata()
        id_column = self._metadata.get_id_column()
        target_column = self._metadata.get_target_column()
        filtered_id = []
        filtered_target = []
        already_selected_value = set()
        # TODO : might want to change the process to sorting all lines and then picking the first one
        for index, row in metadata_dataframe.iterrows():
            if row[pairing_column] not in already_selected_value:
                already_selected_value.add(row[pairing_column])
                filtered_id.append(row[id_column])
                filtered_target.append(row[target_column])
        return filtered_id, filtered_target

    def get_selected_targets_and_ids(self, selected_targets: List[str], samples_id: List[str],
                                     targets: List[str]) -> Tuple[Tuple[str], Tuple[str]]:
        """
        Filters out the target/id pairs whose target is not in the selected_targets list.
        """
        return tuple(zip(*[(target, id) for target, id in zip(targets, samples_id) if target in selected_targets]))

    def _validate_split(self, y_train: list, y_test: list) -> bool:
        # Train and test validation: both must contain at least 2 classes.
        nb_test_classes: int = len(set(y_test))
        nb_train_classes: int = len(set(y_train))

        if nb_test_classes < 2 or nb_train_classes < 2:
            error_msg: str = "At least 2 classes must be present in both train and test splits."
            if nb_test_classes < 2:
                error_msg += f" Test set contains only the class '{next(iter(set(y_test)))}'."
            if nb_train_classes < 2:
                error_msg += f" Train set contains only the class '{next(iter(set(y_train)))}'."

            self._logger.error(error_msg)
            return False

        return True
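
The splitting strategy above (one stratified train/test partition per seed, indexed by split number) can be illustrated without the pico API. A minimal standalone sketch using only pandas and scikit-learn; the metadata columns and target values below are invented for illustration, and the real code additionally maps targets to class labels through the classes design before splitting:

import pandas as pd
from sklearn.model_selection import train_test_split

# Toy metadata: one row per sample, with an id column and a target column.
metadata = pd.DataFrame({
    "sample_id": [f"S{i}" for i in range(12)],
    "target": ["Cas", "Temoin"] * 6,
})

splits = []
for split_index in range(3):  # number_of_splits = 3
    # One stratified train/test partition per split, seeded with the split index,
    # mirroring the unpaired branch of _compute_splits.
    X_train, X_test, y_train, y_test = train_test_split(
        metadata["sample_id"], metadata["target"],
        test_size=0.2, random_state=split_index, stratify=metadata["target"],
    )
    splits.append([list(X_train), list(X_test), list(y_train), list(y_test)])

print(len(splits), "splits, first test set:", splits[0][1])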
pico/domain/__init__.py
ADDED
@@ -0,0 +1,9 @@
from .SplitGroup import SplitGroup
from .MetaData import MetaData
from .ClassificationDesign import ClassificationDesign
from .MetaboExperiment import MetaboExperiment
from .ModelFactory import ModelFactory
from .Results import Results
from .Controller import Controller
from .DataMatrix import DataMatrix
from .MetaboModel import MetaboModel
pico/domain/dumps/metadata/.gitkeep: File without changes
pico/domain/dumps/splits/.gitkeep: File without changes
pico/service/DataFormat.py
ADDED
@@ -0,0 +1,180 @@
import numpy as np
import pandas as pd
import os
import base64
import io
from .Utils import *


class DataFormat:
    """
    Takes data file(s) as input and outputs a matrix where columns are samples and lines are features. With the
    matrix comes a list of the column names, to retrieve the samples properly.
    """

    def __init__(self, filename, data=None, use_raw=False, from_base64_str=True):
        self.use_raw = use_raw
        self.filename = filename
        self.base64 = from_base64_str
        self.data = data

        # TODO : make sure to check if "not progen" matrices are well handled
        if self.base64:
            self.in_format = "base64"
        elif os.path.isfile(filename):
            self.in_format = "file"
        elif os.path.isdir(filename):
            self.in_format = "LDTD"
        else:
            raise TypeError(
                "The given path is not valid, it has to be a file or a directory."
            )

    def convert(self):
        """
        Convert the data format for easier use by PICO.
        """
        if self.in_format == "base64":
            data_type, data_string = self.data.split(",")
            self.data = base64.b64decode(data_string)
            data = self._convert_from_file()
        elif self.in_format == "file":
            data = self._convert_from_file()
        elif self.in_format == "LDTD":
            raise ValueError("reading data from LDTD is not supported yet")
            # data = self._convert_from_LDTD()
        else:
            raise ValueError("self.in_format does not correspond to accepted values (base64, file)")
        return data

    def _convert_from_file(self):
        """
        Take a file path or a StringIO object and read it as a pandas DataFrame.
        """
        file_ext = self.filename.split(".")[-1]
        # TODO : beware of the sep (, or ;)
        if "csv" in file_ext:  # Abundance matrices of Progenesis are always in csv format, so it is checked first
            if self.in_format == "base64":  # this condition makes the input data from dcc.Upload readable
                self.data = io.StringIO(self.data.decode("utf-8"))
            else:  # this else enables the pd dataframe to be read from a full file path
                self.data = self.filename
            header = pd.read_csv(self.data, header=None, sep=None, engine="python", nrows=3,
                                 index_col=0,).fillna("").to_numpy()

            # Needs to reset the pointer to the top of the StringIO (to be able to read the string again)
            if self.in_format == "base64":
                self.data.seek(0)

            if "Normalised abundance" in header[0] or "Raw abundance" in header[0]:
                datatable = pd.read_csv(self.data, header=[0, 1, 2], sep=None, engine="python", index_col=0)
                # Will return: datatable_compoundsInfo, datatable, labels, sample_names
                return self._read_Progenesis_data_table(datatable, header)
            else:
                datatable = pd.read_csv(self.data, sep=None, engine="python", index_col=0)
                # WARNING : returns None, datatable, None, None
                return self._read_general_data_table(datatable)

        elif "xls" in file_ext or "od" in file_ext:  # TODO : restrict the "od" condition, might be too broad
            if self.in_format == "base64":  # same as above, but Excel readers expect a bytes buffer
                self.data = io.BytesIO(self.data)
            else:
                self.data = self.filename
            datatable = pd.read_excel(self.data, index_col=0)
            # WARNING : returns None, datatable, None, None
            return self._read_general_data_table(datatable)

        else:
            raise TypeError("The input file is not of the right type, must be excel, odt or csv.")

    def _convert_from_LDTD(self):
        # TODO : implement the handling of the LDTD data format
        return ""

    def _read_general_data_table(self, datatable):
        """
        For now does nothing, but might be the place to deal with custom matrix formats containing
        extra/unnecessary columns or information.
        ! careful : outputs only the datatable plus three empty values because the calling function only needs the
        datatable, but that might change.
        """
        # This return list fits the return when reading a Progenesis file
        return None, datatable, None, None

    def _read_Progenesis_data_table(self, datatable, header):
        """
        Assumes Raw data columns are written after Normalised data columns in the file.
        :param datatable:
        :return:
        """
        # print(header)
        if not self.use_raw and "Normalised abundance" in header[0]:  # header.columns.tolist():
            start_data = list(header[0]).index("Normalised abundance")
        elif self.use_raw and "Raw abundance" in header[0]:  # header.columns.tolist():
            start_data = list(header[0]).index("Raw abundance")
        else:
            raise KeyError("There is no Raw or Normalised abundance detected in the header.")

        new_header = []
        for l in header:
            new_header.append(list_filler(l))

        datatable.columns = new_header
        datatable_compoundsInfo = datatable.iloc[:, 0:start_data]
        datatable_compoundsInfo.columns = datatable_compoundsInfo.columns.droplevel([0, 1])
        datatable_compoundsInfo = datatable_compoundsInfo.T

        if self.use_raw:
            datatable = datatable["Raw abundance"]
            labels, sample_names = list(zip(*datatable.columns))
        else:
            datatable = datatable["Normalised abundance"]
            labels, sample_names = list(zip(*datatable.columns))

        datatable.columns = datatable.columns.droplevel(0)
        datatable = datatable.T

        datatable = datatable.loc[[index for index in datatable.index if "QC" not in index]]

        return datatable_compoundsInfo, datatable, labels, sample_names

        # start_normalized = header.columns.tolist().index("Normalised abundance")
        # labels_array = np.array(header.iloc[0].tolist())

        # if with_raw:
        #     start_raw = header.columns.tolist().index("Raw abundance")
        #     sample_names = datatable.iloc[:, start_normalized:start_raw].columns
        #     labels = labels_array.tolist()[start_normalized:start_raw]
        # else:
        #     sample_names = datatable.iloc[:, start_normalized:].columns
        #     labels = labels_array.tolist()[start_normalized:]
        #
        # current_label = ""
        # for idx, l in enumerate(labels):
        #     if l != "nan":
        #         current_label = l
        #     else:
        #         labels[idx] = current_label
        #
        # if with_raw:
        #     datatable_compoundsInfo = datatable.iloc[:, 0:start_normalized]
        #     datatable_normalized = datatable.iloc[:, start_normalized:start_raw]
        #     datatable_raw = datatable.iloc[:, start_raw:]
        #     datatable_raw.columns = [i.rstrip(".1") for i in datatable_raw.columns]  # Fix the column names
        #
        #     datatable_normalized = datatable_normalized.T
        #     datatable_raw = datatable_raw.T
        #     datatable_compoundsInfo = datatable_compoundsInfo.T
        #     datatable_normalized.rename(columns={"Compound": "Sample"})
        #     datatable_raw.rename(columns={"Compound": "Sample"})
        #
        #     if self.use_raw:
        #         return datatable_compoundsInfo, datatable_raw, labels, sample_names
        #     else:
        #         return datatable_compoundsInfo, datatable_normalized, labels, sample_names
        # else:
        #     datatable_compoundsInfo = datatable.iloc[:, 0:start_normalized]
        #     datatable_normalized = datatable.iloc[:, start_normalized:]
        #     datatable_normalized = datatable_normalized.T
        #     datatable_compoundsInfo = datatable_compoundsInfo.T
        #     datatable_normalized.rename(columns={"Compound": "Sample"})
        #     return datatable_compoundsInfo, datatable_normalized, labels, sample_names
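
To make the Progenesis branch of _read_Progenesis_data_table concrete, here is a minimal standalone pandas sketch of the same column handling. The three-level columns stand in for the forward-filled three-row header, and the group and sample names are invented for illustration:

import pandas as pd

# Toy abundance table with the column structure the parser produces:
# (abundance type, group label, sample name).
columns = pd.MultiIndex.from_arrays([
    ["Normalised abundance"] * 4,
    ["GroupA", "GroupA", "GroupB", "GroupB"],
    ["S1", "S2", "S3", "QC1"],
])
datatable = pd.DataFrame([[1.0, 2.0, 3.0, 4.0],
                          [5.0, 6.0, 7.0, 8.0]],
                         index=["compound_1", "compound_2"],
                         columns=columns)

# Same steps as the method above: pick the abundance block, capture the
# (label, sample) pairs, drop the group level, transpose so samples become rows,
# and filter out quality-control samples.
block = datatable["Normalised abundance"].copy()
labels, sample_names = zip(*block.columns)
block.columns = block.columns.droplevel(0)
block = block.T
block = block.loc[[idx for idx in block.index if "QC" not in idx]]
print(block)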
pico/service/ExperimentDesign.py
ADDED
@@ -0,0 +1,30 @@
DATA_MATRIX = "Data\\Matrix_normalised_pos.csv"

# Cas, Temoin, TC
EXPERIMENT_DESIGNS = {
    "Ctrl_vs_Case": {
        "classes": {"Controls": ["Temoin"], "Cases": ["Cas"]},
        "TestSize": 0.2,
    },
    # "Control vs TC": {
    #     "classes": {
    #         "Controles": ["Cas"],
    #         "TC": ["TC"]
    #     },
    #     "TestSize": 0.2,
    # },
    # "TC vs Cas": {
    #     "classes": {
    #         "TC": ["TC"],
    #         "Cases": ["Cas"]
    #     },
    #     "TestSize": 0.2,
    # },
    # "Control vs all": {
    #     "classes": {
    #         "Control": ["Temoin"],
    #         "All": ["Cas", "TC"]
    #     },
    #     "TestSize": 0.2,
    # },
}
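
A sketch of how a "classes" mapping like the one above can be applied to raw targets. This mirrors the role Utils.load_classes_from_targets plays in SplitGroup, but is written standalone; the helper below is hypothetical and not part of pico:

design = {"Controls": ["Temoin"], "Cases": ["Cas"]}

def targets_to_labels(targets, classes_design):
    # Invert the mapping: raw target value -> class name.
    lookup = {raw: cls for cls, raws in classes_design.items() for raw in raws}
    return [lookup[t] for t in targets]

print(targets_to_labels(["Temoin", "Cas", "Temoin"], design))
# ['Controls', 'Cases', 'Controls']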
pico/service/LoggerConfig.py
ADDED
@@ -0,0 +1,150 @@
import logging
import coloredlogs  # type: ignore
from datetime import datetime
import inspect
import os
import threading
import traceback
from functools import wraps
from typing import Callable
from .Utils import get_pico_subdir

log_filename: str | None = None  # Global variable for the log filename


def log_exceptions(logger: logging.Logger) -> Callable:
    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs) -> Callable:
            try:
                return func(*args, **kwargs)
            except Exception as e:
                # Log the exception or handle it as needed
                thread_name = threading.current_thread().name
                logger.error(f"Error in thread {thread_name}: {e}\n{traceback.format_exc()}")
                raise  # Re-raise the exception to preserve the original
        return wrapper
    return decorator


def set_log_filename(filename: str = "pico.log", add_date: bool = True, level=logging.DEBUG) -> logging.Logger:
    """Sets the log filename with an optional date suffix.

    Args:
        filename (str, optional): The base filename for the log. Defaults to "pico.log".
        add_date (bool, optional): If True, adds the current date to the filename. Defaults to True.
    """
    global log_filename

    if add_date:
        date_suffix = datetime.now().strftime("%Y-%m-%d")

        if filename.lower().endswith(".log"):
            log_filename = filename.replace(".log", f"_{date_suffix}.log")
        else:
            log_filename = f"{filename}_{date_suffix}.log"
    else:
        log_filename = filename

    # Log file in the ~/pico_files/logs directory
    logs_directory = get_pico_subdir("logs")
    log_filename = os.path.join(logs_directory, log_filename)

    # Add a "-----------------------" separator in the log file to mark the start of the current session
    with open(log_filename, 'a') as log_file:
        if threading.current_thread() is threading.main_thread():
            log_file.write(f"\n{'----- New start (' + threading.current_thread().name + ') ':-<80}\n")
        else:
            log_file.write(f" New thread ({threading.current_thread().name})\n")

    # Root logger
    root_logger = logging.getLogger()
    root_logger.handlers = []  # Clear existing handlers
    root_logger.setLevel(logging.WARNING)  # Only show WARNING level and above
    root_logger.propagate = False

    # Terminal (console) handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.WARNING)

    # File handler
    file_handler = logging.FileHandler(log_filename)
    file_handler.setLevel(logging.DEBUG)  # Ensure DEBUG level for detailed logs

    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    file_handler.setFormatter(formatter)

    root_logger.addHandler(console_handler)
    root_logger.addHandler(file_handler)

    # Ensure colored logs for the terminal
    coloredlogs.install(level=logging.WARNING, logger=root_logger, stream=console_handler.stream)

    # werkzeug logs
    werkzeug_logger = logging.getLogger('werkzeug')
    werkzeug_logger.setLevel(logging.INFO)  # Log level to INFO for console

    # Add logs for werkzeug in the terminal
    werkzeug_console_handler = logging.StreamHandler()
    werkzeug_console_handler.setLevel(logging.INFO)
    werkzeug_console_handler.setFormatter(formatter)
    werkzeug_logger.addHandler(werkzeug_console_handler)
    werkzeug_logger.propagate = False

    return init_logger()


def init_logger(module_name: str | None = None, level=logging.DEBUG) -> logging.Logger:
    """Create (or retrieve) a named logger writing to the console and, if configured via
    set_log_filename (default base name "pico.log"), to the shared log file.

    Args:
        module_name (str | None, optional): The name of the module for the logger. Defaults to None.
        level (int, optional): The logging level. Defaults to logging.DEBUG.
            Levels (from high to low): logging.CRITICAL, logging.ERROR,
            logging.WARNING, logging.INFO, logging.DEBUG

    Returns:
        logging.Logger: The configured logger instance
    """

    global log_filename

    def get_module_name() -> str | None:
        cur_frame = inspect.currentframe()
        if not cur_frame:
            return None
        frame = cur_frame.f_back
        if not frame:
            return None
        module = inspect.getmodule(frame)
        if not module:
            return None
        return module.__name__

    if module_name is None:
        module_name = get_module_name() or "pico"

    logger = logging.getLogger(module_name)
    logger.setLevel(level)
    logger.propagate = False

    if not logger.handlers:
        # Terminal (console) handler
        console_handler = logging.StreamHandler()
        console_handler.setLevel(level)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

        # File handler
        if log_filename is not None:
            file_handler = logging.FileHandler(log_filename)
            file_handler.setLevel(level)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            file_handler.setFormatter(formatter)

            logger.addHandler(file_handler)

        # Ensure colored logs for the terminal
        coloredlogs.install(level=level, logger=logger, stream=console_handler.stream)

    return logger
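
A minimal usage sketch of the logging helpers above, assuming the pico-ml package is installed. set_log_filename configures the shared log file (under ~/pico_files/logs) and returns a logger; init_logger attaches a per-module logger to the same file; log_exceptions logs and re-raises anything thrown inside the decorated function:

from pico.service.LoggerConfig import set_log_filename, init_logger, log_exceptions

set_log_filename("pico.log", add_date=True)
logger = init_logger("my_analysis")

@log_exceptions(logger)
def risky_step():
    raise ValueError("example failure")

logger.info("starting analysis")
try:
    risky_step()
except ValueError:
    pass  # the traceback has already been written to the log file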