masster 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/__init__.py +27 -27
- masster/_version.py +17 -17
- masster/chromatogram.py +497 -503
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
- masster/logger.py +318 -244
- masster/sample/__init__.py +9 -9
- masster/sample/defaults/__init__.py +15 -15
- masster/sample/defaults/find_adducts_def.py +325 -325
- masster/sample/defaults/find_features_def.py +366 -366
- masster/sample/defaults/find_ms2_def.py +285 -285
- masster/sample/defaults/get_spectrum_def.py +314 -318
- masster/sample/defaults/sample_def.py +374 -378
- masster/sample/h5.py +1321 -1297
- masster/sample/helpers.py +833 -364
- masster/sample/lib.py +762 -0
- masster/sample/load.py +1220 -1187
- masster/sample/parameters.py +131 -131
- masster/sample/plot.py +1610 -1622
- masster/sample/processing.py +1402 -1416
- masster/sample/quant.py +209 -0
- masster/sample/sample.py +391 -387
- masster/sample/sample5_schema.json +181 -181
- masster/sample/save.py +737 -736
- masster/sample/sciex.py +1213 -0
- masster/spectrum.py +1287 -1319
- masster/study/__init__.py +9 -9
- masster/study/defaults/__init__.py +21 -19
- masster/study/defaults/align_def.py +267 -267
- masster/study/defaults/export_def.py +41 -40
- masster/study/defaults/fill_chrom_def.py +264 -264
- masster/study/defaults/fill_def.py +260 -0
- masster/study/defaults/find_consensus_def.py +256 -256
- masster/study/defaults/find_ms2_def.py +163 -163
- masster/study/defaults/integrate_chrom_def.py +225 -225
- masster/study/defaults/integrate_def.py +221 -0
- masster/study/defaults/merge_def.py +256 -0
- masster/study/defaults/study_def.py +272 -269
- masster/study/export.py +674 -287
- masster/study/h5.py +1398 -886
- masster/study/helpers.py +1650 -433
- masster/study/helpers_optimized.py +317 -0
- masster/study/load.py +1201 -1078
- masster/study/parameters.py +99 -99
- masster/study/plot.py +632 -645
- masster/study/processing.py +1057 -1046
- masster/study/save.py +149 -134
- masster/study/study.py +606 -522
- masster/study/study5_schema.json +247 -241
- {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
- masster-0.3.0.dist-info/RECORD +59 -0
- {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
- masster-0.2.5.dist-info/RECORD +0 -50
- {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
- {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
masster/study/helpers.py
CHANGED
@@ -1,433 +1,1650 @@
-from __future__ import annotations
-
-import os
-
-import numpy as np
-import pandas as pd
-import polars as pl
-
-
-
-
-def get_chrom(self, uids=None, samples=None):
-    # Check if consensus_df is empty or doesn't have required columns
-    if self.consensus_df.is_empty() or "consensus_uid" not in self.consensus_df.columns:
-        self.logger.error("No consensus data found. Please run
-        return None
-
-    ids = self._get_consensus_uids(uids)
-    sample_uids = self._get_sample_uids(samples)
-
-    if self.consensus_map is None:
-        self.logger.error("No consensus map found.")
-        return None
-
-    # Pre-filter all DataFrames to reduce join sizes
-    filtered_consensus_mapping = self.consensus_mapping_df.filter(
-        pl.col("consensus_uid").is_in(ids),
-    )
-
-    # Get feature_uids that we actually need
-    relevant_feature_uids = filtered_consensus_mapping["feature_uid"].to_list()
-
-    self.logger.debug(
-        f"Filtering features_df for {len(relevant_feature_uids)} relevant feature_uids.",
-    )
-    # Pre-filter features_df to only relevant features and samples
-    filtered_features = self.features_df.filter(
-        pl.col("feature_uid").is_in(relevant_feature_uids)
[removed side truncated by the diff renderer: old line 15 is cut off mid-string, and of old lines 39-433 only left-edge fragments survive ("key =", "rt", "feature.", "df1", "df2", "matches", "np.random.", "not_in_consensus", "If uids is ..." docstring stubs), so the remainder of the removed text is not reproduced here]
+from __future__ import annotations
+
+import os
+
+import numpy as np
+import pandas as pd
+import polars as pl
+
+from tqdm import tqdm
+
+
+def get_chrom(self, uids=None, samples=None):
+    # Check if consensus_df is empty or doesn't have required columns
+    if self.consensus_df.is_empty() or "consensus_uid" not in self.consensus_df.columns:
+        self.logger.error("No consensus data found. Please run merge() first.")
+        return None
+
+    ids = self._get_consensus_uids(uids)
+    sample_uids = self._get_sample_uids(samples)
+
+    if self.consensus_map is None:
+        self.logger.error("No consensus map found.")
+        return None
+
+    # Pre-filter all DataFrames to reduce join sizes
+    filtered_consensus_mapping = self.consensus_mapping_df.filter(
+        pl.col("consensus_uid").is_in(ids),
+    )
+
+    # Get feature_uids that we actually need
+    relevant_feature_uids = filtered_consensus_mapping["feature_uid"].to_list()
+
+    self.logger.debug(
+        f"Filtering features_df for {len(relevant_feature_uids)} relevant feature_uids.",
+    )
+    # Pre-filter features_df to only relevant features and samples
+    filtered_features = self.features_df.filter(
+        pl.col("feature_uid").is_in(relevant_feature_uids) & pl.col("sample_uid").is_in(sample_uids),
+    ).select([
+        "feature_uid",
+        "chrom",
+        "rt",
+        "rt_original",
+        "sample_uid",
+    ])
+
+    # Pre-filter samples_df
+    filtered_samples = self.samples_df.filter(
+        pl.col("sample_uid").is_in(sample_uids),
+    ).select(["sample_uid", "sample_name"])
+
+    # Perform a three-way join to get all needed data
+    self.logger.debug("Joining DataFrames to get complete chromatogram data.")
+    df_combined = (
+        filtered_consensus_mapping.join(
+            filtered_features,
+            on="feature_uid",
+            how="inner",
+        )
+        .join(filtered_samples, on="sample_uid", how="inner")
+        .with_columns(
+            (pl.col("rt") - pl.col("rt_original")).alias("rt_shift"),
+        )
+    )
+
+    # Update chrom objects with rt_shift efficiently
+    self.logger.debug("Updating chromatogram objects with rt_shift values.")
+    chrom_data = df_combined.select(["chrom", "rt_shift"]).to_dict(as_series=False)
+    for chrom_obj, rt_shift in zip(chrom_data["chrom"], chrom_data["rt_shift"]):
+        if chrom_obj is not None:
+            chrom_obj.rt_shift = rt_shift
+
+    # Get all unique combinations for complete matrix
+    all_consensus_uids = sorted(df_combined["consensus_uid"].unique().to_list())
+    all_sample_names = sorted(df_combined["sample_name"].unique().to_list())
+
+    # Create a mapping dictionary for O(1) lookup instead of O(n) filtering
+    self.logger.debug("Creating lookup dictionary for chromatogram objects.")
+    chrom_lookup = {}
+    for row in df_combined.select([
+        "consensus_uid",
+        "sample_name",
+        "chrom",
+    ]).iter_rows():
+        key = (row[0], row[1])  # (consensus_uid, sample_name)
+        chrom_lookup[key] = row[2]  # chrom object
+
+    # Build pivot data efficiently using the lookup dictionary
+    pivot_data = []
+    total_iterations = len(all_consensus_uids)
+    progress_interval = max(1, total_iterations // 10)  # Show progress every 10%
+
+    for i, consensus_uid in enumerate(all_consensus_uids):
+        if i % progress_interval == 0:
+            progress_percent = (i / total_iterations) * 100
+            self.logger.debug(
+                f"Building pivot data: {progress_percent:.0f}% complete ({i}/{total_iterations})",
+            )
+
+        row_data = {"consensus_uid": consensus_uid}
+        for sample_name in all_sample_names:
+            key = (consensus_uid, sample_name)
+            row_data[sample_name] = chrom_lookup.get(key, None)
+        pivot_data.append(row_data)
+
+    self.logger.debug(
+        f"Building pivot data: 100% complete ({total_iterations}/{total_iterations})",
+    )
+
+    # Create Polars DataFrame with complex objects
+    df2_pivoted = pl.DataFrame(pivot_data)
+
+    # Return as Polars DataFrame (can handle complex objects like Chromatogram)
+    return df2_pivoted
+
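Usage sketch for get_chrom (hypothetical `study` object and sample names; per the helpers below, an int for `uids` draws a seeded random subset, a list selects specific consensus features):

    chroms = study.get_chrom(uids=50, samples=["QC_01", "QC_02"])
    # one row per consensus_uid, one Chromatogram object (or None) per sample column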
+def set_folder(self, folder):
+    """
+    Set the folder for saving and loading files.
+    """
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+    self.folder = folder
+
+
+def align_reset(self):
+    if self.alignment_ref_index is None:
+        return
+    self.logger.debug("Resetting alignment.")
+    # iterate over all feature maps and set RT to original RT
+    for feature_map in self.features_maps:
+        for feature in feature_map:
+            rt = feature.getMetaValue("original_RT")
+            if rt is not None:
+                feature.setRT(rt)
+                feature.removeMetaValue("original_RT")
+    self.alignment_ref_index = None
+
+
+# TODO I don't get this param
+def get_consensus(self, quant="chrom_area"):
+    if self.consensus_df is None:
+        self.logger.error("No consensus map found.")
+        return None
+
+    # Convert Polars DataFrame to pandas for this operation since the result is used for export
+    df1 = self.consensus_df.to_pandas().copy()
+
+    # set consensus_id as uint64
+    df1["consensus_id"] = df1["consensus_id"].astype("uint64")
+    # set consensus_uid as index
+    df1.set_index("consensus_uid", inplace=True)
+    # sort by consensus_uid
+    df1 = df1.sort_index()
+
+    df2 = self.get_consensus_matrix(quant=quant)
+    # sort df2 rows by consensus_uid
+    df2 = df2.sort_index()
+    # merge df1 and df2 on consensus_uid
+    df = pd.merge(df1, df2, left_index=True, right_index=True, how="left")
+
+    return df
+
+
+# TODO I don't get this param
+def get_consensus_matrix(self, quant="chrom_area"):
+    """
+    Get a matrix of consensus features with samples as columns and consensus features as rows.
+    """
+    if quant not in self.features_df.columns:
+        self.logger.error(
+            f"Quantification method {quant} not found in features_df.",
+        )
+        return None
+
+    # Use Polars join instead of pandas merge
+    features_subset = self.features_df.select(["feature_uid", "sample_uid", quant])
+    consensus_mapping_subset = self.consensus_mapping_df.select([
+        "consensus_uid",
+        "feature_uid",
+    ])
+
+    df1 = features_subset.join(
+        consensus_mapping_subset,
+        on="feature_uid",
+        how="left",
+    )
+
+    # Convert to pandas for pivot operation (Polars pivot is still evolving)
+    df1_pd = df1.to_pandas()
+    df2 = df1_pd.pivot_table(
+        index="consensus_uid",
+        columns="sample_uid",
+        values=quant,
+        aggfunc="max",
+    )
+
+    # Create sample_uid to sample_name mapping using Polars
+    sample_mapping = dict(
+        self.samples_df.select(["sample_uid", "sample_name"]).iter_rows(),
+    )
+    # replace sample_uid with sample_name in df2
+    df2 = df2.rename(columns=sample_mapping)
+
+    # round to integer
+    df2 = df2.round()
+    # cast the consensus_uid index to uint64
+    df2.index = df2.index.astype("uint64")
+    # name the index consensus_uid
+    df2.index.name = "consensus_uid"
+    return df2
+
+
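For orientation, a minimal sketch of how the two accessors above relate (hypothetical `study` object):

    mat = study.get_consensus_matrix(quant="chrom_area")  # rows: consensus_uid, columns: sample_name
    full = study.get_consensus(quant="chrom_area")        # the same matrix left-joined onto consensus metadata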
+def get_gaps_matrix(self, uids=None):
+    """
+    Get a matrix of gaps between consensus features with samples as columns and consensus features as rows.
+    """
+    if self.consensus_df is None:
+        self.logger.error("No consensus map found.")
+        return None
+    uids = self._get_consensus_uids(uids)
+
+    df1 = self.get_consensus_matrix(quant="filled")
+    if df1 is None or df1.empty:
+        self.logger.warning("No gap data found.")
+        return None
+    # keep only rows where consensus_uid is in uids - use pandas indexing since df1 is already pandas
+    df1 = df1[df1.index.isin(uids)]
+    return df1
+
+
+def get_gaps_stats(self, uids=None):
+    """
+    Get statistics about gaps in the consensus features.
+    """
+    df = self.get_gaps_matrix(uids=uids)
+
+    # For each sample column, count how many values are False (aligned), True (filled), or null (missing).
+    # Summarize in a new df with one row per sample and three columns: aligned, filled, missing.
+    if df is None or df.empty:
+        self.logger.warning("No gap data found.")
+        return None
+    gaps_stats = pd.DataFrame(
+        {
+            "aligned": df.apply(lambda x: (~x.astype(bool)).sum()),
+            "filled": df.apply(lambda x: x.astype(bool).sum() - pd.isnull(x).sum()),
+            "missing": df.apply(lambda x: pd.isnull(x).sum()),
+        },
+    )
+    return gaps_stats
+
+
+# TODO is uid not supposed to be a list anymore?
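The aligned/filled/missing arithmetic relies on NaN casting to True under astype(bool); a toy check with pandas (hypothetical values: True = gap-filled, False = aligned, NaN = missing):

    import numpy as np
    import pandas as pd

    x = pd.Series([True, False, np.nan, False])
    aligned = (~x.astype(bool)).sum()                   # 2 (the False entries)
    filled = x.astype(bool).sum() - pd.isnull(x).sum()  # 1 (NaN casts to True, so nulls are subtracted)
    missing = pd.isnull(x).sum()                        # 1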
+def get_consensus_matches(self, uids=None):
+    uids = self._get_consensus_uids(uids)
+
+    # find all rows in consensus_mapping_df with consensus_uid in uids - use Polars filtering
+    fid = (
+        self.consensus_mapping_df.filter(
+            pl.col("consensus_uid").is_in(uids),
+        )
+        .select("feature_uid")
+        .to_series()
+        .to_list()
+    )
+    # select all rows in features_df with feature_uid in fid
+    matches = self.features_df.filter(pl.col("feature_uid").is_in(fid)).clone()
+    return matches
+
+
+def fill_reset(self):
+    # remove all features with filled=True
+    if self.features_df is None:
+        self.logger.warning("No features found.")
+        return
+    l1 = len(self.features_df)
+    self.features_df = self.features_df.filter(~pl.col("filled"))
+    # remove all rows in consensus_mapping_df where feature_uid is not in features_df['feature_uid']
+
+    feature_uids_to_keep = self.features_df["feature_uid"].to_list()
+    self.consensus_mapping_df = self.consensus_mapping_df.filter(
+        pl.col("feature_uid").is_in(feature_uids_to_keep),
+    )
+    self.logger.info(
+        f"Reset filled chromatograms. Chroms removed: {l1 - len(self.features_df)}",
+    )
+
+
+def _get_feature_uids(self, uids=None, seed=42):
+    """
+    Helper function to get feature_uids from features_df based on input uids.
+    If uids is None, returns all feature_uids.
+    If uids is a single integer, returns a random sample of feature_uids.
+    If uids is a list of strings, returns feature_uids corresponding to those feature_uids.
+    If uids is a list of integers, returns feature_uids corresponding to those feature_uids.
+    """
+    if uids is None:
+        # get all feature_uids from features_df
+        return self.features_df["feature_uid"].to_list()
+    elif isinstance(uids, int):
+        # choose a random sample of feature_uids
+        if len(self.features_df) > uids:
+            np.random.seed(seed)
+            return np.random.choice(
+                self.features_df["feature_uid"].to_list(),
+                uids,
+                replace=False,
+            ).tolist()
+        else:
+            return self.features_df["feature_uid"].to_list()
+    else:
+        # iterate over all uids. If the item is a string, assume it's a feature_uid
+        feature_uids = []
+        for uid in uids:
+            if isinstance(uid, str):
+                matching_rows = self.features_df.filter(pl.col("feature_uid") == uid)
+                if not matching_rows.is_empty():
+                    feature_uids.append(
+                        matching_rows.row(0, named=True)["feature_uid"],
+                    )
+            elif isinstance(uid, int):
+                if uid in self.features_df["feature_uid"].to_list():
+                    feature_uids.append(uid)
+        # remove duplicates
+        feature_uids = list(set(feature_uids))
+        return feature_uids
+
+
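This helper and its two siblings below (_get_consensus_uids, _get_sample_uids) share one selector convention, summarized in this sketch (hypothetical `study` object):

    study._get_feature_uids()           # None: all feature_uids
    study._get_feature_uids(100)        # int: seeded random sample of 100 uids
    study._get_feature_uids([17, 23])   # list of ints: kept only if present in features_df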
+def _get_consensus_uids(self, uids=None, seed=42):
+    """
+    Helper function to get consensus_uids from consensus_df based on input uids.
+    If uids is None, returns all consensus_uids.
+    If uids is a single integer, returns a random sample of consensus_uids.
+    If uids is a list of strings, returns consensus_uids corresponding to those consensus_ids.
+    If uids is a list of integers, returns consensus_uids corresponding to those consensus_uids.
+    """
+    # Check if consensus_df is empty or doesn't have required columns
+    if self.consensus_df.is_empty() or "consensus_uid" not in self.consensus_df.columns:
+        return []
+
+    if uids is None:
+        # get all consensus_uids from consensus_df
+        return self.consensus_df["consensus_uid"].to_list()
+    elif isinstance(uids, int):
+        # choose a random sample of consensus_uids
+        if len(self.consensus_df) > uids:
+            np.random.seed(seed)  # for reproducibility
+            return np.random.choice(
+                self.consensus_df["consensus_uid"].to_list(),
+                uids,
+                replace=False,
+            ).tolist()
+        else:
+            return self.consensus_df["consensus_uid"].to_list()
+    else:
+        # iterate over all uids. If the item is a string, assume it's a consensus_id
+        consensus_uids = []
+        for uid in uids:
+            if isinstance(uid, str):
+                matching_rows = self.consensus_df.filter(pl.col("consensus_id") == uid)
+                if not matching_rows.is_empty():
+                    consensus_uids.append(
+                        matching_rows.row(0, named=True)["consensus_uid"],
+                    )
+            elif isinstance(uid, int):
+                if uid in self.consensus_df["consensus_uid"].to_list():
+                    consensus_uids.append(uid)
+        # remove duplicates
+        consensus_uids = list(set(consensus_uids))
+        return consensus_uids
+
+
+def _get_sample_uids(self, samples=None, seed=42):
+    """
+    Helper function to get sample_uids from samples_df based on input samples.
+    If samples is None, returns all sample_uids.
+    If samples is a single integer, returns a random sample of sample_uids.
+    If samples is a list of strings, returns sample_uids corresponding to those sample_names.
+    If samples is a list of integers, returns sample_uids corresponding to those sample_uids.
+    """
+    if samples is None:
+        # get all sample_uids from samples_df
+        return self.samples_df["sample_uid"].to_list()
+    elif isinstance(samples, int):
+        # choose a random sample of sample_uids
+        if len(self.samples_df) > samples:
+            np.random.seed(seed)  # for reproducibility
+            return np.random.choice(
+                self.samples_df["sample_uid"].to_list(),
+                samples,
+                replace=False,
+            ).tolist()
+        else:
+            return self.samples_df["sample_uid"].to_list()
+    else:
+        # iterate over all samples. If the item is a string, assume it's a sample_name
+        sample_uids = []
+        for sample in samples:
+            if isinstance(sample, str):
+                matching_rows = self.samples_df.filter(pl.col("sample_name") == sample)
+                if not matching_rows.is_empty():
+                    sample_uids.append(
+                        matching_rows.row(0, named=True)["sample_uid"],
+                    )
+            elif isinstance(sample, int):
+                if sample in self.samples_df["sample_uid"].to_list():
+                    sample_uids.append(sample)
+        # remove duplicates
+        sample_uids = list(set(sample_uids))
+        return sample_uids
+
+def get_orphans(self):
+    """
+    Get all features that are not in the consensus mapping.
+    """
+    not_in_consensus = self.features_df.filter(~self.features_df['feature_uid'].is_in(self.consensus_mapping_df['feature_uid'].to_list()))
+    return not_in_consensus
+
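The same orphan selection can be phrased as a Polars anti-join, which skips materializing the uid list; a sketch under the same column names, not the released implementation:

    orphans = study.features_df.join(
        study.consensus_mapping_df.select("feature_uid"),
        on="feature_uid",
        how="anti",
    )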
+def compress(self, features=True, ms2=True, chrom=False, ms2_max=5):
+    """
+    Perform compress_features, compress_ms2, and compress_chrom operations.
+
+    Parameters:
+        ms2_max (int): Maximum number of MS2 replicates to keep per consensus_uid and energy combination
+    """
+    self.logger.info("Starting full compression...")
+    if features:
+        self.compress_features()
+    if ms2:
+        self.compress_ms2(max_replicates=ms2_max)
+    if chrom:
+        self.compress_chrom()
+    self.logger.info("Compression completed")
+
+
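A typical invocation of the combined compressor (hypothetical `study` object; chromatograms are kept here since chrom=False):

    study.compress(features=True, ms2=True, chrom=False, ms2_max=5)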
+def compress_features(self):
+    """
+    Compress features_df by:
+    1. Deleting features that are not associated to any consensus (according to consensus_mapping_df)
+    2. Setting the ms2_specs column to None to save memory
+    """
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.warning("No features_df found.")
+        return
+
+    if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
+        self.logger.warning("No consensus_mapping_df found.")
+        return
+
+    initial_count = len(self.features_df)
+
+    # Get feature_uids that are associated with consensus features
+    consensus_feature_uids = self.consensus_mapping_df["feature_uid"].to_list()
+
+    # Filter features_df to keep only features associated with consensus
+    self.features_df = self.features_df.filter(
+        pl.col("feature_uid").is_in(consensus_feature_uids)
+    )
+
+    # Set ms2_specs column to None if it exists
+    if "ms2_specs" in self.features_df.columns:
+        # Create a list of None values with the same length as the dataframe
+        # This preserves the Object dtype instead of converting to Null
+        none_values = [None] * len(self.features_df)
+        self.features_df = self.features_df.with_columns(
+            pl.Series("ms2_specs", none_values, dtype=pl.Object)
+        )
+
+    removed_count = initial_count - len(self.features_df)
+    self.logger.info(f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column")
+
+
+def restore_features(self, samples=None, maps=False):
+    """
+    Update specific columns (chrom, chrom_area, ms2_scans, ms2_specs) in features_df
+    from the corresponding samples by reading features_df from the sample5 file.
+    Use the feature_id for matching.
+
+    Parameters:
+        samples (list, optional): List of sample_uids or sample_names to restore.
+                                  If None, restores all samples.
+        maps (bool, optional): If True, also load featureXML data and update study.feature_maps.
+    """
+    import datetime
+    from masster.sample.sample import Sample
+
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.error("No features_df found in study.")
+        return
+
+    if self.samples_df is None or self.samples_df.is_empty():
+        self.logger.error("No samples_df found in study.")
+        return
+
+    # Get sample_uids to process
+    sample_uids = self._get_sample_uids(samples)
+
+    if not sample_uids:
+        self.logger.warning("No valid samples specified.")
+        return
+
+    # Columns to update from sample data
+    columns_to_update = ['chrom', 'chrom_area', 'ms2_scans', 'ms2_specs']
+
+    self.logger.info(f"Restoring columns {columns_to_update} from {len(sample_uids)} samples...")
+
+    # Create a mapping of (sample_uid, feature_id) to feature_uid from study.features_df
+    study_feature_mapping = {}
+    for row in self.features_df.iter_rows(named=True):
+        if "feature_id" in row and "feature_uid" in row and "sample_uid" in row:
+            key = (row["sample_uid"], row["feature_id"])
+            study_feature_mapping[key] = row["feature_uid"]
+
+    # Process each sample
+    tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+    for sample_uid in tqdm(sample_uids,
+                           unit="sample",
+                           disable=tqdm_disable,
+                           desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring samples"):
+        # Get sample info
+        sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
+        if sample_row.is_empty():
+            self.logger.warning(f"Sample with uid {sample_uid} not found in samples_df.")
+            continue
+
+        sample_info = sample_row.row(0, named=True)
+        sample_path = sample_info.get("sample_path")
+        sample_name = sample_info.get("sample_name")
+
+        if not sample_path or not os.path.exists(sample_path):
+            self.logger.warning(f"Sample file not found for {sample_name}: {sample_path}")
+            continue
+
+        try:
+            # Load sample to get its features_df
+            # Use a direct load call with map=False to prevent feature synchronization
+            # which would remove filled features that don't exist in the original FeatureMap
+            sample = Sample(log_level='DEBUG')
+            sample._load_sample5(sample_path, map=False)
+
+            if sample.features_df is None or sample.features_df.is_empty():
+                self.logger.warning(f"No features found in sample {sample_name}")
+                continue
+
+            # Create update data for this sample
+            updates_made = 0
+            for row in sample.features_df.iter_rows(named=True):
+                feature_id = row.get("feature_id")
+                if feature_id is None:
+                    continue
+
+                key = (sample_uid, feature_id)
+                if key in study_feature_mapping:
+                    feature_uid = study_feature_mapping[key]
+
+                    # Update the specific columns in study.features_df
+                    for col in columns_to_update:
+                        if col in row and col in self.features_df.columns:
+                            # Get the original column dtype to preserve it
+                            original_dtype = self.features_df[col].dtype
+
+                            # Update the specific row and column, preserving dtype
+                            mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
+
+                            # Handle object columns (like Chromatogram) differently
+                            if original_dtype == pl.Object:
+                                self.features_df = self.features_df.with_columns(
+                                    pl.when(mask)
+                                    .then(pl.lit(row[col], dtype=original_dtype, allow_object=True))
+                                    .otherwise(pl.col(col))
+                                    .alias(col)
+                                )
+                            else:
+                                self.features_df = self.features_df.with_columns(
+                                    pl.when(mask)
+                                    .then(pl.lit(row[col], dtype=original_dtype))
+                                    .otherwise(pl.col(col))
+                                    .alias(col)
+                                )
+                            updates_made += 1
+
+            self.logger.debug(f"Updated {updates_made} features from sample {sample_name}")
+
+            # If maps is True, load featureXML data
+            if maps:
+                if hasattr(sample, 'feature_maps'):
+                    self.feature_maps.extend(sample.feature_maps)
+
+        except Exception as e:
+            self.logger.error(f"Failed to load sample {sample_name}: {e}")
+            continue
+
+    self.logger.info(f"Completed restoring columns {columns_to_update} from {len(sample_uids)} samples")
+
+
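Usage sketch pairing compression with restoration (hypothetical `study` object; restore_features re-reads the dropped columns from the per-sample .sample5 files):

    study.compress()                                        # clear ms2_specs etc. to save memory
    study.restore_features(samples=["QC_01"], maps=False)   # bring the columns back for one sample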
+def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
+    """
+    Restore chromatograms from individual .sample5 files and gap-fill missing ones.
+
+    This function combines the functionality of restore_features() and fill_chrom():
+    1. First restores chromatograms from individual .sample5 files (like restore_features)
+    2. Then gap-fills any remaining empty chromatograms (like fill_chrom)
+    3. ONLY updates the 'chrom' column, not chrom_area or other derived values
+
+    Parameters:
+        samples (list, optional): List of sample_uids or sample_names to process.
+                                  If None, processes all samples.
+        mz_tol (float): m/z tolerance for gap filling (default: 0.010)
+        rt_tol (float): RT tolerance for gap filling (default: 10.0)
+    """
+    import datetime
+    import numpy as np
+    from masster.sample.sample import Sample
+    from masster.chromatogram import Chromatogram
+
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.error("No features_df found in study.")
+        return
+
+    if self.samples_df is None or self.samples_df.is_empty():
+        self.logger.error("No samples_df found in study.")
+        return
+
+    # Get sample_uids to process
+    sample_uids = self._get_sample_uids(samples)
+    if not sample_uids:
+        self.logger.warning("No valid samples specified.")
+        return
+
+    self.logger.info(f"Restoring chromatograms from {len(sample_uids)} samples...")
+
+    # Create mapping of (sample_uid, feature_id) to feature_uid
+    study_feature_mapping = {}
+    for row in self.features_df.iter_rows(named=True):
+        if "feature_id" in row and "feature_uid" in row and "sample_uid" in row:
+            key = (row["sample_uid"], row["feature_id"])
+            study_feature_mapping[key] = row["feature_uid"]
+
+    # Phase 1: Restore from individual .sample5 files (like restore_features)
+    restored_count = 0
+    tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+    self.logger.info("Phase 1: Restoring chromatograms from .sample5 files...")
+    for sample_uid in tqdm(sample_uids,
+                           desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring from samples",
+                           disable=tqdm_disable):
+
+        # Get sample info
+        sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
+        if sample_row.is_empty():
+            self.logger.warning(f"Sample with uid {sample_uid} not found.")
+            continue
+
+        sample_info = sample_row.row(0, named=True)
+        sample_path = sample_info.get("sample_path")
+        sample_name = sample_info.get("sample_name")
+
+        if not sample_path or not os.path.exists(sample_path):
+            self.logger.warning(f"Sample file not found: {sample_path}")
+            continue
+
+        try:
+            # Load sample (with map=False to prevent feature synchronization)
+            sample = Sample(log_level='WARNING')
+            sample._load_sample5(sample_path, map=False)
+
+            if sample.features_df is None or sample.features_df.is_empty():
+                self.logger.warning(f"No features found in sample {sample_name}")
+                continue
+
+            # Update chromatograms from this sample
+            for row in sample.features_df.iter_rows(named=True):
+                feature_id = row.get("feature_id")
+                chrom = row.get("chrom")
+
+                if feature_id is None or chrom is None:
+                    continue
+
+                key = (sample_uid, feature_id)
+                if key in study_feature_mapping:
+                    feature_uid = study_feature_mapping[key]
+
+                    # Update only the chrom column
+                    mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
+                    self.features_df = self.features_df.with_columns(
+                        pl.when(mask)
+                        .then(pl.lit(chrom, dtype=pl.Object, allow_object=True))
+                        .otherwise(pl.col("chrom"))
+                        .alias("chrom")
+                    )
+                    restored_count += 1
+
+        except Exception as e:
+            self.logger.error(f"Failed to load sample {sample_name}: {e}")
+            continue
+
+    self.logger.info(f"Phase 1 complete: Restored {restored_count} chromatograms from .sample5 files")
+
+    # Phase 2: Gap-fill remaining empty chromatograms (like fill_chrom)
+    self.logger.info("Phase 2: Gap-filling remaining empty chromatograms...")
+
+    # Count how many chromatograms are still missing
+    empty_chroms = self.features_df.filter(pl.col("chrom").is_null()).height
+    total_chroms = len(self.features_df)
+
+    self.logger.debug(f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms/total_chroms*100:.1f}%)")
+
+    if empty_chroms == 0:
+        self.logger.info("All chromatograms restored from .sample5 files. No gap-filling needed.")
+        return
+
+    # Get consensus info for gap filling
+    consensus_info = {}
+    for row in self.consensus_df.iter_rows(named=True):
+        consensus_info[row["consensus_uid"]] = {
+            "rt_start_mean": row["rt_start_mean"],
+            "rt_end_mean": row["rt_end_mean"],
+            "mz": row["mz"],
+            "rt": row["rt"],
+        }
+
+    filled_count = 0
+
+    # Process each sample that has missing chromatograms
+    for sample_uid in tqdm(sample_uids,
+                           desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Gap-filling missing chromatograms",
+                           disable=tqdm_disable):
+
+        # Get features with missing chromatograms for this sample
+        missing_features = self.features_df.filter(
+            (pl.col("sample_uid") == sample_uid) &
+            (pl.col("chrom").is_null())
+        )
+
+        if missing_features.is_empty():
+            continue
+
+        # Get sample info
+        sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
+        sample_info = sample_row.row(0, named=True)
+        sample_path = sample_info.get("sample_path")
+        sample_name = sample_info.get("sample_name")
+
+        if not sample_path or not os.path.exists(sample_path):
+            continue
+
+        try:
+            # Load sample for MS1 data extraction
+            sample = Sample(log_level='WARNING')
+            sample._load_sample5(sample_path, map=False)
+
+            if not hasattr(sample, 'ms1_df') or sample.ms1_df is None or sample.ms1_df.is_empty():
+                continue
+
+            # Process each missing feature
+            for feature_row in missing_features.iter_rows(named=True):
+                feature_uid = feature_row["feature_uid"]
+                mz = feature_row["mz"]
+                rt = feature_row["rt"]
+                rt_start = feature_row.get("rt_start", rt - rt_tol)
+                rt_end = feature_row.get("rt_end", rt + rt_tol)
+
+                # Extract EIC from MS1 data
+                d = sample.ms1_df.filter(
+                    (pl.col("mz") >= mz - mz_tol) &
+                    (pl.col("mz") <= mz + mz_tol) &
+                    (pl.col("rt") >= rt_start - rt_tol) &
+                    (pl.col("rt") <= rt_end + rt_tol)
+                )
+
+                # Create chromatogram
+                if d.is_empty():
+                    # Create empty chromatogram
+                    eic = Chromatogram(
+                        rt=np.array([rt_start, rt_end]),
+                        inty=np.array([0.0, 0.0]),
+                        label=f"EIC mz={mz:.4f} (gap-filled)",
+                        file=sample_path,
+                        mz=mz,
+                        mz_tol=mz_tol,
+                        feature_start=rt_start,
+                        feature_end=rt_end,
+                        feature_apex=rt,
+                    )
+                else:
+                    # Create real chromatogram from data
+                    eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
+
+                    if len(eic_rt) > 4:
+                        eic = Chromatogram(
+                            eic_rt["rt"].to_numpy(),
+                            eic_rt["inty"].to_numpy(),
+                            label=f"EIC mz={mz:.4f} (gap-filled)",
+                            file=sample_path,
+                            mz=mz,
+                            mz_tol=mz_tol,
+                            feature_start=rt_start,
+                            feature_end=rt_end,
+                            feature_apex=rt,
+                        ).find_peaks()
+                    else:
+                        eic = Chromatogram(
+                            eic_rt["rt"].to_numpy(),
+                            eic_rt["inty"].to_numpy(),
+                            label=f"EIC mz={mz:.4f} (gap-filled)",
+                            file=sample_path,
+                            mz=mz,
+                            mz_tol=mz_tol,
+                            feature_start=rt_start,
+                            feature_end=rt_end,
+                            feature_apex=rt,
+                        )
+
+                # Update the chromatogram in the study
+                mask = pl.col("feature_uid") == feature_uid
+                self.features_df = self.features_df.with_columns(
+                    pl.when(mask)
+                    .then(pl.lit(eic, dtype=pl.Object, allow_object=True))
+                    .otherwise(pl.col("chrom"))
+                    .alias("chrom")
+                )
+                filled_count += 1
+
+        except Exception as e:
+            self.logger.error(f"Failed to gap-fill sample {sample_name}: {e}")
+            continue
+
+    self.logger.info(f"Phase 2 complete: Gap-filled {filled_count} chromatograms")
+
+    # Final summary
+    final_non_null = self.features_df.filter(pl.col("chrom").is_not_null()).height
+    final_total = len(self.features_df)
+
+    self.logger.info(f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null/final_total*100:.1f}%)")
+    self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")
+
+
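Usage sketch for the two-phase restoration above (hypothetical `study` object; tolerances are m/z units and seconds):

    study.compress_chrom()                           # drop all chromatogram objects
    study.restore_chrom(mz_tol=0.010, rt_tol=10.0)   # phase 1 reloads, phase 2 gap-fills the rest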
+def compress_ms2(self, max_replicates=5):
+    """
+    Reduce the number of entries matching any pair of (consensus and energy) to max_replicates rows.
+    Groups all rows by consensus_uid and energy. For each group, sort by number_frags * prec_inty,
+    and then pick the top max_replicates rows. Discard the others.
+
+    Parameters:
+        max_replicates (int): Maximum number of replicates to keep per consensus_uid and energy combination
+    """
+    if self.consensus_ms2 is None or self.consensus_ms2.is_empty():
+        self.logger.warning("No consensus_ms2 found.")
+        return
+
+    initial_count = len(self.consensus_ms2)
+
+    # Create a ranking score based on number_frags * prec_inty
+    # Handle None values by treating them as 0
+    self.consensus_ms2 = self.consensus_ms2.with_columns([
+        (
+            pl.col("number_frags").fill_null(0) *
+            pl.col("prec_inty").fill_null(0)
+        ).alias("ranking_score")
+    ])
+
+    # Group by consensus_uid and energy, then rank by score and keep top max_replicates
+    compressed_ms2 = (
+        self.consensus_ms2
+        .with_row_count("row_id")  # Add row numbers for stable sorting
+        .sort(["consensus_uid", "energy", "ranking_score", "row_id"], descending=[False, False, True, False])
+        .with_columns([
+            pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank")
+        ])
+        .filter(pl.col("rank") < max_replicates)
+        .drop(["ranking_score", "row_id", "rank"])
+    )
+
+    self.consensus_ms2 = compressed_ms2
+
+    removed_count = initial_count - len(self.consensus_ms2)
+    self.logger.info(f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair")
+
+
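The sort / int_range().over() / filter chain above is a generic Polars top-N-per-group idiom; a self-contained sketch with toy names:

    import polars as pl

    df = pl.DataFrame({"grp": ["a", "a", "a", "b"], "score": [3, 1, 2, 9]})
    top2 = (
        df.sort(["grp", "score"], descending=[False, True])
        .with_columns(pl.int_range(pl.len()).over("grp").alias("rank"))
        .filter(pl.col("rank") < 2)
        .drop("rank")
    )
    # keeps the two highest-scoring rows per grp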
+def compress_chrom(self):
+    """
+    Set the chrom column in study.features_df to null to save memory.
+
+    This function clears all chromatogram objects from the features_df, which can
+    significantly reduce memory usage in large studies.
+    """
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.warning("No features_df found.")
+        return
+
+    if "chrom" not in self.features_df.columns:
+        self.logger.warning("No 'chrom' column found in features_df.")
+        return
+
+    # Count non-null chromatograms before compression
+    non_null_count = self.features_df.filter(pl.col("chrom").is_not_null()).height
+
+    # Set chrom column to None while keeping dtype as object
+    self.features_df = self.features_df.with_columns(
+        pl.lit(None, dtype=pl.Object).alias("chrom")
+    )
+
+    self.logger.info(f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df")
+
+
+def set_source(self, filename):
+    """
+    Reassign file_source for all samples in samples_df. If filename contains only a path,
+    keep the current basename and build an absolute path. Check that the new file exists
+    before overwriting the old file_source.
+
+    Parameters:
+        filename (str): New file path or directory path for all samples
+
+    Returns:
+        None
+    """
+    import os
+
+    if self.samples_df is None or len(self.samples_df) == 0:
+        self.logger.warning("No samples found in study.")
+        return
+
+    updated_count = 0
+    failed_count = 0
+
+    # Get all current file_source values
+    current_sources = self.samples_df.get_column("file_source").to_list()
+    sample_names = self.samples_df.get_column("sample_name").to_list()
+
+    new_sources = []
+
+    for i, (current_source, sample_name) in enumerate(zip(current_sources, sample_names)):
+        # Check if filename is just a directory path
+        if os.path.isdir(filename):
+            if current_source is None or current_source == "":
+                self.logger.warning(f"Cannot build path for sample '{sample_name}': no current file_source available")
+                new_sources.append(current_source)
+                failed_count += 1
+                continue
+
+            # Get the basename from current file_source
+            current_basename = os.path.basename(current_source)
+            # Build new absolute path
+            new_file_path = os.path.join(filename, current_basename)
+        else:
+            # filename is a full path, make it absolute
+            new_file_path = os.path.abspath(filename)
+
+        # Check if the new file exists
+        if not os.path.exists(new_file_path):
+            self.logger.warning(f"File does not exist for sample '{sample_name}': {new_file_path}")
+            new_sources.append(current_source)
+            failed_count += 1
+            continue
+
+        # File exists, update source
+        new_sources.append(new_file_path)
+        updated_count += 1
+
+        # Log individual updates at debug level
+        self.logger.debug(f"Updated file_source for sample '{sample_name}': {current_source} -> {new_file_path}")
+
+    # Update the samples_df with new file_source values
+    self.samples_df = self.samples_df.with_columns(
+        pl.Series("file_source", new_sources).alias("file_source")
+    )
+
+    # Log summary
+    if updated_count > 0:
+        self.logger.info(f"Updated file_source for {updated_count} samples")
+    if failed_count > 0:
+        self.logger.warning(f"Failed to update file_source for {failed_count} samples")
+
+
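Usage sketch for set_source after relocating raw files (path illustrative; a directory argument keeps each sample's current basename):

    study.set_source("D:/data/relocated_raw")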
+def features_select(
+    self,
+    mz=None,
+    rt=None,
+    inty=None,
+    sample_uid=None,
+    sample_name=None,
+    consensus_uid=None,
+    feature_uid=None,
+    filled=None,
+    quality=None,
+    chrom_coherence=None,
+    chrom_prominence=None,
+    chrom_prominence_scaled=None,
+    chrom_height_scaled=None,
+):
+    """
+    Select features from features_df based on specified criteria and return the filtered DataFrame.
+
+    OPTIMIZED VERSION: Combines all filters into a single operation for better performance.
+
+    Parameters:
+        mz: m/z range filter (tuple for range, single value for minimum)
+        rt: retention time range filter (tuple for range, single value for minimum)
+        inty: intensity filter (tuple for range, single value for minimum)
+        sample_uid: sample UID filter (list, single value, or tuple for range)
+        sample_name: sample name filter (list or single value)
+        consensus_uid: consensus UID filter (list, single value, or tuple for range)
+        feature_uid: feature UID filter (list, single value, or tuple for range)
+        filled: filter for filled/not filled features (bool)
+        quality: quality score filter (tuple for range, single value for minimum)
+        chrom_coherence: chromatogram coherence filter (tuple for range, single value for minimum)
+        chrom_prominence: chromatogram prominence filter (tuple for range, single value for minimum)
+        chrom_prominence_scaled: scaled chromatogram prominence filter (tuple for range, single value for minimum)
+        chrom_height_scaled: scaled chromatogram height filter (tuple for range, single value for minimum)
+
+    Returns:
+        polars.DataFrame: Filtered features DataFrame
+    """
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.warning("No features found in study.")
+        return pl.DataFrame()
+
+    # Early return if no filters provided - performance optimization
+    filter_params = [mz, rt, inty, sample_uid, sample_name, consensus_uid,
+                     feature_uid, filled, quality, chrom_coherence,
+                     chrom_prominence, chrom_prominence_scaled, chrom_height_scaled]
+    if all(param is None for param in filter_params):
+        return self.features_df.clone()
+
+    initial_count = len(self.features_df)
+
+    # Pre-check available columns once for efficiency
+    available_columns = set(self.features_df.columns)
+
+    # Build all filter conditions first, then apply them all at once
+    filter_conditions = []
+    warnings = []
+
+    # Filter by m/z
+    if mz is not None:
+        if isinstance(mz, tuple) and len(mz) == 2:
+            min_mz, max_mz = mz
+            filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
+        else:
+            filter_conditions.append(pl.col("mz") >= mz)
+
+    # Filter by retention time
+    if rt is not None:
+        if isinstance(rt, tuple) and len(rt) == 2:
+            min_rt, max_rt = rt
+            filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
+        else:
+            filter_conditions.append(pl.col("rt") >= rt)
+
+    # Filter by intensity
+    if inty is not None:
+        if isinstance(inty, tuple) and len(inty) == 2:
+            min_inty, max_inty = inty
+            filter_conditions.append((pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty))
+        else:
+            filter_conditions.append(pl.col("inty") >= inty)
+
+    # Filter by sample_uid
+    if sample_uid is not None:
+        if isinstance(sample_uid, (list, tuple)):
+            if len(sample_uid) == 2 and not isinstance(sample_uid, list):
+                # Treat as range
+                min_uid, max_uid = sample_uid
+                filter_conditions.append((pl.col("sample_uid") >= min_uid) & (pl.col("sample_uid") <= max_uid))
+            else:
+                # Treat as list
+                filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
+        else:
+            filter_conditions.append(pl.col("sample_uid") == sample_uid)
+
+    # Filter by sample_name (requires pre-processing)
+    if sample_name is not None:
+        # Get sample_uids for the given sample names
+        if isinstance(sample_name, list):
+            sample_uids_for_names = self.samples_df.filter(
+                pl.col("sample_name").is_in(sample_name)
+            )["sample_uid"].to_list()
+        else:
+            sample_uids_for_names = self.samples_df.filter(
+                pl.col("sample_name") == sample_name
+            )["sample_uid"].to_list()
+
+        if sample_uids_for_names:
+            filter_conditions.append(pl.col("sample_uid").is_in(sample_uids_for_names))
+        else:
+            filter_conditions.append(pl.lit(False))  # No matching samples
+
+    # Filter by consensus_uid
+    if consensus_uid is not None:
+        if isinstance(consensus_uid, (list, tuple)):
+            if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
+                # Treat as range
+                min_uid, max_uid = consensus_uid
+                filter_conditions.append((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
+            else:
+                # Treat as list
+                filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
+        else:
+            filter_conditions.append(pl.col("consensus_uid") == consensus_uid)
+
+    # Filter by feature_uid
+    if feature_uid is not None:
+        if isinstance(feature_uid, (list, tuple)):
+            if len(feature_uid) == 2 and not isinstance(feature_uid, list):
+                # Treat as range
+                min_uid, max_uid = feature_uid
+                filter_conditions.append((pl.col("feature_uid") >= min_uid) & (pl.col("feature_uid") <= max_uid))
+            else:
+                # Treat as list
+                filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
+        else:
+            filter_conditions.append(pl.col("feature_uid") == feature_uid)
+
+    # Filter by filled status
+    if filled is not None:
+        if "filled" in available_columns:
+            if filled:
+                filter_conditions.append(pl.col("filled"))
+            else:
+                filter_conditions.append(~pl.col("filled") | pl.col("filled").is_null())
+        else:
+            warnings.append("'filled' column not found in features_df")
+
+    # Filter by quality
+    if quality is not None:
+        if "quality" in available_columns:
+            if isinstance(quality, tuple) and len(quality) == 2:
+                min_quality, max_quality = quality
+                filter_conditions.append((pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality))
+            else:
+                filter_conditions.append(pl.col("quality") >= quality)
+        else:
+            warnings.append("'quality' column not found in features_df")
+
+    # Filter by chromatogram coherence
+    if chrom_coherence is not None:
+        if "chrom_coherence" in available_columns:
+            if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
+                min_coherence, max_coherence = chrom_coherence
+                filter_conditions.append((pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence))
+            else:
+                filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
+        else:
+            warnings.append("'chrom_coherence' column not found in features_df")
+
+    # Filter by chromatogram prominence
+    if chrom_prominence is not None:
+        if "chrom_prominence" in available_columns:
+            if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
+                min_prominence, max_prominence = chrom_prominence
+                filter_conditions.append((pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence))
+            else:
+                filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
|
|
1154
|
+
else:
|
|
1155
|
+
warnings.append("'chrom_prominence' column not found in features_df")
|
|
1156
|
+
|
|
1157
|
+
# Filter by scaled chromatogram prominence
|
|
1158
|
+
if chrom_prominence_scaled is not None:
|
|
1159
|
+
if "chrom_prominence_scaled" in available_columns:
|
|
1160
|
+
if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
|
|
1161
|
+
min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
|
|
1162
|
+
filter_conditions.append((pl.col("chrom_prominence_scaled") >= min_prominence_scaled) & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled))
|
|
1163
|
+
else:
|
|
1164
|
+
filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
|
|
1165
|
+
else:
|
|
1166
|
+
warnings.append("'chrom_prominence_scaled' column not found in features_df")
|
|
1167
|
+
|
|
1168
|
+
# Filter by scaled chromatogram height
|
|
1169
|
+
if chrom_height_scaled is not None:
|
|
1170
|
+
if "chrom_height_scaled" in available_columns:
|
|
1171
|
+
if isinstance(chrom_height_scaled, tuple) and len(chrom_height_scaled) == 2:
|
|
1172
|
+
min_height_scaled, max_height_scaled = chrom_height_scaled
|
|
1173
|
+
filter_conditions.append((pl.col("chrom_height_scaled") >= min_height_scaled) & (pl.col("chrom_height_scaled") <= max_height_scaled))
|
|
1174
|
+
else:
|
|
1175
|
+
filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
|
|
1176
|
+
else:
|
|
1177
|
+
warnings.append("'chrom_height_scaled' column not found in features_df")
|
|
1178
|
+
|
|
1179
|
+
# Log all warnings once at the end for efficiency
|
|
1180
|
+
for warning in warnings:
|
|
1181
|
+
self.logger.warning(warning)
|
|
1182
|
+
|
|
1183
|
+
# Apply all filters at once using lazy evaluation for optimal performance
|
|
1184
|
+
if filter_conditions:
|
|
1185
|
+
# Combine all conditions with AND
|
|
1186
|
+
combined_filter = filter_conditions[0]
|
|
1187
|
+
for condition in filter_conditions[1:]:
|
|
1188
|
+
combined_filter = combined_filter & condition
|
|
1189
|
+
|
|
1190
|
+
# Apply the combined filter using lazy evaluation
|
|
1191
|
+
feats = self.features_df.lazy().filter(combined_filter).collect()
|
|
1192
|
+
else:
|
|
1193
|
+
feats = self.features_df.clone()
|
|
1194
|
+
|
|
1195
|
+
final_count = len(feats)
|
|
1196
|
+
|
|
1197
|
+
if final_count == 0:
|
|
1198
|
+
self.logger.warning("No features remaining after applying selection criteria.")
|
|
1199
|
+
else:
|
|
1200
|
+
removed_count = initial_count - final_count
|
|
1201
|
+
self.logger.info(f"Features selected: {final_count} (removed: {removed_count})")
|
|
1202
|
+
|
|
1203
|
+
return feats
|
|
1204
|
+
|
|
1205
|
+
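
A minimal usage sketch for features_select (hedged: `study` names an already-loaded Study instance, and the thresholds are illustrative, not recommended defaults). Scalar arguments act as minimum thresholds, 2-tuples bound a range, lists match exact values, and all supplied criteria are AND-combined into a single polars filter:

    # Select features between m/z 200 and 400, eluting at rt >= 60,
    # with quality >= 0.5 (all criteria must hold simultaneously).
    subset = study.features_select(mz=(200.0, 400.0), rt=60.0, quality=0.5)

    # A 2-tuple is a range; a list is a set of exact matches.
    by_range = study.features_select(sample_uid=(1, 7))    # UIDs 1 through 7
    by_list = study.features_select(sample_uid=[1, 3, 7])  # exactly these UIDs
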
def features_filter(self, features):
    """
    Filter features_df by removing all features that match the given criteria.
    This is the inverse of features_select - it removes the selected features.

    OPTIMIZED VERSION: Batch operations and reduced overhead for better performance.

    Parameters:
        features: Features to remove. Can be:
            - polars.DataFrame: Features DataFrame (will use feature_uid column)
            - list: List of feature_uids to remove
            - int: Single feature_uid to remove

    Returns:
        None (modifies self.features_df in place)
    """
    if self.features_df is None or self.features_df.is_empty():
        self.logger.warning("No features found in study.")
        return

    # Early return if no features provided
    if features is None:
        self.logger.warning("No features provided for filtering.")
        return

    initial_count = len(self.features_df)

    # Determine feature_uids to remove - optimized type checking
    if isinstance(features, pl.DataFrame):
        if "feature_uid" not in features.columns:
            self.logger.error("features DataFrame must contain 'feature_uid' column")
            return
        feature_uids_to_remove = features["feature_uid"].to_list()
    elif isinstance(features, (list, tuple)):
        feature_uids_to_remove = list(features)  # Convert tuple to list if needed
    elif isinstance(features, int):
        feature_uids_to_remove = [features]
    else:
        self.logger.error("features parameter must be a DataFrame, list, tuple, or int")
        return

    # Early return if no UIDs to remove
    if not feature_uids_to_remove:
        self.logger.warning("No feature UIDs provided for filtering.")
        return

    # Convert to set for faster lookup if list is large
    if len(feature_uids_to_remove) > 100:
        feature_uids_set = set(feature_uids_to_remove)
        # Use the set for filtering if it's significantly smaller
        if len(feature_uids_set) < len(feature_uids_to_remove) * 0.8:
            feature_uids_to_remove = list(feature_uids_set)

    # Create filter condition once
    filter_condition = ~pl.col("feature_uid").is_in(feature_uids_to_remove)

    # Apply filter to features_df using lazy evaluation for better performance
    self.features_df = self.features_df.lazy().filter(filter_condition).collect()

    # Apply filter to consensus_mapping_df if it exists - batch operation
    mapping_removed_count = 0
    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
        initial_mapping_count = len(self.consensus_mapping_df)
        self.consensus_mapping_df = (
            self.consensus_mapping_df
            .lazy()
            .filter(filter_condition)
            .collect()
        )
        mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)

    # Calculate results once and log efficiently
    final_count = len(self.features_df)
    removed_count = initial_count - final_count

    # Single comprehensive log message
    if mapping_removed_count > 0:
        self.logger.info(f"Filtered {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}")
    else:
        self.logger.info(f"Filtered {removed_count} features. Remaining features: {final_count}")
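
Because features_filter removes exactly the rows that features_select would return, the two compose into a select-then-remove idiom. A sketch under the same assumptions as above (`study` and the UIDs are illustrative):

    # Remove gap-filled features whose quality falls in [0.0, 0.2].
    bad = study.features_select(filled=True, quality=(0.0, 0.2))
    study.features_filter(bad)         # accepts the selection DataFrame
    study.features_filter([42, 43])    # or a plain list of feature_uids

Note that matching rows in consensus_mapping_df are dropped alongside the features themselves, so the mapping table stays consistent with features_df.
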
def features_delete(self, features):
    """
    Delete features from features_df based on feature identifiers.
    This is an alias for features_filter for consistency with sample.features_delete().

    Parameters:
        features: Features to delete. Can be:
            - polars.DataFrame: Features DataFrame (will use feature_uid column)
            - list: List of feature_uids to delete
            - int: Single feature_uid to delete

    Returns:
        None (modifies self.features_df in place)
    """
    self.features_filter(features)
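
The alias keeps study-level code symmetrical with the sample-level API: study.features_delete(42) behaves identically to study.features_filter(42) (UID illustrative).
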
def consensus_select(
    self,
    mz=None,
    rt=None,
    inty_mean=None,
    consensus_uid=None,
    consensus_id=None,
    number_samples=None,
    number_ms2=None,
    quality=None,
    bl=None,
    chrom_coherence_mean=None,
    chrom_prominence_mean=None,
    chrom_prominence_scaled_mean=None,
    chrom_height_scaled_mean=None,
    rt_delta_mean=None,
):
    """
    Select consensus features from consensus_df based on specified criteria and return the filtered DataFrame.

    Parameters:
        mz: m/z range filter (tuple for range, single value for minimum)
        rt: retention time range filter (tuple for range, single value for minimum)
        inty_mean: mean intensity filter (tuple for range, single value for minimum)
        consensus_uid: consensus UID filter (list, single value, or tuple for range)
        consensus_id: consensus ID filter (list or single value)
        number_samples: number of samples filter (tuple for range, single value for minimum)
        number_ms2: number of MS2 spectra filter (tuple for range, single value for minimum)
        quality: quality score filter (tuple for range, single value for minimum)
        bl: baseline filter (tuple for range, single value for minimum)
        chrom_coherence_mean: mean chromatogram coherence filter (tuple for range, single value for minimum)
        chrom_prominence_mean: mean chromatogram prominence filter (tuple for range, single value for minimum)
        chrom_prominence_scaled_mean: mean scaled chromatogram prominence filter (tuple for range, single value for minimum)
        chrom_height_scaled_mean: mean scaled chromatogram height filter (tuple for range, single value for minimum)
        rt_delta_mean: mean RT delta filter (tuple for range, single value for minimum)

    Returns:
        polars.DataFrame: Filtered consensus DataFrame
    """
    if self.consensus_df is None or self.consensus_df.is_empty():
        self.logger.warning("No consensus features found in study.")
        return pl.DataFrame()

    consensus = self.consensus_df.clone()
    initial_count = len(consensus)

    # Filter by m/z
    if mz is not None:
        consensus_len_before_filter = len(consensus)
        if isinstance(mz, tuple) and len(mz) == 2:
            min_mz, max_mz = mz
            consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
        else:
            consensus = consensus.filter(pl.col("mz") >= mz)
        self.logger.debug(
            f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}"
        )

    # Filter by retention time
    if rt is not None:
        consensus_len_before_filter = len(consensus)
        if isinstance(rt, tuple) and len(rt) == 2:
            min_rt, max_rt = rt
            consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
        else:
            consensus = consensus.filter(pl.col("rt") >= rt)
        self.logger.debug(
            f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}"
        )

    # Filter by mean intensity
    if inty_mean is not None:
        consensus_len_before_filter = len(consensus)
        if isinstance(inty_mean, tuple) and len(inty_mean) == 2:
            min_inty, max_inty = inty_mean
            consensus = consensus.filter((pl.col("inty_mean") >= min_inty) & (pl.col("inty_mean") <= max_inty))
        else:
            consensus = consensus.filter(pl.col("inty_mean") >= inty_mean)
        self.logger.debug(
            f"Selected consensus by inty_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
        )

    # Filter by consensus_uid
    if consensus_uid is not None:
        consensus_len_before_filter = len(consensus)
        if isinstance(consensus_uid, (list, tuple)):
            if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
                # Treat as range
                min_uid, max_uid = consensus_uid
                consensus = consensus.filter((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
            else:
                # Treat as list
                consensus = consensus.filter(pl.col("consensus_uid").is_in(consensus_uid))
        else:
            consensus = consensus.filter(pl.col("consensus_uid") == consensus_uid)
        self.logger.debug(
            f"Selected consensus by consensus_uid. Consensus removed: {consensus_len_before_filter - len(consensus)}"
        )

    # Filter by consensus_id
    if consensus_id is not None:
        consensus_len_before_filter = len(consensus)
        if isinstance(consensus_id, list):
            consensus = consensus.filter(pl.col("consensus_id").is_in(consensus_id))
        else:
            consensus = consensus.filter(pl.col("consensus_id") == consensus_id)
        self.logger.debug(
            f"Selected consensus by consensus_id. Consensus removed: {consensus_len_before_filter - len(consensus)}"
        )

    # Filter by number of samples
    if number_samples is not None:
        consensus_len_before_filter = len(consensus)
        if isinstance(number_samples, tuple) and len(number_samples) == 2:
            min_samples, max_samples = number_samples
            consensus = consensus.filter((pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples))
        else:
            consensus = consensus.filter(pl.col("number_samples") >= number_samples)
        self.logger.debug(
            f"Selected consensus by number_samples. Consensus removed: {consensus_len_before_filter - len(consensus)}"
        )

    # Filter by number of MS2 spectra
    if number_ms2 is not None:
        consensus_len_before_filter = len(consensus)
        if "number_ms2" in consensus.columns:
            if isinstance(number_ms2, tuple) and len(number_ms2) == 2:
                min_ms2, max_ms2 = number_ms2
                consensus = consensus.filter((pl.col("number_ms2") >= min_ms2) & (pl.col("number_ms2") <= max_ms2))
            else:
                consensus = consensus.filter(pl.col("number_ms2") >= number_ms2)
        else:
            self.logger.warning("'number_ms2' column not found in consensus_df")
        self.logger.debug(
            f"Selected consensus by number_ms2. Consensus removed: {consensus_len_before_filter - len(consensus)}"
        )

    # Filter by quality
    if quality is not None:
        consensus_len_before_filter = len(consensus)
        if isinstance(quality, tuple) and len(quality) == 2:
            min_quality, max_quality = quality
            consensus = consensus.filter((pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality))
        else:
            consensus = consensus.filter(pl.col("quality") >= quality)
        self.logger.debug(
            f"Selected consensus by quality. Consensus removed: {consensus_len_before_filter - len(consensus)}"
        )

    # Filter by baseline
    if bl is not None:
        consensus_len_before_filter = len(consensus)
        if "bl" in consensus.columns:
            if isinstance(bl, tuple) and len(bl) == 2:
                min_bl, max_bl = bl
                consensus = consensus.filter((pl.col("bl") >= min_bl) & (pl.col("bl") <= max_bl))
            else:
                consensus = consensus.filter(pl.col("bl") >= bl)
        else:
            self.logger.warning("'bl' column not found in consensus_df")
        self.logger.debug(
            f"Selected consensus by bl. Consensus removed: {consensus_len_before_filter - len(consensus)}"
        )

    # Filter by mean chromatogram coherence
    if chrom_coherence_mean is not None:
        consensus_len_before_filter = len(consensus)
        if "chrom_coherence_mean" in consensus.columns:
            if isinstance(chrom_coherence_mean, tuple) and len(chrom_coherence_mean) == 2:
                min_coherence, max_coherence = chrom_coherence_mean
                consensus = consensus.filter((pl.col("chrom_coherence_mean") >= min_coherence) & (pl.col("chrom_coherence_mean") <= max_coherence))
            else:
                consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
        else:
            self.logger.warning("'chrom_coherence_mean' column not found in consensus_df")
        self.logger.debug(
            f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
        )

    # Filter by mean chromatogram prominence
    if chrom_prominence_mean is not None:
        consensus_len_before_filter = len(consensus)
        if "chrom_prominence_mean" in consensus.columns:
            if isinstance(chrom_prominence_mean, tuple) and len(chrom_prominence_mean) == 2:
                min_prominence, max_prominence = chrom_prominence_mean
                consensus = consensus.filter((pl.col("chrom_prominence_mean") >= min_prominence) & (pl.col("chrom_prominence_mean") <= max_prominence))
            else:
                consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
        else:
            self.logger.warning("'chrom_prominence_mean' column not found in consensus_df")
        self.logger.debug(
            f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
        )

    # Filter by mean scaled chromatogram prominence
    if chrom_prominence_scaled_mean is not None:
        consensus_len_before_filter = len(consensus)
        if "chrom_prominence_scaled_mean" in consensus.columns:
            if isinstance(chrom_prominence_scaled_mean, tuple) and len(chrom_prominence_scaled_mean) == 2:
                min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
                consensus = consensus.filter((pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled) & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled))
            else:
                consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
        else:
            self.logger.warning("'chrom_prominence_scaled_mean' column not found in consensus_df")
        self.logger.debug(
            f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
        )

    # Filter by mean scaled chromatogram height
    if chrom_height_scaled_mean is not None:
        consensus_len_before_filter = len(consensus)
        if "chrom_height_scaled_mean" in consensus.columns:
            if isinstance(chrom_height_scaled_mean, tuple) and len(chrom_height_scaled_mean) == 2:
                min_height_scaled, max_height_scaled = chrom_height_scaled_mean
                consensus = consensus.filter((pl.col("chrom_height_scaled_mean") >= min_height_scaled) & (pl.col("chrom_height_scaled_mean") <= max_height_scaled))
            else:
                consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
        else:
            self.logger.warning("'chrom_height_scaled_mean' column not found in consensus_df")
        self.logger.debug(
            f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
        )

    # Filter by mean RT delta
    if rt_delta_mean is not None:
        consensus_len_before_filter = len(consensus)
        if "rt_delta_mean" in consensus.columns:
            if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
                min_rt_delta, max_rt_delta = rt_delta_mean
                consensus = consensus.filter((pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta))
            else:
                consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
        else:
            self.logger.warning("'rt_delta_mean' column not found in consensus_df")
        self.logger.debug(
            f"Selected consensus by rt_delta_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
        )

    if len(consensus) == 0:
        self.logger.warning("No consensus features remaining after applying selection criteria.")
    else:
        self.logger.info(f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})")

    return consensus
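
A hedged sketch of consensus_select (same assumptions: `study` is a loaded Study instance, values illustrative). As in features_select, scalars act as minimums and 2-tuples as ranges; here the criteria are applied sequentially, with a debug log per criterion:

    # Consensus features detected in at least 5 samples, with at least
    # one MS2 spectrum, inside a 100-900 m/z window.
    keep = study.consensus_select(mz=(100.0, 900.0), number_samples=5, number_ms2=1)
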
def consensus_filter(self, consensus):
    """
    Filter consensus_df by removing all consensus features that match the given criteria.
    This also removes related entries from consensus_mapping_df, features_df, and consensus_ms2.

    Parameters:
        consensus: Consensus features to remove. Can be:
            - polars.DataFrame: Consensus DataFrame (will use consensus_uid column)
            - list: List of consensus_uids to remove
            - int: Single consensus_uid to remove

    Returns:
        None (modifies self.consensus_df and related DataFrames in place)
    """
    if self.consensus_df is None or self.consensus_df.is_empty():
        self.logger.warning("No consensus features found in study.")
        return

    initial_consensus_count = len(self.consensus_df)

    # Determine consensus_uids to remove
    if isinstance(consensus, pl.DataFrame):
        if "consensus_uid" not in consensus.columns:
            self.logger.error("consensus DataFrame must contain 'consensus_uid' column")
            return
        consensus_uids_to_remove = consensus["consensus_uid"].to_list()
    elif isinstance(consensus, list):
        consensus_uids_to_remove = consensus
    elif isinstance(consensus, int):
        consensus_uids_to_remove = [consensus]
    else:
        self.logger.error("consensus parameter must be a DataFrame, list, or int")
        return

    if not consensus_uids_to_remove:
        self.logger.warning("No consensus UIDs provided for filtering.")
        return

    # Get feature_uids that need to be removed from features_df
    feature_uids_to_remove = []
    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
        feature_uids_to_remove = self.consensus_mapping_df.filter(
            pl.col("consensus_uid").is_in(consensus_uids_to_remove)
        )["feature_uid"].to_list()

    # Remove consensus features from consensus_df
    self.consensus_df = self.consensus_df.filter(
        ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
    )

    # Remove from consensus_mapping_df
    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
        initial_mapping_count = len(self.consensus_mapping_df)
        self.consensus_mapping_df = self.consensus_mapping_df.filter(
            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
        )
        removed_mapping_count = initial_mapping_count - len(self.consensus_mapping_df)
        if removed_mapping_count > 0:
            self.logger.debug(f"Removed {removed_mapping_count} entries from consensus_mapping_df")

    # Remove corresponding features from features_df
    if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
        initial_features_count = len(self.features_df)
        self.features_df = self.features_df.filter(
            ~pl.col("feature_uid").is_in(feature_uids_to_remove)
        )
        removed_features_count = initial_features_count - len(self.features_df)
        if removed_features_count > 0:
            self.logger.debug(f"Removed {removed_features_count} entries from features_df")

    # Remove from consensus_ms2 if it exists
    if hasattr(self, 'consensus_ms2') and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
        initial_ms2_count = len(self.consensus_ms2)
        self.consensus_ms2 = self.consensus_ms2.filter(
            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
        )
        removed_ms2_count = initial_ms2_count - len(self.consensus_ms2)
        if removed_ms2_count > 0:
            self.logger.debug(f"Removed {removed_ms2_count} entries from consensus_ms2")

    removed_consensus_count = initial_consensus_count - len(self.consensus_df)
    self.logger.info(f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}")
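
Since consensus_filter cascades through consensus_mapping_df, features_df, and consensus_ms2, removing consensus features cannot leave dangling mappings, features, or MS2 entries. A sketch under the same assumptions as above (criteria illustrative):

    # Drop singleton consensus features (seen in at most one sample);
    # their mapped features and MS2 entries are removed as well.
    singletons = study.consensus_select(number_samples=(0, 1))
    study.consensus_filter(singletons)
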
def consensus_delete(self, consensus):
    """
    Delete consensus features from consensus_df based on consensus identifiers.
    This is an alias for consensus_filter for consistency with other delete methods.

    Parameters:
        consensus: Consensus features to delete. Can be:
            - polars.DataFrame: Consensus DataFrame (will use consensus_uid column)
            - list: List of consensus_uids to delete
            - int: Single consensus_uid to delete

    Returns:
        None (modifies self.consensus_df and related DataFrames in place)
    """
    self.consensus_filter(consensus)