masster-0.2.5-py3-none-any.whl → masster-0.3.0-py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/__init__.py +27 -27
- masster/_version.py +17 -17
- masster/chromatogram.py +497 -503
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
- masster/logger.py +318 -244
- masster/sample/__init__.py +9 -9
- masster/sample/defaults/__init__.py +15 -15
- masster/sample/defaults/find_adducts_def.py +325 -325
- masster/sample/defaults/find_features_def.py +366 -366
- masster/sample/defaults/find_ms2_def.py +285 -285
- masster/sample/defaults/get_spectrum_def.py +314 -318
- masster/sample/defaults/sample_def.py +374 -378
- masster/sample/h5.py +1321 -1297
- masster/sample/helpers.py +833 -364
- masster/sample/lib.py +762 -0
- masster/sample/load.py +1220 -1187
- masster/sample/parameters.py +131 -131
- masster/sample/plot.py +1610 -1622
- masster/sample/processing.py +1402 -1416
- masster/sample/quant.py +209 -0
- masster/sample/sample.py +391 -387
- masster/sample/sample5_schema.json +181 -181
- masster/sample/save.py +737 -736
- masster/sample/sciex.py +1213 -0
- masster/spectrum.py +1287 -1319
- masster/study/__init__.py +9 -9
- masster/study/defaults/__init__.py +21 -19
- masster/study/defaults/align_def.py +267 -267
- masster/study/defaults/export_def.py +41 -40
- masster/study/defaults/fill_chrom_def.py +264 -264
- masster/study/defaults/fill_def.py +260 -0
- masster/study/defaults/find_consensus_def.py +256 -256
- masster/study/defaults/find_ms2_def.py +163 -163
- masster/study/defaults/integrate_chrom_def.py +225 -225
- masster/study/defaults/integrate_def.py +221 -0
- masster/study/defaults/merge_def.py +256 -0
- masster/study/defaults/study_def.py +272 -269
- masster/study/export.py +674 -287
- masster/study/h5.py +1398 -886
- masster/study/helpers.py +1650 -433
- masster/study/helpers_optimized.py +317 -0
- masster/study/load.py +1201 -1078
- masster/study/parameters.py +99 -99
- masster/study/plot.py +632 -645
- masster/study/processing.py +1057 -1046
- masster/study/save.py +149 -134
- masster/study/study.py +606 -522
- masster/study/study5_schema.json +247 -241
- {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
- masster-0.3.0.dist-info/RECORD +59 -0
- {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
- masster-0.2.5.dist-info/RECORD +0 -50
- {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
- {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
masster/sample/helpers.py
CHANGED
@@ -1,364 +1,833 @@
(removed: all 364 lines of the previous helpers.py; apart from stray fragments such as `from __future__ import annotations` and `import polars as pl`, the old content did not survive in the rendered diff and is not reproduced here)
(added: the new 833-line helpers.py, shown below with the diff viewer's line-number gutter stripped and Python indentation restored)

from __future__ import annotations

import polars as pl


# Parameters removed - using hardcoded defaults


def _estimate_memory_usage(self):
    """
    Estimate the memory usage of all dataframes in the Sample object.

    Returns:
        dict: A dictionary containing memory usage estimates for each dataframe
        and the total memory usage in bytes and MB.
    """
    memory_usage = {}
    total_bytes = 0

    # Check features_df
    if self.features_df is not None and len(self.features_df) > 0:
        features_bytes = self.features_df.estimated_size()
        memory_usage['features_df'] = {
            'rows': len(self.features_df),
            'columns': len(self.features_df.columns),
            'bytes': features_bytes,
            'mb': features_bytes / (1024 * 1024)
        }
        total_bytes += features_bytes
    else:
        memory_usage['features_df'] = {'rows': 0, 'columns': 0, 'bytes': 0, 'mb': 0}

    # Check scans_df
    if self.scans_df is not None and len(self.scans_df) > 0:
        scans_bytes = self.scans_df.estimated_size()
        memory_usage['scans_df'] = {
            'rows': len(self.scans_df),
            'columns': len(self.scans_df.columns),
            'bytes': scans_bytes,
            'mb': scans_bytes / (1024 * 1024)
        }
        total_bytes += scans_bytes
    else:
        memory_usage['scans_df'] = {'rows': 0, 'columns': 0, 'bytes': 0, 'mb': 0}

    # Check ms1_df
    if self.ms1_df is not None and len(self.ms1_df) > 0:
        ms1_bytes = self.ms1_df.estimated_size()
        memory_usage['ms1_df'] = {
            'rows': len(self.ms1_df),
            'columns': len(self.ms1_df.columns),
            'bytes': ms1_bytes,
            'mb': ms1_bytes / (1024 * 1024)
        }
        total_bytes += ms1_bytes
    else:
        memory_usage['ms1_df'] = {'rows': 0, 'columns': 0, 'bytes': 0, 'mb': 0}

    # Check chrom_df
    if self.chrom_df is not None and len(self.chrom_df) > 0:
        chrom_bytes = self.chrom_df.estimated_size()
        memory_usage['chrom_df'] = {
            'rows': len(self.chrom_df),
            'columns': len(self.chrom_df.columns),
            'bytes': chrom_bytes,
            'mb': chrom_bytes / (1024 * 1024)
        }
        total_bytes += chrom_bytes
    else:
        memory_usage['chrom_df'] = {'rows': 0, 'columns': 0, 'bytes': 0, 'mb': 0}

    # Add total memory usage
    memory_usage['total'] = {
        'bytes': total_bytes,
        'mb': total_bytes / (1024 * 1024),
        'gb': total_bytes / (1024 * 1024 * 1024)
    }

    # Log the memory usage summary
    if hasattr(self, 'logger'):
        self.logger.debug(f"Total DataFrame memory usage: {memory_usage['total']['mb']:.2f} MB")
        for df_name, stats in memory_usage.items():
            if df_name != 'total' and stats['bytes'] > 0:
                self.logger.debug(f"{df_name}: {stats['rows']} rows, {stats['mb']:.2f} MB")

    return memory_usage['total']['mb']


def get_dda_stats(self):
    # filter self.scans_df with ms_level 1
    ms1 = self.scans_df.filter(pl.col("ms_level") == 1)
    return ms1


# TODO


def get_feature(self, feature_uid):
    # get the feature with feature_uid == feature_uid
    feature = self.features_df.filter(pl.col("feature_uid") == feature_uid)
    if len(feature) == 0:
        self.logger.warning(f"Feature {feature_uid} not found.")
        return None
    else:
        return feature.row(0, named=True)


def _get_scan_uids(self, scans=None, verbose=True):
    if scans is None:
        # get all scan_uids from scans_df
        scans_uids = self.scans_df.get_column("scan_uid").to_list()
    elif isinstance(scans, list):
        # if scans is a list, ensure all elements are valid scan_uids
        scans_uids = [s for s in scans if s in self.scans_df.get_column("scan_uid").to_list()]
        if verbose and not scans_uids:
            self.logger.error("No valid scan_uids provided.")

    return scans_uids


def _get_feature_uids(self, features=None, verbose=True):
    """
    Get feature UIDs from various input types.

    Parameters:
        features: Can be one of the following:
            - None: Returns all feature UIDs from self.features_df
            - list: Returns the list if all elements are valid feature UIDs
            - polars.DataFrame: Extracts unique values from 'feature_uid' or 'feature_id' column
            - pandas.DataFrame: Extracts unique values from 'feature_uid' or 'feature_id' column
        verbose (bool): Whether to log errors for invalid inputs

    Returns:
        list: List of feature UIDs
    """
    if features is None:
        # Get all feature UIDs from self.features_df
        if self.features_df is None:
            if verbose:
                self.logger.warning("No features_df available.")
            return []
        feature_uids = self.features_df.get_column("feature_uid").to_list()
    elif isinstance(features, list):
        # If features is a list, ensure all elements are valid feature_uids
        if self.features_df is None:
            if verbose:
                self.logger.warning("No features_df available to validate feature UIDs.")
            return []

        valid_feature_uids = self.features_df.get_column("feature_uid").to_list()
        feature_uids = [f for f in features if f in valid_feature_uids]
        if verbose and not feature_uids:
            self.logger.error("No valid feature_uids provided.")
    else:
        # Handle polars and pandas DataFrames
        try:
            # Check if it's a polars DataFrame
            if hasattr(features, 'columns') and hasattr(features, 'get_column'):
                # Polars DataFrame
                feature_column = None
                if 'feature_uid' in features.columns:
                    feature_column = 'feature_uid'
                elif 'feature_id' in features.columns:
                    feature_column = 'feature_id'

                if feature_column is None:
                    if verbose:
                        self.logger.error("No 'feature_uid' or 'feature_id' column found in polars DataFrame.")
                    return []

                # Get unique values from the column
                feature_uids = features.get_column(feature_column).unique().to_list()

            # Check if it's a pandas DataFrame
            elif hasattr(features, 'columns') and hasattr(features, 'iloc'):
                # Pandas DataFrame
                import pandas as pd
                if not isinstance(features, pd.DataFrame):
                    if verbose:
                        self.logger.error("Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame.")
                    return []

                feature_column = None
                if 'feature_uid' in features.columns:
                    feature_column = 'feature_uid'
                elif 'feature_id' in features.columns:
                    feature_column = 'feature_id'

                if feature_column is None:
                    if verbose:
                        self.logger.error("No 'feature_uid' or 'feature_id' column found in pandas DataFrame.")
                    return []

                # Get unique values from the column
                feature_uids = features[feature_column].unique().tolist()

            else:
                if verbose:
                    self.logger.error("Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame.")
                return []

        except Exception as e:
            if verbose:
                self.logger.error(f"Error processing DataFrame input: {e}")
            return []

    return feature_uids


def get_scan(self, scans: list | None = None, verbose=True):
    scan_uids = self._get_scan_uids(scans, verbose=False)
    if not scan_uids:
        if verbose:
            self.logger.warning("No valid scan_uids provided.")
        return None

    scan = self.scans_df.filter(pl.col("scan_uid").is_in(scan_uids))
    return scan


def select_closest_scan(
    self,
    rt,
    prec_mz=None,
    mz_tol=0.01,
):
    """
    Select the closest scan based on retention time (rt), applying additional filtering on precursor m/z (prec_mz) if provided.
    Parameters:
        rt (float): The target retention time to find the closest scan.
        prec_mz (float, optional): The precursor m/z value used to filter scans. If given, only scans with ms_level 2 are considered
            and filtered to include only those within mz_tol of prec_mz.
        mz_tol (float, optional): The tolerance to apply when filtering scans by precursor m/z. Defaults to 0.01.
    Returns:
        polars.DataFrame or None: A DataFrame slice containing the closest scan if a matching scan is found;
            otherwise, returns None.
    Notes:
        - If the scans_df attribute is None, the function prints an error message and returns None.
        - When prec_mz is provided, it filters scans where ms_level equals 2 and the precursor m/z is within the given mz_tol range.
        - If prec_mz is not provided, scans with ms_level equal to 1 are considered.
        - The function calculates the absolute difference between each scan's rt and the given rt, sorting the scans by this difference.
        - If no scans match the criteria, an error message is printed before returning None.
    """
    # check if scans_df is None
    if self.scans_df is None:
        self.logger.warning("No scans found.")
        return None
    if prec_mz is not None:
        ms_level = 2
        scans = self.scans_df.filter(pl.col("ms_level") == ms_level)
        # find all scans with prec_mz within mz_tol of prec_mz
        scans = scans.filter(pl.col("prec_mz") > prec_mz - mz_tol)
        scans = scans.filter(pl.col("prec_mz") < prec_mz + mz_tol)
        # sort by distance to rt
        scans = scans.with_columns((pl.col("rt") - rt).abs().alias("rt_diff"))
        scans = scans.sort("rt_diff")
        # return the closest scan
        if len(scans) > 0:
            scan = scans.slice(0, 1)
        else:
            self.logger.warning(
                f"No scans found with prec_mz {prec_mz} within {mz_tol} of rt {rt}.",
            )
            return None
    else:
        mslevel = 1
        scans = self.scans_df.filter(pl.col("ms_level") == mslevel)
        # sort by distance to rt
        scans = scans.with_columns((pl.col("rt") - rt).abs().alias("rt_diff"))
        scans = scans.sort("rt_diff")
        # return the closest scan
        if len(scans) > 0:
            scan = scans.slice(0, 1)
        else:
            self.logger.warning(
                f"No scans found with ms_level {mslevel} within {mz_tol} of rt {rt}.",
            )
            return None
    # return scans_df slice

    return scan


# TODO the variables here do not follow the rest (mz, rt being tuples, etc.)


def select(
    self,
    mz=None,
    rt=None,
    coherence=None,
    inty=None,
    rt_delta=None,
    iso=None,
    iso_of=None,
    has_MS2=None,
    prominence_scaled=None,
    height_scaled=None,
    prominence=None,
    height=None,
):
    """
    Select features based on specified criteria and return the filtered DataFrame.

    Parameters:
        mz: m/z range filter (tuple for range, single value for minimum)
        rt: retention time range filter (tuple for range, single value for minimum)
        coherence: chromatogram coherence filter (tuple for range, single value for minimum)
        inty: intensity filter (tuple for range, single value for minimum)
        rt_delta: retention time delta filter (tuple for range, single value for minimum)
        iso: isotope number filter (tuple for range, single value for exact match)
        iso_of: isotope parent filter (tuple for range, single value for exact match)
        has_MS2: filter for features with/without MS2 spectra (bool)
        prominence_scaled: scaled prominence filter (tuple for range, single value for minimum)
        height_scaled: scaled height filter (tuple for range, single value for minimum)
        prominence: prominence filter (tuple for range, single value for minimum)
        height: height filter (tuple for range, single value for minimum)

    Returns:
        polars.DataFrame: Filtered features DataFrame
    """
    # remove all features with coherence < coherence
    if self.features_df is None:
        # self.logger.info("No features found. R")
        return
    feats = self.features_df.clone()
    if coherence is not None:
        has_coherence = "chrom_coherence" in self.features_df.columns
        if not has_coherence:
            self.logger.warning("No coherence data found in features.")
        else:
            # record len for logging
            feats_len_before_filter = len(feats)
            if isinstance(coherence, tuple) and len(coherence) == 2:
                min_coherence, max_coherence = coherence
                feats = feats.filter(
                    (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence),
                )
            else:
                feats = feats.filter(pl.col("chrom_coherence") >= coherence)
            self.logger.debug(
                f"Selected features by coherence. Features removed: {feats_len_before_filter - len(feats)}",
            )

    if mz is not None:
        feats_len_before_filter = len(feats)
        if isinstance(mz, tuple) and len(mz) == 2:
            min_mz, max_mz = mz
            feats = feats.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
        else:
            feats = feats.filter(pl.col("mz") >= mz)
        self.logger.debug(
            f"Selected features by mz. Features removed: {feats_len_before_filter - len(feats)}",
        )

    if rt is not None:
        feats_len_before_filter = len(feats)
        if isinstance(rt, tuple) and len(rt) == 2:
            min_rt, max_rt = rt
            feats = feats.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
        else:
            feats = feats.filter(pl.col("rt") >= rt)
        self.logger.debug(
            f"Selected features by rt. Features removed: {feats_len_before_filter - len(feats)}",
        )

    if inty is not None:
        feats_len_before_filter = len(feats)
        if isinstance(inty, tuple) and len(inty) == 2:
            min_inty, max_inty = inty
            feats = feats.filter(
                (pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty),
            )
        else:
            feats = feats.filter(pl.col("inty") >= inty)
        self.logger.debug(
            f"Selected features by intensity. Features removed: {feats_len_before_filter - len(feats)}",
        )

    if rt_delta is not None:
        feats_len_before_filter = len(feats)
        if "rt_delta" not in feats.columns:
            self.logger.warning("No rt_delta data found in features.")
            return
        if isinstance(rt_delta, tuple) and len(rt_delta) == 2:
            min_rt_delta, max_rt_delta = rt_delta
            feats = feats.filter(
                (pl.col("rt_delta") >= min_rt_delta) & (pl.col("rt_delta") <= max_rt_delta),
            )
        else:
            feats = feats.filter(pl.col("rt_delta") >= rt_delta)
        self.logger.debug(
            f"Selected features by rt_delta. Features removed: {feats_len_before_filter - len(feats)}",
        )

    if iso is not None:
        feats_len_before_filter = len(feats)
        if isinstance(iso, tuple) and len(iso) == 2:
            min_iso, max_iso = iso
            feats = feats.filter(
                (pl.col("iso") >= min_iso) & (pl.col("iso") <= max_iso),
            )
        else:
            feats = feats.filter(pl.col("iso") == iso)
        self.logger.debug(
            f"Selected features by iso. Features removed: {feats_len_before_filter - len(feats)}",
        )

    if iso_of is not None:
        feats_len_before_filter = len(feats)
        if isinstance(iso_of, tuple) and len(iso_of) == 2:
            min_iso_of, max_iso_of = iso_of
            feats = feats.filter(
                (pl.col("iso_of") >= min_iso_of) & (pl.col("iso_of") <= max_iso_of),
            )
        else:
            feats = feats.filter(pl.col("iso_of") == iso_of)
        self.logger.debug(
            f"Selected features by iso_of. Features removed: {feats_len_before_filter - len(feats)}",
        )

    if has_MS2 is not None:
        feats_len_before_filter = len(feats)
        if has_MS2:
            feats = feats.filter(pl.col("ms2_scans").is_not_null())
        else:
            feats = feats.filter(pl.col("ms2_scans").is_null())
        self.logger.debug(
            f"Selected features by MS2 presence. Features removed: {feats_len_before_filter - len(feats)}",
        )

    if prominence_scaled is not None:
        feats_len_before_filter = len(feats)
        if isinstance(prominence_scaled, tuple) and len(prominence_scaled) == 2:
            min_prominence_scaled, max_prominence_scaled = prominence_scaled
            feats = feats.filter(
                (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
                & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
            )
        else:
            feats = feats.filter(pl.col("chrom_prominence_scaled") >= prominence_scaled)
        self.logger.debug(
            f"Selected features by prominence_scaled. Features removed: {feats_len_before_filter - len(feats)}",
        )

    if height_scaled is not None:
        feats_len_before_filter = len(feats)
        if isinstance(height_scaled, tuple) and len(height_scaled) == 2:
            min_height_scaled, max_height_scaled = height_scaled
            feats = feats.filter(
                (pl.col("chrom_height_scaled") >= min_height_scaled)
                & (pl.col("chrom_height_scaled") <= max_height_scaled),
            )
        else:
            feats = feats.filter(pl.col("chrom_height_scaled") >= height_scaled)
        self.logger.debug(
            f"Selected features by height_scaled. Features removed: {feats_len_before_filter - len(feats)}",
        )

    if prominence is not None:
        feats_len_before_filter = len(feats)
        if isinstance(prominence, tuple) and len(prominence) == 2:
            min_prominence, max_prominence = prominence
            feats = feats.filter(
                (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence),
            )
        else:
            feats = feats.filter(pl.col("chrom_prominence") >= prominence)
        self.logger.debug(
            f"Selected features by prominence. Features removed: {feats_len_before_filter - len(feats)}",
        )

    if height is not None:
        feats_len_before_filter = len(feats)
        # Check if chrom_height column exists, if not use chrom_height_scaled
        height_col = "chrom_height" if "chrom_height" in feats.columns else "chrom_height_scaled"
        if isinstance(height, tuple) and len(height) == 2:
            min_height, max_height = height
            feats = feats.filter(
                (pl.col(height_col) >= min_height) & (pl.col(height_col) <= max_height),
            )
        else:
            feats = feats.filter(pl.col(height_col) >= height)
        self.logger.debug(
            f"Selected features by {height_col}. Features removed: {feats_len_before_filter - len(feats)}",
        )
    if len(feats) == 0:
        self.logger.warning("No features remaining after applying selection criteria.")
    else:
        self.logger.info(f"Selected features. Features remaining: {len(feats)}")
    return feats




def _features_sync(self):
    """
    Synchronizes the OpenMS FeatureMap and features_df by removing features that exist in one
    but not the other, using feature_id for mapping between them.

    This function ensures that:
    - Features in the FeatureMap that don't have corresponding entries in features_df are removed
    - Features in features_df that don't have corresponding entries in the FeatureMap are removed

    Returns:
        None

    Side Effects:
        Updates self.features (OpenMS FeatureMap) by creating a new FeatureMap with synchronized features
        Updates self.features_df by filtering to only include features present in the FeatureMap

    Note:
        Uses feature_id as the mapping key. feature_id contains OpenMS unique IDs that correspond
        to the unique IDs of features in the FeatureMap.
    """
    if self.features_df is None or self.features is None:
        self.logger.warning("Cannot sync: features_df or FeatureMap is None.")
        return

    try:
        # Import pyopenms
        import pyopenms as oms

        # Get feature_ids from features_df
        df_feature_ids = set(self.features_df.get_column("feature_id").to_list())

        # Get feature unique IDs from FeatureMap
        feature_map_ids = set()
        for i in range(self.features.size()):
            feature = self.features[i]
            unique_id = str(feature.getUniqueId())  # Convert to string to match DataFrame
            feature_map_ids.add(unique_id)

        # Find features that exist in both
        common_feature_ids = df_feature_ids & feature_map_ids

        # Safety check: log error and exit if no features are matching
        if not common_feature_ids:
            self.logger.error(
                f"No matching features found between FeatureMap and features_df. "
                f"FeatureMap has {len(feature_map_ids)} features, "
                f"features_df has {len(df_feature_ids)} features. "
                f"Cannot synchronize - this indicates a data inconsistency. Exiting without changes."
            )
            return

        # Create new synchronized FeatureMap with only common features
        synced_feature_map = oms.FeatureMap()
        for i in range(self.features.size()):
            feature = self.features[i]
            unique_id = str(feature.getUniqueId())
            if unique_id in common_feature_ids:
                synced_feature_map.push_back(feature)

        # Filter features_df to only include features that exist in FeatureMap
        synced_features_df = self.features_df.filter(
            pl.col("feature_id").is_in(list(common_feature_ids))
        )

        # Update the objects
        original_map_size = self.features.size()
        original_df_size = len(self.features_df)

        self.features = synced_feature_map
        self.features_df = synced_features_df

        # Log the synchronization results
        map_removed = original_map_size - self.features.size()
        df_removed = original_df_size - len(self.features_df)

        # only log if features were removed
        if map_removed > 0 or df_removed > 0:
            self.logger.info(
                f"Features synchronized. FeatureMap: {original_map_size} -> {self.features.size()} "
                f"({map_removed} removed), DataFrame: {original_df_size} -> {len(self.features_df)} "
                f"({df_removed} removed)"
            )
        else:
            self.logger.debug(
                f"Features synchronized. FeatureMap: {original_map_size} -> {self.features.size()} "
                f"({map_removed} removed), DataFrame: {original_df_size} -> {len(self.features_df)} "
                f"({df_removed} removed)"
            )

    except ImportError:
        self.logger.warning("PyOpenMS not available, cannot sync FeatureMap")
    except Exception as e:
        self.logger.error(f"Error during feature synchronization: {e}")


def features_delete(self, features: list | None = None):
    """
    Delete features from both self.features_df and self.features based on a list of feature UIDs.

    Parameters:
        features (list, optional): List of feature UIDs to delete. If None, all features will be deleted.

    Returns:
        None

    Side Effects:
        Updates self.features_df by removing specified features.
        Updates self.features (OpenMS FeatureMap) by creating a new FeatureMap with only the remaining features.
        Updates self.scans_df by removing feature_uid associations for deleted features.

    Note:
        The function preserves all OpenMS FeatureMap information by creating a new FeatureMap
        containing only the features that should remain after deletion.
    """
    if self.features_df is None:
        self.logger.warning("No features found.")
        return

    # Get the feature UIDs to delete
    feature_uids_to_delete = self._get_feature_uids(features=features, verbose=True)

    if not feature_uids_to_delete:
        self.logger.warning("No valid feature UIDs provided for deletion.")
        return

    original_count = len(self.features_df)

    # Update features_df by filtering out the features to delete
    self.features_df = self.features_df.filter(
        ~pl.col("feature_uid").is_in(feature_uids_to_delete)
    )

    # Update the OpenMS FeatureMap by creating a new one with only features to keep
    if self.features is not None:
        try:
            # Import pyopenms
            import pyopenms as oms

            # Create new FeatureMap with only features to keep
            filtered_map = oms.FeatureMap()

            # Get the feature UIDs that should remain after deletion
            remaining_feature_uids = self.features_df.get_column("feature_uid").to_list()

            # Iterate through existing features and keep only those not in deletion list
            for i in range(self.features.size()):
                feature = self.features[i]
                # Since feature UIDs in DataFrame are sequential (0, 1, 2, ...) and correspond to indices
                # we can check if the current index is in the remaining UIDs
                if i in remaining_feature_uids:
                    filtered_map.push_back(feature)

            # Replace the original FeatureMap with the filtered one
            self.features = filtered_map
            self.logger.debug(f"OpenMS FeatureMap updated with {filtered_map.size()} remaining features.")

        except ImportError:
            self.logger.warning("PyOpenMS not available, only updating features_df")
        except Exception as e:
            self.logger.warning(f"Could not update OpenMS FeatureMap: {e}. FeatureMap may be out of sync.")

    # Update scans_df to remove feature_uid associations for deleted features
    if hasattr(self, 'scans_df') and self.scans_df is not None:
        self.scans_df = self.scans_df.with_columns(
            pl.when(pl.col("feature_uid").is_in(feature_uids_to_delete))
            .then(None)
            .otherwise(pl.col("feature_uid"))
            .alias("feature_uid")
        )

    deleted_count = original_count - len(self.features_df)
    self.logger.info(f"Deleted {deleted_count} features. Remaining features: {len(self.features_df)}")


def _delete_ms2(self):
    """
    Unlinks MS2 spectra from features in the dataset.
    This method removes the association between MS2 spectra and features in the features dataframe by setting
    the 'ms2_scans' and 'ms2_specs' columns to None. It also updates the scans dataframe to remove the feature
    id (feature_uid) association for the linked MS2 spectra.
    Parameters:
    Returns:
        None
    Side Effects:
        Updates self.features_df by setting 'ms2_scans' and 'ms2_specs' columns to None. Also, updates self.scans_df
        by resetting the 'feature_uid' column for linked MS2 spectra.
    """
    if self.features_df is None:
        # self.logger.warning("No features found.")
        return

    self.logger.debug("Unlinking MS2 spectra from features...")

    # Set ms2_scans and ms2_specs to None using Polars syntax
    self.features_df = self.features_df.with_columns([
        pl.lit(None).alias("ms2_scans"),
        pl.lit(None).alias("ms2_specs"),
    ])

    # Update scans_df to remove feature_uid association for linked MS2 spectra
    self.scans_df = self.scans_df.with_columns(
        pl.when(pl.col("ms_level") == 2).then(None).otherwise(pl.col("feature_uid")).alias("feature_uid"),
    )
    self.logger.info("MS2 spectra unlinked from features.")


def features_filter(self, features):
    """
    Keep only the specified features and delete all others. This is the opposite of features_delete().

    Parameters:
        features: Can be one of the following:
            - list: List of feature UIDs to keep
            - polars.DataFrame: DataFrame with 'feature_uid' or 'feature_id' column - extracts unique values to keep
            - pandas.DataFrame: DataFrame with 'feature_uid' or 'feature_id' column - extracts unique values to keep

    Returns:
        None

    Side Effects:
        Updates self.features_df by keeping only the specified features.
        Updates self.features (OpenMS FeatureMap) by creating a new FeatureMap with only the specified features.
        Updates self.scans_df by removing feature_uid associations for deleted features.

    Note:
        The function preserves all OpenMS FeatureMap information by creating a new FeatureMap
        containing only the features that should be kept.
    """
    if self.features_df is None:
        self.logger.warning("No features found.")
        return

    if features is None:
        self.logger.warning("No features specified to keep. Use features_delete() to delete all features.")
        return

    # Get the feature UIDs to keep
    feature_uids_to_keep = self._get_feature_uids(features=features, verbose=True)

    if not feature_uids_to_keep:
        self.logger.warning("No valid feature UIDs provided to keep.")
        return

    original_count = len(self.features_df)

    # Update features_df by keeping only the specified features
    self.features_df = self.features_df.filter(
        pl.col("feature_uid").is_in(feature_uids_to_keep)
    )

    # Calculate which features were deleted (all except the ones to keep)
    all_feature_uids = set(range(original_count))  # Assuming sequential UIDs
    feature_uids_to_delete = list(all_feature_uids - set(feature_uids_to_keep))

    # Update the OpenMS FeatureMap by creating a new one with only features to keep
    if self.features is not None:
        try:
            # Import pyopenms
            import pyopenms as oms

            # Create new FeatureMap with only features to keep
            filtered_map = oms.FeatureMap()

            # Iterate through existing features and keep only those in the keep list
            for i in range(self.features.size()):
                feature = self.features[i]
                # Since feature UIDs in DataFrame are sequential (0, 1, 2, ...) and correspond to indices
                # we can check if the current index is in the keep UIDs
                if i in feature_uids_to_keep:
                    filtered_map.push_back(feature)

            # Replace the original FeatureMap with the filtered one
            self.features = filtered_map
            self.logger.debug(f"OpenMS FeatureMap updated with {filtered_map.size()} remaining features.")

        except ImportError:
            self.logger.warning("PyOpenMS not available, only updating features_df")
        except Exception as e:
            self.logger.warning(f"Could not update OpenMS FeatureMap: {e}. FeatureMap may be out of sync.")

    # Update scans_df to remove feature_uid associations for deleted features
    if hasattr(self, 'scans_df') and self.scans_df is not None and feature_uids_to_delete:
        self.scans_df = self.scans_df.with_columns(
            pl.when(pl.col("feature_uid").is_in(feature_uids_to_delete))
            .then(None)
            .otherwise(pl.col("feature_uid"))
            .alias("feature_uid")
        )

    kept_count = len(self.features_df)
    deleted_count = original_count - kept_count
    self.logger.info(f"Kept {kept_count} features, deleted {deleted_count} features. Remaining features: {kept_count}")


def set_source(self, filename):
    """
    Reassign file_source. If filename contains only a path, keep the current basename
    and build an absolute path. Check that the new file exists before overwriting
    the old file_source.

    Parameters:
        filename (str): New file path or directory path

    Returns:
        None
    """
    import os

    # Store the old file_source for logging
    old_file_source = getattr(self, 'file_source', None)

    # Check if filename is just a directory path
    if os.path.isdir(filename):
        if old_file_source is None:
            self.logger.error("Cannot build path: no current file_source available")
            return

        # Get the basename from current file_source
        current_basename = os.path.basename(old_file_source)
        # Build new absolute path
        new_file_path = os.path.join(filename, current_basename)
    else:
        # filename is a full path, make it absolute
        new_file_path = os.path.abspath(filename)

    # Check if the new file exists
    if not os.path.exists(new_file_path):
        self.logger.error(f"File does not exist: {new_file_path}")
        return

    # Update file_source
    self.file_source = new_file_path

    # Log the change
    if old_file_source is not None:
        self.logger.info(f"Updated file_source from {old_file_source} to {self.file_source}")
    else:
        self.logger.info(f"Set file_source to {self.file_source}")
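
For orientation, a minimal usage sketch of the new selection and filtering helpers above. It assumes these module-level functions are bound as methods of a Sample object (as their self parameters suggest) and that a sample has already been loaded; the import path, constructor, and file names are illustrative and not verified against the released wheel.

# Hypothetical sketch: loading API and names are assumptions, not taken from
# the masster 0.3.0 documentation.
from masster import Sample  # assumed public import path

sample = Sample("example.sample5")  # assumed loader for a .sample5 file

# select() accepts (min, max) tuples for range filters; a bare value acts as
# a minimum (or an exact match for iso/iso_of).
feats = sample.select(mz=(200.0, 800.0), rt=(60.0, 300.0), has_MS2=True)

# Keep only the selected features (the inverse of features_delete); this also
# clears feature_uid links for dropped features in scans_df.
sample.features_filter(feats)

# Re-point the sample at a relocated raw file; passing a directory keeps the
# current basename and only the path is rebuilt.
sample.set_source("/data/relocated_raw_files")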