masster 0.3.10-py3-none-any.whl → 0.3.12-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster has been flagged by the registry scanner.
- masster/docs/SCX_API_Documentation.md +0 -0
- masster/docs/SCX_DLL_Analysis.md +0 -0
- masster/logger.py +92 -78
- masster/sample/defaults/find_features_def.py +16 -6
- masster/sample/defaults/sample_def.py +1 -1
- masster/sample/h5.py +2 -2
- masster/sample/helpers.py +190 -140
- masster/sample/load.py +13 -9
- masster/sample/plot.py +256 -147
- masster/sample/processing.py +18 -12
- masster/sample/sample.py +10 -4
- masster/sample/sample5_schema.json +38 -29
- masster/sample/save.py +16 -13
- masster/sample/sciex.py +187 -176
- masster/study/defaults/align_def.py +231 -13
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/study_def.py +2 -2
- masster/study/export.py +144 -131
- masster/study/h5.py +193 -133
- masster/study/helpers.py +757 -246
- masster/study/helpers_optimized.py +99 -57
- masster/study/load.py +57 -25
- masster/study/plot.py +1244 -129
- masster/study/processing.py +194 -86
- masster/study/save.py +7 -7
- masster/study/study.py +154 -89
- masster/study/study5_schema.json +15 -15
- {masster-0.3.10.dist-info → masster-0.3.12.dist-info}/METADATA +1 -1
- {masster-0.3.10.dist-info → masster-0.3.12.dist-info}/RECORD +33 -31
- {masster-0.3.10.dist-info → masster-0.3.12.dist-info}/WHEEL +0 -0
- {masster-0.3.10.dist-info → masster-0.3.12.dist-info}/entry_points.txt +0 -0
- {masster-0.3.10.dist-info → masster-0.3.12.dist-info}/licenses/LICENSE +0 -0
masster/sample/helpers.py
CHANGED
```diff
@@ -9,81 +9,81 @@ import polars as pl
 def _estimate_memory_usage(self):
     """
     Estimate the memory usage of all dataframes in the Sample object.
-
+
     Returns:
         dict: A dictionary containing memory usage estimates for each dataframe
        and the total memory usage in bytes and MB.
     """
     memory_usage = {}
     total_bytes = 0
-
+
     # Check features_df
     if self.features_df is not None and len(self.features_df) > 0:
         features_bytes = self.features_df.estimated_size()
-        memory_usage[
-
-
-
-
+        memory_usage["features_df"] = {
+            "rows": len(self.features_df),
+            "columns": len(self.features_df.columns),
+            "bytes": features_bytes,
+            "mb": features_bytes / (1024 * 1024),
         }
         total_bytes += features_bytes
     else:
-        memory_usage[
-
+        memory_usage["features_df"] = {"rows": 0, "columns": 0, "bytes": 0, "mb": 0}
+
     # Check scans_df
     if self.scans_df is not None and len(self.scans_df) > 0:
         scans_bytes = self.scans_df.estimated_size()
-        memory_usage[
-
-
-
-
+        memory_usage["scans_df"] = {
+            "rows": len(self.scans_df),
+            "columns": len(self.scans_df.columns),
+            "bytes": scans_bytes,
+            "mb": scans_bytes / (1024 * 1024),
         }
         total_bytes += scans_bytes
     else:
-        memory_usage[
-
+        memory_usage["scans_df"] = {"rows": 0, "columns": 0, "bytes": 0, "mb": 0}
+
     # Check ms1_df
     if self.ms1_df is not None and len(self.ms1_df) > 0:
         ms1_bytes = self.ms1_df.estimated_size()
-        memory_usage[
-
-
-
-
+        memory_usage["ms1_df"] = {
+            "rows": len(self.ms1_df),
+            "columns": len(self.ms1_df.columns),
+            "bytes": ms1_bytes,
+            "mb": ms1_bytes / (1024 * 1024),
         }
         total_bytes += ms1_bytes
     else:
-        memory_usage[
-
+        memory_usage["ms1_df"] = {"rows": 0, "columns": 0, "bytes": 0, "mb": 0}
+
     # Check chrom_df
     if self.chrom_df is not None and len(self.chrom_df) > 0:
         chrom_bytes = self.chrom_df.estimated_size()
-        memory_usage[
-
-
-
-
+        memory_usage["chrom_df"] = {
+            "rows": len(self.chrom_df),
+            "columns": len(self.chrom_df.columns),
+            "bytes": chrom_bytes,
+            "mb": chrom_bytes / (1024 * 1024),
         }
         total_bytes += chrom_bytes
     else:
-        memory_usage[
-
+        memory_usage["chrom_df"] = {"rows": 0, "columns": 0, "bytes": 0, "mb": 0}
+
     # Add total memory usage
-    memory_usage[
-
-
-
+    memory_usage["total"] = {
+        "bytes": total_bytes,
+        "mb": total_bytes / (1024 * 1024),
+        "gb": total_bytes / (1024 * 1024 * 1024),
     }
-
+
     # Log the memory usage summary
-    if hasattr(self,
+    if hasattr(self, "logger"):
         self.logger.debug(f"Total DataFrame memory usage: {memory_usage['total']['mb']:.2f} MB")
         for df_name, stats in memory_usage.items():
-            if df_name !=
+            if df_name != "total" and stats["bytes"] > 0:
                 self.logger.debug(f"{df_name}: {stats['rows']} rows, {stats['mb']:.2f} MB")
-
-    return memory_usage[
+
+    return memory_usage["total"]["mb"]
 
 
 def get_dda_stats(self):
```
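The reformatted `_estimate_memory_usage` leans entirely on polars' built-in `DataFrame.estimated_size()`. A minimal standalone sketch of the same bookkeeping, on a free-standing DataFrame rather than a `Sample` object (the column names here are illustrative, not taken from the package):

```python
import polars as pl

# Sketch of the size accounting above, outside the Sample object.
df = pl.DataFrame({"mz": [100.05, 200.10, 300.15], "inty": [1e4, 2e4, 3e4]})

size_bytes = df.estimated_size()  # polars' heap-size estimate, in bytes
print(f"{len(df)} rows, {len(df.columns)} columns, {size_bytes / (1024 * 1024):.6f} MB")
```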
```diff
@@ -92,9 +92,6 @@ def get_dda_stats(self):
     return ms1
 
 
-# TODO
-
-
 def get_feature(self, feature_uid):
     # get the feature with feature_uid == feature_uid
     feature = self.features_df.filter(pl.col("feature_uid") == feature_uid)
@@ -121,7 +118,7 @@ def _get_scan_uids(self, scans=None, verbose=True):
 def _get_feature_uids(self, features=None, verbose=True):
     """
     Get feature UIDs from various input types.
-
+
     Parameters:
         features: Can be one of the following:
             - None: Returns all feature UIDs from self.features_df
@@ -129,7 +126,7 @@ def _get_feature_uids(self, features=None, verbose=True):
             - polars.DataFrame: Extracts unique values from 'feature_uid' or 'feature_id' column
             - pandas.DataFrame: Extracts unique values from 'feature_uid' or 'feature_id' column
         verbose (bool): Whether to log errors for invalid inputs
-
+
     Returns:
         list: List of feature UIDs
     """
@@ -146,7 +143,7 @@ def _get_feature_uids(self, features=None, verbose=True):
         if verbose:
             self.logger.warning("No features_df available to validate feature UIDs.")
         return []
-
+
         valid_feature_uids = self.features_df.get_column("feature_uid").to_list()
         feature_uids = [f for f in features if f in valid_feature_uids]
         if verbose and not feature_uids:
```
```diff
@@ -155,50 +152,53 @@ def _get_feature_uids(self, features=None, verbose=True):
     # Handle polars and pandas DataFrames
     try:
         # Check if it's a polars DataFrame
-        if hasattr(features,
+        if hasattr(features, "columns") and hasattr(features, "get_column"):
             # Polars DataFrame
             feature_column = None
-            if
-                feature_column =
-            elif
-                feature_column =
-
+            if "feature_uid" in features.columns:
+                feature_column = "feature_uid"
+            elif "feature_id" in features.columns:
+                feature_column = "feature_id"
+
             if feature_column is None:
                 if verbose:
                     self.logger.error("No 'feature_uid' or 'feature_id' column found in polars DataFrame.")
                 return []
-
+
             # Get unique values from the column
             feature_uids = features.get_column(feature_column).unique().to_list()
-
+
         # Check if it's a pandas DataFrame
-        elif hasattr(features,
+        elif hasattr(features, "columns") and hasattr(features, "iloc"):
             # Pandas DataFrame
             import pandas as pd
+
             if not isinstance(features, pd.DataFrame):
                 if verbose:
-                    self.logger.error(
+                    self.logger.error(
+                        "Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame."
+                    )
                 return []
-
+
             feature_column = None
-            if
-                feature_column =
-            elif
-                feature_column =
-
+            if "feature_uid" in features.columns:
+                feature_column = "feature_uid"
+            elif "feature_id" in features.columns:
+                feature_column = "feature_id"
+
             if feature_column is None:
                 if verbose:
                     self.logger.error("No 'feature_uid' or 'feature_id' column found in pandas DataFrame.")
                 return []
-
+
             # Get unique values from the column
             feature_uids = features[feature_column].unique().tolist()
-
+
         else:
             if verbose:
                 self.logger.error("Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame.")
             return []
-
+
     except Exception as e:
         if verbose:
             self.logger.error(f"Error processing DataFrame input: {e}")
```
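The hunk above fills in the duck-typing that distinguishes polars from pandas input: polars DataFrames expose `get_column()`, pandas DataFrames expose `iloc`, and both expose `columns`. A self-contained sketch of that dispatch, with a hypothetical `extract_feature_uids` helper standing in for the method:

```python
import polars as pl

# Hypothetical helper mirroring _get_feature_uids' dispatch: a polars
# DataFrame has .get_column(), a pandas DataFrame has .iloc.
def extract_feature_uids(features):
    if hasattr(features, "columns") and hasattr(features, "get_column"):
        # polars path
        column = "feature_uid" if "feature_uid" in features.columns else "feature_id"
        return features.get_column(column).unique().to_list()
    if hasattr(features, "columns") and hasattr(features, "iloc"):
        # pandas path
        column = "feature_uid" if "feature_uid" in features.columns else "feature_id"
        return features[column].unique().tolist()
    raise TypeError("Expected a polars or pandas DataFrame")

print(sorted(extract_feature_uids(pl.DataFrame({"feature_uid": [3, 1, 3, 2]}))))  # [1, 2, 3]
```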
```diff
@@ -281,7 +281,59 @@ def select_closest_scan(
     return scan
 
 
-
+def get_eic(self, mz, mz_tol=0.01):
+    """
+    Extract an extracted ion chromatogram (EIC) from `ms1_df` for a target m/z ± mz_tol.
+
+    The function filters `self.ms1_df` for rows with `mz` within the tolerance, aggregates
+    intensities per retention time (summing intensities for the same `rt`), sorts by `rt`,
+    stores the resulting chromatogram in `self.chrom_df` and returns it.
+
+    Parameters:
+        mz (float): target m/z value
+        mz_tol (float): tolerance around mz (default 0.01)
+
+    Returns:
+        polars.DataFrame or None: chromatogram with columns ['rt', 'inty'] or None if not available
+    """
+    # Validate ms1_df
+    if not hasattr(self, "ms1_df") or self.ms1_df is None:
+        if hasattr(self, "logger"):
+            self.logger.warning("No ms1_df available to build EIC.")
+        return None
+
+    try:
+        # Filter by mz window
+        mz_min = mz - mz_tol
+        mz_max = mz + mz_tol
+        matches = self.ms1_df.filter((pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max))
+
+        if len(matches) == 0:
+            if hasattr(self, "logger"):
+                self.logger.debug(f"No ms1 points found for mz={mz} ± {mz_tol}.")
+            # ensure chrom_df is None when nothing found
+            self.chrom_df = None
+            return None
+
+        # Aggregate intensities per retention time. Use sum in case multiple points per rt.
+        chrom = (
+            matches.group_by("rt")
+            .agg([pl.col("inty").sum().alias("inty")])
+            .sort("rt")
+        )
+
+        # Attach to Sample
+        self.chrom_df = chrom
+
+        if hasattr(self, "logger"):
+            self.logger.debug(f"Built EIC for mz={mz} ± {mz_tol}: {len(chrom)} points.")
+
+        return chrom
+
+    except Exception as e:
+        if hasattr(self, "logger"):
+            self.logger.error(f"Error building EIC for mz={mz}: {e}")
+        return None
 
 
 def select(
```
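The new `get_eic` method boils down to a filter/group_by/sort pipeline over the MS1 table. A standalone sketch of that pipeline on synthetic data, using the same `rt`/`mz`/`inty` column names as the diff:

```python
import polars as pl

# The EIC pipeline from get_eic: filter to an m/z window, sum intensity
# per retention time, then sort by rt.
ms1_df = pl.DataFrame(
    {
        "rt": [10.0, 10.0, 10.5, 11.0, 11.0],
        "mz": [150.081, 150.089, 150.084, 150.083, 250.100],
        "inty": [1000.0, 500.0, 2000.0, 1500.0, 900.0],
    }
)

mz, mz_tol = 150.085, 0.01
eic = (
    ms1_df.filter((pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol))
    .group_by("rt")
    .agg(pl.col("inty").sum().alias("inty"))
    .sort("rt")
)
print(eic)  # three rows: rt 10.0 (inty 1500), 10.5 (2000), 11.0 (1500)
```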
```diff
@@ -301,7 +353,7 @@ def select(
 ):
     """
     Select features based on specified criteria and return the filtered DataFrame.
-
+
     Parameters:
         mz: m/z range filter (tuple for range, single value for minimum)
         rt: retention time range filter (tuple for range, single value for minimum)
@@ -315,7 +367,7 @@ def select(
         height_scaled: scaled height filter (tuple for range, single value for minimum)
         prominence: prominence filter (tuple for range, single value for minimum)
         height: height filter (tuple for range, single value for minimum)
-
+
     Returns:
         polars.DataFrame: Filtered features DataFrame
     """
@@ -491,24 +543,22 @@ def select(
     return feats
 
 
-
-
 def _features_sync(self):
     """
-    Synchronizes the OpenMS FeatureMap and features_df by removing features that exist in one
+    Synchronizes the OpenMS FeatureMap and features_df by removing features that exist in one
     but not the other, using feature_id for mapping between them.
-
+
     This function ensures that:
     - Features in the FeatureMap that don't have corresponding entries in features_df are removed
     - Features in features_df that don't have corresponding entries in the FeatureMap are removed
-
+
     Returns:
         None
-
+
     Side Effects:
         Updates self.features (OpenMS FeatureMap) by creating a new FeatureMap with synchronized features
         Updates self.features_df by filtering to only include features present in the FeatureMap
-
+
     Note:
         Uses feature_id as the mapping key. feature_id contains OpenMS unique IDs that correspond
         to the unique IDs of features in the FeatureMap.
```
```diff
@@ -516,34 +566,34 @@ def _features_sync(self):
     if self.features_df is None or self.features is None:
         self.logger.warning("Cannot sync: features_df or FeatureMap is None.")
         return
-
+
     try:
         # Import pyopenms
         import pyopenms as oms
-
+
         # Get feature_ids from features_df
         df_feature_ids = set(self.features_df.get_column("feature_id").to_list())
-
+
         # Get feature unique IDs from FeatureMap
         feature_map_ids = set()
         for i in range(self.features.size()):
             feature = self.features[i]
             unique_id = str(feature.getUniqueId())  # Convert to string to match DataFrame
             feature_map_ids.add(unique_id)
-
+
         # Find features that exist in both
         common_feature_ids = df_feature_ids & feature_map_ids
-
+
         # Safety check: log error and exit if no features are matching
         if not common_feature_ids:
             self.logger.error(
                 f"No matching features found between FeatureMap and features_df. "
                 f"FeatureMap has {len(feature_map_ids)} features, "
                 f"features_df has {len(df_feature_ids)} features. "
-                f"Cannot synchronize - this indicates a data inconsistency. Exiting without changes."
+                f"Cannot synchronize - this indicates a data inconsistency. Exiting without changes.",
             )
             return
-
+
         # Create new synchronized FeatureMap with only common features
         synced_feature_map = oms.FeatureMap()
         for i in range(self.features.size()):
@@ -551,19 +601,19 @@ def _features_sync(self):
             unique_id = str(feature.getUniqueId())
             if unique_id in common_feature_ids:
                 synced_feature_map.push_back(feature)
-
+
         # Filter features_df to only include features that exist in FeatureMap
         synced_features_df = self.features_df.filter(
-            pl.col("feature_id").is_in(list(common_feature_ids))
+            pl.col("feature_id").is_in(list(common_feature_ids)),
         )
-
+
         # Update the objects
         original_map_size = self.features.size()
         original_df_size = len(self.features_df)
-
+
         self.features = synced_feature_map
         self.features_df = synced_features_df
-
+
         # Log the synchronization results
         map_removed = original_map_size - self.features.size()
         df_removed = original_df_size - len(self.features_df)
@@ -573,36 +623,36 @@ def _features_sync(self):
             self.logger.info(
                 f"Features synchronized. FeatureMap: {original_map_size} -> {self.features.size()} "
                 f"({map_removed} removed), DataFrame: {original_df_size} -> {len(self.features_df)} "
-                f"({df_removed} removed)"
+                f"({df_removed} removed)",
             )
         else:
             self.logger.debug(
                 f"Features synchronized. FeatureMap: {original_map_size} -> {self.features.size()} "
                 f"({map_removed} removed), DataFrame: {original_df_size} -> {len(self.features_df)} "
-                f"({df_removed} removed)"
+                f"({df_removed} removed)",
             )
-
+
     except ImportError:
         self.logger.warning("PyOpenMS not available, cannot sync FeatureMap")
     except Exception as e:
         self.logger.error(f"Error during feature synchronization: {e}")
 
 
-def features_delete(self, features: list|None=None):
+def features_delete(self, features: list | None = None):
     """
     Delete features from both self.features_df and self.features based on a list of feature UIDs.
-
+
     Parameters:
         features (list, optional): List of feature UIDs to delete. If None, all features will be deleted.
-
+
     Returns:
         None
-
+
     Side Effects:
         Updates self.features_df by removing specified features.
         Updates self.features (OpenMS FeatureMap) by creating a new FeatureMap with only the remaining features.
         Updates self.scans_df by removing feature_uid associations for deleted features.
-
+
     Note:
         The function preserves all OpenMS FeatureMap information by creating a new FeatureMap
         containing only the features that should remain after deletion.
```
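At its core, `_features_sync` is a set intersection on stringified OpenMS unique IDs. A sketch of the same logic without pyopenms, using a plain list of `(unique_id, mz)` tuples as a stand-in for the FeatureMap:

```python
import polars as pl

# The intersection at the heart of _features_sync; the tuple list is a
# hypothetical stand-in for an OpenMS FeatureMap.
features_df = pl.DataFrame({"feature_id": ["101", "102", "104"], "mz": [150.1, 250.2, 350.3]})
feature_map = [("101", 150.1), ("102", 250.2), ("103", 999.9)]

df_ids = set(features_df.get_column("feature_id").to_list())
map_ids = {uid for uid, _ in feature_map}
common = df_ids & map_ids  # {"101", "102"}

synced_map = [f for f in feature_map if f[0] in common]
synced_df = features_df.filter(pl.col("feature_id").is_in(list(common)))
print(len(synced_map), len(synced_df))  # 2 2
```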
```diff
@@ -610,33 +660,33 @@ def features_delete(self, features: list|None=None):
     if self.features_df is None:
         self.logger.warning("No features found.")
         return
-
+
     # Get the feature UIDs to delete
     feature_uids_to_delete = self._get_feature_uids(features=features, verbose=True)
-
+
     if not feature_uids_to_delete:
         self.logger.warning("No valid feature UIDs provided for deletion.")
         return
-
+
     original_count = len(self.features_df)
-
+
     # Update features_df by filtering out the features to delete
     self.features_df = self.features_df.filter(
-        ~pl.col("feature_uid").is_in(feature_uids_to_delete)
+        ~pl.col("feature_uid").is_in(feature_uids_to_delete),
     )
-
+
     # Update the OpenMS FeatureMap by creating a new one with only features to keep
     if self.features is not None:
         try:
             # Import pyopenms
             import pyopenms as oms
-
+
             # Create new FeatureMap with only features to keep
             filtered_map = oms.FeatureMap()
-
+
             # Get the feature UIDs that should remain after deletion
             remaining_feature_uids = self.features_df.get_column("feature_uid").to_list()
-
+
             # Iterate through existing features and keep only those not in deletion list
             for i in range(self.features.size()):
                 feature = self.features[i]
@@ -644,25 +694,25 @@ def features_delete(self, features: list|None=None):
                 # we can check if the current index is in the remaining UIDs
                 if i in remaining_feature_uids:
                     filtered_map.push_back(feature)
-
+
             # Replace the original FeatureMap with the filtered one
             self.features = filtered_map
             self.logger.debug(f"OpenMS FeatureMap updated with {filtered_map.size()} remaining features.")
-
+
         except ImportError:
             self.logger.warning("PyOpenMS not available, only updating features_df")
         except Exception as e:
             self.logger.warning(f"Could not update OpenMS FeatureMap: {e}. FeatureMap may be out of sync.")
-
+
     # Update scans_df to remove feature_uid associations for deleted features
-    if hasattr(self,
+    if hasattr(self, "scans_df") and self.scans_df is not None:
         self.scans_df = self.scans_df.with_columns(
             pl.when(pl.col("feature_uid").is_in(feature_uids_to_delete))
             .then(None)
             .otherwise(pl.col("feature_uid"))
-            .alias("feature_uid")
+            .alias("feature_uid"),
         )
-
+
     deleted_count = original_count - len(self.features_df)
     self.logger.info(f"Deleted {deleted_count} features. Remaining features: {len(self.features_df)}")
```
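Note that `features_delete` does not drop scan rows: it nulls out their `feature_uid` link via polars' `when/then/otherwise`. A minimal sketch of that step:

```python
import polars as pl

# Scan rows survive a feature deletion; only the feature_uid link is nulled.
scans_df = pl.DataFrame({"scan_uid": [1, 2, 3, 4], "feature_uid": [10, 11, 10, 12]})
deleted = [10, 12]

scans_df = scans_df.with_columns(
    pl.when(pl.col("feature_uid").is_in(deleted))
    .then(None)
    .otherwise(pl.col("feature_uid"))
    .alias("feature_uid")
)
print(scans_df)  # feature_uid is null for scans 1, 3 and 4
```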
```diff
@@ -702,21 +752,21 @@ def _delete_ms2(self):
 def features_filter(self, features):
     """
     Keep only the specified features and delete all others. This is the opposite of features_delete().
-
+
     Parameters:
         features: Can be one of the following:
             - list: List of feature UIDs to keep
             - polars.DataFrame: DataFrame with 'feature_uid' or 'feature_id' column - extracts unique values to keep
             - pandas.DataFrame: DataFrame with 'feature_uid' or 'feature_id' column - extracts unique values to keep
-
+
     Returns:
         None
-
+
     Side Effects:
         Updates self.features_df by keeping only the specified features.
         Updates self.features (OpenMS FeatureMap) by creating a new FeatureMap with only the specified features.
         Updates self.scans_df by removing feature_uid associations for deleted features.
-
+
     Note:
         The function preserves all OpenMS FeatureMap information by creating a new FeatureMap
         containing only the features that should be kept.
@@ -724,38 +774,38 @@ def features_filter(self, features):
     if self.features_df is None:
         self.logger.warning("No features found.")
         return
-
+
     if features is None:
         self.logger.warning("No features specified to keep. Use features_delete() to delete all features.")
         return
-
+
     # Get the feature UIDs to keep
     feature_uids_to_keep = self._get_feature_uids(features=features, verbose=True)
-
+
     if not feature_uids_to_keep:
         self.logger.warning("No valid feature UIDs provided to keep.")
         return
-
+
     original_count = len(self.features_df)
-
+
     # Update features_df by keeping only the specified features
     self.features_df = self.features_df.filter(
-        pl.col("feature_uid").is_in(feature_uids_to_keep)
+        pl.col("feature_uid").is_in(feature_uids_to_keep),
     )
-
+
     # Calculate which features were deleted (all except the ones to keep)
     all_feature_uids = set(range(original_count))  # Assuming sequential UIDs
     feature_uids_to_delete = list(all_feature_uids - set(feature_uids_to_keep))
-
+
     # Update the OpenMS FeatureMap by creating a new one with only features to keep
     if self.features is not None:
         try:
             # Import pyopenms
             import pyopenms as oms
-
+
             # Create new FeatureMap with only features to keep
             filtered_map = oms.FeatureMap()
-
+
             # Iterate through existing features and keep only those in the keep list
             for i in range(self.features.size()):
                 feature = self.features[i]
@@ -763,25 +813,25 @@ def features_filter(self, features):
                 # we can check if the current index is in the keep UIDs
                 if i in feature_uids_to_keep:
                     filtered_map.push_back(feature)
-
+
             # Replace the original FeatureMap with the filtered one
             self.features = filtered_map
             self.logger.debug(f"OpenMS FeatureMap updated with {filtered_map.size()} remaining features.")
-
+
         except ImportError:
             self.logger.warning("PyOpenMS not available, only updating features_df")
         except Exception as e:
             self.logger.warning(f"Could not update OpenMS FeatureMap: {e}. FeatureMap may be out of sync.")
-
+
     # Update scans_df to remove feature_uid associations for deleted features
-    if hasattr(self,
+    if hasattr(self, "scans_df") and self.scans_df is not None and feature_uids_to_delete:
         self.scans_df = self.scans_df.with_columns(
             pl.when(pl.col("feature_uid").is_in(feature_uids_to_delete))
             .then(None)
             .otherwise(pl.col("feature_uid"))
-            .alias("feature_uid")
+            .alias("feature_uid"),
         )
-
+
     kept_count = len(self.features_df)
     deleted_count = original_count - kept_count
     self.logger.info(f"Kept {kept_count} features, deleted {deleted_count} features. Remaining features: {kept_count}")
```
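`features_filter` derives the deletion set as the complement of the keep list, under the assumption of sequential feature UIDs that the code itself flags with "# Assuming sequential UIDs". A sketch of just that step:

```python
# UIDs to delete are everything outside the keep list, assuming
# sequential UIDs 0..n-1 (the code's own stated assumption).
original_count = 6
feature_uids_to_keep = [0, 2, 5]

all_feature_uids = set(range(original_count))
feature_uids_to_delete = sorted(all_feature_uids - set(feature_uids_to_keep))
print(feature_uids_to_delete)  # [1, 3, 4]
```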
```diff
@@ -789,27 +839,27 @@ def features_filter(self, features):
 
 def set_source(self, filename):
     """
-    Reassign file_source. If filename contains only a path, keep the current basename
-    and build an absolute path. Check that the new file exists before overwriting
+    Reassign file_source. If filename contains only a path, keep the current basename
+    and build an absolute path. Check that the new file exists before overwriting
     the old file_source.
-
+
     Parameters:
         filename (str): New file path or directory path
-
+
     Returns:
         None
     """
     import os
-
+
     # Store the old file_source for logging
-    old_file_source = getattr(self,
-
+    old_file_source = getattr(self, "file_source", None)
+
     # Check if filename is just a directory path
     if os.path.isdir(filename):
         if old_file_source is None:
             self.logger.error("Cannot build path: no current file_source available")
             return
-
+
         # Get the basename from current file_source
         current_basename = os.path.basename(old_file_source)
         # Build new absolute path
@@ -817,15 +867,15 @@ def set_source(self, filename):
     else:
         # filename is a full path, make it absolute
         new_file_path = os.path.abspath(filename)
-
+
     # Check if the new file exists
     if not os.path.exists(new_file_path):
         self.logger.error(f"File does not exist: {new_file_path}")
         return
-
+
     # Update file_source
     self.file_source = new_file_path
-
+
     # Log the change
     if old_file_source is not None:
         self.logger.info(f"Updated file_source from {old_file_source} to {self.file_source}")
```
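`set_source` branches on whether the argument is a directory (keep the current basename) or a full path. A sketch of that path logic as a hypothetical free function `resolve_source`; the join in the directory branch is an assumption, since the line that builds the new path falls between the two hunks above and is not shown:

```python
import os

# Hypothetical helper mirroring set_source's branching.
def resolve_source(filename, old_file_source):
    if os.path.isdir(filename):
        if old_file_source is None:
            raise ValueError("Cannot build path: no current file_source available")
        basename = os.path.basename(old_file_source)
        return os.path.abspath(os.path.join(filename, basename))  # assumed join
    return os.path.abspath(filename)

print(resolve_source(os.getcwd(), "/data/run1/sample01.wiff"))  # <cwd>/sample01.wiff
```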