masster 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of masster might be problematic.
- masster/docs/SCX_API_Documentation.md +0 -0
- masster/docs/SCX_DLL_Analysis.md +0 -0
- masster/logger.py +92 -78
- masster/sample/defaults/find_features_def.py +90 -94
- masster/sample/defaults/sample_def.py +15 -0
- masster/sample/h5.py +2 -2
- masster/sample/helpers.py +137 -136
- masster/sample/lib.py +11 -11
- masster/sample/load.py +13 -9
- masster/sample/plot.py +167 -60
- masster/sample/processing.py +150 -153
- masster/sample/sample.py +4 -4
- masster/sample/sample5_schema.json +62 -62
- masster/sample/save.py +16 -13
- masster/sample/sciex.py +187 -176
- masster/study/defaults/align_def.py +224 -6
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/study_def.py +2 -2
- masster/study/export.py +144 -131
- masster/study/h5.py +193 -133
- masster/study/helpers.py +293 -245
- masster/study/helpers_optimized.py +99 -57
- masster/study/load.py +51 -25
- masster/study/plot.py +453 -17
- masster/study/processing.py +197 -123
- masster/study/save.py +7 -7
- masster/study/study.py +97 -88
- masster/study/study5_schema.json +82 -82
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/METADATA +1 -1
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/RECORD +34 -32
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/WHEEL +0 -0
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/entry_points.txt +0 -0
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/licenses/LICENSE +0 -0
masster/sample/helpers.py
CHANGED
```diff
@@ -9,81 +9,81 @@ import polars as pl
 def _estimate_memory_usage(self):
     """
     Estimate the memory usage of all dataframes in the Sample object.
-
+
     Returns:
         dict: A dictionary containing memory usage estimates for each dataframe
             and the total memory usage in bytes and MB.
     """
     memory_usage = {}
     total_bytes = 0
-
+
     # Check features_df
     if self.features_df is not None and len(self.features_df) > 0:
         features_bytes = self.features_df.estimated_size()
-        memory_usage['features_df'] = {
-            'rows': len(self.features_df),
-            'columns': len(self.features_df.columns),
-            'bytes': features_bytes,
-            'mb': features_bytes / (1024 * 1024),
+        memory_usage["features_df"] = {
+            "rows": len(self.features_df),
+            "columns": len(self.features_df.columns),
+            "bytes": features_bytes,
+            "mb": features_bytes / (1024 * 1024),
         }
         total_bytes += features_bytes
     else:
-        memory_usage['features_df'] = {'rows': 0, 'columns': 0, 'bytes': 0, 'mb': 0}
-
+        memory_usage["features_df"] = {"rows": 0, "columns": 0, "bytes": 0, "mb": 0}
+
     # Check scans_df
     if self.scans_df is not None and len(self.scans_df) > 0:
         scans_bytes = self.scans_df.estimated_size()
-        memory_usage['scans_df'] = {
-            'rows': len(self.scans_df),
-            'columns': len(self.scans_df.columns),
-            'bytes': scans_bytes,
-            'mb': scans_bytes / (1024 * 1024),
+        memory_usage["scans_df"] = {
+            "rows": len(self.scans_df),
+            "columns": len(self.scans_df.columns),
+            "bytes": scans_bytes,
+            "mb": scans_bytes / (1024 * 1024),
         }
         total_bytes += scans_bytes
     else:
-        memory_usage['scans_df'] = {'rows': 0, 'columns': 0, 'bytes': 0, 'mb': 0}
-
+        memory_usage["scans_df"] = {"rows": 0, "columns": 0, "bytes": 0, "mb": 0}
+
     # Check ms1_df
     if self.ms1_df is not None and len(self.ms1_df) > 0:
         ms1_bytes = self.ms1_df.estimated_size()
-        memory_usage['ms1_df'] = {
-            'rows': len(self.ms1_df),
-            'columns': len(self.ms1_df.columns),
-            'bytes': ms1_bytes,
-            'mb': ms1_bytes / (1024 * 1024),
+        memory_usage["ms1_df"] = {
+            "rows": len(self.ms1_df),
+            "columns": len(self.ms1_df.columns),
+            "bytes": ms1_bytes,
+            "mb": ms1_bytes / (1024 * 1024),
         }
         total_bytes += ms1_bytes
     else:
-        memory_usage['ms1_df'] = {'rows': 0, 'columns': 0, 'bytes': 0, 'mb': 0}
-
+        memory_usage["ms1_df"] = {"rows": 0, "columns": 0, "bytes": 0, "mb": 0}
+
     # Check chrom_df
     if self.chrom_df is not None and len(self.chrom_df) > 0:
         chrom_bytes = self.chrom_df.estimated_size()
-        memory_usage['chrom_df'] = {
-            'rows': len(self.chrom_df),
-            'columns': len(self.chrom_df.columns),
-            'bytes': chrom_bytes,
-            'mb': chrom_bytes / (1024 * 1024),
+        memory_usage["chrom_df"] = {
+            "rows": len(self.chrom_df),
+            "columns": len(self.chrom_df.columns),
+            "bytes": chrom_bytes,
+            "mb": chrom_bytes / (1024 * 1024),
         }
         total_bytes += chrom_bytes
     else:
-        memory_usage['chrom_df'] = {'rows': 0, 'columns': 0, 'bytes': 0, 'mb': 0}
-
+        memory_usage["chrom_df"] = {"rows": 0, "columns": 0, "bytes": 0, "mb": 0}
+
     # Add total memory usage
-    memory_usage['total'] = {
-        'bytes': total_bytes,
-        'mb': total_bytes / (1024 * 1024),
-        'gb': total_bytes / (1024 * 1024 * 1024),
+    memory_usage["total"] = {
+        "bytes": total_bytes,
+        "mb": total_bytes / (1024 * 1024),
+        "gb": total_bytes / (1024 * 1024 * 1024),
     }
-
+
     # Log the memory usage summary
-    if hasattr(self, 'logger'):
+    if hasattr(self, "logger"):
         self.logger.debug(f"Total DataFrame memory usage: {memory_usage['total']['mb']:.2f} MB")
         for df_name, stats in memory_usage.items():
-            if df_name != 'total' and stats['bytes'] > 0:
+            if df_name != "total" and stats["bytes"] > 0:
                 self.logger.debug(f"{df_name}: {stats['rows']} rows, {stats['mb']:.2f} MB")
-
-    return memory_usage['total']['mb']
+
+    return memory_usage["total"]["mb"]
 
 
 def get_dda_stats(self):
```
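The hunk above appears to be a formatting-only pass (single to double quotes, stripped trailing whitespace), but the underlying pattern is worth noting: polars' `DataFrame.estimated_size()` returns an estimated in-memory size in bytes, and the function simply sums it across the Sample's frames. A minimal standalone sketch of that accounting pattern, with illustrative names rather than masster's API:

```python
import polars as pl


def estimate_df_memory(dfs: dict) -> dict:
    """Sum polars' estimated_size() over a mapping of name -> DataFrame (or None)."""
    usage, total = {}, 0
    for name, df in dfs.items():
        if df is not None and len(df) > 0:
            n = df.estimated_size()  # estimated size in bytes
            usage[name] = {"rows": len(df), "bytes": n, "mb": n / (1024 * 1024)}
            total += n
        else:
            usage[name] = {"rows": 0, "bytes": 0, "mb": 0}
    usage["total"] = {"bytes": total, "mb": total / (1024 * 1024)}
    return usage


print(estimate_df_memory({"features_df": pl.DataFrame({"mz": [100.1, 200.2]}), "scans_df": None}))
```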
```diff
@@ -121,7 +121,7 @@ def _get_scan_uids(self, scans=None, verbose=True):
 def _get_feature_uids(self, features=None, verbose=True):
     """
     Get feature UIDs from various input types.
-
+
     Parameters:
         features: Can be one of the following:
             - None: Returns all feature UIDs from self.features_df
@@ -129,7 +129,7 @@ def _get_feature_uids(self, features=None, verbose=True):
             - polars.DataFrame: Extracts unique values from 'feature_uid' or 'feature_id' column
             - pandas.DataFrame: Extracts unique values from 'feature_uid' or 'feature_id' column
         verbose (bool): Whether to log errors for invalid inputs
-
+
     Returns:
         list: List of feature UIDs
     """
@@ -146,7 +146,7 @@ def _get_feature_uids(self, features=None, verbose=True):
             if verbose:
                 self.logger.warning("No features_df available to validate feature UIDs.")
             return []
-
+
         valid_feature_uids = self.features_df.get_column("feature_uid").to_list()
         feature_uids = [f for f in features if f in valid_feature_uids]
         if verbose and not feature_uids:
@@ -155,50 +155,53 @@ def _get_feature_uids(self, features=None, verbose=True):
     # Handle polars and pandas DataFrames
     try:
         # Check if it's a polars DataFrame
-        if hasattr(features, 'columns') and hasattr(features, 'get_column'):
+        if hasattr(features, "columns") and hasattr(features, "get_column"):
             # Polars DataFrame
             feature_column = None
-            if 'feature_uid' in features.columns:
-                feature_column = 'feature_uid'
-            elif 'feature_id' in features.columns:
-                feature_column = 'feature_id'
-
+            if "feature_uid" in features.columns:
+                feature_column = "feature_uid"
+            elif "feature_id" in features.columns:
+                feature_column = "feature_id"
+
             if feature_column is None:
                 if verbose:
                     self.logger.error("No 'feature_uid' or 'feature_id' column found in polars DataFrame.")
                 return []
-
+
             # Get unique values from the column
             feature_uids = features.get_column(feature_column).unique().to_list()
-
+
         # Check if it's a pandas DataFrame
-        elif hasattr(features, 'columns') and hasattr(features, 'iloc'):
+        elif hasattr(features, "columns") and hasattr(features, "iloc"):
             # Pandas DataFrame
             import pandas as pd
+
             if not isinstance(features, pd.DataFrame):
                 if verbose:
-                    self.logger.error("Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame.")
+                    self.logger.error(
+                        "Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame."
+                    )
                 return []
-
+
             feature_column = None
-            if 'feature_uid' in features.columns:
-                feature_column = 'feature_uid'
-            elif 'feature_id' in features.columns:
-                feature_column = 'feature_id'
-
+            if "feature_uid" in features.columns:
+                feature_column = "feature_uid"
+            elif "feature_id" in features.columns:
+                feature_column = "feature_id"
+
             if feature_column is None:
                 if verbose:
                     self.logger.error("No 'feature_uid' or 'feature_id' column found in pandas DataFrame.")
                 return []
-
+
             # Get unique values from the column
             feature_uids = features[feature_column].unique().tolist()
-
+
         else:
             if verbose:
                 self.logger.error("Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame.")
             return []
-
+
     except Exception as e:
         if verbose:
             self.logger.error(f"Error processing DataFrame input: {e}")
```
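`_get_feature_uids` distinguishes polars from pandas by duck typing (`get_column` vs `iloc`) rather than importing both libraries up front. A self-contained sketch of that dispatch, using a hypothetical `feature_uid` column:

```python
import polars as pl


def extract_uids(obj, column: str = "feature_uid") -> list:
    """Return unique IDs from either a polars or a pandas DataFrame (sketch)."""
    if hasattr(obj, "get_column"):  # polars DataFrames expose get_column()
        return obj.get_column(column).unique().to_list()
    if hasattr(obj, "iloc"):  # pandas DataFrames expose iloc
        return obj[column].unique().tolist()
    raise TypeError("Expected a polars or pandas DataFrame.")


print(extract_uids(pl.DataFrame({"feature_uid": [1, 1, 2]})))  # e.g. [1, 2]
```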
```diff
@@ -301,7 +304,7 @@ def select(
 ):
     """
     Select features based on specified criteria and return the filtered DataFrame.
-
+
     Parameters:
         mz: m/z range filter (tuple for range, single value for minimum)
         rt: retention time range filter (tuple for range, single value for minimum)
@@ -315,7 +318,7 @@ def select(
         height_scaled: scaled height filter (tuple for range, single value for minimum)
         prominence: prominence filter (tuple for range, single value for minimum)
         height: height filter (tuple for range, single value for minimum)
-
+
     Returns:
         polars.DataFrame: Filtered features DataFrame
     """
```
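The docstring's filter convention (a tuple means a range, a single value means a minimum) maps naturally onto polars expressions. A hedged sketch of how such a criterion could be turned into a filter, assuming a closed range for tuples; `range_expr` is illustrative, not masster's API:

```python
import polars as pl


def range_expr(col: str, bounds):
    """Tuple -> closed range, scalar -> minimum (assumed semantics from the docstring)."""
    if isinstance(bounds, tuple):
        lo, hi = bounds
        return pl.col(col).is_between(lo, hi)
    return pl.col(col) >= bounds


feats = pl.DataFrame({"mz": [100.0, 250.0, 400.0], "rt": [10.0, 20.0, 30.0]})
print(feats.filter(range_expr("mz", (200.0, 500.0)) & range_expr("rt", 15.0)))
```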
```diff
@@ -491,24 +494,22 @@ def select(
     return feats
 
 
-
-
 def _features_sync(self):
     """
-    Synchronizes the OpenMS FeatureMap and features_df by removing features that exist in one 
+    Synchronizes the OpenMS FeatureMap and features_df by removing features that exist in one
     but not the other, using feature_id for mapping between them.
-
+
     This function ensures that:
     - Features in the FeatureMap that don't have corresponding entries in features_df are removed
     - Features in features_df that don't have corresponding entries in the FeatureMap are removed
-
+
     Returns:
         None
-
+
     Side Effects:
         Updates self.features (OpenMS FeatureMap) by creating a new FeatureMap with synchronized features
         Updates self.features_df by filtering to only include features present in the FeatureMap
-
+
     Note:
         Uses feature_id as the mapping key. feature_id contains OpenMS unique IDs that correspond
         to the unique IDs of features in the FeatureMap.
@@ -516,34 +517,34 @@ def _features_sync(self):
     if self.features_df is None or self.features is None:
         self.logger.warning("Cannot sync: features_df or FeatureMap is None.")
         return
-
+
     try:
         # Import pyopenms
         import pyopenms as oms
-
+
         # Get feature_ids from features_df
         df_feature_ids = set(self.features_df.get_column("feature_id").to_list())
-
+
         # Get feature unique IDs from FeatureMap
         feature_map_ids = set()
         for i in range(self.features.size()):
             feature = self.features[i]
             unique_id = str(feature.getUniqueId())  # Convert to string to match DataFrame
             feature_map_ids.add(unique_id)
-
+
         # Find features that exist in both
         common_feature_ids = df_feature_ids & feature_map_ids
-
+
         # Safety check: log error and exit if no features are matching
         if not common_feature_ids:
             self.logger.error(
                 f"No matching features found between FeatureMap and features_df. "
                 f"FeatureMap has {len(feature_map_ids)} features, "
                 f"features_df has {len(df_feature_ids)} features. "
-                f"Cannot synchronize - this indicates a data inconsistency. Exiting without changes."
+                f"Cannot synchronize - this indicates a data inconsistency. Exiting without changes.",
             )
             return
-
+
         # Create new synchronized FeatureMap with only common features
         synced_feature_map = oms.FeatureMap()
         for i in range(self.features.size()):
@@ -551,19 +552,19 @@ def _features_sync(self):
             unique_id = str(feature.getUniqueId())
             if unique_id in common_feature_ids:
                 synced_feature_map.push_back(feature)
-
+
         # Filter features_df to only include features that exist in FeatureMap
         synced_features_df = self.features_df.filter(
-            pl.col("feature_id").is_in(list(common_feature_ids))
+            pl.col("feature_id").is_in(list(common_feature_ids)),
        )
-
+
         # Update the objects
         original_map_size = self.features.size()
         original_df_size = len(self.features_df)
-
+
         self.features = synced_feature_map
         self.features_df = synced_features_df
-
+
         # Log the synchronization results
         map_removed = original_map_size - self.features.size()
         df_removed = original_df_size - len(self.features_df)
```
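The sync logic reduces to a set intersection over stringified OpenMS unique IDs, applied to both containers. The core membership test in isolation (polars side only; the FeatureMap filtering in the hunk above uses the same test):

```python
import polars as pl


def sync_df_to_map(features_df: pl.DataFrame, feature_map_ids: set) -> pl.DataFrame:
    """Keep only rows whose feature_id also occurs in the FeatureMap (sketch)."""
    df_ids = set(features_df.get_column("feature_id").to_list())
    common = df_ids & feature_map_ids  # the intersection drives both sides of the sync
    return features_df.filter(pl.col("feature_id").is_in(list(common)))


df = pl.DataFrame({"feature_id": ["101", "102", "103"]})
print(sync_df_to_map(df, {"102", "103", "104"}))  # rows "102" and "103" survive
```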
```diff
@@ -573,36 +574,36 @@ def _features_sync(self):
             self.logger.info(
                 f"Features synchronized. FeatureMap: {original_map_size} -> {self.features.size()} "
                 f"({map_removed} removed), DataFrame: {original_df_size} -> {len(self.features_df)} "
-                f"({df_removed} removed)"
+                f"({df_removed} removed)",
             )
         else:
             self.logger.debug(
                 f"Features synchronized. FeatureMap: {original_map_size} -> {self.features.size()} "
                 f"({map_removed} removed), DataFrame: {original_df_size} -> {len(self.features_df)} "
-                f"({df_removed} removed)"
+                f"({df_removed} removed)",
             )
-
+
     except ImportError:
         self.logger.warning("PyOpenMS not available, cannot sync FeatureMap")
     except Exception as e:
         self.logger.error(f"Error during feature synchronization: {e}")
 
 
-def features_delete(self, features: list|None=None):
+def features_delete(self, features: list | None = None):
     """
     Delete features from both self.features_df and self.features based on a list of feature UIDs.
-
+
     Parameters:
         features (list, optional): List of feature UIDs to delete. If None, all features will be deleted.
-
+
     Returns:
         None
-
+
     Side Effects:
         Updates self.features_df by removing specified features.
         Updates self.features (OpenMS FeatureMap) by creating a new FeatureMap with only the remaining features.
         Updates self.scans_df by removing feature_uid associations for deleted features.
-
+
     Note:
         The function preserves all OpenMS FeatureMap information by creating a new FeatureMap
         containing only the features that should remain after deletion.
```
```diff
@@ -610,33 +611,33 @@ def features_delete(self, features: list|None=None):
     if self.features_df is None:
         self.logger.warning("No features found.")
         return
-
+
     # Get the feature UIDs to delete
     feature_uids_to_delete = self._get_feature_uids(features=features, verbose=True)
-
+
     if not feature_uids_to_delete:
         self.logger.warning("No valid feature UIDs provided for deletion.")
         return
-
+
     original_count = len(self.features_df)
-
+
     # Update features_df by filtering out the features to delete
     self.features_df = self.features_df.filter(
-        ~pl.col("feature_uid").is_in(feature_uids_to_delete)
+        ~pl.col("feature_uid").is_in(feature_uids_to_delete),
     )
-
+
     # Update the OpenMS FeatureMap by creating a new one with only features to keep
     if self.features is not None:
         try:
             # Import pyopenms
             import pyopenms as oms
-
+
             # Create new FeatureMap with only features to keep
             filtered_map = oms.FeatureMap()
-
+
             # Get the feature UIDs that should remain after deletion
             remaining_feature_uids = self.features_df.get_column("feature_uid").to_list()
-
+
             # Iterate through existing features and keep only those not in deletion list
             for i in range(self.features.size()):
                 feature = self.features[i]
@@ -644,25 +645,25 @@ def features_delete(self, features: list|None=None):
                 # we can check if the current index is in the remaining UIDs
                 if i in remaining_feature_uids:
                     filtered_map.push_back(feature)
-
+
             # Replace the original FeatureMap with the filtered one
             self.features = filtered_map
             self.logger.debug(f"OpenMS FeatureMap updated with {filtered_map.size()} remaining features.")
-
+
         except ImportError:
             self.logger.warning("PyOpenMS not available, only updating features_df")
         except Exception as e:
             self.logger.warning(f"Could not update OpenMS FeatureMap: {e}. FeatureMap may be out of sync.")
-
+
     # Update scans_df to remove feature_uid associations for deleted features
-    if hasattr(self, 'scans_df') and self.scans_df is not None:
+    if hasattr(self, "scans_df") and self.scans_df is not None:
         self.scans_df = self.scans_df.with_columns(
             pl.when(pl.col("feature_uid").is_in(feature_uids_to_delete))
             .then(None)
             .otherwise(pl.col("feature_uid"))
-            .alias("feature_uid")
+            .alias("feature_uid"),
         )
-
+
     deleted_count = original_count - len(self.features_df)
     self.logger.info(f"Deleted {deleted_count} features. Remaining features: {len(self.features_df)}")
 
```
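Rather than dropping rows, the scans_df update above nulls out dangling `feature_uid` references with a `when/then/otherwise` expression, so the scans themselves are preserved. The same pattern in isolation:

```python
import polars as pl

scans = pl.DataFrame({"scan_uid": [1, 2, 3], "feature_uid": [10, 11, 12]})
deleted_uids = [11]

# Null out feature_uid where it points at a deleted feature; keep it otherwise.
scans = scans.with_columns(
    pl.when(pl.col("feature_uid").is_in(deleted_uids))
    .then(None)
    .otherwise(pl.col("feature_uid"))
    .alias("feature_uid"),
)
print(scans)  # scan 2 keeps its row but loses its feature_uid
```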
```diff
@@ -702,21 +703,21 @@ def _delete_ms2(self):
 def features_filter(self, features):
     """
     Keep only the specified features and delete all others. This is the opposite of features_delete().
-
+
     Parameters:
         features: Can be one of the following:
             - list: List of feature UIDs to keep
             - polars.DataFrame: DataFrame with 'feature_uid' or 'feature_id' column - extracts unique values to keep
             - pandas.DataFrame: DataFrame with 'feature_uid' or 'feature_id' column - extracts unique values to keep
-
+
     Returns:
         None
-
+
     Side Effects:
         Updates self.features_df by keeping only the specified features.
         Updates self.features (OpenMS FeatureMap) by creating a new FeatureMap with only the specified features.
         Updates self.scans_df by removing feature_uid associations for deleted features.
-
+
     Note:
         The function preserves all OpenMS FeatureMap information by creating a new FeatureMap
         containing only the features that should be kept.
@@ -724,38 +725,38 @@ def features_filter(self, features):
     if self.features_df is None:
         self.logger.warning("No features found.")
         return
-
+
     if features is None:
         self.logger.warning("No features specified to keep. Use features_delete() to delete all features.")
         return
-
+
     # Get the feature UIDs to keep
     feature_uids_to_keep = self._get_feature_uids(features=features, verbose=True)
-
+
     if not feature_uids_to_keep:
         self.logger.warning("No valid feature UIDs provided to keep.")
         return
-
+
     original_count = len(self.features_df)
-
+
     # Update features_df by keeping only the specified features
     self.features_df = self.features_df.filter(
-        pl.col("feature_uid").is_in(feature_uids_to_keep)
+        pl.col("feature_uid").is_in(feature_uids_to_keep),
     )
-
+
     # Calculate which features were deleted (all except the ones to keep)
     all_feature_uids = set(range(original_count))  # Assuming sequential UIDs
     feature_uids_to_delete = list(all_feature_uids - set(feature_uids_to_keep))
-
+
     # Update the OpenMS FeatureMap by creating a new one with only features to keep
     if self.features is not None:
         try:
             # Import pyopenms
             import pyopenms as oms
-
+
             # Create new FeatureMap with only features to keep
             filtered_map = oms.FeatureMap()
-
+
             # Iterate through existing features and keep only those in the keep list
             for i in range(self.features.size()):
                 feature = self.features[i]
```
```diff
@@ -763,25 +764,25 @@ def features_filter(self, features):
                 # we can check if the current index is in the keep UIDs
                 if i in feature_uids_to_keep:
                     filtered_map.push_back(feature)
-
+
             # Replace the original FeatureMap with the filtered one
             self.features = filtered_map
             self.logger.debug(f"OpenMS FeatureMap updated with {filtered_map.size()} remaining features.")
-
+
         except ImportError:
             self.logger.warning("PyOpenMS not available, only updating features_df")
         except Exception as e:
             self.logger.warning(f"Could not update OpenMS FeatureMap: {e}. FeatureMap may be out of sync.")
-
+
     # Update scans_df to remove feature_uid associations for deleted features
-    if hasattr(self, 'scans_df') and self.scans_df is not None and feature_uids_to_delete:
+    if hasattr(self, "scans_df") and self.scans_df is not None and feature_uids_to_delete:
         self.scans_df = self.scans_df.with_columns(
             pl.when(pl.col("feature_uid").is_in(feature_uids_to_delete))
             .then(None)
             .otherwise(pl.col("feature_uid"))
-            .alias("feature_uid")
+            .alias("feature_uid"),
         )
-
+
     kept_count = len(self.features_df)
     deleted_count = original_count - kept_count
     self.logger.info(f"Kept {kept_count} features, deleted {deleted_count} features. Remaining features: {kept_count}")
```
```diff
@@ -789,27 +790,27 @@ def features_filter(self, features):
 
 def set_source(self, filename):
     """
-    Reassign file_source. If filename contains only a path, keep the current basename 
-    and build an absolute path. Check that the new file exists before overwriting 
+    Reassign file_source. If filename contains only a path, keep the current basename
+    and build an absolute path. Check that the new file exists before overwriting
     the old file_source.
-
+
     Parameters:
         filename (str): New file path or directory path
-
+
     Returns:
         None
     """
     import os
-
+
     # Store the old file_source for logging
-    old_file_source = getattr(self, 'file_source', None)
-
+    old_file_source = getattr(self, "file_source", None)
+
     # Check if filename is just a directory path
     if os.path.isdir(filename):
         if old_file_source is None:
             self.logger.error("Cannot build path: no current file_source available")
             return
-
+
         # Get the basename from current file_source
         current_basename = os.path.basename(old_file_source)
         # Build new absolute path
@@ -817,15 +818,15 @@ def set_source(self, filename):
     else:
         # filename is a full path, make it absolute
         new_file_path = os.path.abspath(filename)
-
+
     # Check if the new file exists
     if not os.path.exists(new_file_path):
         self.logger.error(f"File does not exist: {new_file_path}")
         return
-
+
     # Update file_source
     self.file_source = new_file_path
-
+
     # Log the change
     if old_file_source is not None:
         self.logger.info(f"Updated file_source from {old_file_source} to {self.file_source}")
```