masster 0.5.9__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/adducts.py +2 -2
- masster/sample/helpers.py +47 -15
- masster/sample/plot.py +702 -574
- masster/sample/processing.py +1 -1
- masster/sample/sample.py +91 -48
- masster/study/h5.py +32 -14
- masster/study/helpers.py +23 -4
- masster/study/load.py +1 -164
- masster/study/merge.py +2 -8
- masster/study/plot.py +105 -35
- masster/study/processing.py +6 -6
- masster/study/study5_schema.json +3 -0
- {masster-0.5.9.dist-info → masster-0.5.10.dist-info}/METADATA +3 -1
- {masster-0.5.9.dist-info → masster-0.5.10.dist-info}/RECORD +18 -18
- {masster-0.5.9.dist-info → masster-0.5.10.dist-info}/WHEEL +0 -0
- {masster-0.5.9.dist-info → masster-0.5.10.dist-info}/entry_points.txt +0 -0
- {masster-0.5.9.dist-info → masster-0.5.10.dist-info}/licenses/LICENSE +0 -0
masster/sample/processing.py
CHANGED
@@ -1264,7 +1264,7 @@ def find_ms2(self, **kwargs):

     # Log completion
     self.logger.success(
-        f"MS2 linking completed.
+        f"MS2 linking completed. Features with MS2 data: {c}.",
     )
     self.features_df = features_df

masster/sample/sample.py
CHANGED
@@ -1,35 +1,98 @@
 """
-sample.py
+sample.py - Mass Spectrometry Sample Analysis Module

-This module provides tools for processing and analyzing Data-Dependent Acquisition (DDA)
-It defines the `Sample` class, which offers methods to load, process, analyze,
-
+This module provides comprehensive tools for processing and analyzing Data-Dependent Acquisition (DDA)
+mass spectrometry data. It defines the `Sample` class, which offers methods to load, process, analyze,
+and visualize mass spectrometry data from various file formats.
+
+Supported File Formats:
+    - mzML (standard XML format for mass spectrometry data)
+    - Thermo RAW (native Thermo Fisher Scientific format)
+    - Sciex WIFF (native Sciex format)
+    - Sample5 (MASSter's native HDF5-based format for optimized storage)

 Key Features:
-    - **File Handling**: Load and save data in multiple formats
-    - **Feature Detection**: Detect and process mass spectrometry features
-    - **Spectrum Analysis**: Retrieve and analyze MS1/MS2 spectra
-    - **
-    - **
-
-
-    -
-    -
-    -
-
+    - **File Handling**: Load and save data in multiple formats with automatic format detection
+    - **Feature Detection**: Detect and process mass spectrometry features using advanced algorithms
+    - **Spectrum Analysis**: Retrieve and analyze MS1/MS2 spectra with comprehensive metadata
+    - **Adduct Detection**: Find and annotate adducts and in-source fragments
+    - **Isotope Analysis**: Detect and process isotopic patterns
+    - **Chromatogram Extraction**: Extract and analyze chromatograms (EIC, BPC, TIC)
+    - **Visualization**: Generate interactive and static plots for spectra, chromatograms, and 2D maps
+    - **Statistics**: Compute and export detailed DDA run statistics and quality metrics
+    - **Data Export**: Export processed data to various formats (XLSX, MGF, etc.)
+    - **Memory Management**: Efficient handling of large datasets with on-disk storage options
+
+Core Dependencies:
+    - `pyopenms`: OpenMS library for file handling and feature detection algorithms
+    - `polars`: High-performance data manipulation and analysis
+    - `numpy`: Numerical computations and array operations
+    - `bokeh`, `panel`, `holoviews`, `datashader`: Interactive visualizations and dashboards
+    - `h5py`: HDF5 file format support for Sample5 files

 Classes:
-
-
-Example Usage:
-    ```python
-    from masster.sample import Sample
+    Sample: Main class for handling DDA mass spectrometry data, providing methods for
+        data import, processing, analysis, and visualization.

-
-
-
-
+Typical Workflow:
+    1. Load mass spectrometry data file
+    2. Detect features using find_features()
+    3. Optionally find MS2 spectra with find_ms2()
+    4. Analyze and visualize results
+    5. Export processed data

+Example Usage:
+    Basic analysis workflow:
+
+    ```python
+    from masster.sample import Sample
+
+    # Load a mass spectrometry file
+    sample = Sample(filename="experiment.mzML")
+
+    # Detect features
+    sample.find_features()
+
+    # Find MS2 spectra for features
+    sample.find_ms2()
+
+    # Generate 2D visualization
+    sample.plot_2d()
+
+    # Export results
+    sample.export_features("features.xlsx")
+    ```
+
+    Advanced usage with custom parameters:
+
+    ```python
+    from masster.sample import Sample
+    from masster.sample.defaults import sample_defaults, find_features_defaults
+
+    # Create custom parameters
+    params = sample_defaults(log_level="DEBUG", label="My Experiment")
+    ff_params = find_features_defaults(noise_threshold_int=1000)
+
+    # Initialize with custom parameters
+    sample = Sample(params=params)
+    sample.load("data.raw")
+
+    # Feature detection with custom parameters
+    sample.find_features(params=ff_params)
+
+    # Generate comprehensive statistics
+    stats = sample.get_dda_stats()
+    sample.plot_dda_stats()
+    ```
+
+Notes:
+    - The Sample class maintains processing history and parameters for reproducibility
+    - Large files can be processed with on-disk storage to manage memory usage
+    - All visualizations are interactive by default and can be exported as static images
+    - The module supports both individual sample analysis and batch processing workflows
+
+Version: Part of the MASSter mass spectrometry analysis framework
+Author: Zamboni Lab, ETH Zurich
 """

 import importlib
@@ -49,16 +112,12 @@ from masster.sample.defaults.get_spectrum_def import get_spectrum_defaults

 # Sample-specific imports - keeping these private, only for internal use
 from masster.sample.h5 import _load_sample5
-# from masster.sample.h5 import _load_sample5_study
 from masster.sample.h5 import _save_sample5
-# from masster.sample.helpers import _delete_ms2
 from masster.sample.helpers import _estimate_memory_usage
 from masster.sample.helpers import _get_scan_uids
 from masster.sample.helpers import _get_feature_uids
-# from masster.sample.helpers import _features_sync - made internal only
 from masster.sample.adducts import find_adducts
 from masster.sample.adducts import _get_adducts
-# Removed _get_adducts - only used in study modules
 from masster.sample.helpers import features_delete
 from masster.sample.helpers import features_filter
 from masster.sample.helpers import features_select
@@ -70,23 +129,17 @@ from masster.sample.helpers import get_eic
 from masster.sample.helpers import set_source
 from masster.sample.helpers import _recreate_feature_map
 from masster.sample.helpers import _get_feature_map
-# Load functions - keeping only specific ones needed for external API
-# from masster.sample.load import _load_featureXML - made internal only
-# from masster.sample.load import _load_ms2data - made internal only
-# from masster.sample.load import _load_mzML - made internal only
-# from masster.sample.load import _load_raw - made internal only
-# from masster.sample.load import _load_wiff - made internal only
 from masster.sample.load import chrom_extract
 from masster.sample.load import _index_file
 from masster.sample.load import load
 from masster.sample.load import load_noms1
-from masster.sample.load import _load_ms1
+from masster.sample.load import _load_ms1
 from masster.sample.load import sanitize
 from masster.sample.plot import plot_2d
 from masster.sample.plot import plot_2d_oracle
 from masster.sample.plot import plot_dda_stats
 from masster.sample.plot import plot_chrom
-from masster.sample.plot import plot_features_stats
+from masster.sample.plot import plot_features_stats
 from masster.sample.plot import plot_ms2_cycle
 from masster.sample.plot import plot_ms2_eic
 from masster.sample.plot import plot_ms2_q1
@@ -113,7 +166,6 @@ from masster.sample.save import export_features
 from masster.sample.save import export_mgf
 from masster.sample.save import export_xlsx
 from masster.sample.save import save
-# Removed internal-only import: _save_featureXML


 class Sample:
@@ -402,6 +454,7 @@ class Sample:
             f"{base_modname}.chromatogram",
             f"{base_modname}.spectrum",
             f"{base_modname}.logger",
+            f"{base_modname}.lib",
         ]

         # Add study submodules
@@ -414,17 +467,9 @@ class Sample:
         ):
             study_modules.append(module_name)

-        """ # Add parameters submodules
-        parameters_modules = []
-        parameters_module_prefix = f"{base_modname}.parameters."
-        for module_name in sys.modules:
-            if module_name.startswith(parameters_module_prefix) and module_name != current_module:
-                parameters_modules.append(module_name)
-        """
-
         all_modules_to_reload = (
             core_modules + sample_modules + study_modules
-        )
+        )

         # Reload all discovered modules
         for full_module_name in all_modules_to_reload:
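The two hunks above simplify `Sample`'s module-reload helper: the commented-out parameters-submodule block is deleted and the reload list is reduced to core, sample, and study modules. A minimal sketch of the underlying pattern, with `masster` assumed as the base package and illustrative names throughout (this is not the exact helper):

```python
import importlib
import sys

base_modname = "masster"  # assumption: the package root the helper scans for

# Snapshot matching module names first; reloading while iterating
# sys.modules directly can raise RuntimeError.
modules_to_reload = [
    name for name in sys.modules if name.startswith(f"{base_modname}.")
]

for name in modules_to_reload:
    module = sys.modules.get(name)
    if module is not None:
        importlib.reload(module)
```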
@@ -466,8 +511,6 @@ class Sample:
         else:
             str += "Features: 0\n"
             str += "Features with MS2 spectra: 0\n"
-
-        # estimate memory usage
         mem_usage = self._estimate_memory_usage()
         str += f"Estimated memory usage: {mem_usage:.2f} MB\n"

masster/study/h5.py
CHANGED
@@ -818,6 +818,19 @@ def _reorder_columns_by_schema(

 def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataFrame:
     """Create DataFrame handling Object columns properly."""
+    # First check all data for numpy object arrays and move them to object columns
+    additional_object_cols = []
+    for k, v in data.items():
+        if k not in object_columns and hasattr(v, 'dtype') and str(v.dtype) == 'object':
+            # This is a numpy object array that should be treated as object
+            additional_object_cols.append(k)
+            object_columns.append(k)
+
+    if additional_object_cols:
+        # Re-run reconstruction for these columns
+        for col in additional_object_cols:
+            data[col] = _reconstruct_object_column(data[col], col)
+
     object_data = {k: v for k, v in data.items() if k in object_columns}
     regular_data = {k: v for k, v in data.items() if k not in object_columns}

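The added block above routes any numpy object-dtype array into the object-column path before the polars DataFrame is assembled. A standalone sketch of that dtype check, with made-up column data for illustration:

```python
import numpy as np

# Ragged per-row payloads end up as numpy object arrays; regular numeric
# columns keep a concrete dtype such as float64.
data = {
    "mz": np.array([100.05, 250.12]),
    "spec": np.array([[1, 2], [3, 4, 5]], dtype=object),
}

additional_object_cols = [
    k for k, v in data.items()
    if hasattr(v, "dtype") and str(v.dtype) == "object"
]
print(additional_object_cols)  # ['spec']
```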
@@ -1103,11 +1116,18 @@ def _load_dataframe_from_group(
             logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
             column_data = group[col][:]

-            #
-
-
+            # Check if this is a known object column by name
+            known_object_columns = {"ms1_spec", "chrom", "ms2_scans", "ms2_specs", "spec", "adducts", "iso"}
+            is_known_object = col in known_object_columns
+
+            if is_known_object:
+                # Known object column, always reconstruct
+                data[col] = _reconstruct_object_column(column_data, col)
+                if col not in object_columns:
+                    object_columns.append(col)
+            elif len(column_data) > 0 and isinstance(column_data[0], bytes):
                 try:
-                    # Check if it looks like JSON
+                    # Check if it looks like JSON for unknown columns
                     test_decode = column_data[0].decode("utf-8")
                     if test_decode.startswith("[") or test_decode.startswith("{"):
                         # Looks like JSON, treat as object column
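For extra columns that are not in the schema, the new logic first consults a fixed set of known object-column names and only then falls back to sniffing byte payloads for JSON. A minimal sketch of that fallback heuristic, assuming JSON-encoded byte strings (the column data here is hypothetical):

```python
import json

column_data = [b"[1, 2, 3]", b"[4, 5]"]  # hypothetical HDF5 column payload

if len(column_data) > 0 and isinstance(column_data[0], bytes):
    first = column_data[0].decode("utf-8")
    if first.startswith("[") or first.startswith("{"):
        # Looks like JSON: decode the whole column into Python objects
        decoded = [json.loads(x.decode("utf-8")) for x in column_data]
        print(decoded)  # [[1, 2, 3], [4, 5]]
```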
@@ -1738,9 +1758,7 @@ def _save_study5(self, filename):
             )
             pbar.update(1)

-    self.logger.success(f"Study saved
-    self.logger.debug(f"Save completed for {filename}")
-    self.logger.debug(f"Save completed for {filename}")
+    self.logger.success(f"Study saved to {filename}")


 def _load_study5(self, filename=None):
@@ -1859,7 +1877,7 @@ def _load_study5(self, filename=None):
                 )
             else:
                 self.logger.debug(
-                    "
+                    "Updated parameters from loaded history",
                 )
         else:
             self.logger.debug(
@@ -2093,8 +2111,8 @@ def _load_study5(self, filename=None):
         # Ensure the column is Int64 type
         self.samples_df = self.samples_df.cast({"map_id": pl.Int64})

-        self.logger.
-            f"
+        self.logger.debug(
+            f"Sanitized {sample_count} samples to indexed map_id format (0 to {sample_count - 1})",
         )

     # Sanitize null feature_id and consensus_id values with new UIDs (same method as merge)
@@ -2218,7 +2236,7 @@ def _sanitize_nulls(self):
             pl.Series("feature_id", feature_ids, dtype=pl.Utf8)
         )

-        self.logger.debug(f"
+        self.logger.debug(f"Sanitized {null_feature_ids} feature_id values")

     # Sanitize consensus_df consensus_id column
     if hasattr(self, 'consensus_df') and self.consensus_df is not None and not self.consensus_df.is_empty():
@@ -2244,8 +2262,8 @@ def _sanitize_nulls(self):
         self.consensus_df = self.consensus_df.with_columns(
             pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
         )
-
-        self.logger.debug(f"
+
+        self.logger.debug(f"Sanitized {null_consensus_ids} consensus_id values")

     # Sanitize rt_original in features_df by replacing null or NaN values with rt values
     if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
@@ -2262,4 +2280,4 @@ def _sanitize_nulls(self):
                 .otherwise(pl.col("rt_original"))
                 .alias("rt_original")
             )
-            self.logger.debug(f"
+            self.logger.debug(f"Sanitized {null_or_nan_rt_original} rt_original values")
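The `_sanitize_nulls` hunks above complete previously truncated log strings with concrete counts; the last one accompanies the rt_original repair, which backfills null or NaN `rt_original` values from `rt`. A self-contained polars sketch of that when/otherwise pattern, on toy data:

```python
import polars as pl

df = pl.DataFrame({"rt": [1.0, 2.0, 3.0], "rt_original": [None, float("nan"), 2.9]})

# Replace null or NaN rt_original values with the corresponding rt value.
df = df.with_columns(
    pl.when(pl.col("rt_original").is_null() | pl.col("rt_original").is_nan())
    .then(pl.col("rt"))
    .otherwise(pl.col("rt_original"))
    .alias("rt_original")
)
print(df)  # rt_original becomes [1.0, 2.0, 2.9]
```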
masster/study/helpers.py
CHANGED
@@ -1630,7 +1630,7 @@ def restore_features(self, samples=None, maps=False):
             self.logger.error(f"Failed to load sample {sample_name}: {e}")
             continue

-    self.logger.
+    self.logger.success(
         f"Completed restoring columns {columns_to_update} from {len(sample_uids)} samples",
     )

@@ -2940,6 +2940,7 @@ def features_delete(self, features):

 def consensus_select(
     self,
+    uid=None,
     mz=None,
     rt=None,
     inty_mean=None,
@@ -2956,14 +2957,12 @@ def consensus_select(
     rt_delta_mean=None,
     id_top_score=None,
     identified=None,
-    # New adduct filter parameters
     adduct_top=None,
     adduct_charge_top=None,
     adduct_mass_neutral_top=None,
     adduct_mass_shift_top=None,
     adduct_group=None,
     adduct_of=None,
-    # New identification filter parameters
     id_top_name=None,
     id_top_class=None,
     id_top_adduct=None,
@@ -2976,6 +2975,11 @@ def consensus_select(
     OPTIMIZED VERSION: Enhanced performance with lazy evaluation, vectorized operations, and efficient filtering.

     Parameters:
+        uid: consensus UID filter with flexible formats:
+            - None: include all consensus features (default)
+            - int: single specific consensus_uid
+            - tuple: range of consensus_uids (consensus_uid_min, consensus_uid_max)
+            - list: specific list of consensus_uid values
         mz: m/z filter with flexible formats:
             - float: m/z value ± default tolerance (uses study.parameters.eic_mz_tol)
             - tuple (mz_min, mz_max): range where mz_max > mz_min
@@ -3023,7 +3027,7 @@ def consensus_select(
         return pl.DataFrame()

     # Early return optimization - check if any filters are provided
-    filter_params = [mz, rt, inty_mean, consensus_uid, consensus_id, number_samples,
+    filter_params = [uid, mz, rt, inty_mean, consensus_uid, consensus_id, number_samples,
                      number_ms2, quality, bl, chrom_coherence_mean, chrom_prominence_mean,
                      chrom_prominence_scaled_mean, chrom_height_scaled_mean,
                      rt_delta_mean, id_top_score, identified,
@@ -3044,6 +3048,21 @@ def consensus_select(
     warnings = []

     # Build all filter conditions efficiently
+    # Handle uid parameter first (consensus_uid filter with flexible formats)
+    if uid is not None:
+        if isinstance(uid, int):
+            # Single specific consensus_uid
+            filter_conditions.append(pl.col("consensus_uid") == uid)
+        elif isinstance(uid, tuple) and len(uid) == 2:
+            # Range of consensus_uids (consensus_uid_min, consensus_uid_max)
+            min_uid, max_uid = uid
+            filter_conditions.append((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
+        elif isinstance(uid, list):
+            # Specific list of consensus_uid values
+            filter_conditions.append(pl.col("consensus_uid").is_in(uid))
+        else:
+            self.logger.warning(f"Invalid uid parameter type: {type(uid)}. Expected int, tuple, or list.")
+
     if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
             if mz[1] < mz[0]:
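A short usage sketch for the new `uid` filter, following the parameter documentation above; the `study` object and its data are assumed, not shown in this diff:

```python
# All three forms filter consensus_df on the consensus_uid column.
hit = study.consensus_select(uid=42)             # single consensus_uid
window = study.consensus_select(uid=(100, 200))  # inclusive uid range
subset = study.consensus_select(uid=[1, 5, 9])   # explicit uid list
```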
masster/study/load.py
CHANGED
@@ -139,7 +139,7 @@ def add(
             f"No files found in {folder}. Please check the folder path or file patterns.",
         )
     else:
-        self.logger.debug(f"
+        self.logger.debug(f"Added {counter} samples to the study.")

     # Return a simple summary to suppress marimo's automatic object display
     return f"Added {counter} samples to study"
@@ -2055,169 +2055,6 @@ def _sanitize(self):
     except Exception as e:
         self.logger.error(f"Failed to recreate sanitized DataFrame: {e}")

-'''
-def _load_features(self):
-    """
-    Load features by reconstructing FeatureMaps from the processed features_df data.
-
-    This ensures that the loaded FeatureMaps contain the same processed features
-    as stored in features_df, rather than loading raw features from .featureXML files
-    which may not match the processed data after filtering, alignment, etc.
-    """
-    import polars as pl
-    import pyopenms as oms
-    from tqdm import tqdm
-    from datetime import datetime
-
-    self.features_maps = []
-
-    # Check if features_df exists and is not empty
-    if self.features_df is None:
-        self.logger.warning("features_df is None. Falling back to XML loading.")
-        self._load_features_from_xml()
-        return
-
-    if len(self.features_df) == 0:
-        self.logger.warning("features_df is empty. Falling back to XML loading.")
-        self._load_features_from_xml()
-        return
-
-    # If we get here, we should use the new method
-    self.logger.debug("Reconstructing FeatureMaps from features_df.")
-
-    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
-    # Process each sample in order
-    for sample_index, row_dict in tqdm(
-        enumerate(self.samples_df.iter_rows(named=True)),
-        total=len(self.samples_df),
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Reconstruct FeatureMaps from DataFrame",
-        disable=tdqm_disable,
-    ):
-        sample_uid = row_dict["sample_uid"]
-        sample_name = row_dict["sample_name"]
-
-        # Get features for this sample from features_df
-        sample_features = self.features_df.filter(pl.col("sample_uid") == sample_uid)
-
-        # Create new FeatureMap
-        feature_map = oms.FeatureMap()
-
-        # Convert DataFrame features to OpenMS Features
-        # Keep track of next available feature_id for this sample
-        next_feature_id = 1
-        used_feature_ids = set()
-
-        # First pass: collect existing feature_ids to avoid conflicts
-        for feature_row in sample_features.iter_rows(named=True):
-            if feature_row["feature_id"] is not None:
-                used_feature_ids.add(int(feature_row["feature_id"]))
-
-        # Find the next available feature_id
-        while next_feature_id in used_feature_ids:
-            next_feature_id += 1
-
-        for feature_row in sample_features.iter_rows(named=True):
-            feature = oms.Feature()
-
-            # Set properties from DataFrame (handle missing values gracefully)
-            try:
-                # Skip features with missing critical data
-                if feature_row["mz"] is None:
-                    self.logger.warning("Skipping feature due to missing mz")
-                    continue
-                if feature_row["rt"] is None:
-                    self.logger.warning("Skipping feature due to missing rt")
-                    continue
-                if feature_row["inty"] is None:
-                    self.logger.warning("Skipping feature due to missing inty")
-                    continue
-
-                # Handle missing feature_id by generating a new one
-                if feature_row["feature_id"] is None:
-                    feature_id = next_feature_id
-                    next_feature_id += 1
-                    self.logger.debug(f"Generated new feature_id {feature_id} for feature with missing ID")
-                else:
-                    feature_id = int(feature_row["feature_id"])
-
-                feature.setUniqueId(feature_id)
-                feature.setMZ(float(feature_row["mz"]))
-                feature.setRT(float(feature_row["rt"]))
-                feature.setIntensity(float(feature_row["inty"]))
-
-                # Handle optional fields that might be None
-                if feature_row.get("quality") is not None:
-                    feature.setOverallQuality(float(feature_row["quality"]))
-                if feature_row.get("charge") is not None:
-                    feature.setCharge(int(feature_row["charge"]))
-
-                # Add to feature map
-                feature_map.push_back(feature)
-            except (ValueError, TypeError) as e:
-                self.logger.warning(f"Skipping feature due to conversion error: {e}")
-                continue
-
-        self.features_maps.append(feature_map)
-
-    self.logger.debug(
-        f"Successfully reconstructed {len(self.features_maps)} FeatureMaps from features_df.",
-    )
-'''
-
-'''
-def _load_features_from_xml(self):
-    """
-    Original load_features method that loads from .featureXML files.
-    Used as fallback when features_df is not available.
-    """
-    self.features_maps = []
-    self.logger.debug("Loading features from featureXML files.")
-    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-    for _index, row_dict in tqdm(
-        enumerate(self.samples_df.iter_rows(named=True)),
-        total=len(self.samples_df),
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Load feature maps from XML",
-        disable=tdqm_disable,
-    ):
-        if self.folder is not None:
-            filename = os.path.join(
-                self.folder,
-                row_dict["sample_name"] + ".featureXML",
-            )
-        else:
-            filename = os.path.join(
-                os.getcwd(),
-                row_dict["sample_name"] + ".featureXML",
-            )
-        # check if file exists
-        if not os.path.exists(filename):
-            filename = row_dict["sample_path"].replace(".sample5", ".featureXML")
-
-        if not os.path.exists(filename):
-            self.features_maps.append(None)
-            continue
-
-        fh = oms.FeatureXMLFile()
-        fm = oms.FeatureMap()
-        fh.load(filename, fm)
-        self.features_maps.append(fm)
-    self.logger.debug("Features loaded successfully.")
-'''
-'''
-def _load_consensusXML(self, filename="alignment.consensusXML"):
-    """
-    Load a consensus map from a file.
-    """
-    if not os.path.exists(filename):
-        self.logger.error(f"File {filename} does not exist.")
-        return
-    fh = oms.ConsensusXMLFile()
-    self.consensus_map = oms.ConsensusMap()
-    fh.load(filename, self.consensus_map)
-    self.logger.debug(f"Loaded consensus map from {filename}.")
-'''
-
 def _add_samples_batch(
     self,
     files,
masster/study/merge.py
CHANGED
@@ -340,8 +340,6 @@ def merge(study, **kwargs) -> None:
     - MS2 spectra are automatically linked when link_ms2=True
     - Adduct relationships are identified and stored after merging
     """
-    start_time = time.time()
-
     # Initialize with defaults and override with kwargs
     params = merge_defaults()

@@ -486,10 +484,6 @@ def merge(study, **kwargs) -> None:

     # Finalize merge: filter by min_samples and add isotope/MS2 data
     __finalize_merge(study, params.link_ms2, params.extract_ms1, params.min_samples)
-
-    # Log completion without the misleading feature count
-    elapsed = time.time() - start_time
-    study.logger.debug(f"Merge process completed in {elapsed:.1f}s")


 def _merge_kd(study, params: merge_defaults) -> oms.ConsensusMap:
@@ -3082,9 +3076,9 @@ def __finalize_merge(study, link_ms2, extract_ms1, min_samples):
         # Count tight clusters with specified thresholds
         tight_clusters = _count_tight_clusters(study,mz_tol=0.04, rt_tol=0.3)

-        study.logger.
+        study.logger.success(
             f"Merging completed. Consensus features: {len(study.consensus_df)}. "
-            f"Completeness: {c:.2f}. Tight clusters
+            f"Completeness: {c:.2f}. Tight clusters: {tight_clusters}.",
         )
     else:
         study.logger.warning(