masster 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/sample/adducts.py +1 -1
- masster/sample/h5.py +11 -11
- masster/sample/helpers.py +2 -2
- masster/sample/load.py +10 -8
- masster/sample/processing.py +1 -1
- masster/sample/sample.py +7 -3
- masster/study/defaults/align_def.py +0 -204
- masster/study/defaults/fill_def.py +9 -1
- masster/study/defaults/merge_def.py +20 -69
- masster/study/export.py +25 -5
- masster/study/h5.py +230 -42
- masster/study/helpers.py +430 -53
- masster/study/load.py +986 -158
- masster/study/merge.py +683 -1076
- masster/study/plot.py +95 -73
- masster/study/processing.py +337 -280
- masster/study/study.py +58 -135
- masster/wizard/wizard.py +20 -6
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/METADATA +1 -1
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/RECORD +24 -25
- masster/study/defaults/fill_chrom_def.py +0 -260
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/WHEEL +0 -0
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/entry_points.txt +0 -0
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/licenses/LICENSE +0 -0
masster/_version.py
CHANGED
masster/sample/adducts.py
CHANGED
|
@@ -473,7 +473,7 @@ def find_adducts(self, **kwargs):
|
|
|
473
473
|
self.logger.debug(f"Min probability threshold: {min_probability}")
|
|
474
474
|
|
|
475
475
|
# Generate comprehensive adduct specifications using the Sample method
|
|
476
|
-
adducts_df =
|
|
476
|
+
adducts_df = _get_adducts(self,
|
|
477
477
|
adducts_list=adducts_list,
|
|
478
478
|
charge_min=charge_min,
|
|
479
479
|
charge_max=charge_max,
|
masster/sample/h5.py
CHANGED
|
@@ -62,8 +62,8 @@ def _save_sample5(
|
|
|
62
62
|
return
|
|
63
63
|
|
|
64
64
|
# synchronize feature_map if it exists
|
|
65
|
-
if hasattr(self, "_feature_map") and self._feature_map is not None:
|
|
66
|
-
|
|
65
|
+
#if hasattr(self, "_feature_map") and self._feature_map is not None:
|
|
66
|
+
# self._features_sync()
|
|
67
67
|
|
|
68
68
|
# if no extension is given, add .sample5
|
|
69
69
|
if not filename.endswith(".sample5"):
|
|
@@ -1057,15 +1057,15 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
1057
1057
|
# Parameters are now loaded from metadata JSON (see above)
|
|
1058
1058
|
# Lib and lib_match are no longer saved/loaded
|
|
1059
1059
|
|
|
1060
|
-
if map:
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1060
|
+
#if map:
|
|
1061
|
+
# featureXML = filename.replace(".sample5", ".featureXML")
|
|
1062
|
+
# if os.path.exists(featureXML):
|
|
1063
|
+
# self._load_featureXML(featureXML)
|
|
1064
|
+
# #self._features_sync()
|
|
1065
|
+
# else:
|
|
1066
|
+
# self.logger.warning(
|
|
1067
|
+
# f"Feature XML file {featureXML} not found, skipping loading.",
|
|
1068
|
+
# )
|
|
1069
1069
|
|
|
1070
1070
|
# set self.file_path to *.sample5
|
|
1071
1071
|
self.file_path = filename
|
masster/sample/helpers.py
CHANGED
|
@@ -569,7 +569,7 @@ def select(
|
|
|
569
569
|
self.logger.info(f"Selected features. Features remaining: {len(feats)}")
|
|
570
570
|
return feats
|
|
571
571
|
|
|
572
|
-
|
|
572
|
+
'''
|
|
573
573
|
def _features_sync(self):
|
|
574
574
|
"""
|
|
575
575
|
Synchronizes the cached FeatureMap with features_df.
|
|
@@ -675,7 +675,7 @@ def _features_sync(self):
|
|
|
675
675
|
self.logger.warning("PyOpenMS not available, cannot sync FeatureMap")
|
|
676
676
|
except Exception as e:
|
|
677
677
|
self.logger.error(f"Error during feature synchronization: {e}")
|
|
678
|
-
|
|
678
|
+
'''
|
|
679
679
|
|
|
680
680
|
def features_delete(self, features: list | None = None):
|
|
681
681
|
"""
|
masster/sample/load.py
CHANGED
|
@@ -46,6 +46,7 @@ import polars as pl
|
|
|
46
46
|
from tqdm import tqdm
|
|
47
47
|
|
|
48
48
|
from masster.chromatogram import Chromatogram
|
|
49
|
+
from .h5 import _load_sample5
|
|
49
50
|
from masster.spectrum import Spectrum
|
|
50
51
|
|
|
51
52
|
# Suppress pyOpenMS warnings globally
|
|
@@ -96,13 +97,13 @@ def load(
|
|
|
96
97
|
|
|
97
98
|
# check if file is mzML
|
|
98
99
|
if filename.lower().endswith(".mzml"):
|
|
99
|
-
|
|
100
|
+
_load_mzML(self, filename)
|
|
100
101
|
elif filename.lower().endswith(".wiff") or filename.lower().endswith(".wiff2"):
|
|
101
|
-
|
|
102
|
+
_load_wiff(self, filename)
|
|
102
103
|
elif filename.lower().endswith(".raw"):
|
|
103
|
-
|
|
104
|
+
_load_raw(self, filename)
|
|
104
105
|
elif filename.lower().endswith(".sample5"):
|
|
105
|
-
|
|
106
|
+
_load_sample5(self, filename)
|
|
106
107
|
# elif filename.lower().endswith(".h5"):
|
|
107
108
|
# self._load_h5(filename)
|
|
108
109
|
else:
|
|
@@ -155,13 +156,14 @@ def load_noms1(
|
|
|
155
156
|
|
|
156
157
|
# check if file is mzML
|
|
157
158
|
if filename.lower().endswith(".mzml"):
|
|
158
|
-
|
|
159
|
+
_load_mzML(self, filename)
|
|
159
160
|
elif filename.lower().endswith(".wiff") or filename.lower().endswith(".wiff2"):
|
|
160
|
-
|
|
161
|
+
_load_wiff(self, filename)
|
|
161
162
|
elif filename.lower().endswith(".raw"):
|
|
162
|
-
|
|
163
|
+
_load_raw(self, filename)
|
|
163
164
|
elif filename.lower().endswith(".sample5"):
|
|
164
|
-
|
|
165
|
+
from masster.sample.h5 import _load_sample5_study
|
|
166
|
+
_load_sample5_study(self, filename) # Use optimized version for study loading
|
|
165
167
|
else:
|
|
166
168
|
raise ValueError("File must be .mzML, .wiff, *.raw, or .sample5")
|
|
167
169
|
|
masster/sample/processing.py
CHANGED
masster/sample/sample.py
CHANGED
|
@@ -48,9 +48,9 @@ from masster.sample.defaults.find_ms2_def import find_ms2_defaults
|
|
|
48
48
|
from masster.sample.defaults.get_spectrum_def import get_spectrum_defaults
|
|
49
49
|
|
|
50
50
|
# Sample-specific imports - keeping these private, only for internal use
|
|
51
|
-
|
|
51
|
+
from masster.sample.h5 import _load_sample5
|
|
52
52
|
# from masster.sample.h5 import _load_sample5_study
|
|
53
|
-
|
|
53
|
+
from masster.sample.h5 import _save_sample5
|
|
54
54
|
# from masster.sample.helpers import _delete_ms2
|
|
55
55
|
from masster.sample.helpers import _estimate_memory_usage
|
|
56
56
|
from masster.sample.helpers import _get_scan_uids
|
|
@@ -263,12 +263,16 @@ class Sample:
|
|
|
263
263
|
_get_feature_map = _get_feature_map
|
|
264
264
|
|
|
265
265
|
# Additional method assignments for all imported functions
|
|
266
|
-
# Removed internal-only methods:
|
|
266
|
+
# Removed internal-only methods: _load_sample5_study, _delete_ms2, _features_sync
|
|
267
267
|
_estimate_memory_usage = _estimate_memory_usage
|
|
268
268
|
_get_scan_uids = _get_scan_uids
|
|
269
269
|
_get_feature_uids = _get_feature_uids
|
|
270
270
|
features_delete = features_delete
|
|
271
271
|
features_filter = features_filter
|
|
272
|
+
_save_sample5 = _save_sample5
|
|
273
|
+
_load_sample5 = _load_sample5
|
|
274
|
+
|
|
275
|
+
|
|
272
276
|
# Removed internal-only load methods: _load_featureXML, _load_ms2data, _load_mzML, _load_raw, _load_wiff
|
|
273
277
|
chrom_extract = chrom_extract
|
|
274
278
|
_index_file = _index_file # Renamed from index_file to be internal-only
|
|
masster/study/defaults/align_def.py
CHANGED
|
@@ -24,30 +24,7 @@ class align_defaults:
|
|
|
24
24
|
skip_blanks (bool): Whether to skip blank samples. Default is False.
|
|
25
25
|
|
|
26
26
|
KD algorithm specific parameters:
|
|
27
|
-
min_samples (int): Minimum number of samples required for KD alignment. Default is 3.
|
|
28
|
-
nr_partitions (int): Number of partitions in m/z dimension. Default is 100.
|
|
29
|
-
warp_enabled (bool): Enable non-linear retention time transformation. Default is True.
|
|
30
|
-
warp_rt_tol (float): RT tolerance for the LOWESS fit. Default is 5.0.
|
|
31
27
|
warp_mz_tol (float): m/z tolerance for the LOWESS fit. Default is 0.05.
|
|
32
|
-
warp_max_pairwise_log_fc (float): Maximum absolute log10 fold change threshold for pairing. Default is 0.5.
|
|
33
|
-
warp_min_rel_cc_size (float): Minimum relative connected component size. Default is 0.5.
|
|
34
|
-
warp_max_nr_conflicts (int): Allow up to this many conflicts per connected component for alignment. Default is 0.
|
|
35
|
-
link_rt_tol (float): Width of RT tolerance window for linking (seconds). Default is 30.0.
|
|
36
|
-
link_mz_tol (float): m/z tolerance for linking features (ppm or Da). Default is 10.0.
|
|
37
|
-
link_charge_merging (str): Charge merging strategy for linking. Default is "With_charge_zero".
|
|
38
|
-
link_adduct_merging (str): Adduct merging strategy for linking. Default is "Any".
|
|
39
|
-
distance_RT_exponent (float): Exponent for normalized RT differences. Default is 1.0.
|
|
40
|
-
distance_RT_weight (float): Weight factor for final RT distances. Default is 1.0.
|
|
41
|
-
distance_MZ_exponent (float): Exponent for normalized m/z differences. Default is 2.0.
|
|
42
|
-
distance_MZ_weight (float): Weight factor for final m/z distances. Default is 1.0.
|
|
43
|
-
distance_intensity_exponent (float): Exponent for differences in relative intensity. Default is 1.0.
|
|
44
|
-
distance_intensity_weight (float): Weight factor for final intensity distances. Default is 1.0.
|
|
45
|
-
distance_intensity_log_transform (str): Log-transform intensities. Default is "enabled".
|
|
46
|
-
LOWESS_span (float): Fraction of datapoints for each local regression. Default is 0.666666666666667.
|
|
47
|
-
LOWESS_num_iterations (int): Number of robustifying iterations for LOWESS fitting. Default is 3.
|
|
48
|
-
LOWESS_delta (float): Parameter for LOWESS computations (negative auto-computes). Default is -1.0.
|
|
49
|
-
LOWESS_interpolation_type (str): Method for interpolation between datapoints. Default is "cspline".
|
|
50
|
-
LOWESS_extrapolation_type (str): Method for extrapolation outside data range. Default is "four-point-linear".
|
|
51
28
|
"""
|
|
52
29
|
|
|
53
30
|
rt_tol: float = 5.0
|
|
@@ -60,30 +37,7 @@ class align_defaults:
|
|
|
60
37
|
algorithm: str = "kd"
|
|
61
38
|
|
|
62
39
|
# KD algorithm specific parameters
|
|
63
|
-
min_samples: int = 3
|
|
64
|
-
nr_partitions: int = 100
|
|
65
|
-
warp_enabled: bool = True
|
|
66
|
-
warp_rt_tol: float = 5.0
|
|
67
40
|
warp_mz_tol: float = 0.05
|
|
68
|
-
warp_max_pairwise_log_fc: float = 0.5
|
|
69
|
-
warp_min_rel_cc_size: float = 0.5
|
|
70
|
-
warp_max_nr_conflicts: int = 0
|
|
71
|
-
link_rt_tol: float = 30.0
|
|
72
|
-
link_mz_tol: float = 10.0
|
|
73
|
-
link_charge_merging: str = "With_charge_zero"
|
|
74
|
-
link_adduct_merging: str = "Any"
|
|
75
|
-
distance_RT_exponent: float = 1.0
|
|
76
|
-
distance_RT_weight: float = 1.0
|
|
77
|
-
distance_MZ_exponent: float = 2.0
|
|
78
|
-
distance_MZ_weight: float = 1.0
|
|
79
|
-
distance_intensity_exponent: float = 1.0
|
|
80
|
-
distance_intensity_weight: float = 1.0
|
|
81
|
-
distance_intensity_log_transform: str = "enabled"
|
|
82
|
-
LOWESS_span: float = 0.666666666666667
|
|
83
|
-
LOWESS_num_iterations: int = 3
|
|
84
|
-
LOWESS_delta: float = -1.0
|
|
85
|
-
LOWESS_interpolation_type: str = "cspline"
|
|
86
|
-
LOWESS_extrapolation_type: str = "four-point-linear"
|
|
87
41
|
|
|
88
42
|
_param_metadata: dict[str, dict[str, Any]] = field(
|
|
89
43
|
default_factory=lambda: {
|
|
@@ -139,32 +93,6 @@ class align_defaults:
|
|
|
139
93
|
"allowed_values": ["pc", "kd"],
|
|
140
94
|
},
|
|
141
95
|
# KD algorithm specific parameters
|
|
142
|
-
"min_samples": {
|
|
143
|
-
"dtype": int,
|
|
144
|
-
"description": "Minimum number of samples required for KD alignment algorithm",
|
|
145
|
-
"default": 3,
|
|
146
|
-
"min_value": 2,
|
|
147
|
-
"max_value": 1000,
|
|
148
|
-
},
|
|
149
|
-
"nr_partitions": {
|
|
150
|
-
"dtype": int,
|
|
151
|
-
"description": "Number of partitions in m/z dimension for KD algorithm",
|
|
152
|
-
"default": 100,
|
|
153
|
-
"min_value": 1,
|
|
154
|
-
"max_value": 1000,
|
|
155
|
-
},
|
|
156
|
-
"warp_enabled": {
|
|
157
|
-
"dtype": bool,
|
|
158
|
-
"description": "Enable non-linear retention time transformation for KD algorithm",
|
|
159
|
-
"default": True,
|
|
160
|
-
},
|
|
161
|
-
"warp_rt_tol": {
|
|
162
|
-
"dtype": float,
|
|
163
|
-
"description": "RT tolerance for the LOWESS fit in KD algorithm (seconds)",
|
|
164
|
-
"default": 5.0,
|
|
165
|
-
"min_value": 0.1,
|
|
166
|
-
"max_value": 60.0,
|
|
167
|
-
},
|
|
168
96
|
"warp_mz_tol": {
|
|
169
97
|
"dtype": float,
|
|
170
98
|
"description": "m/z tolerance for the LOWESS fit in KD algorithm (Da)",
|
|
@@ -172,138 +100,6 @@ class align_defaults:
|
|
|
172
100
|
"min_value": 0.001,
|
|
173
101
|
"max_value": 1.0,
|
|
174
102
|
},
|
|
175
|
-
"warp_max_pairwise_log_fc": {
|
|
176
|
-
"dtype": float,
|
|
177
|
-
"description": "Maximum absolute log10 fold change between two compatible signals during compatibility graph construction in KD algorithm",
|
|
178
|
-
"default": 0.5,
|
|
179
|
-
"min_value": -1.0,
|
|
180
|
-
"max_value": 10.0,
|
|
181
|
-
},
|
|
182
|
-
"warp_min_rel_cc_size": {
|
|
183
|
-
"dtype": float,
|
|
184
|
-
"description": "Minimum relative connected component size for KD algorithm",
|
|
185
|
-
"default": 0.5,
|
|
186
|
-
"min_value": 0.0,
|
|
187
|
-
"max_value": 1.0,
|
|
188
|
-
},
|
|
189
|
-
"warp_max_nr_conflicts": {
|
|
190
|
-
"dtype": int,
|
|
191
|
-
"description": "Allow up to this many conflicts (features from the same map) per connected component to be used for alignment (-1 means allow any number of conflicts)",
|
|
192
|
-
"default": 0,
|
|
193
|
-
"min_value": -1,
|
|
194
|
-
"max_value": 1000,
|
|
195
|
-
},
|
|
196
|
-
"link_rt_tol": {
|
|
197
|
-
"dtype": float,
|
|
198
|
-
"description": "Width of RT tolerance window for linking in KD algorithm (seconds)",
|
|
199
|
-
"default": 30.0,
|
|
200
|
-
"min_value": 0.0,
|
|
201
|
-
"max_value": 300.0,
|
|
202
|
-
},
|
|
203
|
-
"link_mz_tol": {
|
|
204
|
-
"dtype": float,
|
|
205
|
-
"description": "m/z tolerance for linking features in KD algorithm (ppm or Da)",
|
|
206
|
-
"default": 10.0,
|
|
207
|
-
"min_value": 0.0,
|
|
208
|
-
"max_value": 100.0,
|
|
209
|
-
},
|
|
210
|
-
"link_charge_merging": {
|
|
211
|
-
"dtype": str,
|
|
212
|
-
"description": "Charge merging strategy for linking features in KD algorithm",
|
|
213
|
-
"default": "With_charge_zero",
|
|
214
|
-
"allowed_values": ["Identical", "With_charge_zero", "Any"],
|
|
215
|
-
},
|
|
216
|
-
"link_adduct_merging": {
|
|
217
|
-
"dtype": str,
|
|
218
|
-
"description": "Adduct merging strategy for linking features in KD algorithm",
|
|
219
|
-
"default": "Any",
|
|
220
|
-
"allowed_values": ["Identical", "With_unknown_adducts", "Any"],
|
|
221
|
-
},
|
|
222
|
-
"distance_RT_exponent": {
|
|
223
|
-
"dtype": float,
|
|
224
|
-
"description": "Normalized RT differences are raised to this power in KD algorithm",
|
|
225
|
-
"default": 1.0,
|
|
226
|
-
"min_value": 0.0,
|
|
227
|
-
"max_value": 10.0,
|
|
228
|
-
},
|
|
229
|
-
"distance_RT_weight": {
|
|
230
|
-
"dtype": float,
|
|
231
|
-
"description": "Final RT distances are weighted by this factor in KD algorithm",
|
|
232
|
-
"default": 1.0,
|
|
233
|
-
"min_value": 0.0,
|
|
234
|
-
"max_value": 100.0,
|
|
235
|
-
},
|
|
236
|
-
"distance_MZ_exponent": {
|
|
237
|
-
"dtype": float,
|
|
238
|
-
"description": "Normalized m/z differences are raised to this power in KD algorithm",
|
|
239
|
-
"default": 2.0,
|
|
240
|
-
"min_value": 0.0,
|
|
241
|
-
"max_value": 10.0,
|
|
242
|
-
},
|
|
243
|
-
"distance_MZ_weight": {
|
|
244
|
-
"dtype": float,
|
|
245
|
-
"description": "Final m/z distances are weighted by this factor in KD algorithm",
|
|
246
|
-
"default": 1.0,
|
|
247
|
-
"min_value": 0.0,
|
|
248
|
-
"max_value": 100.0,
|
|
249
|
-
},
|
|
250
|
-
"distance_intensity_exponent": {
|
|
251
|
-
"dtype": float,
|
|
252
|
-
"description": "Differences in relative intensity are raised to this power in KD algorithm",
|
|
253
|
-
"default": 1.0,
|
|
254
|
-
"min_value": 0.0,
|
|
255
|
-
"max_value": 10.0,
|
|
256
|
-
},
|
|
257
|
-
"distance_intensity_weight": {
|
|
258
|
-
"dtype": float,
|
|
259
|
-
"description": "Final intensity distances are weighted by this factor in KD algorithm",
|
|
260
|
-
"default": 1.0,
|
|
261
|
-
"min_value": 0.0,
|
|
262
|
-
"max_value": 100.0,
|
|
263
|
-
},
|
|
264
|
-
"distance_intensity_log_transform": {
|
|
265
|
-
"dtype": str,
|
|
266
|
-
"description": "Log-transform intensities in KD algorithm distance calculation",
|
|
267
|
-
"default": "enabled",
|
|
268
|
-
"allowed_values": ["enabled", "disabled"],
|
|
269
|
-
},
|
|
270
|
-
"LOWESS_span": {
|
|
271
|
-
"dtype": float,
|
|
272
|
-
"description": "Fraction of datapoints to use for each local regression in LOWESS fitting",
|
|
273
|
-
"default": 0.666666666666667,
|
|
274
|
-
"min_value": 0.0,
|
|
275
|
-
"max_value": 1.0,
|
|
276
|
-
},
|
|
277
|
-
"LOWESS_num_iterations": {
|
|
278
|
-
"dtype": int,
|
|
279
|
-
"description": "Number of robustifying iterations for LOWESS fitting",
|
|
280
|
-
"default": 3,
|
|
281
|
-
"min_value": 0,
|
|
282
|
-
"max_value": 10,
|
|
283
|
-
},
|
|
284
|
-
"LOWESS_delta": {
|
|
285
|
-
"dtype": float,
|
|
286
|
-
"description": "Nonnegative parameter for LOWESS computations (negative value auto-computes)",
|
|
287
|
-
"default": -1.0,
|
|
288
|
-
"min_value": -1.0,
|
|
289
|
-
"max_value": 1000.0,
|
|
290
|
-
},
|
|
291
|
-
"LOWESS_interpolation_type": {
|
|
292
|
-
"dtype": str,
|
|
293
|
-
"description": "Method to use for interpolation between datapoints computed by LOWESS",
|
|
294
|
-
"default": "cspline",
|
|
295
|
-
"allowed_values": ["linear", "cspline", "akima"],
|
|
296
|
-
},
|
|
297
|
-
"LOWESS_extrapolation_type": {
|
|
298
|
-
"dtype": str,
|
|
299
|
-
"description": "Method to use for extrapolation outside the data range in LOWESS",
|
|
300
|
-
"default": "four-point-linear",
|
|
301
|
-
"allowed_values": [
|
|
302
|
-
"two-point-linear",
|
|
303
|
-
"four-point-linear",
|
|
304
|
-
"global-linear",
|
|
305
|
-
],
|
|
306
|
-
},
|
|
307
103
|
},
|
|
308
104
|
repr=False,
|
|
309
105
|
)
|
|
masster/study/defaults/fill_def.py
CHANGED
|
@@ -21,10 +21,11 @@ class fill_defaults:
|
|
|
21
21
|
"""
|
|
22
22
|
|
|
23
23
|
uids: Optional[list] = None
|
|
24
|
-
mz_tol: float = 0.
|
|
24
|
+
mz_tol: float = 0.050
|
|
25
25
|
rt_tol: float = 10.0
|
|
26
26
|
min_samples_rel: float = 0.00
|
|
27
27
|
min_samples_abs: int = 5
|
|
28
|
+
threads: int = 6
|
|
28
29
|
|
|
29
30
|
_param_metadata: dict[str, dict[str, Any]] = field(
|
|
30
31
|
default_factory=lambda: {
|
|
@@ -61,6 +62,13 @@ class fill_defaults:
|
|
|
61
62
|
"min_value": 0,
|
|
62
63
|
"max_value": 100,
|
|
63
64
|
},
|
|
65
|
+
"threads": {
|
|
66
|
+
"dtype": int,
|
|
67
|
+
"description": "Number of parallel threads",
|
|
68
|
+
"default": 6,
|
|
69
|
+
"min_value": 1,
|
|
70
|
+
"max_value": 32,
|
|
71
|
+
},
|
|
64
72
|
},
|
|
65
73
|
repr=False,
|
|
66
74
|
)
|
|
masster/study/defaults/merge_def.py
CHANGED
|
@@ -13,7 +13,7 @@ class merge_defaults:
|
|
|
13
13
|
method selection, grouping tolerances, and algorithm-specific parameters.
|
|
14
14
|
|
|
15
15
|
Attributes:
|
|
16
|
-
method (str): Merge method to use ('kd', 'qt', '
|
|
16
|
+
method (str): Merge method to use ('kd', 'qt', 'kd_chunked', 'qt_chunked'). Default is "kd".
|
|
17
17
|
min_samples (int): Minimum number of samples for a consensus feature. Default is 50.
|
|
18
18
|
rt_tol (float): RT tolerance for grouping (seconds). Default is 2.0.
|
|
19
19
|
mz_tol (float): m/z tolerance for grouping (Da for all methods). Default is 0.01.
|
|
@@ -25,38 +25,31 @@ class merge_defaults:
|
|
|
25
25
|
link_ms2 (bool): Whether to link MS2 spectra to consensus features. Default is True.
|
|
26
26
|
"""
|
|
27
27
|
|
|
28
|
-
method: str = "
|
|
28
|
+
method: str = "kd"
|
|
29
29
|
min_samples: int = 2
|
|
30
30
|
rt_tol: float = 5.0
|
|
31
|
-
mz_tol: float = 0.
|
|
31
|
+
mz_tol: float = 0.05
|
|
32
32
|
chunk_size: int = 500
|
|
33
33
|
nr_partitions: int = 1000
|
|
34
34
|
min_rel_cc_size: float = 0.1
|
|
35
35
|
max_pairwise_log_fc: float = -1.0
|
|
36
36
|
max_nr_conflicts: int = 0
|
|
37
37
|
link_ms2: bool = True
|
|
38
|
+
extract_ms1: bool = True
|
|
39
|
+
|
|
40
|
+
# Cross-chunk merging parameters
|
|
41
|
+
dechunking: str = "hierarchical"
|
|
38
42
|
|
|
39
43
|
# Parallel processing parameters
|
|
40
44
|
threads: Optional[int] = None
|
|
41
|
-
|
|
42
|
-
# KD-Strict specific parameters
|
|
43
|
-
optimize_rt_tol: bool = False
|
|
44
|
-
rt_tol_range: tuple = (0.5, 4.0)
|
|
45
|
-
rt_tol_steps: int = 7
|
|
46
|
-
secondary_merge_rt_tol: float = 1.0
|
|
47
|
-
secondary_merge_mz_tol: float = 0.005
|
|
48
|
-
min_sample_overlap: float = 0.8
|
|
49
|
-
max_rt_spread: float = 2.0 # Will default to 2x rt_tol
|
|
50
|
-
min_coherence: float = 0.0
|
|
51
45
|
|
|
52
46
|
_param_metadata: dict[str, dict[str, Any]] = field(
|
|
53
47
|
default_factory=lambda: {
|
|
54
48
|
"method": {
|
|
55
49
|
"dtype": str,
|
|
56
50
|
"description": "Merge method (algorithm) to use",
|
|
57
|
-
"default": "
|
|
58
|
-
"allowed_values": ["
|
|
59
|
-
"kd", "kd-nowarp", "kd_nowarp", "kd-strict", "kd_strict",
|
|
51
|
+
"default": "kd",
|
|
52
|
+
"allowed_values": ["kd", "qt",
|
|
60
53
|
"kd_chunked", "kd-chunked", "qt_chunked", "qt-chunked"],
|
|
61
54
|
},
|
|
62
55
|
"min_samples": {
|
|
@@ -118,7 +111,17 @@ class merge_defaults:
|
|
|
118
111
|
"description": "Whether to link MS2 spectra to consensus features",
|
|
119
112
|
"default": True,
|
|
120
113
|
},
|
|
121
|
-
|
|
114
|
+
"extract_ms1": {
|
|
115
|
+
"dtype": bool,
|
|
116
|
+
"description": "Whether to extract MS1 chromatograms for consensus features",
|
|
117
|
+
"default": True,
|
|
118
|
+
},
|
|
119
|
+
"dechunking": {
|
|
120
|
+
"dtype": str,
|
|
121
|
+
"description": "Cross-chunk merging algorithm for chunked methods",
|
|
122
|
+
"default": "hierarchical",
|
|
123
|
+
"allowed_values": ["hierarchical", "kdtree"],
|
|
124
|
+
},
|
|
122
125
|
"threads": {
|
|
123
126
|
"dtype": [int, type(None)],
|
|
124
127
|
"description": "Number of parallel threads/processes for chunked methods (None=original sequential)",
|
|
@@ -126,58 +129,6 @@ class merge_defaults:
|
|
|
126
129
|
"min_value": 1,
|
|
127
130
|
"max_value": 32,
|
|
128
131
|
},
|
|
129
|
-
# KD-Strict specific parameters
|
|
130
|
-
"optimize_rt_tol": {
|
|
131
|
-
"dtype": bool,
|
|
132
|
-
"description": "Enable RT tolerance optimization for kd-strict method",
|
|
133
|
-
"default": False,
|
|
134
|
-
},
|
|
135
|
-
"rt_tol_range": {
|
|
136
|
-
"dtype": tuple,
|
|
137
|
-
"description": "RT tolerance range for optimization (min, max) in seconds",
|
|
138
|
-
"default": (0.8, 2.0),
|
|
139
|
-
},
|
|
140
|
-
"rt_tol_steps": {
|
|
141
|
-
"dtype": int,
|
|
142
|
-
"description": "Number of steps for RT tolerance optimization",
|
|
143
|
-
"default": 5,
|
|
144
|
-
"min_value": 3,
|
|
145
|
-
"max_value": 20,
|
|
146
|
-
},
|
|
147
|
-
"secondary_merge_rt_tol": {
|
|
148
|
-
"dtype": float,
|
|
149
|
-
"description": "RT tolerance for secondary clustering in kd-strict (seconds)",
|
|
150
|
-
"default": 0.5,
|
|
151
|
-
"min_value": 0.1,
|
|
152
|
-
"max_value": 5.0,
|
|
153
|
-
},
|
|
154
|
-
"secondary_merge_mz_tol": {
|
|
155
|
-
"dtype": float,
|
|
156
|
-
"description": "m/z tolerance for secondary clustering in kd-strict (Da)",
|
|
157
|
-
"default": 0.005,
|
|
158
|
-
"min_value": 0.001,
|
|
159
|
-
"max_value": 0.1,
|
|
160
|
-
},
|
|
161
|
-
"min_sample_overlap": {
|
|
162
|
-
"dtype": float,
|
|
163
|
-
"description": "Minimum sample overlap ratio for merging features (0.0-1.0)",
|
|
164
|
-
"default": 0.8,
|
|
165
|
-
"min_value": 0.0,
|
|
166
|
-
"max_value": 1.0,
|
|
167
|
-
},
|
|
168
|
-
"max_rt_spread": {
|
|
169
|
-
"dtype": float,
|
|
170
|
-
"description": "Maximum allowed RT spread in seconds (None = 3x rt_tol)",
|
|
171
|
-
"default": None,
|
|
172
|
-
"min_value": 0.1,
|
|
173
|
-
},
|
|
174
|
-
"min_coherence": {
|
|
175
|
-
"dtype": float,
|
|
176
|
-
"description": "Minimum chromatographic coherence score (0.0 = disabled)",
|
|
177
|
-
"default": 0.0,
|
|
178
|
-
"min_value": 0.0,
|
|
179
|
-
"max_value": 1.0,
|
|
180
|
-
},
|
|
181
132
|
},
|
|
182
133
|
repr=False,
|
|
183
134
|
)
|
masster/study/export.py
CHANGED
|
@@ -551,7 +551,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
551
551
|
else:
|
|
552
552
|
self.logger.info("No identification data available for mzTab export")
|
|
553
553
|
except Exception as e:
|
|
554
|
-
self.logger.
|
|
554
|
+
self.logger.debug(f"Could not retrieve identification data: {e}")
|
|
555
555
|
id_data = None
|
|
556
556
|
top_id_data = None
|
|
557
557
|
full_id_data = None
|
|
@@ -1190,10 +1190,12 @@ def export_xlsx(self, filename: str | None = None) -> None:
|
|
|
1190
1190
|
"""
|
|
1191
1191
|
Export the study data to an Excel workbook with multiple worksheets.
|
|
1192
1192
|
|
|
1193
|
-
The Excel file contains
|
|
1194
|
-
-
|
|
1195
|
-
-
|
|
1193
|
+
The Excel file contains five worksheets:
|
|
1194
|
+
- samples: Samples dataframe
|
|
1195
|
+
- consensus: Consensus features dataframe
|
|
1196
1196
|
- identification: Identification results with library annotations (get_id)
|
|
1197
|
+
- gaps: Gaps matrix showing filled vs non-filled features (get_gaps_matrix)
|
|
1198
|
+
- matrix: Consensus matrix with samples as columns (get_consensus_matrix)
|
|
1197
1199
|
|
|
1198
1200
|
Args:
|
|
1199
1201
|
filename (str, optional): Path to the output Excel file. Defaults to "study.xlsx"
|
|
@@ -1263,7 +1265,25 @@ def export_xlsx(self, filename: str | None = None) -> None:
|
|
|
1263
1265
|
f"Error getting identification data: {e}. Skipping identification worksheet.",
|
|
1264
1266
|
)
|
|
1265
1267
|
|
|
1266
|
-
# 4.
|
|
1268
|
+
# 4. Gaps matrix (filled vs non-filled features)
|
|
1269
|
+
try:
|
|
1270
|
+
gaps_df = self.get_gaps_matrix()
|
|
1271
|
+
if gaps_df is not None and not gaps_df.is_empty():
|
|
1272
|
+
gaps_pandas = gaps_df.to_pandas()
|
|
1273
|
+
worksheets["gaps"] = gaps_pandas
|
|
1274
|
+
self.logger.debug(
|
|
1275
|
+
f"Added gaps worksheet with {len(gaps_pandas)} rows",
|
|
1276
|
+
)
|
|
1277
|
+
else:
|
|
1278
|
+
self.logger.warning(
|
|
1279
|
+
"get_gaps_matrix() returned empty data, skipping gaps worksheet",
|
|
1280
|
+
)
|
|
1281
|
+
except Exception as e:
|
|
1282
|
+
self.logger.debug(
|
|
1283
|
+
f"Error getting gaps data: {e}. Skipping gaps worksheet.",
|
|
1284
|
+
)
|
|
1285
|
+
|
|
1286
|
+
# 5. Consensus matrix (last worksheet)
|
|
1267
1287
|
try:
|
|
1268
1288
|
matrix_df = self.get_consensus_matrix()
|
|
1269
1289
|
if matrix_df is not None and not matrix_df.is_empty():
|