masster 0.3.19-py3-none-any.whl → 0.3.20-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/__init__.py +2 -0
- masster/_version.py +1 -1
- masster/data/libs/README.md +17 -0
- masster/data/libs/ccm.py +533 -0
- masster/data/libs/central_carbon_README.md +17 -0
- masster/data/libs/central_carbon_metabolites.csv +120 -0
- masster/data/libs/urine.py +333 -0
- masster/data/libs/urine_metabolites.csv +51 -0
- masster/sample/lib.py +32 -25
- masster/sample/load.py +7 -1
- masster/sample/plot.py +111 -26
- masster/study/helpers.py +230 -6
- masster/study/plot.py +457 -182
- masster/study/study.py +4 -0
- {masster-0.3.19.dist-info → masster-0.3.20.dist-info}/METADATA +1 -1
- {masster-0.3.19.dist-info → masster-0.3.20.dist-info}/RECORD +24 -18
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- {masster-0.3.19.dist-info → masster-0.3.20.dist-info}/WHEEL +0 -0
- {masster-0.3.19.dist-info → masster-0.3.20.dist-info}/entry_points.txt +0 -0
- {masster-0.3.19.dist-info → masster-0.3.20.dist-info}/licenses/LICENSE +0 -0
masster/sample/plot.py
CHANGED
@@ -141,6 +141,110 @@ def _display_plot(plot_object, layout=None):
     return None
 
 
+def _handle_sample_plot_output(self, plot_obj, filename=None, plot_type="bokeh"):
+    """
+    Helper function to handle consistent save/display behavior for sample plots.
+
+    Parameters:
+        plot_obj: The plot object (bokeh figure, holoviews layout, or panel object)
+        filename: Optional filename to save the plot
+        plot_type: Type of plot object ("bokeh", "panel", "holoviews")
+    """
+    if filename is not None:
+        # Convert relative paths to absolute paths using sample folder as base
+        import os
+        if hasattr(self, 'folder') and self.folder and not os.path.isabs(filename):
+            filename = os.path.join(self.folder, filename)
+
+        # Convert to absolute path for logging
+        abs_filename = os.path.abspath(filename)
+
+        if filename.endswith(".html"):
+            if plot_type == "panel":
+                plot_obj.save(filename, embed=True)  # type: ignore[attr-defined]
+            elif plot_type == "holoviews":
+                import panel
+                panel.panel(plot_obj).save(filename, embed=True)  # type: ignore[attr-defined]
+            elif plot_type == "bokeh":
+                from bokeh.plotting import output_file
+                from bokeh.io import save
+                output_file(filename)
+                save(plot_obj)
+            self.logger.info(f"Plot saved to: {abs_filename}")
+        elif filename.endswith(".png"):
+            try:
+                if plot_type == "bokeh":
+                    from bokeh.io.export import export_png
+                    export_png(plot_obj, filename=filename)
+                elif plot_type in ["panel", "holoviews"]:
+                    import holoviews as hv
+                    hv.save(plot_obj, filename, fmt="png")
+                self.logger.info(f"Plot saved to: {abs_filename}")
+            except Exception:
+                # Fall back to HTML if PNG export not available
+                html_filename = filename.replace('.png', '.html')
+                abs_html_filename = os.path.abspath(html_filename)
+                if plot_type == "panel":
+                    plot_obj.save(html_filename, embed=True)  # type: ignore[attr-defined]
+                elif plot_type == "holoviews":
+                    import panel
+                    panel.panel(plot_obj).save(html_filename, embed=True)  # type: ignore[attr-defined]
+                elif plot_type == "bokeh":
+                    from bokeh.plotting import output_file
+                    from bokeh.io import save
+                    output_file(html_filename)
+                    save(plot_obj)
+                self.logger.warning(f"PNG export not available, saved as HTML instead: {abs_html_filename}")
+        elif filename.endswith(".pdf"):
+            # Try to save as PDF, fall back to HTML if not available
+            try:
+                if plot_type == "bokeh":
+                    from bokeh.io.export import export_pdf
+                    export_pdf(plot_obj, filename=filename)
+                elif plot_type in ["panel", "holoviews"]:
+                    import holoviews as hv
+                    hv.save(plot_obj, filename, fmt="pdf")
+                self.logger.info(f"Plot saved to: {abs_filename}")
+            except ImportError:
+                # Fall back to HTML if PDF export not available
+                html_filename = filename.replace('.pdf', '.html')
+                abs_html_filename = os.path.abspath(html_filename)
+                if plot_type == "panel":
+                    plot_obj.save(html_filename, embed=True)  # type: ignore[attr-defined]
+                elif plot_type == "holoviews":
+                    import panel
+                    panel.panel(plot_obj).save(html_filename, embed=True)  # type: ignore[attr-defined]
+                elif plot_type == "bokeh":
+                    from bokeh.plotting import output_file
+                    from bokeh.io import save
+                    output_file(html_filename)
+                    save(plot_obj)
+                self.logger.warning(f"PDF export not available, saved as HTML instead: {abs_html_filename}")
+        else:
+            # Default to HTML for unknown extensions
+            if plot_type == "panel":
+                plot_obj.save(filename, embed=True)  # type: ignore[attr-defined]
+            elif plot_type == "holoviews":
+                import panel
+                panel.panel(plot_obj).save(filename, embed=True)  # type: ignore[attr-defined]
+            elif plot_type == "bokeh":
+                from bokeh.plotting import output_file
+                from bokeh.io import save
+                output_file(filename)
+                save(plot_obj)
+            self.logger.info(f"Plot saved to: {abs_filename}")
+    else:
+        # Show in notebook when no filename provided
+        if plot_type == "panel":
+            plot_obj.show()  # type: ignore[attr-defined]
+        elif plot_type == "holoviews":
+            import panel
+            return panel.panel(plot_obj)
+        elif plot_type == "bokeh":
+            from bokeh.plotting import show
+            show(plot_obj)
+
+
 def plot_chrom(
     self,
     feature_uid=None,
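The helper above centralizes the save/display dispatch that was previously duplicated in each plot method: it resolves relative filenames against the sample folder, selects a writer from the file extension and plot_type, and falls back to HTML when PNG or PDF export is unavailable (bokeh's export_png/export_pdf need selenium plus a browser driver). A minimal sketch of exercising it directly, assuming `sample` is a loaded Sample instance; calling the underscore-prefixed helper by hand is for illustration only, since the public plot methods invoke it internally:

    import holoviews as hv

    curve = hv.Curve([(0, 0), (1, 1), (2, 4)])
    # Relative path: resolved against sample.folder, saved via panel with embed=True
    sample._handle_sample_plot_output(curve, "example.html", plot_type="holoviews")
    # No filename: returns a panel object for notebook display
    obj = sample._handle_sample_plot_output(curve, plot_type="holoviews")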
@@ -271,16 +375,9 @@ def plot_chrom(
 
     layout = layout.cols(1)
     layout = panel.Column(layout)
-
-
-
-            layout.save(filename, embed=True)
-        else:
-            # save the panel layout as a png
-            hv.save(layout, filename, fmt="png")
-    else:
-        # Check if we're in a notebook environment and display appropriately
-        return _display_plot(layout.object, layout)
+
+    # Use consistent save/display behavior
+    self._handle_sample_plot_output(layout, filename, "panel")
 
 
 def plot_2d(
@@ -830,12 +927,8 @@ def plot_2d(
     layout = panel.Column(overlay)
 
     if filename is not None:
-        #
-
-            layout.save(filename, embed=True)
-        else:
-            # save the panel layout as a png
-            hv.save(overlay, filename, fmt="png")
+        # Use consistent save/display behavior
+        self._handle_sample_plot_output(layout, filename, "panel")
         return None
     else:
         # Check if we're in a notebook environment and display appropriately
@@ -1960,16 +2053,8 @@ def plot_feature_stats(
     # Arrange the plots in a layout with three columns
     layout = hv.Layout(density_plots).cols(3).opts(shared_axes=False)
 
-        #
-
-        if filename.endswith(".html"):
-            panel.panel(layout).save(filename, embed=True)  # type: ignore[attr-defined]
-        else:
-            hv.save(layout, filename, fmt="png")
-    else:
-        # Check if we're in a notebook environment and display appropriately
-        layout_obj = panel.panel(layout)
-        return _display_plot(layout, layout_obj)
+    # Use consistent save/display behavior
+    self._handle_sample_plot_output(layout, filename, "holoviews")
 
 
 def plot_tic(
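The three hunks above replace per-method save logic with calls to the shared helper, so the filename extension alone now selects the output format across plot methods. A hedged usage sketch (assuming a loaded Sample `sample`; the methods' other parameters are omitted):

    sample.plot_chrom(filename="chrom.html")         # panel .save(..., embed=True)
    sample.plot_2d(filename="map.png")               # PNG, HTML fallback if export deps are missing
    sample.plot_feature_stats(filename="stats.pdf")  # PDF attempt, HTML fallback on ImportError
    sample.plot_2d()                                 # no filename: displayed in the notebook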
masster/study/helpers.py
CHANGED
@@ -969,13 +969,20 @@ def restore_features(self, samples=None, maps=False):
         # Load sample to get its features_df
         # Use a direct load call with map=False to prevent feature synchronization
         # which would remove filled features that don't exist in the original FeatureMap
-
+        # Use ERROR log level to suppress info messages
+        sample = Sample(log_level="ERROR")
         sample._load_sample5(sample_path, map=False)
 
         if sample.features_df is None or sample.features_df.is_empty():
             self.logger.warning(f"No features found in sample {sample_name}")
             continue
 
+        # Check which columns are actually available in the sample
+        available_columns = [col for col in columns_to_update if col in sample.features_df.columns]
+        if not available_columns:
+            self.logger.debug(f"No target columns found in sample {sample_name}")
+            continue
+
         # Create update data for this sample
         updates_made = 0
         for row in sample.features_df.iter_rows(named=True):
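The new `available_columns` guard is a plain intersection between the study's update targets and the columns a sample file actually carries, so restore_features() now skips rather than fails on samples written without those columns. A self-contained sketch of the pattern with toy data (`columns_to_update` is defined earlier in restore_features; the frame and column names here are illustrative):

    import polars as pl

    columns_to_update = ["chrom", "ms2_specs", "ms2_scans"]
    features_df = pl.DataFrame({"feature_id": [1, 2], "chrom": [None, None]})

    available_columns = [c for c in columns_to_update if c in features_df.columns]
    print(available_columns)  # ['chrom'] - absent columns are skipped, not an error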
@@ -987,8 +994,8 @@ def restore_features(self, samples=None, maps=False):
             if key in study_feature_mapping:
                 feature_uid = study_feature_mapping[key]
 
-                # Update the
-                for col in
+                # Update only the available columns in study.features_df
+                for col in available_columns:
                     if col in row and col in self.features_df.columns:
                         # Get the original column dtype to preserve it
                         original_dtype = self.features_df[col].dtype
@@ -1013,7 +1020,8 @@ def restore_features(self, samples=None, maps=False):
                         )
                         updates_made += 1
 
-
+        if updates_made > 0:
+            self.logger.debug(f"Updated {updates_made} features from sample {sample_name}")
 
         # If maps is True, load featureXML data
         if maps:
@@ -1096,13 +1104,18 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
 
         try:
             # Load sample (with map=False to prevent feature synchronization)
-
+            # Use ERROR log level to suppress info messages
+            sample = Sample(log_level="ERROR")
             sample._load_sample5(sample_path, map=False)
 
             if sample.features_df is None or sample.features_df.is_empty():
                 self.logger.warning(f"No features found in sample {sample_name}")
                 continue
 
+            # Check if chrom column exists in sample
+            if "chrom" not in sample.features_df.columns:
+                continue
+
             # Update chromatograms from this sample
             for row in sample.features_df.iter_rows(named=True):
                 feature_id = row.get("feature_id")
@@ -1183,7 +1196,8 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
 
         try:
             # Load sample for MS1 data extraction
-
+            # Use ERROR log level to suppress info messages
+            sample = Sample(log_level="ERROR")
             sample._load_sample5(sample_path, map=False)
 
             if not hasattr(sample, "ms1_df") or sample.ms1_df is None or sample.ms1_df.is_empty():
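All three loaders now construct the throwaway sample as Sample(log_level="ERROR"), which simply raises the per-sample logger threshold so bulk restore loops stop emitting one INFO line per file. The equivalent idea with the standard library, purely for illustration:

    import logging

    logger = logging.getLogger("masster.sample")  # logger name is illustrative
    logger.setLevel(logging.ERROR)
    logger.info("loaded sample")     # suppressed during bulk restore
    logger.error("failed to load")   # still emitted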
@@ -3233,3 +3247,213 @@ def migrate_map_id_to_index(self):
 
     self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
     self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")
+
+
+def restore_ms2(self, samples=None, **kwargs):
+    """
+    Restore MS2 data by re-running find_ms2 on specified samples.
+
+    This function rebuilds the consensus_ms2 DataFrame by re-extracting MS2 spectra
+    from the original sample files. Use this to reverse the effects of compress_ms2().
+
+    Parameters:
+        samples (list, optional): List of sample_uids or sample_names to process.
+            If None, processes all samples.
+        **kwargs: Additional keyword arguments passed to find_ms2()
+            (e.g., mz_tol, centroid, deisotope, etc.)
+    """
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.error("No features_df found in study.")
+        return
+
+    if self.samples_df is None or self.samples_df.is_empty():
+        self.logger.error("No samples_df found in study.")
+        return
+
+    # Get sample_uids to process
+    sample_uids = self._get_sample_uids(samples)
+    if not sample_uids:
+        self.logger.warning("No valid samples specified.")
+        return
+
+    self.logger.info(f"Restoring MS2 data from {len(sample_uids)} samples...")
+
+    # Clear existing consensus_ms2 to rebuild from scratch
+    initial_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+    self.consensus_ms2 = pl.DataFrame()
+
+    # Re-run find_ms2 which will rebuild consensus_ms2
+    try:
+        self.find_ms2(**kwargs)
+
+        final_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+
+        self.logger.info(f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra")
+
+    except Exception as e:
+        self.logger.error(f"Failed to restore MS2 data: {e}")
+        raise
+
+
+def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs):
+    """
+    Reverse any compression effects by restoring compressed data adaptively.
+
+    This function restores data that was compressed using compress(), compress_features(),
+    compress_ms2(), compress_chrom(), or study.save(compress=True). It optimizes the
+    decompression process for speed by only processing what actually needs restoration.
+
+    Parameters:
+        features (bool): Restore features data (ms2_specs, ms2_scans, chrom_area)
+        ms2 (bool): Restore MS2 spectra by re-running find_ms2()
+        chrom (bool): Restore chromatogram objects
+        samples (list, optional): List of sample_uids or sample_names to process.
+            If None, processes all samples.
+        **kwargs: Additional keyword arguments for restoration functions:
+            - For restore_chrom: mz_tol (default: 0.010), rt_tol (default: 10.0)
+            - For restore_ms2/find_ms2: mz_tol, centroid, deisotope, etc.
+
+    Performance Optimizations:
+        - Adaptive processing: Only restores what actually needs restoration
+        - Processes features and chromatograms together when possible (shared file I/O)
+        - Uses cached sample instances to avoid repeated file loading
+        - Processes MS2 restoration last as it's the most computationally expensive
+        - Provides detailed progress information for long-running operations
+
+    Example:
+        # Restore everything (but only what needs restoration)
+        study.decompress()
+
+        # Restore only chromatograms with custom tolerances
+        study.decompress(features=False, ms2=False, chrom=True, mz_tol=0.005, rt_tol=5.0)
+
+        # Restore specific samples only
+        study.decompress(samples=["sample1", "sample2"])
+    """
+    if not any([features, ms2, chrom]):
+        self.logger.warning("No decompression operations specified.")
+        return
+
+    # Get sample_uids to process
+    sample_uids = self._get_sample_uids(samples)
+    if not sample_uids:
+        self.logger.warning("No valid samples specified.")
+        return
+
+    # Adaptively check what actually needs to be done
+    import polars as pl
+
+    # Check if features need restoration (more sophisticated logic)
+    features_need_restoration = False
+    if features and not self.features_df.is_empty():
+        # Check for completely missing columns that should exist after feature processing
+        missing_cols = []
+        for col in ["ms2_scans", "ms2_specs"]:
+            if col not in self.features_df.columns:
+                missing_cols.append(col)
+
+        # If columns are missing entirely, we likely need restoration
+        if missing_cols:
+            features_need_restoration = True
+        else:
+            # If columns exist, check if they're mostly null (indicating compression)
+            # But be smart about it - only check if we have consensus features with MS2
+            if not self.consensus_ms2.is_empty():
+                # We have MS2 data, so ms2_specs should have some content
+                null_ms2_specs = self.features_df.filter(pl.col("ms2_specs").is_null()).height
+                total_features = len(self.features_df)
+                # If more than 90% are null but we have MS2 data, likely compressed
+                if null_ms2_specs > (total_features * 0.9):
+                    features_need_restoration = True
+
+    # Check if chromatograms need restoration
+    chrom_need_restoration = False
+    if chrom and not self.features_df.is_empty():
+        if "chrom" not in self.features_df.columns:
+            # Column completely missing
+            chrom_need_restoration = True
+        else:
+            null_chroms = self.features_df.filter(pl.col("chrom").is_null()).height
+            total_features = len(self.features_df)
+            # If more than 50% are null, likely need restoration
+            chrom_need_restoration = null_chroms > (total_features * 0.5)
+
+    # Check if MS2 data might need restoration (compare expected vs actual)
+    ms2_need_restoration = False
+    if ms2:
+        current_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+        consensus_count = len(self.consensus_df) if not self.consensus_df.is_empty() else 0
+
+        if consensus_count > 0:
+            # Calculate expected MS2 count based on consensus features with MS2 potential
+            # This is a heuristic - if we have very few MS2 compared to consensus, likely compressed
+            expected_ratio = 3.0  # Expect at least 3 MS2 per consensus on average
+            expected_ms2 = consensus_count * expected_ratio
+
+            if current_ms2_count < min(expected_ms2 * 0.3, consensus_count * 0.8):
+                ms2_need_restoration = True
+
+    # Build list of operations that actually need to be done
+    operations_needed = []
+    if features and features_need_restoration:
+        operations_needed.append("features")
+    if chrom and chrom_need_restoration:
+        operations_needed.append("chromatograms")
+    if ms2 and ms2_need_restoration:
+        operations_needed.append("MS2 spectra")
+
+    # Early exit if nothing needs to be done
+    if not operations_needed:
+        self.logger.info("All data appears to be already decompressed. No operations needed.")
+        return
+
+    self.logger.info(f"Starting adaptive decompression: {', '.join(operations_needed)} from {len(sample_uids)} samples")
+
+    try:
+        # Phase 1: Restore features and chromatograms together (shared file I/O)
+        if ("features" in operations_needed and "chromatograms" in operations_needed):
+            self.logger.info("Phase 1: Restoring features and chromatograms together...")
+
+            # Extract relevant kwargs for restore_features and restore_chrom
+            restore_kwargs = {}
+            if 'mz_tol' in kwargs:
+                restore_kwargs['mz_tol'] = kwargs['mz_tol']
+            if 'rt_tol' in kwargs:
+                restore_kwargs['rt_tol'] = kwargs['rt_tol']
+
+            # Restore features first (includes chrom column)
+            self.restore_features(samples=samples)
+
+            # Then do additional chrom gap-filling if needed
+            self.restore_chrom(samples=samples, **restore_kwargs)
+
+        elif ("features" in operations_needed and "chromatograms" not in operations_needed):
+            self.logger.info("Phase 1: Restoring features data...")
+            self.restore_features(samples=samples)
+
+        elif ("chromatograms" in operations_needed and "features" not in operations_needed):
+            self.logger.info("Phase 1: Restoring chromatograms...")
+            restore_kwargs = {}
+            if 'mz_tol' in kwargs:
+                restore_kwargs['mz_tol'] = kwargs['mz_tol']
+            if 'rt_tol' in kwargs:
+                restore_kwargs['rt_tol'] = kwargs['rt_tol']
+            self.restore_chrom(samples=samples, **restore_kwargs)
+
+        # Phase 2: Restore MS2 data (most computationally expensive, done last)
+        if "MS2 spectra" in operations_needed:
+            self.logger.info("Phase 2: Restoring MS2 spectra...")
+
+            # Extract MS2-specific kwargs
+            ms2_kwargs = {}
+            for key, value in kwargs.items():
+                if key in ['mz_tol', 'centroid', 'deisotope', 'dia_stats', 'feature_uid']:
+                    ms2_kwargs[key] = value
+
+            self.restore_ms2(samples=samples, **ms2_kwargs)
+
+        self.logger.info("Adaptive decompression completed successfully")
+
+    except Exception as e:
+        self.logger.error(f"Decompression failed: {e}")
+        raise
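Together, restore_ms2() and decompress() give the compression entry points named in the docstring (compress_ms2(), study.save(compress=True), and friends) a usable inverse. A hedged end-to-end sketch, assuming `study` is an existing Study with consensus features built; call signatures beyond those shown in this diff are taken from the docstring:

    study.save(compress=True)    # compressed save, drops bulky columns

    # later, after reloading the study:
    study.decompress()           # adaptive: runs only the restorations the heuristics flag
    study.decompress(features=False, ms2=False, chrom=True, mz_tol=0.005, rt_tol=5.0)
    study.restore_ms2(mz_tol=0.010, centroid=True)   # or rebuild MS2 directly via find_ms2

Note on the MS2 heuristic: with N consensus features, expected_ms2 = 3.0 * N, so the trigger threshold min(expected_ms2 * 0.3, N * 0.8) = min(0.9 * N, 0.8 * N) reduces to 0.8 * N stored spectra.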