masster 0.3.19__py3-none-any.whl → 0.3.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

Files changed (24)
  1. masster/__init__.py +2 -0
  2. masster/_version.py +1 -1
  3. masster/data/libs/README.md +17 -0
  4. masster/data/libs/ccm.py +533 -0
  5. masster/data/libs/central_carbon_README.md +17 -0
  6. masster/data/libs/central_carbon_metabolites.csv +120 -0
  7. masster/data/libs/urine.py +333 -0
  8. masster/data/libs/urine_metabolites.csv +51 -0
  9. masster/sample/lib.py +32 -25
  10. masster/sample/load.py +7 -1
  11. masster/sample/plot.py +111 -26
  12. masster/study/helpers.py +230 -6
  13. masster/study/plot.py +457 -182
  14. masster/study/study.py +4 -0
  15. {masster-0.3.19.dist-info → masster-0.3.20.dist-info}/METADATA +1 -1
  16. {masster-0.3.19.dist-info → masster-0.3.20.dist-info}/RECORD +24 -18
  17. /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +0 -0
  18. /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  19. /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  20. /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  21. /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  22. {masster-0.3.19.dist-info → masster-0.3.20.dist-info}/WHEEL +0 -0
  23. {masster-0.3.19.dist-info → masster-0.3.20.dist-info}/entry_points.txt +0 -0
  24. {masster-0.3.19.dist-info → masster-0.3.20.dist-info}/licenses/LICENSE +0 -0
masster/sample/plot.py CHANGED
@@ -141,6 +141,110 @@ def _display_plot(plot_object, layout=None):
141
141
  return None
142
142
 
143
143
 
144
# Plot object kinds understood by the sample-plot output helpers.
_SAMPLE_PLOT_TYPES = ("bokeh", "panel", "holoviews")

# Static export formats that are attempted before falling back to HTML.
_SAMPLE_PLOT_STATIC_FORMATS = ("png", "pdf")


def _save_sample_plot_html(plot_obj, filename, plot_type):
    """Write ``plot_obj`` to ``filename`` as a standalone HTML document."""
    if plot_type == "panel":
        plot_obj.save(filename, embed=True)  # type: ignore[attr-defined]
    elif plot_type == "holoviews":
        import panel

        panel.panel(plot_obj).save(filename, embed=True)  # type: ignore[attr-defined]
    else:  # "bokeh"
        from bokeh.io import save
        from bokeh.plotting import output_file

        output_file(filename)
        save(plot_obj)


def _export_sample_plot_static(plot_obj, filename, plot_type, fmt):
    """Export ``plot_obj`` to a static ``fmt`` ("png" or "pdf") file; raises on failure."""
    if plot_type == "bokeh":
        if fmt == "png":
            from bokeh.io.export import export_png

            export_png(plot_obj, filename=filename)
        else:
            # NOTE(review): bokeh does not appear to ship ``export_pdf``; if the
            # import fails the caller falls back to HTML -- confirm against the
            # installed bokeh version.
            from bokeh.io.export import export_pdf

            export_pdf(plot_obj, filename=filename)
    else:
        import holoviews as hv

        hv.save(plot_obj, filename, fmt=fmt)


def _handle_sample_plot_output(self, plot_obj, filename=None, plot_type="bokeh"):
    """
    Save or display a sample plot with consistent behavior across plot kinds.

    Parameters:
        plot_obj: The plot object (bokeh figure, holoviews layout, or panel object).
        filename: Optional output path. A relative path is resolved against
            ``self.folder`` when that attribute exists and is non-empty.
            ``.png`` and ``.pdf`` (matched case-insensitively) are exported as
            static files; if that export fails for any reason the plot is
            written as HTML next to the requested file instead. Any other
            extension is written as HTML under the given name.
        plot_type: Type of plot object ("bokeh", "panel", "holoviews").

    Returns:
        The ``panel`` wrapper when a "holoviews" object is displayed (no
        filename given); ``None`` in every other case.

    Raises:
        ValueError: If ``plot_type`` is not one of the supported kinds. The
            check prevents logging a successful save when nothing was written.
    """
    if plot_type not in _SAMPLE_PLOT_TYPES:
        raise ValueError(
            f"Unsupported plot_type {plot_type!r}; expected one of {_SAMPLE_PLOT_TYPES}"
        )

    if filename is None:
        # Show in notebook when no filename provided
        if plot_type == "panel":
            plot_obj.show()  # type: ignore[attr-defined]
        elif plot_type == "holoviews":
            import panel

            return panel.panel(plot_obj)
        else:
            from bokeh.plotting import show

            show(plot_obj)
        return None

    import os

    # Convert relative paths to absolute paths using sample folder as base
    if getattr(self, "folder", None) and not os.path.isabs(filename):
        filename = os.path.join(self.folder, filename)

    lowered = filename.lower()
    fmt = next(
        (f for f in _SAMPLE_PLOT_STATIC_FORMATS if lowered.endswith("." + f)),
        None,
    )

    if fmt is not None:
        try:
            _export_sample_plot_static(plot_obj, filename, plot_type, fmt)
        except Exception:
            # Swap only the trailing extension; str.replace would also rewrite
            # ".png"/".pdf" occurring inside directory names.
            html_filename = filename[: -(len(fmt) + 1)] + ".html"
            _save_sample_plot_html(plot_obj, html_filename, plot_type)
            self.logger.warning(
                f"{fmt.upper()} export not available, saved as HTML instead: "
                f"{os.path.abspath(html_filename)}"
            )
            return None
        self.logger.info(f"Plot saved to: {os.path.abspath(filename)}")
        return None

    # ".html" and unknown extensions are both written as HTML
    _save_sample_plot_html(plot_obj, filename, plot_type)
    self.logger.info(f"Plot saved to: {os.path.abspath(filename)}")
    return None
246
+
247
+
144
248
  def plot_chrom(
145
249
  self,
146
250
  feature_uid=None,
@@ -271,16 +375,9 @@ def plot_chrom(
271
375
 
272
376
  layout = layout.cols(1)
273
377
  layout = panel.Column(layout)
274
- if filename is not None:
275
- # if filename includes .html, save the panel layout to an HTML file
276
- if filename.endswith(".html"):
277
- layout.save(filename, embed=True)
278
- else:
279
- # save the panel layout as a png
280
- hv.save(layout, filename, fmt="png")
281
- else:
282
- # Check if we're in a notebook environment and display appropriately
283
- return _display_plot(layout.object, layout)
378
+
379
+ # Use consistent save/display behavior
380
+ self._handle_sample_plot_output(layout, filename, "panel")
284
381
 
285
382
 
286
383
  def plot_2d(
@@ -830,12 +927,8 @@ def plot_2d(
830
927
  layout = panel.Column(overlay)
831
928
 
832
929
  if filename is not None:
833
- # if filename includes .html, save the panel layout to an HTML file
834
- if filename.endswith(".html"):
835
- layout.save(filename, embed=True)
836
- else:
837
- # save the panel layout as a png
838
- hv.save(overlay, filename, fmt="png")
930
+ # Use consistent save/display behavior
931
+ self._handle_sample_plot_output(layout, filename, "panel")
839
932
  return None
840
933
  else:
841
934
  # Check if we're in a notebook environment and display appropriately
@@ -1960,16 +2053,8 @@ def plot_feature_stats(
1960
2053
  # Arrange the plots in a layout with three columns
1961
2054
  layout = hv.Layout(density_plots).cols(3).opts(shared_axes=False)
1962
2055
 
1963
- # Save or display the layout based on the filename parameter
1964
- if filename is not None:
1965
- if filename.endswith(".html"):
1966
- panel.panel(layout).save(filename, embed=True) # type: ignore[attr-defined]
1967
- else:
1968
- hv.save(layout, filename, fmt="png")
1969
- else:
1970
- # Check if we're in a notebook environment and display appropriately
1971
- layout_obj = panel.panel(layout)
1972
- return _display_plot(layout, layout_obj)
2056
+ # Use consistent save/display behavior
2057
+ self._handle_sample_plot_output(layout, filename, "holoviews")
1973
2058
 
1974
2059
 
1975
2060
  def plot_tic(
masster/study/helpers.py CHANGED
@@ -969,13 +969,20 @@ def restore_features(self, samples=None, maps=False):
969
969
  # Load sample to get its features_df
970
970
  # Use a direct load call with map=False to prevent feature synchronization
971
971
  # which would remove filled features that don't exist in the original FeatureMap
972
- sample = Sample(log_level="DEBUG")
972
+ # Use ERROR log level to suppress info messages
973
+ sample = Sample(log_level="ERROR")
973
974
  sample._load_sample5(sample_path, map=False)
974
975
 
975
976
  if sample.features_df is None or sample.features_df.is_empty():
976
977
  self.logger.warning(f"No features found in sample {sample_name}")
977
978
  continue
978
979
 
980
+ # Check which columns are actually available in the sample
981
+ available_columns = [col for col in columns_to_update if col in sample.features_df.columns]
982
+ if not available_columns:
983
+ self.logger.debug(f"No target columns found in sample {sample_name}")
984
+ continue
985
+
979
986
  # Create update data for this sample
980
987
  updates_made = 0
981
988
  for row in sample.features_df.iter_rows(named=True):
@@ -987,8 +994,8 @@ def restore_features(self, samples=None, maps=False):
987
994
  if key in study_feature_mapping:
988
995
  feature_uid = study_feature_mapping[key]
989
996
 
990
- # Update the specific columns in study.features_df
991
- for col in columns_to_update:
997
+ # Update only the available columns in study.features_df
998
+ for col in available_columns:
992
999
  if col in row and col in self.features_df.columns:
993
1000
  # Get the original column dtype to preserve it
994
1001
  original_dtype = self.features_df[col].dtype
@@ -1013,7 +1020,8 @@ def restore_features(self, samples=None, maps=False):
1013
1020
  )
1014
1021
  updates_made += 1
1015
1022
 
1016
- self.logger.debug(f"Updated {updates_made} features from sample {sample_name}")
1023
+ if updates_made > 0:
1024
+ self.logger.debug(f"Updated {updates_made} features from sample {sample_name}")
1017
1025
 
1018
1026
  # If maps is True, load featureXML data
1019
1027
  if maps:
@@ -1096,13 +1104,18 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
1096
1104
 
1097
1105
  try:
1098
1106
  # Load sample (with map=False to prevent feature synchronization)
1099
- sample = Sample(log_level="WARNING")
1107
+ # Use ERROR log level to suppress info messages
1108
+ sample = Sample(log_level="ERROR")
1100
1109
  sample._load_sample5(sample_path, map=False)
1101
1110
 
1102
1111
  if sample.features_df is None or sample.features_df.is_empty():
1103
1112
  self.logger.warning(f"No features found in sample {sample_name}")
1104
1113
  continue
1105
1114
 
1115
+ # Check if chrom column exists in sample
1116
+ if "chrom" not in sample.features_df.columns:
1117
+ continue
1118
+
1106
1119
  # Update chromatograms from this sample
1107
1120
  for row in sample.features_df.iter_rows(named=True):
1108
1121
  feature_id = row.get("feature_id")
@@ -1183,7 +1196,8 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
1183
1196
 
1184
1197
  try:
1185
1198
  # Load sample for MS1 data extraction
1186
- sample = Sample(log_level="WARNING")
1199
+ # Use ERROR log level to suppress info messages
1200
+ sample = Sample(log_level="ERROR")
1187
1201
  sample._load_sample5(sample_path, map=False)
1188
1202
 
1189
1203
  if not hasattr(sample, "ms1_df") or sample.ms1_df is None or sample.ms1_df.is_empty():
@@ -3233,3 +3247,213 @@ def migrate_map_id_to_index(self):
3233
3247
 
3234
3248
  self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
3235
3249
  self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")
3250
+
3251
+
3252
def restore_ms2(self, samples=None, **kwargs):
    """
    Restore MS2 data by re-running find_ms2.

    The existing ``consensus_ms2`` table is replaced by an empty DataFrame and
    ``find_ms2`` is invoked to rebuild it. Use this to reverse the effects of
    compress_ms2(). If ``find_ms2`` raises, the previous ``consensus_ms2`` is
    put back so a failed restore does not discard existing MS2 data.

    Parameters:
        samples (list, optional): List of sample_uids or sample_names. It is
            resolved through ``_get_sample_uids`` and used to validate the
            request and for logging only; ``find_ms2`` itself is called
            without a sample restriction.
        **kwargs: Additional keyword arguments passed to find_ms2()
            (e.g., mz_tol, centroid, deisotope, etc.)

    Raises:
        Exception: Whatever ``find_ms2`` raises is logged and re-raised.
    """
    if self.features_df is None or self.features_df.is_empty():
        self.logger.error("No features_df found in study.")
        return

    if self.samples_df is None or self.samples_df.is_empty():
        self.logger.error("No samples_df found in study.")
        return

    # Get sample_uids to process
    sample_uids = self._get_sample_uids(samples)
    if not sample_uids:
        self.logger.warning("No valid samples specified.")
        return

    self.logger.info(f"Restoring MS2 data from {len(sample_uids)} samples...")

    # Keep a handle on the current table so it can be reinstated on failure.
    previous_ms2 = getattr(self, "consensus_ms2", None)
    initial_ms2_count = 0 if previous_ms2 is None else len(previous_ms2)

    # Clear existing consensus_ms2 to rebuild from scratch
    self.consensus_ms2 = pl.DataFrame()

    try:
        self.find_ms2(**kwargs)
    except Exception as e:
        self.consensus_ms2 = previous_ms2
        self.logger.error(f"Failed to restore MS2 data: {e}")
        raise

    rebuilt_ms2 = self.consensus_ms2
    final_ms2_count = 0 if rebuilt_ms2 is None else len(rebuilt_ms2)
    self.logger.info(
        f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra"
    )
3296
+
3297
+
3298
# Fraction of null ``ms2_specs`` rows above which features count as compressed.
_FEATURES_NULL_FRACTION = 0.9
# Fraction of null ``chrom`` rows above which chromatograms count as compressed.
_CHROM_NULL_FRACTION = 0.5
# Minimum MS2 spectra per consensus feature before MS2 counts as compressed.
_MS2_PER_CONSENSUS_MIN = 0.8
# Keyword arguments forwarded to restore_chrom / restore_ms2 respectively.
_RESTORE_CHROM_KWARGS = ("mz_tol", "rt_tol")
_RESTORE_MS2_KWARGS = ("mz_tol", "centroid", "deisotope", "dia_stats", "feature_uid")


def _frame_has_rows(frame):
    """Return True when ``frame`` is a non-None, non-empty DataFrame."""
    return frame is not None and not frame.is_empty()


def _null_row_count(frame, column):
    """Count the rows of ``frame`` whose ``column`` value is null."""
    import polars as pl

    return frame.filter(pl.col(column).is_null()).height


def _features_need_restoration(study):
    """Heuristically decide whether the MS2 feature columns look compressed."""
    features_df = study.features_df
    if not _frame_has_rows(features_df):
        return False
    # Columns missing entirely: restoration is needed.
    if any(col not in features_df.columns for col in ("ms2_scans", "ms2_specs")):
        return True
    # Null ms2_specs are only suspicious when the study holds MS2 data at all.
    if not _frame_has_rows(getattr(study, "consensus_ms2", None)):
        return False
    null_specs = _null_row_count(features_df, "ms2_specs")
    return null_specs > len(features_df) * _FEATURES_NULL_FRACTION


def _chrom_needs_restoration(study):
    """Heuristically decide whether chromatogram objects look compressed."""
    features_df = study.features_df
    if not _frame_has_rows(features_df):
        return False
    if "chrom" not in features_df.columns:
        return True
    null_chroms = _null_row_count(features_df, "chrom")
    return null_chroms > len(features_df) * _CHROM_NULL_FRACTION


def _ms2_needs_restoration(study):
    """Heuristically decide whether consensus MS2 spectra look compressed."""
    consensus_df = getattr(study, "consensus_df", None)
    if not _frame_has_rows(consensus_df):
        return False
    consensus_ms2 = getattr(study, "consensus_ms2", None)
    current_ms2_count = len(consensus_ms2) if _frame_has_rows(consensus_ms2) else 0
    return current_ms2_count < len(consensus_df) * _MS2_PER_CONSENSUS_MIN


def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs):
    """
    Reverse compression effects by restoring only the data that looks compressed.

    Each requested category is first checked with a heuristic, and only the
    categories that appear compressed are restored:

    - features: restored when ``ms2_scans`` or ``ms2_specs`` is missing from
      ``features_df``, or when more than 90% of ``ms2_specs`` are null while
      ``consensus_ms2`` holds data.
    - chrom: restored when the ``chrom`` column is missing or more than 50%
      of its values are null.
    - ms2: restored when ``consensus_ms2`` has fewer rows than 0.8 times the
      number of consensus features.

    A missing (``None``) or empty DataFrame is treated as "nothing to restore"
    for the corresponding check.

    Parameters:
        features (bool): Restore features data via restore_features().
        ms2 (bool): Restore MS2 spectra via restore_ms2().
        chrom (bool): Restore chromatogram objects via restore_chrom().
        samples (list, optional): List of sample_uids or sample_names to process.
            If None, processes all samples.
        **kwargs: Additional keyword arguments for restoration functions:
            - restore_chrom receives ``mz_tol`` and ``rt_tol`` when given.
            - restore_ms2 receives ``mz_tol``, ``centroid``, ``deisotope``,
              ``dia_stats`` and ``feature_uid`` when given.
            Note that ``mz_tol`` is forwarded to both. Other keys are ignored.

    Order of operations:
        Features are restored before chromatograms, and MS2 spectra last.

    Raises:
        Exception: Any error raised by a restore step is logged and re-raised.

    Example:
        # Restore everything (but only what needs restoration)
        study.decompress()

        # Restore only chromatograms with custom tolerances
        study.decompress(features=False, ms2=False, chrom=True, mz_tol=0.005, rt_tol=5.0)

        # Restore specific samples only
        study.decompress(samples=["sample1", "sample2"])
    """
    if not any([features, ms2, chrom]):
        self.logger.warning("No decompression operations specified.")
        return

    # Get sample_uids to process
    sample_uids = self._get_sample_uids(samples)
    if not sample_uids:
        self.logger.warning("No valid samples specified.")
        return

    # Adaptively check what actually needs to be done
    restore_features_needed = bool(features) and _features_need_restoration(self)
    restore_chrom_needed = bool(chrom) and _chrom_needs_restoration(self)
    restore_ms2_needed = bool(ms2) and _ms2_needs_restoration(self)

    operations_needed = [
        label
        for label, needed in (
            ("features", restore_features_needed),
            ("chromatograms", restore_chrom_needed),
            ("MS2 spectra", restore_ms2_needed),
        )
        if needed
    ]

    # Early exit if nothing needs to be done
    if not operations_needed:
        self.logger.info("All data appears to be already decompressed. No operations needed.")
        return

    self.logger.info(
        f"Starting adaptive decompression: {', '.join(operations_needed)} "
        f"from {len(sample_uids)} samples"
    )

    chrom_kwargs = {key: kwargs[key] for key in _RESTORE_CHROM_KWARGS if key in kwargs}
    ms2_kwargs = {key: kwargs[key] for key in _RESTORE_MS2_KWARGS if key in kwargs}

    try:
        # Phase 1: features and/or chromatograms
        if restore_features_needed and restore_chrom_needed:
            self.logger.info("Phase 1: Restoring features and chromatograms together...")
            # Restore features first, then do additional chrom gap-filling
            self.restore_features(samples=samples)
            self.restore_chrom(samples=samples, **chrom_kwargs)
        elif restore_features_needed:
            self.logger.info("Phase 1: Restoring features data...")
            self.restore_features(samples=samples)
        elif restore_chrom_needed:
            self.logger.info("Phase 1: Restoring chromatograms...")
            self.restore_chrom(samples=samples, **chrom_kwargs)

        # Phase 2: Restore MS2 data (done last)
        if restore_ms2_needed:
            self.logger.info("Phase 2: Restoring MS2 spectra...")
            self.restore_ms2(samples=samples, **ms2_kwargs)

        self.logger.info("Adaptive decompression completed successfully")

    except Exception as e:
        self.logger.error(f"Decompression failed: {e}")
        raise