masster 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

masster/sample/helpers.py CHANGED
@@ -281,7 +281,7 @@ def select_closest_scan(
     return scan


-def get_eic(self, mz, mz_tol=0.01):
+def get_eic(self, mz, mz_tol=None):
     """
     Extract an extracted ion chromatogram (EIC) from `ms1_df` for a target m/z ± mz_tol.

@@ -291,11 +291,18 @@ def get_eic(self, mz, mz_tol=0.01):

     Parameters:
         mz (float): target m/z value
-        mz_tol (float): tolerance around mz (default 0.01)
+        mz_tol (float): tolerance around mz. If None, uses self.parameters.eic_mz_tol or defaults to 0.01

     Returns:
         polars.DataFrame or None: chromatogram with columns ['rt', 'inty'] or None if not available
     """
+    # Use default mz_tol from sample parameters if not provided
+    if mz_tol is None:
+        if hasattr(self, 'parameters') and hasattr(self.parameters, 'eic_mz_tol'):
+            mz_tol = self.parameters.eic_mz_tol
+        else:
+            mz_tol = 0.01  # fallback default
+
     # Validate ms1_df
     if not hasattr(self, "ms1_df") or self.ms1_df is None:
         if hasattr(self, "logger"):
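
Behavior note: `get_eic` now resolves its tolerance at call time, with an explicit argument taking priority over the sample-level `parameters.eic_mz_tol` and a hard-coded 0.01 Da as the last resort. A minimal sketch of that resolution order (the `SampleParams` class and `resolve_mz_tol` helper below are illustrative stand-ins, not masster API):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class SampleParams:
    """Illustrative stand-in for the object behind self.parameters."""
    eic_mz_tol: float = 0.01


def resolve_mz_tol(mz_tol: Optional[float], parameters: Optional[SampleParams]) -> float:
    """Mirror the fallback order in get_eic: explicit argument > parameters.eic_mz_tol > 0.01."""
    if mz_tol is not None:
        return mz_tol
    if parameters is not None and hasattr(parameters, "eic_mz_tol"):
        return parameters.eic_mz_tol
    return 0.01  # hard-coded fallback, as in the diff


print(resolve_mz_tol(None, SampleParams(eic_mz_tol=0.005)))  # 0.005, taken from the parameters object
print(resolve_mz_tol(0.02, SampleParams()))                  # 0.02, explicit argument wins
print(resolve_mz_tol(None, None))                            # 0.01, last-resort default
```
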
masster/sample/load.py CHANGED
@@ -379,18 +379,23 @@ def _load_raw(
             mz=peaks.mz.values,
             inty=peaks.intensity.values,
             ms_level=s["ms_level"],
-            centroided=False,
         )
         # remove peaks with intensity <= 0

         bl = spect.baseline()
         spect = spect.denoise(threshold=bl)
+
         if spect.ms_level == 1:
-            spect = spect.centroid(
-                tolerance=self.parameters.mz_tol_ms1_da,
-                ppm=self.parameters.mz_tol_ms1_ppm,
-                min_points=self.parameters.centroid_min_points_ms1,
-            )
+            # Use the same logic as mzML loading
+            mz = np.array(spect.mz)
+            median_diff = np.median(np.diff(np.sort(mz))) if mz.size > 1 else None
+
+            if median_diff is not None and median_diff < 0.01:
+                spect = spect.centroid(
+                    tolerance=self.parameters.mz_tol_ms1_da,
+                    ppm=self.parameters.mz_tol_ms1_ppm,
+                    min_points=self.parameters.centroid_min_points_ms1,
+                )
         newscan = {
             "scan_uid": i,
             "cycle": cycle,
@@ -544,7 +549,6 @@ def _load_wiff(
             mz=peaks.mz.values,
             inty=peaks.intensity.values,
             ms_level=ms_level,
-            centroided=False,
         )
         bl = spect.baseline()
         spect = spect.denoise(threshold=bl)
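
Behavior note: in `_load_raw`, MS1 centroiding is now gated on a profile-mode heuristic described in the diff as matching the mzML loading path: if the median spacing between adjacent m/z values is below 0.01 Da the scan is treated as profile data and centroided, otherwise it is left as-is. A self-contained sketch of that heuristic (toy arrays, not masster code):

```python
import numpy as np


def looks_like_profile(mz: np.ndarray, spacing_threshold: float = 0.01) -> bool:
    """Dense, finely spaced m/z values suggest profile data that still needs centroiding."""
    if mz.size < 2:
        return False
    median_diff = np.median(np.diff(np.sort(mz)))
    return bool(median_diff < spacing_threshold)


profile_like = np.arange(100.0, 101.0, 0.002)       # ~0.002 Da spacing, typical of profile scans
centroid_like = np.array([100.05, 100.55, 101.10])  # sparse discrete peaks

print(looks_like_profile(profile_like))   # True  -> would be centroided
print(looks_like_profile(centroid_like))  # False -> passed through unchanged
```
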
masster/sample/plot.py CHANGED
@@ -56,7 +56,6 @@ from bokeh.models import HoverTool
 from holoviews import dim
 from holoviews.plotting.util import process_cmap
 from matplotlib.colors import rgb2hex
-from masster.chromatogram import Chromatogram

 # Parameters removed - using hardcoded defaults

@@ -75,23 +74,36 @@ def _is_notebook_environment():
         # Check for Jupyter/JupyterLab
         from IPython import get_ipython

-        if get_ipython() is not None:
+        ipython = get_ipython()
+        if ipython is not None:
             # Check if we're in a notebook context
-            shell = get_ipython().__class__.__name__
+            shell = ipython.__class__.__name__
             if shell in ["ZMQInteractiveShell", "Shell"]:  # Jupyter notebook/lab
                 return True

-        # Check for Marimo
+        # Check for Marimo - multiple ways to detect it
         import sys

+        # Check if marimo is in modules
         if "marimo" in sys.modules:
             return True
-
-        # Additional check for notebook environments
+
+        # Check for marimo in the call stack or environment
+        import inspect
+        frame = inspect.currentframe()
+        try:
+            while frame:
+                if frame.f_globals.get("__name__", "").startswith("marimo"):
+                    return True
+                frame = frame.f_back
+        finally:
+            del frame
+
+        # Additional check for notebook environments via builtins
         if hasattr(__builtins__, "__IPYTHON__") or hasattr(__builtins__, "_ih"):
             return True

-    except ImportError:
+    except (ImportError, AttributeError):
         pass

     return False
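
Behavior note: `_is_notebook_environment` now combines three signals: the IPython shell class name, the presence of `marimo` in `sys.modules`, and a walk up the call stack looking for frames whose module name starts with "marimo". A condensed, standalone sketch of the same idea (not the exact masster helper):

```python
import inspect
import sys


def in_notebook() -> bool:
    """Best-effort notebook detection: IPython shell name, marimo import, marimo stack frames."""
    try:
        from IPython import get_ipython
        ipython = get_ipython()
        if ipython is not None and ipython.__class__.__name__ in ("ZMQInteractiveShell", "Shell"):
            return True
    except ImportError:
        pass  # IPython not installed; plain interpreter

    if "marimo" in sys.modules:
        return True

    frame = inspect.currentframe()
    try:
        while frame:
            if frame.f_globals.get("__name__", "").startswith("marimo"):
                return True
            frame = frame.f_back
    finally:
        del frame  # avoid reference cycles from holding a frame object
    return False


print(in_notebook())  # False when run as a plain script
```
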
@@ -106,22 +118,17 @@ def _display_plot(plot_object, layout=None):
         layout: Optional panel layout object

     Returns:
-        The layout object if in notebook environment, None otherwise
+        The plot object for inline display in notebooks, None for browser display
     """
     if _is_notebook_environment():
-        # Display inline in notebook
-        try:
-            # For Jupyter notebooks, just return the plot object -
-            # holoviews will handle the display automatically
+        # In notebook environments, return the plot object for inline display
+        # For Jupyter notebooks, holoviews/panel objects display automatically when returned
+        if layout is not None:
+            # Return the layout object which will display inline in notebooks
+            return layout
+        else:
+            # Return the plot object directly for holoviews automatic display
             return plot_object
-        except Exception:
-            # Fallback to panel display for other notebook environments
-            if layout is not None:
-                return layout
-            else:
-                # Create a simple layout if none provided
-                simple_layout = panel.Column(plot_object)
-                return simple_layout
     else:
         # Display in browser (original behavior)
         if layout is not None:
@@ -512,7 +519,7 @@ def plot_2d(
         feats = feats.to_pandas()
         # if ms2_scans is not null, keep only the first element of the list
         feats["ms2_scans"] = feats["ms2_scans"].apply(
-            lambda x: x[0] if type(x) == list else x,
+            lambda x: x[0] if isinstance(x, list) else x,
         )
     if mz_range is not None:
         feats = feats[(feats["mz"] >= mz_range[0]) & (feats["mz"] <= mz_range[1])]
@@ -707,8 +714,6 @@ def plot_2d(
         class MarkerSizeController(param.Parameterized):
             size_slider = param.Number(default=markersize, bounds=(1, 20), step=0.5)

-        controller = MarkerSizeController()
-
         # Create a function that generates just the feature overlays with different sizes
         def create_feature_overlay(size_val):
             feature_overlay = None
@@ -808,7 +813,17 @@ def plot_2d(
         # Create layout
         layout = on.Column(slider_widget, reactive_plot, sizing_mode="stretch_width")

-        return layout
+        # Handle filename saving for slider mode
+        if filename is not None:
+            if filename.endswith(".html"):
+                layout.save(filename, embed=True)
+            else:
+                # For slider plots, save the current state
+                hv.save(create_feature_overlay(markersize), filename, fmt="png")
+            return None
+        else:
+            # For notebook display, return the interactive layout
+            return _display_plot(layout, layout)
     else:
         # Create a panel layout without slider
         layout = panel.Column(overlay)
@@ -819,17 +834,11 @@ def plot_2d(
                 layout.save(filename, embed=True)
             else:
                 # save the panel layout as a png
-                if use_slider_sizing:
-                    # For slider plots, save the current state of the param_plot
-                    hv.save(create_feature_overlay(markersize), filename, fmt="png")
-                else:
-                    hv.save(overlay, filename, fmt="png")
+                hv.save(overlay, filename, fmt="png")
+            return None
         else:
             # Check if we're in a notebook environment and display appropriately
-            if use_slider_sizing:
-                return _display_plot(layout, layout)
-            else:
-                return _display_plot(overlay, layout)
+            return _display_plot(overlay, layout)


 def plot_2d_oracle(
@@ -982,7 +991,7 @@ def plot_2d_oracle(
         oracle_data = pd.read_csv(
             os.path.join(oracle_folder, "diag", "summary_by_feature.csv"),
         )
-    except:
+    except Exception:
        print(f"Could not read {oracle_folder}/diag/summary_by_feature.csv")
        return

@@ -18,6 +18,9 @@ class study_defaults:
         log_level (str): Logging level to be set for the logger. Default is "INFO".
         log_label (Optional[str]): Optional label for the logger. Default is None.
         log_sink (str): Output sink for logging. Default is "sys.stdout".
+        polarity (str): Polarity of the study (positive/negative). Default is "positive".
+        eic_mz_tol (float): Default m/z tolerance for EIC extraction and consensus selection. Default is 0.01.
+        eic_rt_tol (float): Default RT tolerance for EIC extraction and consensus selection. Default is 10.0.
     """

     folder: Optional[str] = None
@@ -27,6 +30,9 @@ class study_defaults:
     log_sink: str = "sys.stdout"
     polarity: str = "positive"

+    eic_mz_tol: float = 0.01
+    eic_rt_tol: float = 10.0
+
     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
             "folder": {
@@ -61,6 +67,20 @@ class study_defaults:
                 "default": "positive",
                 "allowed_values": ["positive", "negative", "pos", "neg"],
             },
+            "eic_mz_tol": {
+                "dtype": float,
+                "description": "Default m/z tolerance for EIC extraction and consensus selection (Da)",
+                "default": 0.01,
+                "min_value": 0.001,
+                "max_value": 1.0,
+            },
+            "eic_rt_tol": {
+                "dtype": float,
+                "description": "Default RT tolerance for EIC extraction and consensus selection (seconds)",
+                "default": 10.0,
+                "min_value": 0.2,
+                "max_value": 60.0,
+            },
         },
         repr=False,
     )
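
The two new `study_defaults` entries (the defaults file's path is not shown in this diff excerpt) follow the existing `_param_metadata` pattern: each tunable carries a dtype, description, default, and bounds. How masster enforces those bounds is not shown here; the sketch below is an illustrative validator over a similarly shaped dataclass, not masster's own code:

```python
from dataclasses import dataclass, field
from typing import Any


@dataclass
class EicDefaults:
    """Toy stand-in mirroring the shape of the new study_defaults fields."""
    eic_mz_tol: float = 0.01
    eic_rt_tol: float = 10.0
    _param_metadata: dict[str, dict[str, Any]] = field(
        default_factory=lambda: {
            "eic_mz_tol": {"dtype": float, "min_value": 0.001, "max_value": 1.0},
            "eic_rt_tol": {"dtype": float, "min_value": 0.2, "max_value": 60.0},
        },
        repr=False,
    )

    def validate(self) -> None:
        # Reject values outside the declared bounds.
        for name, meta in self._param_metadata.items():
            value = getattr(self, name)
            if not meta["min_value"] <= value <= meta["max_value"]:
                raise ValueError(f"{name}={value} outside [{meta['min_value']}, {meta['max_value']}]")


defaults = EicDefaults(eic_rt_tol=5.0)
defaults.validate()  # passes; EicDefaults(eic_rt_tol=120.0).validate() would raise ValueError
```
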
masster/study/h5.py CHANGED
@@ -695,19 +695,59 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     if schema_columns is None:
         schema_columns = []

-    # First pass: load all existing columns
+    # Get available columns from HDF5 file
+    hdf5_columns = list(group.keys())
+    logger.debug(f"HDF5 columns available: {hdf5_columns}")
+
+    # Handle column name migrations for backward compatibility first
+    if df_name == "samples_df":
+        # Migrate old column names to new names
+        column_migrations = {
+            "size": "num_features",
+            "file_source": "sample_source",
+            "ms1": "num_ms1",
+            "ms2": "num_ms2"
+        }
+
+        # Create a mapping of what's actually available after migrations
+        effective_columns = hdf5_columns.copy()
+        for old_name, new_name in column_migrations.items():
+            if old_name in effective_columns:
+                logger.info(f"Will migrate column '{old_name}' to '{new_name}' for backward compatibility")
+                # Add the new name to effective columns and optionally remove old name
+                effective_columns.append(new_name)
+
+    # First pass: load all existing columns (including migrated ones)
     for col in schema_columns or []:
-        if col not in group:
+        source_col = col
+
+        # Check if we need to load from a migrated column name
+        if df_name == "samples_df":
+            column_migrations = {
+                "size": "num_features",
+                "file_source": "sample_source",
+                "ms1": "num_ms1",
+                "ms2": "num_ms2"
+            }
+            # Reverse lookup - find old name for new name
+            reverse_migrations = {v: k for k, v in column_migrations.items()}
+            if col in reverse_migrations:
+                old_name = reverse_migrations[col]
+                if old_name in group:
+                    source_col = old_name
+                    logger.info(f"Loading '{col}' from old column name '{old_name}'")
+
+        if source_col not in group:
             missing_columns.append(col)
             continue

         dtype = schema[df_name]["columns"][col].get("dtype", "native")
         if dtype == "pl.Object" or col in object_columns:
             # Handle object columns specially
-            data[col] = _reconstruct_object_column(group[col][:], col)
+            data[col] = _reconstruct_object_column(group[source_col][:], col)
         else:
             # Regular columns
-            column_data = group[col][:]
+            column_data = group[source_col][:]

             # Convert -123 sentinel values back to None for numeric columns
             if len(column_data) > 0:
@@ -759,17 +799,43 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     # Second pass: handle missing columns
     for col in missing_columns:
         logger.warning(f"Column '{col}' not found in {df_name}.")
-        # For missing columns, create appropriately sized array of None values
+        # For missing columns, create appropriately sized array with appropriate defaults
         if col in object_columns:
             data[col] = [None] * expected_length
             logger.debug(f"Created missing object column '{col}' with length {expected_length}")
         else:
-            data[col] = [None] * expected_length
-            logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
+            # Provide specific default values for new columns for backward compatibility
+            if df_name == "samples_df":
+                if col == "sample_group":
+                    data[col] = [""] * expected_length  # Empty string default
+                    logger.debug(f"Created missing column '{col}' with empty string defaults")
+                elif col == "sample_batch":
+                    data[col] = [1] * expected_length  # Batch 1 default
+                    logger.debug(f"Created missing column '{col}' with batch 1 defaults")
+                elif col == "sample_sequence":
+                    # Create increasing sequence numbers
+                    data[col] = list(range(1, expected_length + 1))
+                    logger.debug(f"Created missing column '{col}' with sequence 1-{expected_length}")
+                else:
+                    data[col] = [None] * expected_length
+                    logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
+            else:
+                data[col] = [None] * expected_length
+                logger.debug(f"Created missing regular column '{col}' with length {expected_length}")

     # Check for columns in HDF5 file that are not in schema (for backward compatibility)
-    hdf5_columns = list(group.keys())
-    extra_columns = [col for col in hdf5_columns if col not in (schema_columns or [])]
+    # But skip the old column names we already migrated
+    migrated_old_names = set()
+    if df_name == "samples_df":
+        column_migrations = {
+            "size": "num_features",
+            "file_source": "sample_source",
+            "ms1": "num_ms1",
+            "ms2": "num_ms2"
+        }
+        migrated_old_names = set(column_migrations.keys())
+
+    extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]

     for col in extra_columns:
         logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
@@ -1320,9 +1386,12 @@ def _load_study5(self, filename=None):
             "sample_type": [],
             "size": [],
             "map_id": [],
-            "file_source": [],
-            "ms1": [],
-            "ms2": [],
+            "sample_source": [],
+            "num_ms1": [],
+            "num_ms2": [],
+            "sample_group": [],
+            "sample_batch": [],
+            "sample_sequence": [],
         },
         schema={
             "sample_uid": pl.Int64,
@@ -1330,10 +1399,13 @@ def _load_study5(self, filename=None):
             "sample_path": pl.Utf8,
             "sample_type": pl.Utf8,
             "size": pl.Int64,
-            "map_id": pl.Utf8,
-            "file_source": pl.Utf8,
-            "ms1": pl.Int64,
-            "ms2": pl.Int64,
+            "map_id": pl.Int64,
+            "sample_source": pl.Utf8,
+            "num_ms1": pl.Int64,
+            "num_ms2": pl.Int64,
+            "sample_group": pl.Utf8,
+            "sample_batch": pl.Int64,
+            "sample_sequence": pl.Int64,
         },
     )
     pbar.update(1)
@@ -1354,9 +1426,12 @@ def _load_study5(self, filename=None):
             "sample_type": [],
             "size": [],
             "map_id": [],
-            "file_source": [],
-            "ms1": [],
-            "ms2": [],
+            "sample_source": [],
+            "num_ms1": [],
+            "num_ms2": [],
+            "sample_group": [],
+            "sample_batch": [],
+            "sample_sequence": [],
         },
         schema={
             "sample_uid": pl.Int64,
@@ -1364,10 +1439,13 @@ def _load_study5(self, filename=None):
             "sample_path": pl.Utf8,
             "sample_type": pl.Utf8,
             "size": pl.Int64,
-            "map_id": pl.Utf8,
-            "file_source": pl.Utf8,
-            "ms1": pl.Int64,
-            "ms2": pl.Int64,
+            "map_id": pl.Int64,
+            "sample_source": pl.Utf8,
+            "num_ms1": pl.Int64,
+            "num_ms2": pl.Int64,
+            "sample_group": pl.Utf8,
+            "sample_batch": pl.Int64,
+            "sample_sequence": pl.Int64,
         },
     )
     pbar.update(1)
@@ -1463,4 +1541,23 @@ def _load_study5(self, filename=None):
         self.consensus_ms2 = None
     pbar.update(1)

+    # Check and migrate old string-based map_id to integer indices
+    if (self.samples_df is not None and
+            not self.samples_df.is_empty() and
+            self.samples_df['map_id'].dtype == pl.Utf8):
+        self.logger.info("Detected old string-based map_id format, migrating to integer indices")
+
+        # Convert string-based map_id to integer indices
+        sample_count = len(self.samples_df)
+        new_map_ids = list(range(sample_count))
+
+        self.samples_df = self.samples_df.with_columns(
+            pl.lit(new_map_ids).alias("map_id")
+        )
+
+        # Ensure the column is Int64 type
+        self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
+
+        self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format (0 to {sample_count - 1})")
+
     self.logger.debug("Study loaded")
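
Taken together, the loader-side migrations amount to two operations on legacy `samples_df` data: renaming old columns (`size` → `num_features`, `file_source` → `sample_source`, `ms1` → `num_ms1`, `ms2` → `num_ms2`) and replacing string `map_id` values with 0-based integer indices. A standalone polars sketch of both steps on toy data (not the h5 loader itself):

```python
import polars as pl

# Old-to-new column names, as listed in the diff above.
column_migrations = {
    "size": "num_features",
    "file_source": "sample_source",
    "ms1": "num_ms1",
    "ms2": "num_ms2",
}

old = pl.DataFrame({
    "sample_uid": [1, 2],
    "file_source": ["a.raw", "b.raw"],
    "ms1": [1000, 1200],
    "ms2": [300, 280],
    "map_id": ["map_a", "map_b"],  # legacy string identifiers
})

# Rename whichever legacy columns are present.
migrated = old.rename({k: v for k, v in column_migrations.items() if k in old.columns})

# Replace string map_id values with 0-based integer indices, as the loader now does.
if migrated["map_id"].dtype == pl.Utf8:
    migrated = migrated.with_columns(pl.Series("map_id", range(len(migrated)), dtype=pl.Int64))

print(migrated)
```
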