masster 0.5.3.tar.gz → 0.5.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of masster might be problematic.

Files changed (94)
  1. {masster-0.5.3 → masster-0.5.5}/PKG-INFO +1 -1
  2. {masster-0.5.3 → masster-0.5.5}/pyproject.toml +1 -1
  3. {masster-0.5.3 → masster-0.5.5}/src/masster/_version.py +1 -1
  4. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/adducts.py +1 -1
  5. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/h5.py +11 -11
  6. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/helpers.py +2 -2
  7. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/load.py +5 -4
  8. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/processing.py +1 -1
  9. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/sample.py +7 -3
  10. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/save.py +5 -0
  11. {masster-0.5.3 → masster-0.5.5}/src/masster/study/h5.py +70 -0
  12. {masster-0.5.3 → masster-0.5.5}/src/masster/study/helpers.py +2 -2
  13. {masster-0.5.3 → masster-0.5.5}/src/masster/study/plot.py +212 -98
  14. {masster-0.5.3 → masster-0.5.5}/src/masster/study/processing.py +0 -3
  15. {masster-0.5.3 → masster-0.5.5}/src/masster/wizard/wizard.py +13 -24
  16. {masster-0.5.3 → masster-0.5.5}/uv.lock +1 -1
  17. {masster-0.5.3 → masster-0.5.5}/.github/workflows/publish.yml +0 -0
  18. {masster-0.5.3 → masster-0.5.5}/.github/workflows/security.yml +0 -0
  19. {masster-0.5.3 → masster-0.5.5}/.github/workflows/test.yml +0 -0
  20. {masster-0.5.3 → masster-0.5.5}/.gitignore +0 -0
  21. {masster-0.5.3 → masster-0.5.5}/.pre-commit-config.yaml +0 -0
  22. {masster-0.5.3 → masster-0.5.5}/LICENSE +0 -0
  23. {masster-0.5.3 → masster-0.5.5}/Makefile +0 -0
  24. {masster-0.5.3 → masster-0.5.5}/README.md +0 -0
  25. {masster-0.5.3 → masster-0.5.5}/TESTING.md +0 -0
  26. {masster-0.5.3 → masster-0.5.5}/demo/example_batch_process.py +0 -0
  27. {masster-0.5.3 → masster-0.5.5}/demo/example_sample_process.py +0 -0
  28. {masster-0.5.3 → masster-0.5.5}/src/masster/__init__.py +0 -0
  29. {masster-0.5.3 → masster-0.5.5}/src/masster/chromatogram.py +0 -0
  30. {masster-0.5.3 → masster-0.5.5}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
  31. {masster-0.5.3 → masster-0.5.5}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
  32. {masster-0.5.3 → masster-0.5.5}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
  33. {masster-0.5.3 → masster-0.5.5}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
  34. {masster-0.5.3 → masster-0.5.5}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
  35. {masster-0.5.3 → masster-0.5.5}/src/masster/data/libs/aa.csv +0 -0
  36. {masster-0.5.3 → masster-0.5.5}/src/masster/data/libs/ccm.csv +0 -0
  37. {masster-0.5.3 → masster-0.5.5}/src/masster/data/libs/urine.csv +0 -0
  38. {masster-0.5.3 → masster-0.5.5}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  39. {masster-0.5.3 → masster-0.5.5}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  40. {masster-0.5.3 → masster-0.5.5}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  41. {masster-0.5.3 → masster-0.5.5}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  42. {masster-0.5.3 → masster-0.5.5}/src/masster/lib/__init__.py +0 -0
  43. {masster-0.5.3 → masster-0.5.5}/src/masster/lib/lib.py +0 -0
  44. {masster-0.5.3 → masster-0.5.5}/src/masster/logger.py +0 -0
  45. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/__init__.py +0 -0
  46. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/defaults/__init__.py +0 -0
  47. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/defaults/find_adducts_def.py +0 -0
  48. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/defaults/find_features_def.py +0 -0
  49. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/defaults/find_ms2_def.py +0 -0
  50. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
  51. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/defaults/sample_def.py +0 -0
  52. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/lib.py +0 -0
  53. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/parameters.py +0 -0
  54. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/plot.py +0 -0
  55. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/quant.py +0 -0
  56. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/sample5_schema.json +0 -0
  57. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/sciex.py +0 -0
  58. {masster-0.5.3 → masster-0.5.5}/src/masster/spectrum.py +0 -0
  59. {masster-0.5.3 → masster-0.5.5}/src/masster/study/__init__.py +0 -0
  60. {masster-0.5.3 → masster-0.5.5}/src/masster/study/analysis.py +0 -0
  61. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/__init__.py +0 -0
  62. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/align_def.py +0 -0
  63. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/export_def.py +0 -0
  64. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/fill_def.py +0 -0
  65. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/find_consensus_def.py +0 -0
  66. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/find_ms2_def.py +0 -0
  67. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/identify_def.py +0 -0
  68. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
  69. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/integrate_def.py +0 -0
  70. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/merge_def.py +0 -0
  71. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/study_def.py +0 -0
  72. {masster-0.5.3 → masster-0.5.5}/src/masster/study/export.py +0 -0
  73. {masster-0.5.3 → masster-0.5.5}/src/masster/study/id.py +0 -0
  74. {masster-0.5.3 → masster-0.5.5}/src/masster/study/load.py +0 -0
  75. {masster-0.5.3 → masster-0.5.5}/src/masster/study/merge.py +0 -0
  76. {masster-0.5.3 → masster-0.5.5}/src/masster/study/parameters.py +0 -0
  77. {masster-0.5.3 → masster-0.5.5}/src/masster/study/save.py +0 -0
  78. {masster-0.5.3 → masster-0.5.5}/src/masster/study/study.py +0 -0
  79. {masster-0.5.3 → masster-0.5.5}/src/masster/study/study5_schema.json +0 -0
  80. {masster-0.5.3 → masster-0.5.5}/src/masster/wizard/README.md +0 -0
  81. {masster-0.5.3 → masster-0.5.5}/src/masster/wizard/__init__.py +0 -0
  82. {masster-0.5.3 → masster-0.5.5}/src/masster/wizard/example.py +0 -0
  83. {masster-0.5.3 → masster-0.5.5}/tests/conftest.py +0 -0
  84. {masster-0.5.3 → masster-0.5.5}/tests/test_chromatogram.py +0 -0
  85. {masster-0.5.3 → masster-0.5.5}/tests/test_defaults.py +0 -0
  86. {masster-0.5.3 → masster-0.5.5}/tests/test_imports.py +0 -0
  87. {masster-0.5.3 → masster-0.5.5}/tests/test_integration.py +0 -0
  88. {masster-0.5.3 → masster-0.5.5}/tests/test_logger.py +0 -0
  89. {masster-0.5.3 → masster-0.5.5}/tests/test_parameters.py +0 -0
  90. {masster-0.5.3 → masster-0.5.5}/tests/test_sample.py +0 -0
  91. {masster-0.5.3 → masster-0.5.5}/tests/test_spectrum.py +0 -0
  92. {masster-0.5.3 → masster-0.5.5}/tests/test_study.py +0 -0
  93. {masster-0.5.3 → masster-0.5.5}/tests/test_version.py +0 -0
  94. {masster-0.5.3 → masster-0.5.5}/tox.ini +0 -0
--- masster-0.5.3/PKG-INFO
+++ masster-0.5.5/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.5.3
+Version: 0.5.5
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster
--- masster-0.5.3/pyproject.toml
+++ masster-0.5.5/pyproject.toml
@@ -1,7 +1,7 @@
 
 [project]
 name = "masster"
-version = "0.5.3"
+version = "0.5.5"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }
--- masster-0.5.3/src/masster/_version.py
+++ masster-0.5.5/src/masster/_version.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 
-__version__ = "0.5.4"
+__version__ = "0.5.5"
 
 
 def get_version():
--- masster-0.5.3/src/masster/sample/adducts.py
+++ masster-0.5.5/src/masster/sample/adducts.py
@@ -473,7 +473,7 @@ def find_adducts(self, **kwargs):
     self.logger.debug(f"Min probability threshold: {min_probability}")
 
     # Generate comprehensive adduct specifications using the Sample method
-    adducts_df = self._get_adducts(
+    adducts_df = _get_adducts(self,
         adducts_list=adducts_list,
         charge_min=charge_min,
         charge_max=charge_max,
--- masster-0.5.3/src/masster/sample/h5.py
+++ masster-0.5.5/src/masster/sample/h5.py
@@ -62,8 +62,8 @@ def _save_sample5(
         return
 
     # synchronize feature_map if it exists
-    if hasattr(self, "_feature_map") and self._feature_map is not None:
-        self._features_sync()
+    #if hasattr(self, "_feature_map") and self._feature_map is not None:
+    #    self._features_sync()
 
     # if no extension is given, add .sample5
     if not filename.endswith(".sample5"):
@@ -1057,15 +1057,15 @@ def _load_sample5(self, filename: str, map: bool = False):
     # Parameters are now loaded from metadata JSON (see above)
     # Lib and lib_match are no longer saved/loaded
 
-    if map:
-        featureXML = filename.replace(".sample5", ".featureXML")
-        if os.path.exists(featureXML):
-            self._load_featureXML(featureXML)
-            self._features_sync()
-        else:
-            self.logger.warning(
-                f"Feature XML file {featureXML} not found, skipping loading.",
-            )
+    #if map:
+    #    featureXML = filename.replace(".sample5", ".featureXML")
+    #    if os.path.exists(featureXML):
+    #        self._load_featureXML(featureXML)
+    #        #self._features_sync()
+    #    else:
+    #        self.logger.warning(
+    #            f"Feature XML file {featureXML} not found, skipping loading.",
+    #        )
 
     # set self.file_path to *.sample5
     self.file_path = filename
--- masster-0.5.3/src/masster/sample/helpers.py
+++ masster-0.5.5/src/masster/sample/helpers.py
@@ -569,7 +569,7 @@ def select(
     self.logger.info(f"Selected features. Features remaining: {len(feats)}")
     return feats
 
-
+'''
 def _features_sync(self):
     """
     Synchronizes the cached FeatureMap with features_df.
@@ -675,7 +675,7 @@ def _features_sync(self):
             self.logger.warning("PyOpenMS not available, cannot sync FeatureMap")
     except Exception as e:
         self.logger.error(f"Error during feature synchronization: {e}")
-
+'''
 
 def features_delete(self, features: list | None = None):
     """
--- masster-0.5.3/src/masster/sample/load.py
+++ masster-0.5.5/src/masster/sample/load.py
@@ -46,6 +46,7 @@ import polars as pl
 from tqdm import tqdm
 
 from masster.chromatogram import Chromatogram
+from .h5 import _load_sample5
 from masster.spectrum import Spectrum
 
 # Suppress pyOpenMS warnings globally
@@ -96,13 +97,13 @@ def load(
 
     # check if file is mzML
     if filename.lower().endswith(".mzml"):
-        self._load_mzML(filename)
+        _load_mzML(self, filename)
     elif filename.lower().endswith(".wiff") or filename.lower().endswith(".wiff2"):
-        self._load_wiff(filename)
+        _load_wiff(self, filename)
     elif filename.lower().endswith(".raw"):
-        self._load_raw(filename)
+        _load_raw(self, filename)
     elif filename.lower().endswith(".sample5"):
-        self._load_sample5(filename)
+        _load_sample5(self, filename)
     # elif filename.lower().endswith(".h5"):
     #     self._load_h5(filename)
     else:
--- masster-0.5.3/src/masster/sample/processing.py
+++ masster-0.5.5/src/masster/sample/processing.py
@@ -795,7 +795,7 @@ def find_features(self, **kwargs):
     )
 
     self.features_df = df
-    self._features_sync()
+    #self._features_sync()
     self.logger.info(f"Feature detection completed. Total features: {len(df)}")
 
     # store params
--- masster-0.5.3/src/masster/sample/sample.py
+++ masster-0.5.5/src/masster/sample/sample.py
@@ -48,9 +48,9 @@ from masster.sample.defaults.find_ms2_def import find_ms2_defaults
 from masster.sample.defaults.get_spectrum_def import get_spectrum_defaults
 
 # Sample-specific imports - keeping these private, only for internal use
-# from masster.sample.h5 import _load_sample5
+from masster.sample.h5 import _load_sample5
 # from masster.sample.h5 import _load_sample5_study
-# from masster.sample.h5 import _save_sample5
+from masster.sample.h5 import _save_sample5
 # from masster.sample.helpers import _delete_ms2
 from masster.sample.helpers import _estimate_memory_usage
 from masster.sample.helpers import _get_scan_uids
@@ -263,12 +263,16 @@ class Sample:
     _get_feature_map = _get_feature_map
 
     # Additional method assignments for all imported functions
-    # Removed internal-only methods: _load_sample5, _load_sample5_study, _save_sample5, _delete_ms2, _features_sync
+    # Removed internal-only methods: _load_sample5_study, _delete_ms2, _features_sync
     _estimate_memory_usage = _estimate_memory_usage
     _get_scan_uids = _get_scan_uids
     _get_feature_uids = _get_feature_uids
     features_delete = features_delete
     features_filter = features_filter
+    _save_sample5 = _save_sample5
+    _load_sample5 = _load_sample5
+
+
     # Removed internal-only load methods: _load_featureXML, _load_ms2data, _load_mzML, _load_raw, _load_wiff
     chrom_extract = chrom_extract
     _index_file = _index_file  # Renamed from index_file to be internal-only
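
The adducts.py, load.py, and sample.py hunks above are one refactor: private helpers like `_load_sample5` and `_save_sample5` become module-level functions that take the sample as an explicit first argument, and sample.py re-attaches them as class attributes. A minimal sketch of that binding pattern (the class and loader below are simplified stand-ins, not masster's actual code):

```python
# Minimal sketch: a module-level helper bound as a method via a class attribute.
# `Sample` and `_load_sample5` here are simplified stand-ins, not masster's code.

def _load_sample5(self, filename: str) -> None:
    # Takes the instance explicitly, so it works both as a plain function call
    # (_load_sample5(obj, path)) and as a bound method (obj._load_sample5(path)).
    self.file_path = filename


class Sample:
    # A function assigned as a class attribute becomes a normal method:
    # Python's descriptor protocol binds `self` on attribute access.
    _load_sample5 = _load_sample5

    def load(self, filename: str) -> None:
        # Mirrors the load.py dispatch style after the refactor: explicit self.
        if filename.lower().endswith(".sample5"):
            _load_sample5(self, filename)
        else:
            raise ValueError(f"Unsupported file type: {filename}")


s = Sample()
s.load("run01.sample5")            # function-style call inside load()
s._load_sample5("run01.sample5")   # method-style call via the class attribute
print(s.file_path)                 # run01.sample5
```

Because functions are descriptors, both call styles stay equivalent, which lets the same helper serve the functional call sites in load.py and the method-style API on Sample.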
--- masster-0.5.3/src/masster/sample/save.py
+++ masster-0.5.5/src/masster/sample/save.py
@@ -411,6 +411,11 @@ def export_mgf(
         rt_str = f"{rt:.2f}"
         mz_str = f"{mz:.4f}"
 
+        # Initialize charge for this feature
+        charge = preferred_charge
+        if row["charge"] is not None and row["charge"] != 0:
+            charge = row["charge"]
+
         # Skip features without MS2 data (unless include_all_ms1 is True, but we already handled MS1 above)
         if row["ms2_scans"] is None:
            skip = skip + 1
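
The save.py hunk adds a per-feature charge fallback before the MS2 check. A hedged sketch of the logic in isolation (`preferred_charge` and the row mapping are modeled on the hunk, not taken from masster's API):

```python
# Sketch of the charge fallback added to export_mgf: use the feature's own
# charge when it is set and nonzero, otherwise a caller-supplied default.
def resolve_charge(row: dict, preferred_charge: int = 1) -> int:
    charge = preferred_charge
    if row.get("charge") is not None and row.get("charge") != 0:
        charge = row["charge"]
    return charge

print(resolve_charge({"charge": 2}))     # 2 -> feature charge wins
print(resolve_charge({"charge": 0}))     # 1 -> zero means unknown, use default
print(resolve_charge({"charge": None}))  # 1 -> missing, use default
```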
--- masster-0.5.3/src/masster/study/h5.py
+++ masster-0.5.5/src/masster/study/h5.py
@@ -304,6 +304,30 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                     serialized_chunk.append(item.to_json())
                 else:
                     serialized_chunk.append("None")
+        elif col_name == "iso":
+            # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+            for item in chunk_data:
+                if item is not None:
+                    try:
+                        # Convert numpy array to nested list for JSON serialization
+                        serialized_chunk.append(json.dumps(item.tolist()))
+                    except (AttributeError, TypeError):
+                        # Fallback for non-numpy data
+                        serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+                else:
+                    serialized_chunk.append("None")
+        elif col_name == "ms1_spec":
+            # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+            for item in chunk_data:
+                if item is not None:
+                    try:
+                        # Convert numpy array to nested list for JSON serialization
+                        serialized_chunk.append(json.dumps(item.tolist()))
+                    except (AttributeError, TypeError):
+                        # Fallback for non-numpy data
+                        serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+                else:
+                    serialized_chunk.append("None")
         else:
             logger.warning(
                 f"Unknown object column '{col_name}', using default serialization",
@@ -564,6 +588,34 @@ def _save_dataframe_column_legacy(
             else:
                 data_as_str.append("None")
         group.create_dataset(col, data=data_as_str, compression=compression)
+    elif col == "iso":
+        # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+        data_as_json_strings = []
+        for item in data:
+            if item is not None:
+                try:
+                    # Convert numpy array to nested list for JSON serialization
+                    data_as_json_strings.append(json.dumps(item.tolist()))
+                except (AttributeError, TypeError):
+                    # Fallback for non-numpy data
+                    data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+            else:
+                data_as_json_strings.append("None")
+        group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
+    elif col == "ms1_spec":
+        # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+        data_as_json_strings = []
+        for item in data:
+            if item is not None:
+                try:
+                    # Convert numpy array to nested list for JSON serialization
+                    data_as_json_strings.append(json.dumps(item.tolist()))
+                except (AttributeError, TypeError):
+                    # Fallback for non-numpy data
+                    data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+            else:
+                data_as_json_strings.append("None")
+        group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
     else:
         logger.warning(
             f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column.",
@@ -666,6 +718,24 @@ def _reconstruct_object_column(data_col, col_name: str):
                 },
             )
             reconstructed_data.append(converted_adducts)
+        elif col_name == "iso":
+            # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+            try:
+                import numpy as np
+                iso_data = json.loads(item)
+                # Convert back to numpy array
+                reconstructed_data.append(np.array(iso_data) if iso_data else None)
+            except (json.JSONDecodeError, ValueError, ImportError):
+                reconstructed_data.append(None)
+        elif col_name == "ms1_spec":
+            # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+            try:
+                import numpy as np
+                ms1_spec_data = json.loads(item)
+                # Convert back to numpy array
+                reconstructed_data.append(np.array(ms1_spec_data) if ms1_spec_data else None)
+            except (json.JSONDecodeError, ValueError, ImportError):
+                reconstructed_data.append(None)
         else:
             # Unknown object column
             reconstructed_data.append(None)
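
The three study/h5.py hunks implement one round-trip: `iso` and `ms1_spec` columns hold numpy `[mz, intensity]` arrays, which HDF5 string datasets cannot store directly, so they are serialized to JSON text on save and rebuilt with `np.array` on load. A self-contained sketch of that round-trip (the file and dataset names are illustrative, not masster's actual schema):

```python
import json

import h5py
import numpy as np

# Save: numpy arrays -> JSON strings in an HDF5 string dataset ("None" marks nulls).
iso = [np.array([[100.05, 1.0e5], [101.05, 3.0e4]]), None]
with h5py.File("demo.h5", "w") as f:
    encoded = [json.dumps(a.tolist()) if a is not None else "None" for a in iso]
    f.create_dataset("iso", data=encoded, compression="gzip")

# Load: JSON strings -> numpy arrays again.
with h5py.File("demo.h5", "r") as f:
    decoded = []
    for item in f["iso"].asstr()[...]:
        if item == "None":
            decoded.append(None)
        else:
            data = json.loads(item)
            decoded.append(np.array(data) if data else None)

assert np.allclose(decoded[0], iso[0]) and decoded[1] is None
```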
--- masster-0.5.3/src/masster/study/helpers.py
+++ masster-0.5.5/src/masster/study/helpers.py
@@ -500,7 +500,7 @@ def align_reset(self):
 # TODO I don't get this param
 def get_consensus(self, quant="chrom_area"):
     if self.consensus_df is None:
-        self.logger.error("No consensus map found.")
+        self.logger.error("No consensus found.")
         return None
 
     # Convert Polars DataFrame to pandas for this operation since the result is used for export
@@ -613,7 +613,7 @@ def get_gaps_matrix(self, uids=None, samples=None):
     import polars as pl
 
     if self.consensus_df is None or self.consensus_df.is_empty():
-        self.logger.error("No consensus map found.")
+        self.logger.error("No consensus found.")
         return None
 
     if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
--- masster-0.5.3/src/masster/study/plot.py
+++ masster-0.5.5/src/masster/study/plot.py
@@ -564,6 +564,10 @@ def plot_consensus_2d(
     Parameters:
         filename (str, optional): Path to save the plot
         colorby (str): Column name to use for color mapping (default: "number_samples")
+            Automatically detects if column contains categorical (string) or
+            numeric data and applies appropriate color mapping:
+            - Categorical: Uses factor_cmap with distinct colors and legend
+            - Numeric: Uses LinearColorMapper with continuous colorbar
         sizeby (str): Column name to use for size mapping (default: "inty_mean")
         markersize (int): Base marker size (default: 6)
         scaling (str): Controls whether points scale with zoom. Options:
@@ -603,7 +607,7 @@ def plot_consensus_2d(
         pl.when(
             (pl.col(sizeby).is_not_null()) & (pl.col(sizeby).is_finite()) & (pl.col(sizeby) > 0),
         )
-        .then((pl.col(sizeby).log10() * markersize / 12).pow(2))
+        .then((pl.col(sizeby).log10() * markersize / 12).pow(1.5))
         .otherwise(markersize)
         .alias("markersize"),
     ])
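
The exponent change from `pow(2)` to `pow(1.5)` compresses the size gap between weak and strong features. Plugging the hunk's formula into a quick check (markersize = 6, intensities spanning four decades):

```python
import math

# Marker size per the hunk: (log10(intensity) * markersize / 12) ** p
markersize = 6
for inty in (1e4, 1e6, 1e8):
    base = math.log10(inty) * markersize / 12
    print(f"inty={inty:.0e}  pow2={base**2:5.2f}  pow1.5={base**1.5:5.2f}")
# pow(2) spreads sizes 4.00 -> 16.00 (4x); pow(1.5) only 2.83 -> 8.00 (~2.8x)
```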
@@ -645,12 +649,13 @@ def plot_consensus_2d(
     from bokeh.models import HoverTool
     from bokeh.models import LinearColorMapper
     from bokeh.io.export import export_png
+    from bokeh.transform import factor_cmap
 
     try:
         from bokeh.models import ColorBar  # type: ignore[attr-defined]
     except ImportError:
         from bokeh.models.annotations import ColorBar
-    from bokeh.palettes import viridis
+    from bokeh.palettes import viridis, Category20
 
     # Import cmap for colormap handling
     from cmap import Colormap
@@ -695,61 +700,144 @@ def plot_consensus_2d(
         self.logger.warning(f"Could not interpret colormap '{cmap}': {e}, falling back to viridis")
         palette = viridis(256)
 
-    color_mapper = LinearColorMapper(
-        palette=palette,
-        low=data[colorby].min(),
-        high=data[colorby].max(),
+    # Check if colorby column contains categorical data (string/object)
+    colorby_values = data[colorby].to_list()
+    is_categorical = (
+        data_pd[colorby].dtype in ["object", "string", "category"] or
+        isinstance(colorby_values[0], str) if colorby_values else False
     )
+
+    if is_categorical:
+        # Handle categorical coloring
+        # Use natural order of unique values - don't sort to preserve correct legend mapping
+        # Sorting would break the correspondence between legend labels and point colors
+        unique_values = [v for v in data_pd[colorby].unique() if v is not None]
+
+        if len(unique_values) <= 20:
+            palette = Category20[min(20, max(3, len(unique_values)))]
+        else:
+            # For many categories, use a subset of the viridis palette
+            palette = viridis(min(256, len(unique_values)))
+
+        color_mapper = factor_cmap(colorby, palette, unique_values)
+    else:
+        # Handle numeric coloring with LinearColorMapper
+        color_mapper = LinearColorMapper(
+            palette=palette,
+            low=data[colorby].min(),
+            high=data[colorby].max(),
+        )
     # scatter plot rt vs mz
     p = bp.figure(
         width=width,
         height=height,
-        title="Consensus map",
+        title=f"Consensus features, colored by {colorby}",
     )
-    p.xaxis.axis_label = "Retention Time (min)"
-    p.yaxis.axis_label = "m/z"
+    p.xaxis.axis_label = "RT [s]"
+    p.yaxis.axis_label = "m/z [Th]"
     scatter_renderer: Any = None
-    if scaling.lower() in ["dyn", "dynamic"]:
-        # Calculate appropriate radius for dynamic scaling based on data range
-        rt_range = data["rt"].max() - data["rt"].min()
-        mz_range = data["mz"].max() - data["mz"].min()
-        # Use a fraction of the smaller dimension for radius, similar to sample plotting
-        dynamic_radius = min(rt_range, mz_range) * 0.0005 * markersize
+    if is_categorical:
+        # For categorical data, create separate renderers for each category
+        # This enables proper legend interactivity where each category can be toggled independently
+        unique_values = [v for v in data_pd[colorby].unique() if v is not None]
+
+        if len(unique_values) <= 20:
+            palette = Category20[min(20, max(3, len(unique_values)))]
+        else:
+            palette = viridis(min(256, len(unique_values)))
+
+        # Create a separate renderer for each category
+        for i, category in enumerate(unique_values):
+            # Filter data for this category
+            category_data = data.filter(pl.col(colorby) == category)
+            category_data_pd = category_data.to_pandas()
+            category_source = bp.ColumnDataSource(category_data_pd)
+
+            color = palette[i % len(palette)]
+
+            if scaling.lower() in ["dyn", "dynamic"]:
+                # Calculate appropriate radius for dynamic scaling
+                rt_range = data["rt"].max() - data["rt"].min()
+                mz_range = data["mz"].max() - data["mz"].min()
+                dynamic_radius = min(rt_range, mz_range) * 0.0005 * markersize
+
+                renderer = p.circle(
+                    x="rt",
+                    y="mz",
+                    radius=dynamic_radius,
+                    fill_color=color,
+                    line_color=None,
+                    alpha=alpha,
+                    source=category_source,
+                    legend_label=str(category),
+                )
+            else:
+                renderer = p.scatter(
+                    x="rt",
+                    y="mz",
+                    size="markersize",
+                    fill_color=color,
+                    line_color=None,
+                    alpha=alpha,
+                    source=category_source,
+                    legend_label=str(category),
+                )
+
+        # No single scatter_renderer for categorical data
+        scatter_renderer = None
 
-        scatter_renderer = p.circle(
-            x="rt",
-            y="mz",
-            radius=dynamic_radius,
-            fill_color={"field": colorby, "transform": color_mapper},
-            line_color=None,
-            alpha=alpha,
-            source=source,
-        )
     else:
-        scatter_renderer = p.scatter(
-            x="rt",
-            y="mz",
-            size="markersize",
-            fill_color={"field": colorby, "transform": color_mapper},
-            line_color=None,
-            alpha=alpha,
-            source=source,
-        )
+        # Handle numeric coloring - single renderer with color mapping
+        if scaling.lower() in ["dyn", "dynamic"]:
+            # Calculate appropriate radius for dynamic scaling
+            rt_range = data["rt"].max() - data["rt"].min()
+            mz_range = data["mz"].max() - data["mz"].min()
+            dynamic_radius = min(rt_range, mz_range) * 0.0005 * markersize
+
+            scatter_renderer = p.circle(
+                x="rt",
+                y="mz",
+                radius=dynamic_radius,
+                fill_color={"field": colorby, "transform": color_mapper},
+                line_color=None,
+                alpha=alpha,
+                source=source,
+            )
+        else:
+            scatter_renderer = p.scatter(
+                x="rt",
+                y="mz",
+                size="markersize",
+                fill_color={"field": colorby, "transform": color_mapper},
+                line_color=None,
+                alpha=alpha,
+                source=source,
+            )
     # add hover tool
-    # Start with base tooltips
+    # Start with base tooltips - rt and mz moved to top, removed consensus_id and iso_mean
     tooltips = [
+        ("rt", "@rt"),
+        ("mz", "@mz"),
         ("consensus_uid", "@consensus_uid"),
-        ("consensus_id", "@consensus_id"),
         ("number_samples", "@number_samples"),
         ("number_ms2", "@number_ms2"),
-        ("rt", "@rt"),
-        ("mz", "@mz"),
         ("inty_mean", "@inty_mean"),
-        ("iso_mean", "@iso_mean"),
         ("coherence_mean", "@chrom_coherence_mean"),
         ("prominence_scaled_mean", "@chrom_prominence_scaled_mean"),
     ]
 
+    # Add adduct_top if it exists in data
+    if "adduct_top" in data.columns:
+        tooltips.append(("adduct_top", "@adduct_top"))
+
+    # Add id_top_name if it exists in data
+    if "id_top_name" in data.columns:
+        tooltips.append(("id_top_name", "@id_top_name"))
+
+    # Add id_top_adduct if it exists in data
+    if "id_top_adduct" in data.columns:
+        tooltips.append(("id_top_adduct", "@id_top_adduct"))
+
     # Add id_top_* columns if they exist and have non-null values
     id_top_columns = ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]
     for col in id_top_columns:
@@ -764,19 +852,28 @@ def plot_consensus_2d(
 
     hover = HoverTool(
         tooltips=tooltips,
-        renderers=[scatter_renderer],
     )
+    # For categorical data, hover will work on all renderers automatically
+    # For numeric data, specify the single renderer
+    if not is_categorical and scatter_renderer:
+        hover.renderers = [scatter_renderer]
+
     p.add_tools(hover)
 
-    # add colorbar
-    color_bar = ColorBar(
-        color_mapper=color_mapper,
-        label_standoff=12,
-        location=(0, 0),
-        title=colorby,
-        ticker=BasicTicker(desired_num_ticks=8),
-    )
-    p.add_layout(color_bar, "right")
+    # add colorbar only for numeric data (LinearColorMapper)
+    if not is_categorical:
+        color_bar = ColorBar(
+            color_mapper=color_mapper,
+            label_standoff=12,
+            location=(0, 0),
+            title=colorby,
+            ticker=BasicTicker(desired_num_ticks=8),
+        )
+        p.add_layout(color_bar, "right")
+    else:
+        # For categorical data, configure the legend that was automatically created
+        p.legend.location = "top_right"
+        p.legend.click_policy = "hide"
 
     if filename is not None:
         # Convert relative paths to absolute paths using study folder as base
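
Together, the plot_consensus_2d hunks split rendering into two paths: string-typed `colorby` columns get one renderer per category with a click-to-hide legend, while numeric columns keep a single renderer with a `LinearColorMapper` and colorbar. A self-contained toy version of that branch (made-up data; the bokeh calls mirror the hunks):

```python
import bokeh.plotting as bp
from bokeh.models import ColorBar, LinearColorMapper
from bokeh.palettes import Category20, viridis

# Toy data standing in for the consensus table; try colorby = "inty" for numeric.
data = {"rt": [60, 120, 180], "mz": [150.0, 300.5, 450.2],
        "label": ["QC", "blank", "QC"], "inty": [1e5, 3e4, 8e5]}
colorby = "label"

p = bp.figure(width=400, height=300, title=f"colored by {colorby}")
if isinstance(data[colorby][0], str):
    # Categorical: one renderer per category so legend entries toggle independently
    categories = list(dict.fromkeys(data[colorby]))
    palette = Category20[max(3, len(categories))]
    for i, cat in enumerate(categories):
        idx = [j for j, v in enumerate(data[colorby]) if v == cat]
        p.scatter([data["rt"][j] for j in idx], [data["mz"][j] for j in idx],
                  size=8, color=palette[i], legend_label=cat)
    p.legend.click_policy = "hide"
else:
    # Numeric: single renderer with a continuous color mapping plus a colorbar
    mapper = LinearColorMapper(palette=viridis(256),
                               low=min(data[colorby]), high=max(data[colorby]))
    src = bp.ColumnDataSource(data)
    p.scatter("rt", "mz", size=8, source=src,
              fill_color={"field": colorby, "transform": mapper})
    p.add_layout(ColorBar(color_mapper=mapper, title=colorby), "right")
bp.save(p, "toy_consensus.html")
```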
@@ -1421,46 +1518,60 @@ def plot_rt_correction(
     p.xaxis.axis_label = f"Retention Time ({rt_unit})"
     p.yaxis.axis_label = "RT - RT_original (s)"
 
-    samples_info = None
+    # Create sample name lookup dictionary from samples_df (all in Polars)
+    sample_names_dict = {}
     if hasattr(self, "samples_df") and self.samples_df is not None:
         try:
-            samples_info = self.samples_df.to_pandas()
+            sample_name_mapping = (
+                self.samples_df
+                .filter(pl.col("sample_uid").is_in(sample_uids))
+                .select(["sample_uid", "sample_name"])
+            )
+            sample_names_dict = dict(zip(
+                sample_name_mapping["sample_uid"].to_list(),
+                sample_name_mapping["sample_name"].to_list()
+            ))
         except Exception:
-            samples_info = None
+            pass
 
     renderers = []
 
-    # Iterate samples and build curves
-    for uid in sample_uids:
-        # Select features belonging to this sample
-        try:
-            if "sample_uid" in self.features_df.columns:
-                sample_feats = self.features_df.filter(pl.col("sample_uid") == uid)
-            elif "sample_name" in self.features_df.columns:
-                sample_feats = self.features_df.filter(pl.col("sample_name") == uid)
-            else:
-                self.logger.debug("No sample identifier column in features_df; skipping sample filtering")
-                continue
-        except Exception as e:
-            self.logger.debug(f"Error filtering features for sample {uid}: {e}")
-            continue
+    # Check sample identifier column
+    if "sample_uid" not in self.features_df.columns:
+        if "sample_name" in self.features_df.columns:
+            sample_id_col = "sample_name"
+        else:
+            self.logger.debug("No sample identifier column in features_df")
+            return
+    else:
+        sample_id_col = "sample_uid"
 
-        if sample_feats.is_empty():
-            continue
+    # OPTIMIZED: Filter once, group once instead of per-sample filtering
+    try:
+        # Filter all data once for selected samples and required conditions
+        all_sample_feats = self.features_df.filter(
+            pl.col(sample_id_col).is_in(sample_uids)
+        )
+
+        if all_sample_feats.is_empty():
+            self.logger.warning("No features found for the selected samples.")
+            return
 
-        # Filter to only use features with filled==False
-        if "filled" in sample_feats.columns:
-            sample_feats = sample_feats.filter(~pl.col("filled"))
-            if sample_feats.is_empty():
-                continue
+        # Filter to only use features with filled==False if column exists
+        if "filled" in all_sample_feats.columns:
+            all_sample_feats = all_sample_feats.filter(~pl.col("filled"))
+            if all_sample_feats.is_empty():
+                self.logger.warning("No non-filled features found for the selected samples.")
+                return
 
-        # Stay in Polars - much faster than pandas conversion!
-        if "rt" not in sample_feats.columns or "rt_original" not in sample_feats.columns:
-            continue
+        # Check required columns
+        if "rt" not in all_sample_feats.columns or "rt_original" not in all_sample_feats.columns:
+            self.logger.error("Required columns 'rt' or 'rt_original' not found in features_df.")
+            return
 
-        # Filter nulls and add delta column in Polars
-        sample_feats = (
-            sample_feats
+        # Filter nulls, add delta column, and sort - all in one operation
+        all_sample_feats = (
+            all_sample_feats
             .filter(
                 pl.col("rt").is_not_null() &
                 pl.col("rt_original").is_not_null()
@@ -1468,33 +1579,36 @@ def plot_rt_correction(
             .with_columns([
                 (pl.col("rt") - pl.col("rt_original")).alias("delta")
             ])
-            .sort("rt")
+            .sort([sample_id_col, "rt"])
         )
 
-        if sample_feats.is_empty():
-            continue
+        if all_sample_feats.is_empty():
+            self.logger.warning("No valid RT data found for the selected samples.")
+            return
 
-        # Extract arrays directly from Polars
-        rt = sample_feats["rt"].to_numpy()
-        delta = sample_feats["delta"].to_numpy()
+        # Group by sample and process each group (much faster than individual filtering)
+        for (sample_uid,), sample_group in all_sample_feats.group_by(sample_id_col):
+            if sample_group.is_empty():
+                continue
 
-        sample_name = str(uid)
-        if samples_info is not None:
-            try:
-                row = samples_info[samples_info["sample_uid"] == uid]
-                if not row.empty:
-                    sample_name = row.iloc[0].get("sample_name", sample_name)
-            except Exception:
-                pass
+            # Extract arrays directly from Polars
+            rt = sample_group["rt"].to_numpy()
+            delta = sample_group["delta"].to_numpy()
 
-        color = color_map.get(uid, "#000000")
+            # Get sample name efficiently from pre-built dictionary
+            sample_name = sample_names_dict.get(sample_uid, str(sample_uid))
+            color = color_map.get(sample_uid, "#000000")
 
-        data = {"rt": rt, "delta": delta, "sample": [sample_name] * len(rt), "sample_color": [color] * len(rt)}
-        src = ColumnDataSource(data)
+            data = {"rt": rt, "delta": delta, "sample": [sample_name] * len(rt), "sample_color": [color] * len(rt)}
+            src = ColumnDataSource(data)
 
-        r_line = p.line("rt", "delta", source=src, line_width=1, color=color)
-        p.scatter("rt", "delta", source=src, size=2, color=color, alpha=0.6)
-        renderers.append(r_line)
+            r_line = p.line("rt", "delta", source=src, line_width=1, color=color)
+            p.scatter("rt", "delta", source=src, size=2, color=color, alpha=0.6)
+            renderers.append(r_line)
+
+    except Exception as e:
+        self.logger.error(f"Error in optimized RT correction plotting: {e}")
+        return
 
     if not renderers:
         self.logger.warning("No RT correction curves to plot for the selected samples.")
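
The plot_rt_correction rewrite replaces N per-sample `filter` passes with one `is_in` filter plus a single `group_by`, so the feature table is scanned once instead of once per sample. A minimal polars sketch of the new shape (toy data; column names follow the hunk):

```python
import polars as pl

# Toy feature table with the columns the hunk relies on.
df = pl.DataFrame({
    "sample_uid": ["s1", "s1", "s2", "s2"],
    "rt": [10.2, 20.5, 10.4, 20.1],
    "rt_original": [10.0, 20.0, 10.5, 20.3],
})
selected = ["s1", "s2"]

# Old shape: for uid in selected: df.filter(pl.col("sample_uid") == uid) -> N scans
# New shape: one filter + one group_by -> a single scan over the data
feats = (
    df.filter(pl.col("sample_uid").is_in(selected))
      .with_columns((pl.col("rt") - pl.col("rt_original")).alias("delta"))
      .sort(["sample_uid", "rt"])
)
for (uid,), group in feats.group_by("sample_uid", maintain_order=True):
    print(uid, group["delta"].to_list())
```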
--- masster-0.5.3/src/masster/study/processing.py
+++ masster-0.5.5/src/masster/study/processing.py
@@ -341,9 +341,6 @@ def _integrate_chrom_impl(self, **kwargs):
     uids = params.get("uids")
     rt_tol = params.get("rt_tol")
 
-    if self.consensus_map is None:
-        self.logger.error("No consensus map found.")
-        return
     if uids is None:
         # get all consensus_id from consensus_df
         ids = self.consensus_df["consensus_uid"].to_list()
--- masster-0.5.3/src/masster/wizard/wizard.py
+++ masster-0.5.5/src/masster/wizard/wizard.py
@@ -455,6 +455,9 @@ class Wizard:
         params_lines.append('    # === Processing Parameters ===')
         params_lines.append(f'    "adducts": {params_dict.get("adducts", [])!r},  # Adduct specifications for feature detection and annotation')
         params_lines.append(f'    "detector_type": {params_dict.get("detector_type", "unknown")!r},  # MS detector type ("orbitrap", "tof", "unknown")')
+        params_lines.append(f'    "noise": {params_dict.get("noise", 50.0)},  # Noise threshold for feature detection')
+        params_lines.append(f'    "chrom_fwhm": {params_dict.get("chrom_fwhm", 0.5)},  # Chromatographic peak full width at half maximum (seconds)')
+        params_lines.append(f'    "chrom_peak_snr": {params_dict.get("chrom_peak_snr", 5.0)},  # Minimum signal-to-noise ratio for chromatographic peaks')
         params_lines.append('')
 
         # Alignment & Merging
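
The Wizard hunks extend a generated-script builder: the wizard writes a standalone processing script line by line, embedding parameter values with `!r`/`repr` so strings stay quoted in the output. A small sketch of that technique (keys mirror the hunk; values are illustrative defaults, not masster's):

```python
# Build a runnable PARAMS block as text, the way the Wizard hunks do.
params_dict = {"detector_type": "orbitrap", "noise": 50.0, "chrom_fwhm": 0.5}

params_lines = ["PARAMS = {"]
params_lines.append(f'    "detector_type": {params_dict.get("detector_type", "unknown")!r},  # MS detector type')
params_lines.append(f'    "noise": {params_dict.get("noise", 50.0)},  # Noise threshold')
params_lines.append(f'    "chrom_fwhm": {params_dict.get("chrom_fwhm", 0.5)},  # Peak FWHM (s)')
params_lines.append("}")

print("\n".join(params_lines))
# PARAMS = {
#     "detector_type": 'orbitrap',  # MS detector type
#     "noise": 50.0,  # Noise threshold
#     "chrom_fwhm": 0.5,  # Peak FWHM (s)
# }
```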
@@ -643,6 +646,7 @@ class Wizard:
             '    # Step 4: Add sample5 files to study',
             '    print("\\nStep 4/7: Adding samples to study...")',
             '    study.add(str(Path(PARAMS[\'folder\']) / "*.sample5"))',
+            '    study.features_filter(study.features_select(chrom_coherence=0.1, chrom_prominence_scaled=1))',
             '    ',
             '    # Step 5: Core processing',
             '    print("\\nStep 5/7: Processing...")',
@@ -651,29 +655,14 @@ class Wizard:
             '        rt_tol=PARAMS[\'rt_tol\']',
             '    )',
             '    ',
-            '    # Merge and create consensus features',
-            '    # Use optimized method for large datasets (>500 samples)',
-            '    num_samples = len(study.samples)',
-            '    if num_samples > 500:',
-            '        print(f"  Large dataset detected ({num_samples} samples), using optimized qt_chunked + hierarchical method")',
-            '        study.merge(',
-            '            method="qt_chunked",',
-            '            dechunking="hierarchical",',
-            '            min_samples=PARAMS[\'min_samples_per_feature\'],',
-            '            threads=PARAMS[\'num_cores\'],',
-            '            rt_tol=PARAMS[\'rt_tol\'],',
-            '            mz_tol=PARAMS[\'mz_tol\']',
-            '        )',
-            '    else:',
-            '        print(f"  Using standard merge method for {num_samples} samples")',
-            '        study.merge(',
-            '            min_samples=PARAMS[\'min_samples_per_feature\'],',
-            '            threads=PARAMS[\'num_cores\'],',
-            '            rt_tol=PARAMS[\'rt_tol\'],',
-            '            mz_tol=PARAMS[\'mz_tol\']',
-            '        )',
+            '    study.merge(',
+            '        method="qt",',
+            '        min_samples=PARAMS[\'min_samples_per_feature\'],',
+            '        threads=PARAMS[\'num_cores\'],',
+            '        rt_tol=PARAMS[\'rt_tol\'],'
+            '    )',
             '    study.find_iso()',
-            '    study.fill(min_samples_rel=0.0)',
+            '    study.fill()',
             '    study.integrate()',
             '    ',
             '    # Step 6/7: Saving results',
@@ -689,8 +678,8 @@ class Wizard:
             '    study.plot_consensus_2d(filename="consensus.png")',
             '    study.plot_alignment(filename="alignment.html")',
             '    study.plot_alignment(filename="alignment.png")',
-            '    study.plot_pca(filename="pca.html")',
-            '    study.plot_pca(filename="pca.png")',
+            '    study.plot_samples_pca(filename="pca.html")',
+            '    study.plot_samples_pca(filename="pca.png")',
             '    study.plot_bpc(filename="bpc.html")',
             '    study.plot_bpc(filename="bpc.png")',
             '    study.plot_rt_correction(filename="rt_correction.html")',
--- masster-0.5.3/uv.lock
+++ masster-0.5.5/uv.lock
@@ -1393,7 +1393,7 @@ wheels = [
 
 [[package]]
 name = "masster"
-version = "0.5.3"
+version = "0.5.5"
 source = { editable = "." }
 dependencies = [
     { name = "alpharaw" },