masster 0.5.1.tar.gz → 0.5.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

Files changed (95)
  1. {masster-0.5.1 → masster-0.5.4}/PKG-INFO +1 -1
  2. {masster-0.5.1 → masster-0.5.4}/pyproject.toml +1 -1
  3. {masster-0.5.1 → masster-0.5.4}/src/masster/_version.py +1 -1
  4. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/adducts.py +1 -1
  5. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/h5.py +11 -11
  6. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/helpers.py +2 -2
  7. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/load.py +10 -8
  8. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/processing.py +1 -1
  9. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/sample.py +7 -3
  10. masster-0.5.1/src/masster/study/defaults/fill_chrom_def.py → masster-0.5.4/src/masster/study/defaults/align_def.py +97 -63
  11. {masster-0.5.1 → masster-0.5.4}/src/masster/study/defaults/fill_def.py +9 -1
  12. {masster-0.5.1 → masster-0.5.4}/src/masster/study/defaults/merge_def.py +20 -69
  13. {masster-0.5.1 → masster-0.5.4}/src/masster/study/export.py +25 -5
  14. {masster-0.5.1 → masster-0.5.4}/src/masster/study/h5.py +230 -42
  15. {masster-0.5.1 → masster-0.5.4}/src/masster/study/helpers.py +430 -53
  16. {masster-0.5.1 → masster-0.5.4}/src/masster/study/load.py +986 -158
  17. {masster-0.5.1 → masster-0.5.4}/src/masster/study/merge.py +683 -1076
  18. {masster-0.5.1 → masster-0.5.4}/src/masster/study/plot.py +95 -73
  19. {masster-0.5.1 → masster-0.5.4}/src/masster/study/processing.py +337 -280
  20. {masster-0.5.1 → masster-0.5.4}/src/masster/study/study.py +58 -135
  21. {masster-0.5.1 → masster-0.5.4}/src/masster/wizard/wizard.py +20 -6
  22. {masster-0.5.1 → masster-0.5.4}/uv.lock +1 -1
  23. masster-0.5.1/src/masster/study/defaults/align_def.py +0 -498
  24. {masster-0.5.1 → masster-0.5.4}/.github/workflows/publish.yml +0 -0
  25. {masster-0.5.1 → masster-0.5.4}/.github/workflows/security.yml +0 -0
  26. {masster-0.5.1 → masster-0.5.4}/.github/workflows/test.yml +0 -0
  27. {masster-0.5.1 → masster-0.5.4}/.gitignore +0 -0
  28. {masster-0.5.1 → masster-0.5.4}/.pre-commit-config.yaml +0 -0
  29. {masster-0.5.1 → masster-0.5.4}/LICENSE +0 -0
  30. {masster-0.5.1 → masster-0.5.4}/Makefile +0 -0
  31. {masster-0.5.1 → masster-0.5.4}/README.md +0 -0
  32. {masster-0.5.1 → masster-0.5.4}/TESTING.md +0 -0
  33. {masster-0.5.1 → masster-0.5.4}/demo/example_batch_process.py +0 -0
  34. {masster-0.5.1 → masster-0.5.4}/demo/example_sample_process.py +0 -0
  35. {masster-0.5.1 → masster-0.5.4}/src/masster/__init__.py +0 -0
  36. {masster-0.5.1 → masster-0.5.4}/src/masster/chromatogram.py +0 -0
  37. {masster-0.5.1 → masster-0.5.4}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
  38. {masster-0.5.1 → masster-0.5.4}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
  39. {masster-0.5.1 → masster-0.5.4}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
  40. {masster-0.5.1 → masster-0.5.4}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
  41. {masster-0.5.1 → masster-0.5.4}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
  42. {masster-0.5.1 → masster-0.5.4}/src/masster/data/libs/aa.csv +0 -0
  43. {masster-0.5.1 → masster-0.5.4}/src/masster/data/libs/ccm.csv +0 -0
  44. {masster-0.5.1 → masster-0.5.4}/src/masster/data/libs/urine.csv +0 -0
  45. {masster-0.5.1 → masster-0.5.4}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  46. {masster-0.5.1 → masster-0.5.4}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  47. {masster-0.5.1 → masster-0.5.4}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  48. {masster-0.5.1 → masster-0.5.4}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  49. {masster-0.5.1 → masster-0.5.4}/src/masster/lib/__init__.py +0 -0
  50. {masster-0.5.1 → masster-0.5.4}/src/masster/lib/lib.py +0 -0
  51. {masster-0.5.1 → masster-0.5.4}/src/masster/logger.py +0 -0
  52. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/__init__.py +0 -0
  53. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/defaults/__init__.py +0 -0
  54. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/defaults/find_adducts_def.py +0 -0
  55. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/defaults/find_features_def.py +0 -0
  56. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/defaults/find_ms2_def.py +0 -0
  57. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
  58. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/defaults/sample_def.py +0 -0
  59. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/lib.py +0 -0
  60. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/parameters.py +0 -0
  61. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/plot.py +0 -0
  62. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/quant.py +0 -0
  63. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/sample5_schema.json +0 -0
  64. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/save.py +0 -0
  65. {masster-0.5.1 → masster-0.5.4}/src/masster/sample/sciex.py +0 -0
  66. {masster-0.5.1 → masster-0.5.4}/src/masster/spectrum.py +0 -0
  67. {masster-0.5.1 → masster-0.5.4}/src/masster/study/__init__.py +0 -0
  68. {masster-0.5.1 → masster-0.5.4}/src/masster/study/analysis.py +0 -0
  69. {masster-0.5.1 → masster-0.5.4}/src/masster/study/defaults/__init__.py +0 -0
  70. {masster-0.5.1 → masster-0.5.4}/src/masster/study/defaults/export_def.py +0 -0
  71. {masster-0.5.1 → masster-0.5.4}/src/masster/study/defaults/find_consensus_def.py +0 -0
  72. {masster-0.5.1 → masster-0.5.4}/src/masster/study/defaults/find_ms2_def.py +0 -0
  73. {masster-0.5.1 → masster-0.5.4}/src/masster/study/defaults/identify_def.py +0 -0
  74. {masster-0.5.1 → masster-0.5.4}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
  75. {masster-0.5.1 → masster-0.5.4}/src/masster/study/defaults/integrate_def.py +0 -0
  76. {masster-0.5.1 → masster-0.5.4}/src/masster/study/defaults/study_def.py +0 -0
  77. {masster-0.5.1 → masster-0.5.4}/src/masster/study/id.py +0 -0
  78. {masster-0.5.1 → masster-0.5.4}/src/masster/study/parameters.py +0 -0
  79. {masster-0.5.1 → masster-0.5.4}/src/masster/study/save.py +0 -0
  80. {masster-0.5.1 → masster-0.5.4}/src/masster/study/study5_schema.json +0 -0
  81. {masster-0.5.1 → masster-0.5.4}/src/masster/wizard/README.md +0 -0
  82. {masster-0.5.1 → masster-0.5.4}/src/masster/wizard/__init__.py +0 -0
  83. {masster-0.5.1 → masster-0.5.4}/src/masster/wizard/example.py +0 -0
  84. {masster-0.5.1 → masster-0.5.4}/tests/conftest.py +0 -0
  85. {masster-0.5.1 → masster-0.5.4}/tests/test_chromatogram.py +0 -0
  86. {masster-0.5.1 → masster-0.5.4}/tests/test_defaults.py +0 -0
  87. {masster-0.5.1 → masster-0.5.4}/tests/test_imports.py +0 -0
  88. {masster-0.5.1 → masster-0.5.4}/tests/test_integration.py +0 -0
  89. {masster-0.5.1 → masster-0.5.4}/tests/test_logger.py +0 -0
  90. {masster-0.5.1 → masster-0.5.4}/tests/test_parameters.py +0 -0
  91. {masster-0.5.1 → masster-0.5.4}/tests/test_sample.py +0 -0
  92. {masster-0.5.1 → masster-0.5.4}/tests/test_spectrum.py +0 -0
  93. {masster-0.5.1 → masster-0.5.4}/tests/test_study.py +0 -0
  94. {masster-0.5.1 → masster-0.5.4}/tests/test_version.py +0 -0
  95. {masster-0.5.1 → masster-0.5.4}/tox.ini +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: masster
- Version: 0.5.1
+ Version: 0.5.4
  Summary: Mass spectrometry data analysis package
  Project-URL: homepage, https://github.com/zamboni-lab/masster
  Project-URL: repository, https://github.com/zamboni-lab/masster
@@ -1,7 +1,7 @@
 
  [project]
  name = "masster"
- version = "0.5.1"
+ version = "0.5.4"
  description = "Mass spectrometry data analysis package"
  authors = [
  { name = "Zamboni Lab" }
@@ -1,7 +1,7 @@
  from __future__ import annotations
 
 
- __version__ = "0.5.1"
+ __version__ = "0.5.5"
 
 
  def get_version():
@@ -473,7 +473,7 @@ def find_adducts(self, **kwargs):
  self.logger.debug(f"Min probability threshold: {min_probability}")
 
  # Generate comprehensive adduct specifications using the Sample method
- adducts_df = self._get_adducts(
+ adducts_df = _get_adducts(self,
  adducts_list=adducts_list,
  charge_min=charge_min,
  charge_max=charge_max,
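
This hunk turns a bound-method call into a call to a module-level helper that receives the Sample instance explicitly; the load.py hunks further down apply the same pattern to the loaders. A minimal sketch of the pattern, using hypothetical names that are not part of masster:

    # Hypothetical names for illustration only; not the masster implementation.
    def _get_adducts(sample, adducts_list, charge_min, charge_max):
        # Module-level helper: the instance is passed explicitly instead of
        # being resolved as a bound method on `sample`.
        return [(a, z) for a in adducts_list for z in range(charge_min, charge_max + 1)]

    class Sample:
        def find_adducts(self, adducts_list, charge_min=1, charge_max=2):
            # before: adducts_df = self._get_adducts(...)
            # after:  adducts_df = _get_adducts(self, ...)
            return _get_adducts(self, adducts_list, charge_min, charge_max)
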
@@ -62,8 +62,8 @@ def _save_sample5(
  return
 
  # synchronize feature_map if it exists
- if hasattr(self, "_feature_map") and self._feature_map is not None:
- self._features_sync()
+ #if hasattr(self, "_feature_map") and self._feature_map is not None:
+ # self._features_sync()
 
  # if no extension is given, add .sample5
  if not filename.endswith(".sample5"):
@@ -1057,15 +1057,15 @@ def _load_sample5(self, filename: str, map: bool = False):
  # Parameters are now loaded from metadata JSON (see above)
  # Lib and lib_match are no longer saved/loaded
 
- if map:
- featureXML = filename.replace(".sample5", ".featureXML")
- if os.path.exists(featureXML):
- self._load_featureXML(featureXML)
- self._features_sync()
- else:
- self.logger.warning(
- f"Feature XML file {featureXML} not found, skipping loading.",
- )
+ #if map:
+ # featureXML = filename.replace(".sample5", ".featureXML")
+ # if os.path.exists(featureXML):
+ # self._load_featureXML(featureXML)
+ # #self._features_sync()
+ # else:
+ # self.logger.warning(
+ # f"Feature XML file {featureXML} not found, skipping loading.",
+ # )
 
  # set self.file_path to *.sample5
  self.file_path = filename
@@ -569,7 +569,7 @@ def select(
  self.logger.info(f"Selected features. Features remaining: {len(feats)}")
  return feats
 
-
+ '''
  def _features_sync(self):
  """
  Synchronizes the cached FeatureMap with features_df.
@@ -675,7 +675,7 @@ def _features_sync(self):
  self.logger.warning("PyOpenMS not available, cannot sync FeatureMap")
  except Exception as e:
  self.logger.error(f"Error during feature synchronization: {e}")
-
+ '''
 
  def features_delete(self, features: list | None = None):
  """
@@ -46,6 +46,7 @@ import polars as pl
  from tqdm import tqdm
 
  from masster.chromatogram import Chromatogram
+ from .h5 import _load_sample5
  from masster.spectrum import Spectrum
 
  # Suppress pyOpenMS warnings globally
@@ -96,13 +97,13 @@ def load(
 
  # check if file is mzML
  if filename.lower().endswith(".mzml"):
- self._load_mzML(filename)
+ _load_mzML(self, filename)
  elif filename.lower().endswith(".wiff") or filename.lower().endswith(".wiff2"):
- self._load_wiff(filename)
+ _load_wiff(self, filename)
  elif filename.lower().endswith(".raw"):
- self._load_raw(filename)
+ _load_raw(self, filename)
  elif filename.lower().endswith(".sample5"):
- self._load_sample5(filename)
+ _load_sample5(self, filename)
  # elif filename.lower().endswith(".h5"):
  # self._load_h5(filename)
  else:
@@ -155,13 +156,14 @@ def load_noms1(
 
  # check if file is mzML
  if filename.lower().endswith(".mzml"):
- self._load_mzML(filename)
+ _load_mzML(self, filename)
  elif filename.lower().endswith(".wiff") or filename.lower().endswith(".wiff2"):
- self._load_wiff(filename)
+ _load_wiff(self, filename)
  elif filename.lower().endswith(".raw"):
- self._load_raw(filename)
+ _load_raw(self, filename)
  elif filename.lower().endswith(".sample5"):
- self._load_sample5_study(filename) # Use optimized version for study loading
+ from masster.sample.h5 import _load_sample5_study
+ _load_sample5_study(self, filename) # Use optimized version for study loading
  else:
  raise ValueError("File must be .mzML, .wiff, *.raw, or .sample5")
 
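
Both load() and load_noms1() keep the same extension-based dispatch but now call the loaders as module-level functions with the instance as the first argument. A self-contained sketch of that dispatch, with stub loaders standing in for masster's real ones:

    # Stub loaders for illustration; only the call pattern is taken from the diff.
    def _load_mzML(sample, filename): sample.source = ("mzML", filename)
    def _load_wiff(sample, filename): sample.source = ("wiff", filename)
    def _load_raw(sample, filename): sample.source = ("raw", filename)
    def _load_sample5(sample, filename): sample.source = ("sample5", filename)

    def load(sample, filename: str) -> None:
        # Dispatch on the lower-cased file extension, mirroring the hunks above.
        name = filename.lower()
        if name.endswith(".mzml"):
            _load_mzML(sample, filename)
        elif name.endswith((".wiff", ".wiff2")):
            _load_wiff(sample, filename)
        elif name.endswith(".raw"):
            _load_raw(sample, filename)
        elif name.endswith(".sample5"):
            _load_sample5(sample, filename)
        else:
            raise ValueError("File must be .mzML, .wiff, *.raw, or .sample5")
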
@@ -795,7 +795,7 @@ def find_features(self, **kwargs):
  )
 
  self.features_df = df
- self._features_sync()
+ #self._features_sync()
  self.logger.info(f"Feature detection completed. Total features: {len(df)}")
 
  # store params
@@ -48,9 +48,9 @@ from masster.sample.defaults.find_ms2_def import find_ms2_defaults
  from masster.sample.defaults.get_spectrum_def import get_spectrum_defaults
 
  # Sample-specific imports - keeping these private, only for internal use
- # from masster.sample.h5 import _load_sample5
+ from masster.sample.h5 import _load_sample5
  # from masster.sample.h5 import _load_sample5_study
- # from masster.sample.h5 import _save_sample5
+ from masster.sample.h5 import _save_sample5
  # from masster.sample.helpers import _delete_ms2
  from masster.sample.helpers import _estimate_memory_usage
  from masster.sample.helpers import _get_scan_uids
@@ -263,12 +263,16 @@ class Sample:
  _get_feature_map = _get_feature_map
 
  # Additional method assignments for all imported functions
- # Removed internal-only methods: _load_sample5, _load_sample5_study, _save_sample5, _delete_ms2, _features_sync
+ # Removed internal-only methods: _load_sample5_study, _delete_ms2, _features_sync
  _estimate_memory_usage = _estimate_memory_usage
  _get_scan_uids = _get_scan_uids
  _get_feature_uids = _get_feature_uids
  features_delete = features_delete
  features_filter = features_filter
+ _save_sample5 = _save_sample5
+ _load_sample5 = _load_sample5
+
+
  # Removed internal-only load methods: _load_featureXML, _load_ms2data, _load_mzML, _load_raw, _load_wiff
  chrom_extract = chrom_extract
  _index_file = _index_file # Renamed from index_file to be internal-only
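
The new `_save_sample5 = _save_sample5` and `_load_sample5 = _load_sample5` lines re-expose the module-level h5 helpers as Sample methods. A minimal sketch of why that works, with a hypothetical function body:

    # Assigning a module-level function as a class attribute makes it an ordinary
    # method: the instance is bound as the first argument at call time.
    def _save_sample5(sample, filename):
        print(f"saving to {filename}")  # placeholder body for illustration

    class Sample:
        _save_sample5 = _save_sample5

    Sample()._save_sample5("run.sample5")  # same as _save_sample5(instance, "run.sample5")
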
@@ -1,65 +1,104 @@
- """Parameter class for Study fill_chrom method."""
+ """Parameter class for Study align method."""
 
  from dataclasses import dataclass, field
- from typing import Optional, Any
+ from typing import Any
 
 
  @dataclass
- class fill_chrom_defaults:
+ class align_defaults:
  """
- Parameter class for Study fill_chrom method.
+ Parameter class for Study align method.
 
- This class encapsulates parameters for filling missing chromatograms
- by extracting them from raw data across samples.
+ This class encapsulates parameters for feature alignment across samples,
+ including retention time and m/z tolerances, warping parameters, and
+ alignment algorithm settings.
 
  Attributes:
- uids (Optional[list]): List of consensus UIDs to process. Default is None (all).
- mz_tol (float): m/z tolerance for chromatogram extraction (Da). Default is 0.010.
- rt_tol (float): RT tolerance for chromatogram extraction (seconds). Default is 10.0.
- min_samples_rel (float): Minimum relative samples threshold. Default is 0.05.
- min_samples_abs (int): Minimum absolute samples threshold. Default is 5.
+ algorithm (str): Alignment algorithm ('pc' for PoseClustering, 'kd' for KD). Default is 'pc'.
+ rt_tol (float): Maximum retention time difference for alignment. Default is 60.0.
+ mz_max_diff (float): Maximum m/z difference for alignment. Default is 0.02.
+ rt_pair_distance_frac (float): Fraction of RT difference for pair distance. Default is 0.2.
+ mz_pair_max_distance (float): Maximum m/z pair distance. Default is 0.01.
+ num_used_points (int): Number of points used for alignment. Default is 1000.
+ save_features (bool): Whether to save features after alignment. Default is False.
+ skip_blanks (bool): Whether to skip blank samples. Default is False.
+
+ KD algorithm specific parameters:
+ warp_mz_tol (float): m/z tolerance for the LOWESS fit. Default is 0.05.
  """
 
- uids: Optional[list] = None
- mz_tol: float = 0.010
- rt_tol: float = 10.0
- min_samples_rel: float = 0.05
- min_samples_abs: int = 5
+ rt_tol: float = 5.0
+ mz_max_diff: float = 0.01
+ rt_pair_distance_frac: float = 0.5
+ mz_pair_max_distance: float = 0.01
+ num_used_points: int = 1000
+ save_features: bool = False
+ skip_blanks: bool = False
+ algorithm: str = "kd"
+
+ # KD algorithm specific parameters
+ warp_mz_tol: float = 0.05
 
  _param_metadata: dict[str, dict[str, Any]] = field(
  default_factory=lambda: {
- "uids": {
- "dtype": "Optional[list]",
- "description": "List of consensus UIDs to process (None for all)",
- "default": None,
+ "rt_tol": {
+ "dtype": float,
+ "description": "Maximum retention time difference for alignment (seconds)",
+ "default": 5.0,
+ "min_value": 1.0,
+ "max_value": 30.0,
  },
- "mz_tol": {
+ "mz_max_diff": {
  "dtype": float,
- "description": "m/z tolerance for chromatogram extraction (Da)",
- "default": 0.010,
+ "description": "Maximum m/z difference for alignment (Da)",
+ "default": 0.01,
  "min_value": 0.001,
- "max_value": 0.1,
+ "max_value": 0.05,
  },
- "rt_tol": {
+ "rt_pair_distance_frac": {
  "dtype": float,
- "description": "RT tolerance for chromatogram extraction (seconds)",
- "default": 10.0,
- "min_value": 1.0,
- "max_value": 300.0,
+ "description": "Fraction of RT difference for pair distance calculation",
+ "default": 0.2,
+ "min_value": 0.1,
+ "max_value": 1.0,
  },
- "min_samples_rel": {
+ "mz_pair_max_distance": {
  "dtype": float,
- "description": "Minimum relative samples threshold (fraction)",
- "default": 0.05,
- "min_value": 0.01,
- "max_value": 1.0,
+ "description": "Maximum m/z pair distance (Da)",
+ "default": 0.01,
+ "min_value": 0.001,
+ "max_value": 0.2,
  },
- "min_samples_abs": {
+ "num_used_points": {
  "dtype": int,
- "description": "Minimum absolute samples threshold",
- "default": 5,
- "min_value": 1,
- "max_value": 100,
+ "description": "Number of points used for alignment",
+ "default": 1000,
+ "min_value": 10,
+ "max_value": 10000,
+ },
+ "save_features": {
+ "dtype": bool,
+ "description": "Whether to save features after alignment",
+ "default": False,
+ },
+ "skip_blanks": {
+ "dtype": bool,
+ "description": "Whether to skip blank samples during alignment",
+ "default": False,
+ },
+ "algorithm": {
+ "dtype": str,
+ "description": "Alignment algorithm to use",
+ "default": "pc",
+ "allowed_values": ["pc", "kd"],
+ },
+ # KD algorithm specific parameters
+ "warp_mz_tol": {
+ "dtype": float,
+ "description": "m/z tolerance for the LOWESS fit in KD algorithm (Da)",
+ "default": 0.05,
+ "min_value": 0.001,
+ "max_value": 1.0,
  },
  },
  repr=False,
@@ -111,14 +150,6 @@ class fill_chrom_defaults:
  metadata = self._param_metadata[param_name]
  expected_dtype = metadata["dtype"]
 
- # Handle optional types
- if isinstance(expected_dtype, str) and expected_dtype.startswith("Optional"):
- if value is None:
- return True
- # Extract the inner type for validation
- if "list" in expected_dtype:
- expected_dtype = list
-
  # Type checking
  if expected_dtype is int:
  if not isinstance(value, int):
@@ -132,8 +163,8 @@ class fill_chrom_defaults:
  value = float(value)
  except (ValueError, TypeError):
  return False
- elif expected_dtype is list:
- if not isinstance(value, list):
+ elif expected_dtype is bool:
+ if not isinstance(value, bool):
  return False
 
  # Range validation for numeric types
@@ -143,6 +174,11 @@ class fill_chrom_defaults:
  if "max_value" in metadata and value > metadata["max_value"]:
  return False
 
+ # Allowed values validation for string types
+ if expected_dtype is str and "allowed_values" in metadata:
+ if value not in metadata["allowed_values"]:
+ return False
+
  return True
 
  def set(self, param_name: str, value: Any, validate: bool = True) -> bool:
@@ -167,20 +203,18 @@ class fill_chrom_defaults:
  if param_name in self._param_metadata:
  expected_dtype = self._param_metadata[param_name]["dtype"]
 
- # Handle optional types
- if isinstance(expected_dtype, str) and expected_dtype.startswith("Optional") and value is not None:
- if "int" in expected_dtype and not isinstance(value, int):
- try:
- value = int(value)
- except (ValueError, TypeError):
- if validate:
- return False
- elif "float" in expected_dtype and not isinstance(value, float):
- try:
- value = float(value)
- except (ValueError, TypeError):
- if validate:
- return False
+ if expected_dtype is int and not isinstance(value, int):
+ try:
+ value = int(value)
+ except (ValueError, TypeError):
+ if validate:
+ return False
+ elif expected_dtype is float and not isinstance(value, float):
+ try:
+ value = float(value)
+ except (ValueError, TypeError):
+ if validate:
+ return False
 
  setattr(self, param_name, value)
  return True
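
Taken together, these hunks rename fill_chrom_defaults to align_defaults and rework validation around plain int/float/bool/str types plus an allowed_values check. A usage sketch, assuming the class is importable from masster.study.defaults.align_def and that set() applies the coercion shown above:

    from masster.study.defaults.align_def import align_defaults  # assumed import path

    params = align_defaults()
    print(params.algorithm, params.rt_tol, params.warp_mz_tol)  # kd 5.0 0.05

    params.set("rt_tol", "12")     # coerced to float by set(); the metadata bounds are 1.0-30.0
    params.set("algorithm", "kd")  # allowed_values in the metadata are ["pc", "kd"]
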
@@ -21,10 +21,11 @@ class fill_defaults:
  """
 
  uids: Optional[list] = None
- mz_tol: float = 0.010
+ mz_tol: float = 0.050
  rt_tol: float = 10.0
  min_samples_rel: float = 0.00
  min_samples_abs: int = 5
+ threads: int = 6
 
  _param_metadata: dict[str, dict[str, Any]] = field(
  default_factory=lambda: {
@@ -61,6 +62,13 @@ class fill_defaults:
  "min_value": 0,
  "max_value": 100,
  },
+ "threads": {
+ "dtype": int,
+ "description": "Number of parallel threads",
+ "default": 6,
+ "min_value": 1,
+ "max_value": 32,
+ },
  },
  repr=False,
  )
@@ -13,7 +13,7 @@ class merge_defaults:
  method selection, grouping tolerances, and algorithm-specific parameters.
 
  Attributes:
- method (str): Merge method to use ('kd', 'qt', 'kd-nowarp', 'chunked'). Default is "kd".
+ method (str): Merge method to use ('kd', 'qt', 'kd_chunked', 'qt_chunked'). Default is "kd".
  min_samples (int): Minimum number of samples for a consensus feature. Default is 50.
  rt_tol (float): RT tolerance for grouping (seconds). Default is 2.0.
  mz_tol (float): m/z tolerance for grouping (Da for all methods). Default is 0.01.
@@ -25,38 +25,31 @@ class merge_defaults:
  link_ms2 (bool): Whether to link MS2 spectra to consensus features. Default is True.
  """
 
- method: str = "qt"
+ method: str = "kd"
  min_samples: int = 2
  rt_tol: float = 5.0
- mz_tol: float = 0.01
+ mz_tol: float = 0.05
  chunk_size: int = 500
  nr_partitions: int = 1000
  min_rel_cc_size: float = 0.1
  max_pairwise_log_fc: float = -1.0
  max_nr_conflicts: int = 0
  link_ms2: bool = True
+ extract_ms1: bool = True
+
+ # Cross-chunk merging parameters
+ dechunking: str = "hierarchical"
 
  # Parallel processing parameters
  threads: Optional[int] = None
-
- # KD-Strict specific parameters
- optimize_rt_tol: bool = False
- rt_tol_range: tuple = (0.5, 4.0)
- rt_tol_steps: int = 7
- secondary_merge_rt_tol: float = 1.0
- secondary_merge_mz_tol: float = 0.005
- min_sample_overlap: float = 0.8
- max_rt_spread: float = 2.0 # Will default to 2x rt_tol
- min_coherence: float = 0.0
 
  _param_metadata: dict[str, dict[str, Any]] = field(
  default_factory=lambda: {
  "method": {
  "dtype": str,
  "description": "Merge method (algorithm) to use",
- "default": "quality",
- "allowed_values": ["sensitivity", "qt", "nowarp", "chunked", "quality",
- "kd", "kd-nowarp", "kd_nowarp", "kd-strict", "kd_strict",
+ "default": "kd",
+ "allowed_values": ["kd", "qt",
  "kd_chunked", "kd-chunked", "qt_chunked", "qt-chunked"],
  },
  "min_samples": {
@@ -118,7 +111,17 @@ class merge_defaults:
  "description": "Whether to link MS2 spectra to consensus features",
  "default": True,
  },
- # Parallel processing parameters
+ "extract_ms1": {
+ "dtype": bool,
+ "description": "Whether to extract MS1 chromatograms for consensus features",
+ "default": True,
+ },
+ "dechunking": {
+ "dtype": str,
+ "description": "Cross-chunk merging algorithm for chunked methods",
+ "default": "hierarchical",
+ "allowed_values": ["hierarchical", "kdtree"],
+ },
  "threads": {
  "dtype": [int, type(None)],
  "description": "Number of parallel threads/processes for chunked methods (None=original sequential)",
@@ -126,58 +129,6 @@ class merge_defaults:
  "min_value": 1,
  "max_value": 32,
  },
- # KD-Strict specific parameters
- "optimize_rt_tol": {
- "dtype": bool,
- "description": "Enable RT tolerance optimization for kd-strict method",
- "default": False,
- },
- "rt_tol_range": {
- "dtype": tuple,
- "description": "RT tolerance range for optimization (min, max) in seconds",
- "default": (0.8, 2.0),
- },
- "rt_tol_steps": {
- "dtype": int,
- "description": "Number of steps for RT tolerance optimization",
- "default": 5,
- "min_value": 3,
- "max_value": 20,
- },
- "secondary_merge_rt_tol": {
- "dtype": float,
- "description": "RT tolerance for secondary clustering in kd-strict (seconds)",
- "default": 0.5,
- "min_value": 0.1,
- "max_value": 5.0,
- },
- "secondary_merge_mz_tol": {
- "dtype": float,
- "description": "m/z tolerance for secondary clustering in kd-strict (Da)",
- "default": 0.005,
- "min_value": 0.001,
- "max_value": 0.1,
- },
- "min_sample_overlap": {
- "dtype": float,
- "description": "Minimum sample overlap ratio for merging features (0.0-1.0)",
- "default": 0.8,
- "min_value": 0.0,
- "max_value": 1.0,
- },
- "max_rt_spread": {
- "dtype": float,
- "description": "Maximum allowed RT spread in seconds (None = 3x rt_tol)",
- "default": None,
- "min_value": 0.1,
- },
- "min_coherence": {
- "dtype": float,
- "description": "Minimum chromatographic coherence score (0.0 = disabled)",
- "default": 0.0,
- "min_value": 0.0,
- "max_value": 1.0,
- },
  },
  repr=False,
  )
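
After these hunks, merge_defaults drops the kd-strict block entirely and gains extract_ms1 and dechunking. A short sketch of the resulting option surface, with an assumed import path and plain attribute assignment (the set() helper is not shown for this class in the diff):

    from masster.study.defaults.merge_def import merge_defaults  # assumed import path

    opts = merge_defaults()
    print(opts.method, opts.mz_tol, opts.dechunking)  # kd 0.05 hierarchical

    opts.method = "qt_chunked"     # still listed in the metadata allowed_values
    opts.dechunking = "kdtree"     # alternative to the default "hierarchical"
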
@@ -551,7 +551,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
  else:
  self.logger.info("No identification data available for mzTab export")
  except Exception as e:
- self.logger.warning(f"Could not retrieve identification data: {e}")
+ self.logger.debug(f"Could not retrieve identification data: {e}")
  id_data = None
  top_id_data = None
  full_id_data = None
@@ -1190,10 +1190,12 @@ def export_xlsx(self, filename: str | None = None) -> None:
  """
  Export the study data to an Excel workbook with multiple worksheets.
 
- The Excel file contains three worksheets:
- - consensus_df: Consensus features dataframe
- - matrix: Consensus matrix with samples as columns (get_consensus_matrix)
+ The Excel file contains five worksheets:
+ - samples: Samples dataframe
+ - consensus: Consensus features dataframe
  - identification: Identification results with library annotations (get_id)
+ - gaps: Gaps matrix showing filled vs non-filled features (get_gaps_matrix)
+ - matrix: Consensus matrix with samples as columns (get_consensus_matrix)
 
  Args:
  filename (str, optional): Path to the output Excel file. Defaults to "study.xlsx"
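
The next hunk fills a worksheets dict (samples, consensus, identification, gaps, matrix) from polars frames converted with to_pandas(). How that dict is written to the .xlsx file is not shown in this diff; a plausible sketch with pandas (requires an Excel engine such as openpyxl), using toy data, would be:

    import pandas as pd

    # Toy stand-ins for the five worksheets assembled by export_xlsx().
    worksheets = {
        "samples": pd.DataFrame({"sample": ["QC_dil2", "QC_dil3"]}),
        "consensus": pd.DataFrame({"consensus_uid": [1], "mz": [180.0634], "rt": [95.2]}),
        "identification": pd.DataFrame({"consensus_uid": [1], "name": ["glucose"]}),
        "gaps": pd.DataFrame({"consensus_uid": [1], "QC_dil2": [False]}),
        "matrix": pd.DataFrame({"consensus_uid": [1], "QC_dil2": [1.2e6]}),
    }

    with pd.ExcelWriter("study.xlsx") as writer:
        for name, frame in worksheets.items():
            frame.to_excel(writer, sheet_name=name, index=False)
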
@@ -1263,7 +1265,25 @@ def export_xlsx(self, filename: str | None = None) -> None:
  f"Error getting identification data: {e}. Skipping identification worksheet.",
  )
 
- # 4. Consensus matrix (last worksheet)
+ # 4. Gaps matrix (filled vs non-filled features)
+ try:
+ gaps_df = self.get_gaps_matrix()
+ if gaps_df is not None and not gaps_df.is_empty():
+ gaps_pandas = gaps_df.to_pandas()
+ worksheets["gaps"] = gaps_pandas
+ self.logger.debug(
+ f"Added gaps worksheet with {len(gaps_pandas)} rows",
+ )
+ else:
+ self.logger.warning(
+ "get_gaps_matrix() returned empty data, skipping gaps worksheet",
+ )
+ except Exception as e:
+ self.logger.debug(
+ f"Error getting gaps data: {e}. Skipping gaps worksheet.",
+ )
+
+ # 5. Consensus matrix (last worksheet)
  try:
  matrix_df = self.get_consensus_matrix()
  if matrix_df is not None and not matrix_df.is_empty():