masster 0.4.13__tar.gz → 0.4.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

Files changed (96)
  1. {masster-0.4.13 → masster-0.4.17}/PKG-INFO +3 -2
  2. {masster-0.4.13 → masster-0.4.17}/README.md +2 -1
  3. {masster-0.4.13 → masster-0.4.17}/pyproject.toml +1 -1
  4. {masster-0.4.13 → masster-0.4.17}/src/masster/__init__.py +2 -0
  5. {masster-0.4.13 → masster-0.4.17}/src/masster/_version.py +1 -1
  6. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/sample.py +41 -0
  7. {masster-0.4.13 → masster-0.4.17}/src/masster/study/__init__.py +1 -0
  8. {masster-0.4.13 → masster-0.4.17}/src/masster/study/defaults/find_consensus_def.py +1 -1
  9. {masster-0.4.13 → masster-0.4.17}/src/masster/study/defaults/merge_def.py +129 -22
  10. {masster-0.4.13 → masster-0.4.17}/src/masster/study/h5.py +65 -106
  11. {masster-0.4.13 → masster-0.4.17}/src/masster/study/id.py +1 -1
  12. {masster-0.4.13 → masster-0.4.17}/src/masster/study/load.py +11 -6
  13. masster-0.4.17/src/masster/study/merge.py +2145 -0
  14. {masster-0.4.13 → masster-0.4.17}/src/masster/study/plot.py +15 -1
  15. {masster-0.4.13 → masster-0.4.17}/src/masster/study/processing.py +0 -874
  16. {masster-0.4.13 → masster-0.4.17}/src/masster/study/save.py +1 -1
  17. {masster-0.4.13 → masster-0.4.17}/src/masster/study/study.py +79 -21
  18. masster-0.4.17/src/masster/wizard/README.md +373 -0
  19. masster-0.4.17/src/masster/wizard/__init__.py +11 -0
  20. masster-0.4.17/src/masster/wizard/example.py +223 -0
  21. masster-0.4.17/src/masster/wizard/test_structure.py +49 -0
  22. masster-0.4.17/src/masster/wizard/test_wizard.py +285 -0
  23. masster-0.4.17/src/masster/wizard/wizard.py +1175 -0
  24. masster-0.4.17/src/masster/wizard.py +1175 -0
  25. {masster-0.4.13 → masster-0.4.17}/uv.lock +1 -1
  26. {masster-0.4.13 → masster-0.4.17}/.github/workflows/publish.yml +0 -0
  27. {masster-0.4.13 → masster-0.4.17}/.github/workflows/security.yml +0 -0
  28. {masster-0.4.13 → masster-0.4.17}/.github/workflows/test.yml +0 -0
  29. {masster-0.4.13 → masster-0.4.17}/.gitignore +0 -0
  30. {masster-0.4.13 → masster-0.4.17}/.pre-commit-config.yaml +0 -0
  31. {masster-0.4.13 → masster-0.4.17}/LICENSE +0 -0
  32. {masster-0.4.13 → masster-0.4.17}/Makefile +0 -0
  33. {masster-0.4.13 → masster-0.4.17}/TESTING.md +0 -0
  34. {masster-0.4.13 → masster-0.4.17}/demo/example_batch_process.py +0 -0
  35. {masster-0.4.13 → masster-0.4.17}/demo/example_sample_process.py +0 -0
  36. {masster-0.4.13 → masster-0.4.17}/src/masster/chromatogram.py +0 -0
  37. {masster-0.4.13 → masster-0.4.17}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
  38. {masster-0.4.13 → masster-0.4.17}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
  39. {masster-0.4.13 → masster-0.4.17}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
  40. {masster-0.4.13 → masster-0.4.17}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
  41. {masster-0.4.13 → masster-0.4.17}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
  42. {masster-0.4.13 → masster-0.4.17}/src/masster/data/libs/ccm.csv +0 -0
  43. {masster-0.4.13 → masster-0.4.17}/src/masster/data/libs/urine.csv +0 -0
  44. {masster-0.4.13 → masster-0.4.17}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  45. {masster-0.4.13 → masster-0.4.17}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  46. {masster-0.4.13 → masster-0.4.17}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  47. {masster-0.4.13 → masster-0.4.17}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  48. {masster-0.4.13 → masster-0.4.17}/src/masster/lib/__init__.py +0 -0
  49. {masster-0.4.13 → masster-0.4.17}/src/masster/lib/lib.py +0 -0
  50. {masster-0.4.13 → masster-0.4.17}/src/masster/logger.py +0 -0
  51. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/__init__.py +0 -0
  52. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/adducts.py +0 -0
  53. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/defaults/__init__.py +0 -0
  54. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/defaults/find_adducts_def.py +0 -0
  55. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/defaults/find_features_def.py +0 -0
  56. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/defaults/find_ms2_def.py +0 -0
  57. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
  58. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/defaults/sample_def.py +0 -0
  59. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/h5.py +0 -0
  60. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/helpers.py +0 -0
  61. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/lib.py +0 -0
  62. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/load.py +0 -0
  63. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/parameters.py +0 -0
  64. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/plot.py +0 -0
  65. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/processing.py +0 -0
  66. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/quant.py +0 -0
  67. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/sample5_schema.json +0 -0
  68. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/save.py +0 -0
  69. {masster-0.4.13 → masster-0.4.17}/src/masster/sample/sciex.py +0 -0
  70. {masster-0.4.13 → masster-0.4.17}/src/masster/spectrum.py +0 -0
  71. {masster-0.4.13 → masster-0.4.17}/src/masster/study/defaults/__init__.py +0 -0
  72. {masster-0.4.13 → masster-0.4.17}/src/masster/study/defaults/align_def.py +0 -0
  73. {masster-0.4.13 → masster-0.4.17}/src/masster/study/defaults/export_def.py +0 -0
  74. {masster-0.4.13 → masster-0.4.17}/src/masster/study/defaults/fill_chrom_def.py +0 -0
  75. {masster-0.4.13 → masster-0.4.17}/src/masster/study/defaults/fill_def.py +0 -0
  76. {masster-0.4.13 → masster-0.4.17}/src/masster/study/defaults/find_ms2_def.py +0 -0
  77. {masster-0.4.13 → masster-0.4.17}/src/masster/study/defaults/identify_def.py +0 -0
  78. {masster-0.4.13 → masster-0.4.17}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
  79. {masster-0.4.13 → masster-0.4.17}/src/masster/study/defaults/integrate_def.py +0 -0
  80. {masster-0.4.13 → masster-0.4.17}/src/masster/study/defaults/study_def.py +0 -0
  81. {masster-0.4.13 → masster-0.4.17}/src/masster/study/export.py +0 -0
  82. {masster-0.4.13 → masster-0.4.17}/src/masster/study/helpers.py +0 -0
  83. {masster-0.4.13 → masster-0.4.17}/src/masster/study/parameters.py +0 -0
  84. {masster-0.4.13 → masster-0.4.17}/src/masster/study/study5_schema.json +0 -0
  85. {masster-0.4.13 → masster-0.4.17}/tests/conftest.py +0 -0
  86. {masster-0.4.13 → masster-0.4.17}/tests/test_chromatogram.py +0 -0
  87. {masster-0.4.13 → masster-0.4.17}/tests/test_defaults.py +0 -0
  88. {masster-0.4.13 → masster-0.4.17}/tests/test_imports.py +0 -0
  89. {masster-0.4.13 → masster-0.4.17}/tests/test_integration.py +0 -0
  90. {masster-0.4.13 → masster-0.4.17}/tests/test_logger.py +0 -0
  91. {masster-0.4.13 → masster-0.4.17}/tests/test_parameters.py +0 -0
  92. {masster-0.4.13 → masster-0.4.17}/tests/test_sample.py +0 -0
  93. {masster-0.4.13 → masster-0.4.17}/tests/test_spectrum.py +0 -0
  94. {masster-0.4.13 → masster-0.4.17}/tests/test_study.py +0 -0
  95. {masster-0.4.13 → masster-0.4.17}/tests/test_version.py +0 -0
  96. {masster-0.4.13 → masster-0.4.17}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: masster
3
- Version: 0.4.13
3
+ Version: 0.4.17
4
4
  Summary: Mass spectrometry data analysis package
5
5
  Project-URL: homepage, https://github.com/zamboni-lab/masster
6
6
  Project-URL: repository, https://github.com/zamboni-lab/masster
@@ -767,7 +767,8 @@ study.integrate()
767
767
  # export results
768
768
  study.export_mgf()
769
769
  study.export_mztab()
770
- study.export_consensus()
770
+ study.export_xlsx()
771
+ study.export_parquet()
771
772
 
772
773
  # Save the study to .study5
773
774
  study.save()
@@ -41,7 +41,8 @@ study.integrate()
41
41
  # export results
42
42
  study.export_mgf()
43
43
  study.export_mztab()
44
- study.export_consensus()
44
+ study.export_xlsx()
45
+ study.export_parquet()
45
46
 
46
47
  # Save the study to .study5
47
48
  study.save()
@@ -1,7 +1,7 @@
1
1
 
2
2
  [project]
3
3
  name = "masster"
4
- version = "0.4.13"
4
+ version = "0.4.17"
5
5
  description = "Mass spectrometry data analysis package"
6
6
  authors = [
7
7
  { name = "Zamboni Lab" }
@@ -16,6 +16,7 @@ from masster.lib import Lib
16
16
  from masster.sample.sample import Sample
17
17
  from masster.spectrum import Spectrum
18
18
  from masster.study.study import Study
19
+ from masster.wizard import Wizard, wizard_def
19
20
 
20
21
 
21
22
  __all__ = [
@@ -24,6 +25,7 @@ __all__ = [
24
25
  "Sample",
25
26
  "Spectrum",
26
27
  "Study",
28
+ "Wizard",
27
29
  "__version__",
28
30
  # "get_version",
29
31
  ]
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
 
4
- __version__ = "0.4.13"
4
+ __version__ = "0.4.17"
5
5
 
6
6
 
7
7
  def get_version():
@@ -299,6 +299,47 @@ class Sample:
299
299
  find_ms2_defaults = find_ms2_defaults
300
300
  get_spectrum_defaults = get_spectrum_defaults
301
301
 
302
+ def __dir__(self):
303
+ """
304
+ Custom __dir__ implementation to hide internal methods starting with '_'
305
+ and backward compatibility aliases from tab completion and dir() calls,
306
+ while keeping them accessible to class methods.
307
+
308
+ Returns:
309
+ list: List of public attribute and method names (excluding internal and deprecated methods)
310
+ """
311
+ # Define backward compatibility aliases to hide
312
+ backward_compatibility_aliases = {
313
+ 'load_study', # deprecated alias for load_noms1
314
+ 'filter_features', # alias for filter (deprecated naming)
315
+ 'select_features', # alias for select (deprecated naming)
316
+ 'features_filter', # confusing duplicate of filter
317
+ 'features_select', # confusing duplicate of select
318
+ 'merge_defaults', # alias for find_features_defaults (confusing)
319
+ }
320
+
321
+ # Get all attributes from the class
322
+ all_attrs = set()
323
+
324
+ # Add attributes from the class and all its bases
325
+ for cls in self.__class__.__mro__:
326
+ all_attrs.update(cls.__dict__.keys())
327
+
328
+ # Add instance attributes
329
+ all_attrs.update(self.__dict__.keys())
330
+
331
+ # Filter out attributes starting with '_' (but keep special methods like __init__, __str__, etc.)
332
+ # Also filter out backward compatibility aliases
333
+ public_attrs = [
334
+ attr for attr in all_attrs
335
+ if not attr.startswith('_') or attr.startswith('__') and attr.endswith('__')
336
+ ]
337
+
338
+ # Remove backward compatibility aliases from the public attributes
339
+ public_attrs = [attr for attr in public_attrs if attr not in backward_compatibility_aliases]
340
+
341
+ return sorted(public_attrs)
342
+
302
343
  def logger_update(
303
344
  self,
304
345
  level: str | None = None,
@@ -5,5 +5,6 @@ This module provides the Sample class for handling mass spectrometry data.
5
5
  """
6
6
 
7
7
  from .study import Study
8
+ from . import merge as _ # Import unified merge system # noqa: F401
8
9
 
9
10
  __all__ = ["Study"]
@@ -32,7 +32,7 @@ class find_consensus_defaults:
32
32
  "dtype": str,
33
33
  "description": "Feature grouping algorithm",
34
34
  "default": "qt",
35
- "allowed_values": ["qt", "kd", "unlabeled", "sequential"],
35
+ "allowed_values": ["qt", "kd", "unlabeled", "kd-nowarp"],
36
36
  },
37
37
  "min_samples": {
38
38
  "dtype": int,
@@ -9,55 +9,162 @@ class merge_defaults:
9
9
  """
10
10
  Parameter class for Study merge method.
11
11
 
12
- This class encapsulates parameters for consensus feature detection across samples,
13
- including algorithm selection, grouping tolerances, and minimum sample requirements.
12
+ This class encapsulates parameters for all merge algorithms including
13
+ method selection, grouping tolerances, and algorithm-specific parameters.
14
14
 
15
15
  Attributes:
16
- algorithm (str): Feature grouping algorithm. Default is "qt".
17
- min_samples (int): Minimum number of samples for a consensus feature. Default is 1.
16
+ method (str): Merge method to use ('kd', 'qt', 'kd-nowarp', 'chunked'). Default is "kd".
17
+ min_samples (int): Minimum number of samples for a consensus feature. Default is 50.
18
+ rt_tol (float): RT tolerance for grouping (seconds). Default is 2.0.
19
+ mz_tol (float): m/z tolerance for grouping (Da for all methods). Default is 0.01.
20
+ chunk_size (int): Chunk size for 'chunked' method. Default is 500.
21
+ nr_partitions (int): Number of partitions in m/z dimension for KD algorithms. Default is 500.
22
+ min_rel_cc_size (float): Minimum relative connected component size for conflict resolution. Default is 0.3.
23
+ max_pairwise_log_fc (float): Maximum pairwise log fold change for conflict resolution. Default is 0.5.
24
+ max_nr_conflicts (int): Maximum number of conflicts allowed in consensus feature. Default is 0.
18
25
  link_ms2 (bool): Whether to link MS2 spectra to consensus features. Default is True.
19
- mz_tol (float): m/z tolerance for grouping (Da). Default is 0.01.
20
- rt_tol (float): RT tolerance for grouping (seconds). Default is 1.0.
21
26
  """
22
27
 
23
- algorithm: str = "qt"
24
- min_samples: int = 1
25
- link_ms2: bool = True
28
+ method: str = "quality"
29
+ min_samples: int = 10
30
+ rt_tol: float = 5.0
26
31
  mz_tol: float = 0.01
27
- rt_tol: float = 1.0
32
+ chunk_size: int = 300
33
+ nr_partitions: int = 1000
34
+ min_rel_cc_size: float = 0.2
35
+ max_pairwise_log_fc: float = -1.0
36
+ max_nr_conflicts: int = 0
37
+ link_ms2: bool = True
38
+
39
+ # KD-Strict specific parameters
40
+ optimize_rt_tol: bool = False
41
+ rt_tol_range: tuple = (0.8, 2.0)
42
+ rt_tol_steps: int = 5
43
+ secondary_merge_rt_tol: float = 0.5
44
+ secondary_merge_mz_tol: float = 0.005
45
+ min_sample_overlap: float = 0.8
46
+ max_rt_spread: float = None # Will default to 2x rt_tol
47
+ min_coherence: float = 0.0
28
48
 
29
49
  _param_metadata: dict[str, dict[str, Any]] = field(
30
50
  default_factory=lambda: {
31
- "algorithm": {
51
+ "method": {
32
52
  "dtype": str,
33
- "description": "Feature grouping algorithm",
34
- "default": "qt",
35
- "allowed_values": ["qt", "kd", "unlabeled", "sequential"],
53
+ "description": "Merge method (algorithm) to use",
54
+ "default": "quality",
55
+ "allowed_values": ["sensitivity", "qt", "nowarp", "chunked", "quality",
56
+ "kd", "kd-nowarp", "kd_nowarp", "kd-strict", "kd_strict"],
36
57
  },
37
58
  "min_samples": {
38
59
  "dtype": int,
39
60
  "description": "Minimum number of samples for a consensus feature",
40
- "default": 1,
61
+ "default": 50,
41
62
  "min_value": 1,
42
63
  },
64
+ "rt_tol": {
65
+ "dtype": float,
66
+ "description": "RT tolerance for grouping (seconds)",
67
+ "default": 2.0,
68
+ "min_value": 0.1,
69
+ "max_value": 60.0,
70
+ },
71
+ "mz_tol": {
72
+ "dtype": float,
73
+ "description": "m/z tolerance for grouping (Da for all methods)",
74
+ "default": 0.01,
75
+ "min_value": 0.001,
76
+ "max_value": 1.0,
77
+ },
78
+ "chunk_size": {
79
+ "dtype": int,
80
+ "description": "Chunk size for 'chunked' method",
81
+ "default": 500,
82
+ "min_value": 10,
83
+ },
84
+ "nr_partitions": {
85
+ "dtype": int,
86
+ "description": "Number of partitions in m/z dimension for KD algorithms",
87
+ "default": 500,
88
+ "min_value": 10,
89
+ "max_value": 10000,
90
+ },
91
+ "min_rel_cc_size": {
92
+ "dtype": float,
93
+ "description": "Minimum relative connected component size for conflict resolution",
94
+ "default": 0.3,
95
+ "min_value": 0.0,
96
+ "max_value": 1.0,
97
+ },
98
+ "max_pairwise_log_fc": {
99
+ "dtype": float,
100
+ "description": "Maximum pairwise log fold change for conflict resolution",
101
+ "default": 0.5,
102
+ "min_value": 0.0,
103
+ "max_value": 10.0,
104
+ },
105
+ "max_nr_conflicts": {
106
+ "dtype": int,
107
+ "description": "Maximum number of conflicts allowed in consensus feature",
108
+ "default": 0,
109
+ "min_value": 0,
110
+ "max_value": 1000,
111
+ },
43
112
  "link_ms2": {
44
113
  "dtype": bool,
45
114
  "description": "Whether to link MS2 spectra to consensus features",
46
115
  "default": True,
47
116
  },
48
- "mz_tol": {
117
+ # KD-Strict specific parameters
118
+ "optimize_rt_tol": {
119
+ "dtype": bool,
120
+ "description": "Enable RT tolerance optimization for kd-strict method",
121
+ "default": False,
122
+ },
123
+ "rt_tol_range": {
124
+ "dtype": tuple,
125
+ "description": "RT tolerance range for optimization (min, max) in seconds",
126
+ "default": (0.8, 2.0),
127
+ },
128
+ "rt_tol_steps": {
129
+ "dtype": int,
130
+ "description": "Number of steps for RT tolerance optimization",
131
+ "default": 5,
132
+ "min_value": 3,
133
+ "max_value": 20,
134
+ },
135
+ "secondary_merge_rt_tol": {
49
136
  "dtype": float,
50
- "description": "m/z tolerance for grouping (Da)",
51
- "default": 0.01,
137
+ "description": "RT tolerance for secondary clustering in kd-strict (seconds)",
138
+ "default": 0.5,
139
+ "min_value": 0.1,
140
+ "max_value": 5.0,
141
+ },
142
+ "secondary_merge_mz_tol": {
143
+ "dtype": float,
144
+ "description": "m/z tolerance for secondary clustering in kd-strict (Da)",
145
+ "default": 0.005,
52
146
  "min_value": 0.001,
147
+ "max_value": 0.1,
148
+ },
149
+ "min_sample_overlap": {
150
+ "dtype": float,
151
+ "description": "Minimum sample overlap ratio for merging features (0.0-1.0)",
152
+ "default": 0.8,
153
+ "min_value": 0.0,
53
154
  "max_value": 1.0,
54
155
  },
55
- "rt_tol": {
156
+ "max_rt_spread": {
56
157
  "dtype": float,
57
- "description": "RT tolerance for grouping (seconds)",
58
- "default": 1.0,
158
+ "description": "Maximum allowed RT spread in seconds (None = 3x rt_tol)",
159
+ "default": None,
59
160
  "min_value": 0.1,
60
- "max_value": 60.0,
161
+ },
162
+ "min_coherence": {
163
+ "dtype": float,
164
+ "description": "Minimum chromatographic coherence score (0.0 = disabled)",
165
+ "default": 0.0,
166
+ "min_value": 0.0,
167
+ "max_value": 1.0,
61
168
  },
62
169
  },
63
170
  repr=False,
@@ -56,6 +56,45 @@ def _decode_bytes_attr(attr_value):
56
56
  return str(attr_value) if attr_value is not None else ""
57
57
 
58
58
 
59
+ def _create_empty_dataframe_from_schema(df_name: str, schema: dict) -> pl.DataFrame:
60
+ """Create an empty DataFrame with the correct schema based on study5_schema.json."""
61
+ if df_name not in schema:
62
+ # Fallback to basic empty DataFrame if schema not found
63
+ return pl.DataFrame()
64
+
65
+ df_schema = schema[df_name]["columns"]
66
+ empty_data = {}
67
+ polars_schema = {}
68
+
69
+ for col_name, col_info in df_schema.items():
70
+ dtype_str = col_info["dtype"]
71
+ # Convert string representation to actual Polars dtype
72
+ if dtype_str == "pl.Int64":
73
+ polars_dtype = pl.Int64
74
+ elif dtype_str == "pl.Int32":
75
+ polars_dtype = pl.Int32
76
+ elif dtype_str == "pl.Float64":
77
+ polars_dtype = pl.Float64
78
+ elif dtype_str == "pl.Utf8":
79
+ polars_dtype = pl.Utf8
80
+ elif dtype_str == "pl.String":
81
+ polars_dtype = pl.String
82
+ elif dtype_str == "pl.Boolean":
83
+ polars_dtype = pl.Boolean
84
+ elif dtype_str == "pl.Object":
85
+ polars_dtype = pl.Object
86
+ elif dtype_str == "pl.Null":
87
+ polars_dtype = pl.Null
88
+ else:
89
+ # Fallback to string if unknown type
90
+ polars_dtype = pl.String
91
+
92
+ empty_data[col_name] = []
93
+ polars_schema[col_name] = polars_dtype
94
+
95
+ return pl.DataFrame(empty_data, schema=polars_schema)
96
+
97
+
59
98
  def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=10000):
60
99
  """
61
100
  Save an entire DataFrame to HDF5 with optimized batch processing and memory efficiency.
@@ -1080,7 +1119,7 @@ def _save_study5_compressed(self, filename):
1080
1119
  if not filename.endswith(".study5"):
1081
1120
  filename += ".study5"
1082
1121
 
1083
- self.logger.debug(f"Compressed saving study to {filename}")
1122
+ self.logger.debug(f"Save study")
1084
1123
 
1085
1124
  # delete existing file if it exists
1086
1125
  if os.path.exists(filename):
@@ -1132,7 +1171,7 @@ def _save_study5_compressed(self, filename):
1132
1171
 
1133
1172
  with tqdm(
1134
1173
  total=total_steps,
1135
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Fast saving study",
1174
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving study ({sum(count for _, count in dataframes_to_save)} total rows)",
1136
1175
  disable=tdqm_disable,
1137
1176
  ) as pbar:
1138
1177
  # Create groups for organization
@@ -1186,8 +1225,11 @@ def _save_study5_compressed(self, filename):
1186
1225
  )
1187
1226
  pbar.update(1)
1188
1227
 
1189
- # Store features_df - use fast method that skips chrom and ms2_specs columns
1228
+ # Store features_df - use fast method that skips chrom and ms2_specs columns
1190
1229
  if self.features_df is not None and not self.features_df.is_empty():
1230
+ pbar.set_description(
1231
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving features ({len(self.features_df)} rows, compressed)"
1232
+ )
1191
1233
  self.logger.debug(
1192
1234
  f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)",
1193
1235
  )
@@ -1411,7 +1453,7 @@ def _save_study5(self, filename):
1411
1453
  if not filename.endswith(".study5"):
1412
1454
  filename += ".study5"
1413
1455
 
1414
- self.logger.info(f"Saving study to {filename}")
1456
+ self.logger.info("Save study...")
1415
1457
 
1416
1458
  # delete existing file if it exists
1417
1459
  if os.path.exists(filename):
@@ -1463,7 +1505,7 @@ def _save_study5(self, filename):
1463
1505
 
1464
1506
  with tqdm(
1465
1507
  total=total_steps,
1466
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving study",
1508
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving study ({sum(count for _, count in dataframes_to_save)} total rows)",
1467
1509
  disable=tdqm_disable,
1468
1510
  ) as pbar:
1469
1511
  # Create groups for organization
@@ -1498,12 +1540,12 @@ def _save_study5(self, filename):
1498
1540
  metadata_group.create_dataset("parameters", data="")
1499
1541
 
1500
1542
  pbar.update(1)
1501
- pbar.set_description(
1502
- f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes",
1503
- )
1504
1543
 
1505
1544
  # Store samples_df - use optimized batch processing
1506
1545
  if self.samples_df is not None and not self.samples_df.is_empty():
1546
+ pbar.set_description(
1547
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving samples ({len(self.samples_df)} rows)"
1548
+ )
1507
1549
  samples_group = f.create_group("samples")
1508
1550
  self.logger.debug(
1509
1551
  f"Saving samples_df with {len(self.samples_df)} rows using optimized method",
@@ -1519,6 +1561,9 @@ def _save_study5(self, filename):
1519
1561
 
1520
1562
  # Store features_df - use optimized batch processing
1521
1563
  if self.features_df is not None and not self.features_df.is_empty():
1564
+ pbar.set_description(
1565
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving features ({len(self.features_df)} rows)"
1566
+ )
1522
1567
  self.logger.debug(
1523
1568
  f"Saving features_df with {len(self.features_df)} rows using optimized method",
1524
1569
  )
@@ -1533,6 +1578,9 @@ def _save_study5(self, filename):
1533
1578
 
1534
1579
  # Store consensus_df - use optimized batch processing
1535
1580
  if self.consensus_df is not None and not self.consensus_df.is_empty():
1581
+ pbar.set_description(
1582
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving consensus ({len(self.consensus_df)} rows)"
1583
+ )
1536
1584
  self.logger.debug(
1537
1585
  f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method",
1538
1586
  )
@@ -1690,8 +1738,9 @@ def _load_study5(self, filename=None):
1690
1738
  # Use progress bar to show loading progress
1691
1739
  with tqdm(
1692
1740
  total=len(loading_steps),
1693
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading study",
1741
+ desc="Loading study",
1694
1742
  disable=tdqm_disable,
1743
+ unit="step"
1695
1744
  ) as pbar:
1696
1745
  # Load metadata
1697
1746
  pbar.set_description(
@@ -1792,83 +1841,7 @@ def _load_study5(self, filename=None):
1792
1841
  self.logger.debug(
1793
1842
  "No samples data found in study5 file. Initializing empty samples_df.",
1794
1843
  )
1795
- self.samples_df = pl.DataFrame(
1796
- {
1797
- "sample_uid": [],
1798
- "sample_name": [],
1799
- "sample_path": [],
1800
- "sample_type": [],
1801
- "size": [],
1802
- "map_id": [],
1803
- "sample_source": [],
1804
- "num_ms1": [],
1805
- "num_ms2": [],
1806
- "sample_group": [],
1807
- "sample_batch": [],
1808
- "sample_sequence": [],
1809
- },
1810
- schema={
1811
- "sample_uid": pl.Int64,
1812
- "sample_name": pl.Utf8,
1813
- "sample_path": pl.Utf8,
1814
- "sample_type": pl.Utf8,
1815
- "size": pl.Int64,
1816
- "map_id": pl.Int64,
1817
- "sample_source": pl.Utf8,
1818
- "num_ms1": pl.Int64,
1819
- "num_ms2": pl.Int64,
1820
- "sample_group": pl.Utf8,
1821
- "sample_batch": pl.Int64,
1822
- "sample_sequence": pl.Int64,
1823
- },
1824
- )
1825
- pbar.update(1)
1826
- # Load samples_df
1827
- pbar.set_description(
1828
- f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
1829
- )
1830
- if "samples" in f and len(f["samples"].keys()) > 0:
1831
- self.samples_df = _load_dataframe_from_group(
1832
- f["samples"],
1833
- schema,
1834
- "samples_df",
1835
- self.logger,
1836
- )
1837
- else:
1838
- # Initialize empty samples_df with the correct schema if no data exists
1839
- self.logger.debug(
1840
- "No samples data found in study5 file. Initializing empty samples_df.",
1841
- )
1842
- self.samples_df = pl.DataFrame(
1843
- {
1844
- "sample_uid": [],
1845
- "sample_name": [],
1846
- "sample_path": [],
1847
- "sample_type": [],
1848
- "size": [],
1849
- "map_id": [],
1850
- "sample_source": [],
1851
- "num_ms1": [],
1852
- "num_ms2": [],
1853
- "sample_group": [],
1854
- "sample_batch": [],
1855
- "sample_sequence": [],
1856
- },
1857
- schema={
1858
- "sample_uid": pl.Int64,
1859
- "sample_name": pl.Utf8,
1860
- "sample_path": pl.Utf8,
1861
- "sample_type": pl.Utf8,
1862
- "size": pl.Int64,
1863
- "map_id": pl.Int64,
1864
- "sample_source": pl.Utf8,
1865
- "num_ms1": pl.Int64,
1866
- "num_ms2": pl.Int64,
1867
- "sample_group": pl.Utf8,
1868
- "sample_batch": pl.Int64,
1869
- "sample_sequence": pl.Int64,
1870
- },
1871
- )
1844
+ self.samples_df = _create_empty_dataframe_from_schema("samples_df", schema)
1872
1845
  pbar.update(1)
1873
1846
 
1874
1847
  # Load features_df
@@ -1885,7 +1858,7 @@ def _load_study5(self, filename=None):
1885
1858
  object_columns,
1886
1859
  )
1887
1860
  else:
1888
- self.features_df = None
1861
+ self.features_df = _create_empty_dataframe_from_schema("features_df", schema)
1889
1862
  pbar.update(1)
1890
1863
 
1891
1864
  # Load consensus_df
@@ -1942,7 +1915,7 @@ def _load_study5(self, filename=None):
1942
1915
  ],
1943
1916
  )
1944
1917
  else:
1945
- self.consensus_df = None
1918
+ self.consensus_df = _create_empty_dataframe_from_schema("consensus_df", schema)
1946
1919
  pbar.update(1)
1947
1920
 
1948
1921
  # Load consensus_mapping_df
@@ -1957,21 +1930,7 @@ def _load_study5(self, filename=None):
1957
1930
  self.logger,
1958
1931
  )
1959
1932
  else:
1960
- self.consensus_mapping_df = None
1961
- pbar.update(1)
1962
- # Load consensus_mapping_df
1963
- pbar.set_description(
1964
- f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping",
1965
- )
1966
- if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
1967
- self.consensus_mapping_df = _load_dataframe_from_group(
1968
- f["consensus_mapping"],
1969
- schema,
1970
- "consensus_mapping_df",
1971
- self.logger,
1972
- )
1973
- else:
1974
- self.consensus_mapping_df = None
1933
+ self.consensus_mapping_df = _create_empty_dataframe_from_schema("consensus_mapping_df", schema)
1975
1934
  pbar.update(1)
1976
1935
 
1977
1936
  # Load consensus_ms2
@@ -1988,7 +1947,7 @@ def _load_study5(self, filename=None):
1988
1947
  object_columns,
1989
1948
  )
1990
1949
  else:
1991
- self.consensus_ms2 = None
1950
+ self.consensus_ms2 = _create_empty_dataframe_from_schema("consensus_ms2", schema)
1992
1951
  pbar.update(1)
1993
1952
 
1994
1953
  # Load lib_df
@@ -2004,7 +1963,7 @@ def _load_study5(self, filename=None):
2004
1963
  [],
2005
1964
  )
2006
1965
  else:
2007
- self.lib_df = None
1966
+ self.lib_df = _create_empty_dataframe_from_schema("lib_df", schema)
2008
1967
  pbar.update(1)
2009
1968
 
2010
1969
  # Load id_df
@@ -2020,7 +1979,7 @@ def _load_study5(self, filename=None):
2020
1979
  [],
2021
1980
  )
2022
1981
  else:
2023
- self.id_df = None
1982
+ self.id_df = _create_empty_dataframe_from_schema("id_df", schema)
2024
1983
  pbar.update(1)
2025
1984
 
2026
1985
  # Check and migrate old string-based map_id to integer indices
@@ -1291,7 +1291,7 @@ def _get_adducts(study, adducts_list: list = None, **kwargs):
1291
1291
 
1292
1292
  logger = getattr(study, "logger", None)
1293
1293
  if logger:
1294
- logger.debug(
1294
+ logger.trace(
1295
1295
  f"Study adducts: generated {adducts_before_filter}, filtered to {adducts_after_filter} (min_prob={min_probability})",
1296
1296
  )
1297
1297
 
@@ -214,13 +214,18 @@ def load(self, filename=None):
214
214
 
215
215
  # self.logger.info(f"Loading study from {filename}")
216
216
  self._load_study5(filename)
217
- # After loading the study, check if consensus XML exists and load it
218
- consensus_xml_path = filename.replace(".study5", ".consensusXML")
219
- if os.path.exists(consensus_xml_path):
220
- self._load_consensusXML(filename=consensus_xml_path)
221
- # self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
217
+
218
+ # After loading the study, check if we have consensus features before loading consensus XML
219
+ if (self.consensus_df is not None and not self.consensus_df.is_empty()):
220
+ consensus_xml_path = filename.replace(".study5", ".consensusXML")
221
+ if os.path.exists(consensus_xml_path):
222
+ self._load_consensusXML(filename=consensus_xml_path)
223
+ # self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
224
+ else:
225
+ self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
222
226
  else:
223
- self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
227
+ self.logger.debug("No consensus features found, skipping consensusXML loading")
228
+
224
229
  self.filename = filename
225
230
 
226
231