masster 0.3.13.tar.gz → 0.3.15.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of masster might be problematic.
- {masster-0.3.13 → masster-0.3.15}/PKG-INFO +2 -1
- {masster-0.3.13 → masster-0.3.15}/pyproject.toml +4 -3
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/helpers.py +9 -2
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/load.py +11 -7
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/plot.py +43 -34
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/defaults/study_def.py +20 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/h5.py +120 -23
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/helpers.py +974 -13
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/load.py +28 -15
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/plot.py +270 -98
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/processing.py +9 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/study.py +32 -38
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/study5_schema.json +14 -5
- {masster-0.3.13 → masster-0.3.15}/uv.lock +16 -2
- masster-0.3.13/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +0 -199787
- masster-0.3.13/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
- masster-0.3.13/src/masster/docs/SCX_API_Documentation.md +0 -0
- masster-0.3.13/src/masster/docs/SCX_DLL_Analysis.md +0 -0
- {masster-0.3.13 → masster-0.3.15}/.github/workflows/publish.yml +0 -0
- {masster-0.3.13 → masster-0.3.15}/.github/workflows/security.yml +0 -0
- {masster-0.3.13 → masster-0.3.15}/.github/workflows/test.yml +0 -0
- {masster-0.3.13 → masster-0.3.15}/.gitignore +0 -0
- {masster-0.3.13 → masster-0.3.15}/.pre-commit-config.yaml +0 -0
- {masster-0.3.13 → masster-0.3.15}/LICENSE +0 -0
- {masster-0.3.13 → masster-0.3.15}/Makefile +0 -0
- {masster-0.3.13 → masster-0.3.15}/README.md +0 -0
- {masster-0.3.13 → masster-0.3.15}/TESTING.md +0 -0
- {masster-0.3.13 → masster-0.3.15}/demo/example_batch_process.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/demo/example_sample_process.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/__init__.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/_version.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/chromatogram.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/logger.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/__init__.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/defaults/__init__.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/defaults/find_adducts_def.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/defaults/find_features_def.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/defaults/find_ms2_def.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/defaults/sample_def.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/h5.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/lib.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/parameters.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/processing.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/quant.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/sample.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/sample5_schema.json +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/save.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/sample/sciex.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/spectrum.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/__init__.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/defaults/__init__.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/defaults/align_def.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/defaults/export_def.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/defaults/fill_chrom_def.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/defaults/fill_def.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/defaults/find_consensus_def.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/defaults/find_ms2_def.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/defaults/integrate_def.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/defaults/merge_def.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/export.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/helpers_optimized.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/parameters.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/src/masster/study/save.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/tests/conftest.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/tests/test_chromatogram.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/tests/test_defaults.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/tests/test_imports.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/tests/test_integration.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/tests/test_logger.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/tests/test_parameters.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/tests/test_sample.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/tests/test_spectrum.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/tests/test_study.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/tests/test_version.py +0 -0
- {masster-0.3.13 → masster-0.3.15}/tox.ini +0 -0
{masster-0.3.13 → masster-0.3.15}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.3.13
+Version: 0.3.15
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster

@@ -684,6 +684,7 @@ Requires-Dist: alphabase>=1.0.0
 Requires-Dist: alpharaw>=0.4.8
 Requires-Dist: altair>=5.5.0
 Requires-Dist: bokeh>=3.7.3
+Requires-Dist: cmap>=0.6.2
 Requires-Dist: datashader>=0.18.1
 Requires-Dist: h5py>=3.14.0
 Requires-Dist: holoviews>=1.21.0
{masster-0.3.13 → masster-0.3.15}/pyproject.toml

@@ -1,7 +1,7 @@
 
 [project]
 name = "masster"
-version = "0.3.13"
+version = "0.3.15"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }

@@ -37,7 +37,7 @@ dependencies = [
     "hvplot>=0.11.3",
     "loguru>=0.7.3",
     "numpy>=2.0.0",
-    # "marimo>=0.14.16",
+    # "marimo>=0.14.16",
     "matchms>=0.30.2",
     "matplotlib>=3.8.0",
     "pandas>=2.2.0",

@@ -50,7 +50,8 @@ dependencies = [
     "scipy>=1.12.0",
     "simple-parsing>=0.1.7",
     "tqdm>=4.65.0",
-    "openpyxl>=3.1.5"
+    "openpyxl>=3.1.5",
+    "cmap>=0.6.2",
 ]

 [project.optional-dependencies]
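The only functional dependency change is the addition of cmap, a lightweight colormap library now required by the plotting code. A minimal sketch of the upstream cmap API; how masster's plot modules actually consume it is not visible in this diff:

```python
# Sketch of the cmap package's documented API; the call pattern is an
# assumption, not taken from masster's source.
from cmap import Colormap

cm = Colormap("viridis")  # look up a named colormap
rgba = cm(0.5)            # sample it at a fraction of its range
print(rgba)               # an RGBA color value
```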
{masster-0.3.13 → masster-0.3.15}/src/masster/sample/helpers.py

@@ -281,7 +281,7 @@ def select_closest_scan(
     return scan


-def get_eic(self, mz, mz_tol=0.01):
+def get_eic(self, mz, mz_tol=None):
     """
     Extract an extracted ion chromatogram (EIC) from `ms1_df` for a target m/z ± mz_tol.

@@ -291,11 +291,18 @@ def get_eic(self, mz, mz_tol=0.01):

     Parameters:
         mz (float): target m/z value
-        mz_tol (float): tolerance around mz
+        mz_tol (float): tolerance around mz. If None, uses self.parameters.eic_mz_tol or defaults to 0.01

     Returns:
         polars.DataFrame or None: chromatogram with columns ['rt', 'inty'] or None if not available
     """
+    # Use default mz_tol from sample parameters if not provided
+    if mz_tol is None:
+        if hasattr(self, 'parameters') and hasattr(self.parameters, 'eic_mz_tol'):
+            mz_tol = self.parameters.eic_mz_tol
+        else:
+            mz_tol = 0.01  # fallback default
+
     # Validate ms1_df
     if not hasattr(self, "ms1_df") or self.ms1_df is None:
         if hasattr(self, "logger"):
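The get_eic change moves the hard-coded 0.01 Da tolerance into the sample parameters, so a study-wide setting can flow through. A usage sketch, assuming `sample` is a loaded masster Sample object that exposes this helper as a method:

```python
# Hypothetical usage; `sample` stands in for a loaded masster Sample.
eic = sample.get_eic(mz=445.12)                    # resolves to parameters.eic_mz_tol, else 0.01 Da
eic_wide = sample.get_eic(mz=445.12, mz_tol=0.05)  # an explicit tolerance still takes precedence
```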
{masster-0.3.13 → masster-0.3.15}/src/masster/sample/load.py

@@ -379,18 +379,23 @@ def _load_raw(
         mz=peaks.mz.values,
         inty=peaks.intensity.values,
         ms_level=s["ms_level"],
-        centroided=False,
     )
     # remove peaks with intensity <= 0

     bl = spect.baseline()
     spect = spect.denoise(threshold=bl)
+
     if spect.ms_level == 1:
-
-
-
-
-
+        # Use the same logic as mzML loading
+        mz = np.array(spect.mz)
+        median_diff = np.median(np.diff(np.sort(mz))) if mz.size > 1 else None
+
+        if median_diff is not None and median_diff < 0.01:
+            spect = spect.centroid(
+                tolerance=self.parameters.mz_tol_ms1_da,
+                ppm=self.parameters.mz_tol_ms1_ppm,
+                min_points=self.parameters.centroid_min_points_ms1,
+            )
     newscan = {
         "scan_uid": i,
         "cycle": cycle,

@@ -544,7 +549,6 @@ def _load_wiff(
         mz=peaks.mz.values,
         inty=peaks.intensity.values,
         ms_level=ms_level,
-        centroided=False,
     )
     bl = spect.baseline()
     spect = spect.denoise(threshold=bl)
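The new _load_raw branch auto-detects profile-mode MS1 spectra by the median spacing of the m/z axis: profile data samples the axis densely, so the median gap between adjacent values is small, while centroided peaks sit far apart. A standalone restatement of that heuristic (the 0.01 Da threshold and parameter names come from the diff; everything else is illustrative):

```python
import numpy as np

def looks_like_profile(mz: np.ndarray, spacing_threshold: float = 0.01) -> bool:
    """Same test as above: median gap between sorted m/z values below the threshold."""
    if mz.size < 2:
        return False
    return float(np.median(np.diff(np.sort(mz)))) < spacing_threshold

# A dense profile-mode axis versus sparse centroided peaks:
profile_mz = np.arange(100.0, 101.0, 0.001)       # ~0.001 Da spacing -> True
centroid_mz = np.array([100.05, 100.41, 100.87])  # ~0.4 Da spacing   -> False
print(looks_like_profile(profile_mz), looks_like_profile(centroid_mz))
```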
{masster-0.3.13 → masster-0.3.15}/src/masster/sample/plot.py

@@ -56,7 +56,6 @@ from bokeh.models import HoverTool
 from holoviews import dim
 from holoviews.plotting.util import process_cmap
 from matplotlib.colors import rgb2hex
-from masster.chromatogram import Chromatogram

 # Parameters removed - using hardcoded defaults

@@ -75,23 +74,36 @@ def _is_notebook_environment():
         # Check for Jupyter/JupyterLab
         from IPython import get_ipython

-
+        ipython = get_ipython()
+        if ipython is not None:
             # Check if we're in a notebook context
-            shell = …
+            shell = ipython.__class__.__name__
             if shell in ["ZMQInteractiveShell", "Shell"]:  # Jupyter notebook/lab
                 return True

-        # Check for Marimo
+        # Check for Marimo - multiple ways to detect it
         import sys

+        # Check if marimo is in modules
         if "marimo" in sys.modules:
             return True
-
-        # …
+
+        # Check for marimo in the call stack or environment
+        import inspect
+        frame = inspect.currentframe()
+        try:
+            while frame:
+                if frame.f_globals.get("__name__", "").startswith("marimo"):
+                    return True
+                frame = frame.f_back
+        finally:
+            del frame
+
+        # Additional check for notebook environments via builtins
         if hasattr(__builtins__, "__IPYTHON__") or hasattr(__builtins__, "_ih"):
             return True

-    except ImportError:
+    except (ImportError, AttributeError):
         pass

     return False
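Detection now runs in layers: an IPython shell-class check, a `sys.modules` probe for marimo, a walk up the call stack looking for marimo frames, and a builtins fallback. A condensed, self-contained sketch of the first two layers (the stack walk appears verbatim in the hunk above):

```python
import sys

def in_notebook() -> bool:
    # Layer 1: an active IPython kernel identifies Jupyter/JupyterLab.
    try:
        from IPython import get_ipython
        ip = get_ipython()
        if ip is not None and ip.__class__.__name__ in ("ZMQInteractiveShell", "Shell"):
            return True
    except ImportError:
        pass
    # Layer 2: a running marimo notebook has marimo loaded in sys.modules
    # (so does any script that merely imported marimo - same caveat as above).
    return "marimo" in sys.modules
```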
@@ -106,22 +118,17 @@ def _display_plot(plot_object, layout=None):
         layout: Optional panel layout object

     Returns:
-        The …
+        The plot object for inline display in notebooks, None for browser display
     """
     if _is_notebook_environment():
-        # …
-
-
-        # …
+        # In notebook environments, return the plot object for inline display
+        # For Jupyter notebooks, holoviews/panel objects display automatically when returned
+        if layout is not None:
+            # Return the layout object which will display inline in notebooks
+            return layout
+        else:
+            # Return the plot object directly for holoviews automatic display
             return plot_object
-        except Exception:
-            # Fallback to panel display for other notebook environments
-            if layout is not None:
-                return layout
-            else:
-                # Create a simple layout if none provided
-                simple_layout = panel.Column(plot_object)
-                return simple_layout
     else:
         # Display in browser (original behavior)
         if layout is not None:
@@ -512,7 +519,7 @@ def plot_2d(
     feats = feats.to_pandas()
     # if ms2_scans is not null, keep only the first element of the list
     feats["ms2_scans"] = feats["ms2_scans"].apply(
-        lambda x: x[0] if …
+        lambda x: x[0] if isinstance(x, list) else x,
     )
     if mz_range is not None:
         feats = feats[(feats["mz"] >= mz_range[0]) & (feats["mz"] <= mz_range[1])]
@@ -707,8 +714,6 @@ def plot_2d(
     class MarkerSizeController(param.Parameterized):
         size_slider = param.Number(default=markersize, bounds=(1, 20), step=0.5)

-    controller = MarkerSizeController()
-
     # Create a function that generates just the feature overlays with different sizes
     def create_feature_overlay(size_val):
         feature_overlay = None
@@ -808,7 +813,17 @@ def plot_2d(
     # Create layout
     layout = panel.Column(slider_widget, reactive_plot, sizing_mode="stretch_width")

-
+    # Handle filename saving for slider mode
+    if filename is not None:
+        if filename.endswith(".html"):
+            layout.save(filename, embed=True)
+        else:
+            # For slider plots, save the current state
+            hv.save(create_feature_overlay(markersize), filename, fmt="png")
+        return None
+    else:
+        # For notebook display, return the interactive layout
+        return _display_plot(layout, layout)
     else:
         # Create a panel layout without slider
         layout = panel.Column(overlay)
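From the caller's side, the new branch makes the slider variant behave like the non-slider path: an `.html` filename keeps the interactive widgets, any other extension falls back to a static PNG rendered at the current marker size, and with no filename the layout is returned for inline display. A hypothetical call pattern (plot_2d's full signature is not shown in this diff):

```python
# Hypothetical calls; `sample` and the exact plot_2d signature are assumptions.
sample.plot_2d(filename="features.html")  # interactive layout, widgets embedded
sample.plot_2d(filename="features.png")   # static snapshot at the default marker size
layout = sample.plot_2d()                 # no filename: returned for notebook display
```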
@@ -819,17 +834,11 @@ def plot_2d(
             layout.save(filename, embed=True)
         else:
             # save the panel layout as a png
-
-
-            hv.save(create_feature_overlay(markersize), filename, fmt="png")
-        else:
-            hv.save(overlay, filename, fmt="png")
+            hv.save(overlay, filename, fmt="png")
+        return None
     else:
         # Check if we're in a notebook environment and display appropriately
-
-        return _display_plot(layout, layout)
-    else:
-        return _display_plot(overlay, layout)
+        return _display_plot(overlay, layout)


 def plot_2d_oracle(
@@ -982,7 +991,7 @@ def plot_2d_oracle(
         oracle_data = pd.read_csv(
             os.path.join(oracle_folder, "diag", "summary_by_feature.csv"),
         )
-    except:
+    except Exception:
         print(f"Could not read {oracle_folder}/diag/summary_by_feature.csv")
         return
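Replacing the bare `except:` with `except Exception:` is more than style: a bare clause also traps `KeyboardInterrupt` and `SystemExit`, which derive from `BaseException` but not `Exception`, so Ctrl-C could previously be swallowed here:

```python
print(issubclass(KeyboardInterrupt, Exception))      # False - not caught by `except Exception`
print(issubclass(KeyboardInterrupt, BaseException))  # True  - caught by a bare `except:`
```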
{masster-0.3.13 → masster-0.3.15}/src/masster/study/defaults/study_def.py

@@ -18,6 +18,9 @@ class study_defaults:
     log_level (str): Logging level to be set for the logger. Default is "INFO".
     log_label (Optional[str]): Optional label for the logger. Default is None.
     log_sink (str): Output sink for logging. Default is "sys.stdout".
+    polarity (str): Polarity of the study (positive/negative). Default is "positive".
+    eic_mz_tol (float): Default m/z tolerance for EIC extraction and consensus selection. Default is 0.01.
+    eic_rt_tol (float): Default RT tolerance for EIC extraction and consensus selection. Default is 10.0.
     """

     folder: Optional[str] = None
@@ -27,6 +30,9 @@ class study_defaults:
     log_sink: str = "sys.stdout"
     polarity: str = "positive"

+    eic_mz_tol: float = 0.01
+    eic_rt_tol: float = 10.0
+
     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
             "folder": {
@@ -61,6 +67,20 @@ class study_defaults:
                 "default": "positive",
                 "allowed_values": ["positive", "negative", "pos", "neg"],
             },
+            "eic_mz_tol": {
+                "dtype": float,
+                "description": "Default m/z tolerance for EIC extraction and consensus selection (Da)",
+                "default": 0.01,
+                "min_value": 0.001,
+                "max_value": 1.0,
+            },
+            "eic_rt_tol": {
+                "dtype": float,
+                "description": "Default RT tolerance for EIC extraction and consensus selection (seconds)",
+                "default": 10.0,
+                "min_value": 0.2,
+                "max_value": 60.0,
+            },
         },
         repr=False,
     )
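The two tolerances become ordinary dataclass fields backed by validation metadata, so they can be overridden per study and consumed by get_eic and the consensus-selection code. A sketch, assuming the dataclass is importable from the path shown in the file list at the top of this diff:

```python
# Import path inferred from the diff's file list; treat it as an assumption.
from masster.study.defaults.study_def import study_defaults

params = study_defaults()
print(params.eic_mz_tol, params.eic_rt_tol)  # 0.01 (Da), 10.0 (s)
params.eic_mz_tol = 0.005  # per-study override within the declared 0.001-1.0 range
```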
{masster-0.3.13 → masster-0.3.15}/src/masster/study/h5.py

@@ -695,19 +695,59 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object_columns):
     if schema_columns is None:
         schema_columns = []

-    # …
+    # Get available columns from HDF5 file
+    hdf5_columns = list(group.keys())
+    logger.debug(f"HDF5 columns available: {hdf5_columns}")
+
+    # Handle column name migrations for backward compatibility first
+    if df_name == "samples_df":
+        # Migrate old column names to new names
+        column_migrations = {
+            "size": "num_features",
+            "file_source": "sample_source",
+            "ms1": "num_ms1",
+            "ms2": "num_ms2"
+        }
+
+        # Create a mapping of what's actually available after migrations
+        effective_columns = hdf5_columns.copy()
+        for old_name, new_name in column_migrations.items():
+            if old_name in effective_columns:
+                logger.info(f"Will migrate column '{old_name}' to '{new_name}' for backward compatibility")
+                # Add the new name to effective columns and optionally remove old name
+                effective_columns.append(new_name)
+
+    # First pass: load all existing columns (including migrated ones)
     for col in schema_columns or []:
-        if col not in group:
+        source_col = col
+
+        # Check if we need to load from a migrated column name
+        if df_name == "samples_df":
+            column_migrations = {
+                "size": "num_features",
+                "file_source": "sample_source",
+                "ms1": "num_ms1",
+                "ms2": "num_ms2"
+            }
+            # Reverse lookup - find old name for new name
+            reverse_migrations = {v: k for k, v in column_migrations.items()}
+            if col in reverse_migrations:
+                old_name = reverse_migrations[col]
+                if old_name in group:
+                    source_col = old_name
+                    logger.info(f"Loading '{col}' from old column name '{old_name}'")
+
+        if source_col not in group:
             missing_columns.append(col)
             continue

         dtype = schema[df_name]["columns"][col].get("dtype", "native")
         if dtype == "pl.Object" or col in object_columns:
             # Handle object columns specially
-            data[col] = _reconstruct_object_column(group[…
+            data[col] = _reconstruct_object_column(group[source_col][:], col)
         else:
             # Regular columns
-            column_data = group[…
+            column_data = group[source_col][:]

             # Convert -123 sentinel values back to None for numeric columns
             if len(column_data) > 0:
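The first pass is a reverse lookup: the schema asks for the new column name, and if only the old HDF5 dataset exists it is read instead. The same mapping expressed eagerly on a polars frame (an illustrative stand-in, not the HDF5 code path):

```python
import polars as pl

# Stand-in for an old-format samples_df; the real code reads HDF5 datasets,
# but the rename mapping is the one shown in the hunk above.
old = pl.DataFrame({"size": [120, 98], "file_source": ["a.mzML", "b.mzML"]})
column_migrations = {
    "size": "num_features",
    "file_source": "sample_source",
    "ms1": "num_ms1",
    "ms2": "num_ms2",
}
new = old.rename({k: v for k, v in column_migrations.items() if k in old.columns})
print(new.columns)  # ['num_features', 'sample_source']
```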
@@ -759,17 +799,43 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object_columns):
     # Second pass: handle missing columns
     for col in missing_columns:
         logger.warning(f"Column '{col}' not found in {df_name}.")
-        # For missing columns, create appropriately sized array
+        # For missing columns, create appropriately sized array with appropriate defaults
         if col in object_columns:
             data[col] = [None] * expected_length
             logger.debug(f"Created missing object column '{col}' with length {expected_length}")
         else:
-
-
+            # Provide specific default values for new columns for backward compatibility
+            if df_name == "samples_df":
+                if col == "sample_group":
+                    data[col] = [""] * expected_length  # Empty string default
+                    logger.debug(f"Created missing column '{col}' with empty string defaults")
+                elif col == "sample_batch":
+                    data[col] = [1] * expected_length  # Batch 1 default
+                    logger.debug(f"Created missing column '{col}' with batch 1 defaults")
+                elif col == "sample_sequence":
+                    # Create increasing sequence numbers
+                    data[col] = list(range(1, expected_length + 1))
+                    logger.debug(f"Created missing column '{col}' with sequence 1-{expected_length}")
+                else:
+                    data[col] = [None] * expected_length
+                    logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
+            else:
+                data[col] = [None] * expected_length
+                logger.debug(f"Created missing regular column '{col}' with length {expected_length}")

     # Check for columns in HDF5 file that are not in schema (for backward compatibility)
-
-
+    # But skip the old column names we already migrated
+    migrated_old_names = set()
+    if df_name == "samples_df":
+        column_migrations = {
+            "size": "num_features",
+            "file_source": "sample_source",
+            "ms1": "num_ms1",
+            "ms2": "num_ms2"
+        }
+        migrated_old_names = set(column_migrations.keys())
+
+    extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]

     for col in extra_columns:
         logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
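For columns that old study5 files never had, the loader synthesizes values rather than leaving nulls: empty group labels, a single default batch, and a 1..N acquisition sequence. Restated compactly (names and defaults taken from the hunk above; the dict form is illustrative):

```python
expected_length = 4  # number of samples in a stand-in study
backfill = {
    "sample_group": [""] * expected_length,                  # no group assigned
    "sample_batch": [1] * expected_length,                   # everything in batch 1
    "sample_sequence": list(range(1, expected_length + 1)),  # acquisition order 1..N
}
print(backfill["sample_sequence"])  # [1, 2, 3, 4]
```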
@@ -1320,9 +1386,12 @@ def _load_study5(self, filename=None):
     "sample_type": [],
     "size": [],
     "map_id": [],
-    "file_source": [],
-    "ms1": [],
-    "ms2": [],
+    "sample_source": [],
+    "num_ms1": [],
+    "num_ms2": [],
+    "sample_group": [],
+    "sample_batch": [],
+    "sample_sequence": [],
 },
 schema={
     "sample_uid": pl.Int64,

@@ -1330,10 +1399,13 @@ def _load_study5(self, filename=None):
     "sample_path": pl.Utf8,
     "sample_type": pl.Utf8,
     "size": pl.Int64,
-    "map_id": pl.Utf8,
-    "file_source": pl.Utf8,
-    "ms1": pl.Int64,
-    "ms2": pl.Int64,
+    "map_id": pl.Int64,
+    "sample_source": pl.Utf8,
+    "num_ms1": pl.Int64,
+    "num_ms2": pl.Int64,
+    "sample_group": pl.Utf8,
+    "sample_batch": pl.Int64,
+    "sample_sequence": pl.Int64,
 },
 )
 pbar.update(1)
@@ -1354,9 +1426,12 @@ def _load_study5(self, filename=None):
     "sample_type": [],
     "size": [],
     "map_id": [],
-    "file_source": [],
-    "ms1": [],
-    "ms2": [],
+    "sample_source": [],
+    "num_ms1": [],
+    "num_ms2": [],
+    "sample_group": [],
+    "sample_batch": [],
+    "sample_sequence": [],
 },
 schema={
     "sample_uid": pl.Int64,

@@ -1364,10 +1439,13 @@ def _load_study5(self, filename=None):
     "sample_path": pl.Utf8,
     "sample_type": pl.Utf8,
     "size": pl.Int64,
-    "map_id": pl.Utf8,
-    "file_source": pl.Utf8,
-    "ms1": pl.Int64,
-    "ms2": pl.Int64,
+    "map_id": pl.Int64,
+    "sample_source": pl.Utf8,
+    "num_ms1": pl.Int64,
+    "num_ms2": pl.Int64,
+    "sample_group": pl.Utf8,
+    "sample_batch": pl.Int64,
+    "sample_sequence": pl.Int64,
 },
 )
 pbar.update(1)
@@ -1463,4 +1541,23 @@ def _load_study5(self, filename=None):
     self.consensus_ms2 = None
     pbar.update(1)

+    # Check and migrate old string-based map_id to integer indices
+    if (self.samples_df is not None and
+            not self.samples_df.is_empty() and
+            self.samples_df['map_id'].dtype == pl.Utf8):
+        self.logger.info("Detected old string-based map_id format, migrating to integer indices")
+
+        # Convert string-based map_id to integer indices
+        sample_count = len(self.samples_df)
+        new_map_ids = list(range(sample_count))
+
+        self.samples_df = self.samples_df.with_columns(
+            pl.lit(new_map_ids).alias("map_id")
+        )
+
+        # Ensure the column is Int64 type
+        self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
+
+        self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format (0 to {sample_count - 1})")
+
     self.logger.debug("Study loaded")
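This load-time migration turns legacy string map_id values into positional integers, matching the new Int64 schema above. An equivalent, self-contained version using `pl.Series` (the diff uses `pl.lit` plus a cast; the resulting Int64 index column is the same):

```python
import polars as pl

# Stand-in for a legacy study with string map_ids.
samples_df = pl.DataFrame({"map_id": ["sampleA", "sampleB", "sampleC"]})
if samples_df["map_id"].dtype == pl.Utf8:
    samples_df = samples_df.with_columns(
        pl.Series("map_id", range(len(samples_df)), dtype=pl.Int64)
    )
print(samples_df["map_id"].to_list())  # [0, 1, 2]
```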