masster 0.3.14__py3-none-any.whl → 0.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/sample/h5.py +577 -0
- masster/sample/helpers.py +9 -2
- masster/sample/load.py +68 -7
- masster/sample/plot.py +43 -34
- masster/sample/sample.py +4 -0
- masster/spectrum.py +3 -0
- masster/study/defaults/fill_def.py +3 -3
- masster/study/defaults/study_def.py +20 -0
- masster/study/export.py +3 -0
- masster/study/h5.py +120 -23
- masster/study/helpers.py +482 -11
- masster/study/load.py +566 -205
- masster/study/plot.py +9 -2
- masster/study/study.py +32 -13
- masster/study/study5_schema.json +17 -5
- {masster-0.3.14.dist-info → masster-0.3.16.dist-info}/METADATA +1 -1
- {masster-0.3.14.dist-info → masster-0.3.16.dist-info}/RECORD +21 -21
- {masster-0.3.14.dist-info → masster-0.3.16.dist-info}/WHEEL +0 -0
- {masster-0.3.14.dist-info → masster-0.3.16.dist-info}/entry_points.txt +0 -0
- {masster-0.3.14.dist-info → masster-0.3.16.dist-info}/licenses/LICENSE +0 -0
masster/sample/load.py
CHANGED
|
@@ -110,6 +110,63 @@ def load(
|
|
|
110
110
|
self.label = label
|
|
111
111
|
|
|
112
112
|
|
|
113
|
+
def load_study(
    self,
    filename=None,
    ondisk=False,
    type=None,
    label=None,
):
    """
    Load a sample file for study use, routing .sample5 files to the study loader.

    The dispatch mirrors a regular file load, except that .sample5 files are
    read through ``_load_sample5_study()`` instead of ``_load_sample5()``.
    According to the original notes for this method, that loader skips the
    potentially large ms1_df dataset to improve throughput when samples are
    added to a study (the loader itself is defined elsewhere).

    Args:
        filename (str, optional): Path of the file to load. If None, falls
            back to ``self.file_path``.
        ondisk (bool, optional): Stored on ``self.ondisk`` before loading.
            Defaults to False.
        type (str, optional): Override for the file type. Only "ztscan"
            (case-insensitive) is recognised; anything else leaves the
            file type at "dda". Defaults to None.
        label (str, optional): If given, stored on ``self.label``.
            Defaults to None.

    Raises:
        FileNotFoundError: If no path is available (both ``filename`` and
            ``self.file_path`` are None) or the path does not exist.
        ValueError: If the file extension is not one of .mzML, .wiff,
            .wiff2, .raw or .sample5.

    Notes:
        - Only .sample5 files are handled differently from a normal load.
        - ``self.file_type`` is always reset to "dda" after loading unless
          ``type`` selects "ztscan".
    """
    if filename is None:
        filename = self.file_path
    # Guard explicitly: os.path.abspath(None) would otherwise fail with an
    # opaque TypeError instead of the documented FileNotFoundError.
    if filename is None:
        raise FileNotFoundError("Filename not valid. Provide a valid file path.")
    filename = os.path.abspath(filename)
    if not os.path.exists(filename):
        raise FileNotFoundError("Filename not valid. Provide a valid file path.")
    self.ondisk = ondisk

    # Extension matching is case-insensitive; lower-case the path only once.
    lowered = filename.lower()
    if lowered.endswith(".mzml"):
        self._load_mzML(filename)
    elif lowered.endswith((".wiff", ".wiff2")):
        self._load_wiff(filename)
    elif lowered.endswith(".raw"):
        self._load_raw(filename)
    elif lowered.endswith(".sample5"):
        # Study-specific loader instead of the regular _load_sample5.
        self._load_sample5_study(filename)
    else:
        raise ValueError("File must be .mzML, .wiff, *.raw, or .sample5")

    # Default acquisition type; only an explicit "ztscan" override changes it.
    self.file_type = "dda"
    if type is not None and type.lower() in ["ztscan"]:
        self.file_type = "ztscan"

    if label is not None:
        self.label = label
|
|
168
|
+
|
|
169
|
+
|
|
113
170
|
def _load_mzML(
|
|
114
171
|
self,
|
|
115
172
|
filename=None,
|
|
@@ -379,18 +436,23 @@ def _load_raw(
|
|
|
379
436
|
mz=peaks.mz.values,
|
|
380
437
|
inty=peaks.intensity.values,
|
|
381
438
|
ms_level=s["ms_level"],
|
|
382
|
-
centroided=False,
|
|
383
439
|
)
|
|
384
440
|
# remove peaks with intensity <= 0
|
|
385
441
|
|
|
386
442
|
bl = spect.baseline()
|
|
387
443
|
spect = spect.denoise(threshold=bl)
|
|
444
|
+
|
|
388
445
|
if spect.ms_level == 1:
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
446
|
+
# Use the same logic as mzML loading
|
|
447
|
+
mz = np.array(spect.mz)
|
|
448
|
+
median_diff = np.median(np.diff(np.sort(mz))) if mz.size > 1 else None
|
|
449
|
+
|
|
450
|
+
if median_diff is not None and median_diff < 0.01:
|
|
451
|
+
spect = spect.centroid(
|
|
452
|
+
tolerance=self.parameters.mz_tol_ms1_da,
|
|
453
|
+
ppm=self.parameters.mz_tol_ms1_ppm,
|
|
454
|
+
min_points=self.parameters.centroid_min_points_ms1,
|
|
455
|
+
)
|
|
394
456
|
newscan = {
|
|
395
457
|
"scan_uid": i,
|
|
396
458
|
"cycle": cycle,
|
|
@@ -544,7 +606,6 @@ def _load_wiff(
|
|
|
544
606
|
mz=peaks.mz.values,
|
|
545
607
|
inty=peaks.intensity.values,
|
|
546
608
|
ms_level=ms_level,
|
|
547
|
-
centroided=False,
|
|
548
609
|
)
|
|
549
610
|
bl = spect.baseline()
|
|
550
611
|
spect = spect.denoise(threshold=bl)
|
masster/sample/plot.py
CHANGED
|
@@ -56,7 +56,6 @@ from bokeh.models import HoverTool
|
|
|
56
56
|
from holoviews import dim
|
|
57
57
|
from holoviews.plotting.util import process_cmap
|
|
58
58
|
from matplotlib.colors import rgb2hex
|
|
59
|
-
from masster.chromatogram import Chromatogram
|
|
60
59
|
|
|
61
60
|
# Parameters removed - using hardcoded defaults
|
|
62
61
|
|
|
@@ -75,23 +74,36 @@ def _is_notebook_environment():
|
|
|
75
74
|
# Check for Jupyter/JupyterLab
|
|
76
75
|
from IPython import get_ipython
|
|
77
76
|
|
|
78
|
-
|
|
77
|
+
ipython = get_ipython()
|
|
78
|
+
if ipython is not None:
|
|
79
79
|
# Check if we're in a notebook context
|
|
80
|
-
shell =
|
|
80
|
+
shell = ipython.__class__.__name__
|
|
81
81
|
if shell in ["ZMQInteractiveShell", "Shell"]: # Jupyter notebook/lab
|
|
82
82
|
return True
|
|
83
83
|
|
|
84
|
-
# Check for Marimo
|
|
84
|
+
# Check for Marimo - multiple ways to detect it
|
|
85
85
|
import sys
|
|
86
86
|
|
|
87
|
+
# Check if marimo is in modules
|
|
87
88
|
if "marimo" in sys.modules:
|
|
88
89
|
return True
|
|
89
|
-
|
|
90
|
-
#
|
|
90
|
+
|
|
91
|
+
# Check for marimo in the call stack or environment
|
|
92
|
+
import inspect
|
|
93
|
+
frame = inspect.currentframe()
|
|
94
|
+
try:
|
|
95
|
+
while frame:
|
|
96
|
+
if frame.f_globals.get("__name__", "").startswith("marimo"):
|
|
97
|
+
return True
|
|
98
|
+
frame = frame.f_back
|
|
99
|
+
finally:
|
|
100
|
+
del frame
|
|
101
|
+
|
|
102
|
+
# Additional check for notebook environments via builtins
|
|
91
103
|
if hasattr(__builtins__, "__IPYTHON__") or hasattr(__builtins__, "_ih"):
|
|
92
104
|
return True
|
|
93
105
|
|
|
94
|
-
except ImportError:
|
|
106
|
+
except (ImportError, AttributeError):
|
|
95
107
|
pass
|
|
96
108
|
|
|
97
109
|
return False
|
|
@@ -106,22 +118,17 @@ def _display_plot(plot_object, layout=None):
|
|
|
106
118
|
layout: Optional panel layout object
|
|
107
119
|
|
|
108
120
|
Returns:
|
|
109
|
-
The
|
|
121
|
+
The plot object for inline display in notebooks, None for browser display
|
|
110
122
|
"""
|
|
111
123
|
if _is_notebook_environment():
|
|
112
|
-
#
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
#
|
|
124
|
+
# In notebook environments, return the plot object for inline display
|
|
125
|
+
# For Jupyter notebooks, holoviews/panel objects display automatically when returned
|
|
126
|
+
if layout is not None:
|
|
127
|
+
# Return the layout object which will display inline in notebooks
|
|
128
|
+
return layout
|
|
129
|
+
else:
|
|
130
|
+
# Return the plot object directly for holoviews automatic display
|
|
116
131
|
return plot_object
|
|
117
|
-
except Exception:
|
|
118
|
-
# Fallback to panel display for other notebook environments
|
|
119
|
-
if layout is not None:
|
|
120
|
-
return layout
|
|
121
|
-
else:
|
|
122
|
-
# Create a simple layout if none provided
|
|
123
|
-
simple_layout = panel.Column(plot_object)
|
|
124
|
-
return simple_layout
|
|
125
132
|
else:
|
|
126
133
|
# Display in browser (original behavior)
|
|
127
134
|
if layout is not None:
|
|
@@ -512,7 +519,7 @@ def plot_2d(
|
|
|
512
519
|
feats = feats.to_pandas()
|
|
513
520
|
# if ms2_scans is not null, keep only the first element of the list
|
|
514
521
|
feats["ms2_scans"] = feats["ms2_scans"].apply(
|
|
515
|
-
lambda x: x[0] if
|
|
522
|
+
lambda x: x[0] if isinstance(x, list) else x,
|
|
516
523
|
)
|
|
517
524
|
if mz_range is not None:
|
|
518
525
|
feats = feats[(feats["mz"] >= mz_range[0]) & (feats["mz"] <= mz_range[1])]
|
|
@@ -707,8 +714,6 @@ def plot_2d(
|
|
|
707
714
|
class MarkerSizeController(param.Parameterized):
|
|
708
715
|
size_slider = param.Number(default=markersize, bounds=(1, 20), step=0.5)
|
|
709
716
|
|
|
710
|
-
controller = MarkerSizeController()
|
|
711
|
-
|
|
712
717
|
# Create a function that generates just the feature overlays with different sizes
|
|
713
718
|
def create_feature_overlay(size_val):
|
|
714
719
|
feature_overlay = None
|
|
@@ -808,7 +813,17 @@ def plot_2d(
|
|
|
808
813
|
# Create layout
|
|
809
814
|
layout = on.Column(slider_widget, reactive_plot, sizing_mode="stretch_width")
|
|
810
815
|
|
|
811
|
-
|
|
816
|
+
# Handle filename saving for slider mode
|
|
817
|
+
if filename is not None:
|
|
818
|
+
if filename.endswith(".html"):
|
|
819
|
+
layout.save(filename, embed=True)
|
|
820
|
+
else:
|
|
821
|
+
# For slider plots, save the current state
|
|
822
|
+
hv.save(create_feature_overlay(markersize), filename, fmt="png")
|
|
823
|
+
return None
|
|
824
|
+
else:
|
|
825
|
+
# For notebook display, return the interactive layout
|
|
826
|
+
return _display_plot(layout, layout)
|
|
812
827
|
else:
|
|
813
828
|
# Create a panel layout without slider
|
|
814
829
|
layout = panel.Column(overlay)
|
|
@@ -819,17 +834,11 @@ def plot_2d(
|
|
|
819
834
|
layout.save(filename, embed=True)
|
|
820
835
|
else:
|
|
821
836
|
# save the panel layout as a png
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
hv.save(create_feature_overlay(markersize), filename, fmt="png")
|
|
825
|
-
else:
|
|
826
|
-
hv.save(overlay, filename, fmt="png")
|
|
837
|
+
hv.save(overlay, filename, fmt="png")
|
|
838
|
+
return None
|
|
827
839
|
else:
|
|
828
840
|
# Check if we're in a notebook environment and display appropriately
|
|
829
|
-
|
|
830
|
-
return _display_plot(layout, layout)
|
|
831
|
-
else:
|
|
832
|
-
return _display_plot(overlay, layout)
|
|
841
|
+
return _display_plot(overlay, layout)
|
|
833
842
|
|
|
834
843
|
|
|
835
844
|
def plot_2d_oracle(
|
|
@@ -982,7 +991,7 @@ def plot_2d_oracle(
|
|
|
982
991
|
oracle_data = pd.read_csv(
|
|
983
992
|
os.path.join(oracle_folder, "diag", "summary_by_feature.csv"),
|
|
984
993
|
)
|
|
985
|
-
except:
|
|
994
|
+
except Exception:
|
|
986
995
|
print(f"Could not read {oracle_folder}/diag/summary_by_feature.csv")
|
|
987
996
|
return
|
|
988
997
|
|
masster/sample/sample.py
CHANGED
|
@@ -49,6 +49,7 @@ from masster.sample.defaults.get_spectrum_def import get_spectrum_defaults
|
|
|
49
49
|
|
|
50
50
|
# Sample-specific imports
|
|
51
51
|
from masster.sample.h5 import _load_sample5
|
|
52
|
+
from masster.sample.h5 import _load_sample5_study
|
|
52
53
|
from masster.sample.h5 import _save_sample5
|
|
53
54
|
from masster.sample.helpers import _delete_ms2
|
|
54
55
|
from masster.sample.helpers import _estimate_memory_usage
|
|
@@ -72,6 +73,7 @@ from masster.sample.load import _load_wiff
|
|
|
72
73
|
from masster.sample.load import chrom_extract
|
|
73
74
|
from masster.sample.load import index_file
|
|
74
75
|
from masster.sample.load import load
|
|
76
|
+
from masster.sample.load import load_study
|
|
75
77
|
from masster.sample.load import sanitize
|
|
76
78
|
from masster.sample.plot import plot_2d
|
|
77
79
|
from masster.sample.plot import plot_2d_oracle
|
|
@@ -203,6 +205,7 @@ class Sample:
|
|
|
203
205
|
|
|
204
206
|
# Attach module functions as class methods
|
|
205
207
|
load = load
|
|
208
|
+
load_study = load_study
|
|
206
209
|
save = save
|
|
207
210
|
find_features = find_features
|
|
208
211
|
find_adducts = find_adducts
|
|
@@ -243,6 +246,7 @@ class Sample:
|
|
|
243
246
|
|
|
244
247
|
# Additional method assignments for all imported functions
|
|
245
248
|
_load_sample5 = _load_sample5
|
|
249
|
+
_load_sample5_study = _load_sample5_study
|
|
246
250
|
_save_sample5 = _save_sample5
|
|
247
251
|
_delete_ms2 = _delete_ms2
|
|
248
252
|
_estimate_memory_usage = _estimate_memory_usage
|
masster/spectrum.py
CHANGED
|
@@ -229,6 +229,9 @@ class Spectrum:
|
|
|
229
229
|
elif isinstance(value, (list, dict)):
|
|
230
230
|
# Create copies of mutable objects
|
|
231
231
|
result[key] = copy.deepcopy(value)
|
|
232
|
+
elif isinstance(value, np.number):
|
|
233
|
+
# Handle numpy scalar types (float32, int32, etc.)
|
|
234
|
+
result[key] = value.item()
|
|
232
235
|
else:
|
|
233
236
|
# Immutable objects can be copied directly
|
|
234
237
|
result[key] = value
|
|
@@ -23,7 +23,7 @@ class fill_defaults:
|
|
|
23
23
|
uids: Optional[list] = None
|
|
24
24
|
mz_tol: float = 0.010
|
|
25
25
|
rt_tol: float = 10.0
|
|
26
|
-
min_samples_rel: float = 0.
|
|
26
|
+
min_samples_rel: float = 0.00
|
|
27
27
|
min_samples_abs: int = 5
|
|
28
28
|
|
|
29
29
|
_param_metadata: dict[str, dict[str, Any]] = field(
|
|
@@ -37,7 +37,7 @@ class fill_defaults:
|
|
|
37
37
|
"dtype": float,
|
|
38
38
|
"description": "m/z tolerance for chromatogram extraction (Da)",
|
|
39
39
|
"default": 0.010,
|
|
40
|
-
"min_value": 0.
|
|
40
|
+
"min_value": 0.0002,
|
|
41
41
|
"max_value": 0.1,
|
|
42
42
|
},
|
|
43
43
|
"rt_tol": {
|
|
@@ -51,7 +51,7 @@ class fill_defaults:
|
|
|
51
51
|
"dtype": float,
|
|
52
52
|
"description": "Minimum relative samples threshold (fraction)",
|
|
53
53
|
"default": 0.05,
|
|
54
|
-
"min_value": 0.
|
|
54
|
+
"min_value": 0.0,
|
|
55
55
|
"max_value": 1.0,
|
|
56
56
|
},
|
|
57
57
|
"min_samples_abs": {
|
|
@@ -18,6 +18,9 @@ class study_defaults:
|
|
|
18
18
|
log_level (str): Logging level to be set for the logger. Default is "INFO".
|
|
19
19
|
log_label (Optional[str]): Optional label for the logger. Default is None.
|
|
20
20
|
log_sink (str): Output sink for logging. Default is "sys.stdout".
|
|
21
|
+
polarity (str): Polarity of the study (positive/negative). Default is "positive".
|
|
22
|
+
eic_mz_tol (float): Default m/z tolerance for EIC extraction and consensus selection. Default is 0.01.
|
|
23
|
+
eic_rt_tol (float): Default RT tolerance for EIC extraction and consensus selection. Default is 10.0.
|
|
21
24
|
"""
|
|
22
25
|
|
|
23
26
|
folder: Optional[str] = None
|
|
@@ -27,6 +30,9 @@ class study_defaults:
|
|
|
27
30
|
log_sink: str = "sys.stdout"
|
|
28
31
|
polarity: str = "positive"
|
|
29
32
|
|
|
33
|
+
eic_mz_tol: float = 0.01
|
|
34
|
+
eic_rt_tol: float = 10.0
|
|
35
|
+
|
|
30
36
|
_param_metadata: dict[str, dict[str, Any]] = field(
|
|
31
37
|
default_factory=lambda: {
|
|
32
38
|
"folder": {
|
|
@@ -61,6 +67,20 @@ class study_defaults:
|
|
|
61
67
|
"default": "positive",
|
|
62
68
|
"allowed_values": ["positive", "negative", "pos", "neg"],
|
|
63
69
|
},
|
|
70
|
+
"eic_mz_tol": {
|
|
71
|
+
"dtype": float,
|
|
72
|
+
"description": "Default m/z tolerance for EIC extraction and consensus selection (Da)",
|
|
73
|
+
"default": 0.01,
|
|
74
|
+
"min_value": 0.001,
|
|
75
|
+
"max_value": 1.0,
|
|
76
|
+
},
|
|
77
|
+
"eic_rt_tol": {
|
|
78
|
+
"dtype": float,
|
|
79
|
+
"description": "Default RT tolerance for EIC extraction and consensus selection (seconds)",
|
|
80
|
+
"default": 10.0,
|
|
81
|
+
"min_value": 0.2,
|
|
82
|
+
"max_value": 60.0,
|
|
83
|
+
},
|
|
64
84
|
},
|
|
65
85
|
repr=False,
|
|
66
86
|
)
|
masster/study/export.py
CHANGED
|
@@ -180,6 +180,9 @@ def _get_mgf_df(self, **kwargs):
|
|
|
180
180
|
for e in energies:
|
|
181
181
|
cons_ms2_e = cons_ms2[cons_ms2["energy"] == e]
|
|
182
182
|
if selection == "best":
|
|
183
|
+
# Check if the filtered DataFrame is empty
|
|
184
|
+
if len(cons_ms2_e) == 0:
|
|
185
|
+
continue
|
|
183
186
|
idx = cons_ms2_e["prec_inty"].idxmax()
|
|
184
187
|
cons_ms2_e_row = cons_ms2_e.loc[idx]
|
|
185
188
|
spect = cons_ms2_e_row["spec"]
|
masster/study/h5.py
CHANGED
|
@@ -695,19 +695,59 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
|
|
|
695
695
|
if schema_columns is None:
|
|
696
696
|
schema_columns = []
|
|
697
697
|
|
|
698
|
-
#
|
|
698
|
+
# Get available columns from HDF5 file
|
|
699
|
+
hdf5_columns = list(group.keys())
|
|
700
|
+
logger.debug(f"HDF5 columns available: {hdf5_columns}")
|
|
701
|
+
|
|
702
|
+
# Handle column name migrations for backward compatibility first
|
|
703
|
+
if df_name == "samples_df":
|
|
704
|
+
# Migrate old column names to new names
|
|
705
|
+
column_migrations = {
|
|
706
|
+
"size": "num_features",
|
|
707
|
+
"file_source": "sample_source",
|
|
708
|
+
"ms1": "num_ms1",
|
|
709
|
+
"ms2": "num_ms2"
|
|
710
|
+
}
|
|
711
|
+
|
|
712
|
+
# Create a mapping of what's actually available after migrations
|
|
713
|
+
effective_columns = hdf5_columns.copy()
|
|
714
|
+
for old_name, new_name in column_migrations.items():
|
|
715
|
+
if old_name in effective_columns:
|
|
716
|
+
logger.info(f"Will migrate column '{old_name}' to '{new_name}' for backward compatibility")
|
|
717
|
+
# Add the new name to effective columns and optionally remove old name
|
|
718
|
+
effective_columns.append(new_name)
|
|
719
|
+
|
|
720
|
+
# First pass: load all existing columns (including migrated ones)
|
|
699
721
|
for col in schema_columns or []:
|
|
700
|
-
|
|
722
|
+
source_col = col
|
|
723
|
+
|
|
724
|
+
# Check if we need to load from a migrated column name
|
|
725
|
+
if df_name == "samples_df":
|
|
726
|
+
column_migrations = {
|
|
727
|
+
"size": "num_features",
|
|
728
|
+
"file_source": "sample_source",
|
|
729
|
+
"ms1": "num_ms1",
|
|
730
|
+
"ms2": "num_ms2"
|
|
731
|
+
}
|
|
732
|
+
# Reverse lookup - find old name for new name
|
|
733
|
+
reverse_migrations = {v: k for k, v in column_migrations.items()}
|
|
734
|
+
if col in reverse_migrations:
|
|
735
|
+
old_name = reverse_migrations[col]
|
|
736
|
+
if old_name in group:
|
|
737
|
+
source_col = old_name
|
|
738
|
+
logger.info(f"Loading '{col}' from old column name '{old_name}'")
|
|
739
|
+
|
|
740
|
+
if source_col not in group:
|
|
701
741
|
missing_columns.append(col)
|
|
702
742
|
continue
|
|
703
743
|
|
|
704
744
|
dtype = schema[df_name]["columns"][col].get("dtype", "native")
|
|
705
745
|
if dtype == "pl.Object" or col in object_columns:
|
|
706
746
|
# Handle object columns specially
|
|
707
|
-
data[col] = _reconstruct_object_column(group[
|
|
747
|
+
data[col] = _reconstruct_object_column(group[source_col][:], col)
|
|
708
748
|
else:
|
|
709
749
|
# Regular columns
|
|
710
|
-
column_data = group[
|
|
750
|
+
column_data = group[source_col][:]
|
|
711
751
|
|
|
712
752
|
# Convert -123 sentinel values back to None for numeric columns
|
|
713
753
|
if len(column_data) > 0:
|
|
@@ -759,17 +799,43 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
|
|
|
759
799
|
# Second pass: handle missing columns
|
|
760
800
|
for col in missing_columns:
|
|
761
801
|
logger.warning(f"Column '{col}' not found in {df_name}.")
|
|
762
|
-
# For missing columns, create appropriately sized array
|
|
802
|
+
# For missing columns, create appropriately sized array with appropriate defaults
|
|
763
803
|
if col in object_columns:
|
|
764
804
|
data[col] = [None] * expected_length
|
|
765
805
|
logger.debug(f"Created missing object column '{col}' with length {expected_length}")
|
|
766
806
|
else:
|
|
767
|
-
|
|
768
|
-
|
|
807
|
+
# Provide specific default values for new columns for backward compatibility
|
|
808
|
+
if df_name == "samples_df":
|
|
809
|
+
if col == "sample_group":
|
|
810
|
+
data[col] = [""] * expected_length # Empty string default
|
|
811
|
+
logger.debug(f"Created missing column '{col}' with empty string defaults")
|
|
812
|
+
elif col == "sample_batch":
|
|
813
|
+
data[col] = [1] * expected_length # Batch 1 default
|
|
814
|
+
logger.debug(f"Created missing column '{col}' with batch 1 defaults")
|
|
815
|
+
elif col == "sample_sequence":
|
|
816
|
+
# Create increasing sequence numbers
|
|
817
|
+
data[col] = list(range(1, expected_length + 1))
|
|
818
|
+
logger.debug(f"Created missing column '{col}' with sequence 1-{expected_length}")
|
|
819
|
+
else:
|
|
820
|
+
data[col] = [None] * expected_length
|
|
821
|
+
logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
|
|
822
|
+
else:
|
|
823
|
+
data[col] = [None] * expected_length
|
|
824
|
+
logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
|
|
769
825
|
|
|
770
826
|
# Check for columns in HDF5 file that are not in schema (for backward compatibility)
|
|
771
|
-
|
|
772
|
-
|
|
827
|
+
# But skip the old column names we already migrated
|
|
828
|
+
migrated_old_names = set()
|
|
829
|
+
if df_name == "samples_df":
|
|
830
|
+
column_migrations = {
|
|
831
|
+
"size": "num_features",
|
|
832
|
+
"file_source": "sample_source",
|
|
833
|
+
"ms1": "num_ms1",
|
|
834
|
+
"ms2": "num_ms2"
|
|
835
|
+
}
|
|
836
|
+
migrated_old_names = set(column_migrations.keys())
|
|
837
|
+
|
|
838
|
+
extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]
|
|
773
839
|
|
|
774
840
|
for col in extra_columns:
|
|
775
841
|
logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
|
|
@@ -1320,9 +1386,12 @@ def _load_study5(self, filename=None):
|
|
|
1320
1386
|
"sample_type": [],
|
|
1321
1387
|
"size": [],
|
|
1322
1388
|
"map_id": [],
|
|
1323
|
-
"
|
|
1324
|
-
"
|
|
1325
|
-
"
|
|
1389
|
+
"sample_source": [],
|
|
1390
|
+
"num_ms1": [],
|
|
1391
|
+
"num_ms2": [],
|
|
1392
|
+
"sample_group": [],
|
|
1393
|
+
"sample_batch": [],
|
|
1394
|
+
"sample_sequence": [],
|
|
1326
1395
|
},
|
|
1327
1396
|
schema={
|
|
1328
1397
|
"sample_uid": pl.Int64,
|
|
@@ -1330,10 +1399,13 @@ def _load_study5(self, filename=None):
|
|
|
1330
1399
|
"sample_path": pl.Utf8,
|
|
1331
1400
|
"sample_type": pl.Utf8,
|
|
1332
1401
|
"size": pl.Int64,
|
|
1333
|
-
"map_id": pl.
|
|
1334
|
-
"
|
|
1335
|
-
"
|
|
1336
|
-
"
|
|
1402
|
+
"map_id": pl.Int64,
|
|
1403
|
+
"sample_source": pl.Utf8,
|
|
1404
|
+
"num_ms1": pl.Int64,
|
|
1405
|
+
"num_ms2": pl.Int64,
|
|
1406
|
+
"sample_group": pl.Utf8,
|
|
1407
|
+
"sample_batch": pl.Int64,
|
|
1408
|
+
"sample_sequence": pl.Int64,
|
|
1337
1409
|
},
|
|
1338
1410
|
)
|
|
1339
1411
|
pbar.update(1)
|
|
@@ -1354,9 +1426,12 @@ def _load_study5(self, filename=None):
|
|
|
1354
1426
|
"sample_type": [],
|
|
1355
1427
|
"size": [],
|
|
1356
1428
|
"map_id": [],
|
|
1357
|
-
"
|
|
1358
|
-
"
|
|
1359
|
-
"
|
|
1429
|
+
"sample_source": [],
|
|
1430
|
+
"num_ms1": [],
|
|
1431
|
+
"num_ms2": [],
|
|
1432
|
+
"sample_group": [],
|
|
1433
|
+
"sample_batch": [],
|
|
1434
|
+
"sample_sequence": [],
|
|
1360
1435
|
},
|
|
1361
1436
|
schema={
|
|
1362
1437
|
"sample_uid": pl.Int64,
|
|
@@ -1364,10 +1439,13 @@ def _load_study5(self, filename=None):
|
|
|
1364
1439
|
"sample_path": pl.Utf8,
|
|
1365
1440
|
"sample_type": pl.Utf8,
|
|
1366
1441
|
"size": pl.Int64,
|
|
1367
|
-
"map_id": pl.
|
|
1368
|
-
"
|
|
1369
|
-
"
|
|
1370
|
-
"
|
|
1442
|
+
"map_id": pl.Int64,
|
|
1443
|
+
"sample_source": pl.Utf8,
|
|
1444
|
+
"num_ms1": pl.Int64,
|
|
1445
|
+
"num_ms2": pl.Int64,
|
|
1446
|
+
"sample_group": pl.Utf8,
|
|
1447
|
+
"sample_batch": pl.Int64,
|
|
1448
|
+
"sample_sequence": pl.Int64,
|
|
1371
1449
|
},
|
|
1372
1450
|
)
|
|
1373
1451
|
pbar.update(1)
|
|
@@ -1463,4 +1541,23 @@ def _load_study5(self, filename=None):
|
|
|
1463
1541
|
self.consensus_ms2 = None
|
|
1464
1542
|
pbar.update(1)
|
|
1465
1543
|
|
|
1544
|
+
# Check and migrate old string-based map_id to integer indices
|
|
1545
|
+
if (self.samples_df is not None and
|
|
1546
|
+
not self.samples_df.is_empty() and
|
|
1547
|
+
self.samples_df['map_id'].dtype == pl.Utf8):
|
|
1548
|
+
self.logger.info("Detected old string-based map_id format, migrating to integer indices")
|
|
1549
|
+
|
|
1550
|
+
# Convert string-based map_id to integer indices
|
|
1551
|
+
sample_count = len(self.samples_df)
|
|
1552
|
+
new_map_ids = list(range(sample_count))
|
|
1553
|
+
|
|
1554
|
+
self.samples_df = self.samples_df.with_columns(
|
|
1555
|
+
pl.lit(new_map_ids).alias("map_id")
|
|
1556
|
+
)
|
|
1557
|
+
|
|
1558
|
+
# Ensure the column is Int64 type
|
|
1559
|
+
self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
|
|
1560
|
+
|
|
1561
|
+
self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format (0 to {sample_count - 1})")
|
|
1562
|
+
|
|
1466
1563
|
self.logger.debug("Study loaded")
|