masster 0.3.18-py3-none-any.whl → 0.3.20-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster has been flagged as potentially problematic.
- masster/__init__.py +2 -0
- masster/_version.py +1 -1
- masster/data/libs/README.md +17 -0
- masster/data/libs/ccm.py +533 -0
- masster/data/libs/central_carbon_README.md +17 -0
- masster/data/libs/central_carbon_metabolites.csv +120 -0
- masster/data/libs/urine.py +333 -0
- masster/data/libs/urine_metabolites.csv +51 -0
- masster/sample/h5.py +1 -1
- masster/sample/helpers.py +3 -7
- masster/sample/lib.py +32 -25
- masster/sample/load.py +9 -3
- masster/sample/plot.py +113 -27
- masster/study/export.py +27 -10
- masster/study/h5.py +58 -40
- masster/study/helpers.py +450 -196
- masster/study/helpers_optimized.py +5 -5
- masster/study/load.py +144 -118
- masster/study/plot.py +691 -277
- masster/study/processing.py +9 -5
- masster/study/study.py +6 -6
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/METADATA +1 -1
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/RECORD +31 -25
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/WHEEL +0 -0
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/entry_points.txt +0 -0
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/licenses/LICENSE +0 -0
masster/sample/plot.py
CHANGED
@@ -87,9 +87,10 @@ def _is_notebook_environment():
     # Check if marimo is in modules
     if "marimo" in sys.modules:
         return True
-
+
     # Check for marimo in the call stack or environment
     import inspect
+
     frame = inspect.currentframe()
     try:
         while frame:
@@ -140,6 +141,110 @@ def _display_plot(plot_object, layout=None):
     return None


+def _handle_sample_plot_output(self, plot_obj, filename=None, plot_type="bokeh"):
+    """
+    Helper function to handle consistent save/display behavior for sample plots.
+
+    Parameters:
+        plot_obj: The plot object (bokeh figure, holoviews layout, or panel object)
+        filename: Optional filename to save the plot
+        plot_type: Type of plot object ("bokeh", "panel", "holoviews")
+    """
+    if filename is not None:
+        # Convert relative paths to absolute paths using sample folder as base
+        import os
+        if hasattr(self, 'folder') and self.folder and not os.path.isabs(filename):
+            filename = os.path.join(self.folder, filename)
+
+        # Convert to absolute path for logging
+        abs_filename = os.path.abspath(filename)
+
+        if filename.endswith(".html"):
+            if plot_type == "panel":
+                plot_obj.save(filename, embed=True)  # type: ignore[attr-defined]
+            elif plot_type == "holoviews":
+                import panel
+                panel.panel(plot_obj).save(filename, embed=True)  # type: ignore[attr-defined]
+            elif plot_type == "bokeh":
+                from bokeh.plotting import output_file
+                from bokeh.io import save
+                output_file(filename)
+                save(plot_obj)
+            self.logger.info(f"Plot saved to: {abs_filename}")
+        elif filename.endswith(".png"):
+            try:
+                if plot_type == "bokeh":
+                    from bokeh.io.export import export_png
+                    export_png(plot_obj, filename=filename)
+                elif plot_type in ["panel", "holoviews"]:
+                    import holoviews as hv
+                    hv.save(plot_obj, filename, fmt="png")
+                self.logger.info(f"Plot saved to: {abs_filename}")
+            except Exception:
+                # Fall back to HTML if PNG export not available
+                html_filename = filename.replace('.png', '.html')
+                abs_html_filename = os.path.abspath(html_filename)
+                if plot_type == "panel":
+                    plot_obj.save(html_filename, embed=True)  # type: ignore[attr-defined]
+                elif plot_type == "holoviews":
+                    import panel
+                    panel.panel(plot_obj).save(html_filename, embed=True)  # type: ignore[attr-defined]
+                elif plot_type == "bokeh":
+                    from bokeh.plotting import output_file
+                    from bokeh.io import save
+                    output_file(html_filename)
+                    save(plot_obj)
+                self.logger.warning(f"PNG export not available, saved as HTML instead: {abs_html_filename}")
+        elif filename.endswith(".pdf"):
+            # Try to save as PDF, fall back to HTML if not available
+            try:
+                if plot_type == "bokeh":
+                    from bokeh.io.export import export_pdf
+                    export_pdf(plot_obj, filename=filename)
+                elif plot_type in ["panel", "holoviews"]:
+                    import holoviews as hv
+                    hv.save(plot_obj, filename, fmt="pdf")
+                self.logger.info(f"Plot saved to: {abs_filename}")
+            except ImportError:
+                # Fall back to HTML if PDF export not available
+                html_filename = filename.replace('.pdf', '.html')
+                abs_html_filename = os.path.abspath(html_filename)
+                if plot_type == "panel":
+                    plot_obj.save(html_filename, embed=True)  # type: ignore[attr-defined]
+                elif plot_type == "holoviews":
+                    import panel
+                    panel.panel(plot_obj).save(html_filename, embed=True)  # type: ignore[attr-defined]
+                elif plot_type == "bokeh":
+                    from bokeh.plotting import output_file
+                    from bokeh.io import save
+                    output_file(html_filename)
+                    save(plot_obj)
+                self.logger.warning(f"PDF export not available, saved as HTML instead: {abs_html_filename}")
+        else:
+            # Default to HTML for unknown extensions
+            if plot_type == "panel":
+                plot_obj.save(filename, embed=True)  # type: ignore[attr-defined]
+            elif plot_type == "holoviews":
+                import panel
+                panel.panel(plot_obj).save(filename, embed=True)  # type: ignore[attr-defined]
+            elif plot_type == "bokeh":
+                from bokeh.plotting import output_file
+                from bokeh.io import save
+                output_file(filename)
+                save(plot_obj)
+            self.logger.info(f"Plot saved to: {abs_filename}")
+    else:
+        # Show in notebook when no filename provided
+        if plot_type == "panel":
+            plot_obj.show()  # type: ignore[attr-defined]
+        elif plot_type == "holoviews":
+            import panel
+            return panel.panel(plot_obj)
+        elif plot_type == "bokeh":
+            from bokeh.plotting import show
+            show(plot_obj)
+
+
 def plot_chrom(
     self,
     feature_uid=None,
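Note: the remaining plot.py hunks below rewire plot_chrom, plot_2d, and plot_feature_stats to delegate to this helper. A minimal usage sketch, assuming the helper ends up bound as a method on the Sample class (as its self parameter suggests); the `sample` variable, `layout` object, and file names here are hypothetical:

# layout stands for any panel.Column that a plot method builds.
sample._handle_sample_plot_output(layout, "chrom.html", "panel")  # relative path resolves against sample.folder
sample._handle_sample_plot_output(layout, "chrom.png", "panel")   # PNG export needs extra dependencies (e.g. selenium for the bokeh backend); on failure it saves chrom.html and logs a warning
sample._handle_sample_plot_output(layout, None, "panel")          # no filename: display the plot instead of saving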
@@ -270,16 +375,9 @@ def plot_chrom(

     layout = layout.cols(1)
     layout = panel.Column(layout)
-
-
-
-            layout.save(filename, embed=True)
-        else:
-            # save the panel layout as a png
-            hv.save(layout, filename, fmt="png")
-    else:
-        # Check if we're in a notebook environment and display appropriately
-        return _display_plot(layout.object, layout)
+
+    # Use consistent save/display behavior
+    self._handle_sample_plot_output(layout, filename, "panel")


 def plot_2d(
@@ -829,12 +927,8 @@ def plot_2d(
         layout = panel.Column(overlay)

         if filename is not None:
-            #
-
-            layout.save(filename, embed=True)
-        else:
-            # save the panel layout as a png
-            hv.save(overlay, filename, fmt="png")
+            # Use consistent save/display behavior
+            self._handle_sample_plot_output(layout, filename, "panel")
             return None
         else:
             # Check if we're in a notebook environment and display appropriately
@@ -1959,16 +2053,8 @@ def plot_feature_stats(
     # Arrange the plots in a layout with three columns
     layout = hv.Layout(density_plots).cols(3).opts(shared_axes=False)

-        #
-
-        if filename.endswith(".html"):
-            panel.panel(layout).save(filename, embed=True)  # type: ignore[attr-defined]
-        else:
-            hv.save(layout, filename, fmt="png")
-    else:
-        # Check if we're in a notebook environment and display appropriately
-        layout_obj = panel.panel(layout)
-        return _display_plot(layout, layout_obj)
+    # Use consistent save/display behavior
+    self._handle_sample_plot_output(layout, filename, "holoviews")


 def plot_tic(
masster/study/export.py
CHANGED
@@ -445,7 +445,7 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
     mtd_lines.append("MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
     mtd_lines.append("MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
     mtd_lines.append(
-        "MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]"
+        "MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]",
     )
     mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
     mtd_lines.append("")
@@ -499,8 +499,16 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
     # Use the matrix as-is since it already has the correct sample columns
     # The matrix columns are sample names, which is what we want for the assay columns

-    # round to int
-    abundance_matrix
+    # round to int - handle both Polars and Pandas DataFrames
+    if hasattr(abundance_matrix, 'with_columns'):
+        # Polars DataFrame
+        numeric_cols = [col for col in abundance_matrix.columns if abundance_matrix[col].dtype.is_numeric()]
+        abundance_matrix = abundance_matrix.with_columns([
+            abundance_matrix[col].round(0) for col in numeric_cols
+        ])
+    else:
+        # Pandas DataFrame
+        abundance_matrix = abundance_matrix.round(0)

     # Use actual number of samples from the abundance matrix
     n_assays = len(abundance_matrix.columns)
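Note: the rounding branch above distinguishes the two frame libraries by duck typing, since only Polars DataFrames expose with_columns. A self-contained sketch of the same logic, assuming polars and pandas are both installed; the toy frames are hypothetical:

import pandas as pd
import polars as pl

def round_numeric(matrix):
    if hasattr(matrix, "with_columns"):  # Polars: round each numeric column explicitly
        numeric_cols = [c for c in matrix.columns if matrix[c].dtype.is_numeric()]
        return matrix.with_columns([matrix[c].round(0) for c in numeric_cols])
    return matrix.round(0)  # Pandas: round(0) leaves non-numeric columns untouched

print(round_numeric(pl.DataFrame({"uid": ["a", "b"], "area": [1.6, 2.4]})))
print(round_numeric(pd.DataFrame({"uid": ["a", "b"], "area": [1.6, 2.4]})))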
@@ -570,9 +578,14 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
         ]
         # Add abundance values for each assay
         consensus_uid = row["consensus_uid"]
-        if consensus_uid in abundance_matrix
-
-
+        # Check if consensus_uid exists in the abundance_matrix (Polars)
+        filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
+        if filtered_matrix.height > 0:
+            # Get the first (and should be only) matching row
+            abundance_row = filtered_matrix.row(0, named=True)
+            # Extract values excluding the consensus_uid column
+            abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
+            sml_row += [str(val) if val is not None else "null" for val in abundance_values]
         else:
             sml_row += ["null"] * n_assays
         sml_row += ["null", "null"]
@@ -615,11 +628,15 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
             str(row.get("retention_time_in_seconds_start", "null")),
             str(row.get("retention_time_in_seconds_end", "null")),
         ]
-        # Add abundance values for each assay - same as SML
+        # Add abundance values for each assay - same as SML (Polars)
         consensus_uid = row["consensus_uid"]
-
-
-
+        filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
+        if filtered_matrix.height > 0:
+            # Get the first (and should be only) matching row
+            abundance_row = filtered_matrix.row(0, named=True)
+            # Extract values excluding the consensus_uid column
+            abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
+            smf_row += [str(val) if val is not None else "null" for val in abundance_values]
         else:
             smf_row += ["null"] * n_assays
         smf_lines.append("\t".join(smf_row))
masster/study/h5.py
CHANGED
@@ -94,7 +94,7 @@ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=100
             numeric_cols.append(col)

     logger.debug(
-        f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns"
+        f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns",
     )

     # Process numeric columns in batch (most efficient)
@@ -277,7 +277,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                 results[chunk_start] = chunk_result
             except Exception as e:
                 logger.warning(
-                    f"Failed to serialize chunk starting at {chunk_start} for column '{col}': {e}"
+                    f"Failed to serialize chunk starting at {chunk_start} for column '{col}': {e}",
                 )
                 # Fallback to simple string conversion for this chunk
                 chunk = data_list[chunk_start : chunk_start + chunk_size]
@@ -435,7 +435,7 @@ def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, com
            group.create_dataset(col, data=data_as_str, compression=compression)
        else:
            logger.warning(
-                f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column."
+                f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column.",
            )
    elif dtype == "string":
        # Handle string columns
@@ -698,17 +698,17 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     # Get available columns from HDF5 file
     hdf5_columns = list(group.keys())
     logger.debug(f"HDF5 columns available: {hdf5_columns}")
-
+
     # Handle column name migrations for backward compatibility first
     if df_name == "samples_df":
         # Migrate old column names to new names
         column_migrations = {
             "size": "num_features",
-            "file_source": "sample_source",
+            "file_source": "sample_source",
             "ms1": "num_ms1",
-            "ms2": "num_ms2"
+            "ms2": "num_ms2",
         }
-
+
         # Create a mapping of what's actually available after migrations
         effective_columns = hdf5_columns.copy()
         for old_name, new_name in column_migrations.items():
@@ -720,14 +720,14 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     # First pass: load all existing columns (including migrated ones)
     for col in schema_columns or []:
         source_col = col
-
+
         # Check if we need to load from a migrated column name
         if df_name == "samples_df":
             column_migrations = {
                 "size": "num_features",
-                "file_source": "sample_source",
+                "file_source": "sample_source",
                 "ms1": "num_ms1",
-                "ms2": "num_ms2"
+                "ms2": "num_ms2",
             }
             # Reverse lookup - find old name for new name
             reverse_migrations = {v: k for k, v in column_migrations.items()}
@@ -736,7 +736,7 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
             if old_name in group:
                 source_col = old_name
                 logger.info(f"Loading '{col}' from old column name '{old_name}'")
-
+
         if source_col not in group:
             missing_columns.append(col)
             continue
@@ -829,12 +829,12 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     if df_name == "samples_df":
         column_migrations = {
             "size": "num_features",
-            "file_source": "sample_source",
+            "file_source": "sample_source",
             "ms1": "num_ms1",
-            "ms2": "num_ms2"
+            "ms2": "num_ms2",
         }
         migrated_old_names = set(column_migrations.keys())
-
+
     extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]

     for col in extra_columns:
@@ -974,7 +974,7 @@ def _save_study5_compressed(self, filename=None):

             pbar.update(1)
             pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes"
+                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes",
             )

             # Store samples_df - use optimized batch processing
@@ -987,7 +987,7 @@ def _save_study5_compressed(self, filename=None):
             # Store features_df - use fast method that skips chrom and ms2_specs columns
             if self.features_df is not None and not self.features_df.is_empty():
                 self.logger.debug(
-                    f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)"
+                    f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)",
                 )
                 _save_dataframe_optimized_fast(self.features_df, features_group, schema, "features_df", self.logger)
                 pbar.update(1)
@@ -1066,7 +1066,7 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
             numeric_cols.append(col)

     logger.debug(
-        f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns"
+        f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns",
     )

     # Process numeric columns in batch (most efficient)
@@ -1184,7 +1184,7 @@ def _save_study5(self, filename=None):

             pbar.update(1)
             pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes"
+                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes",
             )

             # Store samples_df - use optimized batch processing
@@ -1309,7 +1309,7 @@ def _load_study5(self, filename=None):
         ) as pbar:
             # Load metadata
             pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata"
+                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata",
             )
             if "metadata" in f:
                 metadata = f["metadata"]
@@ -1371,7 +1371,7 @@ def _load_study5(self, filename=None):

             # Load samples_df
             pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples"
+                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
             )
             if "samples" in f and len(f["samples"].keys()) > 0:
                 self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
@@ -1411,7 +1411,7 @@ def _load_study5(self, filename=None):
             pbar.update(1)
             # Load samples_df
             pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples"
+                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
             )
             if "samples" in f and len(f["samples"].keys()) > 0:
                 self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
@@ -1452,12 +1452,16 @@ def _load_study5(self, filename=None):

             # Load features_df
             pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features"
+                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features",
             )
             if "features" in f and len(f["features"].keys()) > 0:
                 object_columns = ["chrom", "ms2_scans", "ms2_specs"]
                 self.features_df = _load_dataframe_from_group(
-                    f["features"],
+                    f["features"],
+                    schema,
+                    "features_df",
+                    self.logger,
+                    object_columns,
                 )
             else:
                 self.features_df = None
@@ -1465,7 +1469,7 @@ def _load_study5(self, filename=None):

             # Load consensus_df
             pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus"
+                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus",
             )
             if "consensus" in f and len(f["consensus"].keys()) > 0:
                 # Only include adducts in object_columns if it actually exists in the file
@@ -1474,7 +1478,11 @@ def _load_study5(self, filename=None):
                     object_columns.append("adducts")

                 self.consensus_df = _load_dataframe_from_group(
-                    f["consensus"],
+                    f["consensus"],
+                    schema,
+                    "consensus_df",
+                    self.logger,
+                    object_columns,
                 )

                 # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
@@ -1507,22 +1515,28 @@ def _load_study5(self, filename=None):

             # Load consensus_mapping_df
             pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping"
+                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping",
             )
             if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
                 self.consensus_mapping_df = _load_dataframe_from_group(
-                    f["consensus_mapping"],
+                    f["consensus_mapping"],
+                    schema,
+                    "consensus_mapping_df",
+                    self.logger,
                 )
             else:
                 self.consensus_mapping_df = None
             pbar.update(1)
             # Load consensus_mapping_df
             pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping"
+                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping",
             )
             if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
                 self.consensus_mapping_df = _load_dataframe_from_group(
-                    f["consensus_mapping"],
+                    f["consensus_mapping"],
+                    schema,
+                    "consensus_mapping_df",
+                    self.logger,
                 )
             else:
                 self.consensus_mapping_df = None
@@ -1530,34 +1544,38 @@ def _load_study5(self, filename=None):

             # Load consensus_ms2
             pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2"
+                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2",
             )
             if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
                 object_columns = ["spec"]
                 self.consensus_ms2 = _load_dataframe_from_group(
-                    f["consensus_ms2"],
+                    f["consensus_ms2"],
+                    schema,
+                    "consensus_ms2",
+                    self.logger,
+                    object_columns,
                 )
             else:
                 self.consensus_ms2 = None
             pbar.update(1)

             # Check and migrate old string-based map_id to integer indices
-            if
-                not self.samples_df.is_empty() and
-                self.samples_df['map_id'].dtype == pl.Utf8):
+            if self.samples_df is not None and not self.samples_df.is_empty() and self.samples_df["map_id"].dtype == pl.Utf8:
                 self.logger.info("Detected old string-based map_id format, migrating to integer indices")
-
+
                 # Convert string-based map_id to integer indices
                 sample_count = len(self.samples_df)
                 new_map_ids = list(range(sample_count))
-
+
                 self.samples_df = self.samples_df.with_columns(
-                    pl.lit(new_map_ids).alias("map_id")
+                    pl.lit(new_map_ids).alias("map_id"),
                 )
-
+
                 # Ensure the column is Int64 type
                 self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
-
-                self.logger.info(
+
+                self.logger.info(
+                    f"Successfully migrated {sample_count} samples to indexed map_id format (0 to {sample_count - 1})",
+                )

             self.logger.debug("Study loaded")
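Note: beyond the trailing commas, the final hunk rewrites the map_id migration condition into a single line that also guards against samples_df being None. A sketch of the migration on a hypothetical samples_df; pl.Series with an explicit dtype stands in for the hunk's pl.lit(...).alias(...) plus cast, which amounts to the same result:

import polars as pl

samples_df = pl.DataFrame({
    "sample_name": ["s1", "s2", "s3"],
    "map_id": ["map_a", "map_b", "map_c"],  # old string-based format
})

# Replace string map_ids with positional integer indices, as the loader now does.
if samples_df is not None and not samples_df.is_empty() and samples_df["map_id"].dtype == pl.Utf8:
    samples_df = samples_df.with_columns(
        pl.Series("map_id", range(len(samples_df)), dtype=pl.Int64),
    )

print(samples_df["map_id"].to_list())  # [0, 1, 2]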