masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +3 -9
- masster/data/libs/README.md +1 -1
- masster/data/libs/ccm.csv +120 -120
- masster/data/libs/ccm.py +116 -62
- masster/data/libs/central_carbon_README.md +1 -1
- masster/data/libs/urine.py +161 -65
- masster/data/libs/urine_metabolites.csv +4693 -4693
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
- masster/logger.py +43 -78
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +264 -338
- masster/sample/defaults/find_adducts_def.py +8 -21
- masster/sample/defaults/find_features_def.py +1 -6
- masster/sample/defaults/get_spectrum_def.py +1 -5
- masster/sample/defaults/sample_def.py +1 -5
- masster/sample/h5.py +282 -561
- masster/sample/helpers.py +75 -131
- masster/sample/lib.py +17 -42
- masster/sample/load.py +17 -31
- masster/sample/parameters.py +2 -6
- masster/sample/plot.py +27 -88
- masster/sample/processing.py +87 -117
- masster/sample/quant.py +51 -57
- masster/sample/sample.py +90 -103
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +12 -35
- masster/sample/sciex.py +19 -66
- masster/spectrum.py +20 -58
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +1 -5
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/fill_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/integrate_def.py +1 -5
- masster/study/defaults/study_def.py +25 -58
- masster/study/export.py +207 -233
- masster/study/h5.py +136 -470
- masster/study/helpers.py +202 -495
- masster/study/helpers_optimized.py +13 -40
- masster/study/id.py +110 -213
- masster/study/load.py +143 -230
- masster/study/plot.py +257 -518
- masster/study/processing.py +257 -469
- masster/study/save.py +5 -15
- masster/study/study.py +276 -379
- masster/study/study5_schema.json +96 -96
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
- masster-0.4.1.dist-info/RECORD +67 -0
- masster-0.4.0.dist-info/RECORD +0 -67
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/study/h5.py
CHANGED
@@ -35,8 +35,8 @@ import h5py
 import polars as pl
 from tqdm import tqdm
 
-from
-from
+from masster.chromatogram import Chromatogram
+from masster.spectrum import Spectrum
 
 
 # Helper functions for HDF5 operations
@@ -109,13 +109,7 @@ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=100
 
         # Process object columns with optimized serialization
         if object_cols:
-            _save_object_columns_optimized(
-                group,
-                df_ordered,
-                object_cols,
-                logger,
-                chunk_size,
-            )
+            _save_object_columns_optimized(group, df_ordered, object_cols, logger, chunk_size)
 
     except Exception as e:
         logger.error(f"Failed to save DataFrame {df_name}: {e}")
@@ -152,33 +146,17 @@ def _save_numeric_column_fast(group, col, data_series, logger):
 
     # If sample value is a list/array, treat as object column
     if isinstance(sample_value, (list, tuple, np.ndarray)):
-        logger.debug(
-            f"Column '{col}' contains array-like data, treating as object",
-        )
-        _save_dataframe_column_legacy_single(
-            group,
-            col,
-            data_series.to_list(),
-            "object",
-            logger,
-        )
+        logger.debug(f"Column '{col}' contains array-like data, treating as object")
+        _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "object", logger)
         return
 
     # Otherwise, convert None values to -123 sentinel for mixed-type numeric columns
     try:
-        data_array = np.array(
-            [(-123 if x is None else float(x)) for x in data_array],
-        )
+        data_array = np.array([(-123 if x is None else float(x)) for x in data_array])
     except (ValueError, TypeError):
         # If conversion fails, this is not a numeric column
         logger.debug(f"Column '{col}' is not numeric, treating as object")
-        _save_dataframe_column_legacy_single(
-            group,
-            col,
-            data_series.to_list(),
-            "object",
-            logger,
-        )
+        _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "object", logger)
         return
 
     group.create_dataset(col, data=data_array, **compression_kwargs)
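A note on the `-123` sentinel visible in this hunk: `None` values in otherwise-numeric columns are swapped for a fixed numeric marker so the column can be written as a plain float array. A minimal standalone sketch of the same round trip (the `SENTINEL` constant and `decode_numeric` helper are illustrative, not part of masster's API):

```python
import numpy as np

SENTINEL = -123  # same in-band marker the diff uses for None

def encode_numeric(values):
    # Replace None with the sentinel so the column fits a homogeneous float array.
    return np.array([SENTINEL if v is None else float(v) for v in values])

def decode_numeric(arr):
    # Restore None on load by matching the sentinel.
    return [None if v == SENTINEL else float(v) for v in arr]

encoded = encode_numeric([1.5, None, 3.0])  # array([   1.5, -123. ,    3. ])
assert decode_numeric(encoded) == [1.5, None, 3.0]
```

The usual caveat of in-band sentinels applies: a genuine -123 in the data would be indistinguishable from a null.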
@@ -186,13 +164,7 @@ def _save_numeric_column_fast(group, col, data_series, logger):
     except Exception as e:
         logger.warning(f"Failed to save numeric column '{col}' efficiently: {e}")
         # Fallback to old method
-        _save_dataframe_column_legacy_single(
-            group,
-            col,
-            data_series.to_list(),
-            str(data_series.dtype),
-            logger,
-        )
+        _save_dataframe_column_legacy_single(group, col, data_series.to_list(), str(data_series.dtype), logger)
 
 
 def _save_string_column_fast(group, col, data_series, logger):

@@ -207,13 +179,7 @@ def _save_string_column_fast(group, col, data_series, logger):
     except Exception as e:
         logger.warning(f"Failed to save string column '{col}' efficiently: {e}")
         # Fallback to old method
-        _save_dataframe_column_legacy_single(
-            group,
-            col,
-            data_series.to_list(),
-            "string",
-            logger,
-        )
+        _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "string", logger)
 
 
 def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):

@@ -266,9 +232,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                     else:
                         serialized_chunk.append("None")
             else:
-                logger.warning(
-                    f"Unknown object column '{col_name}', using default serialization",
-                )
+                logger.warning(f"Unknown object column '{col_name}', using default serialization")
                 for item in chunk_data:
                     serialized_chunk.append(str(item) if item is not None else "None")
 

@@ -281,28 +245,16 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
            total_items = len(data_list)
 
            if total_items == 0:
-                group.create_dataset(
-                    col,
-                    data=[],
-                    compression="gzip",
-                    compression_opts=6,
-                )
+                group.create_dataset(col, data=[], compression="gzip", compression_opts=6)
                continue
 
            # For small datasets, process directly
            if total_items <= chunk_size:
                serialized_data = serialize_chunk(col, data_list)
-                group.create_dataset(
-                    col,
-                    data=serialized_data,
-                    compression="gzip",
-                    compression_opts=6,
-                )
+                group.create_dataset(col, data=serialized_data, compression="gzip", compression_opts=6)
            else:
                # For large datasets, use chunked processing with parallel serialization
-                logger.debug(
-                    f"Processing large object column '{col}' with {total_items} items in chunks",
-                )
+                logger.debug(f"Processing large object column '{col}' with {total_items} items in chunks")
 
                all_serialized = []
                num_chunks = (total_items + chunk_size - 1) // chunk_size

@@ -329,58 +281,28 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                        )
                        # Fallback to simple string conversion for this chunk
                        chunk = data_list[chunk_start : chunk_start + chunk_size]
-                        results[chunk_start] = [
-                            str(item) if item is not None else "None"
-                            for item in chunk
-                        ]
+                        results[chunk_start] = [str(item) if item is not None else "None" for item in chunk]
 
                # Reassemble in correct order
                for i in range(0, total_items, chunk_size):
                    if i in results:
                        all_serialized.extend(results[i])
 
-                group.create_dataset(
-                    col,
-                    data=all_serialized,
-                    compression="gzip",
-                    compression_opts=6,
-                )
+                group.create_dataset(col, data=all_serialized, compression="gzip", compression_opts=6)
 
        except Exception as e:
-            logger.warning(
-                f"Failed to save object column '{col}' with optimization: {e}",
-            )
+            logger.warning(f"Failed to save object column '{col}' with optimization: {e}")
            # Fallback to old method
-            _save_dataframe_column_legacy_single(
-                group,
-                col,
-                df[col].to_list(),
-                "object",
-                logger,
-            )
+            _save_dataframe_column_legacy_single(group, col, df[col].to_list(), "object", logger)
 
 
-def _save_dataframe_column_legacy_single(
-    group,
-    col: str,
-    data,
-    dtype: str,
-    logger,
-    compression="gzip",
-):
+def _save_dataframe_column_legacy_single(group, col: str, data, dtype: str, logger, compression="gzip"):
     """Legacy single column save method for fallback."""
     # This is the original _save_dataframe_column method for compatibility
     return _save_dataframe_column_legacy(group, col, data, dtype, logger, compression)
 
 
-def _save_dataframe_column_legacy(
-    group,
-    col: str,
-    data,
-    dtype: str,
-    logger,
-    compression="gzip",
-):
+def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, compression="gzip"):
     """
     Save a single DataFrame column to an HDF5 group with optimized compression.
 
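The chunked path in these hunks serializes large object columns piecewise, keeps results in a dict keyed by chunk offset, and reassembles them in offset order, so ordering survives even if chunks finish out of order. A self-contained sketch of that pattern, assuming JSON-serializable items (`serialize_chunk` here is a stand-in, not masster's implementation):

```python
import json
from concurrent.futures import ThreadPoolExecutor

def serialize_chunk(items):
    # Stand-in serializer: one JSON string per item, "None" for nulls.
    return [json.dumps(it) if it is not None else "None" for it in items]

def serialize_in_chunks(data, chunk_size=1000):
    results = {}  # keyed by chunk start offset, as in the code above
    with ThreadPoolExecutor() as pool:
        futures = {
            pool.submit(serialize_chunk, data[start:start + chunk_size]): start
            for start in range(0, len(data), chunk_size)
        }
        for fut, start in futures.items():
            results[start] = fut.result()
    # Reassemble in offset order, regardless of completion order.
    out = []
    for start in range(0, len(data), chunk_size):
        out.extend(results[start])
    return out

print(serialize_in_chunks([{"mz": 101.1}, None, [1, 2, 3]], chunk_size=2))
```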
@@ -405,10 +327,7 @@ def _save_dataframe_column_legacy(
 
     # Optimized compression configuration
     COMPRESSION_CONFIG = {
-        "fast_access": {
-            "compression": "lzf",
-            "shuffle": True,
-        },  # Fast I/O for IDs, rt, mz
+        "fast_access": {"compression": "lzf", "shuffle": True},  # Fast I/O for IDs, rt, mz
         "numeric": {"compression": "lzf"},  # Standard numeric data
         "string": {"compression": "gzip", "compression_opts": 6},  # String data
         "json": {"compression": "gzip", "compression_opts": 6},  # JSON objects

@@ -431,22 +350,11 @@
         return COMPRESSION_CONFIG["fast_access"]
 
     # JSON object columns (complex serialized data)
-    elif column_name in [
-        "spectrum",
-        "chromatogram",
-        "chromatograms",
-        "ms2_specs",
-        "chrom",
-    ]:
+    elif column_name in ["spectrum", "chromatogram", "chromatograms", "ms2_specs", "chrom"]:
         return COMPRESSION_CONFIG["json"]
 
     # String/text columns
-    elif data_type in ["string", "object"] and column_name in [
-        "sample_name",
-        "file_path",
-        "label",
-        "file_type",
-    ]:
+    elif data_type in ["string", "object"] and column_name in ["sample_name", "file_path", "label", "file_type"]:
         return COMPRESSION_CONFIG["string"]
 
     # Large bulk numeric data
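The `COMPRESSION_CONFIG` split in the two hunks above reflects a standard h5py tradeoff: LZF (often paired with the shuffle filter) decompresses quickly, which suits columns read on every load such as IDs, rt, and mz, while gzip level 6 compresses repetitive text and JSON harder at more CPU cost. A minimal comparison with made-up dataset names:

```python
import h5py
import numpy as np

with h5py.File("profiles_demo.h5", "w") as f:
    rt = np.random.rand(100_000)
    # "fast_access" profile: LZF + shuffle, cheap to decompress on every read.
    f.create_dataset("rt", data=rt, compression="lzf", shuffle=True)
    # "string"/"json" profile: gzip level 6, smaller on disk for repetitive text.
    labels = [f"sample_{i % 50}" for i in range(100_000)]
    f.create_dataset("labels", data=labels, compression="gzip", compression_opts=6)
```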
@@ -626,14 +534,9 @@ def _clean_string_nulls(df: pl.DataFrame) -> pl.DataFrame:
     """Convert string null representations to proper nulls."""
     for col in df.columns:
         if df[col].dtype == pl.Utf8:
-            df = df.with_columns(
-                [
-                    pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                    .then(None)
-                    .otherwise(pl.col(col))
-                    .alias(col),
-                ],
-            )
+            df = df.with_columns([
+                pl.when(pl.col(col).is_in(["None", "", "null", "NULL"])).then(None).otherwise(pl.col(col)).alias(col),
+            ])
     return df
 
 
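For reference, the `when/then/otherwise` chain in this hunk behaves like the following on a toy frame (data invented for illustration):

```python
import polars as pl

df = pl.DataFrame({"name": ["A", "None", "", "null", "B"]})
df = df.with_columns(
    pl.when(pl.col("name").is_in(["None", "", "null", "NULL"]))
    .then(None)
    .otherwise(pl.col("name"))
    .alias("name")
)
print(df["name"].to_list())  # ['A', None, None, None, 'B']
```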
@@ -674,11 +577,7 @@ def _apply_schema_casting(df: pl.DataFrame, schema: dict, df_name: str) -> pl.DataFrame:
     return df
 
 
-def _reorder_columns_by_schema(
-    df: pl.DataFrame,
-    schema: dict,
-    df_name: str,
-) -> pl.DataFrame:
+def _reorder_columns_by_schema(df: pl.DataFrame, schema: dict, df_name: str) -> pl.DataFrame:
     """Reorder DataFrame columns to match schema order."""
     if df_name not in schema or "columns" not in schema[df_name]:
         return df

@@ -732,24 +631,20 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataFrame:
             # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
             if col == "adducts":
                 # Handle adducts as List(Struct) - now contains dicts
-                df = df.with_columns(
-                    [
-                        pl.Series(
-                            col,
-                            values,
-                            dtype=pl.List(
-                                pl.Struct(
-                                    [
-                                        pl.Field("adduct", pl.Utf8),
-                                        pl.Field("count", pl.Int64),
-                                        pl.Field("percentage", pl.Float64),
-                                        pl.Field("mass", pl.Float64),
-                                    ],
-                                ),
-                            ),
+                df = df.with_columns([
+                    pl.Series(
+                        col,
+                        values,
+                        dtype=pl.List(
+                            pl.Struct([
+                                pl.Field("adduct", pl.Utf8),
+                                pl.Field("count", pl.Int64),
+                                pl.Field("percentage", pl.Float64),
+                                pl.Field("mass", pl.Float64),
+                            ]),
                         ),
-                    ],
-                )
+                    ),
+                ])
             else:
                 # Other object columns stay as Object
                 df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])

@@ -760,24 +655,20 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataFrame:
             # print(f"DEBUG: Creating object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
             if col == "adducts":
                 # Handle adducts as List(Struct) - now contains dicts
-                df = df.with_columns(
-                    [
-                        pl.Series(
-                            col,
-                            values,
-                            dtype=pl.List(
-                                pl.Struct(
-                                    [
-                                        pl.Field("adduct", pl.Utf8),
-                                        pl.Field("count", pl.Int64),
-                                        pl.Field("percentage", pl.Float64),
-                                        pl.Field("mass", pl.Float64),
-                                    ],
-                                ),
-                            ),
+                df = df.with_columns([
+                    pl.Series(
+                        col,
+                        values,
+                        dtype=pl.List(
+                            pl.Struct([
+                                pl.Field("adduct", pl.Utf8),
+                                pl.Field("count", pl.Int64),
+                                pl.Field("percentage", pl.Float64),
+                                pl.Field("mass", pl.Float64),
+                            ]),
                         ),
-                    ],
-                )
+                    ),
+                ])
             else:
                 # Other object columns stay as Object
                 df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
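Both hunks above give the adducts column an explicit `List(Struct)` dtype instead of a generic `Object`, which is what makes it serializable and queryable with ordinary Polars expressions. A freestanding example with invented adduct values:

```python
import polars as pl

adduct_dtype = pl.List(
    pl.Struct([
        pl.Field("adduct", pl.Utf8),
        pl.Field("count", pl.Int64),
        pl.Field("percentage", pl.Float64),
        pl.Field("mass", pl.Float64),
    ])
)

rows = [
    [{"adduct": "[M+H]+", "count": 12, "percentage": 80.0, "mass": 1.00728}],
    [],  # a feature with no adduct votes
]
s = pl.Series("adducts", rows, dtype=adduct_dtype)
print(s.dtype)  # List(Struct(...)) rather than Object
```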
@@ -785,13 +676,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataFrame:
     return df
 
 
-def _load_dataframe_from_group(
-    group,
-    schema: dict,
-    df_name: str,
-    logger,
-    object_columns: list | None = None,
-) -> pl.DataFrame:
+def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object_columns: list | None = None) -> pl.DataFrame:
     """Load a DataFrame from HDF5 group using schema."""
     if object_columns is None:
         object_columns = []

@@ -805,9 +690,7 @@ def _load_dataframe_from_group(
     )
     schema_section = schema.get(df_name, {}) if isinstance(schema, dict) else {}
     logger.debug(f"Schema section for {df_name}: {schema_section}")
-    schema_columns = (
-        schema_section.get("columns", []) if isinstance(schema_section, dict) else []
-    )
+    schema_columns = schema_section.get("columns", []) if isinstance(schema_section, dict) else []
     logger.debug(f"Schema columns for {df_name}: {schema_columns}")
     if schema_columns is None:
         schema_columns = []

@@ -830,9 +713,7 @@ def _load_dataframe_from_group(
     effective_columns = hdf5_columns.copy()
     for old_name, new_name in column_migrations.items():
         if old_name in effective_columns:
-            logger.info(
-                f"Will migrate column '{old_name}' to '{new_name}' for backward compatibility",
-            )
+            logger.info(f"Will migrate column '{old_name}' to '{new_name}' for backward compatibility")
             # Add the new name to effective columns and optionally remove old name
             effective_columns.append(new_name)

@@ -897,9 +778,7 @@ def _load_dataframe_from_group(
     for col, values in data.items():
         if values is not None and hasattr(values, "__len__"):
             expected_length = len(values)
-            logger.debug(
-                f"Determined expected_length={expected_length} from loaded column '{col}'",
-            )
+            logger.debug(f"Determined expected_length={expected_length} from loaded column '{col}'")
             break
 
     # If no data loaded yet, try HDF5 columns directly

@@ -909,9 +788,7 @@ def _load_dataframe_from_group(
             col_data = group[col][:]
             if expected_length is None:
                 expected_length = len(col_data)
-                logger.debug(
-                    f"Determined expected_length={expected_length} from HDF5 column '{col}'",
-                )
+                logger.debug(f"Determined expected_length={expected_length} from HDF5 column '{col}'")
                 break
 
     # Default to 0 if no data found

@@ -925,38 +802,26 @@ def _load_dataframe_from_group(
         # For missing columns, create appropriately sized array with appropriate defaults
         if col in object_columns:
             data[col] = [None] * expected_length
-            logger.debug(
-                f"Created missing object column '{col}' with length {expected_length}",
-            )
+            logger.debug(f"Created missing object column '{col}' with length {expected_length}")
         else:
             # Provide specific default values for new columns for backward compatibility
             if df_name == "samples_df":
                 if col == "sample_group":
                     data[col] = [""] * expected_length  # Empty string default
-                    logger.debug(
-                        f"Created missing column '{col}' with empty string defaults",
-                    )
+                    logger.debug(f"Created missing column '{col}' with empty string defaults")
                 elif col == "sample_batch":
                     data[col] = [1] * expected_length  # Batch 1 default
-                    logger.debug(
-                        f"Created missing column '{col}' with batch 1 defaults",
-                    )
+                    logger.debug(f"Created missing column '{col}' with batch 1 defaults")
                 elif col == "sample_sequence":
                     # Create increasing sequence numbers
                     data[col] = list(range(1, expected_length + 1))
-                    logger.debug(
-                        f"Created missing column '{col}' with sequence 1-{expected_length}",
-                    )
+                    logger.debug(f"Created missing column '{col}' with sequence 1-{expected_length}")
                 else:
                     data[col] = [None] * expected_length
-                    logger.debug(
-                        f"Created missing regular column '{col}' with length {expected_length}",
-                    )
+                    logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
             else:
                 data[col] = [None] * expected_length
-                logger.debug(
-                    f"Created missing regular column '{col}' with length {expected_length}",
-                )
+                logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
 
     # Check for columns in HDF5 file that are not in schema (for backward compatibility)
     # But skip the old column names we already migrated

@@ -970,11 +835,7 @@ def _load_dataframe_from_group(
     }
     migrated_old_names = set(column_migrations.keys())
 
-    extra_columns = [
-        col
-        for col in hdf5_columns
-        if col not in (schema_columns or []) and col not in migrated_old_names
-    ]
+    extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]
 
     for col in extra_columns:
         logger.info(f"Loading extra column '{col}' not in schema for {df_name}")

@@ -993,10 +854,7 @@ def _load_dataframe_from_group(
                     object_columns.append(col)
                 else:
                     # Regular string data
-                    data[col] = [
-                        item.decode("utf-8") if isinstance(item, bytes) else item
-                        for item in column_data
-                    ]
+                    data[col] = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
             except Exception:
                 # If decoding fails, treat as regular data
                 data[col] = column_data

@@ -1009,19 +867,10 @@ def _load_dataframe_from_group(
     # Handle byte string conversion for non-object columns
     # Only convert to strings for columns that should actually be strings
     for col, values in data.items():
-        if (
-            col not in object_columns
-            and values is not None
-            and len(values) > 0
-            and isinstance(values[0], bytes)
-        ):
+        if col not in object_columns and values is not None and len(values) > 0 and isinstance(values[0], bytes):
             # Check schema to see if this should be a string column
             should_be_string = False
-            if (
-                df_name in schema
-                and "columns" in schema[df_name]
-                and col in schema[df_name]["columns"]
-            ):
+            if df_name in schema and "columns" in schema[df_name] and col in schema[df_name]["columns"]:
                 dtype_str = schema[df_name]["columns"][col]["dtype"]
                 should_be_string = dtype_str == "pl.Utf8"
 
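The decode logic in the last few hunks exists because h5py hands variable-length strings back as Python `bytes`; anything destined for a `pl.Utf8` column must be decoded first. A round-trip sketch:

```python
import h5py

with h5py.File("strings_demo.h5", "w") as f:
    f.create_dataset("names", data=["blank", "QC_1"])

with h5py.File("strings_demo.h5", "r") as f:
    raw = f["names"][:]  # comes back as bytes, e.g. [b'blank', b'QC_1']
    names = [v.decode("utf-8") if isinstance(v, bytes) else v for v in raw]

print(names)  # ['blank', 'QC_1']
```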
@@ -1039,9 +888,7 @@ def _load_dataframe_from_group(
         logger.debug(f"Creating DataFrame with object columns: {object_columns}")
         for col in object_columns:
             if col in data:
-                logger.debug(
-                    f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}",
-                )
+                logger.debug(f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}")
         df = _create_dataframe_with_objects(data, object_columns)
     else:
         df = pl.DataFrame(data)

@@ -1087,22 +934,15 @@ def _save_study5_compressed(self, filename):
         dataframes_to_save.append(("features", len(self.features_df)))
     if self.consensus_df is not None and not self.consensus_df.is_empty():
         dataframes_to_save.append(("consensus", len(self.consensus_df)))
-    if (
-        self.consensus_mapping_df
-        and not self.consensus_mapping_df.is_empty()
-    ):
-        dataframes_to_save.append(
-            ("consensus_mapping", len(self.consensus_mapping_df)),
-        )
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
+        dataframes_to_save.append(("consensus_mapping", len(self.consensus_mapping_df)))
     if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
         dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
 
     total_steps = len(dataframes_to_save) + 1  # +1 for metadata
 
     # Show progress for large saves
-    tdqm_disable = (
-        self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
-    )
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
 
     with tqdm(
         total=total_steps,
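The switch from `if self.consensus_mapping_df` to an explicit `is not None` check in the hunk above is a behavioral fix, not just a reflow: Polars DataFrames deliberately do not define truthiness, so the old form raises `TypeError` whenever the attribute actually holds a DataFrame. A quick demonstration:

```python
import polars as pl

df = pl.DataFrame({"x": [1]})

try:
    if df:  # the old pattern
        pass
except TypeError as e:
    print(e)  # truth value of a DataFrame is ambiguous

# The new pattern is explicit and safe for both None and empty frames.
if df is not None and not df.is_empty():
    print("has rows")
```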
@@ -1118,14 +958,8 @@ def _save_study5_compressed(self, filename):
 
             # Store metadata
             metadata_group.attrs["format"] = "master-study-1"
-            metadata_group.attrs["folder"] = (
-                str(self.folder) if self.folder is not None else ""
-            )
-            metadata_group.attrs["label"] = (
-                str(self.label)
-                if hasattr(self, "label") and self.label is not None
-                else ""
-            )
+            metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
+            metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""
 
             # Store parameters as JSON
             if hasattr(self, "parameters") and self.history is not None:

@@ -1146,16 +980,8 @@ def _save_study5_compressed(self, filename):
             # Store samples_df - use optimized batch processing
             if self.samples_df is not None and not self.samples_df.is_empty():
                 samples_group = f.create_group("samples")
-                self.logger.debug(
-                    f"Saving samples_df with {len(self.samples_df)} rows using optimized method",
-                )
-                _save_dataframe_optimized(
-                    self.samples_df,
-                    samples_group,
-                    schema,
-                    "samples_df",
-                    self.logger,
-                )
+                self.logger.debug(f"Saving samples_df with {len(self.samples_df)} rows using optimized method")
+                _save_dataframe_optimized(self.samples_df, samples_group, schema, "samples_df", self.logger)
                 pbar.update(1)
 
             # Store features_df - use fast method that skips chrom and ms2_specs columns
@@ -1163,79 +989,38 @@ def _save_study5_compressed(self, filename):
                 self.logger.debug(
                     f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)",
                 )
-                _save_dataframe_optimized_fast(
-                    self.features_df,
-                    features_group,
-                    schema,
-                    "features_df",
-                    self.logger,
-                )
+                _save_dataframe_optimized_fast(self.features_df, features_group, schema, "features_df", self.logger)
                 pbar.update(1)
 
             # Store consensus_df - use optimized batch processing
             if self.consensus_df is not None and not self.consensus_df.is_empty():
-                self.logger.debug(
-                    f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method",
-                )
-                _save_dataframe_optimized(
-                    self.consensus_df,
-                    consensus_group,
-                    schema,
-                    "consensus_df",
-                    self.logger,
-                )
+                self.logger.debug(f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method")
+                _save_dataframe_optimized(self.consensus_df, consensus_group, schema, "consensus_df", self.logger)
                 pbar.update(1)
 
             # Store consensus_mapping_df - keep existing fast method
-            if (
-                self.consensus_mapping_df is not None
-                and not self.consensus_mapping_df.is_empty()
-            ):
+            if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
                 consensus_mapping = self.consensus_mapping_df.clone()
-                self.logger.debug(
-                    f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
-                )
+                self.logger.debug(f"Saving consensus_mapping_df with {len(consensus_mapping)} rows")
                 for col in consensus_mapping.columns:
                     try:
                         data = consensus_mapping[col].to_numpy()
                         # Use LZF compression for consensus mapping data
-                        consensus_mapping_group.create_dataset(
-                            col,
-                            data=data,
-                            compression="lzf",
-                            shuffle=True,
-                        )
+                        consensus_mapping_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
                     except Exception as e:
-                        self.logger.warning(
-                            f"Failed to save column '{col}' in consensus_mapping_df: {e}",
-                        )
+                        self.logger.warning(f"Failed to save column '{col}' in consensus_mapping_df: {e}")
                 pbar.update(1)
 
             # Store consensus_ms2 - use optimized batch processing
             if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
-                self.logger.debug(
-                    f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method",
-                )
-                _save_dataframe_optimized(
-                    self.consensus_ms2,
-                    consensus_ms2_group,
-                    schema,
-                    "consensus_ms2",
-                    self.logger,
-                )
+                self.logger.debug(f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method")
+                _save_dataframe_optimized(self.consensus_ms2, consensus_ms2_group, schema, "consensus_ms2", self.logger)
                 pbar.update(1)
 
         self.logger.debug(f"Fast save completed for {filename}")
 
 
-def _save_dataframe_optimized_fast(
-    df,
-    group,
-    schema,
-    df_name,
-    logger,
-    chunk_size=10000,
-):
+def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_size=10000):
     """
     Save DataFrame with optimized batch processing, but skip chrom and ms2_specs columns for features_df.
 

@@ -1260,9 +1045,7 @@ def _save_dataframe_optimized_fast(
     # Skip chrom and ms2_specs columns for features_df
     if df_name == "features_df":
         skip_columns = ["chrom", "ms2_specs"]
-        df_ordered = df_ordered.select(
-            [col for col in df_ordered.columns if col not in skip_columns],
-        )
+        df_ordered = df_ordered.select([col for col in df_ordered.columns if col not in skip_columns])
         logger.debug(f"Fast save: skipping columns {skip_columns} for {df_name}")
 
     total_rows = len(df_ordered)
@@ -1297,13 +1080,7 @@ def _save_dataframe_optimized_fast(
 
         # Process object columns with optimized serialization
         if object_cols:
-            _save_object_columns_optimized(
-                group,
-                df_ordered,
-                object_cols,
-                logger,
-                chunk_size,
-            )
+            _save_object_columns_optimized(group, df_ordered, object_cols, logger, chunk_size)
 
     except Exception as e:
         logger.error(f"Failed to save DataFrame {df_name}: {e}")

@@ -1366,22 +1143,15 @@ def _save_study5(self, filename):
         dataframes_to_save.append(("features", len(self.features_df)))
     if self.consensus_df is not None and not self.consensus_df.is_empty():
         dataframes_to_save.append(("consensus", len(self.consensus_df)))
-    if (
-        self.consensus_mapping_df
-        and not self.consensus_mapping_df.is_empty()
-    ):
-        dataframes_to_save.append(
-            ("consensus_mapping", len(self.consensus_mapping_df)),
-        )
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
+        dataframes_to_save.append(("consensus_mapping", len(self.consensus_mapping_df)))
     if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
         dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
 
     total_steps = len(dataframes_to_save) + 1  # +1 for metadata
 
     # Show progress for large saves
-    tdqm_disable = (
-        self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
-    )
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
 
     with tqdm(
         total=total_steps,

@@ -1397,14 +1167,8 @@ def _save_study5(self, filename):
 
             # Store metadata
             metadata_group.attrs["format"] = "master-study-1"
-            metadata_group.attrs["folder"] = (
-                str(self.folder) if self.folder is not None else ""
-            )
-            metadata_group.attrs["label"] = (
-                str(self.label)
-                if hasattr(self, "label") and self.label is not None
-                else ""
-            )
+            metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
+            metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""
 
             # Store parameters as JSON
             if hasattr(self, "parameters") and self.history is not None:
@@ -1425,83 +1189,39 @@ def _save_study5(self, filename):
             # Store samples_df - use optimized batch processing
             if self.samples_df is not None and not self.samples_df.is_empty():
                 samples_group = f.create_group("samples")
-                self.logger.debug(
-                    f"Saving samples_df with {len(self.samples_df)} rows using optimized method",
-                )
-                _save_dataframe_optimized(
-                    self.samples_df,
-                    samples_group,
-                    schema,
-                    "samples_df",
-                    self.logger,
-                )
+                self.logger.debug(f"Saving samples_df with {len(self.samples_df)} rows using optimized method")
+                _save_dataframe_optimized(self.samples_df, samples_group, schema, "samples_df", self.logger)
                 pbar.update(1)
 
             # Store features_df - use optimized batch processing
             if self.features_df is not None and not self.features_df.is_empty():
-                self.logger.debug(
-                    f"Saving features_df with {len(self.features_df)} rows using optimized method",
-                )
-                _save_dataframe_optimized(
-                    self.features_df,
-                    features_group,
-                    schema,
-                    "features_df",
-                    self.logger,
-                )
+                self.logger.debug(f"Saving features_df with {len(self.features_df)} rows using optimized method")
+                _save_dataframe_optimized(self.features_df, features_group, schema, "features_df", self.logger)
                 pbar.update(1)
 
             # Store consensus_df - use optimized batch processing
             if self.consensus_df is not None and not self.consensus_df.is_empty():
-                self.logger.debug(
-                    f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method",
-                )
-                _save_dataframe_optimized(
-                    self.consensus_df,
-                    consensus_group,
-                    schema,
-                    "consensus_df",
-                    self.logger,
-                )
+                self.logger.debug(f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method")
+                _save_dataframe_optimized(self.consensus_df, consensus_group, schema, "consensus_df", self.logger)
                 pbar.update(1)
 
             # Store consensus_mapping_df - keep existing fast method
-            if (
-                self.consensus_mapping_df is not None
-                and not self.consensus_mapping_df.is_empty()
-            ):
+            if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
                 consensus_mapping = self.consensus_mapping_df.clone()
-                self.logger.debug(
-                    f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
-                )
+                self.logger.debug(f"Saving consensus_mapping_df with {len(consensus_mapping)} rows")
                 for col in consensus_mapping.columns:
                     try:
                         data = consensus_mapping[col].to_numpy()
                         # Use LZF compression for consensus mapping data
-                        consensus_mapping_group.create_dataset(
-                            col,
-                            data=data,
-                            compression="lzf",
-                            shuffle=True,
-                        )
+                        consensus_mapping_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
                     except Exception as e:
-                        self.logger.warning(
-                            f"Failed to save column '{col}' in consensus_mapping_df: {e}",
-                        )
+                        self.logger.warning(f"Failed to save column '{col}' in consensus_mapping_df: {e}")
                 pbar.update(1)
 
             # Store consensus_ms2 - use optimized batch processing
             if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
-                self.logger.debug(
-                    f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method",
-                )
-                _save_dataframe_optimized(
-                    self.consensus_ms2,
-                    consensus_ms2_group,
-                    schema,
-                    "consensus_ms2",
-                    self.logger,
-                )
+                self.logger.debug(f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method")
+                _save_dataframe_optimized(self.consensus_ms2, consensus_ms2_group, schema, "consensus_ms2", self.logger)
                 pbar.update(1)
 
         self.logger.info(f"Study saved successfully to {filename}")
@@ -1551,9 +1271,7 @@ def _load_study5(self, filename=None):
     schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
     schema = _load_schema(schema_path)
     if not schema:
-        self.logger.warning(
-            f"Schema file {schema_path} not found. Using default types.",
-        )
+        self.logger.warning(f"Schema file {schema_path} not found. Using default types.")
 
     # Define loading steps for progress tracking
     loading_steps = [

@@ -1616,7 +1334,7 @@ def _load_study5(self, filename=None):
                 self.history = {}
 
             # Reconstruct self.parameters from loaded history
-            from
+            from masster.study.defaults.study_def import study_defaults
 
             # Always create a fresh study_defaults object to ensure we have all defaults
             self.parameters = study_defaults()
@@ -1625,48 +1343,27 @@ def _load_study5(self, filename=None):
             if self.history and "study" in self.history:
                 study_params = self.history["study"]
                 if isinstance(study_params, dict):
-                    failed_params = self.parameters.set_from_dict(
-                        study_params,
-                        validate=False,
-                    )
+                    failed_params = self.parameters.set_from_dict(study_params, validate=False)
                     if failed_params:
-                        self.logger.debug(
-                            f"Could not set study parameters: {failed_params}",
-                        )
+                        self.logger.debug(f"Could not set study parameters: {failed_params}")
                     else:
-                        self.logger.debug(
-                            "Successfully updated parameters from loaded history",
-                        )
+                        self.logger.debug("Successfully updated parameters from loaded history")
                 else:
-                    self.logger.debug(
-                        "Study parameters in history are not a valid dictionary",
-                    )
+                    self.logger.debug("Study parameters in history are not a valid dictionary")
             else:
-                self.logger.debug(
-                    "No study parameters found in history, using defaults",
-                )
+                self.logger.debug("No study parameters found in history, using defaults")
 
             # Synchronize instance attributes with parameters (similar to __init__)
             # Note: folder and label are already loaded from metadata attributes above
             # but we ensure they match the parameters for consistency
-            if (
-                hasattr(self.parameters, "folder")
-                and self.parameters.folder is not None
-            ):
+            if hasattr(self.parameters, "folder") and self.parameters.folder is not None:
                 self.folder = self.parameters.folder
-            if (
-                hasattr(self.parameters, "label")
-                and self.parameters.label is not None
-            ):
+            if hasattr(self.parameters, "label") and self.parameters.label is not None:
                 self.label = self.parameters.label
             if hasattr(self.parameters, "log_level"):
                 self.log_level = self.parameters.log_level
             if hasattr(self.parameters, "log_label"):
-                self.log_label = (
-                    self.parameters.log_label
-                    if self.parameters.log_label is not None
-                    else ""
-                )
+                self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
             if hasattr(self.parameters, "log_sink"):
                 self.log_sink = self.parameters.log_sink
             pbar.update(1)
@@ -1676,17 +1373,10 @@ def _load_study5(self, filename=None):
                 f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
             )
             if "samples" in f and len(f["samples"].keys()) > 0:
-                self.samples_df = _load_dataframe_from_group(
-                    f["samples"],
-                    schema,
-                    "samples_df",
-                    self.logger,
-                )
+                self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
             else:
                 # Initialize empty samples_df with the correct schema if no data exists
-                self.logger.debug(
-                    "No samples data found in study5 file. Initializing empty samples_df.",
-                )
+                self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
                 self.samples_df = pl.DataFrame(
                     {
                         "sample_uid": [],

@@ -1723,17 +1413,10 @@ def _load_study5(self, filename=None):
                 f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
             )
             if "samples" in f and len(f["samples"].keys()) > 0:
-                self.samples_df = _load_dataframe_from_group(
-                    f["samples"],
-                    schema,
-                    "samples_df",
-                    self.logger,
-                )
+                self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
             else:
                 # Initialize empty samples_df with the correct schema if no data exists
-                self.logger.debug(
-                    "No samples data found in study5 file. Initializing empty samples_df.",
-                )
+                self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
                 self.samples_df = pl.DataFrame(
                     {
                         "sample_uid": [],
@@ -1803,39 +1486,28 @@ def _load_study5(self, filename=None):
 
             # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
             if self.consensus_df is not None:
-                if (
-                    "adducts" not in self.consensus_df.columns
-                    or self.consensus_df["adducts"].dtype == pl.Null
-                ):
-                    self.logger.info(
-                        "Adding missing 'adducts' column for backward compatibility",
-                    )
-                    empty_adducts: list[list] = [
-                        [] for _ in range(len(self.consensus_df))
-                    ]
+                if "adducts" not in self.consensus_df.columns or self.consensus_df["adducts"].dtype == pl.Null:
+                    self.logger.info("Adding missing 'adducts' column for backward compatibility")
+                    empty_adducts: list[list] = [[] for _ in range(len(self.consensus_df))]
 
                     # If column exists but is Null, drop it first
                     if "adducts" in self.consensus_df.columns:
                         self.consensus_df = self.consensus_df.drop("adducts")
 
-                    self.consensus_df = self.consensus_df.with_columns(
-                        [
-                            pl.Series(
-                                "adducts",
-                                empty_adducts,
-                                dtype=pl.List(
-                                    pl.Struct(
-                                        [
-                                            pl.Field("adduct", pl.Utf8),
-                                            pl.Field("count", pl.Int64),
-                                            pl.Field("percentage", pl.Float64),
-                                            pl.Field("mass", pl.Float64),
-                                        ],
-                                    ),
-                                ),
+                    self.consensus_df = self.consensus_df.with_columns([
+                        pl.Series(
+                            "adducts",
+                            empty_adducts,
+                            dtype=pl.List(
+                                pl.Struct([
+                                    pl.Field("adduct", pl.Utf8),
+                                    pl.Field("count", pl.Int64),
+                                    pl.Field("percentage", pl.Float64),
+                                    pl.Field("mass", pl.Float64),
+                                ]),
                             ),
-                        ],
-                    )
+                        ),
+                    ])
             else:
                 self.consensus_df = None
             pbar.update(1)
@@ -1887,14 +1559,8 @@ def _load_study5(self, filename=None):
             pbar.update(1)
 
             # Check and migrate old string-based map_id to integer indices
-            if (
-                self.samples_df is not None
-                and not self.samples_df.is_empty()
-                and self.samples_df["map_id"].dtype == pl.Utf8
-            ):
-                self.logger.info(
-                    "Detected old string-based map_id format, migrating to integer indices",
-                )
+            if self.samples_df is not None and not self.samples_df.is_empty() and self.samples_df["map_id"].dtype == pl.Utf8:
+                self.logger.info("Detected old string-based map_id format, migrating to integer indices")
 
                 # Convert string-based map_id to integer indices
                 sample_count = len(self.samples_df)