masster 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster has been flagged as possibly problematic by the registry diff service.
- masster/chromatogram.py +2 -2
- masster/data/libs/urine.csv +3 -3
- masster/logger.py +8 -8
- masster/sample/adducts.py +337 -263
- masster/sample/defaults/find_adducts_def.py +21 -8
- masster/sample/h5.py +557 -278
- masster/sample/helpers.py +131 -75
- masster/sample/lib.py +2 -2
- masster/sample/load.py +25 -11
- masster/sample/plot.py +5 -5
- masster/sample/processing.py +115 -85
- masster/sample/sample.py +28 -15
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +34 -11
- masster/spectrum.py +2 -2
- masster/study/defaults/align_def.py +5 -1
- masster/study/defaults/identify_def.py +3 -1
- masster/study/defaults/study_def.py +58 -25
- masster/study/export.py +354 -204
- masster/study/h5.py +557 -155
- masster/study/helpers.py +487 -194
- masster/study/id.py +536 -347
- masster/study/load.py +228 -138
- masster/study/plot.py +68 -68
- masster/study/processing.py +455 -253
- masster/study/save.py +14 -4
- masster/study/study.py +122 -40
- masster/study/study5_schema.json +149 -149
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/METADATA +5 -3
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/RECORD +34 -34
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/WHEEL +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/entry_points.txt +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/licenses/LICENSE +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/top_level.txt +0 -0
masster/study/h5.py
CHANGED
@@ -109,7 +109,13 @@ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=100
 
         # Process object columns with optimized serialization
         if object_cols:
-            _save_object_columns_optimized(
+            _save_object_columns_optimized(
+                group,
+                df_ordered,
+                object_cols,
+                logger,
+                chunk_size,
+            )
 
     except Exception as e:
         logger.error(f"Failed to save DataFrame {df_name}: {e}")
@@ -146,17 +152,33 @@ def _save_numeric_column_fast(group, col, data_series, logger)
 
         # If sample value is a list/array, treat as object column
         if isinstance(sample_value, (list, tuple, np.ndarray)):
-            logger.debug(
-
+            logger.debug(
+                f"Column '{col}' contains array-like data, treating as object",
+            )
+            _save_dataframe_column_legacy_single(
+                group,
+                col,
+                data_series.to_list(),
+                "object",
+                logger,
+            )
             return
 
         # Otherwise, convert None values to -123 sentinel for mixed-type numeric columns
         try:
-            data_array = np.array(
+            data_array = np.array(
+                [(-123 if x is None else float(x)) for x in data_array],
+            )
         except (ValueError, TypeError):
             # If conversion fails, this is not a numeric column
             logger.debug(f"Column '{col}' is not numeric, treating as object")
-            _save_dataframe_column_legacy_single(
+            _save_dataframe_column_legacy_single(
+                group,
+                col,
+                data_series.to_list(),
+                "object",
+                logger,
+            )
             return
 
         group.create_dataset(col, data=data_array, **compression_kwargs)
@@ -164,7 +186,13 @@ def _save_numeric_column_fast(group, col, data_series, logger)
     except Exception as e:
         logger.warning(f"Failed to save numeric column '{col}' efficiently: {e}")
         # Fallback to old method
-        _save_dataframe_column_legacy_single(
+        _save_dataframe_column_legacy_single(
+            group,
+            col,
+            data_series.to_list(),
+            str(data_series.dtype),
+            logger,
+        )
 
 
 def _save_string_column_fast(group, col, data_series, logger):
@@ -179,7 +207,13 @@ def _save_string_column_fast(group, col, data_series, logger)
     except Exception as e:
         logger.warning(f"Failed to save string column '{col}' efficiently: {e}")
         # Fallback to old method
-        _save_dataframe_column_legacy_single(
+        _save_dataframe_column_legacy_single(
+            group,
+            col,
+            data_series.to_list(),
+            "string",
+            logger,
+        )
 
 
 def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
@@ -232,7 +266,9 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size)
                 else:
                     serialized_chunk.append("None")
             else:
-                logger.warning(
+                logger.warning(
+                    f"Unknown object column '{col_name}', using default serialization",
+                )
                 for item in chunk_data:
                     serialized_chunk.append(str(item) if item is not None else "None")
 
@@ -245,16 +281,28 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size)
             total_items = len(data_list)
 
             if total_items == 0:
-                group.create_dataset(
+                group.create_dataset(
+                    col,
+                    data=[],
+                    compression="gzip",
+                    compression_opts=6,
+                )
                 continue
 
             # For small datasets, process directly
             if total_items <= chunk_size:
                 serialized_data = serialize_chunk(col, data_list)
-                group.create_dataset(
+                group.create_dataset(
+                    col,
+                    data=serialized_data,
+                    compression="gzip",
+                    compression_opts=6,
+                )
             else:
                 # For large datasets, use chunked processing with parallel serialization
-                logger.debug(
+                logger.debug(
+                    f"Processing large object column '{col}' with {total_items} items in chunks",
+                )
 
                 all_serialized = []
                 num_chunks = (total_items + chunk_size - 1) // chunk_size
@@ -281,28 +329,58 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size)
                         )
                         # Fallback to simple string conversion for this chunk
                         chunk = data_list[chunk_start : chunk_start + chunk_size]
-                        results[chunk_start] = [
+                        results[chunk_start] = [
+                            str(item) if item is not None else "None"
+                            for item in chunk
+                        ]
 
                 # Reassemble in correct order
                 for i in range(0, total_items, chunk_size):
                     if i in results:
                         all_serialized.extend(results[i])
 
-                group.create_dataset(
+                group.create_dataset(
+                    col,
+                    data=all_serialized,
+                    compression="gzip",
+                    compression_opts=6,
+                )
 
         except Exception as e:
-            logger.warning(
+            logger.warning(
+                f"Failed to save object column '{col}' with optimization: {e}",
+            )
             # Fallback to old method
-            _save_dataframe_column_legacy_single(
+            _save_dataframe_column_legacy_single(
+                group,
+                col,
+                df[col].to_list(),
+                "object",
+                logger,
+            )
 
 
-def _save_dataframe_column_legacy_single(
+def _save_dataframe_column_legacy_single(
+    group,
+    col: str,
+    data,
+    dtype: str,
+    logger,
+    compression="gzip",
+):
     """Legacy single column save method for fallback."""
     # This is the original _save_dataframe_column method for compatibility
     return _save_dataframe_column_legacy(group, col, data, dtype, logger, compression)
 
 
-def _save_dataframe_column_legacy(
+def _save_dataframe_column_legacy(
+    group,
+    col: str,
+    data,
+    dtype: str,
+    logger,
+    compression="gzip",
+):
     """
     Save a single DataFrame column to an HDF5 group with optimized compression.
 
@@ -327,7 +405,10 @@ def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, com
 
     # Optimized compression configuration
     COMPRESSION_CONFIG = {
-        "fast_access": {
+        "fast_access": {
+            "compression": "lzf",
+            "shuffle": True,
+        },  # Fast I/O for IDs, rt, mz
         "numeric": {"compression": "lzf"},  # Standard numeric data
         "string": {"compression": "gzip", "compression_opts": 6},  # String data
         "json": {"compression": "gzip", "compression_opts": 6},  # JSON objects
@@ -350,11 +431,22 @@ def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, com
             return COMPRESSION_CONFIG["fast_access"]
 
         # JSON object columns (complex serialized data)
-        elif column_name in [
+        elif column_name in [
+            "spectrum",
+            "chromatogram",
+            "chromatograms",
+            "ms2_specs",
+            "chrom",
+        ]:
             return COMPRESSION_CONFIG["json"]
 
         # String/text columns
-        elif data_type in ["string", "object"] and column_name in [
+        elif data_type in ["string", "object"] and column_name in [
+            "sample_name",
+            "file_path",
+            "label",
+            "file_type",
+        ]:
             return COMPRESSION_CONFIG["string"]
 
         # Large bulk numeric data
@@ -524,12 +616,16 @@ def _reconstruct_object_column(data_col, col_name: str)
             for adduct_row in adducts_list:
                 if len(adduct_row) >= 3:
                     # Convert from [adduct, count, percentage] to dict structure
-                    converted_adducts.append(
-
-
-
-
-
+                    converted_adducts.append(
+                        {
+                            "adduct": str(adduct_row[0]),
+                            "count": int(float(adduct_row[1])),
+                            "percentage": float(adduct_row[2]),
+                            "mass": float(adduct_row[3])
+                            if len(adduct_row) > 3
+                            else 0.0,
+                        },
+                    )
             reconstructed_data.append(converted_adducts)
         else:
             # Unknown object column
@@ -544,9 +640,14 @@ def _clean_string_nulls(df: pl.DataFrame) -> pl.DataFrame
     """Convert string null representations to proper nulls."""
     for col in df.columns:
         if df[col].dtype == pl.Utf8:
-            df = df.with_columns(
-
-
+            df = df.with_columns(
+                [
+                    pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                    .then(None)
+                    .otherwise(pl.col(col))
+                    .alias(col),
+                ],
+            )
     return df
 
 
@@ -587,7 +688,11 @@ def _apply_schema_casting(df: pl.DataFrame, schema: dict, df_name: str) -> pl.Da
     return df
 
 
-def _reorder_columns_by_schema(
+def _reorder_columns_by_schema(
+    df: pl.DataFrame,
+    schema: dict,
+    df_name: str,
+) -> pl.DataFrame:
     """Reorder DataFrame columns to match schema order."""
     if df_name not in schema or "columns" not in schema[df_name]:
         return df
@@ -641,20 +746,24 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
             # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
             if col == "adducts":
                 # Handle adducts as List(Struct) - now contains dicts
-                df = df.with_columns(
-
-
-
-
-                    pl.
-                    pl.
-
-
-
-
+                df = df.with_columns(
+                    [
+                        pl.Series(
+                            col,
+                            values,
+                            dtype=pl.List(
+                                pl.Struct(
+                                    [
+                                        pl.Field("adduct", pl.Utf8),
+                                        pl.Field("count", pl.Int64),
+                                        pl.Field("percentage", pl.Float64),
+                                        pl.Field("mass", pl.Float64),
+                                    ],
+                                ),
+                            ),
+                        ),
                         ),
-
-
+                    ],
+                )
             else:
                 # Other object columns stay as Object
                 df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
@@ -665,20 +774,24 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
             # print(f"DEBUG: Creating object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
             if col == "adducts":
                 # Handle adducts as List(Struct) - now contains dicts
-                df = df.with_columns(
-
-
-
-
-                    pl.
-                    pl.
-
-
-
-
+                df = df.with_columns(
+                    [
+                        pl.Series(
+                            col,
+                            values,
+                            dtype=pl.List(
+                                pl.Struct(
+                                    [
+                                        pl.Field("adduct", pl.Utf8),
+                                        pl.Field("count", pl.Int64),
+                                        pl.Field("percentage", pl.Float64),
+                                        pl.Field("mass", pl.Float64),
+                                    ],
+                                ),
+                            ),
+                        ),
                         ),
-
-
+                    ],
+                )
             else:
                 # Other object columns stay as Object
                 df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
@@ -686,7 +799,13 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
     return df
 
 
-def _load_dataframe_from_group(
+def _load_dataframe_from_group(
+    group,
+    schema: dict,
+    df_name: str,
+    logger,
+    object_columns: list | None = None,
+) -> pl.DataFrame:
     """Load a DataFrame from HDF5 group using schema."""
     if object_columns is None:
         object_columns = []
@@ -700,7 +819,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     )
     schema_section = schema.get(df_name, {}) if isinstance(schema, dict) else {}
     logger.debug(f"Schema section for {df_name}: {schema_section}")
-    schema_columns =
+    schema_columns = (
+        schema_section.get("columns", []) if isinstance(schema_section, dict) else []
+    )
     logger.debug(f"Schema columns for {df_name}: {schema_columns}")
     if schema_columns is None:
         schema_columns = []
@@ -723,7 +844,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     effective_columns = hdf5_columns.copy()
     for old_name, new_name in column_migrations.items():
         if old_name in effective_columns:
-            logger.info(
+            logger.info(
+                f"Will migrate column '{old_name}' to '{new_name}' for backward compatibility",
+            )
             # Add the new name to effective columns and optionally remove old name
             effective_columns.append(new_name)
 
@@ -788,7 +911,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     for col, values in data.items():
         if values is not None and hasattr(values, "__len__"):
             expected_length = len(values)
-            logger.debug(
+            logger.debug(
+                f"Determined expected_length={expected_length} from loaded column '{col}'",
+            )
             break
 
     # If no data loaded yet, try HDF5 columns directly
@@ -798,7 +923,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
             col_data = group[col][:]
             if expected_length is None:
                 expected_length = len(col_data)
-                logger.debug(
+                logger.debug(
+                    f"Determined expected_length={expected_length} from HDF5 column '{col}'",
+                )
                 break
 
     # Default to 0 if no data found
@@ -812,26 +939,38 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
         # For missing columns, create appropriately sized array with appropriate defaults
         if col in object_columns:
             data[col] = [None] * expected_length
-            logger.debug(
+            logger.debug(
+                f"Created missing object column '{col}' with length {expected_length}",
+            )
         else:
             # Provide specific default values for new columns for backward compatibility
             if df_name == "samples_df":
                 if col == "sample_group":
                     data[col] = [""] * expected_length  # Empty string default
-                    logger.debug(
+                    logger.debug(
+                        f"Created missing column '{col}' with empty string defaults",
+                    )
                 elif col == "sample_batch":
                     data[col] = [1] * expected_length  # Batch 1 default
-                    logger.debug(
+                    logger.debug(
+                        f"Created missing column '{col}' with batch 1 defaults",
+                    )
                 elif col == "sample_sequence":
                     # Create increasing sequence numbers
                     data[col] = list(range(1, expected_length + 1))
-                    logger.debug(
+                    logger.debug(
+                        f"Created missing column '{col}' with sequence 1-{expected_length}",
+                    )
                 else:
                     data[col] = [None] * expected_length
-                    logger.debug(
+                    logger.debug(
+                        f"Created missing regular column '{col}' with length {expected_length}",
+                    )
             else:
                 data[col] = [None] * expected_length
-                logger.debug(
+                logger.debug(
+                    f"Created missing regular column '{col}' with length {expected_length}",
+                )
 
     # Check for columns in HDF5 file that are not in schema (for backward compatibility)
     # But skip the old column names we already migrated
@@ -845,7 +984,11 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     }
     migrated_old_names = set(column_migrations.keys())
 
-    extra_columns = [
+    extra_columns = [
+        col
+        for col in hdf5_columns
+        if col not in (schema_columns or []) and col not in migrated_old_names
+    ]
 
     for col in extra_columns:
         logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
@@ -864,7 +1007,10 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
                     object_columns.append(col)
                 else:
                     # Regular string data
-                    data[col] = [
+                    data[col] = [
+                        item.decode("utf-8") if isinstance(item, bytes) else item
+                        for item in column_data
+                    ]
             except Exception:
                 # If decoding fails, treat as regular data
                 data[col] = column_data
@@ -877,10 +1023,19 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     # Handle byte string conversion for non-object columns
    # Only convert to strings for columns that should actually be strings
     for col, values in data.items():
-        if
+        if (
+            col not in object_columns
+            and values is not None
+            and len(values) > 0
+            and isinstance(values[0], bytes)
+        ):
            # Check schema to see if this should be a string column
            should_be_string = False
-            if
+            if (
+                df_name in schema
+                and "columns" in schema[df_name]
+                and col in schema[df_name]["columns"]
+            ):
                dtype_str = schema[df_name]["columns"][col]["dtype"]
                should_be_string = dtype_str == "pl.Utf8"
 
@@ -898,7 +1053,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
         logger.debug(f"Creating DataFrame with object columns: {object_columns}")
         for col in object_columns:
             if col in data:
-                logger.debug(
+                logger.debug(
+                    f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}",
+                )
         df = _create_dataframe_with_objects(data, object_columns)
     else:
         df = pl.DataFrame(data)
@@ -944,19 +1101,34 @@ def _save_study5_compressed(self, filename)
         dataframes_to_save.append(("features", len(self.features_df)))
     if self.consensus_df is not None and not self.consensus_df.is_empty():
         dataframes_to_save.append(("consensus", len(self.consensus_df)))
-    if
-
+    if (
+        self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
+        dataframes_to_save.append(
+            ("consensus_mapping", len(self.consensus_mapping_df)),
+        )
     if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
         dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
-    if
+    if (
+        hasattr(self, "lib_df")
+        and self.lib_df is not None
+        and not self.lib_df.is_empty()
+    ):
         dataframes_to_save.append(("lib", len(self.lib_df)))
-    if
+    if (
+        hasattr(self, "id_df")
+        and self.id_df is not None
+        and not self.id_df.is_empty()
+    ):
         dataframes_to_save.append(("id", len(self.id_df)))
 
     total_steps = len(dataframes_to_save) + 1  # +1 for metadata
 
     # Show progress for large saves
-    tdqm_disable =
+    tdqm_disable = (
+        self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
+    )
 
     with tqdm(
         total=total_steps,
@@ -974,8 +1146,14 @@ def _save_study5_compressed(self, filename)
 
             # Store metadata
             metadata_group.attrs["format"] = "master-study-1"
-            metadata_group.attrs["folder"] =
-
+            metadata_group.attrs["folder"] = (
+                str(self.folder) if self.folder is not None else ""
+            )
+            metadata_group.attrs["label"] = (
+                str(self.label)
+                if hasattr(self, "label") and self.label is not None
+                else ""
+            )
 
             # Store parameters as JSON
             if hasattr(self, "parameters") and self.history is not None:
@@ -996,8 +1174,16 @@ def _save_study5_compressed(self, filename)
             # Store samples_df - use optimized batch processing
             if self.samples_df is not None and not self.samples_df.is_empty():
                 samples_group = f.create_group("samples")
-                self.logger.debug(
-
+                self.logger.debug(
+                    f"Saving samples_df with {len(self.samples_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.samples_df,
+                    samples_group,
+                    schema,
+                    "samples_df",
+                    self.logger,
+                )
             pbar.update(1)
 
             # Store features_df - use fast method that skips chrom and ms2_specs columns
@@ -1005,50 +1191,115 @@ def _save_study5_compressed(self, filename)
                 self.logger.debug(
                     f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)",
                 )
-                _save_dataframe_optimized_fast(
+                _save_dataframe_optimized_fast(
+                    self.features_df,
+                    features_group,
+                    schema,
+                    "features_df",
+                    self.logger,
+                )
             pbar.update(1)
 
             # Store consensus_df - use optimized batch processing
             if self.consensus_df is not None and not self.consensus_df.is_empty():
-                self.logger.debug(
-
+                self.logger.debug(
+                    f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.consensus_df,
+                    consensus_group,
+                    schema,
+                    "consensus_df",
+                    self.logger,
+                )
            pbar.update(1)
 
             # Store consensus_mapping_df - keep existing fast method
-            if
+            if (
+                self.consensus_mapping_df is not None
+                and not self.consensus_mapping_df.is_empty()
+            ):
                consensus_mapping = self.consensus_mapping_df.clone()
-                self.logger.debug(
+                self.logger.debug(
+                    f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
+                )
                for col in consensus_mapping.columns:
                    try:
                        data = consensus_mapping[col].to_numpy()
                        # Use LZF compression for consensus mapping data
-                        consensus_mapping_group.create_dataset(
+                        consensus_mapping_group.create_dataset(
+                            col,
+                            data=data,
+                            compression="lzf",
+                            shuffle=True,
+                        )
                    except Exception as e:
-                        self.logger.warning(
+                        self.logger.warning(
+                            f"Failed to save column '{col}' in consensus_mapping_df: {e}",
+                        )
            pbar.update(1)
 
            # Store consensus_ms2 - use optimized batch processing
            if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
-                self.logger.debug(
-
+                self.logger.debug(
+                    f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.consensus_ms2,
+                    consensus_ms2_group,
+                    schema,
+                    "consensus_ms2",
+                    self.logger,
+                )
            pbar.update(1)
 
            # Store lib_df - library data
-            if
-
-
+            if (
+                hasattr(self, "lib_df")
+                and self.lib_df is not None
+                and not self.lib_df.is_empty()
+            ):
+                self.logger.debug(
+                    f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.lib_df,
+                    lib_group,
+                    schema,
+                    "lib_df",
+                    self.logger,
+                )
            pbar.update(1)
 
            # Store id_df - identification results
-            if
-
-
+            if (
+                hasattr(self, "id_df")
+                and self.id_df is not None
+                and not self.id_df.is_empty()
+            ):
+                self.logger.debug(
+                    f"Saving id_df with {len(self.id_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.id_df,
+                    id_group,
+                    schema,
+                    "id_df",
+                    self.logger,
+                )
            pbar.update(1)
 
            self.logger.debug(f"Fast save completed for {filename}")
 
 
-def _save_dataframe_optimized_fast(
+def _save_dataframe_optimized_fast(
+    df,
+    group,
+    schema,
+    df_name,
+    logger,
+    chunk_size=10000,
+):
    """
    Save DataFrame with optimized batch processing, but skip chrom and ms2_specs columns for features_df.
 
@@ -1073,7 +1324,9 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
    # Skip chrom and ms2_specs columns for features_df
    if df_name == "features_df":
        skip_columns = ["chrom", "ms2_specs"]
-        df_ordered = df_ordered.select(
+        df_ordered = df_ordered.select(
+            [col for col in df_ordered.columns if col not in skip_columns],
+        )
        logger.debug(f"Fast save: skipping columns {skip_columns} for {df_name}")
 
    total_rows = len(df_ordered)
@@ -1108,7 +1361,13 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
 
        # Process object columns with optimized serialization
        if object_cols:
-            _save_object_columns_optimized(
+            _save_object_columns_optimized(
+                group,
+                df_ordered,
+                object_cols,
+                logger,
+                chunk_size,
+            )
 
    except Exception as e:
        logger.error(f"Failed to save DataFrame {df_name}: {e}")
@@ -1173,19 +1432,34 @@ def _save_study5(self, filename)
        dataframes_to_save.append(("features", len(self.features_df)))
    if self.consensus_df is not None and not self.consensus_df.is_empty():
        dataframes_to_save.append(("consensus", len(self.consensus_df)))
-    if
-
+    if (
+        self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
+        dataframes_to_save.append(
+            ("consensus_mapping", len(self.consensus_mapping_df)),
+        )
    if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
        dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
-    if
+    if (
+        hasattr(self, "lib_df")
+        and self.lib_df is not None
+        and not self.lib_df.is_empty()
+    ):
        dataframes_to_save.append(("lib", len(self.lib_df)))
-    if
+    if (
+        hasattr(self, "id_df")
+        and self.id_df is not None
+        and not self.id_df.is_empty()
+    ):
        dataframes_to_save.append(("id", len(self.id_df)))
 
    total_steps = len(dataframes_to_save) + 1  # +1 for metadata
 
    # Show progress for large saves
-    tdqm_disable =
+    tdqm_disable = (
+        self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
+    )
 
    with tqdm(
        total=total_steps,
@@ -1203,8 +1477,14 @@ def _save_study5(self, filename)
 
            # Store metadata
            metadata_group.attrs["format"] = "master-study-1"
-            metadata_group.attrs["folder"] =
-
+            metadata_group.attrs["folder"] = (
+                str(self.folder) if self.folder is not None else ""
+            )
+            metadata_group.attrs["label"] = (
+                str(self.label)
+                if hasattr(self, "label") and self.label is not None
+                else ""
+            )
 
            # Store parameters as JSON
            if hasattr(self, "parameters") and self.history is not None:
@@ -1225,51 +1505,119 @@ def _save_study5(self, filename)
            # Store samples_df - use optimized batch processing
            if self.samples_df is not None and not self.samples_df.is_empty():
                samples_group = f.create_group("samples")
-                self.logger.debug(
-
+                self.logger.debug(
+                    f"Saving samples_df with {len(self.samples_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.samples_df,
+                    samples_group,
+                    schema,
+                    "samples_df",
+                    self.logger,
+                )
            pbar.update(1)
 
            # Store features_df - use optimized batch processing
            if self.features_df is not None and not self.features_df.is_empty():
-                self.logger.debug(
-
+                self.logger.debug(
+                    f"Saving features_df with {len(self.features_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.features_df,
+                    features_group,
+                    schema,
+                    "features_df",
+                    self.logger,
+                )
            pbar.update(1)
 
            # Store consensus_df - use optimized batch processing
            if self.consensus_df is not None and not self.consensus_df.is_empty():
-                self.logger.debug(
-
+                self.logger.debug(
+                    f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.consensus_df,
+                    consensus_group,
+                    schema,
+                    "consensus_df",
+                    self.logger,
+                )
            pbar.update(1)
 
            # Store consensus_mapping_df - keep existing fast method
-            if
+            if (
+                self.consensus_mapping_df is not None
+                and not self.consensus_mapping_df.is_empty()
+            ):
                consensus_mapping = self.consensus_mapping_df.clone()
-                self.logger.debug(
+                self.logger.debug(
+                    f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
+                )
                for col in consensus_mapping.columns:
                    try:
                        data = consensus_mapping[col].to_numpy()
                        # Use LZF compression for consensus mapping data
-                        consensus_mapping_group.create_dataset(
+                        consensus_mapping_group.create_dataset(
+                            col,
+                            data=data,
+                            compression="lzf",
+                            shuffle=True,
+                        )
                    except Exception as e:
-                        self.logger.warning(
+                        self.logger.warning(
+                            f"Failed to save column '{col}' in consensus_mapping_df: {e}",
+                        )
            pbar.update(1)
 
            # Store consensus_ms2 - use optimized batch processing
            if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
-                self.logger.debug(
-
+                self.logger.debug(
+                    f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.consensus_ms2,
+                    consensus_ms2_group,
+                    schema,
+                    "consensus_ms2",
+                    self.logger,
+                )
            pbar.update(1)
 
            # Store lib_df - library data
-            if
-
-
+            if (
+                hasattr(self, "lib_df")
+                and self.lib_df is not None
+                and not self.lib_df.is_empty()
+            ):
+                self.logger.debug(
+                    f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.lib_df,
+                    lib_group,
+                    schema,
+                    "lib_df",
+                    self.logger,
+                )
            pbar.update(1)
 
            # Store id_df - identification results
-            if
-
-
+            if (
+                hasattr(self, "id_df")
+                and self.id_df is not None
+                and not self.id_df.is_empty()
+            ):
+                self.logger.debug(
+                    f"Saving id_df with {len(self.id_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.id_df,
+                    id_group,
+                    schema,
+                    "id_df",
+                    self.logger,
+                )
            pbar.update(1)
 
            self.logger.info(f"Study saved successfully to {filename}")
@@ -1319,7 +1667,9 @@ def _load_study5(self, filename=None)
    schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
    schema = _load_schema(schema_path)
    if not schema:
-        self.logger.warning(
+        self.logger.warning(
+            f"Schema file {schema_path} not found. Using default types.",
+        )
 
    # Define loading steps for progress tracking
    loading_steps = [
@@ -1393,27 +1743,48 @@ def _load_study5(self, filename=None)
            if self.history and "study" in self.history:
                study_params = self.history["study"]
                if isinstance(study_params, dict):
-                    failed_params = self.parameters.set_from_dict(
+                    failed_params = self.parameters.set_from_dict(
+                        study_params,
+                        validate=False,
+                    )
                    if failed_params:
-                        self.logger.debug(
+                        self.logger.debug(
+                            f"Could not set study parameters: {failed_params}",
+                        )
                    else:
-                        self.logger.debug(
+                        self.logger.debug(
+                            "Successfully updated parameters from loaded history",
+                        )
                else:
-                    self.logger.debug(
+                    self.logger.debug(
+                        "Study parameters in history are not a valid dictionary",
+                    )
            else:
-                self.logger.debug(
+                self.logger.debug(
+                    "No study parameters found in history, using defaults",
+                )
 
            # Synchronize instance attributes with parameters (similar to __init__)
            # Note: folder and label are already loaded from metadata attributes above
            # but we ensure they match the parameters for consistency
-            if
+            if (
+                hasattr(self.parameters, "folder")
+                and self.parameters.folder is not None
+            ):
                self.folder = self.parameters.folder
-            if
+            if (
+                hasattr(self.parameters, "label")
+                and self.parameters.label is not None
+            ):
                self.label = self.parameters.label
            if hasattr(self.parameters, "log_level"):
                self.log_level = self.parameters.log_level
            if hasattr(self.parameters, "log_label"):
-                self.log_label =
+                self.log_label = (
+                    self.parameters.log_label
+                    if self.parameters.log_label is not None
+                    else ""
+                )
            if hasattr(self.parameters, "log_sink"):
                self.log_sink = self.parameters.log_sink
            pbar.update(1)
@@ -1423,10 +1794,17 @@ def _load_study5(self, filename=None)
                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
            )
            if "samples" in f and len(f["samples"].keys()) > 0:
-                self.samples_df = _load_dataframe_from_group(
+                self.samples_df = _load_dataframe_from_group(
+                    f["samples"],
+                    schema,
+                    "samples_df",
+                    self.logger,
+                )
            else:
                # Initialize empty samples_df with the correct schema if no data exists
-                self.logger.debug(
+                self.logger.debug(
+                    "No samples data found in study5 file. Initializing empty samples_df.",
+                )
                self.samples_df = pl.DataFrame(
                    {
                        "sample_uid": [],
@@ -1463,10 +1841,17 @@ def _load_study5(self, filename=None)
                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
            )
            if "samples" in f and len(f["samples"].keys()) > 0:
-                self.samples_df = _load_dataframe_from_group(
+                self.samples_df = _load_dataframe_from_group(
+                    f["samples"],
+                    schema,
+                    "samples_df",
+                    self.logger,
+                )
            else:
                # Initialize empty samples_df with the correct schema if no data exists
-                self.logger.debug(
+                self.logger.debug(
+                    "No samples data found in study5 file. Initializing empty samples_df.",
+                )
                self.samples_df = pl.DataFrame(
                    {
                        "sample_uid": [],
@@ -1536,28 +1921,39 @@ def _load_study5(self, filename=None)
 
            # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
            if self.consensus_df is not None:
-                if
-
-
+                if (
+                    "adducts" not in self.consensus_df.columns
+                    or self.consensus_df["adducts"].dtype == pl.Null
+                ):
+                    self.logger.info(
+                        "Adding missing 'adducts' column for backward compatibility",
+                    )
+                    empty_adducts: list[list] = [
+                        [] for _ in range(len(self.consensus_df))
+                    ]
 
                    # If column exists but is Null, drop it first
                    if "adducts" in self.consensus_df.columns:
                        self.consensus_df = self.consensus_df.drop("adducts")
 
-                    self.consensus_df = self.consensus_df.with_columns(
-
-
-
-
-                        pl.
-                        pl.
-
-
-
-
+                    self.consensus_df = self.consensus_df.with_columns(
+                        [
+                            pl.Series(
+                                "adducts",
+                                empty_adducts,
+                                dtype=pl.List(
+                                    pl.Struct(
+                                        [
+                                            pl.Field("adduct", pl.Utf8),
+                                            pl.Field("count", pl.Int64),
+                                            pl.Field("percentage", pl.Float64),
+                                            pl.Field("mass", pl.Float64),
+                                        ],
+                                    ),
+                                ),
+                            ),
                            ),
-
-
+                        ],
+                    )
            else:
                self.consensus_df = None
            pbar.update(1)
@@ -1641,8 +2037,14 @@ def _load_study5(self, filename=None)
            pbar.update(1)
 
            # Check and migrate old string-based map_id to integer indices
-            if
-                self.
+            if (
+                self.samples_df is not None
+                and not self.samples_df.is_empty()
+                and self.samples_df["map_id"].dtype == pl.Utf8
+            ):
+                self.logger.info(
+                    "Detected old string-based map_id format, migrating to integer indices",
+                )
 
                # Convert string-based map_id to integer indices
                sample_count = len(self.samples_df)
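
For orientation, the adducts column that recurs throughout these hunks is stored as a Polars List(Struct) with the fields adduct (Utf8), count (Int64), percentage (Float64), and mass (Float64). The following is a minimal, self-contained sketch of constructing such a column; the row values and the consensus_id column are hypothetical and purely illustrative, not code from masster:

    import polars as pl

    # Struct layout matching the fields used in the diff above.
    adduct_struct = pl.Struct(
        [
            pl.Field("adduct", pl.Utf8),
            pl.Field("count", pl.Int64),
            pl.Field("percentage", pl.Float64),
            pl.Field("mass", pl.Float64),
        ]
    )

    # Hypothetical example rows: one consensus feature with two adduct entries, one with none.
    adducts = pl.Series(
        "adducts",
        [
            [
                {"adduct": "[M+H]+", "count": 12, "percentage": 80.0, "mass": 1.00728},
                {"adduct": "[M+Na]+", "count": 3, "percentage": 20.0, "mass": 22.98922},
            ],
            [],
        ],
        dtype=pl.List(adduct_struct),
    )

    df = pl.DataFrame({"consensus_id": [0, 1]}).with_columns(adducts)
    print(df.schema)

Empty lists are valid values for this dtype, which is consistent with the backward-compatibility path in the diff that initializes a missing adducts column with one empty list per consensus row.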