masster 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in those registries.

This version of masster has been flagged as potentially problematic.

Files changed (55)
  1. masster/__init__.py +27 -27
  2. masster/_version.py +17 -17
  3. masster/chromatogram.py +497 -503
  4. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
  5. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
  6. masster/logger.py +318 -244
  7. masster/sample/__init__.py +9 -9
  8. masster/sample/defaults/__init__.py +15 -15
  9. masster/sample/defaults/find_adducts_def.py +325 -325
  10. masster/sample/defaults/find_features_def.py +366 -366
  11. masster/sample/defaults/find_ms2_def.py +285 -285
  12. masster/sample/defaults/get_spectrum_def.py +314 -318
  13. masster/sample/defaults/sample_def.py +374 -378
  14. masster/sample/h5.py +1321 -1297
  15. masster/sample/helpers.py +833 -364
  16. masster/sample/lib.py +762 -0
  17. masster/sample/load.py +1220 -1187
  18. masster/sample/parameters.py +131 -131
  19. masster/sample/plot.py +1610 -1622
  20. masster/sample/processing.py +1402 -1416
  21. masster/sample/quant.py +209 -0
  22. masster/sample/sample.py +391 -387
  23. masster/sample/sample5_schema.json +181 -181
  24. masster/sample/save.py +737 -719
  25. masster/sample/sciex.py +1213 -0
  26. masster/spectrum.py +1287 -1319
  27. masster/study/__init__.py +9 -9
  28. masster/study/defaults/__init__.py +21 -19
  29. masster/study/defaults/align_def.py +267 -267
  30. masster/study/defaults/export_def.py +41 -40
  31. masster/study/defaults/fill_chrom_def.py +264 -264
  32. masster/study/defaults/fill_def.py +260 -0
  33. masster/study/defaults/find_consensus_def.py +256 -256
  34. masster/study/defaults/find_ms2_def.py +163 -163
  35. masster/study/defaults/integrate_chrom_def.py +225 -225
  36. masster/study/defaults/integrate_def.py +221 -0
  37. masster/study/defaults/merge_def.py +256 -0
  38. masster/study/defaults/study_def.py +272 -269
  39. masster/study/export.py +674 -287
  40. masster/study/h5.py +1398 -886
  41. masster/study/helpers.py +1650 -433
  42. masster/study/helpers_optimized.py +317 -0
  43. masster/study/load.py +1201 -1078
  44. masster/study/parameters.py +99 -99
  45. masster/study/plot.py +632 -645
  46. masster/study/processing.py +1057 -1046
  47. masster/study/save.py +149 -134
  48. masster/study/study.py +606 -522
  49. masster/study/study5_schema.json +247 -241
  50. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
  51. masster-0.3.0.dist-info/RECORD +59 -0
  52. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
  53. masster-0.2.4.dist-info/RECORD +0 -50
  54. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
  55. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
masster/study/h5.py CHANGED
@@ -1,886 +1,1398 @@
1
- """
2
- _study_h5.py
3
-
4
- This module provides HDF5-based save/load functionality for the Study class.
5
- It handles serialization and deserialization of Polars DataFrames with complex objects
6
- like Chromatogram and Spectrum instances.
7
-
8
- Key Features:
9
- - **HDF5 Storage**: Efficient compressed storage using HDF5 format
10
- - **Complex Object Serialization**: JSON-based serialization for Chromatogram and Spectrum objects
11
- - **Schema-based loading**: Uses study5_schema.json for proper type handling
12
- - **Error Handling**: Robust error handling and logging
13
-
14
- Dependencies:
15
- - `h5py`: For HDF5 file operations
16
- - `polars`: For DataFrame handling
17
- - `json`: For complex object serialization
18
- - `numpy`: For numerical array operations
19
-
20
- Functions:
21
- - `_save_study5()`: Save study to .study5 HDF5 file (new format)
22
- - `_load_study5()`: Load study from .study5 HDF5 file (new format)
23
- - `_save_h5()`: Save study to .h5 file (legacy format)
24
- - `_load_h5()`: Load study from .h5 file (legacy format)
25
- """
26
-
27
- import json
28
- import os
29
-
30
- import h5py
31
- import polars as pl
32
-
33
- from masster.chromatogram import Chromatogram
34
- from masster.spectrum import Spectrum
35
-
36
-
37
- # Helper functions for HDF5 operations
38
- def _load_schema(schema_path: str) -> dict:
39
- """Load schema from JSON file with error handling."""
40
- try:
41
- with open(schema_path) as f:
42
- return json.load(f) # type: ignore
43
- except FileNotFoundError:
44
- return {}
45
-
46
-
47
- def _decode_bytes_attr(attr_value):
48
- """Decode metadata attribute, handling both bytes and string types."""
49
- if isinstance(attr_value, bytes):
50
- return attr_value.decode("utf-8")
51
- return str(attr_value) if attr_value is not None else ""
52
-
53
-
54
- def _save_dataframe_column(group, col: str, data, dtype: str, logger, compression="gzip"):
55
- """
56
- Save a single DataFrame column to an HDF5 group with optimized compression.
57
-
58
- This optimized version uses context-aware compression strategies for better
59
- performance and smaller file sizes. Different compression algorithms are
60
- selected based on data type and column name patterns.
61
-
62
- Args:
63
- group: HDF5 group to save to
64
- col: Column name
65
- data: Column data
66
- dtype: Data type string
67
- logger: Logger instance
68
- compression: Default compression (used for compatibility, but overridden by optimization)
69
-
70
- Compression Strategy:
71
- - LZF + shuffle: Fast access data (consensus_uid, rt, mz, intensity, scan_id)
72
- - GZIP level 6: JSON objects (chromatograms, spectra) and string data
73
- - GZIP level 9: Bulk storage data (large collections)
74
- - LZF: Standard numeric arrays
75
- """
76
-
77
- # Optimized compression configuration
78
- COMPRESSION_CONFIG = {
79
- 'fast_access': {'compression': 'lzf', 'shuffle': True}, # Fast I/O for IDs, rt, mz
80
- 'numeric': {'compression': 'lzf'}, # Standard numeric data
81
- 'string': {'compression': 'gzip', 'compression_opts': 6}, # String data
82
- 'json': {'compression': 'gzip', 'compression_opts': 6}, # JSON objects
83
- 'bulk': {'compression': 'gzip', 'compression_opts': 9} # Large bulk data
84
- }
85
-
86
- def get_optimal_compression(column_name, data_type, data_size=None):
87
- """Get optimal compression settings based on column type and usage pattern."""
88
- # Fast access columns (frequently read IDs and coordinates)
89
- if column_name in ['consensus_uid', 'feature_uid', 'scan_id', 'rt', 'mz', 'intensity', 'rt_original', 'mz_original']:
90
- return COMPRESSION_CONFIG['fast_access']
91
-
92
- # JSON object columns (complex serialized data)
93
- elif column_name in ['spectrum', 'chromatogram', 'chromatograms', 'ms2_specs', 'chrom']:
94
- return COMPRESSION_CONFIG['json']
95
-
96
- # String/text columns
97
- elif data_type in ['string', 'object'] and column_name in ['sample_name', 'file_path', 'label', 'file_type']:
98
- return COMPRESSION_CONFIG['string']
99
-
100
- # Large bulk numeric data
101
- elif data_size and data_size > 100000:
102
- return COMPRESSION_CONFIG['bulk']
103
-
104
- # Standard numeric data
105
- else:
106
- return COMPRESSION_CONFIG['numeric']
107
-
108
- # Get data size for optimization decisions
109
- data_size = len(data) if hasattr(data, '__len__') else None
110
-
111
- # Get optimal compression settings
112
- optimal_compression = get_optimal_compression(col, dtype, data_size)
113
- if dtype == "object":
114
- if col == "chrom":
115
- # Handle Chromatogram objects
116
- data_as_str = []
117
- for item in data:
118
- if item is not None:
119
- data_as_str.append(item.to_json())
120
- else:
121
- data_as_str.append("None")
122
- group.create_dataset(col, data=data_as_str, compression=compression)
123
- elif col == "ms2_scans":
124
- # Handle MS2 scan lists
125
- data_as_json_strings = []
126
- for item in data:
127
- if item is not None:
128
- data_as_json_strings.append(json.dumps(list(item)))
129
- else:
130
- data_as_json_strings.append("None")
131
- group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
132
- elif col == "ms2_specs":
133
- # Handle MS2 spectrum lists
134
- data_as_lists_of_strings = []
135
- for item in data:
136
- if item is not None:
137
- json_strings = []
138
- for spectrum in item:
139
- if spectrum is not None:
140
- json_strings.append(spectrum.to_json())
141
- else:
142
- json_strings.append("None")
143
- data_as_lists_of_strings.append(json_strings)
144
- else:
145
- data_as_lists_of_strings.append(["None"])
146
- # Convert to serialized data
147
- serialized_data = [json.dumps(item) for item in data_as_lists_of_strings]
148
- group.create_dataset(col, data=serialized_data, **optimal_compression)
149
- elif col == "spec":
150
- # Handle single Spectrum objects
151
- data_as_str = []
152
- for item in data:
153
- if item is not None:
154
- data_as_str.append(item.to_json())
155
- else:
156
- data_as_str.append("None")
157
- group.create_dataset(col, data=data_as_str, compression=compression)
158
- else:
159
- logger.warning(f"Unexpectedly, column '{col}' has dtype 'object'. Implement serialization for this column.")
160
- elif dtype == "string":
161
- # Handle string columns
162
- string_data = ["None" if x is None else str(x) for x in data]
163
- group.create_dataset(col, data=string_data, **optimal_compression)
164
- else:
165
- # Handle numeric columns
166
- try:
167
- # Convert None values to -123 sentinel value for numeric columns
168
- import numpy as np
169
- data_array = np.array(data)
170
-
171
- # Check if it's a numeric dtype that might have None/null values
172
- if data_array.dtype == object:
173
- # Convert None values to -123 for numeric columns with mixed types
174
- processed_data = []
175
- for item in data:
176
- if item is None:
177
- processed_data.append(-123)
178
- else:
179
- try:
180
- # Try to convert to float to check if it's numeric
181
- processed_data.append(int(float(item)))
182
- except (ValueError, TypeError):
183
- # If conversion fails, keep original value (might be string)
184
- processed_data.append(item)
185
- data_array = np.array(processed_data)
186
-
187
- group.create_dataset(col, data=data_array, **optimal_compression)
188
- except Exception as e:
189
- logger.warning(f"Failed to save column '{col}': {e}")
190
-
191
-
192
- def _reconstruct_object_column(data_col, col_name: str):
193
- """Reconstruct object columns from serialized HDF5 data."""
194
- reconstructed_data: list = []
195
-
196
- for item in data_col:
197
- if isinstance(item, bytes):
198
- item = item.decode("utf-8")
199
-
200
- if item == "None" or item == "":
201
- reconstructed_data.append(None)
202
- continue
203
-
204
- try:
205
- if col_name == "chrom":
206
- reconstructed_data.append(Chromatogram.from_json(item))
207
- elif col_name == "ms2_scans":
208
- scan_list = json.loads(item)
209
- reconstructed_data.append(scan_list)
210
- elif col_name == "ms2_specs":
211
- json_list = json.loads(item)
212
- if json_list == ["None"]:
213
- reconstructed_data.append(None)
214
- else:
215
- spectrum_list: list = []
216
- for json_str in json_list:
217
- if json_str == "None":
218
- spectrum_list.append(None)
219
- else:
220
- spectrum_list.append(Spectrum.from_json(json_str))
221
- reconstructed_data.append(spectrum_list)
222
- elif col_name == "spec":
223
- reconstructed_data.append(Spectrum.from_json(item))
224
- else:
225
- # Unknown object column
226
- reconstructed_data.append(None)
227
- except (json.JSONDecodeError, ValueError):
228
- reconstructed_data.append(None)
229
-
230
- return reconstructed_data
231
-
232
-
233
- def _clean_string_nulls(df: pl.DataFrame) -> pl.DataFrame:
234
- """Convert string null representations to proper nulls."""
235
- for col in df.columns:
236
- if df[col].dtype == pl.Utf8:
237
- df = df.with_columns([
238
- pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
239
- .then(None)
240
- .otherwise(pl.col(col))
241
- .alias(col),
242
- ])
243
- return df
244
-
245
-
246
- def _apply_schema_casting(df: pl.DataFrame, schema: dict, df_name: str) -> pl.DataFrame:
247
- """Apply schema-based type casting to DataFrame columns."""
248
- if df_name not in schema or "columns" not in schema[df_name]:
249
- return df
250
-
251
- schema_columns = schema[df_name]["columns"]
252
- cast_exprs = []
253
-
254
- for col in df.columns:
255
- if col in schema_columns:
256
- dtype_str = schema_columns[col]["dtype"]
257
- # Convert string representation to actual Polars type
258
- if dtype_str == "pl.Object":
259
- cast_exprs.append(pl.col(col)) # Keep Object type as is
260
- elif dtype_str == "pl.Int64":
261
- cast_exprs.append(pl.col(col).cast(pl.Int64, strict=False))
262
- elif dtype_str == "pl.Float64":
263
- cast_exprs.append(pl.col(col).cast(pl.Float64, strict=False))
264
- elif dtype_str == "pl.Utf8":
265
- cast_exprs.append(pl.col(col).cast(pl.Utf8, strict=False))
266
- elif dtype_str == "pl.Int32":
267
- cast_exprs.append(pl.col(col).cast(pl.Int32, strict=False))
268
- elif dtype_str == "pl.Boolean":
269
- cast_exprs.append(pl.col(col).cast(pl.Boolean, strict=False))
270
- elif dtype_str == "pl.Null":
271
- cast_exprs.append(pl.col(col).cast(pl.Null, strict=False))
272
- else:
273
- cast_exprs.append(pl.col(col)) # Keep original type
274
- else:
275
- cast_exprs.append(pl.col(col)) # Keep original type
276
-
277
- if cast_exprs:
278
- df = df.with_columns(cast_exprs)
279
-
280
- return df
281
-
282
-
283
- def _reorder_columns_by_schema(df: pl.DataFrame, schema: dict, df_name: str) -> pl.DataFrame:
284
- """Reorder DataFrame columns to match schema order."""
285
- if df_name not in schema or "columns" not in schema[df_name]:
286
- return df
287
-
288
- schema_columns = list(schema[df_name]["columns"].keys())
289
- # Only reorder columns that exist in both schema and DataFrame
290
- existing_columns = [col for col in schema_columns if col in df.columns]
291
- # Add any extra columns not in schema at the end
292
- extra_columns = [col for col in df.columns if col not in schema_columns]
293
- final_column_order = existing_columns + extra_columns
294
-
295
- return df.select(final_column_order)
296
-
297
-
298
- def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataFrame:
299
- """Create DataFrame handling Object columns properly."""
300
- object_data = {k: v for k, v in data.items() if k in object_columns}
301
- regular_data = {k: v for k, v in data.items() if k not in object_columns}
302
-
303
- # Create DataFrame with regular columns first
304
- if regular_data:
305
- df = pl.DataFrame(regular_data)
306
- # Add Object columns one by one
307
- for col, values in object_data.items():
308
- df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
309
- else:
310
- # Only Object columns
311
- df = pl.DataFrame()
312
- for col, values in object_data.items():
313
- df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
314
-
315
- return df
316
-
317
-
318
- def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object_columns: list = None) -> pl.DataFrame:
319
- """Load a DataFrame from HDF5 group using schema."""
320
- if object_columns is None:
321
- object_columns = []
322
-
323
- data: dict = {}
324
- missing_columns = []
325
-
326
- # Iterate through schema columns in order to maintain column ordering
327
- logger.debug(f"Loading {df_name} - schema type: {type(schema)}, content: {schema.keys() if isinstance(schema, dict) else 'Not a dict'}")
328
- schema_section = schema.get(df_name, {}) if isinstance(schema, dict) else {}
329
- logger.debug(f"Schema section for {df_name}: {schema_section}")
330
- schema_columns = schema_section.get("columns", []) if isinstance(schema_section, dict) else []
331
- logger.debug(f"Schema columns for {df_name}: {schema_columns}")
332
- if schema_columns is None:
333
- schema_columns = []
334
-
335
- for col in (schema_columns or []):
336
- if col not in group:
337
- logger.warning(f"Column '{col}' not found in {df_name}.")
338
- data[col] = None
339
- missing_columns.append(col)
340
- continue
341
-
342
- dtype = schema[df_name]["columns"][col].get("dtype", "native")
343
- if dtype == "pl.Object" and col in object_columns:
344
- # Handle object columns specially
345
- data[col] = _reconstruct_object_column(group[col][:], col)
346
- else:
347
- # Regular columns
348
- column_data = group[col][:]
349
-
350
- # Convert -123 sentinel values back to None for numeric columns
351
- if len(column_data) > 0:
352
- # Check if it's a numeric column that might contain sentinel values
353
- try:
354
- import numpy as np
355
- data_array = np.array(column_data)
356
- if data_array.dtype in [np.float32, np.float64, np.int32, np.int64]:
357
- # Replace -123 sentinel values with None
358
- processed_data: list = []
359
- for item in column_data:
360
- if item == -123:
361
- processed_data.append(None)
362
- else:
363
- processed_data.append(item)
364
- data[col] = processed_data
365
- else:
366
- data[col] = column_data
367
- except Exception:
368
- # If any error occurs, use original data
369
- data[col] = column_data
370
- else:
371
- data[col] = column_data
372
-
373
- if not data:
374
- return None
375
-
376
- # Handle byte string conversion for non-object columns
377
- for col, values in data.items():
378
- if col not in object_columns and values is not None and len(values) > 0 and isinstance(values[0], bytes):
379
- processed_values = []
380
- for val in values:
381
- if isinstance(val, bytes):
382
- val = val.decode("utf-8")
383
- processed_values.append(val)
384
- data[col] = processed_values
385
-
386
- # Create DataFrame with Object columns handled properly
387
- if object_columns:
388
- df = _create_dataframe_with_objects(data, object_columns)
389
- else:
390
- df = pl.DataFrame(data)
391
-
392
- # Clean null values and apply schema
393
- df = _clean_string_nulls(df)
394
- df = _apply_schema_casting(df, schema, df_name)
395
- df = _reorder_columns_by_schema(df, schema, df_name)
396
-
397
- return df
398
-
399
-
400
- def _save_study5(self, filename=None):
401
- """
402
- Save the Study instance data to a .study5 HDF5 file with optimized schema-based format.
403
-
404
- This method saves all Study DataFrames (samples_df, features_df, consensus_df,
405
- consensus_mapping_df, consensus_ms2) using the schema defined in study5_schema.json
406
- for proper Polars DataFrame type handling.
407
-
408
- Args:
409
- filename (str, optional): Target file name. If None, uses default based on default_folder.
410
-
411
- Stores:
412
- - metadata/format (str): Data format identifier ("master-study-1")
413
- - metadata/default_folder (str): Study default folder path
414
- - metadata/label (str): Study label
415
- - metadata/parameters (str): JSON-serialized parameters dictionary
416
- - samples/: samples_df DataFrame data
417
- - features/: features_df DataFrame data with Chromatogram and Spectrum objects
418
- - consensus/: consensus_df DataFrame data
419
- - consensus_mapping/: consensus_mapping_df DataFrame data
420
- - consensus_ms2/: consensus_ms2 DataFrame data with Spectrum objects
421
-
422
- Notes:
423
- - Uses HDF5 format with compression for efficient storage.
424
- - Chromatogram objects are serialized as JSON for reconstruction.
425
- - MS2 scan lists and Spectrum objects are properly serialized.
426
- - Parameters dictionary (nested dicts) are JSON-serialized for storage.
427
- - Optimized for use with _load_study5() method.
428
- """
429
-
430
- # if no extension is given, add .study5
431
- if not filename.endswith(".study5"):
432
- filename += ".study5"
433
-
434
- self.logger.info(f"Saving study to {filename}")
435
-
436
- # delete existing file if it exists
437
- if os.path.exists(filename):
438
- os.remove(filename)
439
-
440
- # Load schema for column ordering
441
- schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
442
- schema = _load_schema(schema_path)
443
- if not schema:
444
- self.logger.warning(f"Could not load schema from {schema_path}")
445
-
446
- with h5py.File(filename, "w") as f:
447
- # Create groups for organization
448
- metadata_group = f.create_group("metadata")
449
- features_group = f.create_group("features")
450
- consensus_group = f.create_group("consensus")
451
- consensus_mapping_group = f.create_group("consensus_mapping")
452
- consensus_ms2_group = f.create_group("consensus_ms2")
453
-
454
- # Store metadata
455
- metadata_group.attrs["format"] = "master-study-1"
456
- metadata_group.attrs["default_folder"] = str(self.default_folder) if self.default_folder is not None else ""
457
- metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""
458
-
459
- # Store parameters as JSON
460
- if hasattr(self, "parameters") and self.history is not None:
461
- try:
462
- parameters_json = json.dumps(self.history, indent=2)
463
- metadata_group.create_dataset("parameters", data=parameters_json)
464
- except (TypeError, ValueError) as e:
465
- self.logger.warning(f"Failed to serialize history: {e}")
466
- metadata_group.create_dataset("parameters", data="")
467
- else:
468
- metadata_group.create_dataset("parameters", data="")
469
-
470
- # Store samples_df - only create group if there's data to store
471
- if self.samples_df is not None and not self.samples_df.is_empty():
472
- samples_group = f.create_group("samples")
473
- samples = _reorder_columns_by_schema(self.samples_df.clone(), schema, "samples_df")
474
- for col in samples.columns:
475
- dtype = str(samples[col].dtype).lower()
476
- data = samples[col].to_list()
477
- _save_dataframe_column(samples_group, col, data, dtype, self.logger)
478
-
479
- # Store features_df
480
- if self.features_df is not None:
481
- features = _reorder_columns_by_schema(self.features_df.clone(), schema, "features_df")
482
- for col in features.columns:
483
- dtype = str(features[col].dtype).lower()
484
- column_data: object = features[col] if dtype == "object" else features[col].to_list()
485
- _save_dataframe_column(features_group, col, column_data, dtype, self.logger)
486
-
487
- # Store consensus_df
488
- if self.consensus_df is not None:
489
- consensus = _reorder_columns_by_schema(self.consensus_df.clone(), schema, "consensus_df")
490
- for col in consensus.columns:
491
- dtype = str(consensus[col].dtype).lower()
492
- data = consensus[col].to_list()
493
- _save_dataframe_column(consensus_group, col, data, dtype, self.logger)
494
-
495
- # Store consensus_mapping_df
496
- if self.consensus_mapping_df is not None:
497
- consensus_mapping = self.consensus_mapping_df.clone()
498
- for col in consensus_mapping.columns:
499
- try:
500
- data = consensus_mapping[col].to_numpy()
501
- # Use LZF compression for consensus mapping data
502
- consensus_mapping_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
503
- except Exception as e:
504
- self.logger.warning(f"Failed to save column '{col}' in consensus_mapping_df: {e}")
505
-
506
- # Store consensus_ms2
507
- if self.consensus_ms2 is not None:
508
- consensus_ms2 = self.consensus_ms2.clone()
509
- for col in consensus_ms2.columns:
510
- dtype = str(consensus_ms2[col].dtype).lower()
511
- data = consensus_ms2[col] if dtype == "object" else consensus_ms2[col].to_list()
512
- _save_dataframe_column(consensus_ms2_group, col, data, dtype, self.logger)
513
-
514
- self.logger.debug(f"Save completed for {filename}")
515
-
516
-
517
- def _load_study5(self, filename=None):
518
- """
519
- Load Study instance data from a .study5 HDF5 file.
520
-
521
- Restores all Study DataFrames that were saved with _save_study5() method using the
522
- schema defined in study5_schema.json for proper Polars DataFrame reconstruction.
523
-
524
- Args:
525
- filename (str, optional): Path to the .study5 HDF5 file to load. If None, uses default.
526
-
527
- Returns:
528
- None (modifies self in place)
529
-
530
- Notes:
531
- - Restores DataFrames with proper schema typing from study5_schema.json
532
- - Handles Chromatogram and Spectrum object reconstruction
533
- - Properly handles MS2 scan lists and spectrum lists
534
- - Restores parameters dictionary from JSON serialization
535
- """
536
- from datetime import datetime
537
- from tqdm import tqdm
538
-
539
- self.logger.info(f"Loading study from {filename}")
540
-
541
- # Handle default filename
542
- if filename is None:
543
- if self.default_folder is not None:
544
- filename = os.path.join(self.default_folder, "study.study5")
545
- else:
546
- self.logger.error("Either filename or default_folder must be provided")
547
- return
548
-
549
- # Add .study5 extension if not provided
550
- if not filename.endswith(".study5"):
551
- filename += ".study5"
552
-
553
- if not os.path.exists(filename):
554
- self.logger.error(f"File {filename} does not exist")
555
- return
556
-
557
- # Load schema for proper DataFrame reconstruction
558
- schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
559
- schema = _load_schema(schema_path)
560
- if not schema:
561
- self.logger.warning(f"Schema file {schema_path} not found. Using default types.")
562
-
563
- # Define loading steps for progress tracking
564
- loading_steps = [
565
- "metadata",
566
- "samples_df",
567
- "features_df",
568
- "consensus_df",
569
- "consensus_mapping_df",
570
- "consensus_ms2"
571
- ]
572
-
573
- # Check if progress bar should be disabled based on log level
574
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
575
-
576
- with h5py.File(filename, "r") as f:
577
- # Use progress bar to show loading progress
578
- with tqdm(
579
- total=len(loading_steps),
580
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading study",
581
- disable=tdqm_disable,
582
- ) as pbar:
583
-
584
- # Load metadata
585
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
586
- if "metadata" in f:
587
- metadata = f["metadata"]
588
- self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
589
- if hasattr(self, "label"):
590
- self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
591
-
592
- # Load parameters from JSON
593
- if "parameters" in metadata:
594
- try:
595
- parameters_data = metadata["parameters"][()]
596
- if isinstance(parameters_data, bytes):
597
- parameters_data = parameters_data.decode("utf-8")
598
-
599
- if parameters_data and parameters_data != "":
600
- self.history = json.loads(parameters_data)
601
- else:
602
- self.history = {}
603
- except (json.JSONDecodeError, ValueError, TypeError) as e:
604
- self.logger.warning(f"Failed to deserialize parameters: {e}")
605
- self.history = {}
606
- else:
607
- self.history = {}
608
-
609
- # Reconstruct self.parameters from loaded history
610
- from masster.study.defaults.study_def import study_defaults
611
-
612
- # Always create a fresh study_defaults object to ensure we have all defaults
613
- self.parameters = study_defaults()
614
-
615
- # Update parameters from loaded history if available
616
- if self.history and "study" in self.history:
617
- study_params = self.history["study"]
618
- if isinstance(study_params, dict):
619
- failed_params = self.parameters.set_from_dict(study_params, validate=False)
620
- if failed_params:
621
- self.logger.debug(f"Could not set study parameters: {failed_params}")
622
- else:
623
- self.logger.debug("Successfully updated parameters from loaded history")
624
- else:
625
- self.logger.debug("Study parameters in history are not a valid dictionary")
626
- else:
627
- self.logger.debug("No study parameters found in history, using defaults")
628
-
629
- # Synchronize instance attributes with parameters (similar to __init__)
630
- # Note: default_folder and label are already loaded from metadata attributes above
631
- # but we ensure they match the parameters for consistency
632
- if hasattr(self.parameters, 'default_folder') and self.parameters.default_folder is not None:
633
- self.default_folder = self.parameters.default_folder
634
- if hasattr(self.parameters, 'label') and self.parameters.label is not None:
635
- self.label = self.parameters.label
636
- if hasattr(self.parameters, 'log_level'):
637
- self.log_level = self.parameters.log_level
638
- if hasattr(self.parameters, 'log_label'):
639
- self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
640
- if hasattr(self.parameters, 'log_sink'):
641
- self.log_sink = self.parameters.log_sink
642
- pbar.update(1)
643
-
644
- # Load samples_df
645
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
646
- if "samples" in f and len(f["samples"].keys()) > 0:
647
- self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
648
- else:
649
- # Initialize empty samples_df with the correct schema if no data exists
650
- self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
651
- self.samples_df = pl.DataFrame(
652
- {
653
- "sample_uid": [],
654
- "sample_name": [],
655
- "sample_path": [],
656
- "sample_type": [],
657
- "size": [],
658
- "map_id": [],
659
- },
660
- schema={
661
- "sample_uid": pl.Int64,
662
- "sample_name": pl.Utf8,
663
- "sample_path": pl.Utf8,
664
- "sample_type": pl.Utf8,
665
- "size": pl.Int64,
666
- "map_id": pl.Utf8,
667
- },
668
- )
669
- pbar.update(1)
670
-
671
- # Load features_df
672
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
673
- if "features" in f and len(f["features"].keys()) > 0:
674
- object_columns = ["chrom", "ms2_scans", "ms2_specs"]
675
- self.features_df = _load_dataframe_from_group(f["features"], schema, "features_df", self.logger, object_columns)
676
- else:
677
- self.features_df = None
678
- pbar.update(1)
679
-
680
- # Load consensus_df
681
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
682
- if "consensus" in f and len(f["consensus"].keys()) > 0:
683
- self.consensus_df = _load_dataframe_from_group(f["consensus"], schema, "consensus_df", self.logger)
684
- else:
685
- self.consensus_df = None
686
- pbar.update(1)
687
-
688
- # Load consensus_mapping_df
689
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
690
- if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
691
- self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
692
- else:
693
- self.consensus_mapping_df = None
694
- pbar.update(1)
695
-
696
- # Load consensus_ms2
697
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2")
698
- if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
699
- object_columns = ["spec"]
700
- self.consensus_ms2 = _load_dataframe_from_group(f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns)
701
- else:
702
- self.consensus_ms2 = None
703
- pbar.update(1)
704
-
705
- self.logger.info(f"Study loaded from {filename}")
706
-
707
-
708
- def _load_h5(self, filename=None):
709
- """
710
- Load Study instance data from a legacy .h5 HDF5 file with progress tracking.
711
-
712
- This is a legacy method for loading older HDF5 format files. For new files,
713
- use _load_study5() which has improved schema handling and performance.
714
-
715
- Args:
716
- filename (str, optional): Path to the .h5 HDF5 file to load. If None, uses default.
717
-
718
- Returns:
719
- None (modifies self in place)
720
-
721
- Notes:
722
- - Legacy format loader with basic DataFrame reconstruction
723
- - Includes progress bar for loading steps
724
- - For new projects, prefer _load_study5() method
725
- """
726
- from datetime import datetime
727
- from tqdm import tqdm
728
-
729
- # Handle default filename
730
- if filename is None:
731
- if self.default_folder is not None:
732
- filename = os.path.join(self.default_folder, "study.h5")
733
- else:
734
- self.logger.error("Either filename or default_folder must be provided")
735
- return
736
-
737
- # Add .h5 extension if not provided
738
- if not filename.endswith(".h5"):
739
- filename += ".h5"
740
-
741
- if not os.path.exists(filename):
742
- self.logger.error(f"File {filename} does not exist")
743
- return
744
-
745
- # Define loading steps for progress tracking
746
- loading_steps = [
747
- "metadata",
748
- "samples_df",
749
- "features_df",
750
- "consensus_df",
751
- "consensus_mapping_df"
752
- ]
753
-
754
- # Check if progress bar should be disabled based on log level
755
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
756
-
757
- with h5py.File(filename, "r") as f:
758
- # Use progress bar to show loading progress
759
- with tqdm(
760
- total=len(loading_steps),
761
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading legacy study",
762
- disable=tdqm_disable,
763
- ) as pbar:
764
-
765
- # Load metadata
766
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
767
- if "metadata" in f:
768
- metadata = f["metadata"]
769
- self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
770
- if hasattr(self, "label"):
771
- self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
772
-
773
- # Load parameters from JSON if available
774
- if "parameters" in metadata:
775
- try:
776
- parameters_data = metadata["parameters"][()]
777
- if isinstance(parameters_data, bytes):
778
- parameters_data = parameters_data.decode("utf-8")
779
-
780
- if parameters_data and parameters_data != "":
781
- self.history = json.loads(parameters_data)
782
- else:
783
- self.history = {}
784
- except (json.JSONDecodeError, ValueError, TypeError) as e:
785
- self.logger.warning(f"Failed to deserialize parameters: {e}")
786
- self.history = {}
787
- else:
788
- self.history = {}
789
- pbar.update(1)
790
-
791
- # Load samples_df (legacy format)
792
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
793
- if "samples" in f and len(f["samples"].keys()) > 0:
794
- samples_data = {}
795
- for col in f["samples"].keys():
796
- column_data = f["samples"][col][:]
797
- # Handle byte strings
798
- if len(column_data) > 0 and isinstance(column_data[0], bytes):
799
- column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
800
- samples_data[col] = column_data
801
-
802
- if samples_data:
803
- self.samples_df = pl.DataFrame(samples_data)
804
- else:
805
- # Initialize empty samples_df
806
- self.samples_df = pl.DataFrame({
807
- "sample_uid": [],
808
- "sample_name": [],
809
- "sample_path": [],
810
- "sample_type": [],
811
- "size": [],
812
- "map_id": [],
813
- })
814
- else:
815
- self.samples_df = pl.DataFrame({
816
- "sample_uid": [],
817
- "sample_name": [],
818
- "sample_path": [],
819
- "sample_type": [],
820
- "size": [],
821
- "map_id": [],
822
- })
823
- pbar.update(1)
824
-
825
- # Load features_df (legacy format)
826
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
827
- if "features" in f and len(f["features"].keys()) > 0:
828
- features_data = {}
829
- for col in f["features"].keys():
830
- column_data = f["features"][col][:]
831
- # Handle special object columns
832
- if col in ["chrom", "ms2_specs"]:
833
- reconstructed_data = _reconstruct_object_column(column_data, col)
834
- features_data[col] = reconstructed_data
835
- else:
836
- # Handle byte strings
837
- if len(column_data) > 0 and isinstance(column_data[0], bytes):
838
- column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
839
- features_data[col] = column_data
840
-
841
- if features_data:
842
- # Create DataFrame with Object columns handled properly
843
- object_columns = ["chrom", "ms2_specs"]
844
- self.features_df = _create_dataframe_with_objects(features_data, object_columns)
845
- else:
846
- self.features_df = None
847
- else:
848
- self.features_df = None
849
- pbar.update(1)
850
-
851
- # Load consensus_df (legacy format)
852
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
853
- if "consensus" in f and len(f["consensus"].keys()) > 0:
854
- consensus_data = {}
855
- for col in f["consensus"].keys():
856
- column_data = f["consensus"][col][:]
857
- # Handle byte strings
858
- if len(column_data) > 0 and isinstance(column_data[0], bytes):
859
- column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
860
- consensus_data[col] = column_data
861
-
862
- if consensus_data:
863
- self.consensus_df = pl.DataFrame(consensus_data)
864
- else:
865
- self.consensus_df = None
866
- else:
867
- self.consensus_df = None
868
- pbar.update(1)
869
-
870
- # Load consensus_mapping_df (legacy format)
871
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
872
- if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
873
- mapping_data = {}
874
- for col in f["consensus_mapping"].keys():
875
- column_data = f["consensus_mapping"][col][:]
876
- mapping_data[col] = column_data
877
-
878
- if mapping_data:
879
- self.consensus_mapping_df = pl.DataFrame(mapping_data)
880
- else:
881
- self.consensus_mapping_df = None
882
- else:
883
- self.consensus_mapping_df = None
884
- pbar.update(1)
885
-
886
- self.logger.info(f"Legacy study loaded from {filename}")
1
+ """
2
+ _study_h5.py
3
+
4
+ This module provides HDF5-based save/load functionality for the Study class.
5
+ It handles serialization and deserialization of Polars DataFrames with complex objects
6
+ It handles serialization and deserialization of Polars DataFrames with complex objects
7
+ It handles serialization and deserialization of Polars DataFrames with complex objects
8
+ like Chromatogram and Spectrum instances.
9
+
10
+ Key Features:
11
+ - **HDF5 Storage**: Efficient compressed storage using HDF5 format
12
+ - **Complex Object Serialization**: JSON-based serialization for Chromatogram and Spectrum objects
13
+ - **Schema-based loading**: Uses study5_schema.json for proper type handling
14
+ - **Error Handling**: Robust error handling and logging
15
+
16
+ Dependencies:
17
+ - `h5py`: For HDF5 file operations
18
+ - `polars`: For DataFrame handling
19
+ - `json`: For complex object serialization
20
+ - `numpy`: For numerical array operations
21
+
22
+ Functions:
23
+ - `_save_study5()`: Save study to .study5 HDF5 file (new format)
24
+ - `_load_study5()`: Load study from .study5 HDF5 file (new format)
25
+ - `_save_h5()`: Save study to .h5 file (legacy format)
26
+ - `_load_h5()`: Load study from .h5 file (legacy format)
27
+ """
28
+
29
+ import json
30
+ import os
31
+ from concurrent.futures import ThreadPoolExecutor, as_completed
32
+ from datetime import datetime
33
+
34
+ import h5py
35
+ import polars as pl
36
+ from tqdm import tqdm
37
+
38
+ from masster.chromatogram import Chromatogram
39
+ from masster.spectrum import Spectrum
40
+
41
+
42
+ # Helper functions for HDF5 operations
43
+ def _load_schema(schema_path: str) -> dict:
44
+ """Load schema from JSON file with error handling."""
45
+ try:
46
+ with open(schema_path) as f:
47
+ return json.load(f) # type: ignore
48
+ except FileNotFoundError:
49
+ return {}
50
+
51
+
52
+ def _decode_bytes_attr(attr_value):
53
+ """Decode metadata attribute, handling both bytes and string types."""
54
+ if isinstance(attr_value, bytes):
55
+ return attr_value.decode("utf-8")
56
+ return str(attr_value) if attr_value is not None else ""
57
+
58
+
59
+ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=10000):
60
+ """
61
+ Save an entire DataFrame to HDF5 with optimized batch processing and memory efficiency.
62
+
63
+ This function replaces individual column processing with batch operations for much
64
+ better performance on large datasets (300+ samples).
65
+
66
+ Args:
67
+ df: Polars DataFrame to save
68
+ group: HDF5 group to save to
69
+ schema: Schema for column ordering
70
+ df_name: Name of the DataFrame for schema lookup
71
+ logger: Logger instance
72
+ chunk_size: Number of rows to process at once for memory efficiency
73
+ """
74
+ if df is None or df.is_empty():
75
+ return
76
+
77
+ try:
78
+ # Reorder columns according to schema
79
+ df_ordered = _reorder_columns_by_schema(df.clone(), schema, df_name)
80
+ total_rows = len(df_ordered)
81
+
82
+ # Group columns by processing type for batch optimization
83
+ numeric_cols = []
84
+ string_cols = []
85
+ object_cols = []
86
+
87
+ for col in df_ordered.columns:
88
+ dtype = str(df_ordered[col].dtype).lower()
89
+ if dtype == "object":
90
+ object_cols.append(col)
91
+ elif dtype in ["string", "utf8"]:
92
+ string_cols.append(col)
93
+ else:
94
+ numeric_cols.append(col)
95
+
96
+ logger.debug(f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns")
97
+
98
+ # Process numeric columns in batch (most efficient)
99
+ if numeric_cols:
100
+ for col in numeric_cols:
101
+ _save_numeric_column_fast(group, col, df_ordered[col], logger)
102
+
103
+ # Process string columns in batch
104
+ if string_cols:
105
+ for col in string_cols:
106
+ _save_string_column_fast(group, col, df_ordered[col], logger)
107
+
108
+ # Process object columns with optimized serialization
109
+ if object_cols:
110
+ _save_object_columns_optimized(group, df_ordered, object_cols, logger, chunk_size)
111
+
112
+ except Exception as e:
113
+ logger.error(f"Failed to save DataFrame {df_name}: {e}")
114
+ # Fallback to old method for safety
115
+ _save_dataframe_column_legacy(df, group, schema, df_name, logger)
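
The dtype split above is easy to see on a toy frame; this sketch (with a made-up three-column DataFrame) reproduces the numeric/string/object grouping:

```python
import polars as pl

df = pl.DataFrame({
    "feature_uid": [1, 2, 3],
    "sample_name": ["a", "b", "c"],
    "rt": [10.2, 11.5, 12.1],
})

numeric_cols, string_cols, object_cols = [], [], []
for col in df.columns:
    dtype = str(df[col].dtype).lower()
    if dtype == "object":
        object_cols.append(col)
    elif dtype in ("string", "utf8"):
        string_cols.append(col)
    else:
        numeric_cols.append(col)

print(numeric_cols, string_cols, object_cols)
# ['feature_uid', 'rt'] ['sample_name'] []
```
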
116
+
117
+
118
+ def _save_numeric_column_fast(group, col, data_series, logger):
119
+ """Fast numeric column saving with optimal compression."""
120
+ try:
121
+ import numpy as np
122
+
123
+ # Get compression settings based on column name
124
+ if col in ["consensus_uid", "feature_uid", "scan_id", "rt", "mz", "intensity"]:
125
+ compression_kwargs = {"compression": "lzf", "shuffle": True}
126
+ else:
127
+ compression_kwargs = {"compression": "lzf"}
128
+
129
+ # Convert to numpy array efficiently
130
+ try:
131
+ data_array = data_series.to_numpy()
132
+ except Exception:
133
+ # Fallback for complex data types
134
+ data_array = np.array(data_series.to_list())
135
+
136
+ # Handle None/null values efficiently
137
+ if data_array.dtype == object:
138
+ # Check if this is actually a list/array column that should be treated as object
139
+ sample_value = None
140
+ for val in data_array:
141
+ if val is not None:
142
+ sample_value = val
143
+ break
144
+
145
+ # If sample value is a list/array, treat as object column
146
+ if isinstance(sample_value, (list, tuple, np.ndarray)):
147
+ logger.debug(f"Column '{col}' contains array-like data, treating as object")
148
+ _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "object", logger)
149
+ return
150
+
151
+ # Otherwise, convert None values to -123 sentinel for mixed-type numeric columns
152
+ try:
153
+ data_array = np.array([(-123 if x is None else float(x)) for x in data_array])
154
+ except (ValueError, TypeError):
155
+ # If conversion fails, this is not a numeric column
156
+ logger.debug(f"Column '{col}' is not numeric, treating as object")
157
+ _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "object", logger)
158
+ return
159
+
160
+ group.create_dataset(col, data=data_array, **compression_kwargs)
161
+
162
+ except Exception as e:
163
+ logger.warning(f"Failed to save numeric column '{col}' efficiently: {e}")
164
+ # Fallback to old method
165
+ _save_dataframe_column_legacy_single(group, col, data_series.to_list(), str(data_series.dtype), logger)
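
Nullable numeric columns are written with a -123 sentinel in place of None and converted back on load. A small round-trip sketch (file and column names are hypothetical):

```python
import h5py
import numpy as np

values = [4, None, 7]
encoded = np.array([-123 if v is None else float(v) for v in values])

with h5py.File("sentinel_demo.h5", "w") as f:
    f.create_dataset("rt", data=encoded, compression="lzf", shuffle=True)

with h5py.File("sentinel_demo.h5", "r") as f:
    decoded = [None if v == -123 else v for v in f["rt"][:]]

print(decoded)  # [4.0, None, 7.0]
```
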
166
+
167
+
168
+ def _save_string_column_fast(group, col, data_series, logger):
169
+ """Fast string column saving with optimal compression."""
170
+ try:
171
+ # Convert to string array efficiently
172
+ string_data = ["None" if x is None else str(x) for x in data_series.to_list()]
173
+
174
+ compression_kwargs = {"compression": "gzip", "compression_opts": 6}
175
+ group.create_dataset(col, data=string_data, **compression_kwargs)
176
+
177
+ except Exception as e:
178
+ logger.warning(f"Failed to save string column '{col}' efficiently: {e}")
179
+ # Fallback to old method
180
+ _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "string", logger)
181
+
182
+
183
+ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
184
+ """Optimized object column processing with chunking and parallel serialization."""
185
+ import json
186
+
187
+ def serialize_chunk(col_name, chunk_data):
188
+ """Serialize a chunk of object data."""
189
+ serialized_chunk = []
190
+
191
+ if col_name == "chrom":
192
+ # Handle Chromatogram objects
193
+ for item in chunk_data:
194
+ if item is not None:
195
+ serialized_chunk.append(item.to_json())
196
+ else:
197
+ serialized_chunk.append("None")
198
+ elif col_name == "ms2_scans":
199
+ # Handle MS2 scan lists
200
+ for item in chunk_data:
201
+ if item is not None:
202
+ serialized_chunk.append(json.dumps(list(item)))
203
+ else:
204
+ serialized_chunk.append("None")
205
+ elif col_name == "ms2_specs":
206
+ # Handle MS2 spectrum lists
207
+ for item in chunk_data:
208
+ if item is not None:
209
+ json_strings = []
210
+ for spectrum in item:
211
+ if spectrum is not None:
212
+ json_strings.append(spectrum.to_json())
213
+ else:
214
+ json_strings.append("None")
215
+ serialized_chunk.append(json.dumps(json_strings))
216
+ else:
217
+ serialized_chunk.append(json.dumps(["None"]))
218
+ elif col_name in ["adducts", "adduct_values"]:
219
+ # Handle lists
220
+ for item in chunk_data:
221
+ if item is not None:
222
+ serialized_chunk.append(json.dumps(item))
223
+ else:
224
+ serialized_chunk.append("[]")
225
+ elif col_name == "spec":
226
+ # Handle single Spectrum objects
227
+ for item in chunk_data:
228
+ if item is not None:
229
+ serialized_chunk.append(item.to_json())
230
+ else:
231
+ serialized_chunk.append("None")
232
+ else:
233
+ logger.warning(f"Unknown object column '{col_name}', using default serialization")
234
+ for item in chunk_data:
235
+ serialized_chunk.append(str(item) if item is not None else "None")
236
+
237
+ return serialized_chunk
238
+
239
+ # Process each object column
240
+ for col in object_cols:
241
+ try:
242
+ data_list = df[col].to_list()
243
+ total_items = len(data_list)
244
+
245
+ if total_items == 0:
246
+ group.create_dataset(col, data=[], compression="gzip", compression_opts=6)
247
+ continue
248
+
249
+ # For small datasets, process directly
250
+ if total_items <= chunk_size:
251
+ serialized_data = serialize_chunk(col, data_list)
252
+ group.create_dataset(col, data=serialized_data, compression="gzip", compression_opts=6)
253
+ else:
254
+ # For large datasets, use chunked processing with parallel serialization
255
+ logger.debug(f"Processing large object column '{col}' with {total_items} items in chunks")
256
+
257
+ all_serialized = []
258
+ num_chunks = (total_items + chunk_size - 1) // chunk_size
259
+
260
+ # Use thread pool for parallel serialization of chunks
261
+ with ThreadPoolExecutor(max_workers=min(4, num_chunks)) as executor:
262
+ futures = {}
263
+
264
+ for i in range(0, total_items, chunk_size):
265
+ chunk = data_list[i:i + chunk_size]
266
+ future = executor.submit(serialize_chunk, col, chunk)
267
+ futures[future] = i
268
+
269
+ # Collect results in order
270
+ results = {}
271
+ for future in as_completed(futures):
272
+ chunk_start = futures[future]
273
+ try:
274
+ chunk_result = future.result()
275
+ results[chunk_start] = chunk_result
276
+ except Exception as e:
277
+ logger.warning(f"Failed to serialize chunk starting at {chunk_start} for column '{col}': {e}")
278
+ # Fallback to simple string conversion for this chunk
279
+ chunk = data_list[chunk_start:chunk_start + chunk_size]
280
+ results[chunk_start] = [str(item) if item is not None else "None" for item in chunk]
281
+
282
+ # Reassemble in correct order
283
+ for i in range(0, total_items, chunk_size):
284
+ if i in results:
285
+ all_serialized.extend(results[i])
286
+
287
+ group.create_dataset(col, data=all_serialized, compression="gzip", compression_opts=6)
288
+
289
+ except Exception as e:
290
+ logger.warning(f"Failed to save object column '{col}' with optimization: {e}")
291
+ # Fallback to old method
292
+ _save_dataframe_column_legacy_single(group, col, df[col].to_list(), "object", logger)
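
The chunked, order-preserving serialization above is the key idea for large object columns. A self-contained sketch of the same pattern, with json.dumps standing in for the real to_json() calls and fabricated data:

```python
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

data = [{"mz": i, "intensity": i * 10} for i in range(25)]  # made-up payloads
chunk_size = 10

def serialize_chunk(chunk):
    return [json.dumps(item) for item in chunk]

results = {}
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(serialize_chunk, data[i:i + chunk_size]): i
               for i in range(0, len(data), chunk_size)}
    for future in as_completed(futures):
        results[futures[future]] = future.result()  # keyed by chunk start index

# Reassemble the chunks in their original order
serialized = []
for i in range(0, len(data), chunk_size):
    serialized.extend(results[i])

assert len(serialized) == len(data)
```
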
293
+
294
+
295
+ def _save_dataframe_column_legacy_single(group, col: str, data, dtype: str, logger, compression="gzip"):
296
+ """Legacy single column save method for fallback."""
297
+ # This is the original _save_dataframe_column method for compatibility
298
+ return _save_dataframe_column_legacy(group, col, data, dtype, logger, compression)
299
+
300
+
301
+ def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, compression="gzip"):
302
+ """
303
+ Save a single DataFrame column to an HDF5 group with optimized compression.
304
+
305
+ This optimized version uses context-aware compression strategies for better
306
+ performance and smaller file sizes. Different compression algorithms are
307
+ selected based on data type and column name patterns.
308
+
309
+ Args:
310
+ group: HDF5 group to save to
311
+ col: Column name
312
+ data: Column data
313
+ dtype: Data type string
314
+ logger: Logger instance
315
+ compression: Default compression (used for compatibility, but overridden by optimization)
316
+
317
+ Compression Strategy:
318
+ - LZF + shuffle: Fast access data (consensus_uid, rt, mz, intensity, scan_id)
319
+ - GZIP level 6: JSON objects (chromatograms, spectra) and string data
320
+ - GZIP level 9: Bulk storage data (large collections)
321
+ - LZF: Standard numeric arrays
322
+ """
323
+
324
+ # Optimized compression configuration
325
+ COMPRESSION_CONFIG = {
326
+ "fast_access": {"compression": "lzf", "shuffle": True}, # Fast I/O for IDs, rt, mz
327
+ "numeric": {"compression": "lzf"}, # Standard numeric data
328
+ "string": {"compression": "gzip", "compression_opts": 6}, # String data
329
+ "json": {"compression": "gzip", "compression_opts": 6}, # JSON objects
330
+ "bulk": {"compression": "gzip", "compression_opts": 9}, # Large bulk data
331
+ }
332
+
333
+ def get_optimal_compression(column_name, data_type, data_size=None):
334
+ """Get optimal compression settings based on column type and usage pattern."""
335
+ # Fast access columns (frequently read IDs and coordinates)
336
+ if column_name in [
337
+ "consensus_uid",
338
+ "feature_uid",
339
+ "scan_id",
340
+ "rt",
341
+ "mz",
342
+ "intensity",
343
+ "rt_original",
344
+ "mz_original",
345
+ ]:
346
+ return COMPRESSION_CONFIG["fast_access"]
347
+
348
+ # JSON object columns (complex serialized data)
349
+ elif column_name in ["spectrum", "chromatogram", "chromatograms", "ms2_specs", "chrom"]:
350
+ return COMPRESSION_CONFIG["json"]
351
+
352
+ # String/text columns
353
+ elif data_type in ["string", "object"] and column_name in ["sample_name", "file_path", "label", "file_type"]:
354
+ return COMPRESSION_CONFIG["string"]
355
+
356
+ # Large bulk numeric data
357
+ elif data_size and data_size > 100000:
358
+ return COMPRESSION_CONFIG["bulk"]
359
+
360
+ # Standard numeric data
361
+ else:
362
+ return COMPRESSION_CONFIG["numeric"]
363
+
364
+ # Get data size for optimization decisions
365
+ data_size = len(data) if hasattr(data, "__len__") else None
366
+
367
+ # Get optimal compression settings
368
+ optimal_compression = get_optimal_compression(col, dtype, data_size)
369
+ if dtype == "object" or dtype.startswith("list"):
370
+ if col == "chrom":
371
+ # Handle Chromatogram objects
372
+ data_as_str = []
373
+ for item in data:
374
+ if item is not None:
375
+ data_as_str.append(item.to_json())
376
+ else:
377
+ data_as_str.append("None")
378
+ group.create_dataset(col, data=data_as_str, compression=compression)
379
+ elif col == "ms2_scans":
380
+ # Handle MS2 scan lists
381
+ data_as_json_strings = []
382
+ for item in data:
383
+ if item is not None:
384
+ data_as_json_strings.append(json.dumps(list(item)))
385
+ else:
386
+ data_as_json_strings.append("None")
387
+ group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
388
+ elif col == "ms2_specs":
389
+ # Handle MS2 spectrum lists
390
+ data_as_lists_of_strings = []
391
+ for item in data:
392
+ if item is not None:
393
+ json_strings = []
394
+ for spectrum in item:
395
+ if spectrum is not None:
396
+ json_strings.append(spectrum.to_json())
397
+ else:
398
+ json_strings.append("None")
399
+ data_as_lists_of_strings.append(json_strings)
400
+ else:
401
+ data_as_lists_of_strings.append(["None"])
402
+ # Convert to serialized data
403
+ serialized_data = [json.dumps(item) for item in data_as_lists_of_strings]
404
+ group.create_dataset(col, data=serialized_data, **optimal_compression)
405
+ elif col == "adducts":
406
+ # Handle adducts lists (List(String))
407
+ data_as_json_strings = []
408
+ for item in data:
409
+ if item is not None:
410
+ data_as_json_strings.append(json.dumps(item))
411
+ else:
412
+ data_as_json_strings.append("[]")
413
+ group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
414
+ elif col == "adduct_values":
415
+ # Handle adduct_values lists (List(Struct))
416
+ data_as_json_strings = []
417
+ for item in data:
418
+ if item is not None:
419
+ data_as_json_strings.append(json.dumps(item))
420
+ else:
421
+ data_as_json_strings.append("[]")
422
+ group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
423
+ elif col == "spec":
424
+ # Handle single Spectrum objects
425
+ data_as_str = []
426
+ for item in data:
427
+ if item is not None:
428
+ data_as_str.append(item.to_json())
429
+ else:
430
+ data_as_str.append("None")
431
+ group.create_dataset(col, data=data_as_str, compression=compression)
432
+ else:
433
+ logger.warning(f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column.")
434
+ elif dtype == "string":
435
+ # Handle string columns
436
+ string_data = ["None" if x is None else str(x) for x in data]
437
+ group.create_dataset(col, data=string_data, **optimal_compression)
438
+ else:
439
+ # Handle numeric columns
440
+ try:
441
+ # Convert None values to -123 sentinel value for numeric columns
442
+ import numpy as np
443
+
444
+ data_array = np.array(data)
445
+
446
+ # Check if it's a numeric dtype that might have None/null values
447
+ if data_array.dtype == object:
448
+ # Convert None values to -123 for numeric columns with mixed types
449
+ processed_data = []
450
+ for item in data:
451
+ if item is None:
452
+ processed_data.append(-123)
453
+ else:
454
+ try:
455
+ # Try to convert to float to check if it's numeric
456
+ processed_data.append(int(float(item)))
457
+ except (ValueError, TypeError):
458
+ # If conversion fails, keep original value (might be string)
459
+ processed_data.append(item)
460
+ data_array = np.array(processed_data)
461
+
462
+ group.create_dataset(col, data=data_array, **optimal_compression)
463
+ except Exception as e:
464
+ logger.warning(f"Failed to save column '{col}': {e}")
465
+
466
+
467
+ # Keep the original function as _save_dataframe_column for backward compatibility
468
+ _save_dataframe_column = _save_dataframe_column_legacy
469
+
470
+
471
+ def _reconstruct_object_column(data_col, col_name: str):
472
+ """Reconstruct object columns from serialized HDF5 data."""
473
+ reconstructed_data: list = []
474
+
475
+ for item in data_col:
476
+ if isinstance(item, bytes):
477
+ item = item.decode("utf-8")
478
+
479
+ # Handle non-string data (e.g., float32 NaN from corrupted compression)
480
+ if not isinstance(item, str):
481
+ import numpy as np
482
+ if isinstance(item, (float, np.floating)) and np.isnan(item):
483
+ reconstructed_data.append(None)
484
+ continue
485
+ else:
486
+ reconstructed_data.append(None)
487
+ continue
488
+
489
+ if item == "None" or item == "":
490
+ reconstructed_data.append(None)
491
+ continue
492
+
493
+ try:
494
+ if col_name == "chrom":
495
+ reconstructed_data.append(Chromatogram.from_json(item))
496
+ elif col_name == "ms2_scans":
497
+ scan_list = json.loads(item)
498
+ reconstructed_data.append(scan_list)
499
+ elif col_name == "ms2_specs":
500
+ json_list = json.loads(item)
501
+ if json_list == ["None"]:
502
+ reconstructed_data.append(None)
503
+ else:
504
+ spectrum_list: list = []
505
+ for json_str in json_list:
506
+ if json_str == "None":
507
+ spectrum_list.append(None)
508
+ else:
509
+ spectrum_list.append(Spectrum.from_json(json_str))
510
+ reconstructed_data.append(spectrum_list)
511
+ elif col_name == "spec":
512
+ reconstructed_data.append(Spectrum.from_json(item))
513
+ elif col_name == "adducts":
514
+ # Handle adducts lists (List(Struct)) - now contains dicts instead of strings
515
+ adducts_list = json.loads(item)
516
+ reconstructed_data.append(adducts_list)
517
+ else:
518
+ # Unknown object column
519
+ reconstructed_data.append(None)
520
+ except (json.JSONDecodeError, ValueError):
521
+ reconstructed_data.append(None)
522
+
523
+ return reconstructed_data
524
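# The reverse of the pattern above, as _reconstruct_object_column applies it to
# an "ms2_specs" cell: parse the outer JSON list and map the literal string
# "None" back to Python None. In the real code Spectrum.from_json() is applied
# to each remaining JSON string; plain json.loads stands in here.
import json

stored_value = '["{\\"mz\\": [100.0, 200.0]}", "None"]'
json_list = json.loads(stored_value)
cell = [None if s == "None" else json.loads(s) for s in json_list]
# cell == [{'mz': [100.0, 200.0]}, None]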
+
525
+
526
+ def _clean_string_nulls(df: pl.DataFrame) -> pl.DataFrame:
527
+ """Convert string null representations to proper nulls."""
528
+ for col in df.columns:
529
+ if df[col].dtype == pl.Utf8:
530
+ df = df.with_columns([
531
+ pl.when(pl.col(col).is_in(["None", "", "null", "NULL"])).then(None).otherwise(pl.col(col)).alias(col),
532
+ ])
533
+ return df
534
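# What _clean_string_nulls does, shown on a toy frame: string placeholders such
# as "None" or "" become proper Polars nulls. The column name is illustrative.
import polars as pl

df = pl.DataFrame({"sample_name": ["blank_01", "None", ""]})
df = df.with_columns(
    pl.when(pl.col("sample_name").is_in(["None", "", "null", "NULL"]))
    .then(None)
    .otherwise(pl.col("sample_name"))
    .alias("sample_name"),
)
# df["sample_name"].null_count() == 2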
+
535
+
536
+ def _apply_schema_casting(df: pl.DataFrame, schema: dict, df_name: str) -> pl.DataFrame:
537
+ """Apply schema-based type casting to DataFrame columns."""
538
+ if df_name not in schema or "columns" not in schema[df_name]:
539
+ return df
540
+
541
+ schema_columns = schema[df_name]["columns"]
542
+ cast_exprs = []
543
+
544
+ for col in df.columns:
545
+ if col in schema_columns:
546
+ dtype_str = schema_columns[col]["dtype"]
547
+ # Convert string representation to actual Polars type
548
+ if dtype_str == "pl.Object":
549
+ cast_exprs.append(pl.col(col)) # Keep Object type as is
550
+ elif dtype_str == "pl.Int64":
551
+ cast_exprs.append(pl.col(col).cast(pl.Int64, strict=False))
552
+ elif dtype_str == "pl.Float64":
553
+ cast_exprs.append(pl.col(col).cast(pl.Float64, strict=False))
554
+ elif dtype_str == "pl.Utf8":
555
+ cast_exprs.append(pl.col(col).cast(pl.Utf8, strict=False))
556
+ elif dtype_str == "pl.Int32":
557
+ cast_exprs.append(pl.col(col).cast(pl.Int32, strict=False))
558
+ elif dtype_str == "pl.Boolean":
559
+ cast_exprs.append(pl.col(col).cast(pl.Boolean, strict=False))
560
+ elif dtype_str == "pl.Null":
561
+ cast_exprs.append(pl.col(col).cast(pl.Null, strict=False))
562
+ else:
563
+ cast_exprs.append(pl.col(col)) # Keep original type
564
+ else:
565
+ cast_exprs.append(pl.col(col)) # Keep original type
566
+
567
+ if cast_exprs:
568
+ df = df.with_columns(cast_exprs)
569
+
570
+ return df
571
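# Shape of the schema consumed by _apply_schema_casting: dtype strings such as
# "pl.Int64" are mapped to non-strict cast expressions. The column names below
# are illustrative, not taken from study5_schema.json.
import polars as pl

schema = {"samples_df": {"columns": {"sample_uid": {"dtype": "pl.Int64"},
                                     "sample_name": {"dtype": "pl.Utf8"}}}}
df = pl.DataFrame({"sample_uid": ["1", "2"], "sample_name": ["a", "b"]})
df = df.with_columns([
    pl.col("sample_uid").cast(pl.Int64, strict=False),
    pl.col("sample_name").cast(pl.Utf8, strict=False),
])
# df.dtypes == [pl.Int64, pl.Utf8]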
+
572
+
573
+ def _reorder_columns_by_schema(df: pl.DataFrame, schema: dict, df_name: str) -> pl.DataFrame:
574
+ """Reorder DataFrame columns to match schema order."""
575
+ if df_name not in schema or "columns" not in schema[df_name]:
576
+ return df
577
+
578
+ schema_columns = list(schema[df_name]["columns"].keys())
579
+ # Only reorder columns that exist in both schema and DataFrame
580
+ existing_columns = [col for col in schema_columns if col in df.columns]
581
+ # Add any extra columns not in schema at the end
582
+ extra_columns = [col for col in df.columns if col not in schema_columns]
583
+ final_column_order = existing_columns + extra_columns
584
+
585
+ return df.select(final_column_order)
586
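# Column ordering as done by _reorder_columns_by_schema: schema columns first,
# any extra columns appended at the end. Names are illustrative.
import polars as pl

schema_order = ["sample_uid", "sample_name"]
df = pl.DataFrame({"sample_name": ["a"], "extra": [1], "sample_uid": [7]})
ordered = [c for c in schema_order if c in df.columns] + \
          [c for c in df.columns if c not in schema_order]
df = df.select(ordered)
# df.columns == ["sample_uid", "sample_name", "extra"]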
+
587
+
588
+ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataFrame:
589
+ """Create DataFrame handling Object columns properly."""
590
+ object_data = {k: v for k, v in data.items() if k in object_columns}
591
+ regular_data = {k: v for k, v in data.items() if k not in object_columns}
592
+
593
+ # Determine expected length from regular data or first object column
594
+ expected_length = None
595
+ if regular_data:
596
+ for values in regular_data.values():
597
+ if values is not None and hasattr(values, '__len__'):
598
+ expected_length = len(values)
599
+ break
600
+
601
+ if expected_length is None and object_data:
602
+ for values in object_data.values():
603
+ if values is not None and hasattr(values, '__len__'):
604
+ expected_length = len(values)
605
+ break
606
+
607
+ if expected_length is None:
608
+ expected_length = 0
609
+
610
+ # Fix any object columns that have None or empty values
611
+ for col in object_columns:
612
+ if col in object_data:
613
+ values = object_data[col]
614
+ if values is None or (hasattr(values, '__len__') and len(values) == 0):
615
+ object_data[col] = [None] * expected_length
616
+ # print(f"DEBUG: Fixed object column '{col}' to have length {expected_length}")
617
+
618
+ # Create DataFrame with regular columns first
619
+ if regular_data:
620
+ df = pl.DataFrame(regular_data)
621
+ # print(f"DEBUG: Created DataFrame with regular columns, shape: {df.shape}")
622
+ # Add Object columns one by one
623
+ for col, values in object_data.items():
624
+ # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
625
+ if col == "adducts":
626
+ # Handle adducts as List(Struct) - now contains dicts
627
+ df = df.with_columns([pl.Series(col, values, dtype=pl.List(pl.Struct([
628
+ pl.Field("adduct", pl.Utf8),
629
+ pl.Field("count", pl.Int64),
630
+ pl.Field("percentage", pl.Float64),
631
+ pl.Field("mass", pl.Float64)
632
+ ])))])
633
+ else:
634
+ # Other object columns stay as Object
635
+ df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
636
+ else:
637
+ # Only Object columns
638
+ df = pl.DataFrame()
639
+ for col, values in object_data.items():
640
+ # print(f"DEBUG: Creating object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
641
+ if col == "adducts":
642
+ # Handle adducts as List(Struct) - now contains dicts
643
+ df = df.with_columns([pl.Series(col, values, dtype=pl.List(pl.Struct([
644
+ pl.Field("adduct", pl.Utf8),
645
+ pl.Field("count", pl.Int64),
646
+ pl.Field("percentage", pl.Float64),
647
+ pl.Field("mass", pl.Float64)
648
+ ])))])
649
+ else:
650
+ # Other object columns stay as Object
651
+ df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
652
+
653
+ return df
654
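# How _create_dataframe_with_objects attaches the two special column kinds:
# arbitrary Python objects as pl.Object and "adducts" as List(Struct). The
# struct fields match the code above; the values are made up for illustration.
import polars as pl

adduct_struct = pl.Struct([
    pl.Field("adduct", pl.Utf8),
    pl.Field("count", pl.Int64),
    pl.Field("percentage", pl.Float64),
    pl.Field("mass", pl.Float64),
])
df = pl.DataFrame({"consensus_uid": [1, 2]})
df = df.with_columns([
    pl.Series("chrom", [object(), None], dtype=pl.Object),
    pl.Series(
        "adducts",
        [[{"adduct": "[M+H]+", "count": 3, "percentage": 75.0, "mass": 180.063}], []],
        dtype=pl.List(adduct_struct),
    ),
])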
+
655
+
656
+ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object_columns: list = None) -> pl.DataFrame:
657
+ """Load a DataFrame from HDF5 group using schema."""
658
+ if object_columns is None:
659
+ object_columns = []
660
+
661
+ data: dict = {}
662
+ missing_columns = []
663
+
664
+ # Iterate through schema columns in order to maintain column ordering
665
+ logger.debug(
666
+ f"Loading {df_name} - schema type: {type(schema)}, content: {schema.keys() if isinstance(schema, dict) else 'Not a dict'}",
667
+ )
668
+ schema_section = schema.get(df_name, {}) if isinstance(schema, dict) else {}
669
+ logger.debug(f"Schema section for {df_name}: {schema_section}")
670
+ schema_columns = schema_section.get("columns", []) if isinstance(schema_section, dict) else []
671
+ logger.debug(f"Schema columns for {df_name}: {schema_columns}")
672
+ if schema_columns is None:
673
+ schema_columns = []
674
+
675
+ # First pass: load all existing columns
676
+ for col in schema_columns or []:
677
+ if col not in group:
678
+ missing_columns.append(col)
679
+ continue
680
+
681
+ dtype = schema[df_name]["columns"][col].get("dtype", "native")
682
+ if dtype == "pl.Object" or col in object_columns:
683
+ # Handle object columns specially
684
+ data[col] = _reconstruct_object_column(group[col][:], col)
685
+ else:
686
+ # Regular columns
687
+ column_data = group[col][:]
688
+
689
+ # Convert -123 sentinel values back to None for numeric columns
690
+ if len(column_data) > 0:
691
+ # Check if it's a numeric column that might contain sentinel values
692
+ try:
693
+ import numpy as np
694
+
695
+ data_array = np.array(column_data)
696
+ if data_array.dtype in [np.float32, np.float64, np.int32, np.int64]:
697
+ # Replace -123 sentinel values with None
698
+ processed_data: list = []
699
+ for item in column_data:
700
+ if item == -123:
701
+ processed_data.append(None)
702
+ else:
703
+ processed_data.append(item)
704
+ data[col] = processed_data
705
+ else:
706
+ data[col] = column_data
707
+ except Exception:
708
+ # If any error occurs, use original data
709
+ data[col] = column_data
710
+ else:
711
+ data[col] = column_data
712
+
713
+ # Determine expected DataFrame length from loaded columns
714
+ expected_length = None
715
+ for col, values in data.items():
716
+ if values is not None and hasattr(values, '__len__'):
717
+ expected_length = len(values)
718
+ logger.debug(f"Determined expected_length={expected_length} from loaded column '{col}'")
719
+ break
720
+
721
+ # If no data loaded yet, try HDF5 columns directly
722
+ if expected_length is None:
723
+ hdf5_columns = list(group.keys())
724
+ for col in hdf5_columns:
725
+ col_data = group[col][:]
726
+ if expected_length is None:
727
+ expected_length = len(col_data)
728
+ logger.debug(f"Determined expected_length={expected_length} from HDF5 column '{col}'")
729
+ break
730
+
731
+ # Default to 0 if no data found
732
+ if expected_length is None:
733
+ expected_length = 0
734
+ logger.debug("No columns found, setting expected_length=0")
735
+
736
+ # Second pass: handle missing columns
737
+ for col in missing_columns:
738
+ logger.warning(f"Column '{col}' not found in {df_name}.")
739
+ # For missing columns, create appropriately sized array of None values
740
+ if col in object_columns:
741
+ data[col] = [None] * expected_length
742
+ logger.debug(f"Created missing object column '{col}' with length {expected_length}")
743
+ else:
744
+ data[col] = [None] * expected_length
745
+ logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
746
+
747
+ # Check for columns in HDF5 file that are not in schema (for backward compatibility)
748
+ hdf5_columns = list(group.keys())
749
+ extra_columns = [col for col in hdf5_columns if col not in (schema_columns or [])]
750
+
751
+ for col in extra_columns:
752
+ logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
753
+ column_data = group[col][:]
754
+
755
+ # Try to determine if this should be treated as an object column
756
+ # by checking if the data looks like JSON strings
757
+ if len(column_data) > 0 and isinstance(column_data[0], bytes):
758
+ try:
759
+ # Check if it looks like JSON
760
+ test_decode = column_data[0].decode('utf-8')
761
+ if test_decode.startswith('[') or test_decode.startswith('{'):
762
+ # Looks like JSON, treat as object column
763
+ data[col] = _reconstruct_object_column(column_data, col)
764
+ if col not in object_columns:
765
+ object_columns.append(col)
766
+ else:
767
+ # Regular string data
768
+ data[col] = [item.decode('utf-8') if isinstance(item, bytes) else item for item in column_data]
769
+ except Exception:
770
+ # If decoding fails, treat as regular data
771
+ data[col] = column_data
772
+ else:
773
+ data[col] = column_data
774
+
775
+ if not data:
776
+ return None
777
+
778
+ # Handle byte string conversion for non-object columns
779
+ # Only convert to strings for columns that should actually be strings
780
+ for col, values in data.items():
781
+ if col not in object_columns and values is not None and len(values) > 0 and isinstance(values[0], bytes):
782
+ # Check schema to see if this should be a string column
783
+ should_be_string = False
784
+ if df_name in schema and "columns" in schema[df_name] and col in schema[df_name]["columns"]:
785
+ dtype_str = schema[df_name]["columns"][col]["dtype"]
786
+ should_be_string = dtype_str == "pl.Utf8"
787
+
788
+ if should_be_string:
789
+ processed_values = []
790
+ for val in values:
791
+ if isinstance(val, bytes):
792
+ val = val.decode("utf-8")
793
+ processed_values.append(val)
794
+ data[col] = processed_values
795
+ # If not a string column, leave as original data type (will be cast by schema)
796
+
797
+ # Create DataFrame with Object columns handled properly
798
+ if object_columns:
799
+ logger.debug(f"Creating DataFrame with object columns: {object_columns}")
800
+ for col in object_columns:
801
+ if col in data:
802
+ logger.debug(f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}")
803
+ df = _create_dataframe_with_objects(data, object_columns)
804
+ else:
805
+ df = pl.DataFrame(data)
806
+
807
+ # Clean null values and apply schema
808
+ df = _clean_string_nulls(df)
809
+ df = _apply_schema_casting(df, schema, df_name)
810
+ df = _reorder_columns_by_schema(df, schema, df_name)
811
+
812
+ return df
813
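# Null restoration on load, in isolation: the -123 sentinel written at save
# time is mapped back to None for numeric columns. A legitimate stored value of
# -123 would be indistinguishable from a null under this scheme.
import numpy as np

column_data = np.array([12.5, -123.0, 7.0])
restored = [None if v == -123 else float(v) for v in column_data]
# restored == [12.5, None, 7.0]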
+
814
+
815
+ def _save_study5_compressed(self, filename=None):
816
+ """
817
+ Compressed save identical to _save_study5 but skips serialization of chrom and ms2_specs columns in features_df.
818
+
819
+ This version maintains full compatibility with _load_study5() while providing performance benefits
820
+ by skipping the serialization of heavy object columns (chrom and ms2_specs) in features_df.
821
+ """
822
+
823
+ # if no extension is given, add .study5
824
+ if not filename.endswith(".study5"):
825
+ filename += ".study5"
826
+
827
+ self.logger.info(f"Compressed saving study to {filename}")
828
+
829
+ # delete existing file if it exists
830
+ if os.path.exists(filename):
831
+ os.remove(filename)
832
+
833
+ # Load schema for column ordering
834
+ schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
835
+ schema = _load_schema(schema_path)
836
+ if not schema:
837
+ self.logger.warning(f"Could not load schema from {schema_path}")
838
+
839
+ with h5py.File(filename, "w") as f:
840
+ # Count total DataFrames to save for progress tracking
841
+ dataframes_to_save = []
842
+ if self.samples_df is not None and not self.samples_df.is_empty():
843
+ dataframes_to_save.append(("samples", len(self.samples_df)))
844
+ if self.features_df is not None and not self.features_df.is_empty():
845
+ dataframes_to_save.append(("features", len(self.features_df)))
846
+ if self.consensus_df is not None and not self.consensus_df.is_empty():
847
+ dataframes_to_save.append(("consensus", len(self.consensus_df)))
848
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
849
+ dataframes_to_save.append(("consensus_mapping", len(self.consensus_mapping_df)))
850
+ if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
851
+ dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
852
+
853
+ total_steps = len(dataframes_to_save) + 1 # +1 for metadata
854
+
855
+ # Show progress for large saves
856
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
857
+
858
+ with tqdm(
859
+ total=total_steps,
860
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Fast saving study",
861
+ disable=tdqm_disable,
862
+ ) as pbar:
863
+
864
+ # Create groups for organization
865
+ metadata_group = f.create_group("metadata")
866
+ features_group = f.create_group("features")
867
+ consensus_group = f.create_group("consensus")
868
+ consensus_mapping_group = f.create_group("consensus_mapping")
869
+ consensus_ms2_group = f.create_group("consensus_ms2")
870
+
871
+ # Store metadata
872
+ metadata_group.attrs["format"] = "master-study-1"
873
+ metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
874
+ metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""
875
+
876
+ # Store parameters as JSON
877
+ if hasattr(self, "parameters") and self.history is not None:
878
+ try:
879
+ parameters_json = json.dumps(self.history, indent=2)
880
+ metadata_group.create_dataset("parameters", data=parameters_json)
881
+ except (TypeError, ValueError) as e:
882
+ self.logger.warning(f"Failed to serialize history: {e}")
883
+ metadata_group.create_dataset("parameters", data="")
884
+ else:
885
+ metadata_group.create_dataset("parameters", data="")
886
+
887
+ pbar.update(1)
888
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes")
889
+
890
+ # Store samples_df - use optimized batch processing
891
+ if self.samples_df is not None and not self.samples_df.is_empty():
892
+ samples_group = f.create_group("samples")
893
+ self.logger.debug(f"Saving samples_df with {len(self.samples_df)} rows using optimized method")
894
+ _save_dataframe_optimized(self.samples_df, samples_group, schema, "samples_df", self.logger)
895
+ pbar.update(1)
896
+
897
+ # Store features_df - use fast method that skips chrom and ms2_specs columns
898
+ if self.features_df is not None and not self.features_df.is_empty():
899
+ self.logger.debug(f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)")
900
+ _save_dataframe_optimized_fast(self.features_df, features_group, schema, "features_df", self.logger)
901
+ pbar.update(1)
902
+
903
+ # Store consensus_df - use optimized batch processing
904
+ if self.consensus_df is not None and not self.consensus_df.is_empty():
905
+ self.logger.debug(f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method")
906
+ _save_dataframe_optimized(self.consensus_df, consensus_group, schema, "consensus_df", self.logger)
907
+ pbar.update(1)
908
+
909
+ # Store consensus_mapping_df - keep existing fast method
910
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
911
+ consensus_mapping = self.consensus_mapping_df.clone()
912
+ self.logger.debug(f"Saving consensus_mapping_df with {len(consensus_mapping)} rows")
913
+ for col in consensus_mapping.columns:
914
+ try:
915
+ data = consensus_mapping[col].to_numpy()
916
+ # Use LZF compression for consensus mapping data
917
+ consensus_mapping_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
918
+ except Exception as e:
919
+ self.logger.warning(f"Failed to save column '{col}' in consensus_mapping_df: {e}")
920
+ pbar.update(1)
921
+
922
+ # Store consensus_ms2 - use optimized batch processing
923
+ if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
924
+ self.logger.debug(f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method")
925
+ _save_dataframe_optimized(self.consensus_ms2, consensus_ms2_group, schema, "consensus_ms2", self.logger)
926
+ pbar.update(1)
927
+
928
+ self.logger.info(f"Fast study saved successfully to {filename}")
929
+ self.logger.debug(f"Fast save completed for {filename}")
930
+
931
+
932
+ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_size=10000):
933
+ """
934
+ Save DataFrame with optimized batch processing, but skip chrom and ms2_specs columns for features_df.
935
+
936
+ This function is identical to _save_dataframe_optimized but excludes heavy object columns
937
+ (chrom and ms2_specs) when saving features_df to improve performance.
938
+
939
+ Args:
940
+ df: Polars DataFrame to save
941
+ group: HDF5 group to save to
942
+ schema: Schema for column ordering
943
+ df_name: Name of the DataFrame for schema lookup
944
+ logger: Logger instance
945
+ chunk_size: Number of rows to process at once for memory efficiency
946
+ """
947
+ if df is None or df.is_empty():
948
+ return
949
+
950
+ try:
951
+ # Reorder columns according to schema
952
+ df_ordered = _reorder_columns_by_schema(df.clone(), schema, df_name)
953
+
954
+ # Skip chrom and ms2_specs columns for features_df
955
+ if df_name == "features_df":
956
+ skip_columns = ["chrom", "ms2_specs"]
957
+ df_ordered = df_ordered.select([col for col in df_ordered.columns if col not in skip_columns])
958
+ logger.debug(f"Fast save: skipping columns {skip_columns} for {df_name}")
959
+
960
+ total_rows = len(df_ordered)
961
+
962
+ # Group columns by processing type for batch optimization
963
+ numeric_cols = []
964
+ string_cols = []
965
+ object_cols = []
966
+
967
+ for col in df_ordered.columns:
968
+ dtype = str(df_ordered[col].dtype).lower()
969
+ if dtype == "object":
970
+ object_cols.append(col)
971
+ elif dtype in ["string", "utf8"]:
972
+ string_cols.append(col)
973
+ else:
974
+ numeric_cols.append(col)
975
+
976
+ logger.debug(f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns")
977
+
978
+ # Process numeric columns in batch (most efficient)
979
+ if numeric_cols:
980
+ for col in numeric_cols:
981
+ _save_numeric_column_fast(group, col, df_ordered[col], logger)
982
+
983
+ # Process string columns in batch
984
+ if string_cols:
985
+ for col in string_cols:
986
+ _save_string_column_fast(group, col, df_ordered[col], logger)
987
+
988
+ # Process object columns with optimized serialization
989
+ if object_cols:
990
+ _save_object_columns_optimized(group, df_ordered, object_cols, logger, chunk_size)
991
+
992
+ except Exception as e:
993
+ logger.error(f"Failed to save DataFrame {df_name}: {e}")
994
+ # Fallback to old method for safety
995
+ _save_dataframe_column_legacy(df, group, schema, df_name, logger)
996
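# The column handling in _save_dataframe_optimized_fast, on a toy features
# frame: the heavy object columns are dropped up front, and the remaining
# columns are grouped by dtype before writing. Column names are illustrative.
import polars as pl

df = pl.DataFrame({"feature_uid": [1, 2], "rt": [61.2, 84.9], "label": ["a", "b"]})
df = df.with_columns(pl.Series("chrom", [object(), object()], dtype=pl.Object))

skip_columns = ["chrom", "ms2_specs"]
df = df.select([c for c in df.columns if c not in skip_columns])

string_cols = [c for c in df.columns if df[c].dtype == pl.Utf8]
numeric_cols = [c for c in df.columns if df[c].dtype not in (pl.Utf8, pl.Object)]
# numeric_cols == ["feature_uid", "rt"]; string_cols == ["label"]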
+
997
+
998
+ def _save_study5(self, filename=None):
999
+ """
1000
+ Save the Study instance data to a .study5 HDF5 file with optimized schema-based format.
1001
+
1002
+ This method saves all Study DataFrames (samples_df, features_df, consensus_df,
1003
+ consensus_mapping_df, consensus_ms2) using the schema defined in study5_schema.json
1004
+ for proper Polars DataFrame type handling.
1005
+
1006
+ Args:
1007
+ filename (str): Target file name; the ".study5" extension is appended if it is missing.
1008
+
1009
+ Stores:
1010
+ - metadata/format (str): Data format identifier ("master-study-1")
1011
+ - metadata/folder (str): Study default folder path
1012
+ - metadata/label (str): Study label
1013
+ - metadata/parameters (str): JSON-serialized parameters dictionary
1014
+ - samples/: samples_df DataFrame data
1015
+ - features/: features_df DataFrame data with Chromatogram and Spectrum objects
1016
+ - consensus/: consensus_df DataFrame data
1017
+ - consensus_mapping/: consensus_mapping_df DataFrame data
1018
+ - consensus_ms2/: consensus_ms2 DataFrame data with Spectrum objects
1019
+
1020
+ Notes:
1021
+ - Uses HDF5 format with compression for efficient storage.
1022
+ - Chromatogram objects are serialized as JSON for reconstruction.
1023
+ - MS2 scan lists and Spectrum objects are properly serialized.
1024
+ - Parameters dictionary (nested dicts) are JSON-serialized for storage.
1025
+ - Optimized for use with _load_study5() method.
1026
+ """
1027
+
1028
+ # if no extension is given, add .study5
1029
+ if not filename.endswith(".study5"):
1030
+ filename += ".study5"
1031
+
1032
+ self.logger.info(f"Saving study to {filename}")
1033
+
1034
+ # delete existing file if it exists
1035
+ if os.path.exists(filename):
1036
+ os.remove(filename)
1037
+
1038
+ # Load schema for column ordering
1039
+ schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
1040
+ schema = _load_schema(schema_path)
1041
+ if not schema:
1042
+ self.logger.warning(f"Could not load schema from {schema_path}")
1043
+
1044
+ with h5py.File(filename, "w") as f:
1045
+ # Count total DataFrames to save for progress tracking
1046
+ dataframes_to_save = []
1047
+ if self.samples_df is not None and not self.samples_df.is_empty():
1048
+ dataframes_to_save.append(("samples", len(self.samples_df)))
1049
+ if self.features_df is not None and not self.features_df.is_empty():
1050
+ dataframes_to_save.append(("features", len(self.features_df)))
1051
+ if self.consensus_df is not None and not self.consensus_df.is_empty():
1052
+ dataframes_to_save.append(("consensus", len(self.consensus_df)))
1053
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
1054
+ dataframes_to_save.append(("consensus_mapping", len(self.consensus_mapping_df)))
1055
+ if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
1056
+ dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
1057
+
1058
+ total_steps = len(dataframes_to_save) + 1 # +1 for metadata
1059
+
1060
+ # Show progress for large saves
1061
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
1062
+
1063
+ with tqdm(
1064
+ total=total_steps,
1065
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving study",
1066
+ disable=tdqm_disable,
1067
+ ) as pbar:
1068
+
1069
+ # Create groups for organization
1070
+ metadata_group = f.create_group("metadata")
1071
+ features_group = f.create_group("features")
1072
+ consensus_group = f.create_group("consensus")
1073
+ consensus_mapping_group = f.create_group("consensus_mapping")
1074
+ consensus_ms2_group = f.create_group("consensus_ms2")
1075
+
1076
+ # Store metadata
1077
+ metadata_group.attrs["format"] = "master-study-1"
1078
+ metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
1079
+ metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""
1080
+
1081
+ # Store parameters as JSON
1082
+ if hasattr(self, "parameters") and self.history is not None:
1083
+ try:
1084
+ parameters_json = json.dumps(self.history, indent=2)
1085
+ metadata_group.create_dataset("parameters", data=parameters_json)
1086
+ except (TypeError, ValueError) as e:
1087
+ self.logger.warning(f"Failed to serialize history: {e}")
1088
+ metadata_group.create_dataset("parameters", data="")
1089
+ else:
1090
+ metadata_group.create_dataset("parameters", data="")
1091
+
1092
+ pbar.update(1)
1093
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes")
1094
+
1095
+ # Store samples_df - use optimized batch processing
1096
+ if self.samples_df is not None and not self.samples_df.is_empty():
1097
+ samples_group = f.create_group("samples")
1098
+ self.logger.debug(f"Saving samples_df with {len(self.samples_df)} rows using optimized method")
1099
+ _save_dataframe_optimized(self.samples_df, samples_group, schema, "samples_df", self.logger)
1100
+ pbar.update(1)
1101
+
1102
+ # Store features_df - use optimized batch processing
1103
+ if self.features_df is not None and not self.features_df.is_empty():
1104
+ self.logger.debug(f"Saving features_df with {len(self.features_df)} rows using optimized method")
1105
+ _save_dataframe_optimized(self.features_df, features_group, schema, "features_df", self.logger)
1106
+ pbar.update(1)
1107
+
1108
+ # Store consensus_df - use optimized batch processing
1109
+ if self.consensus_df is not None and not self.consensus_df.is_empty():
1110
+ self.logger.debug(f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method")
1111
+ _save_dataframe_optimized(self.consensus_df, consensus_group, schema, "consensus_df", self.logger)
1112
+ pbar.update(1)
1113
+
1114
+ # Store consensus_mapping_df - keep existing fast method
1115
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
1116
+ consensus_mapping = self.consensus_mapping_df.clone()
1117
+ self.logger.debug(f"Saving consensus_mapping_df with {len(consensus_mapping)} rows")
1118
+ for col in consensus_mapping.columns:
1119
+ try:
1120
+ data = consensus_mapping[col].to_numpy()
1121
+ # Use LZF compression for consensus mapping data
1122
+ consensus_mapping_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
1123
+ except Exception as e:
1124
+ self.logger.warning(f"Failed to save column '{col}' in consensus_mapping_df: {e}")
1125
+ pbar.update(1)
1126
+
1127
+ # Store consensus_ms2 - use optimized batch processing
1128
+ if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
1129
+ self.logger.debug(f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method")
1130
+ _save_dataframe_optimized(self.consensus_ms2, consensus_ms2_group, schema, "consensus_ms2", self.logger)
1131
+ pbar.update(1)
1132
+
1133
+ self.logger.info(f"Study saved successfully to {filename}")
1134
+ self.logger.debug(f"Save completed for {filename}")
1135
+ self.logger.debug(f"Save completed for {filename}")
1136
+
1137
+
1138
+ def _load_study5(self, filename=None):
1139
+ """
1140
+ Load Study instance data from a .study5 HDF5 file.
1141
+
1142
+ Restores all Study DataFrames that were saved with _save_study5() method using the
1143
+ schema defined in study5_schema.json for proper Polars DataFrame reconstruction.
1144
+
1145
+ Args:
1146
+ filename (str, optional): Path to the .study5 HDF5 file to load. If None, uses default.
1147
+
1148
+ Returns:
1149
+ None (modifies self in place)
1150
+
1151
+ Notes:
1152
+ - Restores DataFrames with proper schema typing from study5_schema.json
1153
+ - Handles Chromatogram and Spectrum object reconstruction
1154
+ - Properly handles MS2 scan lists and spectrum lists
1155
+ - Restores parameters dictionary from JSON serialization
1156
+ """
1157
+
1158
+ self.logger.info(f"Loading study from {filename}")
1159
+
1160
+ # Handle default filename
1161
+ if filename is None:
1162
+ if self.folder is not None:
1163
+ filename = os.path.join(self.folder, "study.study5")
1164
+ else:
1165
+ self.logger.error("Either filename or folder must be provided")
1166
+ return
1167
+
1168
+ # Add .study5 extension if not provided
1169
+ if not filename.endswith(".study5"):
1170
+ filename += ".study5"
1171
+
1172
+ if not os.path.exists(filename):
1173
+ self.logger.error(f"File {filename} does not exist")
1174
+ return
1175
+
1176
+ # Load schema for proper DataFrame reconstruction
1177
+ schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
1178
+ schema = _load_schema(schema_path)
1179
+ if not schema:
1180
+ self.logger.warning(f"Schema file {schema_path} not found. Using default types.")
1181
+
1182
+ # Define loading steps for progress tracking
1183
+ loading_steps = [
1184
+ "metadata",
1185
+ "samples_df",
1186
+ "features_df",
1187
+ "consensus_df",
1188
+ "consensus_mapping_df",
1189
+ "consensus_ms2"
1190
+ ]
1191
+
1192
+ # Check if progress bar should be disabled based on log level
1193
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
1194
+
1195
+
1208
+ with h5py.File(filename, "r") as f:
1209
+ # Use progress bar to show loading progress
1210
+ with tqdm(
1211
+ total=len(loading_steps),
1212
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading study",
1213
+ disable=tdqm_disable,
1214
+ ) as pbar:
1215
+
1216
+ # Load metadata
1217
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
1218
+ if "metadata" in f:
1219
+ metadata = f["metadata"]
1220
+ self.folder = _decode_bytes_attr(metadata.attrs.get("folder", ""))
1221
+ if hasattr(self, "label"):
1222
+ self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
1223
+
1224
+ # Load parameters from JSON
1225
+ if "parameters" in metadata:
1226
+ try:
1227
+ parameters_data = metadata["parameters"][()]
1228
+ if isinstance(parameters_data, bytes):
1229
+ parameters_data = parameters_data.decode("utf-8")
1230
+
1231
+ if parameters_data and parameters_data != "":
1232
+ self.history = json.loads(parameters_data)
1233
+ else:
1234
+ self.history = {}
1235
+ except (json.JSONDecodeError, ValueError, TypeError) as e:
1236
+ self.logger.warning(f"Failed to deserialize parameters: {e}")
1237
+ self.history = {}
1238
+ else:
1239
+ self.history = {}
1240
+
1241
+ # Reconstruct self.parameters from loaded history
1242
+ from masster.study.defaults.study_def import study_defaults
1243
+
1244
+ # Always create a fresh study_defaults object to ensure we have all defaults
1245
+ self.parameters = study_defaults()
1246
+
1247
+ # Update parameters from loaded history if available
1248
+ if self.history and "study" in self.history:
1249
+ study_params = self.history["study"]
1250
+ if isinstance(study_params, dict):
1251
+ failed_params = self.parameters.set_from_dict(study_params, validate=False)
1252
+ if failed_params:
1253
+ self.logger.debug(f"Could not set study parameters: {failed_params}")
1254
+ else:
1255
+ self.logger.debug("Successfully updated parameters from loaded history")
1256
+ else:
1257
+ self.logger.debug("Study parameters in history are not a valid dictionary")
1258
+ else:
1259
+ self.logger.debug("No study parameters found in history, using defaults")
1260
+
1261
+ # Synchronize instance attributes with parameters (similar to __init__)
1262
+ # Note: folder and label are already loaded from metadata attributes above
1263
+ # but we ensure they match the parameters for consistency
1264
+ if hasattr(self.parameters, 'folder') and self.parameters.folder is not None:
1265
+ self.folder = self.parameters.folder
1266
+ if hasattr(self.parameters, 'label') and self.parameters.label is not None:
1267
+ self.label = self.parameters.label
1268
+ if hasattr(self.parameters, 'log_level'):
1269
+ self.log_level = self.parameters.log_level
1270
+ if hasattr(self.parameters, 'log_label'):
1271
+ self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
1272
+ if hasattr(self.parameters, 'log_sink'):
1273
+ self.log_sink = self.parameters.log_sink
1274
+ pbar.update(1)
1275
+
1276
+ # Load samples_df
1277
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
1278
+ if "samples" in f and len(f["samples"].keys()) > 0:
1279
+ self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
1280
+ else:
1281
+ # Initialize empty samples_df with the correct schema if no data exists
1282
+ self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
1283
+ self.samples_df = pl.DataFrame(
1284
+ {
1285
+ "sample_uid": [],
1286
+ "sample_name": [],
1287
+ "sample_path": [],
1288
+ "sample_type": [],
1289
+ "size": [],
1290
+ "map_id": [],
1291
+ "file_source": [],
1292
+ },
1293
+ schema={
1294
+ "sample_uid": pl.Int64,
1295
+ "sample_name": pl.Utf8,
1296
+ "sample_path": pl.Utf8,
1297
+ "sample_type": pl.Utf8,
1298
+ "size": pl.Int64,
1299
+ "map_id": pl.Utf8,
1300
+ "file_source": pl.Utf8,
1301
+ },
1302
+ )
1303
+ pbar.update(1)
1304
+
1333
+ # Load features_df
1334
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
1335
+ if "features" in f and len(f["features"].keys()) > 0:
1336
+ object_columns = ["chrom", "ms2_scans", "ms2_specs"]
1337
+ self.features_df = _load_dataframe_from_group(f["features"], schema, "features_df", self.logger, object_columns)
1338
+ else:
1339
+ self.features_df = None
1340
+ pbar.update(1)
1341
+
1342
+ # Load consensus_df
1343
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
1344
+ if "consensus" in f and len(f["consensus"].keys()) > 0:
1345
+ # Only include adducts in object_columns if it actually exists in the file
1346
+ object_columns = []
1347
+ if "adducts" in f["consensus"]:
1348
+ object_columns.append("adducts")
1349
+
1350
+ self.consensus_df = _load_dataframe_from_group(f["consensus"], schema, "consensus_df", self.logger, object_columns)
1351
+
1352
+ # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
1353
+ if self.consensus_df is not None:
1354
+ if "adducts" not in self.consensus_df.columns or self.consensus_df["adducts"].dtype == pl.Null:
1355
+ self.logger.info("Adding missing 'adducts' column for backward compatibility")
1356
+ empty_adducts: list[list] = [[] for _ in range(len(self.consensus_df))]
1357
+
1358
+ # If column exists but is Null, drop it first
1359
+ if "adducts" in self.consensus_df.columns:
1360
+ self.consensus_df = self.consensus_df.drop("adducts")
1361
+
1362
+ self.consensus_df = self.consensus_df.with_columns([
1363
+ pl.Series("adducts", empty_adducts, dtype=pl.List(pl.Struct([
1364
+ pl.Field("adduct", pl.Utf8),
1365
+ pl.Field("count", pl.Int64),
1366
+ pl.Field("percentage", pl.Float64),
1367
+ pl.Field("mass", pl.Float64)
1368
+ ])))
1369
+ ])
1370
+ else:
1371
+ self.consensus_df = None
1372
+ pbar.update(1)
1373
+
1374
+ # Load consensus_mapping_df
1375
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
1376
+ if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
1377
+ self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
1378
+ else:
1379
+ self.consensus_mapping_df = None
1380
+ pbar.update(1)
1381
+
1389
+ # Load consensus_ms2
1390
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2")
1391
+ if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
1392
+ object_columns = ["spec"]
1393
+ self.consensus_ms2 = _load_dataframe_from_group(f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns)
1394
+ else:
1395
+ self.consensus_ms2 = None
1396
+ pbar.update(1)
1397
+
1398
+ self.logger.debug("Study loaded")