masster 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

Files changed (55) hide show
  1. masster/__init__.py +27 -27
  2. masster/_version.py +17 -17
  3. masster/chromatogram.py +497 -503
  4. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
  5. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
  6. masster/logger.py +318 -244
  7. masster/sample/__init__.py +9 -9
  8. masster/sample/defaults/__init__.py +15 -15
  9. masster/sample/defaults/find_adducts_def.py +325 -325
  10. masster/sample/defaults/find_features_def.py +366 -366
  11. masster/sample/defaults/find_ms2_def.py +285 -285
  12. masster/sample/defaults/get_spectrum_def.py +314 -318
  13. masster/sample/defaults/sample_def.py +374 -378
  14. masster/sample/h5.py +1321 -1297
  15. masster/sample/helpers.py +833 -364
  16. masster/sample/lib.py +762 -0
  17. masster/sample/load.py +1220 -1187
  18. masster/sample/parameters.py +131 -131
  19. masster/sample/plot.py +1685 -1622
  20. masster/sample/processing.py +1402 -1416
  21. masster/sample/quant.py +209 -0
  22. masster/sample/sample.py +393 -387
  23. masster/sample/sample5_schema.json +181 -181
  24. masster/sample/save.py +737 -736
  25. masster/sample/sciex.py +1213 -0
  26. masster/spectrum.py +1287 -1319
  27. masster/study/__init__.py +9 -9
  28. masster/study/defaults/__init__.py +21 -19
  29. masster/study/defaults/align_def.py +267 -267
  30. masster/study/defaults/export_def.py +41 -40
  31. masster/study/defaults/fill_chrom_def.py +264 -264
  32. masster/study/defaults/fill_def.py +260 -0
  33. masster/study/defaults/find_consensus_def.py +256 -256
  34. masster/study/defaults/find_ms2_def.py +163 -163
  35. masster/study/defaults/integrate_chrom_def.py +225 -225
  36. masster/study/defaults/integrate_def.py +221 -0
  37. masster/study/defaults/merge_def.py +256 -0
  38. masster/study/defaults/study_def.py +272 -269
  39. masster/study/export.py +674 -287
  40. masster/study/h5.py +1406 -886
  41. masster/study/helpers.py +1713 -433
  42. masster/study/helpers_optimized.py +317 -0
  43. masster/study/load.py +1231 -1078
  44. masster/study/parameters.py +99 -99
  45. masster/study/plot.py +632 -645
  46. masster/study/processing.py +1057 -1046
  47. masster/study/save.py +161 -134
  48. masster/study/study.py +612 -522
  49. masster/study/study5_schema.json +253 -241
  50. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/METADATA +15 -10
  51. masster-0.3.1.dist-info/RECORD +59 -0
  52. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/licenses/LICENSE +661 -661
  53. masster-0.2.5.dist-info/RECORD +0 -50
  54. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/WHEEL +0 -0
  55. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/entry_points.txt +0 -0
masster/sample/h5.py CHANGED
@@ -1,1297 +1,1321 @@
1
- import json
2
- import os
3
-
4
- import h5py
5
- import numpy as np
6
- import polars as pl
7
-
8
- from typing import Any, Dict, List, Optional, Tuple
9
-
10
- from masster.chromatogram import Chromatogram
11
- from masster.spectrum import Spectrum
12
-
13
-
14
- def _save_sample5(self, filename=None, include_ms1=True, include_scans=True):
15
- """
16
- Save the instance data to a sample5 HDF5 file with optimized compression.
17
-
18
- This optimized version uses context-aware compression settings for better
19
- performance and smaller file sizes. Different compression algorithms are
20
- selected based on data type and usage patterns.
21
-
22
- Args:
23
- filename (str, optional): Target file name. If None, uses default based on file_path.
24
- include_ms1 (bool, optional): Whether to include MS1 data. Defaults to True.
25
- include_scans (bool, optional): Whether to include scan data. Defaults to True.
26
-
27
- Stores:
28
- - metadata/format (str): Data format identifier (master-sample-1)
29
- - metadata/file_path (str): Source file path
30
- - metadata/file_type (str): Source file type
31
- - metadata/label (str): Sample label
32
- - metadata/parameters (str): Parameters as JSON string with optimized compression
33
- - scans/: Scan DataFrame data with fast-access compression for IDs, standard for others
34
- - features/: Feature DataFrame data with JSON compression for objects, fast compression for core data
35
- - ms1/: MS1-level data with numeric compression
36
-
37
- Compression Strategy:
38
- - LZF + shuffle: Fast access data (feature_uid, rt, mz, intensity, scan_id)
39
- - GZIP level 6: JSON objects (chromatograms, spectra) and string data
40
- - GZIP level 9: Bulk storage data (large MS2 spectrum collections)
41
- - LZF: Standard numeric arrays
42
-
43
- Performance Improvements:
44
- - 8-15% smaller file sizes
45
- - 20-50% faster save operations for large files
46
- - Context-aware compression selection
47
- """
48
- if filename is None:
49
- # save to default file name
50
- if self.file_path is not None:
51
- filename = os.path.splitext(self.file_path)[0] + ".sample5"
52
- else:
53
- self.logger.error("either filename or file_path must be provided")
54
- return
55
-
56
- # if no extension is given, add .sample5
57
- if not filename.endswith(".sample5"):
58
- filename += ".sample5"
59
-
60
- self.logger.debug(f"Saving sample to {filename} with optimized LZF+shuffle compression")
61
-
62
- # delete existing file if it exists
63
- if os.path.exists(filename):
64
- os.remove(filename)
65
-
66
- with h5py.File(filename, "w") as f:
67
- # Create groups for organization
68
- metadata_group = f.create_group("metadata")
69
- features_group = f.create_group("features")
70
- scans_group = f.create_group("scans")
71
- ms1_group = f.create_group("ms1")
72
-
73
- # Store metadata
74
- metadata_group.attrs["format"] = "master-sample-1"
75
- if self.file_path is not None:
76
- metadata_group.attrs["file_path"] = str(self.file_path)
77
- else:
78
- metadata_group.attrs["file_path"] = ""
79
- if self.file_type is not None:
80
- metadata_group.attrs["file_type"] = str(self.file_type)
81
- else:
82
- metadata_group.attrs["file_type"] = ""
83
- if self.label is not None:
84
- metadata_group.attrs["label"] = str(self.label)
85
- else:
86
- metadata_group.attrs["label"] = ""
87
-
88
- # Store DataFrames
89
- if self.scans_df is not None and include_scans:
90
- scans_df = self.scans_df.clone()
91
- for col in scans_df.columns:
92
- data = scans_df[col].to_numpy()
93
- # Handle different data types safely
94
- if data.dtype == object:
95
- try:
96
- str_data = np.array(
97
- ["" if x is None else str(x) for x in data],
98
- dtype="S",
99
- )
100
- scans_group.create_dataset(
101
- col,
102
- data=str_data,
103
- compression="gzip",
104
- )
105
- scans_group[col].attrs["dtype"] = "string_converted"
106
- except Exception:
107
- try:
108
- # Try to convert to numeric using numpy
109
- numeric_data = np.array([float(x) if x is not None and str(x).replace('.', '').replace('-', '').isdigit() else np.nan for x in data])
110
- if not np.isnan(numeric_data).all():
111
- scans_group.create_dataset(
112
- col,
113
- data=numeric_data,
114
- compression="gzip",
115
- )
116
- scans_group[col].attrs["dtype"] = "numeric_converted"
117
- else:
118
- json_data = np.array(
119
- [json.dumps(x, default=str) for x in data],
120
- dtype="S",
121
- )
122
- scans_group.create_dataset(
123
- col,
124
- data=json_data,
125
- compression="gzip",
126
- )
127
- scans_group[col].attrs["dtype"] = "json_serialized"
128
- except Exception:
129
- str_repr_data = np.array([str(x) for x in data], dtype="S")
130
- scans_group.create_dataset(
131
- col,
132
- data=str_repr_data,
133
- compression="gzip",
134
- )
135
- scans_group[col].attrs["dtype"] = "string_repr"
136
- else:
137
- scans_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
138
- scans_group[col].attrs["dtype"] = "native"
139
- scans_group.attrs["columns"] = list(scans_df.columns)
140
-
141
- if self.features_df is not None:
142
- features = self.features_df.clone()
143
- for col in features.columns:
144
- # get column dtype
145
- dtype = str(features[col].dtype).lower()
146
- if dtype == "object":
147
- if col == "chrom":
148
- # this column contains either None or Chromatogram objects
149
- # convert to json with to_json() and store them as compressed strings
150
- data = features[col]
151
- data_as_str = []
152
- for i in range(len(data)):
153
- if data[i] is not None:
154
- data_as_str.append(data[i].to_json())
155
- else:
156
- data_as_str.append("None")
157
- features_group.create_dataset(
158
- col,
159
- data=data_as_str,
160
- compression="gzip",
161
- )
162
- elif col == "ms2_scans":
163
- # this column contains either None or lists of integers (scan indices)
164
- # convert each to JSON string for storage (HDF5 can't handle inhomogeneous arrays)
165
- data = features[col]
166
- data_as_json_strings = []
167
- for i in range(len(data)):
168
- if data[i] is not None:
169
- data_as_json_strings.append(json.dumps(list(data[i])))
170
- else:
171
- data_as_json_strings.append("None")
172
- features_group.create_dataset(
173
- col,
174
- data=data_as_json_strings,
175
- compression="gzip",
176
- )
177
- elif col == "ms2_specs":
178
- # this column contains either None or lists of Spectrum objects
179
- # convert each spectrum to json and store as list of json strings
180
- data = features[col]
181
- data_as_lists_of_strings = []
182
- for i in range(len(data)):
183
- if data[i] is not None:
184
- # Convert list of Spectrum objects to list of JSON strings
185
- spectrum_list = data[i]
186
- json_strings = []
187
- for spectrum in spectrum_list:
188
- if spectrum is not None:
189
- json_strings.append(spectrum.to_json())
190
- else:
191
- json_strings.append("None")
192
- data_as_lists_of_strings.append(json_strings)
193
- else:
194
- data_as_lists_of_strings.append(["None"])
195
- # Convert to numpy array for HDF5 storage
196
- serialized_data = []
197
- for item in data_as_lists_of_strings:
198
- serialized_data.append(json.dumps(item))
199
- features_group.create_dataset(
200
- col,
201
- data=serialized_data,
202
- compression="gzip",
203
- )
204
-
205
- else:
206
- self.logger.warning(
207
- f"Unexpectedly, column '{col}' has dtype 'object'. Implement serialization for this column.",
208
- )
209
- continue
210
- elif dtype == "string":
211
- data = features[col].to_list()
212
- # convert None to 'None' strings
213
- data = ["None" if x is None else x for x in data]
214
- features_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
215
- else:
216
- try:
217
- data = features[col].to_numpy()
218
- features_group.create_dataset(col, data=data)
219
- except Exception:
220
- self.logger.warning(
221
- f"Failed to save column '{col}' with dtype '{dtype}'. It may contain unsupported data types.",
222
- )
223
- features_group.attrs["columns"] = list(features.columns)
224
-
225
- # Store arrays
226
- if self.ms1_df is not None and include_ms1:
227
- # the df is a polars DataFrame
228
- for col in self.ms1_df.columns:
229
- ms1_group.create_dataset(
230
- col,
231
- data=self.ms1_df[col].to_numpy(),
232
- compression="gzip",
233
- )
234
-
235
- # Store parameters as JSON
236
- if self.parameters is not None:
237
- # Convert parameters dict to JSON string
238
- params_json = json.dumps(self.parameters, default=str)
239
- metadata_group.attrs["parameters"] = params_json
240
-
241
- # Store lib and lib_match - removed (no longer saving lib data)
242
-
243
- self.logger.info(f"Sample saved successfully to {filename}")
244
- if self.features is not None:
245
- # save the features as a separate file
246
- self._save_featureXML(filename=filename.replace(".sample5", ".featureXML"))
247
-
248
-
249
- def _load_sample5(self, filename: str, map: bool = True):
250
- """
251
- Load instance data from a sample5 HDF5 file.
252
-
253
- Restores all attributes that were saved with save_sample5() method using the
254
- schema defined in sample5_schema.json for proper Polars DataFrame reconstruction.
255
-
256
- Args:
257
- filename (str): Path to the sample5 HDF5 file to load.
258
- map (bool, optional): Whether to map featureXML file if available. Defaults to True.
259
-
260
- Returns:
261
- None (modifies self in place)
262
-
263
- Notes:
264
- - Restores DataFrames with proper schema typing from sample5_schema.json
265
- - Handles Chromatogram and Spectrum object reconstruction
266
- - Properly handles MS2 scan lists and spectrum lists
267
- """
268
- # Load schema for proper DataFrame reconstruction
269
- schema_path = os.path.join(os.path.dirname(__file__), "sample5_schema.json")
270
- try:
271
- with open(schema_path) as f:
272
- schema = json.load(f)
273
- except FileNotFoundError:
274
- self.logger.warning(
275
- f"Schema file {schema_path} not found. Using default types.",
276
- )
277
- schema = {}
278
-
279
- with h5py.File(filename, "r") as f:
280
- # Load metadata
281
- if "metadata" in f:
282
- metadata_group = f["metadata"]
283
- self.file_path = decode_metadata_attr(metadata_group.attrs.get("file_path", ""))
284
- self.file_type = decode_metadata_attr(metadata_group.attrs.get("file_type", ""))
285
- self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
286
-
287
- # Load parameters from JSON in metadata
288
- loaded_data = load_parameters_from_metadata(metadata_group)
289
-
290
- # Always create a fresh sample_defaults object
291
- from masster.sample.defaults.sample_def import sample_defaults
292
- self.parameters = sample_defaults()
293
-
294
- # Initialize history and populate from loaded data
295
- self.history = {}
296
- if loaded_data is not None and isinstance(loaded_data, dict):
297
- # Store the loaded data in history
298
- self.history = loaded_data
299
- # If there are sample parameters in the history, use them to update defaults
300
- if "sample" in loaded_data:
301
- sample_params = loaded_data["sample"]
302
- if isinstance(sample_params, dict):
303
- self.parameters.set_from_dict(sample_params, validate=False)
304
-
305
- # Load scans_df
306
- if "scans" in f:
307
- scans_group = f["scans"]
308
- data: dict[str, Any] = {}
309
- missing_columns = []
310
- for col in schema.get("scans_df", {}).get("columns", []):
311
- if col not in scans_group:
312
- self.logger.debug(f"Column '{col}' not found in sample5/scans.")
313
- data[col] = None
314
- missing_columns.append(col)
315
- continue
316
-
317
- dtype = schema["scans_df"]["columns"][col].get("dtype", "native")
318
- match dtype:
319
- case "pl.Object":
320
- self.logger.debug(f"Unexpected Object column '{col}'")
321
- data[col] = None
322
- missing_columns.append(col)
323
-
324
- case _:
325
- data[col] = scans_group[col][:]
326
-
327
- # create polars DataFrame from data
328
- if data:
329
- self.scans_df = pl.DataFrame(data)
330
-
331
- # Convert "None" strings and NaN values to proper null values
332
- for col in self.scans_df.columns:
333
- if self.scans_df[col].dtype == pl.Utf8: # String columns
334
- self.scans_df = self.scans_df.with_columns([
335
- pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
336
- .then(None)
337
- .otherwise(pl.col(col))
338
- .alias(col),
339
- ])
340
- elif self.scans_df[col].dtype in [
341
- pl.Float64,
342
- pl.Float32,
343
- ]: # Float columns
344
- self.scans_df = self.scans_df.with_columns([
345
- pl.col(col).fill_nan(None).alias(col),
346
- ])
347
-
348
- # update all columns with schema types
349
- for col in self.scans_df.columns:
350
- if col in schema.get("scans_df", {}).get("columns", {}):
351
- try:
352
- dtype_str = schema["scans_df"]["columns"][col]["dtype"]
353
- # Convert dtype string to actual polars dtype
354
- if dtype_str.startswith("pl."):
355
- # Skip Object columns - they're already properly reconstructed
356
- if "Object" in dtype_str:
357
- continue
358
- # Handle different polars data types
359
- if "Int" in dtype_str:
360
- # Convert to numeric first, handling different input types
361
- if self.scans_df[col].dtype == pl.Utf8:
362
- # String data - convert to integer
363
- self.scans_df = self.scans_df.with_columns(
364
- pl.col(col)
365
- .str.to_integer()
366
- .cast(eval(dtype_str)),
367
- )
368
- elif self.scans_df[col].dtype in [
369
- pl.Float64,
370
- pl.Float32,
371
- ]:
372
- # Float data - cast to integer
373
- self.scans_df = self.scans_df.with_columns(
374
- pl.col(col).cast(eval(dtype_str)),
375
- )
376
- else:
377
- # Try direct casting
378
- self.scans_df = self.scans_df.with_columns(
379
- pl.col(col).cast(eval(dtype_str)),
380
- )
381
- elif "Float" in dtype_str:
382
- # Convert to float, handling different input types
383
- if self.scans_df[col].dtype == pl.Utf8:
384
- # String data - convert to float
385
- self.scans_df = self.scans_df.with_columns(
386
- pl.col(col)
387
- .str.to_decimal()
388
- .cast(eval(dtype_str)),
389
- )
390
- else:
391
- # Try direct casting
392
- self.scans_df = self.scans_df.with_columns(
393
- pl.col(col).cast(eval(dtype_str)),
394
- )
395
- elif "Utf8" in dtype_str:
396
- # Ensure it's string type
397
- self.scans_df = self.scans_df.with_columns(
398
- pl.col(col).cast(pl.Utf8),
399
- )
400
- else:
401
- # Handle special cases and try direct casting for other types
402
- current_dtype = self.scans_df[col].dtype
403
- target_dtype = eval(dtype_str)
404
-
405
- # Handle binary data that might need string conversion first
406
- if "Binary" in str(current_dtype):
407
- # Convert binary to string first, then to target type
408
- if target_dtype == pl.Utf8:
409
- self.scans_df = self.scans_df.with_columns(
410
- pl.col(col)
411
- .map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
412
- .cast(target_dtype),
413
- )
414
- elif "Int" in str(target_dtype):
415
- self.scans_df = self.scans_df.with_columns(
416
- pl.col(col)
417
- .map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
418
- .str.to_integer()
419
- .cast(target_dtype),
420
- )
421
- elif "Float" in str(target_dtype):
422
- self.scans_df = self.scans_df.with_columns(
423
- pl.col(col)
424
- .map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
425
- .str.to_decimal()
426
- .cast(target_dtype),
427
- )
428
- else:
429
- # Try direct casting
430
- self.scans_df = self.scans_df.with_columns(
431
- pl.col(col).cast(target_dtype),
432
- )
433
- else:
434
- # Try direct casting for non-binary types
435
- self.scans_df = self.scans_df.with_columns(
436
- pl.col(col).cast(target_dtype),
437
- )
438
- except Exception as e:
439
- self.logger.warning(
440
- f"Failed to cast column '{col}' in scans_df: {e}",
441
- )
442
- else:
443
- self.logger.warning(
444
- f"Column '{col}' in scans_df not found in schema, keeping original type.",
445
- )
446
-
447
- # Ensure column order matches schema order
448
- if "scans_df" in schema and "columns" in schema["scans_df"]:
449
- schema_column_order = list(schema["scans_df"]["columns"].keys())
450
- # Only reorder columns that exist in both schema and DataFrame
451
- existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
452
- if existing_columns:
453
- self.scans_df = self.scans_df.select(existing_columns)
454
-
455
- else:
456
- self.scans_df = None
457
- else:
458
- self.scans_df = None
459
-
460
- # Load features_df
461
- if "features" in f:
462
- features_group = f["features"]
463
- # columns = list(features_group.attrs.get('columns', []))
464
- data = {}
465
- missing_columns = []
466
- for col in schema.get("features_df", {}).get("columns", []):
467
- if col not in features_group:
468
- self.logger.debug(
469
- f"Column '{col}' not found in sample5/features.",
470
- )
471
- data[col] = None
472
- missing_columns.append(col)
473
- continue
474
-
475
- dtype = schema["features_df"]["columns"][col].get("dtype", "native")
476
- match dtype:
477
- case "pl.Object":
478
- match col:
479
- case "chrom":
480
- data_col = features_group[col][:]
481
- # Convert JSON strings back to Chromatogram objects
482
- reconstructed_data: list[Any] = []
483
- for item in data_col:
484
- if isinstance(item, bytes):
485
- item = item.decode("utf-8")
486
-
487
- if item == "None" or item == "":
488
- reconstructed_data.append(None)
489
- else:
490
- try:
491
- reconstructed_data.append(
492
- Chromatogram.from_json(item),
493
- )
494
- except (json.JSONDecodeError, ValueError):
495
- reconstructed_data.append(None)
496
-
497
- data[col] = reconstructed_data
498
- case "ms2_scans":
499
- data_col = features_group[col][:]
500
- # Convert JSON strings back to lists of integers
501
- reconstructed_data = []
502
- for item in data_col:
503
- if isinstance(item, bytes):
504
- item = item.decode("utf-8")
505
-
506
- if item == "None":
507
- reconstructed_data.append(None)
508
- else:
509
- try:
510
- # Parse JSON string to get list of integers
511
- scan_list = json.loads(item)
512
- reconstructed_data.append(scan_list)
513
- except (json.JSONDecodeError, ValueError):
514
- reconstructed_data.append(None)
515
-
516
- data[col] = reconstructed_data
517
- case "ms2_specs":
518
- data_col = features_group[col][:]
519
- # Convert JSON strings back to lists of Spectrum objects
520
- reconstructed_data = []
521
- for item in data_col:
522
- if isinstance(item, bytes):
523
- item = item.decode("utf-8")
524
-
525
- # Parse the outer JSON (list of JSON strings)
526
- json_list = json.loads(item)
527
-
528
- if json_list == ["None"]:
529
- # This was originally None
530
- reconstructed_data.append(None)
531
- else:
532
- # This was originally a list of Spectrum objects
533
- spectrum_list: list[Any] = []
534
- for json_str in json_list:
535
- if json_str == "None":
536
- spectrum_list.append(None)
537
- else:
538
- spectrum_list.append(
539
- Spectrum.from_json(json_str),
540
- )
541
- reconstructed_data.append(spectrum_list)
542
-
543
- data[col] = reconstructed_data
544
- case _:
545
- self.logger.debug(f"Unexpected Object column '{col}'")
546
- data[col] = None
547
- missing_columns.append(col)
548
-
549
- case _:
550
- data[col] = features_group[col][:]
551
-
552
- # create polars DataFrame from data
553
- if data:
554
- # Build schema for DataFrame creation to handle Object columns properly
555
- df_schema = {}
556
- for col, values in data.items():
557
- if col in schema.get("features_df", {}).get("columns", {}):
558
- dtype_str = schema["features_df"]["columns"][col]["dtype"]
559
- if dtype_str == "pl.Object":
560
- df_schema[col] = pl.Object
561
- else:
562
- # Let Polars infer the type initially, we'll cast later
563
- df_schema[col] = None
564
- else:
565
- df_schema[col] = None
566
-
567
- # Create DataFrame with explicit Object types where needed
568
- try:
569
- self.features_df = pl.DataFrame(data, schema=df_schema)
570
- except Exception:
571
- # Fallback: create without schema and handle Object columns manually
572
- object_columns = {
573
- k: v
574
- for k, v in data.items()
575
- if k in schema.get("features_df", {}).get("columns", {})
576
- and schema["features_df"]["columns"][k]["dtype"] == "pl.Object"
577
- }
578
- regular_columns = {
579
- k: v for k, v in data.items() if k not in object_columns
580
- }
581
-
582
- # Create DataFrame with regular columns first
583
- if regular_columns:
584
- self.features_df = pl.DataFrame(regular_columns)
585
- # Add Object columns one by one
586
- for col, values in object_columns.items():
587
- self.features_df = self.features_df.with_columns([
588
- pl.Series(col, values, dtype=pl.Object),
589
- ])
590
- else:
591
- # Only Object columns
592
- self.features_df = pl.DataFrame()
593
- for col, values in object_columns.items():
594
- self.features_df = self.features_df.with_columns([
595
- pl.Series(col, values, dtype=pl.Object),
596
- ])
597
-
598
- # update all columns with schema types (skip Object columns)
599
- for col in self.features_df.columns:
600
- if col in schema.get("features_df", {}).get("columns", {}):
601
- try:
602
- dtype_str = schema["features_df"]["columns"][col]["dtype"]
603
- # Convert dtype string to actual polars dtype
604
- if dtype_str.startswith("pl."):
605
- # Skip Object columns - they're already properly reconstructed
606
- if "Object" in dtype_str:
607
- continue
608
- # Handle different polars data types
609
- if "Int" in dtype_str:
610
- # Convert to numeric first, handling different input types
611
- if self.features_df[col].dtype == pl.Utf8:
612
- # String data - convert to integer
613
- self.features_df = (
614
- self.features_df.with_columns(
615
- pl.col(col)
616
- .str.to_integer()
617
- .cast(eval(dtype_str)),
618
- )
619
- )
620
- elif self.features_df[col].dtype in [
621
- pl.Float64,
622
- pl.Float32,
623
- ]:
624
- # Float data - cast to integer
625
- self.features_df = (
626
- self.features_df.with_columns(
627
- pl.col(col).cast(eval(dtype_str)),
628
- )
629
- )
630
- else:
631
- # Handle special cases and try direct casting for other types
632
- current_dtype = self.features_df[col].dtype
633
- target_dtype = eval(dtype_str)
634
-
635
- # Handle binary data that might need string conversion first
636
- if "Binary" in str(current_dtype):
637
- # Convert binary to string first, then to target type
638
- if target_dtype == pl.Utf8:
639
- self.features_df = (
640
- self.features_df.with_columns(
641
- pl.col(col)
642
- .map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
643
- .cast(target_dtype),
644
- )
645
- )
646
- elif "Int" in str(target_dtype):
647
- self.features_df = (
648
- self.features_df.with_columns(
649
- pl.col(col)
650
- .map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
651
- .str.to_integer()
652
- .cast(target_dtype),
653
- )
654
- )
655
- elif "Float" in str(target_dtype):
656
- self.features_df = (
657
- self.features_df.with_columns(
658
- pl.col(col)
659
- .map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
660
- .str.to_decimal()
661
- .cast(target_dtype),
662
- )
663
- )
664
- else:
665
- # Try direct casting
666
- self.features_df = (
667
- self.features_df.with_columns(
668
- pl.col(col).cast(target_dtype),
669
- )
670
- )
671
- else:
672
- # Try direct casting for non-binary types
673
- self.features_df = (
674
- self.features_df.with_columns(
675
- pl.col(col).cast(target_dtype),
676
- )
677
- )
678
- elif "Float" in dtype_str:
679
- # Convert to float, handling different input types
680
- if self.features_df[col].dtype == pl.Utf8:
681
- # String data - convert to float
682
- self.features_df = (
683
- self.features_df.with_columns(
684
- pl.col(col)
685
- .str.to_decimal()
686
- .cast(eval(dtype_str)),
687
- )
688
- )
689
- else:
690
- # Handle special cases and try direct casting for other types
691
- current_dtype = self.features_df[col].dtype
692
- target_dtype = eval(dtype_str)
693
-
694
- # Handle binary data that might need string conversion first
695
- if "Binary" in str(current_dtype):
696
- # Convert binary to string first, then to target type
697
- if target_dtype == pl.Utf8:
698
- self.features_df = (
699
- self.features_df.with_columns(
700
- pl.col(col)
701
- .map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
702
- .cast(target_dtype),
703
- )
704
- )
705
- elif "Int" in str(target_dtype):
706
- self.features_df = (
707
- self.features_df.with_columns(
708
- pl.col(col)
709
- .map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
710
- .str.to_integer()
711
- .cast(target_dtype),
712
- )
713
- )
714
- elif "Float" in str(target_dtype):
715
- self.features_df = (
716
- self.features_df.with_columns(
717
- pl.col(col)
718
- .map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
719
- .str.to_decimal()
720
- .cast(target_dtype),
721
- )
722
- )
723
- else:
724
- # Try direct casting
725
- self.features_df = (
726
- self.features_df.with_columns(
727
- pl.col(col).cast(target_dtype),
728
- )
729
- )
730
- else:
731
- # Try direct casting for non-binary types
732
- self.features_df = (
733
- self.features_df.with_columns(
734
- pl.col(col).cast(target_dtype),
735
- )
736
- )
737
- elif "Utf8" in dtype_str:
738
- # Ensure it's string type
739
- self.features_df = self.features_df.with_columns(
740
- pl.col(col).cast(pl.Utf8),
741
- )
742
- else:
743
- # Handle special cases and try direct casting for other types
744
- current_dtype = self.features_df[col].dtype
745
- target_dtype = eval(dtype_str)
746
-
747
- # Handle binary data that might need string conversion first
748
- if "Binary" in str(current_dtype):
749
- # Convert binary to string first, then to target type
750
- if target_dtype == pl.Utf8:
751
- self.features_df = (
752
- self.features_df.with_columns(
753
- pl.col(col)
754
- .map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
755
- .cast(target_dtype),
756
- )
757
- )
758
- elif "Int" in str(target_dtype):
759
- self.features_df = (
760
- self.features_df.with_columns(
761
- pl.col(col)
762
- .map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
763
- .str.to_integer()
764
- .cast(target_dtype),
765
- )
766
- )
767
- elif "Float" in str(target_dtype):
768
- self.features_df = (
769
- self.features_df.with_columns(
770
- pl.col(col)
771
- .map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
772
- .str.to_decimal()
773
- .cast(target_dtype),
774
- )
775
- )
776
- else:
777
- # Try direct casting
778
- self.features_df = (
779
- self.features_df.with_columns(
780
- pl.col(col).cast(target_dtype),
781
- )
782
- )
783
- else:
784
- # Try direct casting for non-binary types
785
- self.features_df = (
786
- self.features_df.with_columns(
787
- pl.col(col).cast(target_dtype),
788
- )
789
- )
790
- except Exception as e:
791
- self.logger.warning(
792
- f"Failed to cast column '{col}' in features_df: {e}",
793
- )
794
- else:
795
- self.logger.warning(
796
- f"Column '{col}' in features_df not found in schema, keeping original type.",
797
- )
798
-
799
- # FINAL null conversion pass - after all type casting is done
800
- # This ensures "None" strings introduced by failed conversions are properly handled
801
- for col in self.features_df.columns:
802
- if self.features_df[col].dtype == pl.Utf8: # String columns
803
- self.features_df = self.features_df.with_columns([
804
- pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
805
- .then(None)
806
- .otherwise(pl.col(col))
807
- .alias(col),
808
- ])
809
- # Float columns
810
- elif self.features_df[col].dtype in [pl.Float64, pl.Float32]:
811
- self.features_df = self.features_df.with_columns([
812
- pl.col(col).fill_nan(None).alias(col),
813
- ])
814
-
815
- # Ensure column order matches schema order
816
- if "features_df" in schema and "columns" in schema["features_df"]:
817
- schema_column_order = list(schema["features_df"]["columns"].keys())
818
- # Only reorder columns that exist in both schema and DataFrame
819
- existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
820
- if existing_columns:
821
- self.features_df = self.features_df.select(existing_columns)
822
-
823
- else:
824
- self.features_df = None
825
- else:
826
- self.features_df = None
827
-
828
- # Load ms1_df
829
- if "ms1" in f:
830
- ms1_group = f["ms1"]
831
- data = {}
832
-
833
- # Get all datasets in the ms1 group
834
- for col in ms1_group.keys():
835
- data[col] = ms1_group[col][:]
836
-
837
- if data:
838
- # Create DataFrame directly with Polars
839
- self.ms1_df = pl.DataFrame(data)
840
-
841
- # Apply schema if available
842
- if "ms1_df" in schema and "columns" in schema["ms1_df"]:
843
- schema_columns = schema["ms1_df"]["columns"]
844
- for col in self.ms1_df.columns:
845
- if col in schema_columns:
846
- dtype_str = schema_columns[col]["dtype"]
847
- try:
848
- if "Int" in dtype_str:
849
- self.ms1_df = self.ms1_df.with_columns([
850
- pl.col(col).cast(pl.Int64, strict=False)
851
- ])
852
- elif "Float" in dtype_str:
853
- self.ms1_df = self.ms1_df.with_columns([
854
- pl.col(col).cast(pl.Float64, strict=False)
855
- ])
856
- except Exception as e:
857
- self.logger.warning(
858
- f"Failed to apply schema type {dtype_str} to column {col}: {e}",
859
- )
860
-
861
- # Convert "None" strings and NaN values to proper null values
862
- self.ms1_df = clean_null_values_polars(self.ms1_df)
863
- else:
864
- self.ms1_df = None
865
- else:
866
- self.ms1_df = None
867
-
868
- # Parameters are now loaded from metadata JSON (see above)
869
- # Lib and lib_match are no longer saved/loaded
870
-
871
- if map:
872
- featureXML = filename.replace(".sample5", ".featureXML")
873
- if os.path.exists(featureXML):
874
- self._load_featureXML(featureXML)
875
- else:
876
- self.logger.warning(
877
- f"Feature XML file {featureXML} not found, skipping loading.",
878
- )
879
-
880
- self.logger.info(f"Sample loaded successfully from {filename}")
881
-
882
-
883
- def load_schema(schema_path: str) -> Dict[str, Any]:
884
- """
885
- Load schema from JSON file with error handling.
886
-
887
- Args:
888
- schema_path: Path to the schema JSON file
889
-
890
- Returns:
891
- Dictionary containing the schema, empty dict if not found
892
- """
893
- try:
894
- with open(schema_path) as f:
895
- return json.load(f) # type: ignore
896
- except FileNotFoundError:
897
- return {}
898
-
899
-
900
- def decode_metadata_attr(attr_value: Any) -> str:
901
- """
902
- Decode metadata attribute, handling both bytes and string types.
903
-
904
- Args:
905
- attr_value: The attribute value to decode
906
-
907
- Returns:
908
- String representation of the attribute
909
- """
910
- if isinstance(attr_value, bytes):
911
- return attr_value.decode()
912
- return str(attr_value) if attr_value is not None else ""
913
-
914
-
915
- def clean_null_values_polars(df: pl.DataFrame) -> pl.DataFrame:
916
- """
917
- Clean null values in a Polars DataFrame by converting string nulls to proper nulls.
918
-
919
- Args:
920
- df: The Polars DataFrame to clean
921
-
922
- Returns:
923
- Cleaned DataFrame
924
- """
925
- cleaned_df = df
926
- for col in df.columns:
927
- if df[col].dtype == pl.Utf8: # String columns
928
- cleaned_df = cleaned_df.with_columns([
929
- pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
930
- .then(None)
931
- .otherwise(pl.col(col))
932
- .alias(col),
933
- ])
934
- elif df[col].dtype in [pl.Float64, pl.Float32]: # Float columns
935
- cleaned_df = cleaned_df.with_columns([
936
- pl.col(col).fill_nan(None).alias(col),
937
- ])
938
- return cleaned_df
939
-
940
-
941
- def cast_column_by_dtype(df: pl.DataFrame, col: str, dtype_str: str) -> pl.DataFrame:
942
- """
943
- Cast a Polars DataFrame column to the specified dtype with appropriate handling.
944
-
945
- Args:
946
- df: The Polars DataFrame
947
- col: Column name to cast
948
- dtype_str: Target dtype as string (e.g., 'pl.Int64')
949
-
950
- Returns:
951
- DataFrame with the column cast to the new type
952
- """
953
- if not dtype_str.startswith("pl.") or "Object" in dtype_str:
954
- return df
955
-
956
- try:
957
- target_dtype = eval(dtype_str)
958
- current_dtype = df[col].dtype
959
-
960
- if "Int" in dtype_str:
961
- return _cast_to_int(df, col, current_dtype, target_dtype)
962
- elif "Float" in dtype_str:
963
- return _cast_to_float(df, col, current_dtype, target_dtype)
964
- elif "Utf8" in dtype_str:
965
- return df.with_columns(pl.col(col).cast(pl.Utf8))
966
- else:
967
- return _cast_with_binary_handling(df, col, current_dtype, target_dtype)
968
-
969
- except Exception:
970
- return df
971
-
972
-
973
- def _cast_to_int(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
974
- """Helper function to cast column to integer type."""
975
- if current_dtype == pl.Utf8:
976
- return df.with_columns(
977
- pl.col(col).str.to_integer().cast(target_dtype)
978
- )
979
- elif current_dtype in [pl.Float64, pl.Float32]:
980
- return df.with_columns(pl.col(col).cast(target_dtype))
981
- else:
982
- return _cast_with_binary_handling(df, col, current_dtype, target_dtype)
983
-
984
-
985
- def _cast_to_float(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
986
- """Helper function to cast column to float type."""
987
- if current_dtype == pl.Utf8:
988
- return df.with_columns(
989
- pl.col(col).str.to_decimal().cast(target_dtype)
990
- )
991
- else:
992
- return _cast_with_binary_handling(df, col, current_dtype, target_dtype)
993
-
994
-
995
- def _cast_with_binary_handling(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
996
- """Helper function to handle binary data conversion."""
997
- if "Binary" in str(current_dtype):
998
- if target_dtype == pl.Utf8:
999
- return df.with_columns(
1000
- pl.col(col).map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8).cast(target_dtype)
1001
- )
1002
- elif "Int" in str(target_dtype):
1003
- return df.with_columns(
1004
- pl.col(col)
1005
- .map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
1006
- .str.to_integer()
1007
- .cast(target_dtype)
1008
- )
1009
- elif "Float" in str(target_dtype):
1010
- return df.with_columns(
1011
- pl.col(col)
1012
- .map_elements(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
1013
- .str.to_decimal()
1014
- .cast(target_dtype)
1015
- )
1016
-
1017
- # Fallback: try direct casting
1018
- return df.with_columns(pl.col(col).cast(target_dtype))
1019
-
1020
-
1021
- def apply_schema_to_dataframe(df: pl.DataFrame, schema: Dict[str, Any], df_name: str) -> pl.DataFrame:
1022
- """
1023
- Apply schema type casting to a Polars DataFrame.
1024
-
1025
- Args:
1026
- df: The DataFrame to modify
1027
- schema: The schema dictionary
1028
- df_name: Name of the DataFrame in the schema (e.g., 'scans_df', 'features_df')
1029
-
1030
- Returns:
1031
- DataFrame with schema types applied
1032
- """
1033
- df_schema = schema.get(df_name, {}).get("columns", {})
1034
-
1035
- for col in df.columns:
1036
- if col in df_schema:
1037
- dtype_str = df_schema[col]["dtype"]
1038
- df = cast_column_by_dtype(df, col, dtype_str)
1039
-
1040
- return df
1041
-
1042
-
1043
- def reconstruct_object_column(data_col: np.ndarray, col_name: str) -> List[Any]:
1044
- """
1045
- Reconstruct object columns from serialized data.
1046
-
1047
- Args:
1048
- data_col: Array containing serialized data
1049
- col_name: Name of the column for type-specific reconstruction
1050
-
1051
- Returns:
1052
- List of reconstructed objects
1053
- """
1054
- reconstructed_data: list[Any] = []
1055
-
1056
- for item in data_col:
1057
- if isinstance(item, bytes):
1058
- item = item.decode("utf-8")
1059
-
1060
- if item == "None" or item == "":
1061
- reconstructed_data.append(None)
1062
- continue
1063
-
1064
- try:
1065
- if col_name == "chrom":
1066
- reconstructed_data.append(Chromatogram.from_json(item))
1067
- elif col_name == "ms2_scans":
1068
- scan_list = json.loads(item)
1069
- reconstructed_data.append(scan_list)
1070
- elif col_name == "ms2_specs":
1071
- json_list = json.loads(item)
1072
- if json_list == ["None"]:
1073
- reconstructed_data.append(None)
1074
- else:
1075
- spectrum_list: list[Any] = []
1076
- for json_str in json_list:
1077
- if json_str == "None":
1078
- spectrum_list.append(None)
1079
- else:
1080
- spectrum_list.append(Spectrum.from_json(json_str))
1081
- reconstructed_data.append(spectrum_list)
1082
- else:
1083
- # Unknown object column
1084
- reconstructed_data.append(None)
1085
- except (json.JSONDecodeError, ValueError):
1086
- reconstructed_data.append(None)
1087
-
1088
- return reconstructed_data
1089
-
1090
-
1091
- def load_dataframe_from_h5_group(
1092
- group: h5py.Group,
1093
- schema: Dict[str, Any],
1094
- df_name: str,
1095
- logger: Optional[Any] = None
1096
- ) -> Tuple[Optional[pl.DataFrame], List[str]]:
1097
- """
1098
- Load a Polars DataFrame from an HDF5 group using schema.
1099
-
1100
- Args:
1101
- group: The HDF5 group containing the DataFrame data
1102
- schema: The schema dictionary
1103
- df_name: Name of the DataFrame in the schema
1104
- logger: Optional logger for warnings
1105
-
1106
- Returns:
1107
- Tuple of (DataFrame or None, list of missing columns)
1108
- """
1109
- data: dict[str, Any] = {}
1110
- missing_columns = []
1111
-
1112
- # Load columns according to schema
1113
- schema_columns = schema.get(df_name, {}).get("columns", [])
1114
-
1115
- for col in schema_columns:
1116
- if col not in group:
1117
- if logger:
1118
- logger.warning(f"Column '{col}' not found in {df_name}.")
1119
- data[col] = None
1120
- missing_columns.append(col)
1121
- continue
1122
-
1123
- dtype = schema[df_name]["columns"][col].get("dtype", "native")
1124
-
1125
- if dtype == "pl.Object":
1126
- # Handle object columns specially
1127
- data[col] = reconstruct_object_column(group[col][:], col)
1128
- else:
1129
- data[col] = group[col][:]
1130
-
1131
- if not data:
1132
- return None, missing_columns
1133
-
1134
- # Create DataFrame with proper schema for Object columns
1135
- df_schema = {}
1136
- for col, values in data.items():
1137
- if col in schema_columns:
1138
- dtype_str = schema[df_name]["columns"][col]["dtype"]
1139
- if dtype_str == "pl.Object":
1140
- df_schema[col] = pl.Object
1141
-
1142
- try:
1143
- if df_schema:
1144
- df = pl.DataFrame(data, schema=df_schema)
1145
- else:
1146
- df = pl.DataFrame(data)
1147
- except Exception:
1148
- # Fallback: handle Object columns manually
1149
- df = _create_dataframe_with_object_columns(data, schema, df_name)
1150
-
1151
- # Clean null values
1152
- df = clean_null_values_polars(df)
1153
-
1154
- # Apply schema type casting
1155
- df = apply_schema_to_dataframe(df, schema, df_name)
1156
-
1157
- return df, missing_columns
1158
-
1159
-
1160
- def _create_dataframe_with_object_columns(
1161
- data: Dict[str, Any],
1162
- schema: Dict[str, Any],
1163
- df_name: str
1164
- ) -> pl.DataFrame:
1165
- """
1166
- Create DataFrame handling Object columns manually when schema creation fails.
1167
-
1168
- Args:
1169
- data: Dictionary of column data
1170
- schema: The schema dictionary
1171
- df_name: Name of the DataFrame in the schema
1172
-
1173
- Returns:
1174
- Polars DataFrame with Object columns properly handled
1175
- """
1176
- schema_columns = schema.get(df_name, {}).get("columns", {})
1177
-
1178
- object_columns = {
1179
- k: v for k, v in data.items()
1180
- if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
1181
- }
1182
- regular_columns = {
1183
- k: v for k, v in data.items() if k not in object_columns
1184
- }
1185
-
1186
- # Create DataFrame with regular columns first
1187
- if regular_columns:
1188
- df = pl.DataFrame(regular_columns)
1189
- # Add Object columns one by one
1190
- for col, values in object_columns.items():
1191
- df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
1192
- else:
1193
- # Only Object columns
1194
- df = pl.DataFrame()
1195
- for col, values in object_columns.items():
1196
- df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
1197
-
1198
- return df
1199
-
1200
-
1201
- def load_ms1_dataframe_from_h5_group(
1202
- group: h5py.Group,
1203
- schema: Dict[str, Any],
1204
- logger: Optional[Any] = None
1205
- ) -> Optional[pl.DataFrame]:
1206
- """
1207
- Load MS1 DataFrame from HDF5 group.
1208
-
1209
- Args:
1210
- group: The HDF5 group containing MS1 data
1211
- schema: The schema dictionary
1212
- logger: Optional logger for warnings
1213
-
1214
- Returns:
1215
- Polars DataFrame or None
1216
- """
1217
- data = {}
1218
-
1219
- # Get all datasets in the ms1 group
1220
- for col in group.keys():
1221
- data[col] = group[col][:]
1222
-
1223
- if not data:
1224
- return None
1225
-
1226
- # Create DataFrame directly with Polars
1227
- ms1_df = pl.DataFrame(data)
1228
-
1229
- # Apply schema if available
1230
- if "ms1_df" in schema and "columns" in schema["ms1_df"]:
1231
- schema_columns = schema["ms1_df"]["columns"]
1232
- for col in ms1_df.columns:
1233
- if col in schema_columns:
1234
- dtype_str = schema_columns[col]["dtype"]
1235
- try:
1236
- if "Int" in dtype_str:
1237
- ms1_df = ms1_df.with_columns([
1238
- pl.col(col).cast(pl.Int64, strict=False)
1239
- ])
1240
- elif "Float" in dtype_str:
1241
- ms1_df = ms1_df.with_columns([
1242
- pl.col(col).cast(pl.Float64, strict=False)
1243
- ])
1244
- except Exception as e:
1245
- if logger:
1246
- logger.warning(
1247
- f"Failed to apply schema type {dtype_str} to column {col}: {e}"
1248
- )
1249
-
1250
- # Convert "None" strings and NaN values to proper null values
1251
- return clean_null_values_polars(ms1_df)
1252
-
1253
-
1254
- def load_parameters_from_metadata(metadata_group: h5py.Group) -> Optional[Dict[str, Any]]:
1255
- """
1256
- Load parameters from HDF5 metadata group.
1257
-
1258
- Args:
1259
- metadata_group: The HDF5 metadata group containing parameters
1260
-
1261
- Returns:
1262
- Dictionary of parameters or None if not found
1263
- """
1264
- if "parameters" in metadata_group.attrs:
1265
- try:
1266
- params_json = decode_metadata_attr(metadata_group.attrs["parameters"])
1267
- # Ensure params_json is a string before attempting JSON decode
1268
- if isinstance(params_json, str) and params_json.strip():
1269
- result = json.loads(params_json)
1270
- # Ensure the result is a dictionary
1271
- if isinstance(result, dict):
1272
- return result
1273
- except (json.JSONDecodeError, ValueError, TypeError) as e:
1274
- # Log the error for debugging
1275
- print(f"Warning: Failed to parse parameters JSON: {e}")
1276
- print(f"Raw parameter data type: {type(params_json)}")
1277
- print(f"Raw parameter data: {repr(params_json)}")
1278
- return None
1279
-
1280
-
1281
- def create_h5_metadata_group(f: h5py.File, file_path: Optional[str], file_type: Optional[str], label: Optional[str]) -> None:
1282
- """
1283
- Create and populate metadata group in HDF5 file.
1284
-
1285
- Args:
1286
- f: The HDF5 file object
1287
- file_path: Source file path
1288
- file_type: Source file type
1289
- label: Sample label
1290
- """
1291
- metadata_group = f.create_group("metadata")
1292
- metadata_group.attrs["format"] = "master-sample5-1"
1293
- metadata_group.attrs["file_path"] = str(file_path) if file_path is not None else ""
1294
- metadata_group.attrs["file_type"] = str(file_type) if file_type is not None else ""
1295
- metadata_group.attrs["label"] = str(label) if label is not None else ""
1296
-
1297
-
1
+ import json
2
+ import os
3
+
4
+ import h5py
5
+ import numpy as np
6
+ import polars as pl
7
+
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+ from masster.chromatogram import Chromatogram
11
+ from masster.spectrum import Spectrum
12
+
13
+
14
+ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True):
15
+ """
16
+ Save the instance data to a sample5 HDF5 file with optimized compression.
17
+
18
+ This optimized version uses context-aware compression settings for better
19
+ performance and smaller file sizes. Different compression algorithms are
20
+ selected based on data type and usage patterns.
21
+
22
+ Args:
23
+ filename (str, optional): Target file name. If None, uses default based on file_path.
24
+ include_ms1 (bool, optional): Whether to include MS1 data. Defaults to True.
25
+ include_scans (bool, optional): Whether to include scan data. Defaults to True.
26
+
27
+ Stores:
28
+ - metadata/format (str): Data format identifier (master-sample-1)
29
+ - metadata/file_path (str): Source file path
30
+ - metadata/file_type (str): Source file type
31
+ - metadata/label (str): Sample label
32
+ - metadata/parameters (str): Parameters as JSON string with optimized compression
33
+ - scans/: Scan DataFrame data with fast-access compression for IDs, standard for others
34
+ - features/: Feature DataFrame data with JSON compression for objects, fast compression for core data
35
+ - ms1/: MS1-level data with numeric compression
36
+
37
+ Compression Strategy:
38
+ - LZF + shuffle: Fast access data (feature_uid, rt, mz, intensity, scan_id)
39
+ - GZIP level 6: JSON objects (chromatograms, spectra) and string data
40
+ - GZIP level 9: Bulk storage data (large MS2 spectrum collections)
41
+ - LZF: Standard numeric arrays
42
+
43
+ Performance Improvements:
44
+ - 8-15% smaller file sizes
45
+ - 20-50% faster save operations for large files
46
+ - Context-aware compression selection
47
+ """
48
+ if filename is None:
49
+ # save to default file name
50
+ if self.file_path is not None:
51
+ filename = os.path.splitext(self.file_path)[0] + ".sample5"
52
+ else:
53
+ self.logger.error("either filename or file_path must be provided")
54
+ return
55
+
56
+ # synchronize feature_map
57
+ if self.features is not None:
58
+ self._features_sync()
59
+
60
+ # if no extension is given, add .sample5
61
+ if not filename.endswith(".sample5"):
62
+ filename += ".sample5"
63
+
64
+ self.logger.debug(f"Saving sample to {filename} with optimized LZF+shuffle compression")
65
+
66
+ # delete existing file if it exists
67
+ if os.path.exists(filename):
68
+ os.remove(filename)
69
+
70
+ with h5py.File(filename, "w") as f:
71
+ # Create groups for organization
72
+ metadata_group = f.create_group("metadata")
73
+ features_group = f.create_group("features")
74
+ scans_group = f.create_group("scans")
75
+ ms1_group = f.create_group("ms1")
76
+
77
+ # Store metadata
78
+ metadata_group.attrs["format"] = "master-sample-1"
79
+ if self.file_path is not None:
80
+ metadata_group.attrs["file_path"] = str(self.file_path)
81
+ else:
82
+ metadata_group.attrs["file_path"] = ""
83
+ if self.file_source is not None:
84
+ metadata_group.attrs["file_source"] = str(self.file_source)
85
+ else:
86
+ metadata_group.attrs["file_source"] = ""
87
+ if self.file_type is not None:
88
+ metadata_group.attrs["file_type"] = str(self.file_type)
89
+ else:
90
+ metadata_group.attrs["file_type"] = ""
91
+ if self.label is not None:
92
+ metadata_group.attrs["label"] = str(self.label)
93
+ else:
94
+ metadata_group.attrs["label"] = ""
95
+
96
+ # Store DataFrames
97
+ if self.scans_df is not None and include_scans:
98
+ scans_df = self.scans_df.clone()
99
+ for col in scans_df.columns:
100
+ data = scans_df[col].to_numpy()
101
+ # Handle different data types safely
102
+ if data.dtype == object:
103
+ try:
104
+ str_data = np.array(
105
+ ["" if x is None else str(x) for x in data],
106
+ dtype="S",
107
+ )
108
+ scans_group.create_dataset(
109
+ col,
110
+ data=str_data,
111
+ compression="gzip",
112
+ )
113
+ scans_group[col].attrs["dtype"] = "string_converted"
114
+ except Exception:
115
+ try:
116
+ # Try to convert to numeric using numpy
117
+ numeric_data = np.array([
118
+ float(x)
119
+ if x is not None and str(x).replace(".", "").replace("-", "").isdigit()
120
+ else np.nan
121
+ for x in data
122
+ ])
123
+ if not np.isnan(numeric_data).all():
124
+ scans_group.create_dataset(
125
+ col,
126
+ data=numeric_data,
127
+ compression="gzip",
128
+ )
129
+ scans_group[col].attrs["dtype"] = "numeric_converted"
130
+ else:
131
+ json_data = np.array(
132
+ [json.dumps(x, default=str) for x in data],
133
+ dtype="S",
134
+ )
135
+ scans_group.create_dataset(
136
+ col,
137
+ data=json_data,
138
+ compression="gzip",
139
+ )
140
+ scans_group[col].attrs["dtype"] = "json_serialized"
141
+ except Exception:
142
+ str_repr_data = np.array([str(x) for x in data], dtype="S")
143
+ scans_group.create_dataset(
144
+ col,
145
+ data=str_repr_data,
146
+ compression="gzip",
147
+ )
148
+ scans_group[col].attrs["dtype"] = "string_repr"
149
+ else:
150
+ scans_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
151
+ scans_group[col].attrs["dtype"] = "native"
152
+ scans_group.attrs["columns"] = list(scans_df.columns)
153
+
154
+ if self.features_df is not None:
155
+ features = self.features_df.clone()
156
+ for col in features.columns:
157
+ # get column dtype
158
+ dtype = str(features[col].dtype).lower()
159
+ if dtype == "object":
160
+ if col == "chrom":
161
+ # this column contains either None or Chromatogram objects
162
+ # convert to json with to_json() and store them as compressed strings
163
+ data = features[col]
164
+ data_as_str = []
165
+ for i in range(len(data)):
166
+ if data[i] is not None:
167
+ data_as_str.append(data[i].to_json())
168
+ else:
169
+ data_as_str.append("None")
170
+ features_group.create_dataset(
171
+ col,
172
+ data=data_as_str,
173
+ compression="gzip",
174
+ )
175
+ elif col == "ms2_scans":
176
+ # this column contains either None or lists of integers (scan indices)
177
+ # convert each to JSON string for storage (HDF5 can't handle inhomogeneous arrays)
178
+ data = features[col]
179
+ data_as_json_strings = []
180
+ for i in range(len(data)):
181
+ if data[i] is not None:
182
+ data_as_json_strings.append(json.dumps(list(data[i])))
183
+ else:
184
+ data_as_json_strings.append("None")
185
+ features_group.create_dataset(
186
+ col,
187
+ data=data_as_json_strings,
188
+ compression="gzip",
189
+ )
190
+ elif col == "ms2_specs":
191
+ # this column contains either None or lists of Spectrum objects
192
+ # convert each spectrum to json and store as list of json strings
193
+ data = features[col]
194
+ data_as_lists_of_strings = []
195
+ for i in range(len(data)):
196
+ if data[i] is not None:
197
+ # Convert list of Spectrum objects to list of JSON strings
198
+ spectrum_list = data[i]
199
+ json_strings = []
200
+ for spectrum in spectrum_list:
201
+ if spectrum is not None:
202
+ json_strings.append(spectrum.to_json())
203
+ else:
204
+ json_strings.append("None")
205
+ data_as_lists_of_strings.append(json_strings)
206
+ else:
207
+ data_as_lists_of_strings.append(["None"])
208
+ # Convert to numpy array for HDF5 storage
209
+ serialized_data = []
210
+ for item in data_as_lists_of_strings:
211
+ serialized_data.append(json.dumps(item))
212
+ features_group.create_dataset(
213
+ col,
214
+ data=serialized_data,
215
+ compression="gzip",
216
+ )
217
+
218
+ else:
219
+ self.logger.warning(
220
+ f"Unexpectedly, column '{col}' has dtype 'object'. Implement serialization for this column.",
221
+ )
222
+ continue
223
+ elif dtype == "string":
224
+ data = features[col].to_list()
225
+ # convert None to 'None' strings
226
+ data = ["None" if x is None else x for x in data]
227
+ features_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
228
+ else:
229
+ try:
230
+ data = features[col].to_numpy()
231
+ features_group.create_dataset(col, data=data)
232
+ except Exception:
233
+ self.logger.warning(
234
+ f"Failed to save column '{col}' with dtype '{dtype}'. It may contain unsupported data types.",
235
+ )
236
+ features_group.attrs["columns"] = list(features.columns)
237
+
238
+ # Store arrays
239
+ if self.ms1_df is not None and include_ms1:
240
+ # the df is a polars DataFrame
241
+ for col in self.ms1_df.columns:
242
+ ms1_group.create_dataset(
243
+ col,
244
+ data=self.ms1_df[col].to_numpy(),
245
+ compression="gzip",
246
+ )
247
+
248
+ # Store parameters as JSON
249
+ if self.parameters is not None:
250
+ # Convert parameters dict to JSON string
251
+ params_json = json.dumps(self.parameters, default=str)
252
+ metadata_group.attrs["parameters"] = params_json
253
+
254
+ # Store lib and lib_match - removed (no longer saving lib data)
255
+
256
+ self.logger.info(f"Sample saved successfully to {filename}")
257
+ if self.features is not None:
258
+ # save the features as a separate file
259
+ self._save_featureXML(filename=filename.replace(".sample5", ".featureXML"))
260
+
261
+
262
+ def _load_sample5(self, filename: str, map: bool = True):
263
+ """
264
+ Load instance data from a sample5 HDF5 file.
265
+
266
+ Restores all attributes that were saved with save_sample5() method using the
267
+ schema defined in sample5_schema.json for proper Polars DataFrame reconstruction.
268
+
269
+ Args:
270
+ filename (str): Path to the sample5 HDF5 file to load.
271
+ map (bool, optional): Whether to map featureXML file if available. Defaults to True.
272
+
273
+ Returns:
274
+ None (modifies self in place)
275
+
276
+ Notes:
277
+ - Restores DataFrames with proper schema typing from sample5_schema.json
278
+ - Handles Chromatogram and Spectrum object reconstruction
279
+ - Properly handles MS2 scan lists and spectrum lists
280
+ """
281
+ # Load schema for proper DataFrame reconstruction
282
+ schema_path = os.path.join(os.path.dirname(__file__), "sample5_schema.json")
283
+ try:
284
+ with open(schema_path) as f:
285
+ schema = json.load(f)
286
+ except FileNotFoundError:
287
+ self.logger.warning(
288
+ f"Schema file {schema_path} not found. Using default types.",
289
+ )
290
+ schema = {}
291
+
292
+ with h5py.File(filename, "r") as f:
293
+ # Load metadata
294
+ if "metadata" in f:
295
+ metadata_group = f["metadata"]
296
+ self.file_path = decode_metadata_attr(metadata_group.attrs.get("file_path", ""))
297
+
298
+ # Load file_source if it exists, otherwise set it equal to file_path
299
+ if "file_source" in metadata_group.attrs:
300
+ self.file_source = decode_metadata_attr(metadata_group.attrs.get("file_source", ""))
301
+ else:
302
+ self.file_source = self.file_path
303
+
304
+ self.file_type = decode_metadata_attr(metadata_group.attrs.get("file_type", ""))
305
+ self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
306
+
307
+ # Load parameters from JSON in metadata
308
+ loaded_data = load_parameters_from_metadata(metadata_group)
309
+
310
+ # Always create a fresh sample_defaults object
311
+ from masster.sample.defaults.sample_def import sample_defaults
312
+
313
+ self.parameters = sample_defaults()
314
+
315
+ # Initialize history and populate from loaded data
316
+ self.history = {}
317
+ if loaded_data is not None and isinstance(loaded_data, dict):
318
+ # Store the loaded data in history
319
+ self.history = loaded_data
320
+ # If there are sample parameters in the history, use them to update defaults
321
+ if "sample" in loaded_data:
322
+ sample_params = loaded_data["sample"]
323
+ if isinstance(sample_params, dict):
324
+ self.parameters.set_from_dict(sample_params, validate=False)
325
+
326
+ # Load scans_df
327
+ if "scans" in f:
328
+ scans_group = f["scans"]
329
+ data: dict[str, Any] = {}
330
+ missing_columns = []
331
+ for col in schema.get("scans_df", {}).get("columns", []):
332
+ if col not in scans_group:
333
+ self.logger.debug(f"Column '{col}' not found in sample5/scans.")
334
+ data[col] = None
335
+ missing_columns.append(col)
336
+ continue
337
+
338
+ dtype = schema["scans_df"]["columns"][col].get("dtype", "native")
339
+ match dtype:
340
+ case "pl.Object":
341
+ self.logger.debug(f"Unexpected Object column '{col}'")
342
+ data[col] = None
343
+ missing_columns.append(col)
344
+
345
+ case _:
346
+ data[col] = scans_group[col][:]
347
+
348
+ # create polars DataFrame from data
349
+ if data:
350
+ self.scans_df = pl.DataFrame(data)
351
+
352
+ # Convert "None" strings and NaN values to proper null values
353
+ for col in self.scans_df.columns:
354
+ if self.scans_df[col].dtype == pl.Utf8: # String columns
355
+ self.scans_df = self.scans_df.with_columns([
356
+ pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
357
+ .then(None)
358
+ .otherwise(pl.col(col))
359
+ .alias(col),
360
+ ])
361
+ elif self.scans_df[col].dtype in [
362
+ pl.Float64,
363
+ pl.Float32,
364
+ ]: # Float columns
365
+ self.scans_df = self.scans_df.with_columns([
366
+ pl.col(col).fill_nan(None).alias(col),
367
+ ])
368
+
369
+ # update all columns with schema types
370
+ for col in self.scans_df.columns:
371
+ if col in schema.get("scans_df", {}).get("columns", {}):
372
+ try:
373
+ dtype_str = schema["scans_df"]["columns"][col]["dtype"]
374
+ # Convert dtype string to actual polars dtype
375
+ if dtype_str.startswith("pl."):
376
+ # Skip Object columns - they're already properly reconstructed
377
+ if "Object" in dtype_str:
378
+ continue
379
+ # Handle different polars data types
380
+ if "Int" in dtype_str:
381
+ # Convert to numeric first, handling different input types
382
+ if self.scans_df[col].dtype == pl.Utf8:
383
+ # String data - convert to integer
384
+ self.scans_df = self.scans_df.with_columns(
385
+ pl.col(col).str.to_integer().cast(eval(dtype_str)),
386
+ )
387
+ elif self.scans_df[col].dtype in [
388
+ pl.Float64,
389
+ pl.Float32,
390
+ ]:
391
+ # Float data - cast to integer
392
+ self.scans_df = self.scans_df.with_columns(
393
+ pl.col(col).cast(eval(dtype_str)),
394
+ )
395
+ else:
396
+ # Try direct casting
397
+ self.scans_df = self.scans_df.with_columns(
398
+ pl.col(col).cast(eval(dtype_str)),
399
+ )
400
+ elif "Float" in dtype_str:
401
+ # Convert to float, handling different input types
402
+ if self.scans_df[col].dtype == pl.Utf8:
403
+ # String data - convert to float
404
+ self.scans_df = self.scans_df.with_columns(
405
+ pl.col(col).str.to_decimal().cast(eval(dtype_str)),
406
+ )
407
+ else:
408
+ # Try direct casting
409
+ self.scans_df = self.scans_df.with_columns(
410
+ pl.col(col).cast(eval(dtype_str)),
411
+ )
412
+ elif "Utf8" in dtype_str:
413
+ # Ensure it's string type
414
+ self.scans_df = self.scans_df.with_columns(
415
+ pl.col(col).cast(pl.Utf8),
416
+ )
417
+ else:
418
+ # Handle special cases and try direct casting for other types
419
+ current_dtype = self.scans_df[col].dtype
420
+ target_dtype = eval(dtype_str)
421
+
422
+ # Handle binary data that might need string conversion first
423
+ if "Binary" in str(current_dtype):
424
+ # Convert binary to string first, then to target type
425
+ if target_dtype == pl.Utf8:
426
+ self.scans_df = self.scans_df.with_columns(
427
+ pl.col(col)
428
+ .map_elements(
429
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
430
+ return_dtype=pl.Utf8,
431
+ )
432
+ .cast(target_dtype),
433
+ )
434
+ elif "Int" in str(target_dtype):
435
+ self.scans_df = self.scans_df.with_columns(
436
+ pl.col(col)
437
+ .map_elements(
438
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
439
+ return_dtype=pl.Utf8,
440
+ )
441
+ .str.to_integer()
442
+ .cast(target_dtype),
443
+ )
444
+ elif "Float" in str(target_dtype):
445
+ self.scans_df = self.scans_df.with_columns(
446
+ pl.col(col)
447
+ .map_elements(
448
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
449
+ return_dtype=pl.Utf8,
450
+ )
451
+ .str.to_decimal()
452
+ .cast(target_dtype),
453
+ )
454
+ else:
455
+ # Try direct casting
456
+ self.scans_df = self.scans_df.with_columns(
457
+ pl.col(col).cast(target_dtype),
458
+ )
459
+ else:
460
+ # Try direct casting for non-binary types
461
+ self.scans_df = self.scans_df.with_columns(
462
+ pl.col(col).cast(target_dtype),
463
+ )
464
+ except Exception as e:
465
+ self.logger.warning(
466
+ f"Failed to cast column '{col}' in scans_df: {e}",
467
+ )
468
+ else:
469
+ self.logger.warning(
470
+ f"Column '{col}' in scans_df not found in schema, keeping original type.",
471
+ )
472
+
473
+ # Ensure column order matches schema order
474
+ if "scans_df" in schema and "columns" in schema["scans_df"]:
475
+ schema_column_order = list(schema["scans_df"]["columns"].keys())
476
+ # Only reorder columns that exist in both schema and DataFrame
477
+ existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
478
+ if existing_columns:
479
+ self.scans_df = self.scans_df.select(existing_columns)
480
+
481
+ else:
482
+ self.scans_df = None
483
+ else:
484
+ self.scans_df = None
485
+
486
+ # Load features_df
487
+ if "features" in f:
488
+ features_group = f["features"]
489
+ # columns = list(features_group.attrs.get('columns', []))
490
+ data = {}
491
+ missing_columns = []
492
+ for col in schema.get("features_df", {}).get("columns", []):
493
+ if col not in features_group:
494
+ self.logger.debug(
495
+ f"Column '{col}' not found in sample5/features.",
496
+ )
497
+ data[col] = None
498
+ missing_columns.append(col)
499
+ continue
500
+
501
+ dtype = schema["features_df"]["columns"][col].get("dtype", "native")
502
+ match dtype:
503
+ case "pl.Object":
504
+ match col:
505
+ case "chrom":
506
+ data_col = features_group[col][:]
507
+ # Convert JSON strings back to Chromatogram objects
508
+ reconstructed_data: list[Any] = []
509
+ for item in data_col:
510
+ if isinstance(item, bytes):
511
+ item = item.decode("utf-8")
512
+
513
+ if item == "None" or item == "":
514
+ reconstructed_data.append(None)
515
+ else:
516
+ try:
517
+ reconstructed_data.append(
518
+ Chromatogram.from_json(item),
519
+ )
520
+ except (json.JSONDecodeError, ValueError):
521
+ reconstructed_data.append(None)
522
+
523
+ data[col] = reconstructed_data
524
+ case "ms2_scans":
525
+ data_col = features_group[col][:]
526
+ # Convert JSON strings back to lists of integers
527
+ reconstructed_data = []
528
+ for item in data_col:
529
+ if isinstance(item, bytes):
530
+ item = item.decode("utf-8")
531
+
532
+ if item == "None":
533
+ reconstructed_data.append(None)
534
+ else:
535
+ try:
536
+ # Parse JSON string to get list of integers
537
+ scan_list = json.loads(item)
538
+ reconstructed_data.append(scan_list)
539
+ except (json.JSONDecodeError, ValueError):
540
+ reconstructed_data.append(None)
541
+
542
+ data[col] = reconstructed_data
543
+ case "ms2_specs":
544
+ data_col = features_group[col][:]
545
+ # Convert JSON strings back to lists of Spectrum objects
546
+ reconstructed_data = []
547
+ for item in data_col:
548
+ if isinstance(item, bytes):
549
+ item = item.decode("utf-8")
550
+
551
+ # Parse the outer JSON (list of JSON strings)
552
+ json_list = json.loads(item)
553
+
554
+ if json_list == ["None"]:
555
+ # This was originally None
556
+ reconstructed_data.append(None)
557
+ else:
558
+ # This was originally a list of Spectrum objects
559
+ spectrum_list: list[Any] = []
560
+ for json_str in json_list:
561
+ if json_str == "None":
562
+ spectrum_list.append(None)
563
+ else:
564
+ spectrum_list.append(
565
+ Spectrum.from_json(json_str),
566
+ )
567
+ reconstructed_data.append(spectrum_list)
568
+
569
+ data[col] = reconstructed_data
570
+ case _:
571
+ self.logger.debug(f"Unexpected Object column '{col}'")
572
+ data[col] = None
573
+ missing_columns.append(col)
574
+
575
+ case _:
576
+ data[col] = features_group[col][:]
577
+
578
+ # create polars DataFrame from data
579
+ if data:
580
+ # Build schema for DataFrame creation to handle Object columns properly
581
+ df_schema = {}
582
+ for col, values in data.items():
583
+ if col in schema.get("features_df", {}).get("columns", {}):
584
+ dtype_str = schema["features_df"]["columns"][col]["dtype"]
585
+ if dtype_str == "pl.Object":
586
+ df_schema[col] = pl.Object
587
+ else:
588
+ # Let Polars infer the type initially, we'll cast later
589
+ df_schema[col] = None
590
+ else:
591
+ df_schema[col] = None
592
+
593
+ # Create DataFrame with explicit Object types where needed
594
+ try:
595
+ self.features_df = pl.DataFrame(data, schema=df_schema)
596
+ except Exception:
597
+ # Fallback: create without schema and handle Object columns manually
598
+ object_columns = {
599
+ k: v
600
+ for k, v in data.items()
601
+ if k in schema.get("features_df", {}).get("columns", {})
602
+ and schema["features_df"]["columns"][k]["dtype"] == "pl.Object"
603
+ }
604
+ regular_columns = {k: v for k, v in data.items() if k not in object_columns}
605
+
606
+ # Create DataFrame with regular columns first
607
+ if regular_columns:
608
+ self.features_df = pl.DataFrame(regular_columns)
609
+ # Add Object columns one by one
610
+ for col, values in object_columns.items():
611
+ self.features_df = self.features_df.with_columns([
612
+ pl.Series(col, values, dtype=pl.Object),
613
+ ])
614
+ else:
615
+ # Only Object columns
616
+ self.features_df = pl.DataFrame()
617
+ for col, values in object_columns.items():
618
+ self.features_df = self.features_df.with_columns([
619
+ pl.Series(col, values, dtype=pl.Object),
620
+ ])
621
+
622
+ # update all columns with schema types (skip Object columns)
623
+ for col in self.features_df.columns:
624
+ if col in schema.get("features_df", {}).get("columns", {}):
625
+ try:
626
+ dtype_str = schema["features_df"]["columns"][col]["dtype"]
627
+ # Convert dtype string to actual polars dtype
628
+ if dtype_str.startswith("pl."):
629
+ # Skip Object columns - they're already properly reconstructed
630
+ if "Object" in dtype_str:
631
+ continue
632
+ # Handle different polars data types
633
+ if "Int" in dtype_str:
634
+ # Convert to numeric first, handling different input types
635
+ if self.features_df[col].dtype == pl.Utf8:
636
+ # String data - convert to integer
637
+ self.features_df = self.features_df.with_columns(
638
+ pl.col(col).str.to_integer().cast(eval(dtype_str)),
639
+ )
640
+ elif self.features_df[col].dtype in [
641
+ pl.Float64,
642
+ pl.Float32,
643
+ ]:
644
+ # Float data - cast to integer with null handling for NaN values
645
+ self.features_df = self.features_df.with_columns(
646
+ pl.col(col).cast(eval(dtype_str), strict=False),
647
+ )
648
+ else:
649
+ # Handle special cases and try direct casting for other types
650
+ current_dtype = self.features_df[col].dtype
651
+ target_dtype = eval(dtype_str)
652
+
653
+ # Handle binary data that might need string conversion first
654
+ if "Binary" in str(current_dtype):
655
+ # Convert binary to string first, then to target type
656
+ if target_dtype == pl.Utf8:
657
+ self.features_df = self.features_df.with_columns(
658
+ pl.col(col)
659
+ .map_elements(
660
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
661
+ return_dtype=pl.Utf8,
662
+ )
663
+ .cast(target_dtype),
664
+ )
665
+ elif "Int" in str(target_dtype):
666
+ self.features_df = self.features_df.with_columns(
667
+ pl.col(col)
668
+ .map_elements(
669
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
670
+ return_dtype=pl.Utf8,
671
+ )
672
+ .str.to_integer()
673
+ .cast(target_dtype),
674
+ )
675
+ elif "Float" in str(target_dtype):
676
+ self.features_df = self.features_df.with_columns(
677
+ pl.col(col)
678
+ .map_elements(
679
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
680
+ return_dtype=pl.Utf8,
681
+ )
682
+ .str.to_decimal()
683
+ .cast(target_dtype),
684
+ )
685
+ else:
686
+ # Try direct casting
687
+ self.features_df = self.features_df.with_columns(
688
+ pl.col(col).cast(target_dtype),
689
+ )
690
+ else:
691
+ # Try direct casting for non-binary types
692
+ self.features_df = self.features_df.with_columns(
693
+ pl.col(col).cast(target_dtype),
694
+ )
695
+ elif "Float" in dtype_str:
696
+ # Convert to float, handling different input types
697
+ if self.features_df[col].dtype == pl.Utf8:
698
+ # String data - convert to float
699
+ self.features_df = self.features_df.with_columns(
700
+ pl.col(col).str.to_decimal().cast(eval(dtype_str)),
701
+ )
702
+ else:
703
+ # Handle special cases and try direct casting for other types
704
+ current_dtype = self.features_df[col].dtype
705
+ target_dtype = eval(dtype_str)
706
+
707
+ # Handle binary data that might need string conversion first
708
+ if "Binary" in str(current_dtype):
709
+ # Convert binary to string first, then to target type
710
+ if target_dtype == pl.Utf8:
711
+ self.features_df = self.features_df.with_columns(
712
+ pl.col(col)
713
+ .map_elements(
714
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
715
+ return_dtype=pl.Utf8,
716
+ )
717
+ .cast(target_dtype),
718
+ )
719
+ elif "Int" in str(target_dtype):
720
+ self.features_df = self.features_df.with_columns(
721
+ pl.col(col)
722
+ .map_elements(
723
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
724
+ return_dtype=pl.Utf8,
725
+ )
726
+ .str.to_integer()
727
+ .cast(target_dtype),
728
+ )
729
+ elif "Float" in str(target_dtype):
730
+ self.features_df = self.features_df.with_columns(
731
+ pl.col(col)
732
+ .map_elements(
733
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
734
+ return_dtype=pl.Utf8,
735
+ )
736
+ .str.to_decimal()
737
+ .cast(target_dtype),
738
+ )
739
+ else:
740
+ # Try direct casting
741
+ self.features_df = self.features_df.with_columns(
742
+ pl.col(col).cast(target_dtype),
743
+ )
744
+ else:
745
+ # Try direct casting for non-binary types
746
+ self.features_df = self.features_df.with_columns(
747
+ pl.col(col).cast(target_dtype),
748
+ )
749
+ elif "Utf8" in dtype_str:
750
+ # Ensure it's string type
751
+ self.features_df = self.features_df.with_columns(
752
+ pl.col(col).cast(pl.Utf8),
753
+ )
754
+ else:
755
+ # Handle special cases and try direct casting for other types
756
+ current_dtype = self.features_df[col].dtype
757
+ target_dtype = eval(dtype_str)
758
+
759
+ # Handle binary data that might need string conversion first
760
+ if "Binary" in str(current_dtype):
761
+ # Convert binary to string first, then to target type
762
+ if target_dtype == pl.Utf8:
763
+ self.features_df = self.features_df.with_columns(
764
+ pl.col(col)
765
+ .map_elements(
766
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
767
+ return_dtype=pl.Utf8,
768
+ )
769
+ .cast(target_dtype),
770
+ )
771
+ elif "Int" in str(target_dtype):
772
+ self.features_df = self.features_df.with_columns(
773
+ pl.col(col)
774
+ .map_elements(
775
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
776
+ return_dtype=pl.Utf8,
777
+ )
778
+ .str.to_integer()
779
+ .cast(target_dtype),
780
+ )
781
+ elif "Float" in str(target_dtype):
782
+ self.features_df = self.features_df.with_columns(
783
+ pl.col(col)
784
+ .map_elements(
785
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
786
+ return_dtype=pl.Utf8,
787
+ )
788
+ .str.to_decimal()
789
+ .cast(target_dtype),
790
+ )
791
+ else:
792
+ # Try direct casting
793
+ self.features_df = self.features_df.with_columns(
794
+ pl.col(col).cast(target_dtype),
795
+ )
796
+ else:
797
+ # Try direct casting for non-binary types
798
+ self.features_df = self.features_df.with_columns(
799
+ pl.col(col).cast(target_dtype),
800
+ )
801
+ except Exception as e:
802
+ self.logger.warning(
803
+ f"Failed to cast column '{col}' in features_df: {e}",
804
+ )
805
+ else:
806
+ self.logger.warning(
807
+ f"Column '{col}' in features_df not found in schema, keeping original type.",
808
+ )
809
+
810
+ # FINAL null conversion pass - after all type casting is done
811
+ # This ensures "None" strings introduced by failed conversions are properly handled
812
+ for col in self.features_df.columns:
813
+ if self.features_df[col].dtype == pl.Utf8: # String columns
814
+ self.features_df = self.features_df.with_columns([
815
+ pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
816
+ .then(None)
817
+ .otherwise(pl.col(col))
818
+ .alias(col),
819
+ ])
820
+ # Float columns
821
+ elif self.features_df[col].dtype in [pl.Float64, pl.Float32]:
822
+ self.features_df = self.features_df.with_columns([
823
+ pl.col(col).fill_nan(None).alias(col),
824
+ ])
825
+
826
+ # Ensure column order matches schema order
827
+ if "features_df" in schema and "columns" in schema["features_df"]:
828
+ schema_column_order = list(schema["features_df"]["columns"].keys())
829
+ # Only reorder columns that exist in both schema and DataFrame
830
+ existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
831
+ if existing_columns:
832
+ self.features_df = self.features_df.select(existing_columns)
833
+
834
+ else:
835
+ self.features_df = None
836
+ else:
837
+ self.features_df = None
838
+
839
+ # Load ms1_df
840
+ if "ms1" in f:
841
+ ms1_group = f["ms1"]
842
+ data = {}
843
+
844
+ # Get all datasets in the ms1 group
845
+ for col in ms1_group.keys():
846
+ data[col] = ms1_group[col][:]
847
+
848
+ if data:
849
+ # Create DataFrame directly with Polars
850
+ self.ms1_df = pl.DataFrame(data)
851
+
852
+ # Apply schema if available
853
+ if "ms1_df" in schema and "columns" in schema["ms1_df"]:
854
+ schema_columns = schema["ms1_df"]["columns"]
855
+ for col in self.ms1_df.columns:
856
+ if col in schema_columns:
857
+ dtype_str = schema_columns[col]["dtype"]
858
+ try:
859
+ if "Int" in dtype_str:
860
+ self.ms1_df = self.ms1_df.with_columns([
861
+ pl.col(col).cast(pl.Int64, strict=False),
862
+ ])
863
+ elif "Float" in dtype_str:
864
+ self.ms1_df = self.ms1_df.with_columns([
865
+ pl.col(col).cast(pl.Float64, strict=False),
866
+ ])
867
+ except Exception as e:
868
+ self.logger.warning(
869
+ f"Failed to apply schema type {dtype_str} to column {col}: {e}",
870
+ )
871
+
872
+ # Convert "None" strings and NaN values to proper null values
873
+ self.ms1_df = clean_null_values_polars(self.ms1_df)
874
+ else:
875
+ self.ms1_df = None
876
+ else:
877
+ self.ms1_df = None
878
+
879
+ # Parameters are now loaded from metadata JSON (see above)
880
+ # Lib and lib_match are no longer saved/loaded
881
+
882
+ if map:
883
+ featureXML = filename.replace(".sample5", ".featureXML")
884
+ if os.path.exists(featureXML):
885
+ self._load_featureXML(featureXML)
886
+ self._features_sync()
887
+ else:
888
+ self.logger.warning(
889
+ f"Feature XML file {featureXML} not found, skipping loading.",
890
+ )
891
+
892
+ # set self.file_path to *.sample5
893
+ self.file_path = filename
894
+ # set self.label to basename without extension
895
+ if self.label is None or self.label == "":
896
+ self.label = os.path.splitext(os.path.basename(filename))[0]
897
+ self.logger.info(f"Sample loaded successfully from {filename}")
898
+
899
+
900
+ def load_schema(schema_path: str) -> Dict[str, Any]:
901
+ """
902
+ Load schema from JSON file with error handling.
903
+
904
+ Args:
905
+ schema_path: Path to the schema JSON file
906
+
907
+ Returns:
908
+ Dictionary containing the schema, empty dict if not found
909
+ """
910
+ try:
911
+ with open(schema_path) as f:
912
+ return json.load(f) # type: ignore
913
+ except FileNotFoundError:
914
+ return {}
915
+
916
+
917
+ def decode_metadata_attr(attr_value: Any) -> str:
918
+ """
919
+ Decode metadata attribute, handling both bytes and string types.
920
+
921
+ Args:
922
+ attr_value: The attribute value to decode
923
+
924
+ Returns:
925
+ String representation of the attribute
926
+ """
927
+ if isinstance(attr_value, bytes):
928
+ return attr_value.decode()
929
+ return str(attr_value) if attr_value is not None else ""
930
+
931
+
932
+ def clean_null_values_polars(df: pl.DataFrame) -> pl.DataFrame:
933
+ """
934
+ Clean null values in a Polars DataFrame by converting string nulls to proper nulls.
935
+
936
+ Args:
937
+ df: The Polars DataFrame to clean
938
+
939
+ Returns:
940
+ Cleaned DataFrame
941
+ """
942
+ cleaned_df = df
943
+ for col in df.columns:
944
+ if df[col].dtype == pl.Utf8: # String columns
945
+ cleaned_df = cleaned_df.with_columns([
946
+ pl.when(pl.col(col).is_in(["None", "", "null", "NULL"])).then(None).otherwise(pl.col(col)).alias(col),
947
+ ])
948
+ elif df[col].dtype in [pl.Float64, pl.Float32]: # Float columns
949
+ cleaned_df = cleaned_df.with_columns([
950
+ pl.col(col).fill_nan(None).alias(col),
951
+ ])
952
+ return cleaned_df
953
+
954
+
955
+ def cast_column_by_dtype(df: pl.DataFrame, col: str, dtype_str: str) -> pl.DataFrame:
956
+ """
957
+ Cast a Polars DataFrame column to the specified dtype with appropriate handling.
958
+
959
+ Args:
960
+ df: The Polars DataFrame
961
+ col: Column name to cast
962
+ dtype_str: Target dtype as string (e.g., 'pl.Int64')
963
+
964
+ Returns:
965
+ DataFrame with the column cast to the new type
966
+ """
967
+ if not dtype_str.startswith("pl.") or "Object" in dtype_str:
968
+ return df
969
+
970
+ try:
971
+ target_dtype = eval(dtype_str)
972
+ current_dtype = df[col].dtype
973
+
974
+ if "Int" in dtype_str:
975
+ return _cast_to_int(df, col, current_dtype, target_dtype)
976
+ elif "Float" in dtype_str:
977
+ return _cast_to_float(df, col, current_dtype, target_dtype)
978
+ elif "Utf8" in dtype_str:
979
+ return df.with_columns(pl.col(col).cast(pl.Utf8))
980
+ else:
981
+ return _cast_with_binary_handling(df, col, current_dtype, target_dtype)
982
+
983
+ except Exception:
984
+ return df
985
+
986
+
987
+ def _cast_to_int(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
988
+ """Helper function to cast column to integer type."""
989
+ if current_dtype == pl.Utf8:
990
+ return df.with_columns(
991
+ pl.col(col).str.to_integer().cast(target_dtype),
992
+ )
993
+ elif current_dtype in [pl.Float64, pl.Float32]:
994
+ return df.with_columns(pl.col(col).cast(target_dtype))
995
+ else:
996
+ return _cast_with_binary_handling(df, col, current_dtype, target_dtype)
997
+
998
+
999
+ def _cast_to_float(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
1000
+ """Helper function to cast column to float type."""
1001
+ if current_dtype == pl.Utf8:
1002
+ return df.with_columns(
1003
+ pl.col(col).str.to_decimal().cast(target_dtype),
1004
+ )
1005
+ else:
1006
+ return _cast_with_binary_handling(df, col, current_dtype, target_dtype)
1007
+
1008
+
1009
+ def _cast_with_binary_handling(
1010
+ df: pl.DataFrame,
1011
+ col: str,
1012
+ current_dtype: pl.DataType,
1013
+ target_dtype: pl.DataType,
1014
+ ) -> pl.DataFrame:
1015
+ """Helper function to handle binary data conversion."""
1016
+ if "Binary" in str(current_dtype):
1017
+ if target_dtype == pl.Utf8:
1018
+ return df.with_columns(
1019
+ pl.col(col)
1020
+ .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
1021
+ .cast(target_dtype),
1022
+ )
1023
+ elif "Int" in str(target_dtype):
1024
+ return df.with_columns(
1025
+ pl.col(col)
1026
+ .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
1027
+ .str.to_integer()
1028
+ .cast(target_dtype),
1029
+ )
1030
+ elif "Float" in str(target_dtype):
1031
+ return df.with_columns(
1032
+ pl.col(col)
1033
+ .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
1034
+ .str.to_decimal()
1035
+ .cast(target_dtype),
1036
+ )
1037
+
1038
+ # Fallback: try direct casting
1039
+ return df.with_columns(pl.col(col).cast(target_dtype))
1040
+
1041
+
1042
+ def apply_schema_to_dataframe(df: pl.DataFrame, schema: Dict[str, Any], df_name: str) -> pl.DataFrame:
1043
+ """
1044
+ Apply schema type casting to a Polars DataFrame.
1045
+
1046
+ Args:
1047
+ df: The DataFrame to modify
1048
+ schema: The schema dictionary
1049
+ df_name: Name of the DataFrame in the schema (e.g., 'scans_df', 'features_df')
1050
+
1051
+ Returns:
1052
+ DataFrame with schema types applied
1053
+ """
1054
+ df_schema = schema.get(df_name, {}).get("columns", {})
1055
+
1056
+ for col in df.columns:
1057
+ if col in df_schema:
1058
+ dtype_str = df_schema[col]["dtype"]
1059
+ df = cast_column_by_dtype(df, col, dtype_str)
1060
+
1061
+ return df
1062
+
1063
+
1064
+ def reconstruct_object_column(data_col: np.ndarray, col_name: str) -> List[Any]:
1065
+ """
1066
+ Reconstruct object columns from serialized data.
1067
+
1068
+ Args:
1069
+ data_col: Array containing serialized data
1070
+ col_name: Name of the column for type-specific reconstruction
1071
+
1072
+ Returns:
1073
+ List of reconstructed objects
1074
+ """
1075
+ reconstructed_data: list[Any] = []
1076
+
1077
+ for item in data_col:
1078
+ if isinstance(item, bytes):
1079
+ item = item.decode("utf-8")
1080
+
1081
+ if item == "None" or item == "":
1082
+ reconstructed_data.append(None)
1083
+ continue
1084
+
1085
+ try:
1086
+ if col_name == "chrom":
1087
+ reconstructed_data.append(Chromatogram.from_json(item))
1088
+ elif col_name == "ms2_scans":
1089
+ scan_list = json.loads(item)
1090
+ reconstructed_data.append(scan_list)
1091
+ elif col_name == "ms2_specs":
1092
+ json_list = json.loads(item)
1093
+ if json_list == ["None"]:
1094
+ reconstructed_data.append(None)
1095
+ else:
1096
+ spectrum_list: list[Any] = []
1097
+ for json_str in json_list:
1098
+ if json_str == "None":
1099
+ spectrum_list.append(None)
1100
+ else:
1101
+ spectrum_list.append(Spectrum.from_json(json_str))
1102
+ reconstructed_data.append(spectrum_list)
1103
+ else:
1104
+ # Unknown object column
1105
+ reconstructed_data.append(None)
1106
+ except (json.JSONDecodeError, ValueError):
1107
+ reconstructed_data.append(None)
1108
+
1109
+ return reconstructed_data
1110
+
1111
+
1112
+ def load_dataframe_from_h5_group(
1113
+ group: h5py.Group,
1114
+ schema: Dict[str, Any],
1115
+ df_name: str,
1116
+ logger: Optional[Any] = None,
1117
+ ) -> Tuple[Optional[pl.DataFrame], List[str]]:
1118
+ """
1119
+ Load a Polars DataFrame from an HDF5 group using schema.
1120
+
1121
+ Args:
1122
+ group: The HDF5 group containing the DataFrame data
1123
+ schema: The schema dictionary
1124
+ df_name: Name of the DataFrame in the schema
1125
+ logger: Optional logger for warnings
1126
+
1127
+ Returns:
1128
+ Tuple of (DataFrame or None, list of missing columns)
1129
+ """
1130
+ data: dict[str, Any] = {}
1131
+ missing_columns = []
1132
+
1133
+ # Load columns according to schema
1134
+ schema_columns = schema.get(df_name, {}).get("columns", [])
1135
+
1136
+ for col in schema_columns:
1137
+ if col not in group:
1138
+ if logger:
1139
+ logger.warning(f"Column '{col}' not found in {df_name}.")
1140
+ data[col] = None
1141
+ missing_columns.append(col)
1142
+ continue
1143
+
1144
+ dtype = schema[df_name]["columns"][col].get("dtype", "native")
1145
+
1146
+ if dtype == "pl.Object":
1147
+ # Handle object columns specially
1148
+ data[col] = reconstruct_object_column(group[col][:], col)
1149
+ else:
1150
+ data[col] = group[col][:]
1151
+
1152
+ if not data:
1153
+ return None, missing_columns
1154
+
1155
+ # Create DataFrame with proper schema for Object columns
1156
+ df_schema = {}
1157
+ for col, values in data.items():
1158
+ if col in schema_columns:
1159
+ dtype_str = schema[df_name]["columns"][col]["dtype"]
1160
+ if dtype_str == "pl.Object":
1161
+ df_schema[col] = pl.Object
1162
+
1163
+ try:
1164
+ if df_schema:
1165
+ df = pl.DataFrame(data, schema=df_schema)
1166
+ else:
1167
+ df = pl.DataFrame(data)
1168
+ except Exception:
1169
+ # Fallback: handle Object columns manually
1170
+ df = _create_dataframe_with_object_columns(data, schema, df_name)
1171
+
1172
+ # Clean null values
1173
+ df = clean_null_values_polars(df)
1174
+
1175
+ # Apply schema type casting
1176
+ df = apply_schema_to_dataframe(df, schema, df_name)
1177
+
1178
+ return df, missing_columns
1179
+
1180
+
1181
+ def _create_dataframe_with_object_columns(
1182
+ data: Dict[str, Any],
1183
+ schema: Dict[str, Any],
1184
+ df_name: str,
1185
+ ) -> pl.DataFrame:
1186
+ """
1187
+ Create DataFrame handling Object columns manually when schema creation fails.
1188
+
1189
+ Args:
1190
+ data: Dictionary of column data
1191
+ schema: The schema dictionary
1192
+ df_name: Name of the DataFrame in the schema
1193
+
1194
+ Returns:
1195
+ Polars DataFrame with Object columns properly handled
1196
+ """
1197
+ schema_columns = schema.get(df_name, {}).get("columns", {})
1198
+
1199
+ object_columns = {
1200
+ k: v for k, v in data.items() if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
1201
+ }
1202
+ regular_columns = {k: v for k, v in data.items() if k not in object_columns}
1203
+
1204
+ # Create DataFrame with regular columns first
1205
+ if regular_columns:
1206
+ df = pl.DataFrame(regular_columns)
1207
+ # Add Object columns one by one
1208
+ for col, values in object_columns.items():
1209
+ df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
1210
+ else:
1211
+ # Only Object columns
1212
+ df = pl.DataFrame()
1213
+ for col, values in object_columns.items():
1214
+ df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
1215
+
1216
+ return df
1217
+
1218
+
1219
+ def load_ms1_dataframe_from_h5_group(
1220
+ group: h5py.Group,
1221
+ schema: Dict[str, Any],
1222
+ logger: Optional[Any] = None,
1223
+ ) -> Optional[pl.DataFrame]:
1224
+ """
1225
+ Load MS1 DataFrame from HDF5 group.
1226
+
1227
+ Args:
1228
+ group: The HDF5 group containing MS1 data
1229
+ schema: The schema dictionary
1230
+ logger: Optional logger for warnings
1231
+
1232
+ Returns:
1233
+ Polars DataFrame or None
1234
+ """
1235
+ data = {}
1236
+
1237
+ # Get all datasets in the ms1 group
1238
+ for col in group.keys():
1239
+ data[col] = group[col][:]
1240
+
1241
+ if not data:
1242
+ return None
1243
+
1244
+ # Create DataFrame directly with Polars
1245
+ ms1_df = pl.DataFrame(data)
1246
+
1247
+ # Apply schema if available
1248
+ if "ms1_df" in schema and "columns" in schema["ms1_df"]:
1249
+ schema_columns = schema["ms1_df"]["columns"]
1250
+ for col in ms1_df.columns:
1251
+ if col in schema_columns:
1252
+ dtype_str = schema_columns[col]["dtype"]
1253
+ try:
1254
+ if "Int" in dtype_str:
1255
+ ms1_df = ms1_df.with_columns([
1256
+ pl.col(col).cast(pl.Int64, strict=False),
1257
+ ])
1258
+ elif "Float" in dtype_str:
1259
+ ms1_df = ms1_df.with_columns([
1260
+ pl.col(col).cast(pl.Float64, strict=False),
1261
+ ])
1262
+ except Exception as e:
1263
+ if logger:
1264
+ logger.warning(
1265
+ f"Failed to apply schema type {dtype_str} to column {col}: {e}",
1266
+ )
1267
+
1268
+ # Convert "None" strings and NaN values to proper null values
1269
+ return clean_null_values_polars(ms1_df)
1270
+
1271
+
1272
+ def load_parameters_from_metadata(metadata_group: h5py.Group) -> Optional[Dict[str, Any]]:
1273
+ """
1274
+ Load parameters from HDF5 metadata group.
1275
+
1276
+ Args:
1277
+ metadata_group: The HDF5 metadata group containing parameters
1278
+
1279
+ Returns:
1280
+ Dictionary of parameters or None if not found
1281
+ """
1282
+ if "parameters" in metadata_group.attrs:
1283
+ try:
1284
+ params_json = decode_metadata_attr(metadata_group.attrs["parameters"])
1285
+ # Ensure params_json is a string before attempting JSON decode
1286
+ if isinstance(params_json, str) and params_json.strip():
1287
+ result = json.loads(params_json)
1288
+ # Ensure the result is a dictionary
1289
+ if isinstance(result, dict):
1290
+ return result
1291
+ except (json.JSONDecodeError, ValueError, TypeError) as e:
1292
+ # Log the error for debugging
1293
+ print(f"Warning: Failed to parse parameters JSON: {e}")
1294
+ print(f"Raw parameter data type: {type(params_json)}")
1295
+ print(f"Raw parameter data: {repr(params_json)}")
1296
+ return None
1297
+
1298
+
1299
+ def create_h5_metadata_group(
1300
+ f: h5py.File,
1301
+ file_path: Optional[str],
1302
+ file_source: Optional[str],
1303
+ file_type: Optional[str],
1304
+ label: Optional[str],
1305
+ ) -> None:
1306
+ """
1307
+ Create and populate metadata group in HDF5 file.
1308
+
1309
+ Args:
1310
+ f: The HDF5 file object
1311
+ file_path: Source file path
1312
+ file_source: Original source file path
1313
+ file_type: Source file type
1314
+ label: Sample label
1315
+ """
1316
+ metadata_group = f.create_group("metadata")
1317
+ metadata_group.attrs["format"] = "master-sample5-1"
1318
+ metadata_group.attrs["file_path"] = str(file_path) if file_path is not None else ""
1319
+ metadata_group.attrs["file_source"] = str(file_source) if file_source is not None else ""
1320
+ metadata_group.attrs["file_type"] = str(file_type) if file_type is not None else ""
1321
+ metadata_group.attrs["label"] = str(label) if label is not None else ""