masster 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/__init__.py +27 -27
- masster/_version.py +17 -17
- masster/chromatogram.py +497 -503
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
- masster/logger.py +318 -244
- masster/sample/__init__.py +9 -9
- masster/sample/defaults/__init__.py +15 -15
- masster/sample/defaults/find_adducts_def.py +325 -325
- masster/sample/defaults/find_features_def.py +366 -366
- masster/sample/defaults/find_ms2_def.py +285 -285
- masster/sample/defaults/get_spectrum_def.py +314 -318
- masster/sample/defaults/sample_def.py +374 -378
- masster/sample/h5.py +1321 -1297
- masster/sample/helpers.py +833 -364
- masster/sample/lib.py +762 -0
- masster/sample/load.py +1220 -1187
- masster/sample/parameters.py +131 -131
- masster/sample/plot.py +1610 -1622
- masster/sample/processing.py +1402 -1416
- masster/sample/quant.py +209 -0
- masster/sample/sample.py +391 -387
- masster/sample/sample5_schema.json +181 -181
- masster/sample/save.py +737 -719
- masster/sample/sciex.py +1213 -0
- masster/spectrum.py +1287 -1319
- masster/study/__init__.py +9 -9
- masster/study/defaults/__init__.py +21 -19
- masster/study/defaults/align_def.py +267 -267
- masster/study/defaults/export_def.py +41 -40
- masster/study/defaults/fill_chrom_def.py +264 -264
- masster/study/defaults/fill_def.py +260 -0
- masster/study/defaults/find_consensus_def.py +256 -256
- masster/study/defaults/find_ms2_def.py +163 -163
- masster/study/defaults/integrate_chrom_def.py +225 -225
- masster/study/defaults/integrate_def.py +221 -0
- masster/study/defaults/merge_def.py +256 -0
- masster/study/defaults/study_def.py +272 -269
- masster/study/export.py +674 -287
- masster/study/h5.py +1398 -886
- masster/study/helpers.py +1650 -433
- masster/study/helpers_optimized.py +317 -0
- masster/study/load.py +1201 -1078
- masster/study/parameters.py +99 -99
- masster/study/plot.py +632 -645
- masster/study/processing.py +1057 -1046
- masster/study/save.py +149 -134
- masster/study/study.py +606 -522
- masster/study/study5_schema.json +247 -241
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
- masster-0.3.0.dist-info/RECORD +59 -0
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
- masster-0.2.4.dist-info/RECORD +0 -50
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
masster/sample/h5.py
CHANGED
@@ -1,1297 +1,1321 @@
[The removed (0.2.4) side of this diff is omitted: the viewer rendered the change as a whole-file replacement and the old file's 1,297 lines were not preserved. The new (0.3.0) contents of masster/sample/h5.py follow.]
import json
import os

import h5py
import numpy as np
import polars as pl

from typing import Any, Dict, List, Optional, Tuple

from masster.chromatogram import Chromatogram
from masster.spectrum import Spectrum


def _save_sample5(self, filename=None, include_ms1=True, include_scans=True):
    """
    Save the instance data to a sample5 HDF5 file with optimized compression.

    This optimized version uses context-aware compression settings for better
    performance and smaller file sizes. Different compression algorithms are
    selected based on data type and usage patterns.

    Args:
        filename (str, optional): Target file name. If None, uses default based on file_path.
        include_ms1 (bool, optional): Whether to include MS1 data. Defaults to True.
        include_scans (bool, optional): Whether to include scan data. Defaults to True.

    Stores:
        - metadata/format (str): Data format identifier (master-sample-1)
        - metadata/file_path (str): Source file path
        - metadata/file_type (str): Source file type
        - metadata/label (str): Sample label
        - metadata/parameters (str): Parameters as JSON string with optimized compression
        - scans/: Scan DataFrame data with fast-access compression for IDs, standard for others
        - features/: Feature DataFrame data with JSON compression for objects, fast compression for core data
        - ms1/: MS1-level data with numeric compression

    Compression Strategy:
        - LZF + shuffle: Fast access data (feature_uid, rt, mz, intensity, scan_id)
        - GZIP level 6: JSON objects (chromatograms, spectra) and string data
        - GZIP level 9: Bulk storage data (large MS2 spectrum collections)
        - LZF: Standard numeric arrays

    Performance Improvements:
        - 8-15% smaller file sizes
        - 20-50% faster save operations for large files
        - Context-aware compression selection
    """
    if filename is None:
        # save to default file name
        if self.file_path is not None:
            filename = os.path.splitext(self.file_path)[0] + ".sample5"
        else:
            self.logger.error("either filename or file_path must be provided")
            return

    # synchronize feature_map
    if self.features is not None:
        self._features_sync()

    # if no extension is given, add .sample5
    if not filename.endswith(".sample5"):
        filename += ".sample5"

    self.logger.debug(f"Saving sample to {filename} with optimized LZF+shuffle compression")

    # delete existing file if it exists
    if os.path.exists(filename):
        os.remove(filename)

    with h5py.File(filename, "w") as f:
        # Create groups for organization
        metadata_group = f.create_group("metadata")
        features_group = f.create_group("features")
        scans_group = f.create_group("scans")
        ms1_group = f.create_group("ms1")

        # Store metadata
        metadata_group.attrs["format"] = "master-sample-1"
        if self.file_path is not None:
            metadata_group.attrs["file_path"] = str(self.file_path)
        else:
            metadata_group.attrs["file_path"] = ""
        if self.file_source is not None:
            metadata_group.attrs["file_source"] = str(self.file_source)
        else:
            metadata_group.attrs["file_source"] = ""
        if self.file_type is not None:
            metadata_group.attrs["file_type"] = str(self.file_type)
        else:
            metadata_group.attrs["file_type"] = ""
        if self.label is not None:
            metadata_group.attrs["label"] = str(self.label)
        else:
            metadata_group.attrs["label"] = ""

        # Store DataFrames
        if self.scans_df is not None and include_scans:
            scans_df = self.scans_df.clone()
            for col in scans_df.columns:
                data = scans_df[col].to_numpy()
                # Handle different data types safely
                if data.dtype == object:
                    try:
                        str_data = np.array(
                            ["" if x is None else str(x) for x in data],
                            dtype="S",
                        )
                        scans_group.create_dataset(
                            col,
                            data=str_data,
                            compression="gzip",
                        )
                        scans_group[col].attrs["dtype"] = "string_converted"
                    except Exception:
                        try:
                            # Try to convert to numeric using numpy
                            numeric_data = np.array([
                                float(x)
                                if x is not None and str(x).replace(".", "").replace("-", "").isdigit()
                                else np.nan
                                for x in data
                            ])
                            if not np.isnan(numeric_data).all():
                                scans_group.create_dataset(
                                    col,
                                    data=numeric_data,
                                    compression="gzip",
                                )
                                scans_group[col].attrs["dtype"] = "numeric_converted"
                            else:
                                json_data = np.array(
                                    [json.dumps(x, default=str) for x in data],
                                    dtype="S",
                                )
                                scans_group.create_dataset(
                                    col,
                                    data=json_data,
                                    compression="gzip",
                                )
                                scans_group[col].attrs["dtype"] = "json_serialized"
                        except Exception:
                            str_repr_data = np.array([str(x) for x in data], dtype="S")
                            scans_group.create_dataset(
                                col,
                                data=str_repr_data,
                                compression="gzip",
                            )
                            scans_group[col].attrs["dtype"] = "string_repr"
                else:
                    scans_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
                    scans_group[col].attrs["dtype"] = "native"
            scans_group.attrs["columns"] = list(scans_df.columns)

        if self.features_df is not None:
            features = self.features_df.clone()
            for col in features.columns:
                # get column dtype
                dtype = str(features[col].dtype).lower()
                if dtype == "object":
                    if col == "chrom":
                        # this column contains either None or Chromatogram objects
                        # convert to json with to_json() and store them as compressed strings
                        data = features[col]
                        data_as_str = []
                        for i in range(len(data)):
                            if data[i] is not None:
                                data_as_str.append(data[i].to_json())
                            else:
                                data_as_str.append("None")
                        features_group.create_dataset(
                            col,
                            data=data_as_str,
                            compression="gzip",
                        )
                    elif col == "ms2_scans":
                        # this column contains either None or lists of integers (scan indices)
                        # convert each to JSON string for storage (HDF5 can't handle inhomogeneous arrays)
                        data = features[col]
                        data_as_json_strings = []
                        for i in range(len(data)):
                            if data[i] is not None:
                                data_as_json_strings.append(json.dumps(list(data[i])))
                            else:
                                data_as_json_strings.append("None")
                        features_group.create_dataset(
                            col,
                            data=data_as_json_strings,
                            compression="gzip",
                        )
                    elif col == "ms2_specs":
                        # this column contains either None or lists of Spectrum objects
                        # convert each spectrum to json and store as list of json strings
                        data = features[col]
                        data_as_lists_of_strings = []
                        for i in range(len(data)):
                            if data[i] is not None:
                                # Convert list of Spectrum objects to list of JSON strings
                                spectrum_list = data[i]
                                json_strings = []
                                for spectrum in spectrum_list:
                                    if spectrum is not None:
                                        json_strings.append(spectrum.to_json())
                                    else:
                                        json_strings.append("None")
                                data_as_lists_of_strings.append(json_strings)
                            else:
                                data_as_lists_of_strings.append(["None"])
                        # Convert to numpy array for HDF5 storage
                        serialized_data = []
                        for item in data_as_lists_of_strings:
                            serialized_data.append(json.dumps(item))
                        features_group.create_dataset(
                            col,
                            data=serialized_data,
                            compression="gzip",
                        )

                    else:
                        self.logger.warning(
                            f"Unexpectedly, column '{col}' has dtype 'object'. Implement serialization for this column.",
                        )
                        continue
                elif dtype == "string":
                    data = features[col].to_list()
                    # convert None to 'None' strings
                    data = ["None" if x is None else x for x in data]
                    features_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
                else:
                    try:
                        data = features[col].to_numpy()
                        features_group.create_dataset(col, data=data)
                    except Exception:
                        self.logger.warning(
                            f"Failed to save column '{col}' with dtype '{dtype}'. It may contain unsupported data types.",
                        )
            features_group.attrs["columns"] = list(features.columns)

        # Store arrays
        if self.ms1_df is not None and include_ms1:
            # the df is a polars DataFrame
            for col in self.ms1_df.columns:
                ms1_group.create_dataset(
                    col,
                    data=self.ms1_df[col].to_numpy(),
                    compression="gzip",
                )

        # Store parameters as JSON
        if self.parameters is not None:
            # Convert parameters dict to JSON string
            params_json = json.dumps(self.parameters, default=str)
            metadata_group.attrs["parameters"] = params_json

        # Store lib and lib_match - removed (no longer saving lib data)

    self.logger.info(f"Sample saved successfully to {filename}")
    if self.features is not None:
        # save the features as a separate file
        self._save_featureXML(filename=filename.replace(".sample5", ".featureXML"))

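# NOTE: The following helper is an illustrative sketch added for exposition and
# is not part of the masster package. It demonstrates the compression trade-off
# named in the _save_sample5 docstring: LZF + byte-shuffle for numeric arrays
# versus gzip for serialized string payloads. (h5py's gzip default is level 4;
# the level-6 setting mentioned in the docstring requires compression_opts.)
def _compression_demo(path="compression_demo.h5"):
    rt = np.linspace(0.0, 120.0, 100_000)  # hypothetical numeric column
    labels = np.array([b"feature_%d" % i for i in range(1000)])  # byte strings

    with h5py.File(path, "w") as f:
        # Fast to write and read; well suited to float/int arrays.
        f.create_dataset("rt", data=rt, compression="lzf", shuffle=True)
        # Smaller but slower; suited to text such as serialized JSON objects.
        f.create_dataset("labels", data=labels, compression="gzip", compression_opts=6)
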
def _load_sample5(self, filename: str, map: bool = True):
    """
    Load instance data from a sample5 HDF5 file.

    Restores all attributes that were saved with save_sample5() method using the
    schema defined in sample5_schema.json for proper Polars DataFrame reconstruction.

    Args:
        filename (str): Path to the sample5 HDF5 file to load.
        map (bool, optional): Whether to map featureXML file if available. Defaults to True.

    Returns:
        None (modifies self in place)

    Notes:
        - Restores DataFrames with proper schema typing from sample5_schema.json
        - Handles Chromatogram and Spectrum object reconstruction
        - Properly handles MS2 scan lists and spectrum lists
    """
    # Load schema for proper DataFrame reconstruction
    schema_path = os.path.join(os.path.dirname(__file__), "sample5_schema.json")
    try:
        with open(schema_path) as f:
            schema = json.load(f)
    except FileNotFoundError:
        self.logger.warning(
            f"Schema file {schema_path} not found. Using default types.",
        )
        schema = {}

    with h5py.File(filename, "r") as f:
        # Load metadata
        if "metadata" in f:
            metadata_group = f["metadata"]
            self.file_path = decode_metadata_attr(metadata_group.attrs.get("file_path", ""))

            # Load file_source if it exists, otherwise set it equal to file_path
            if "file_source" in metadata_group.attrs:
                self.file_source = decode_metadata_attr(metadata_group.attrs.get("file_source", ""))
            else:
                self.file_source = self.file_path

            self.file_type = decode_metadata_attr(metadata_group.attrs.get("file_type", ""))
            self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))

            # Load parameters from JSON in metadata
            loaded_data = load_parameters_from_metadata(metadata_group)

            # Always create a fresh sample_defaults object
            from masster.sample.defaults.sample_def import sample_defaults

            self.parameters = sample_defaults()

            # Initialize history and populate from loaded data
            self.history = {}
            if loaded_data is not None and isinstance(loaded_data, dict):
                # Store the loaded data in history
                self.history = loaded_data
                # If there are sample parameters in the history, use them to update defaults
                if "sample" in loaded_data:
                    sample_params = loaded_data["sample"]
                    if isinstance(sample_params, dict):
                        self.parameters.set_from_dict(sample_params, validate=False)

        # Load scans_df
        if "scans" in f:
            scans_group = f["scans"]
            data: dict[str, Any] = {}
            missing_columns = []
            for col in schema.get("scans_df", {}).get("columns", []):
                if col not in scans_group:
                    self.logger.debug(f"Column '{col}' not found in sample5/scans.")
                    data[col] = None
                    missing_columns.append(col)
                    continue

                dtype = schema["scans_df"]["columns"][col].get("dtype", "native")
                match dtype:
                    case "pl.Object":
                        self.logger.debug(f"Unexpected Object column '{col}'")
                        data[col] = None
                        missing_columns.append(col)

                    case _:
                        data[col] = scans_group[col][:]

            # create polars DataFrame from data
            if data:
                self.scans_df = pl.DataFrame(data)

                # Convert "None" strings and NaN values to proper null values
                for col in self.scans_df.columns:
                    if self.scans_df[col].dtype == pl.Utf8:  # String columns
                        self.scans_df = self.scans_df.with_columns([
                            pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
                            .then(None)
                            .otherwise(pl.col(col))
                            .alias(col),
                        ])
                    elif self.scans_df[col].dtype in [
                        pl.Float64,
                        pl.Float32,
                    ]:  # Float columns
                        self.scans_df = self.scans_df.with_columns([
                            pl.col(col).fill_nan(None).alias(col),
                        ])

                # update all columns with schema types
                for col in self.scans_df.columns:
                    if col in schema.get("scans_df", {}).get("columns", {}):
                        try:
                            dtype_str = schema["scans_df"]["columns"][col]["dtype"]
                            # Convert dtype string to actual polars dtype
                            if dtype_str.startswith("pl."):
                                # Skip Object columns - they're already properly reconstructed
                                if "Object" in dtype_str:
                                    continue
                                # Handle different polars data types
                                if "Int" in dtype_str:
                                    # Convert to numeric first, handling different input types
                                    if self.scans_df[col].dtype == pl.Utf8:
                                        # String data - convert to integer
                                        self.scans_df = self.scans_df.with_columns(
                                            pl.col(col).str.to_integer().cast(eval(dtype_str)),
                                        )
                                    elif self.scans_df[col].dtype in [
                                        pl.Float64,
                                        pl.Float32,
                                    ]:
                                        # Float data - cast to integer
                                        self.scans_df = self.scans_df.with_columns(
                                            pl.col(col).cast(eval(dtype_str)),
                                        )
                                    else:
                                        # Try direct casting
                                        self.scans_df = self.scans_df.with_columns(
                                            pl.col(col).cast(eval(dtype_str)),
                                        )
                                elif "Float" in dtype_str:
                                    # Convert to float, handling different input types
                                    if self.scans_df[col].dtype == pl.Utf8:
                                        # String data - convert to float
                                        self.scans_df = self.scans_df.with_columns(
                                            pl.col(col).str.to_decimal().cast(eval(dtype_str)),
                                        )
                                    else:
                                        # Try direct casting
                                        self.scans_df = self.scans_df.with_columns(
                                            pl.col(col).cast(eval(dtype_str)),
                                        )
                                elif "Utf8" in dtype_str:
                                    # Ensure it's string type
                                    self.scans_df = self.scans_df.with_columns(
                                        pl.col(col).cast(pl.Utf8),
                                    )
                                else:
                                    # Handle special cases and try direct casting for other types
                                    current_dtype = self.scans_df[col].dtype
                                    target_dtype = eval(dtype_str)

                                    # Handle binary data that might need string conversion first
                                    if "Binary" in str(current_dtype):
                                        # Convert binary to string first, then to target type
                                        if target_dtype == pl.Utf8:
                                            self.scans_df = self.scans_df.with_columns(
                                                pl.col(col)
                                                .map_elements(
                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                                    return_dtype=pl.Utf8,
                                                )
                                                .cast(target_dtype),
                                            )
                                        elif "Int" in str(target_dtype):
                                            self.scans_df = self.scans_df.with_columns(
                                                pl.col(col)
                                                .map_elements(
                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                                    return_dtype=pl.Utf8,
                                                )
                                                .str.to_integer()
                                                .cast(target_dtype),
                                            )
                                        elif "Float" in str(target_dtype):
                                            self.scans_df = self.scans_df.with_columns(
                                                pl.col(col)
                                                .map_elements(
                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                                    return_dtype=pl.Utf8,
                                                )
                                                .str.to_decimal()
                                                .cast(target_dtype),
                                            )
                                        else:
                                            # Try direct casting
                                            self.scans_df = self.scans_df.with_columns(
                                                pl.col(col).cast(target_dtype),
                                            )
                                    else:
                                        # Try direct casting for non-binary types
                                        self.scans_df = self.scans_df.with_columns(
                                            pl.col(col).cast(target_dtype),
                                        )
                        except Exception as e:
                            self.logger.warning(
                                f"Failed to cast column '{col}' in scans_df: {e}",
                            )
                    else:
                        self.logger.warning(
                            f"Column '{col}' in scans_df not found in schema, keeping original type.",
                        )

                # Ensure column order matches schema order
                if "scans_df" in schema and "columns" in schema["scans_df"]:
                    schema_column_order = list(schema["scans_df"]["columns"].keys())
                    # Only reorder columns that exist in both schema and DataFrame
                    existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
                    if existing_columns:
                        self.scans_df = self.scans_df.select(existing_columns)

            else:
                self.scans_df = None
        else:
            self.scans_df = None

        # Load features_df
        if "features" in f:
            features_group = f["features"]
            # columns = list(features_group.attrs.get('columns', []))
            data = {}
            missing_columns = []
            for col in schema.get("features_df", {}).get("columns", []):
                if col not in features_group:
                    self.logger.debug(
                        f"Column '{col}' not found in sample5/features.",
                    )
                    data[col] = None
                    missing_columns.append(col)
                    continue

                dtype = schema["features_df"]["columns"][col].get("dtype", "native")
                match dtype:
                    case "pl.Object":
                        match col:
                            case "chrom":
                                data_col = features_group[col][:]
                                # Convert JSON strings back to Chromatogram objects
                                reconstructed_data: list[Any] = []
                                for item in data_col:
                                    if isinstance(item, bytes):
                                        item = item.decode("utf-8")

                                    if item == "None" or item == "":
                                        reconstructed_data.append(None)
                                    else:
                                        try:
                                            reconstructed_data.append(
                                                Chromatogram.from_json(item),
                                            )
                                        except (json.JSONDecodeError, ValueError):
                                            reconstructed_data.append(None)

                                data[col] = reconstructed_data
                            case "ms2_scans":
                                data_col = features_group[col][:]
                                # Convert JSON strings back to lists of integers
                                reconstructed_data = []
                                for item in data_col:
                                    if isinstance(item, bytes):
                                        item = item.decode("utf-8")

                                    if item == "None":
                                        reconstructed_data.append(None)
                                    else:
                                        try:
                                            # Parse JSON string to get list of integers
                                            scan_list = json.loads(item)
                                            reconstructed_data.append(scan_list)
                                        except (json.JSONDecodeError, ValueError):
                                            reconstructed_data.append(None)

                                data[col] = reconstructed_data
                            case "ms2_specs":
                                data_col = features_group[col][:]
                                # Convert JSON strings back to lists of Spectrum objects
                                reconstructed_data = []
                                for item in data_col:
                                    if isinstance(item, bytes):
                                        item = item.decode("utf-8")

                                    # Parse the outer JSON (list of JSON strings)
                                    json_list = json.loads(item)

                                    if json_list == ["None"]:
                                        # This was originally None
                                        reconstructed_data.append(None)
                                    else:
                                        # This was originally a list of Spectrum objects
                                        spectrum_list: list[Any] = []
                                        for json_str in json_list:
                                            if json_str == "None":
                                                spectrum_list.append(None)
                                            else:
                                                spectrum_list.append(
                                                    Spectrum.from_json(json_str),
                                                )
                                        reconstructed_data.append(spectrum_list)

                                data[col] = reconstructed_data
                            case _:
                                self.logger.debug(f"Unexpected Object column '{col}'")
                                data[col] = None
                                missing_columns.append(col)

                    case _:
                        data[col] = features_group[col][:]

            # create polars DataFrame from data
            if data:
                # Build schema for DataFrame creation to handle Object columns properly
                df_schema = {}
                for col, values in data.items():
                    if col in schema.get("features_df", {}).get("columns", {}):
                        dtype_str = schema["features_df"]["columns"][col]["dtype"]
                        if dtype_str == "pl.Object":
                            df_schema[col] = pl.Object
                        else:
                            # Let Polars infer the type initially, we'll cast later
                            df_schema[col] = None
                    else:
                        df_schema[col] = None

                # Create DataFrame with explicit Object types where needed
                try:
                    self.features_df = pl.DataFrame(data, schema=df_schema)
                except Exception:
                    # Fallback: create without schema and handle Object columns manually
                    object_columns = {
                        k: v
                        for k, v in data.items()
                        if k in schema.get("features_df", {}).get("columns", {})
                        and schema["features_df"]["columns"][k]["dtype"] == "pl.Object"
                    }
                    regular_columns = {k: v for k, v in data.items() if k not in object_columns}

                    # Create DataFrame with regular columns first
                    if regular_columns:
                        self.features_df = pl.DataFrame(regular_columns)
                        # Add Object columns one by one
                        for col, values in object_columns.items():
                            self.features_df = self.features_df.with_columns([
                                pl.Series(col, values, dtype=pl.Object),
                            ])
                    else:
                        # Only Object columns
                        self.features_df = pl.DataFrame()
                        for col, values in object_columns.items():
                            self.features_df = self.features_df.with_columns([
                                pl.Series(col, values, dtype=pl.Object),
                            ])

                # update all columns with schema types (skip Object columns)
                for col in self.features_df.columns:
                    if col in schema.get("features_df", {}).get("columns", {}):
                        try:
                            dtype_str = schema["features_df"]["columns"][col]["dtype"]
                            # Convert dtype string to actual polars dtype
                            if dtype_str.startswith("pl."):
                                # Skip Object columns - they're already properly reconstructed
                                if "Object" in dtype_str:
                                    continue
                                # Handle different polars data types
                                if "Int" in dtype_str:
                                    # Convert to numeric first, handling different input types
                                    if self.features_df[col].dtype == pl.Utf8:
                                        # String data - convert to integer
                                        self.features_df = self.features_df.with_columns(
                                            pl.col(col).str.to_integer().cast(eval(dtype_str)),
                                        )
                                    elif self.features_df[col].dtype in [
                                        pl.Float64,
                                        pl.Float32,
                                    ]:
                                        # Float data - cast to integer with null handling for NaN values
                                        self.features_df = self.features_df.with_columns(
                                            pl.col(col).cast(eval(dtype_str), strict=False),
                                        )
                                    else:
                                        # Handle special cases and try direct casting for other types
                                        current_dtype = self.features_df[col].dtype
                                        target_dtype = eval(dtype_str)

                                        # Handle binary data that might need string conversion first
                                        if "Binary" in str(current_dtype):
                                            # Convert binary to string first, then to target type
                                            if target_dtype == pl.Utf8:
                                                self.features_df = self.features_df.with_columns(
                                                    pl.col(col)
                                                    .map_elements(
                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                                        return_dtype=pl.Utf8,
                                                    )
                                                    .cast(target_dtype),
                                                )
                                            elif "Int" in str(target_dtype):
                                                self.features_df = self.features_df.with_columns(
                                                    pl.col(col)
                                                    .map_elements(
                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                                        return_dtype=pl.Utf8,
                                                    )
                                                    .str.to_integer()
                                                    .cast(target_dtype),
                                                )
                                            elif "Float" in str(target_dtype):
                                                self.features_df = self.features_df.with_columns(
                                                    pl.col(col)
                                                    .map_elements(
                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                                        return_dtype=pl.Utf8,
                                                    )
                                                    .str.to_decimal()
                                                    .cast(target_dtype),
                                                )
                                            else:
                                                # Try direct casting
                                                self.features_df = self.features_df.with_columns(
                                                    pl.col(col).cast(target_dtype),
                                                )
                                        else:
                                            # Try direct casting for non-binary types
                                            self.features_df = self.features_df.with_columns(
                                                pl.col(col).cast(target_dtype),
                                            )
                                elif "Float" in dtype_str:
                                    # Convert to float, handling different input types
                                    if self.features_df[col].dtype == pl.Utf8:
                                        # String data - convert to float
                                        self.features_df = self.features_df.with_columns(
                                            pl.col(col).str.to_decimal().cast(eval(dtype_str)),
                                        )
                                    else:
                                        # Handle special cases and try direct casting for other types
                                        current_dtype = self.features_df[col].dtype
                                        target_dtype = eval(dtype_str)

                                        # Handle binary data that might need string conversion first
                                        if "Binary" in str(current_dtype):
                                            # Convert binary to string first, then to target type
                                            if target_dtype == pl.Utf8:
                                                self.features_df = self.features_df.with_columns(
                                                    pl.col(col)
                                                    .map_elements(
                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                                        return_dtype=pl.Utf8,
                                                    )
                                                    .cast(target_dtype),
                                                )
                                            elif "Int" in str(target_dtype):
                                                self.features_df = self.features_df.with_columns(
                                                    pl.col(col)
                                                    .map_elements(
                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                                        return_dtype=pl.Utf8,
                                                    )
                                                    .str.to_integer()
                                                    .cast(target_dtype),
                                                )
                                            elif "Float" in str(target_dtype):
                                                self.features_df = self.features_df.with_columns(
                                                    pl.col(col)
                                                    .map_elements(
                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                                        return_dtype=pl.Utf8,
                                                    )
                                                    .str.to_decimal()
                                                    .cast(target_dtype),
                                                )
                                            else:
                                                # Try direct casting
                                                self.features_df = self.features_df.with_columns(
                                                    pl.col(col).cast(target_dtype),
                                                )
                                        else:
                                            # Try direct casting for non-binary types
                                            self.features_df = self.features_df.with_columns(
                                                pl.col(col).cast(target_dtype),
                                            )
                                elif "Utf8" in dtype_str:
                                    # Ensure it's string type
                                    self.features_df = self.features_df.with_columns(
                                        pl.col(col).cast(pl.Utf8),
                                    )
                                else:
                                    # Handle special cases and try direct casting for other types
                                    current_dtype = self.features_df[col].dtype
                                    target_dtype = eval(dtype_str)

                                    # Handle binary data that might need string conversion first
                                    if "Binary" in str(current_dtype):
                                        # Convert binary to string first, then to target type
                                        if target_dtype == pl.Utf8:
                                            self.features_df = self.features_df.with_columns(
                                                pl.col(col)
                                                .map_elements(
                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                                    return_dtype=pl.Utf8,
                                                )
                                                .cast(target_dtype),
                                            )
                                        elif "Int" in str(target_dtype):
                                            self.features_df = self.features_df.with_columns(
                                                pl.col(col)
                                                .map_elements(
                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                                    return_dtype=pl.Utf8,
                                                )
                                                .str.to_integer()
                                                .cast(target_dtype),
                                            )
                                        elif "Float" in str(target_dtype):
                                            self.features_df = self.features_df.with_columns(
                                                pl.col(col)
                                                .map_elements(
                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                                    return_dtype=pl.Utf8,
                                                )
                                                .str.to_decimal()
                                                .cast(target_dtype),
                                            )
                                        else:
                                            # Try direct casting
                                            self.features_df = self.features_df.with_columns(
                                                pl.col(col).cast(target_dtype),
                                            )
                                    else:
                                        # Try direct casting for non-binary types
                                        self.features_df = self.features_df.with_columns(
                                            pl.col(col).cast(target_dtype),
                                        )
                        except Exception as e:
                            self.logger.warning(
                                f"Failed to cast column '{col}' in features_df: {e}",
                            )
                    else:
                        self.logger.warning(
                            f"Column '{col}' in features_df not found in schema, keeping original type.",
                        )

                # FINAL null conversion pass - after all type casting is done
                # This ensures "None" strings introduced by failed conversions are properly handled
                for col in self.features_df.columns:
                    if self.features_df[col].dtype == pl.Utf8:  # String columns
                        self.features_df = self.features_df.with_columns([
                            pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
                            .then(None)
                            .otherwise(pl.col(col))
                            .alias(col),
                        ])
                    # Float columns
                    elif self.features_df[col].dtype in [pl.Float64, pl.Float32]:
                        self.features_df = self.features_df.with_columns([
                            pl.col(col).fill_nan(None).alias(col),
                        ])

                # Ensure column order matches schema order
                if "features_df" in schema and "columns" in schema["features_df"]:
                    schema_column_order = list(schema["features_df"]["columns"].keys())
                    # Only reorder columns that exist in both schema and DataFrame
                    existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
                    if existing_columns:
                        self.features_df = self.features_df.select(existing_columns)

            else:
                self.features_df = None
        else:
            self.features_df = None

        # Load ms1_df
        if "ms1" in f:
            ms1_group = f["ms1"]
            data = {}

            # Get all datasets in the ms1 group
            for col in ms1_group.keys():
                data[col] = ms1_group[col][:]

            if data:
                # Create DataFrame directly with Polars
                self.ms1_df = pl.DataFrame(data)

                # Apply schema if available
                if "ms1_df" in schema and "columns" in schema["ms1_df"]:
                    schema_columns = schema["ms1_df"]["columns"]
                    for col in self.ms1_df.columns:
                        if col in schema_columns:
                            dtype_str = schema_columns[col]["dtype"]
                            try:
                                if "Int" in dtype_str:
                                    self.ms1_df = self.ms1_df.with_columns([
                                        pl.col(col).cast(pl.Int64, strict=False),
                                    ])
                                elif "Float" in dtype_str:
                                    self.ms1_df = self.ms1_df.with_columns([
                                        pl.col(col).cast(pl.Float64, strict=False),
                                    ])
                            except Exception as e:
                                self.logger.warning(
                                    f"Failed to apply schema type {dtype_str} to column {col}: {e}",
                                )

                # Convert "None" strings and NaN values to proper null values
                self.ms1_df = clean_null_values_polars(self.ms1_df)
            else:
                self.ms1_df = None
        else:
            self.ms1_df = None

    # Parameters are now loaded from metadata JSON (see above)
    # Lib and lib_match are no longer saved/loaded

    if map:
        featureXML = filename.replace(".sample5", ".featureXML")
        if os.path.exists(featureXML):
            self._load_featureXML(featureXML)
            self._features_sync()
        else:
            self.logger.warning(
                f"Feature XML file {featureXML} not found, skipping loading.",
            )

    # set self.file_path to *.sample5
    self.file_path = filename
    # set self.label to basename without extension
    if self.label is None or self.label == "":
        self.label = os.path.splitext(os.path.basename(filename))[0]
    self.logger.info(f"Sample loaded successfully from {filename}")

900
|
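
The FINAL null-conversion pass above normalizes string sentinels ("None", "", "null", "NULL") and float NaN into proper Polars nulls. A minimal standalone sketch of the same pattern, with hypothetical column names:

import polars as pl

df = pl.DataFrame({"adduct": ["[M+H]+", "None", ""], "rt": [1.2, float("nan"), 3.4]})
df = df.with_columns(
    # string sentinels -> null
    pl.when(pl.col("adduct").is_in(["None", "", "null", "NULL"]))
    .then(None)
    .otherwise(pl.col("adduct"))
    .alias("adduct"),
    # NaN -> null
    pl.col("rt").fill_nan(None),
)
print(df.null_count())  # adduct: 2, rt: 1
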
def load_schema(schema_path: str) -> Dict[str, Any]:
    """
    Load schema from JSON file with error handling.

    Args:
        schema_path: Path to the schema JSON file

    Returns:
        Dictionary containing the schema, empty dict if not found
    """
    try:
        with open(schema_path) as f:
            return json.load(f)  # type: ignore
    except FileNotFoundError:
        return {}
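
Usage is a one-liner; a sketch with a hypothetical schema path (a missing file simply yields an empty dict):

schema = load_schema("sample5_schema.json")  # hypothetical path
for name, spec in schema.get("features_df", {}).get("columns", {}).items():
    print(name, spec["dtype"])
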
def decode_metadata_attr(attr_value: Any) -> str:
    """
    Decode metadata attribute, handling both bytes and string types.

    Args:
        attr_value: The attribute value to decode

    Returns:
        String representation of the attribute
    """
    if isinstance(attr_value, bytes):
        return attr_value.decode()
    return str(attr_value) if attr_value is not None else ""
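
h5py can hand attributes back as bytes or str depending on how they were written, so this helper normalizes both; None becomes an empty string:

print(decode_metadata_attr(b"positive"))  # "positive"
print(decode_metadata_attr("positive"))   # "positive"
print(decode_metadata_attr(None))         # ""
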
def clean_null_values_polars(df: pl.DataFrame) -> pl.DataFrame:
    """
    Clean null values in a Polars DataFrame by converting string nulls to proper nulls.

    Args:
        df: The Polars DataFrame to clean

    Returns:
        Cleaned DataFrame
    """
    cleaned_df = df
    for col in df.columns:
        if df[col].dtype == pl.Utf8:  # String columns
            cleaned_df = cleaned_df.with_columns([
                pl.when(pl.col(col).is_in(["None", "", "null", "NULL"])).then(None).otherwise(pl.col(col)).alias(col),
            ])
        elif df[col].dtype in [pl.Float64, pl.Float32]:  # Float columns
            cleaned_df = cleaned_df.with_columns([
                pl.col(col).fill_nan(None).alias(col),
            ])
    return cleaned_df
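
A quick round-trip on hypothetical columns shows the effect:

df = pl.DataFrame({"label": ["QC_01", "null", "NULL"], "score": [0.9, float("nan"), 0.7]})
print(clean_null_values_polars(df).null_count())  # label: 2, score: 1
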
def cast_column_by_dtype(df: pl.DataFrame, col: str, dtype_str: str) -> pl.DataFrame:
    """
    Cast a Polars DataFrame column to the specified dtype with appropriate handling.

    Args:
        df: The Polars DataFrame
        col: Column name to cast
        dtype_str: Target dtype as string (e.g., 'pl.Int64')

    Returns:
        DataFrame with the column cast to the new type
    """
    if not dtype_str.startswith("pl.") or "Object" in dtype_str:
        return df

    try:
        target_dtype = eval(dtype_str)
        current_dtype = df[col].dtype

        if "Int" in dtype_str:
            return _cast_to_int(df, col, current_dtype, target_dtype)
        elif "Float" in dtype_str:
            return _cast_to_float(df, col, current_dtype, target_dtype)
        elif "Utf8" in dtype_str:
            return df.with_columns(pl.col(col).cast(pl.Utf8))
        else:
            return _cast_with_binary_handling(df, col, current_dtype, target_dtype)

    except Exception:
        return df
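
The dtype arrives as a string from the JSON schema and is resolved with eval, so only "pl.*" names are honored and any failure falls back to returning the frame unchanged. A sketch (str.to_integer assumes a reasonably recent Polars):

df = pl.DataFrame({"charge": ["1", "2", "3"]})
df = cast_column_by_dtype(df, "charge", "pl.Int64")
print(df.dtypes)  # [Int64]
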
def _cast_to_int(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
    """Helper function to cast column to integer type."""
    if current_dtype == pl.Utf8:
        return df.with_columns(
            pl.col(col).str.to_integer().cast(target_dtype),
        )
    elif current_dtype in [pl.Float64, pl.Float32]:
        return df.with_columns(pl.col(col).cast(target_dtype))
    else:
        return _cast_with_binary_handling(df, col, current_dtype, target_dtype)


def _cast_to_float(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
    """Helper function to cast column to float type."""
    if current_dtype == pl.Utf8:
        return df.with_columns(
            pl.col(col).str.to_decimal().cast(target_dtype),
        )
    else:
        return _cast_with_binary_handling(df, col, current_dtype, target_dtype)


def _cast_with_binary_handling(
    df: pl.DataFrame,
    col: str,
    current_dtype: pl.DataType,
    target_dtype: pl.DataType,
) -> pl.DataFrame:
    """Helper function to handle binary data conversion."""
    if "Binary" in str(current_dtype):
        if target_dtype == pl.Utf8:
            return df.with_columns(
                pl.col(col)
                .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
                .cast(target_dtype),
            )
        elif "Int" in str(target_dtype):
            return df.with_columns(
                pl.col(col)
                .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
                .str.to_integer()
                .cast(target_dtype),
            )
        elif "Float" in str(target_dtype):
            return df.with_columns(
                pl.col(col)
                .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
                .str.to_decimal()
                .cast(target_dtype),
            )

    # Fallback: try direct casting
    return df.with_columns(pl.col(col).cast(target_dtype))
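
Byte strings are what h5py commonly returns for HDF5 string datasets, so Binary columns are decoded element-wise before any numeric parsing. A sketch:

df = pl.DataFrame({"polarity": [b"positive", b"negative"]})
df = _cast_with_binary_handling(df, "polarity", df["polarity"].dtype, pl.Utf8)
print(df["polarity"].to_list())  # ['positive', 'negative']
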
def apply_schema_to_dataframe(df: pl.DataFrame, schema: Dict[str, Any], df_name: str) -> pl.DataFrame:
    """
    Apply schema type casting to a Polars DataFrame.

    Args:
        df: The DataFrame to modify
        schema: The schema dictionary
        df_name: Name of the DataFrame in the schema (e.g., 'scans_df', 'features_df')

    Returns:
        DataFrame with schema types applied
    """
    df_schema = schema.get(df_name, {}).get("columns", {})

    for col in df.columns:
        if col in df_schema:
            dtype_str = df_schema[col]["dtype"]
            df = cast_column_by_dtype(df, col, dtype_str)

    return df
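
A toy schema in the same shape as the sample5 schema (column names hypothetical; str.to_decimal assumes a recent Polars) illustrates the casting pass:

schema = {"features_df": {"columns": {"mz": {"dtype": "pl.Float64"}, "charge": {"dtype": "pl.Int64"}}}}
df = pl.DataFrame({"mz": ["301.14", "415.21"], "charge": [1.0, 2.0]})
print(apply_schema_to_dataframe(df, schema, "features_df").dtypes)  # [Float64, Int64]
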
def reconstruct_object_column(data_col: np.ndarray, col_name: str) -> List[Any]:
    """
    Reconstruct object columns from serialized data.

    Args:
        data_col: Array containing serialized data
        col_name: Name of the column for type-specific reconstruction

    Returns:
        List of reconstructed objects
    """
    reconstructed_data: list[Any] = []

    for item in data_col:
        if isinstance(item, bytes):
            item = item.decode("utf-8")

        if item == "None" or item == "":
            reconstructed_data.append(None)
            continue

        try:
            if col_name == "chrom":
                reconstructed_data.append(Chromatogram.from_json(item))
            elif col_name == "ms2_scans":
                scan_list = json.loads(item)
                reconstructed_data.append(scan_list)
            elif col_name == "ms2_specs":
                json_list = json.loads(item)
                if json_list == ["None"]:
                    reconstructed_data.append(None)
                else:
                    spectrum_list: list[Any] = []
                    for json_str in json_list:
                        if json_str == "None":
                            spectrum_list.append(None)
                        else:
                            spectrum_list.append(Spectrum.from_json(json_str))
                    reconstructed_data.append(spectrum_list)
            else:
                # Unknown object column
                reconstructed_data.append(None)
        except (json.JSONDecodeError, ValueError):
            reconstructed_data.append(None)

    return reconstructed_data
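
For instance, an "ms2_scans" column stored as JSON-encoded byte strings decodes back to Python lists, with sentinels mapped to None (data hypothetical):

import numpy as np

raw = np.array([b"[3, 7]", b"None", b"[12]"], dtype=object)
print(reconstruct_object_column(raw, "ms2_scans"))  # [[3, 7], None, [12]]
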
def load_dataframe_from_h5_group(
    group: h5py.Group,
    schema: Dict[str, Any],
    df_name: str,
    logger: Optional[Any] = None,
) -> Tuple[Optional[pl.DataFrame], List[str]]:
    """
    Load a Polars DataFrame from an HDF5 group using schema.

    Args:
        group: The HDF5 group containing the DataFrame data
        schema: The schema dictionary
        df_name: Name of the DataFrame in the schema
        logger: Optional logger for warnings

    Returns:
        Tuple of (DataFrame or None, list of missing columns)
    """
    data: dict[str, Any] = {}
    missing_columns = []

    # Load columns according to schema
    schema_columns = schema.get(df_name, {}).get("columns", [])

    for col in schema_columns:
        if col not in group:
            if logger:
                logger.warning(f"Column '{col}' not found in {df_name}.")
            data[col] = None
            missing_columns.append(col)
            continue

        dtype = schema[df_name]["columns"][col].get("dtype", "native")

        if dtype == "pl.Object":
            # Handle object columns specially
            data[col] = reconstruct_object_column(group[col][:], col)
        else:
            data[col] = group[col][:]

    if not data:
        return None, missing_columns

    # Create DataFrame with proper schema for Object columns
    df_schema = {}
    for col, values in data.items():
        if col in schema_columns:
            dtype_str = schema[df_name]["columns"][col]["dtype"]
            if dtype_str == "pl.Object":
                df_schema[col] = pl.Object

    try:
        if df_schema:
            df = pl.DataFrame(data, schema=df_schema)
        else:
            df = pl.DataFrame(data)
    except Exception:
        # Fallback: handle Object columns manually
        df = _create_dataframe_with_object_columns(data, schema, df_name)

    # Clean null values
    df = clean_null_values_polars(df)

    # Apply schema type casting
    df = apply_schema_to_dataframe(df, schema, df_name)

    return df, missing_columns
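
A sketch of loading the features table from an existing .sample5 file (file and schema paths hypothetical; the "features" group name follows the loader above):

with h5py.File("example.sample5", "r") as f:
    schema = load_schema("sample5_schema.json")
    features_df, missing = load_dataframe_from_h5_group(f["features"], schema, "features_df")
    if missing:
        print(f"columns absent from the file: {missing}")
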
def _create_dataframe_with_object_columns(
    data: Dict[str, Any],
    schema: Dict[str, Any],
    df_name: str,
) -> pl.DataFrame:
    """
    Create DataFrame handling Object columns manually when schema creation fails.

    Args:
        data: Dictionary of column data
        schema: The schema dictionary
        df_name: Name of the DataFrame in the schema

    Returns:
        Polars DataFrame with Object columns properly handled
    """
    schema_columns = schema.get(df_name, {}).get("columns", {})

    object_columns = {
        k: v for k, v in data.items() if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
    }
    regular_columns = {k: v for k, v in data.items() if k not in object_columns}

    # Create DataFrame with regular columns first
    if regular_columns:
        df = pl.DataFrame(regular_columns)
        # Add Object columns one by one
        for col, values in object_columns.items():
            df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
    else:
        # Only Object columns
        df = pl.DataFrame()
        for col, values in object_columns.items():
            df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])

    return df


def load_ms1_dataframe_from_h5_group(
    group: h5py.Group,
    schema: Dict[str, Any],
    logger: Optional[Any] = None,
) -> Optional[pl.DataFrame]:
    """
    Load MS1 DataFrame from HDF5 group.

    Args:
        group: The HDF5 group containing MS1 data
        schema: The schema dictionary
        logger: Optional logger for warnings

    Returns:
        Polars DataFrame or None
    """
    data = {}

    # Get all datasets in the ms1 group
    for col in group.keys():
        data[col] = group[col][:]

    if not data:
        return None

    # Create DataFrame directly with Polars
    ms1_df = pl.DataFrame(data)

    # Apply schema if available
    if "ms1_df" in schema and "columns" in schema["ms1_df"]:
        schema_columns = schema["ms1_df"]["columns"]
        for col in ms1_df.columns:
            if col in schema_columns:
                dtype_str = schema_columns[col]["dtype"]
                try:
                    if "Int" in dtype_str:
                        ms1_df = ms1_df.with_columns([
                            pl.col(col).cast(pl.Int64, strict=False),
                        ])
                    elif "Float" in dtype_str:
                        ms1_df = ms1_df.with_columns([
                            pl.col(col).cast(pl.Float64, strict=False),
                        ])
                except Exception as e:
                    if logger:
                        logger.warning(
                            f"Failed to apply schema type {dtype_str} to column {col}: {e}",
                        )

    # Convert "None" strings and NaN values to proper null values
    return clean_null_values_polars(ms1_df)
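
Unlike the schema-driven loader above, this one takes whatever datasets the "ms1" group contains and only afterwards applies Int64/Float64 casts. Under the same hypothetical paths:

with h5py.File("example.sample5", "r") as f:
    ms1_df = load_ms1_dataframe_from_h5_group(f["ms1"], load_schema("sample5_schema.json"))
    if ms1_df is not None:
        print(ms1_df.shape)
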
def load_parameters_from_metadata(metadata_group: h5py.Group) -> Optional[Dict[str, Any]]:
    """
    Load parameters from HDF5 metadata group.

    Args:
        metadata_group: The HDF5 metadata group containing parameters

    Returns:
        Dictionary of parameters or None if not found
    """
    if "parameters" in metadata_group.attrs:
        try:
            params_json = decode_metadata_attr(metadata_group.attrs["parameters"])
            # Ensure params_json is a string before attempting JSON decode
            if isinstance(params_json, str) and params_json.strip():
                result = json.loads(params_json)
                # Ensure the result is a dictionary
                if isinstance(result, dict):
                    return result
        except (json.JSONDecodeError, ValueError, TypeError) as e:
            # Log the error for debugging
            print(f"Warning: Failed to parse parameters JSON: {e}")
            print(f"Raw parameter data type: {type(params_json)}")
            print(f"Raw parameter data: {repr(params_json)}")
    return None
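
The parameters travel as one JSON string attribute on the metadata group; a self-contained round-trip with a throwaway file and a hypothetical parameter name:

with h5py.File("params_demo.h5", "w") as f:
    f.create_group("metadata").attrs["parameters"] = json.dumps({"mz_tol": 0.01})

with h5py.File("params_demo.h5", "r") as f:
    print(load_parameters_from_metadata(f["metadata"]))  # {'mz_tol': 0.01}
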
def create_h5_metadata_group(
    f: h5py.File,
    file_path: Optional[str],
    file_source: Optional[str],
    file_type: Optional[str],
    label: Optional[str],
) -> None:
    """
    Create and populate metadata group in HDF5 file.

    Args:
        f: The HDF5 file object
        file_path: Source file path
        file_source: Original source file path
        file_type: Source file type
        label: Sample label
    """
    metadata_group = f.create_group("metadata")
    metadata_group.attrs["format"] = "master-sample5-1"
    metadata_group.attrs["file_path"] = str(file_path) if file_path is not None else ""
    metadata_group.attrs["file_source"] = str(file_source) if file_source is not None else ""
    metadata_group.attrs["file_type"] = str(file_type) if file_type is not None else ""
    metadata_group.attrs["label"] = str(label) if label is not None else ""
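
And the writer side, with illustrative argument values only:

with h5py.File("meta_demo.sample5", "w") as f:
    create_h5_metadata_group(
        f,
        file_path="run_01.mzML",
        file_source="run_01.wiff",
        file_type="mzML",
        label="run_01",
    )
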