masster 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/chromatogram.py +2 -2
- masster/data/libs/urine.csv +3 -3
- masster/logger.py +8 -8
- masster/sample/adducts.py +337 -263
- masster/sample/defaults/find_adducts_def.py +21 -8
- masster/sample/h5.py +557 -278
- masster/sample/helpers.py +131 -75
- masster/sample/lib.py +2 -2
- masster/sample/load.py +25 -11
- masster/sample/plot.py +5 -5
- masster/sample/processing.py +115 -85
- masster/sample/sample.py +28 -15
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +34 -11
- masster/spectrum.py +2 -2
- masster/study/defaults/align_def.py +5 -1
- masster/study/defaults/identify_def.py +3 -1
- masster/study/defaults/study_def.py +58 -25
- masster/study/export.py +354 -204
- masster/study/h5.py +557 -155
- masster/study/helpers.py +487 -194
- masster/study/id.py +536 -347
- masster/study/load.py +228 -138
- masster/study/plot.py +68 -68
- masster/study/processing.py +455 -253
- masster/study/save.py +14 -4
- masster/study/study.py +122 -40
- masster/study/study5_schema.json +149 -149
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/METADATA +5 -3
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/RECORD +34 -34
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/WHEEL +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/entry_points.txt +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/licenses/LICENSE +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/top_level.txt +0 -0
masster/sample/h5.py
CHANGED
|
@@ -11,7 +11,13 @@ from masster.chromatogram import Chromatogram
|
|
|
11
11
|
from masster.spectrum import Spectrum
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
def _save_sample5(
|
|
14
|
+
def _save_sample5(
|
|
15
|
+
self,
|
|
16
|
+
filename=None,
|
|
17
|
+
include_ms1=True,
|
|
18
|
+
include_scans=True,
|
|
19
|
+
save_featurexml=False,
|
|
20
|
+
):
|
|
15
21
|
"""
|
|
16
22
|
Save the instance data to a sample5 HDF5 file with optimized compression.
|
|
17
23
|
|
|
@@ -56,14 +62,16 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
|
|
|
56
62
|
return
|
|
57
63
|
|
|
58
64
|
# synchronize feature_map if it exists
|
|
59
|
-
if hasattr(self,
|
|
65
|
+
if hasattr(self, "_feature_map") and self._feature_map is not None:
|
|
60
66
|
self._features_sync()
|
|
61
67
|
|
|
62
68
|
# if no extension is given, add .sample5
|
|
63
69
|
if not filename.endswith(".sample5"):
|
|
64
70
|
filename += ".sample5"
|
|
65
71
|
|
|
66
|
-
self.logger.debug(
|
|
72
|
+
self.logger.debug(
|
|
73
|
+
f"Saving sample to {filename} with optimized LZF+shuffle compression",
|
|
74
|
+
)
|
|
67
75
|
|
|
68
76
|
# delete existing file if it exists
|
|
69
77
|
if os.path.exists(filename):
|
|
@@ -116,12 +124,18 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
|
|
|
116
124
|
except Exception:
|
|
117
125
|
try:
|
|
118
126
|
# Try to convert to numeric using numpy
|
|
119
|
-
numeric_data = np.array(
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
127
|
+
numeric_data = np.array(
|
|
128
|
+
[
|
|
129
|
+
float(x)
|
|
130
|
+
if x is not None
|
|
131
|
+
and str(x)
|
|
132
|
+
.replace(".", "")
|
|
133
|
+
.replace("-", "")
|
|
134
|
+
.isdigit()
|
|
135
|
+
else np.nan
|
|
136
|
+
for x in data
|
|
137
|
+
],
|
|
138
|
+
)
|
|
125
139
|
if not np.isnan(numeric_data).all():
|
|
126
140
|
scans_group.create_dataset(
|
|
127
141
|
col,
|
|
@@ -149,7 +163,12 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
|
|
|
149
163
|
)
|
|
150
164
|
scans_group[col].attrs["dtype"] = "string_repr"
|
|
151
165
|
else:
|
|
152
|
-
scans_group.create_dataset(
|
|
166
|
+
scans_group.create_dataset(
|
|
167
|
+
col,
|
|
168
|
+
data=data,
|
|
169
|
+
compression="lzf",
|
|
170
|
+
shuffle=True,
|
|
171
|
+
)
|
|
153
172
|
scans_group[col].attrs["dtype"] = "native"
|
|
154
173
|
scans_group.attrs["columns"] = list(scans_df.columns)
|
|
155
174
|
|
|
@@ -226,7 +245,12 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
|
|
|
226
245
|
data = features[col].to_list()
|
|
227
246
|
# convert None to 'None' strings
|
|
228
247
|
data = ["None" if x is None else x for x in data]
|
|
229
|
-
features_group.create_dataset(
|
|
248
|
+
features_group.create_dataset(
|
|
249
|
+
col,
|
|
250
|
+
data=data,
|
|
251
|
+
compression="lzf",
|
|
252
|
+
shuffle=True,
|
|
253
|
+
)
|
|
230
254
|
else:
|
|
231
255
|
try:
|
|
232
256
|
data = features[col].to_numpy()
|
|
@@ -261,16 +285,18 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
|
|
|
261
285
|
feature_map = self._get_feature_map()
|
|
262
286
|
if feature_map is not None:
|
|
263
287
|
# Temporarily set features for save operation
|
|
264
|
-
old_features = getattr(self,
|
|
288
|
+
old_features = getattr(self, "_oms_features_map", None)
|
|
265
289
|
self._oms_features_map = feature_map
|
|
266
290
|
try:
|
|
267
|
-
self._save_featureXML(
|
|
291
|
+
self._save_featureXML(
|
|
292
|
+
filename=filename.replace(".sample5", ".featureXML"),
|
|
293
|
+
)
|
|
268
294
|
finally:
|
|
269
295
|
# Restore original features value
|
|
270
296
|
if old_features is not None:
|
|
271
297
|
self._oms_features_map = old_features
|
|
272
298
|
else:
|
|
273
|
-
delattr(self,
|
|
299
|
+
delattr(self, "_oms_features_map")
|
|
274
300
|
else:
|
|
275
301
|
self.logger.warning("Cannot save featureXML: no feature data available")
|
|
276
302
|
|
|
@@ -309,15 +335,21 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
309
335
|
# Load metadata
|
|
310
336
|
if "metadata" in f:
|
|
311
337
|
metadata_group = f["metadata"]
|
|
312
|
-
self.file_path = decode_metadata_attr(
|
|
338
|
+
self.file_path = decode_metadata_attr(
|
|
339
|
+
metadata_group.attrs.get("file_path", ""),
|
|
340
|
+
)
|
|
313
341
|
|
|
314
342
|
# Load file_source if it exists, otherwise set it equal to file_path
|
|
315
343
|
if "file_source" in metadata_group.attrs:
|
|
316
|
-
self.file_source = decode_metadata_attr(
|
|
344
|
+
self.file_source = decode_metadata_attr(
|
|
345
|
+
metadata_group.attrs.get("file_source", ""),
|
|
346
|
+
)
|
|
317
347
|
else:
|
|
318
348
|
self.file_source = self.file_path
|
|
319
349
|
|
|
320
|
-
self.file_type = decode_metadata_attr(
|
|
350
|
+
self.file_type = decode_metadata_attr(
|
|
351
|
+
metadata_group.attrs.get("file_type", ""),
|
|
352
|
+
)
|
|
321
353
|
self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
|
|
322
354
|
|
|
323
355
|
# Load parameters from JSON in metadata
|
|
@@ -368,19 +400,23 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
368
400
|
# Convert "None" strings and NaN values to proper null values
|
|
369
401
|
for col in self.scans_df.columns:
|
|
370
402
|
if self.scans_df[col].dtype == pl.Utf8: # String columns
|
|
371
|
-
self.scans_df = self.scans_df.with_columns(
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
403
|
+
self.scans_df = self.scans_df.with_columns(
|
|
404
|
+
[
|
|
405
|
+
pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
|
|
406
|
+
.then(None)
|
|
407
|
+
.otherwise(pl.col(col))
|
|
408
|
+
.alias(col),
|
|
409
|
+
],
|
|
410
|
+
)
|
|
377
411
|
elif self.scans_df[col].dtype in [
|
|
378
412
|
pl.Float64,
|
|
379
413
|
pl.Float32,
|
|
380
414
|
]: # Float columns
|
|
381
|
-
self.scans_df = self.scans_df.with_columns(
|
|
382
|
-
|
|
383
|
-
|
|
415
|
+
self.scans_df = self.scans_df.with_columns(
|
|
416
|
+
[
|
|
417
|
+
pl.col(col).fill_nan(None).alias(col),
|
|
418
|
+
],
|
|
419
|
+
)
|
|
384
420
|
|
|
385
421
|
# update all columns with schema types
|
|
386
422
|
for col in self.scans_df.columns:
|
|
@@ -398,7 +434,9 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
398
434
|
if self.scans_df[col].dtype == pl.Utf8:
|
|
399
435
|
# String data - convert to integer
|
|
400
436
|
self.scans_df = self.scans_df.with_columns(
|
|
401
|
-
pl.col(col)
|
|
437
|
+
pl.col(col)
|
|
438
|
+
.str.to_integer()
|
|
439
|
+
.cast(eval(dtype_str)),
|
|
402
440
|
)
|
|
403
441
|
elif self.scans_df[col].dtype in [
|
|
404
442
|
pl.Float64,
|
|
@@ -418,7 +456,9 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
418
456
|
if self.scans_df[col].dtype == pl.Utf8:
|
|
419
457
|
# String data - convert to float
|
|
420
458
|
self.scans_df = self.scans_df.with_columns(
|
|
421
|
-
pl.col(col)
|
|
459
|
+
pl.col(col)
|
|
460
|
+
.str.to_decimal()
|
|
461
|
+
.cast(eval(dtype_str)),
|
|
422
462
|
)
|
|
423
463
|
else:
|
|
424
464
|
# Try direct casting
|
|
@@ -442,7 +482,9 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
442
482
|
self.scans_df = self.scans_df.with_columns(
|
|
443
483
|
pl.col(col)
|
|
444
484
|
.map_elements(
|
|
445
|
-
lambda x: x.decode("utf-8")
|
|
485
|
+
lambda x: x.decode("utf-8")
|
|
486
|
+
if isinstance(x, bytes)
|
|
487
|
+
else str(x),
|
|
446
488
|
return_dtype=pl.Utf8,
|
|
447
489
|
)
|
|
448
490
|
.cast(target_dtype),
|
|
@@ -451,7 +493,9 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
451
493
|
self.scans_df = self.scans_df.with_columns(
|
|
452
494
|
pl.col(col)
|
|
453
495
|
.map_elements(
|
|
454
|
-
lambda x: x.decode("utf-8")
|
|
496
|
+
lambda x: x.decode("utf-8")
|
|
497
|
+
if isinstance(x, bytes)
|
|
498
|
+
else str(x),
|
|
455
499
|
return_dtype=pl.Utf8,
|
|
456
500
|
)
|
|
457
501
|
.str.to_integer()
|
|
@@ -461,7 +505,9 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
461
505
|
self.scans_df = self.scans_df.with_columns(
|
|
462
506
|
pl.col(col)
|
|
463
507
|
.map_elements(
|
|
464
|
-
lambda x: x.decode("utf-8")
|
|
508
|
+
lambda x: x.decode("utf-8")
|
|
509
|
+
if isinstance(x, bytes)
|
|
510
|
+
else str(x),
|
|
465
511
|
return_dtype=pl.Utf8,
|
|
466
512
|
)
|
|
467
513
|
.str.to_decimal()
|
|
@@ -490,7 +536,9 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
490
536
|
if "scans_df" in schema and "columns" in schema["scans_df"]:
|
|
491
537
|
schema_column_order = list(schema["scans_df"]["columns"].keys())
|
|
492
538
|
# Only reorder columns that exist in both schema and DataFrame
|
|
493
|
-
existing_columns = [
|
|
539
|
+
existing_columns = [
|
|
540
|
+
col for col in schema_column_order if col in self.scans_df.columns
|
|
541
|
+
]
|
|
494
542
|
if existing_columns:
|
|
495
543
|
self.scans_df = self.scans_df.select(existing_columns)
|
|
496
544
|
|
|
@@ -617,23 +665,29 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
617
665
|
if k in schema.get("features_df", {}).get("columns", {})
|
|
618
666
|
and schema["features_df"]["columns"][k]["dtype"] == "pl.Object"
|
|
619
667
|
}
|
|
620
|
-
regular_columns = {
|
|
668
|
+
regular_columns = {
|
|
669
|
+
k: v for k, v in data.items() if k not in object_columns
|
|
670
|
+
}
|
|
621
671
|
|
|
622
672
|
# Create DataFrame with regular columns first
|
|
623
673
|
if regular_columns:
|
|
624
674
|
self.features_df = pl.DataFrame(regular_columns)
|
|
625
675
|
# Add Object columns one by one
|
|
626
676
|
for col, values in object_columns.items():
|
|
627
|
-
self.features_df = self.features_df.with_columns(
|
|
628
|
-
|
|
629
|
-
|
|
677
|
+
self.features_df = self.features_df.with_columns(
|
|
678
|
+
[
|
|
679
|
+
pl.Series(col, values, dtype=pl.Object),
|
|
680
|
+
],
|
|
681
|
+
)
|
|
630
682
|
else:
|
|
631
683
|
# Only Object columns
|
|
632
684
|
self.features_df = pl.DataFrame()
|
|
633
685
|
for col, values in object_columns.items():
|
|
634
|
-
self.features_df = self.features_df.with_columns(
|
|
635
|
-
|
|
636
|
-
|
|
686
|
+
self.features_df = self.features_df.with_columns(
|
|
687
|
+
[
|
|
688
|
+
pl.Series(col, values, dtype=pl.Object),
|
|
689
|
+
],
|
|
690
|
+
)
|
|
637
691
|
|
|
638
692
|
# update all columns with schema types (skip Object columns)
|
|
639
693
|
for col in self.features_df.columns:
|
|
@@ -650,16 +704,25 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
650
704
|
# Convert to numeric first, handling different input types
|
|
651
705
|
if self.features_df[col].dtype == pl.Utf8:
|
|
652
706
|
# String data - convert to integer
|
|
653
|
-
self.features_df =
|
|
654
|
-
|
|
707
|
+
self.features_df = (
|
|
708
|
+
self.features_df.with_columns(
|
|
709
|
+
pl.col(col)
|
|
710
|
+
.str.to_integer()
|
|
711
|
+
.cast(eval(dtype_str)),
|
|
712
|
+
)
|
|
655
713
|
)
|
|
656
714
|
elif self.features_df[col].dtype in [
|
|
657
715
|
pl.Float64,
|
|
658
716
|
pl.Float32,
|
|
659
717
|
]:
|
|
660
718
|
# Float data - cast to integer with null handling for NaN values
|
|
661
|
-
self.features_df =
|
|
662
|
-
|
|
719
|
+
self.features_df = (
|
|
720
|
+
self.features_df.with_columns(
|
|
721
|
+
pl.col(col).cast(
|
|
722
|
+
eval(dtype_str),
|
|
723
|
+
strict=False,
|
|
724
|
+
),
|
|
725
|
+
)
|
|
663
726
|
)
|
|
664
727
|
else:
|
|
665
728
|
# Handle special cases and try direct casting for other types
|
|
@@ -670,50 +733,70 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
670
733
|
if "Binary" in str(current_dtype):
|
|
671
734
|
# Convert binary to string first, then to target type
|
|
672
735
|
if target_dtype == pl.Utf8:
|
|
673
|
-
self.features_df =
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
736
|
+
self.features_df = (
|
|
737
|
+
self.features_df.with_columns(
|
|
738
|
+
pl.col(col)
|
|
739
|
+
.map_elements(
|
|
740
|
+
lambda x: x.decode("utf-8")
|
|
741
|
+
if isinstance(x, bytes)
|
|
742
|
+
else str(x),
|
|
743
|
+
return_dtype=pl.Utf8,
|
|
744
|
+
)
|
|
745
|
+
.cast(target_dtype),
|
|
678
746
|
)
|
|
679
|
-
.cast(target_dtype),
|
|
680
747
|
)
|
|
681
748
|
elif "Int" in str(target_dtype):
|
|
682
|
-
self.features_df =
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
749
|
+
self.features_df = (
|
|
750
|
+
self.features_df.with_columns(
|
|
751
|
+
pl.col(col)
|
|
752
|
+
.map_elements(
|
|
753
|
+
lambda x: x.decode("utf-8")
|
|
754
|
+
if isinstance(x, bytes)
|
|
755
|
+
else str(x),
|
|
756
|
+
return_dtype=pl.Utf8,
|
|
757
|
+
)
|
|
758
|
+
.str.to_integer()
|
|
759
|
+
.cast(target_dtype),
|
|
687
760
|
)
|
|
688
|
-
.str.to_integer()
|
|
689
|
-
.cast(target_dtype),
|
|
690
761
|
)
|
|
691
762
|
elif "Float" in str(target_dtype):
|
|
692
|
-
self.features_df =
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
763
|
+
self.features_df = (
|
|
764
|
+
self.features_df.with_columns(
|
|
765
|
+
pl.col(col)
|
|
766
|
+
.map_elements(
|
|
767
|
+
lambda x: x.decode("utf-8")
|
|
768
|
+
if isinstance(x, bytes)
|
|
769
|
+
else str(x),
|
|
770
|
+
return_dtype=pl.Utf8,
|
|
771
|
+
)
|
|
772
|
+
.str.to_decimal()
|
|
773
|
+
.cast(target_dtype),
|
|
697
774
|
)
|
|
698
|
-
.str.to_decimal()
|
|
699
|
-
.cast(target_dtype),
|
|
700
775
|
)
|
|
701
776
|
else:
|
|
702
777
|
# Try direct casting
|
|
703
|
-
self.features_df =
|
|
704
|
-
|
|
778
|
+
self.features_df = (
|
|
779
|
+
self.features_df.with_columns(
|
|
780
|
+
pl.col(col).cast(target_dtype),
|
|
781
|
+
)
|
|
705
782
|
)
|
|
706
783
|
else:
|
|
707
784
|
# Try direct casting for non-binary types
|
|
708
|
-
self.features_df =
|
|
709
|
-
|
|
785
|
+
self.features_df = (
|
|
786
|
+
self.features_df.with_columns(
|
|
787
|
+
pl.col(col).cast(target_dtype),
|
|
788
|
+
)
|
|
710
789
|
)
|
|
711
790
|
elif "Float" in dtype_str:
|
|
712
791
|
# Convert to float, handling different input types
|
|
713
792
|
if self.features_df[col].dtype == pl.Utf8:
|
|
714
793
|
# String data - convert to float
|
|
715
|
-
self.features_df =
|
|
716
|
-
|
|
794
|
+
self.features_df = (
|
|
795
|
+
self.features_df.with_columns(
|
|
796
|
+
pl.col(col)
|
|
797
|
+
.str.to_decimal()
|
|
798
|
+
.cast(eval(dtype_str)),
|
|
799
|
+
)
|
|
717
800
|
)
|
|
718
801
|
else:
|
|
719
802
|
# Handle special cases and try direct casting for other types
|
|
@@ -724,43 +807,59 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
724
807
|
if "Binary" in str(current_dtype):
|
|
725
808
|
# Convert binary to string first, then to target type
|
|
726
809
|
if target_dtype == pl.Utf8:
|
|
727
|
-
self.features_df =
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
810
|
+
self.features_df = (
|
|
811
|
+
self.features_df.with_columns(
|
|
812
|
+
pl.col(col)
|
|
813
|
+
.map_elements(
|
|
814
|
+
lambda x: x.decode("utf-8")
|
|
815
|
+
if isinstance(x, bytes)
|
|
816
|
+
else str(x),
|
|
817
|
+
return_dtype=pl.Utf8,
|
|
818
|
+
)
|
|
819
|
+
.cast(target_dtype),
|
|
732
820
|
)
|
|
733
|
-
.cast(target_dtype),
|
|
734
821
|
)
|
|
735
822
|
elif "Int" in str(target_dtype):
|
|
736
|
-
self.features_df =
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
823
|
+
self.features_df = (
|
|
824
|
+
self.features_df.with_columns(
|
|
825
|
+
pl.col(col)
|
|
826
|
+
.map_elements(
|
|
827
|
+
lambda x: x.decode("utf-8")
|
|
828
|
+
if isinstance(x, bytes)
|
|
829
|
+
else str(x),
|
|
830
|
+
return_dtype=pl.Utf8,
|
|
831
|
+
)
|
|
832
|
+
.str.to_integer()
|
|
833
|
+
.cast(target_dtype),
|
|
741
834
|
)
|
|
742
|
-
.str.to_integer()
|
|
743
|
-
.cast(target_dtype),
|
|
744
835
|
)
|
|
745
836
|
elif "Float" in str(target_dtype):
|
|
746
|
-
self.features_df =
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
837
|
+
self.features_df = (
|
|
838
|
+
self.features_df.with_columns(
|
|
839
|
+
pl.col(col)
|
|
840
|
+
.map_elements(
|
|
841
|
+
lambda x: x.decode("utf-8")
|
|
842
|
+
if isinstance(x, bytes)
|
|
843
|
+
else str(x),
|
|
844
|
+
return_dtype=pl.Utf8,
|
|
845
|
+
)
|
|
846
|
+
.str.to_decimal()
|
|
847
|
+
.cast(target_dtype),
|
|
751
848
|
)
|
|
752
|
-
.str.to_decimal()
|
|
753
|
-
.cast(target_dtype),
|
|
754
849
|
)
|
|
755
850
|
else:
|
|
756
851
|
# Try direct casting
|
|
757
|
-
self.features_df =
|
|
758
|
-
|
|
852
|
+
self.features_df = (
|
|
853
|
+
self.features_df.with_columns(
|
|
854
|
+
pl.col(col).cast(target_dtype),
|
|
855
|
+
)
|
|
759
856
|
)
|
|
760
857
|
else:
|
|
761
858
|
# Try direct casting for non-binary types
|
|
762
|
-
self.features_df =
|
|
763
|
-
|
|
859
|
+
self.features_df = (
|
|
860
|
+
self.features_df.with_columns(
|
|
861
|
+
pl.col(col).cast(target_dtype),
|
|
862
|
+
)
|
|
764
863
|
)
|
|
765
864
|
elif "Utf8" in dtype_str:
|
|
766
865
|
# Ensure it's string type
|
|
@@ -776,43 +875,59 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
776
875
|
if "Binary" in str(current_dtype):
|
|
777
876
|
# Convert binary to string first, then to target type
|
|
778
877
|
if target_dtype == pl.Utf8:
|
|
779
|
-
self.features_df =
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
878
|
+
self.features_df = (
|
|
879
|
+
self.features_df.with_columns(
|
|
880
|
+
pl.col(col)
|
|
881
|
+
.map_elements(
|
|
882
|
+
lambda x: x.decode("utf-8")
|
|
883
|
+
if isinstance(x, bytes)
|
|
884
|
+
else str(x),
|
|
885
|
+
return_dtype=pl.Utf8,
|
|
886
|
+
)
|
|
887
|
+
.cast(target_dtype),
|
|
784
888
|
)
|
|
785
|
-
.cast(target_dtype),
|
|
786
889
|
)
|
|
787
890
|
elif "Int" in str(target_dtype):
|
|
788
|
-
self.features_df =
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
891
|
+
self.features_df = (
|
|
892
|
+
self.features_df.with_columns(
|
|
893
|
+
pl.col(col)
|
|
894
|
+
.map_elements(
|
|
895
|
+
lambda x: x.decode("utf-8")
|
|
896
|
+
if isinstance(x, bytes)
|
|
897
|
+
else str(x),
|
|
898
|
+
return_dtype=pl.Utf8,
|
|
899
|
+
)
|
|
900
|
+
.str.to_integer()
|
|
901
|
+
.cast(target_dtype),
|
|
793
902
|
)
|
|
794
|
-
.str.to_integer()
|
|
795
|
-
.cast(target_dtype),
|
|
796
903
|
)
|
|
797
904
|
elif "Float" in str(target_dtype):
|
|
798
|
-
self.features_df =
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
905
|
+
self.features_df = (
|
|
906
|
+
self.features_df.with_columns(
|
|
907
|
+
pl.col(col)
|
|
908
|
+
.map_elements(
|
|
909
|
+
lambda x: x.decode("utf-8")
|
|
910
|
+
if isinstance(x, bytes)
|
|
911
|
+
else str(x),
|
|
912
|
+
return_dtype=pl.Utf8,
|
|
913
|
+
)
|
|
914
|
+
.str.to_decimal()
|
|
915
|
+
.cast(target_dtype),
|
|
803
916
|
)
|
|
804
|
-
.str.to_decimal()
|
|
805
|
-
.cast(target_dtype),
|
|
806
917
|
)
|
|
807
918
|
else:
|
|
808
919
|
# Try direct casting
|
|
809
|
-
self.features_df =
|
|
810
|
-
|
|
920
|
+
self.features_df = (
|
|
921
|
+
self.features_df.with_columns(
|
|
922
|
+
pl.col(col).cast(target_dtype),
|
|
923
|
+
)
|
|
811
924
|
)
|
|
812
925
|
else:
|
|
813
926
|
# Try direct casting for non-binary types
|
|
814
|
-
self.features_df =
|
|
815
|
-
|
|
927
|
+
self.features_df = (
|
|
928
|
+
self.features_df.with_columns(
|
|
929
|
+
pl.col(col).cast(target_dtype),
|
|
930
|
+
)
|
|
816
931
|
)
|
|
817
932
|
except Exception as e:
|
|
818
933
|
self.logger.warning(
|
|
@@ -827,23 +942,31 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
827
942
|
# This ensures "None" strings introduced by failed conversions are properly handled
|
|
828
943
|
for col in self.features_df.columns:
|
|
829
944
|
if self.features_df[col].dtype == pl.Utf8: # String columns
|
|
830
|
-
self.features_df = self.features_df.with_columns(
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
945
|
+
self.features_df = self.features_df.with_columns(
|
|
946
|
+
[
|
|
947
|
+
pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
|
|
948
|
+
.then(None)
|
|
949
|
+
.otherwise(pl.col(col))
|
|
950
|
+
.alias(col),
|
|
951
|
+
],
|
|
952
|
+
)
|
|
836
953
|
# Float columns
|
|
837
954
|
elif self.features_df[col].dtype in [pl.Float64, pl.Float32]:
|
|
838
|
-
self.features_df = self.features_df.with_columns(
|
|
839
|
-
|
|
840
|
-
|
|
955
|
+
self.features_df = self.features_df.with_columns(
|
|
956
|
+
[
|
|
957
|
+
pl.col(col).fill_nan(None).alias(col),
|
|
958
|
+
],
|
|
959
|
+
)
|
|
841
960
|
|
|
842
961
|
# Ensure column order matches schema order
|
|
843
962
|
if "features_df" in schema and "columns" in schema["features_df"]:
|
|
844
963
|
schema_column_order = list(schema["features_df"]["columns"].keys())
|
|
845
964
|
# Only reorder columns that exist in both schema and DataFrame
|
|
846
|
-
existing_columns = [
|
|
965
|
+
existing_columns = [
|
|
966
|
+
col
|
|
967
|
+
for col in schema_column_order
|
|
968
|
+
if col in self.features_df.columns
|
|
969
|
+
]
|
|
847
970
|
if existing_columns:
|
|
848
971
|
self.features_df = self.features_df.select(existing_columns)
|
|
849
972
|
|
|
@@ -873,13 +996,17 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
873
996
|
dtype_str = schema_columns[col]["dtype"]
|
|
874
997
|
try:
|
|
875
998
|
if "Int" in dtype_str:
|
|
876
|
-
self.ms1_df = self.ms1_df.with_columns(
|
|
877
|
-
|
|
878
|
-
|
|
999
|
+
self.ms1_df = self.ms1_df.with_columns(
|
|
1000
|
+
[
|
|
1001
|
+
pl.col(col).cast(pl.Int64, strict=False),
|
|
1002
|
+
],
|
|
1003
|
+
)
|
|
879
1004
|
elif "Float" in dtype_str:
|
|
880
|
-
self.ms1_df = self.ms1_df.with_columns(
|
|
881
|
-
|
|
882
|
-
|
|
1005
|
+
self.ms1_df = self.ms1_df.with_columns(
|
|
1006
|
+
[
|
|
1007
|
+
pl.col(col).cast(pl.Float64, strict=False),
|
|
1008
|
+
],
|
|
1009
|
+
)
|
|
883
1010
|
except Exception as e:
|
|
884
1011
|
self.logger.warning(
|
|
885
1012
|
f"Failed to apply schema type {dtype_str} to column {col}: {e}",
|
|
@@ -948,15 +1075,21 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
948
1075
|
# Load metadata
|
|
949
1076
|
if "metadata" in f:
|
|
950
1077
|
metadata_group = f["metadata"]
|
|
951
|
-
self.file_path = decode_metadata_attr(
|
|
1078
|
+
self.file_path = decode_metadata_attr(
|
|
1079
|
+
metadata_group.attrs.get("file_path", ""),
|
|
1080
|
+
)
|
|
952
1081
|
|
|
953
1082
|
# Load file_source if it exists, otherwise set it equal to file_path
|
|
954
1083
|
if "file_source" in metadata_group.attrs:
|
|
955
|
-
self.file_source = decode_metadata_attr(
|
|
1084
|
+
self.file_source = decode_metadata_attr(
|
|
1085
|
+
metadata_group.attrs.get("file_source", ""),
|
|
1086
|
+
)
|
|
956
1087
|
else:
|
|
957
1088
|
self.file_source = self.file_path
|
|
958
1089
|
|
|
959
|
-
self.file_type = decode_metadata_attr(
|
|
1090
|
+
self.file_type = decode_metadata_attr(
|
|
1091
|
+
metadata_group.attrs.get("file_type", ""),
|
|
1092
|
+
)
|
|
960
1093
|
self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
|
|
961
1094
|
|
|
962
1095
|
# Load parameters from JSON in metadata
|
|
@@ -1007,19 +1140,23 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1007
1140
|
# Convert "None" strings and NaN values to proper null values
|
|
1008
1141
|
for col in self.scans_df.columns:
|
|
1009
1142
|
if self.scans_df[col].dtype == pl.Utf8: # String columns
|
|
1010
|
-
self.scans_df = self.scans_df.with_columns(
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1143
|
+
self.scans_df = self.scans_df.with_columns(
|
|
1144
|
+
[
|
|
1145
|
+
pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
|
|
1146
|
+
.then(None)
|
|
1147
|
+
.otherwise(pl.col(col))
|
|
1148
|
+
.alias(col),
|
|
1149
|
+
],
|
|
1150
|
+
)
|
|
1016
1151
|
elif self.scans_df[col].dtype in [
|
|
1017
1152
|
pl.Float64,
|
|
1018
1153
|
pl.Float32,
|
|
1019
1154
|
]: # Float columns
|
|
1020
|
-
self.scans_df = self.scans_df.with_columns(
|
|
1021
|
-
|
|
1022
|
-
|
|
1155
|
+
self.scans_df = self.scans_df.with_columns(
|
|
1156
|
+
[
|
|
1157
|
+
pl.col(col).fill_nan(None).alias(col),
|
|
1158
|
+
],
|
|
1159
|
+
)
|
|
1023
1160
|
|
|
1024
1161
|
# update all columns with schema types
|
|
1025
1162
|
for col in self.scans_df.columns:
|
|
@@ -1037,7 +1174,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1037
1174
|
if self.scans_df[col].dtype == pl.Utf8:
|
|
1038
1175
|
# String data - convert to integer
|
|
1039
1176
|
self.scans_df = self.scans_df.with_columns(
|
|
1040
|
-
pl.col(col)
|
|
1177
|
+
pl.col(col)
|
|
1178
|
+
.str.to_integer()
|
|
1179
|
+
.cast(eval(dtype_str)),
|
|
1041
1180
|
)
|
|
1042
1181
|
elif self.scans_df[col].dtype in [
|
|
1043
1182
|
pl.Float64,
|
|
@@ -1057,7 +1196,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1057
1196
|
if self.scans_df[col].dtype == pl.Utf8:
|
|
1058
1197
|
# String data - convert to float
|
|
1059
1198
|
self.scans_df = self.scans_df.with_columns(
|
|
1060
|
-
pl.col(col)
|
|
1199
|
+
pl.col(col)
|
|
1200
|
+
.str.to_decimal()
|
|
1201
|
+
.cast(eval(dtype_str)),
|
|
1061
1202
|
)
|
|
1062
1203
|
else:
|
|
1063
1204
|
# Try direct casting
|
|
@@ -1081,7 +1222,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1081
1222
|
self.scans_df = self.scans_df.with_columns(
|
|
1082
1223
|
pl.col(col)
|
|
1083
1224
|
.map_elements(
|
|
1084
|
-
lambda x: x.decode("utf-8")
|
|
1225
|
+
lambda x: x.decode("utf-8")
|
|
1226
|
+
if isinstance(x, bytes)
|
|
1227
|
+
else str(x),
|
|
1085
1228
|
return_dtype=pl.Utf8,
|
|
1086
1229
|
)
|
|
1087
1230
|
.cast(target_dtype),
|
|
@@ -1090,7 +1233,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1090
1233
|
self.scans_df = self.scans_df.with_columns(
|
|
1091
1234
|
pl.col(col)
|
|
1092
1235
|
.map_elements(
|
|
1093
|
-
lambda x: x.decode("utf-8")
|
|
1236
|
+
lambda x: x.decode("utf-8")
|
|
1237
|
+
if isinstance(x, bytes)
|
|
1238
|
+
else str(x),
|
|
1094
1239
|
return_dtype=pl.Utf8,
|
|
1095
1240
|
)
|
|
1096
1241
|
.str.to_integer()
|
|
@@ -1100,7 +1245,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1100
1245
|
self.scans_df = self.scans_df.with_columns(
|
|
1101
1246
|
pl.col(col)
|
|
1102
1247
|
.map_elements(
|
|
1103
|
-
lambda x: x.decode("utf-8")
|
|
1248
|
+
lambda x: x.decode("utf-8")
|
|
1249
|
+
if isinstance(x, bytes)
|
|
1250
|
+
else str(x),
|
|
1104
1251
|
return_dtype=pl.Utf8,
|
|
1105
1252
|
)
|
|
1106
1253
|
.str.to_decimal()
|
|
@@ -1129,7 +1276,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1129
1276
|
if "scans_df" in schema and "columns" in schema["scans_df"]:
|
|
1130
1277
|
schema_column_order = list(schema["scans_df"]["columns"].keys())
|
|
1131
1278
|
# Only reorder columns that exist in both schema and DataFrame
|
|
1132
|
-
existing_columns = [
|
|
1279
|
+
existing_columns = [
|
|
1280
|
+
col for col in schema_column_order if col in self.scans_df.columns
|
|
1281
|
+
]
|
|
1133
1282
|
if existing_columns:
|
|
1134
1283
|
self.scans_df = self.scans_df.select(existing_columns)
|
|
1135
1284
|
|
|
@@ -1208,12 +1357,18 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1208
1357
|
spectrum_list = []
|
|
1209
1358
|
for spec_data in json.loads(item):
|
|
1210
1359
|
if spec_data is not None:
|
|
1211
|
-
spectrum = Spectrum.from_json(
|
|
1360
|
+
spectrum = Spectrum.from_json(
|
|
1361
|
+
spec_data,
|
|
1362
|
+
)
|
|
1212
1363
|
spectrum_list.append(spectrum)
|
|
1213
1364
|
else:
|
|
1214
1365
|
spectrum_list.append(None)
|
|
1215
1366
|
reconstructed_data.append(spectrum_list)
|
|
1216
|
-
except (
|
|
1367
|
+
except (
|
|
1368
|
+
json.JSONDecodeError,
|
|
1369
|
+
ValueError,
|
|
1370
|
+
TypeError,
|
|
1371
|
+
):
|
|
1217
1372
|
reconstructed_data.append(None)
|
|
1218
1373
|
|
|
1219
1374
|
data[col] = reconstructed_data
|
|
@@ -1229,10 +1384,13 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1229
1384
|
# Separate Object columns from regular columns to avoid astuple issues
|
|
1230
1385
|
object_columns = {}
|
|
1231
1386
|
regular_columns = {}
|
|
1232
|
-
|
|
1387
|
+
|
|
1233
1388
|
for col, values in data.items():
|
|
1234
1389
|
if col in schema.get("features_df", {}).get("columns", {}):
|
|
1235
|
-
if "Object" in schema["features_df"]["columns"][col].get(
|
|
1390
|
+
if "Object" in schema["features_df"]["columns"][col].get(
|
|
1391
|
+
"dtype",
|
|
1392
|
+
"",
|
|
1393
|
+
):
|
|
1236
1394
|
object_columns[col] = values
|
|
1237
1395
|
else:
|
|
1238
1396
|
regular_columns[col] = values
|
|
@@ -1245,38 +1403,48 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1245
1403
|
else:
|
|
1246
1404
|
# If no regular columns, create empty DataFrame
|
|
1247
1405
|
self.features_df = pl.DataFrame()
|
|
1248
|
-
|
|
1406
|
+
|
|
1249
1407
|
# Add Object columns one by one
|
|
1250
1408
|
for col, values in object_columns.items():
|
|
1251
1409
|
if not self.features_df.is_empty():
|
|
1252
1410
|
self.features_df = self.features_df.with_columns(
|
|
1253
|
-
pl.Series(col, values, dtype=pl.Object).alias(col)
|
|
1411
|
+
pl.Series(col, values, dtype=pl.Object).alias(col),
|
|
1254
1412
|
)
|
|
1255
1413
|
else:
|
|
1256
1414
|
# Create DataFrame with just this Object column
|
|
1257
|
-
self.features_df = pl.DataFrame(
|
|
1415
|
+
self.features_df = pl.DataFrame(
|
|
1416
|
+
{col: values},
|
|
1417
|
+
schema={col: pl.Object},
|
|
1418
|
+
)
|
|
1258
1419
|
|
|
1259
1420
|
# Convert "None" strings and NaN values to proper null values for regular columns first
|
|
1260
1421
|
for col in self.features_df.columns:
|
|
1261
1422
|
# Skip Object columns - they're already properly reconstructed
|
|
1262
1423
|
if col in schema.get("features_df", {}).get("columns", {}):
|
|
1263
|
-
if "Object" in schema["features_df"]["columns"][col].get(
|
|
1424
|
+
if "Object" in schema["features_df"]["columns"][col].get(
|
|
1425
|
+
"dtype",
|
|
1426
|
+
"",
|
|
1427
|
+
):
|
|
1264
1428
|
continue
|
|
1265
1429
|
|
|
1266
1430
|
if self.features_df[col].dtype == pl.Utf8: # String columns
|
|
1267
|
-
self.features_df = self.features_df.with_columns(
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1431
|
+
self.features_df = self.features_df.with_columns(
|
|
1432
|
+
[
|
|
1433
|
+
pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
|
|
1434
|
+
.then(None)
|
|
1435
|
+
.otherwise(pl.col(col))
|
|
1436
|
+
.alias(col),
|
|
1437
|
+
],
|
|
1438
|
+
)
|
|
1273
1439
|
elif self.features_df[col].dtype in [
|
|
1274
1440
|
pl.Float64,
|
|
1275
1441
|
pl.Float32,
|
|
1276
1442
|
]: # Float columns
|
|
1277
|
-
self.features_df = self.features_df.with_columns(
|
|
1278
|
-
|
|
1279
|
-
|
|
1443
|
+
self.features_df = self.features_df.with_columns(
|
|
1444
|
+
[
|
|
1445
|
+
pl.col(col).fill_nan(None).alias(col),
|
|
1446
|
+
],
|
|
1447
|
+
)
|
|
1280
1448
|
|
|
1281
1449
|
# update all columns with schema types
|
|
1282
1450
|
for col in self.features_df.columns:
|
|
@@ -1293,16 +1461,25 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1293
1461
|
# Convert to numeric first, handling different input types
|
|
1294
1462
|
if self.features_df[col].dtype == pl.Utf8:
|
|
1295
1463
|
# String data - convert to integer
|
|
1296
|
-
self.features_df =
|
|
1297
|
-
|
|
1464
|
+
self.features_df = (
|
|
1465
|
+
self.features_df.with_columns(
|
|
1466
|
+
pl.col(col)
|
|
1467
|
+
.str.to_integer()
|
|
1468
|
+
.cast(eval(dtype_str)),
|
|
1469
|
+
)
|
|
1298
1470
|
)
|
|
1299
1471
|
elif self.features_df[col].dtype in [
|
|
1300
1472
|
pl.Float64,
|
|
1301
1473
|
pl.Float32,
|
|
1302
1474
|
]:
|
|
1303
1475
|
# Float data - cast to integer with null handling for NaN values
|
|
1304
|
-
self.features_df =
|
|
1305
|
-
|
|
1476
|
+
self.features_df = (
|
|
1477
|
+
self.features_df.with_columns(
|
|
1478
|
+
pl.col(col).cast(
|
|
1479
|
+
eval(dtype_str),
|
|
1480
|
+
strict=False,
|
|
1481
|
+
),
|
|
1482
|
+
)
|
|
1306
1483
|
)
|
|
1307
1484
|
else:
|
|
1308
1485
|
# Handle special cases and try direct casting for other types
|
|
@@ -1313,50 +1490,70 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1313
1490
|
if "Binary" in str(current_dtype):
|
|
1314
1491
|
# Convert binary to string first, then to target type
|
|
1315
1492
|
if target_dtype == pl.Utf8:
|
|
1316
|
-
self.features_df =
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1493
|
+
self.features_df = (
|
|
1494
|
+
self.features_df.with_columns(
|
|
1495
|
+
pl.col(col)
|
|
1496
|
+
.map_elements(
|
|
1497
|
+
lambda x: x.decode("utf-8")
|
|
1498
|
+
if isinstance(x, bytes)
|
|
1499
|
+
else str(x),
|
|
1500
|
+
return_dtype=pl.Utf8,
|
|
1501
|
+
)
|
|
1502
|
+
.cast(target_dtype),
|
|
1321
1503
|
)
|
|
1322
|
-
.cast(target_dtype),
|
|
1323
1504
|
)
|
|
1324
1505
|
elif "Int" in str(target_dtype):
|
|
1325
|
-
self.features_df =
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1329
|
-
|
|
1506
|
+
self.features_df = (
|
|
1507
|
+
self.features_df.with_columns(
|
|
1508
|
+
pl.col(col)
|
|
1509
|
+
.map_elements(
|
|
1510
|
+
lambda x: x.decode("utf-8")
|
|
1511
|
+
if isinstance(x, bytes)
|
|
1512
|
+
else str(x),
|
|
1513
|
+
return_dtype=pl.Utf8,
|
|
1514
|
+
)
|
|
1515
|
+
.str.to_integer()
|
|
1516
|
+
.cast(target_dtype),
|
|
1330
1517
|
)
|
|
1331
|
-
.str.to_integer()
|
|
1332
|
-
.cast(target_dtype),
|
|
1333
1518
|
)
|
|
1334
1519
|
elif "Float" in str(target_dtype):
|
|
1335
|
-
self.features_df =
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1520
|
+
self.features_df = (
|
|
1521
|
+
self.features_df.with_columns(
|
|
1522
|
+
pl.col(col)
|
|
1523
|
+
.map_elements(
|
|
1524
|
+
lambda x: x.decode("utf-8")
|
|
1525
|
+
if isinstance(x, bytes)
|
|
1526
|
+
else str(x),
|
|
1527
|
+
return_dtype=pl.Utf8,
|
|
1528
|
+
)
|
|
1529
|
+
.str.to_decimal()
|
|
1530
|
+
.cast(target_dtype),
|
|
1340
1531
|
)
|
|
1341
|
-
.str.to_decimal()
|
|
1342
|
-
.cast(target_dtype),
|
|
1343
1532
|
)
|
|
1344
1533
|
else:
|
|
1345
1534
|
# Try direct casting
|
|
1346
|
-
self.features_df =
|
|
1347
|
-
|
|
1535
|
+
self.features_df = (
|
|
1536
|
+
self.features_df.with_columns(
|
|
1537
|
+
pl.col(col).cast(target_dtype),
|
|
1538
|
+
)
|
|
1348
1539
|
)
|
|
1349
1540
|
else:
|
|
1350
1541
|
# Try direct casting for non-binary types
|
|
1351
|
-
self.features_df =
|
|
1352
|
-
|
|
1542
|
+
self.features_df = (
|
|
1543
|
+
self.features_df.with_columns(
|
|
1544
|
+
pl.col(col).cast(target_dtype),
|
|
1545
|
+
)
|
|
1353
1546
|
)
|
|
1354
1547
|
elif "Float" in dtype_str:
|
|
1355
1548
|
# Convert to float, handling different input types
|
|
1356
1549
|
if self.features_df[col].dtype == pl.Utf8:
|
|
1357
1550
|
# String data - convert to float
|
|
1358
|
-
self.features_df =
|
|
1359
|
-
|
|
1551
|
+
self.features_df = (
|
|
1552
|
+
self.features_df.with_columns(
|
|
1553
|
+
pl.col(col)
|
|
1554
|
+
.str.to_decimal()
|
|
1555
|
+
.cast(eval(dtype_str)),
|
|
1556
|
+
)
|
|
1360
1557
|
)
|
|
1361
1558
|
else:
|
|
1362
1559
|
# Handle special cases and try direct casting for other types
|
|
@@ -1367,43 +1564,59 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1367
1564
|
if "Binary" in str(current_dtype):
|
|
1368
1565
|
# Convert binary to string first, then to target type
|
|
1369
1566
|
if target_dtype == pl.Utf8:
|
|
1370
|
-
self.features_df =
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1567
|
+
self.features_df = (
|
|
1568
|
+
self.features_df.with_columns(
|
|
1569
|
+
pl.col(col)
|
|
1570
|
+
.map_elements(
|
|
1571
|
+
lambda x: x.decode("utf-8")
|
|
1572
|
+
if isinstance(x, bytes)
|
|
1573
|
+
else str(x),
|
|
1574
|
+
return_dtype=pl.Utf8,
|
|
1575
|
+
)
|
|
1576
|
+
.cast(target_dtype),
|
|
1375
1577
|
)
|
|
1376
|
-
.cast(target_dtype),
|
|
1377
1578
|
)
|
|
1378
1579
|
elif "Int" in str(target_dtype):
|
|
1379
|
-
self.features_df =
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1580
|
+
self.features_df = (
|
|
1581
|
+
self.features_df.with_columns(
|
|
1582
|
+
pl.col(col)
|
|
1583
|
+
.map_elements(
|
|
1584
|
+
lambda x: x.decode("utf-8")
|
|
1585
|
+
if isinstance(x, bytes)
|
|
1586
|
+
else str(x),
|
|
1587
|
+
return_dtype=pl.Utf8,
|
|
1588
|
+
)
|
|
1589
|
+
.str.to_integer()
|
|
1590
|
+
.cast(target_dtype),
|
|
1384
1591
|
)
|
|
1385
|
-
.str.to_integer()
|
|
1386
|
-
.cast(target_dtype),
|
|
1387
1592
|
)
|
|
1388
1593
|
elif "Float" in str(target_dtype):
|
|
1389
|
-
self.features_df =
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1594
|
+
self.features_df = (
|
|
1595
|
+
self.features_df.with_columns(
|
|
1596
|
+
pl.col(col)
|
|
1597
|
+
.map_elements(
|
|
1598
|
+
lambda x: x.decode("utf-8")
|
|
1599
|
+
if isinstance(x, bytes)
|
|
1600
|
+
else str(x),
|
|
1601
|
+
return_dtype=pl.Utf8,
|
|
1602
|
+
)
|
|
1603
|
+
.str.to_decimal()
|
|
1604
|
+
.cast(target_dtype),
|
|
1394
1605
|
)
|
|
1395
|
-
.str.to_decimal()
|
|
1396
|
-
.cast(target_dtype),
|
|
1397
1606
|
)
|
|
1398
1607
|
else:
|
|
1399
1608
|
# Try direct casting
|
|
1400
|
-
self.features_df =
|
|
1401
|
-
|
|
1609
|
+
self.features_df = (
|
|
1610
|
+
self.features_df.with_columns(
|
|
1611
|
+
pl.col(col).cast(target_dtype),
|
|
1612
|
+
)
|
|
1402
1613
|
)
|
|
1403
1614
|
else:
|
|
1404
1615
|
# Try direct casting for non-binary types
|
|
1405
|
-
self.features_df =
|
|
1406
|
-
|
|
1616
|
+
self.features_df = (
|
|
1617
|
+
self.features_df.with_columns(
|
|
1618
|
+
pl.col(col).cast(target_dtype),
|
|
1619
|
+
)
|
|
1407
1620
|
)
|
|
1408
1621
|
elif "Utf8" in dtype_str:
|
|
1409
1622
|
# Ensure it's string type
|
|
@@ -1419,43 +1632,59 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1419
1632
|
if "Binary" in str(current_dtype):
|
|
1420
1633
|
# Convert binary to string first, then to target type
|
|
1421
1634
|
if target_dtype == pl.Utf8:
|
|
1422
|
-
self.features_df =
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1635
|
+
self.features_df = (
|
|
1636
|
+
self.features_df.with_columns(
|
|
1637
|
+
pl.col(col)
|
|
1638
|
+
.map_elements(
|
|
1639
|
+
lambda x: x.decode("utf-8")
|
|
1640
|
+
if isinstance(x, bytes)
|
|
1641
|
+
else str(x),
|
|
1642
|
+
return_dtype=pl.Utf8,
|
|
1643
|
+
)
|
|
1644
|
+
.cast(target_dtype),
|
|
1427
1645
|
)
|
|
1428
|
-
.cast(target_dtype),
|
|
1429
1646
|
)
|
|
1430
1647
|
elif "Int" in str(target_dtype):
|
|
1431
|
-
self.features_df =
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1648
|
+
self.features_df = (
|
|
1649
|
+
self.features_df.with_columns(
|
|
1650
|
+
pl.col(col)
|
|
1651
|
+
.map_elements(
|
|
1652
|
+
lambda x: x.decode("utf-8")
|
|
1653
|
+
if isinstance(x, bytes)
|
|
1654
|
+
else str(x),
|
|
1655
|
+
return_dtype=pl.Utf8,
|
|
1656
|
+
)
|
|
1657
|
+
.str.to_integer()
|
|
1658
|
+
.cast(target_dtype),
|
|
1436
1659
|
)
|
|
1437
|
-
.str.to_integer()
|
|
1438
|
-
.cast(target_dtype),
|
|
1439
1660
|
)
|
|
1440
1661
|
elif "Float" in str(target_dtype):
|
|
1441
|
-
self.features_df =
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1662
|
+
self.features_df = (
|
|
1663
|
+
self.features_df.with_columns(
|
|
1664
|
+
pl.col(col)
|
|
1665
|
+
.map_elements(
|
|
1666
|
+
lambda x: x.decode("utf-8")
|
|
1667
|
+
if isinstance(x, bytes)
|
|
1668
|
+
else str(x),
|
|
1669
|
+
return_dtype=pl.Utf8,
|
|
1670
|
+
)
|
|
1671
|
+
.str.to_decimal()
|
|
1672
|
+
.cast(target_dtype),
|
|
1446
1673
|
)
|
|
1447
|
-
.str.to_decimal()
|
|
1448
|
-
.cast(target_dtype),
|
|
1449
1674
|
)
|
|
1450
1675
|
else:
|
|
1451
1676
|
# Try direct casting
|
|
1452
|
-
self.features_df =
|
|
1453
|
-
|
|
1677
|
+
self.features_df = (
|
|
1678
|
+
self.features_df.with_columns(
|
|
1679
|
+
pl.col(col).cast(target_dtype),
|
|
1680
|
+
)
|
|
1454
1681
|
)
|
|
1455
1682
|
else:
|
|
1456
1683
|
# Try direct casting for non-binary types
|
|
1457
|
-
self.features_df =
|
|
1458
|
-
|
|
1684
|
+
self.features_df = (
|
|
1685
|
+
self.features_df.with_columns(
|
|
1686
|
+
pl.col(col).cast(target_dtype),
|
|
1687
|
+
)
|
|
1459
1688
|
)
|
|
1460
1689
|
except Exception as e:
|
|
1461
1690
|
self.logger.warning(
|
|
@@ -1470,23 +1699,31 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1470
1699
|
# This ensures "None" strings introduced by failed conversions are properly handled
|
|
1471
1700
|
for col in self.features_df.columns:
|
|
1472
1701
|
if self.features_df[col].dtype == pl.Utf8: # String columns
|
|
1473
|
-
self.features_df = self.features_df.with_columns(
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1702
|
+
self.features_df = self.features_df.with_columns(
|
|
1703
|
+
[
|
|
1704
|
+
pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
|
|
1705
|
+
.then(None)
|
|
1706
|
+
.otherwise(pl.col(col))
|
|
1707
|
+
.alias(col),
|
|
1708
|
+
],
|
|
1709
|
+
)
|
|
1479
1710
|
# Float columns
|
|
1480
1711
|
elif self.features_df[col].dtype in [pl.Float64, pl.Float32]:
|
|
1481
|
-
self.features_df = self.features_df.with_columns(
|
|
1482
|
-
|
|
1483
|
-
|
|
1712
|
+
self.features_df = self.features_df.with_columns(
|
|
1713
|
+
[
|
|
1714
|
+
pl.col(col).fill_nan(None).alias(col),
|
|
1715
|
+
],
|
|
1716
|
+
)
|
|
1484
1717
|
|
|
1485
1718
|
# Ensure column order matches schema order
|
|
1486
1719
|
if "features_df" in schema and "columns" in schema["features_df"]:
|
|
1487
1720
|
schema_column_order = list(schema["features_df"]["columns"].keys())
|
|
1488
1721
|
# Only reorder columns that exist in both schema and DataFrame
|
|
1489
|
-
existing_columns = [
|
|
1722
|
+
existing_columns = [
|
|
1723
|
+
col
|
|
1724
|
+
for col in schema_column_order
|
|
1725
|
+
if col in self.features_df.columns
|
|
1726
|
+
]
|
|
1490
1727
|
if existing_columns:
|
|
1491
1728
|
self.features_df = self.features_df.select(existing_columns)
|
|
1492
1729
|
|
|
@@ -1516,7 +1753,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1516
1753
|
# set self.label to basename without extension
|
|
1517
1754
|
if self.label is None or self.label == "":
|
|
1518
1755
|
self.label = os.path.splitext(os.path.basename(filename))[0]
|
|
1519
|
-
self.logger.info(
|
|
1756
|
+
self.logger.info(
|
|
1757
|
+
f"Sample loaded successfully from {filename} (optimized for study)",
|
|
1758
|
+
)
|
|
1520
1759
|
|
|
1521
1760
|
|
|
1522
1761
|
def load_schema(schema_path: str) -> Dict[str, Any]:
|
|
@@ -1564,13 +1803,20 @@ def clean_null_values_polars(df: pl.DataFrame) -> pl.DataFrame:
|
|
|
1564
1803
|
cleaned_df = df
|
|
1565
1804
|
for col in df.columns:
|
|
1566
1805
|
if df[col].dtype == pl.Utf8: # String columns
|
|
1567
|
-
cleaned_df = cleaned_df.with_columns(
|
|
1568
|
-
|
|
1569
|
-
|
|
1806
|
+
cleaned_df = cleaned_df.with_columns(
|
|
1807
|
+
[
|
|
1808
|
+
pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
|
|
1809
|
+
.then(None)
|
|
1810
|
+
.otherwise(pl.col(col))
|
|
1811
|
+
.alias(col),
|
|
1812
|
+
],
|
|
1813
|
+
)
|
|
1570
1814
|
elif df[col].dtype in [pl.Float64, pl.Float32]: # Float columns
|
|
1571
|
-
cleaned_df = cleaned_df.with_columns(
|
|
1572
|
-
|
|
1573
|
-
|
|
1815
|
+
cleaned_df = cleaned_df.with_columns(
|
|
1816
|
+
[
|
|
1817
|
+
pl.col(col).fill_nan(None).alias(col),
|
|
1818
|
+
],
|
|
1819
|
+
)
|
|
1574
1820
|
return cleaned_df
|
|
1575
1821
|
|
|
1576
1822
|
|
|
@@ -1606,7 +1852,12 @@ def cast_column_by_dtype(df: pl.DataFrame, col: str, dtype_str: str) -> pl.DataF
|
|
|
1606
1852
|
return df
|
|
1607
1853
|
|
|
1608
1854
|
|
|
1609
|
-
def _cast_to_int(
|
|
1855
|
+
def _cast_to_int(
|
|
1856
|
+
df: pl.DataFrame,
|
|
1857
|
+
col: str,
|
|
1858
|
+
current_dtype: pl.DataType,
|
|
1859
|
+
target_dtype: pl.DataType,
|
|
1860
|
+
) -> pl.DataFrame:
|
|
1610
1861
|
"""Helper function to cast column to integer type."""
|
|
1611
1862
|
if current_dtype == pl.Utf8:
|
|
1612
1863
|
return df.with_columns(
|
|
@@ -1618,7 +1869,12 @@ def _cast_to_int(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_
|
|
|
1618
1869
|
return _cast_with_binary_handling(df, col, current_dtype, target_dtype)
|
|
1619
1870
|
|
|
1620
1871
|
|
|
1621
|
-
def _cast_to_float(
|
|
1872
|
+
def _cast_to_float(
|
|
1873
|
+
df: pl.DataFrame,
|
|
1874
|
+
col: str,
|
|
1875
|
+
current_dtype: pl.DataType,
|
|
1876
|
+
target_dtype: pl.DataType,
|
|
1877
|
+
) -> pl.DataFrame:
|
|
1622
1878
|
"""Helper function to cast column to float type."""
|
|
1623
1879
|
if current_dtype == pl.Utf8:
|
|
1624
1880
|
return df.with_columns(
|
|
@@ -1639,20 +1895,29 @@ def _cast_with_binary_handling(
|
|
|
1639
1895
|
if target_dtype == pl.Utf8:
|
|
1640
1896
|
return df.with_columns(
|
|
1641
1897
|
pl.col(col)
|
|
1642
|
-
.map_elements(
|
|
1898
|
+
.map_elements(
|
|
1899
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1900
|
+
return_dtype=pl.Utf8,
|
|
1901
|
+
)
|
|
1643
1902
|
.cast(target_dtype),
|
|
1644
1903
|
)
|
|
1645
1904
|
elif "Int" in str(target_dtype):
|
|
1646
1905
|
return df.with_columns(
|
|
1647
1906
|
pl.col(col)
|
|
1648
|
-
.map_elements(
|
|
1907
|
+
.map_elements(
|
|
1908
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1909
|
+
return_dtype=pl.Utf8,
|
|
1910
|
+
)
|
|
1649
1911
|
.str.to_integer()
|
|
1650
1912
|
.cast(target_dtype),
|
|
1651
1913
|
)
|
|
1652
1914
|
elif "Float" in str(target_dtype):
|
|
1653
1915
|
return df.with_columns(
|
|
1654
1916
|
pl.col(col)
|
|
1655
|
-
.map_elements(
|
|
1917
|
+
.map_elements(
|
|
1918
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1919
|
+
return_dtype=pl.Utf8,
|
|
1920
|
+
)
|
|
1656
1921
|
.str.to_decimal()
|
|
1657
1922
|
.cast(target_dtype),
|
|
1658
1923
|
)
|
|
@@ -1661,7 +1926,11 @@ def _cast_with_binary_handling(
|
|
|
1661
1926
|
return df.with_columns(pl.col(col).cast(target_dtype))
|
|
1662
1927
|
|
|
1663
1928
|
|
|
1664
|
-
def apply_schema_to_dataframe(
|
|
1929
|
+
def apply_schema_to_dataframe(
|
|
1930
|
+
df: pl.DataFrame,
|
|
1931
|
+
schema: Dict[str, Any],
|
|
1932
|
+
df_name: str,
|
|
1933
|
+
) -> pl.DataFrame:
|
|
1665
1934
|
"""
|
|
1666
1935
|
Apply schema type casting to a Polars DataFrame.
|
|
1667
1936
|
|
|
@@ -1819,7 +2088,9 @@ def _create_dataframe_with_object_columns(
|
|
|
1819
2088
|
schema_columns = schema.get(df_name, {}).get("columns", {})
|
|
1820
2089
|
|
|
1821
2090
|
object_columns = {
|
|
1822
|
-
k: v
|
|
2091
|
+
k: v
|
|
2092
|
+
for k, v in data.items()
|
|
2093
|
+
if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
|
|
1823
2094
|
}
|
|
1824
2095
|
regular_columns = {k: v for k, v in data.items() if k not in object_columns}
|
|
1825
2096
|
|
|
@@ -1874,13 +2145,17 @@ def load_ms1_dataframe_from_h5_group(
|
|
|
1874
2145
|
dtype_str = schema_columns[col]["dtype"]
|
|
1875
2146
|
try:
|
|
1876
2147
|
if "Int" in dtype_str:
|
|
1877
|
-
ms1_df = ms1_df.with_columns(
|
|
1878
|
-
|
|
1879
|
-
|
|
2148
|
+
ms1_df = ms1_df.with_columns(
|
|
2149
|
+
[
|
|
2150
|
+
pl.col(col).cast(pl.Int64, strict=False),
|
|
2151
|
+
],
|
|
2152
|
+
)
|
|
1880
2153
|
elif "Float" in dtype_str:
|
|
1881
|
-
ms1_df = ms1_df.with_columns(
|
|
1882
|
-
|
|
1883
|
-
|
|
2154
|
+
ms1_df = ms1_df.with_columns(
|
|
2155
|
+
[
|
|
2156
|
+
pl.col(col).cast(pl.Float64, strict=False),
|
|
2157
|
+
],
|
|
2158
|
+
)
|
|
1884
2159
|
except Exception as e:
|
|
1885
2160
|
if logger:
|
|
1886
2161
|
logger.warning(
|
|
@@ -1891,7 +2166,9 @@ def load_ms1_dataframe_from_h5_group(
|
|
|
1891
2166
|
return clean_null_values_polars(ms1_df)
|
|
1892
2167
|
|
|
1893
2168
|
|
|
1894
|
-
def load_parameters_from_metadata(
|
|
2169
|
+
def load_parameters_from_metadata(
|
|
2170
|
+
metadata_group: h5py.Group,
|
|
2171
|
+
) -> Optional[Dict[str, Any]]:
|
|
1895
2172
|
"""
|
|
1896
2173
|
Load parameters from HDF5 metadata group.
|
|
1897
2174
|
|
|
@@ -1938,6 +2215,8 @@ def create_h5_metadata_group(
|
|
|
1938
2215
|
metadata_group = f.create_group("metadata")
|
|
1939
2216
|
metadata_group.attrs["format"] = "master-sample5-1"
|
|
1940
2217
|
metadata_group.attrs["file_path"] = str(file_path) if file_path is not None else ""
|
|
1941
|
-
metadata_group.attrs["file_source"] =
|
|
2218
|
+
metadata_group.attrs["file_source"] = (
|
|
2219
|
+
str(file_source) if file_source is not None else ""
|
|
2220
|
+
)
|
|
1942
2221
|
metadata_group.attrs["file_type"] = str(file_type) if file_type is not None else ""
|
|
1943
2222
|
metadata_group.attrs["label"] = str(label) if label is not None else ""
|