masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +3 -9
- masster/data/libs/README.md +1 -1
- masster/data/libs/ccm.csv +120 -120
- masster/data/libs/ccm.py +116 -62
- masster/data/libs/central_carbon_README.md +1 -1
- masster/data/libs/urine.py +161 -65
- masster/data/libs/urine_metabolites.csv +4693 -4693
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
- masster/logger.py +43 -78
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +264 -338
- masster/sample/defaults/find_adducts_def.py +8 -21
- masster/sample/defaults/find_features_def.py +1 -6
- masster/sample/defaults/get_spectrum_def.py +1 -5
- masster/sample/defaults/sample_def.py +1 -5
- masster/sample/h5.py +282 -561
- masster/sample/helpers.py +75 -131
- masster/sample/lib.py +17 -42
- masster/sample/load.py +17 -31
- masster/sample/parameters.py +2 -6
- masster/sample/plot.py +27 -88
- masster/sample/processing.py +87 -117
- masster/sample/quant.py +51 -57
- masster/sample/sample.py +90 -103
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +12 -35
- masster/sample/sciex.py +19 -66
- masster/spectrum.py +20 -58
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +1 -5
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/fill_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/integrate_def.py +1 -5
- masster/study/defaults/study_def.py +25 -58
- masster/study/export.py +207 -233
- masster/study/h5.py +136 -470
- masster/study/helpers.py +202 -495
- masster/study/helpers_optimized.py +13 -40
- masster/study/id.py +110 -213
- masster/study/load.py +143 -230
- masster/study/plot.py +257 -518
- masster/study/processing.py +257 -469
- masster/study/save.py +5 -15
- masster/study/study.py +276 -379
- masster/study/study5_schema.json +96 -96
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
- masster-0.4.1.dist-info/RECORD +67 -0
- masster-0.4.0.dist-info/RECORD +0 -67
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/sample/h5.py
CHANGED
|
@@ -7,17 +7,11 @@ import polars as pl
|
|
|
7
7
|
|
|
8
8
|
from typing import Any, Dict, List, Optional, Tuple
|
|
9
9
|
|
|
10
|
-
from
|
|
11
|
-
from
|
|
10
|
+
from masster.chromatogram import Chromatogram
|
|
11
|
+
from masster.spectrum import Spectrum
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
def _save_sample5(
|
|
15
|
-
self,
|
|
16
|
-
filename=None,
|
|
17
|
-
include_ms1=True,
|
|
18
|
-
include_scans=True,
|
|
19
|
-
save_featurexml=False,
|
|
20
|
-
):
|
|
14
|
+
def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, save_featurexml=False):
|
|
21
15
|
"""
|
|
22
16
|
Save the instance data to a sample5 HDF5 file with optimized compression.
|
|
23
17
|
|
|
@@ -62,16 +56,14 @@ def _save_sample5(
|
|
|
62
56
|
return
|
|
63
57
|
|
|
64
58
|
# synchronize feature_map if it exists
|
|
65
|
-
if hasattr(self,
|
|
59
|
+
if hasattr(self, '_feature_map') and self._feature_map is not None:
|
|
66
60
|
self._features_sync()
|
|
67
61
|
|
|
68
62
|
# if no extension is given, add .sample5
|
|
69
63
|
if not filename.endswith(".sample5"):
|
|
70
64
|
filename += ".sample5"
|
|
71
65
|
|
|
72
|
-
self.logger.debug(
|
|
73
|
-
f"Saving sample to {filename} with optimized LZF+shuffle compression",
|
|
74
|
-
)
|
|
66
|
+
self.logger.debug(f"Saving sample to {filename} with optimized LZF+shuffle compression")
|
|
75
67
|
|
|
76
68
|
# delete existing file if it exists
|
|
77
69
|
if os.path.exists(filename):
|
|
@@ -124,18 +116,12 @@ def _save_sample5(
|
|
|
124
116
|
except Exception:
|
|
125
117
|
try:
|
|
126
118
|
# Try to convert to numeric using numpy
|
|
127
|
-
numeric_data = np.array(
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
.replace("-", "")
|
|
134
|
-
.isdigit()
|
|
135
|
-
else np.nan
|
|
136
|
-
for x in data
|
|
137
|
-
],
|
|
138
|
-
)
|
|
119
|
+
numeric_data = np.array([
|
|
120
|
+
float(x)
|
|
121
|
+
if x is not None and str(x).replace(".", "").replace("-", "").isdigit()
|
|
122
|
+
else np.nan
|
|
123
|
+
for x in data
|
|
124
|
+
])
|
|
139
125
|
if not np.isnan(numeric_data).all():
|
|
140
126
|
scans_group.create_dataset(
|
|
141
127
|
col,
|
|
@@ -163,12 +149,7 @@ def _save_sample5(
|
|
|
163
149
|
)
|
|
164
150
|
scans_group[col].attrs["dtype"] = "string_repr"
|
|
165
151
|
else:
|
|
166
|
-
scans_group.create_dataset(
|
|
167
|
-
col,
|
|
168
|
-
data=data,
|
|
169
|
-
compression="lzf",
|
|
170
|
-
shuffle=True,
|
|
171
|
-
)
|
|
152
|
+
scans_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
|
|
172
153
|
scans_group[col].attrs["dtype"] = "native"
|
|
173
154
|
scans_group.attrs["columns"] = list(scans_df.columns)
|
|
174
155
|
|
|
@@ -245,12 +226,7 @@ def _save_sample5(
|
|
|
245
226
|
data = features[col].to_list()
|
|
246
227
|
# convert None to 'None' strings
|
|
247
228
|
data = ["None" if x is None else x for x in data]
|
|
248
|
-
features_group.create_dataset(
|
|
249
|
-
col,
|
|
250
|
-
data=data,
|
|
251
|
-
compression="lzf",
|
|
252
|
-
shuffle=True,
|
|
253
|
-
)
|
|
229
|
+
features_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
|
|
254
230
|
else:
|
|
255
231
|
try:
|
|
256
232
|
data = features[col].to_numpy()
|
|
@@ -285,18 +261,16 @@ def _save_sample5(
|
|
|
285
261
|
feature_map = self._get_feature_map()
|
|
286
262
|
if feature_map is not None:
|
|
287
263
|
# Temporarily set features for save operation
|
|
288
|
-
old_features = getattr(self,
|
|
264
|
+
old_features = getattr(self, '_oms_features_map', None)
|
|
289
265
|
self._oms_features_map = feature_map
|
|
290
266
|
try:
|
|
291
|
-
self._save_featureXML(
|
|
292
|
-
filename=filename.replace(".sample5", ".featureXML"),
|
|
293
|
-
)
|
|
267
|
+
self._save_featureXML(filename=filename.replace(".sample5", ".featureXML"))
|
|
294
268
|
finally:
|
|
295
269
|
# Restore original features value
|
|
296
270
|
if old_features is not None:
|
|
297
271
|
self._oms_features_map = old_features
|
|
298
272
|
else:
|
|
299
|
-
delattr(self,
|
|
273
|
+
delattr(self, '_oms_features_map')
|
|
300
274
|
else:
|
|
301
275
|
self.logger.warning("Cannot save featureXML: no feature data available")
|
|
302
276
|
|
|
@@ -335,28 +309,22 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
335
309
|
# Load metadata
|
|
336
310
|
if "metadata" in f:
|
|
337
311
|
metadata_group = f["metadata"]
|
|
338
|
-
self.file_path = decode_metadata_attr(
|
|
339
|
-
metadata_group.attrs.get("file_path", ""),
|
|
340
|
-
)
|
|
312
|
+
self.file_path = decode_metadata_attr(metadata_group.attrs.get("file_path", ""))
|
|
341
313
|
|
|
342
314
|
# Load file_source if it exists, otherwise set it equal to file_path
|
|
343
315
|
if "file_source" in metadata_group.attrs:
|
|
344
|
-
self.file_source = decode_metadata_attr(
|
|
345
|
-
metadata_group.attrs.get("file_source", ""),
|
|
346
|
-
)
|
|
316
|
+
self.file_source = decode_metadata_attr(metadata_group.attrs.get("file_source", ""))
|
|
347
317
|
else:
|
|
348
318
|
self.file_source = self.file_path
|
|
349
319
|
|
|
350
|
-
self.file_type = decode_metadata_attr(
|
|
351
|
-
metadata_group.attrs.get("file_type", ""),
|
|
352
|
-
)
|
|
320
|
+
self.file_type = decode_metadata_attr(metadata_group.attrs.get("file_type", ""))
|
|
353
321
|
self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
|
|
354
322
|
|
|
355
323
|
# Load parameters from JSON in metadata
|
|
356
324
|
loaded_data = load_parameters_from_metadata(metadata_group)
|
|
357
325
|
|
|
358
326
|
# Always create a fresh sample_defaults object
|
|
359
|
-
from
|
|
327
|
+
from masster.sample.defaults.sample_def import sample_defaults
|
|
360
328
|
|
|
361
329
|
self.parameters = sample_defaults()
|
|
362
330
|
|
|
@@ -400,23 +368,19 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
400
368
|
# Convert "None" strings and NaN values to proper null values
|
|
401
369
|
for col in self.scans_df.columns:
|
|
402
370
|
if self.scans_df[col].dtype == pl.Utf8: # String columns
|
|
403
|
-
self.scans_df = self.scans_df.with_columns(
|
|
404
|
-
[
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
],
|
|
410
|
-
)
|
|
371
|
+
self.scans_df = self.scans_df.with_columns([
|
|
372
|
+
pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
|
|
373
|
+
.then(None)
|
|
374
|
+
.otherwise(pl.col(col))
|
|
375
|
+
.alias(col),
|
|
376
|
+
])
|
|
411
377
|
elif self.scans_df[col].dtype in [
|
|
412
378
|
pl.Float64,
|
|
413
379
|
pl.Float32,
|
|
414
380
|
]: # Float columns
|
|
415
|
-
self.scans_df = self.scans_df.with_columns(
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
],
|
|
419
|
-
)
|
|
381
|
+
self.scans_df = self.scans_df.with_columns([
|
|
382
|
+
pl.col(col).fill_nan(None).alias(col),
|
|
383
|
+
])
|
|
420
384
|
|
|
421
385
|
# update all columns with schema types
|
|
422
386
|
for col in self.scans_df.columns:
|
|
@@ -434,9 +398,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
434
398
|
if self.scans_df[col].dtype == pl.Utf8:
|
|
435
399
|
# String data - convert to integer
|
|
436
400
|
self.scans_df = self.scans_df.with_columns(
|
|
437
|
-
pl.col(col)
|
|
438
|
-
.str.to_integer()
|
|
439
|
-
.cast(eval(dtype_str)),
|
|
401
|
+
pl.col(col).str.to_integer().cast(eval(dtype_str)),
|
|
440
402
|
)
|
|
441
403
|
elif self.scans_df[col].dtype in [
|
|
442
404
|
pl.Float64,
|
|
@@ -456,9 +418,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
456
418
|
if self.scans_df[col].dtype == pl.Utf8:
|
|
457
419
|
# String data - convert to float
|
|
458
420
|
self.scans_df = self.scans_df.with_columns(
|
|
459
|
-
pl.col(col)
|
|
460
|
-
.str.to_decimal()
|
|
461
|
-
.cast(eval(dtype_str)),
|
|
421
|
+
pl.col(col).str.to_decimal().cast(eval(dtype_str)),
|
|
462
422
|
)
|
|
463
423
|
else:
|
|
464
424
|
# Try direct casting
|
|
@@ -482,9 +442,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
482
442
|
self.scans_df = self.scans_df.with_columns(
|
|
483
443
|
pl.col(col)
|
|
484
444
|
.map_elements(
|
|
485
|
-
lambda x: x.decode("utf-8")
|
|
486
|
-
if isinstance(x, bytes)
|
|
487
|
-
else str(x),
|
|
445
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
488
446
|
return_dtype=pl.Utf8,
|
|
489
447
|
)
|
|
490
448
|
.cast(target_dtype),
|
|
@@ -493,9 +451,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
493
451
|
self.scans_df = self.scans_df.with_columns(
|
|
494
452
|
pl.col(col)
|
|
495
453
|
.map_elements(
|
|
496
|
-
lambda x: x.decode("utf-8")
|
|
497
|
-
if isinstance(x, bytes)
|
|
498
|
-
else str(x),
|
|
454
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
499
455
|
return_dtype=pl.Utf8,
|
|
500
456
|
)
|
|
501
457
|
.str.to_integer()
|
|
@@ -505,9 +461,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
505
461
|
self.scans_df = self.scans_df.with_columns(
|
|
506
462
|
pl.col(col)
|
|
507
463
|
.map_elements(
|
|
508
|
-
lambda x: x.decode("utf-8")
|
|
509
|
-
if isinstance(x, bytes)
|
|
510
|
-
else str(x),
|
|
464
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
511
465
|
return_dtype=pl.Utf8,
|
|
512
466
|
)
|
|
513
467
|
.str.to_decimal()
|
|
@@ -536,9 +490,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
536
490
|
if "scans_df" in schema and "columns" in schema["scans_df"]:
|
|
537
491
|
schema_column_order = list(schema["scans_df"]["columns"].keys())
|
|
538
492
|
# Only reorder columns that exist in both schema and DataFrame
|
|
539
|
-
existing_columns = [
|
|
540
|
-
col for col in schema_column_order if col in self.scans_df.columns
|
|
541
|
-
]
|
|
493
|
+
existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
|
|
542
494
|
if existing_columns:
|
|
543
495
|
self.scans_df = self.scans_df.select(existing_columns)
|
|
544
496
|
|
|
@@ -665,29 +617,23 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
665
617
|
if k in schema.get("features_df", {}).get("columns", {})
|
|
666
618
|
and schema["features_df"]["columns"][k]["dtype"] == "pl.Object"
|
|
667
619
|
}
|
|
668
|
-
regular_columns = {
|
|
669
|
-
k: v for k, v in data.items() if k not in object_columns
|
|
670
|
-
}
|
|
620
|
+
regular_columns = {k: v for k, v in data.items() if k not in object_columns}
|
|
671
621
|
|
|
672
622
|
# Create DataFrame with regular columns first
|
|
673
623
|
if regular_columns:
|
|
674
624
|
self.features_df = pl.DataFrame(regular_columns)
|
|
675
625
|
# Add Object columns one by one
|
|
676
626
|
for col, values in object_columns.items():
|
|
677
|
-
self.features_df = self.features_df.with_columns(
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
],
|
|
681
|
-
)
|
|
627
|
+
self.features_df = self.features_df.with_columns([
|
|
628
|
+
pl.Series(col, values, dtype=pl.Object),
|
|
629
|
+
])
|
|
682
630
|
else:
|
|
683
631
|
# Only Object columns
|
|
684
632
|
self.features_df = pl.DataFrame()
|
|
685
633
|
for col, values in object_columns.items():
|
|
686
|
-
self.features_df = self.features_df.with_columns(
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
],
|
|
690
|
-
)
|
|
634
|
+
self.features_df = self.features_df.with_columns([
|
|
635
|
+
pl.Series(col, values, dtype=pl.Object),
|
|
636
|
+
])
|
|
691
637
|
|
|
692
638
|
# update all columns with schema types (skip Object columns)
|
|
693
639
|
for col in self.features_df.columns:
|
|
@@ -704,25 +650,16 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
704
650
|
# Convert to numeric first, handling different input types
|
|
705
651
|
if self.features_df[col].dtype == pl.Utf8:
|
|
706
652
|
# String data - convert to integer
|
|
707
|
-
self.features_df = (
|
|
708
|
-
|
|
709
|
-
pl.col(col)
|
|
710
|
-
.str.to_integer()
|
|
711
|
-
.cast(eval(dtype_str)),
|
|
712
|
-
)
|
|
653
|
+
self.features_df = self.features_df.with_columns(
|
|
654
|
+
pl.col(col).str.to_integer().cast(eval(dtype_str)),
|
|
713
655
|
)
|
|
714
656
|
elif self.features_df[col].dtype in [
|
|
715
657
|
pl.Float64,
|
|
716
658
|
pl.Float32,
|
|
717
659
|
]:
|
|
718
660
|
# Float data - cast to integer with null handling for NaN values
|
|
719
|
-
self.features_df = (
|
|
720
|
-
|
|
721
|
-
pl.col(col).cast(
|
|
722
|
-
eval(dtype_str),
|
|
723
|
-
strict=False,
|
|
724
|
-
),
|
|
725
|
-
)
|
|
661
|
+
self.features_df = self.features_df.with_columns(
|
|
662
|
+
pl.col(col).cast(eval(dtype_str), strict=False),
|
|
726
663
|
)
|
|
727
664
|
else:
|
|
728
665
|
# Handle special cases and try direct casting for other types
|
|
@@ -733,70 +670,50 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
733
670
|
if "Binary" in str(current_dtype):
|
|
734
671
|
# Convert binary to string first, then to target type
|
|
735
672
|
if target_dtype == pl.Utf8:
|
|
736
|
-
self.features_df = (
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
.
|
|
740
|
-
|
|
741
|
-
if isinstance(x, bytes)
|
|
742
|
-
else str(x),
|
|
743
|
-
return_dtype=pl.Utf8,
|
|
744
|
-
)
|
|
745
|
-
.cast(target_dtype),
|
|
673
|
+
self.features_df = self.features_df.with_columns(
|
|
674
|
+
pl.col(col)
|
|
675
|
+
.map_elements(
|
|
676
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
677
|
+
return_dtype=pl.Utf8,
|
|
746
678
|
)
|
|
679
|
+
.cast(target_dtype),
|
|
747
680
|
)
|
|
748
681
|
elif "Int" in str(target_dtype):
|
|
749
|
-
self.features_df = (
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
.
|
|
753
|
-
|
|
754
|
-
if isinstance(x, bytes)
|
|
755
|
-
else str(x),
|
|
756
|
-
return_dtype=pl.Utf8,
|
|
757
|
-
)
|
|
758
|
-
.str.to_integer()
|
|
759
|
-
.cast(target_dtype),
|
|
682
|
+
self.features_df = self.features_df.with_columns(
|
|
683
|
+
pl.col(col)
|
|
684
|
+
.map_elements(
|
|
685
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
686
|
+
return_dtype=pl.Utf8,
|
|
760
687
|
)
|
|
688
|
+
.str.to_integer()
|
|
689
|
+
.cast(target_dtype),
|
|
761
690
|
)
|
|
762
691
|
elif "Float" in str(target_dtype):
|
|
763
|
-
self.features_df = (
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
.
|
|
767
|
-
|
|
768
|
-
if isinstance(x, bytes)
|
|
769
|
-
else str(x),
|
|
770
|
-
return_dtype=pl.Utf8,
|
|
771
|
-
)
|
|
772
|
-
.str.to_decimal()
|
|
773
|
-
.cast(target_dtype),
|
|
692
|
+
self.features_df = self.features_df.with_columns(
|
|
693
|
+
pl.col(col)
|
|
694
|
+
.map_elements(
|
|
695
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
696
|
+
return_dtype=pl.Utf8,
|
|
774
697
|
)
|
|
698
|
+
.str.to_decimal()
|
|
699
|
+
.cast(target_dtype),
|
|
775
700
|
)
|
|
776
701
|
else:
|
|
777
702
|
# Try direct casting
|
|
778
|
-
self.features_df = (
|
|
779
|
-
|
|
780
|
-
pl.col(col).cast(target_dtype),
|
|
781
|
-
)
|
|
703
|
+
self.features_df = self.features_df.with_columns(
|
|
704
|
+
pl.col(col).cast(target_dtype),
|
|
782
705
|
)
|
|
783
706
|
else:
|
|
784
707
|
# Try direct casting for non-binary types
|
|
785
|
-
self.features_df = (
|
|
786
|
-
|
|
787
|
-
pl.col(col).cast(target_dtype),
|
|
788
|
-
)
|
|
708
|
+
self.features_df = self.features_df.with_columns(
|
|
709
|
+
pl.col(col).cast(target_dtype),
|
|
789
710
|
)
|
|
790
711
|
elif "Float" in dtype_str:
|
|
791
712
|
# Convert to float, handling different input types
|
|
792
713
|
if self.features_df[col].dtype == pl.Utf8:
|
|
793
714
|
# String data - convert to float
|
|
794
|
-
self.features_df = (
|
|
795
|
-
|
|
796
|
-
pl.col(col)
|
|
797
|
-
.str.to_decimal()
|
|
798
|
-
.cast(eval(dtype_str)),
|
|
799
|
-
)
|
|
715
|
+
self.features_df = self.features_df.with_columns(
|
|
716
|
+
pl.col(col).str.to_decimal().cast(eval(dtype_str)),
|
|
800
717
|
)
|
|
801
718
|
else:
|
|
802
719
|
# Handle special cases and try direct casting for other types
|
|
@@ -807,59 +724,43 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
807
724
|
if "Binary" in str(current_dtype):
|
|
808
725
|
# Convert binary to string first, then to target type
|
|
809
726
|
if target_dtype == pl.Utf8:
|
|
810
|
-
self.features_df = (
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
.
|
|
814
|
-
|
|
815
|
-
if isinstance(x, bytes)
|
|
816
|
-
else str(x),
|
|
817
|
-
return_dtype=pl.Utf8,
|
|
818
|
-
)
|
|
819
|
-
.cast(target_dtype),
|
|
727
|
+
self.features_df = self.features_df.with_columns(
|
|
728
|
+
pl.col(col)
|
|
729
|
+
.map_elements(
|
|
730
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
731
|
+
return_dtype=pl.Utf8,
|
|
820
732
|
)
|
|
733
|
+
.cast(target_dtype),
|
|
821
734
|
)
|
|
822
735
|
elif "Int" in str(target_dtype):
|
|
823
|
-
self.features_df = (
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
.
|
|
827
|
-
|
|
828
|
-
if isinstance(x, bytes)
|
|
829
|
-
else str(x),
|
|
830
|
-
return_dtype=pl.Utf8,
|
|
831
|
-
)
|
|
832
|
-
.str.to_integer()
|
|
833
|
-
.cast(target_dtype),
|
|
736
|
+
self.features_df = self.features_df.with_columns(
|
|
737
|
+
pl.col(col)
|
|
738
|
+
.map_elements(
|
|
739
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
740
|
+
return_dtype=pl.Utf8,
|
|
834
741
|
)
|
|
742
|
+
.str.to_integer()
|
|
743
|
+
.cast(target_dtype),
|
|
835
744
|
)
|
|
836
745
|
elif "Float" in str(target_dtype):
|
|
837
|
-
self.features_df = (
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
.
|
|
841
|
-
|
|
842
|
-
if isinstance(x, bytes)
|
|
843
|
-
else str(x),
|
|
844
|
-
return_dtype=pl.Utf8,
|
|
845
|
-
)
|
|
846
|
-
.str.to_decimal()
|
|
847
|
-
.cast(target_dtype),
|
|
746
|
+
self.features_df = self.features_df.with_columns(
|
|
747
|
+
pl.col(col)
|
|
748
|
+
.map_elements(
|
|
749
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
750
|
+
return_dtype=pl.Utf8,
|
|
848
751
|
)
|
|
752
|
+
.str.to_decimal()
|
|
753
|
+
.cast(target_dtype),
|
|
849
754
|
)
|
|
850
755
|
else:
|
|
851
756
|
# Try direct casting
|
|
852
|
-
self.features_df = (
|
|
853
|
-
|
|
854
|
-
pl.col(col).cast(target_dtype),
|
|
855
|
-
)
|
|
757
|
+
self.features_df = self.features_df.with_columns(
|
|
758
|
+
pl.col(col).cast(target_dtype),
|
|
856
759
|
)
|
|
857
760
|
else:
|
|
858
761
|
# Try direct casting for non-binary types
|
|
859
|
-
self.features_df = (
|
|
860
|
-
|
|
861
|
-
pl.col(col).cast(target_dtype),
|
|
862
|
-
)
|
|
762
|
+
self.features_df = self.features_df.with_columns(
|
|
763
|
+
pl.col(col).cast(target_dtype),
|
|
863
764
|
)
|
|
864
765
|
elif "Utf8" in dtype_str:
|
|
865
766
|
# Ensure it's string type
|
|
@@ -875,59 +776,43 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
875
776
|
if "Binary" in str(current_dtype):
|
|
876
777
|
# Convert binary to string first, then to target type
|
|
877
778
|
if target_dtype == pl.Utf8:
|
|
878
|
-
self.features_df = (
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
.
|
|
882
|
-
|
|
883
|
-
if isinstance(x, bytes)
|
|
884
|
-
else str(x),
|
|
885
|
-
return_dtype=pl.Utf8,
|
|
886
|
-
)
|
|
887
|
-
.cast(target_dtype),
|
|
779
|
+
self.features_df = self.features_df.with_columns(
|
|
780
|
+
pl.col(col)
|
|
781
|
+
.map_elements(
|
|
782
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
783
|
+
return_dtype=pl.Utf8,
|
|
888
784
|
)
|
|
785
|
+
.cast(target_dtype),
|
|
889
786
|
)
|
|
890
787
|
elif "Int" in str(target_dtype):
|
|
891
|
-
self.features_df = (
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
.
|
|
895
|
-
|
|
896
|
-
if isinstance(x, bytes)
|
|
897
|
-
else str(x),
|
|
898
|
-
return_dtype=pl.Utf8,
|
|
899
|
-
)
|
|
900
|
-
.str.to_integer()
|
|
901
|
-
.cast(target_dtype),
|
|
788
|
+
self.features_df = self.features_df.with_columns(
|
|
789
|
+
pl.col(col)
|
|
790
|
+
.map_elements(
|
|
791
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
792
|
+
return_dtype=pl.Utf8,
|
|
902
793
|
)
|
|
794
|
+
.str.to_integer()
|
|
795
|
+
.cast(target_dtype),
|
|
903
796
|
)
|
|
904
797
|
elif "Float" in str(target_dtype):
|
|
905
|
-
self.features_df = (
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
.
|
|
909
|
-
|
|
910
|
-
if isinstance(x, bytes)
|
|
911
|
-
else str(x),
|
|
912
|
-
return_dtype=pl.Utf8,
|
|
913
|
-
)
|
|
914
|
-
.str.to_decimal()
|
|
915
|
-
.cast(target_dtype),
|
|
798
|
+
self.features_df = self.features_df.with_columns(
|
|
799
|
+
pl.col(col)
|
|
800
|
+
.map_elements(
|
|
801
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
802
|
+
return_dtype=pl.Utf8,
|
|
916
803
|
)
|
|
804
|
+
.str.to_decimal()
|
|
805
|
+
.cast(target_dtype),
|
|
917
806
|
)
|
|
918
807
|
else:
|
|
919
808
|
# Try direct casting
|
|
920
|
-
self.features_df = (
|
|
921
|
-
|
|
922
|
-
pl.col(col).cast(target_dtype),
|
|
923
|
-
)
|
|
809
|
+
self.features_df = self.features_df.with_columns(
|
|
810
|
+
pl.col(col).cast(target_dtype),
|
|
924
811
|
)
|
|
925
812
|
else:
|
|
926
813
|
# Try direct casting for non-binary types
|
|
927
|
-
self.features_df = (
|
|
928
|
-
|
|
929
|
-
pl.col(col).cast(target_dtype),
|
|
930
|
-
)
|
|
814
|
+
self.features_df = self.features_df.with_columns(
|
|
815
|
+
pl.col(col).cast(target_dtype),
|
|
931
816
|
)
|
|
932
817
|
except Exception as e:
|
|
933
818
|
self.logger.warning(
|
|
@@ -942,31 +827,23 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
942
827
|
# This ensures "None" strings introduced by failed conversions are properly handled
|
|
943
828
|
for col in self.features_df.columns:
|
|
944
829
|
if self.features_df[col].dtype == pl.Utf8: # String columns
|
|
945
|
-
self.features_df = self.features_df.with_columns(
|
|
946
|
-
[
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
],
|
|
952
|
-
)
|
|
830
|
+
self.features_df = self.features_df.with_columns([
|
|
831
|
+
pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
|
|
832
|
+
.then(None)
|
|
833
|
+
.otherwise(pl.col(col))
|
|
834
|
+
.alias(col),
|
|
835
|
+
])
|
|
953
836
|
# Float columns
|
|
954
837
|
elif self.features_df[col].dtype in [pl.Float64, pl.Float32]:
|
|
955
|
-
self.features_df = self.features_df.with_columns(
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
],
|
|
959
|
-
)
|
|
838
|
+
self.features_df = self.features_df.with_columns([
|
|
839
|
+
pl.col(col).fill_nan(None).alias(col),
|
|
840
|
+
])
|
|
960
841
|
|
|
961
842
|
# Ensure column order matches schema order
|
|
962
843
|
if "features_df" in schema and "columns" in schema["features_df"]:
|
|
963
844
|
schema_column_order = list(schema["features_df"]["columns"].keys())
|
|
964
845
|
# Only reorder columns that exist in both schema and DataFrame
|
|
965
|
-
existing_columns = [
|
|
966
|
-
col
|
|
967
|
-
for col in schema_column_order
|
|
968
|
-
if col in self.features_df.columns
|
|
969
|
-
]
|
|
846
|
+
existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
|
|
970
847
|
if existing_columns:
|
|
971
848
|
self.features_df = self.features_df.select(existing_columns)
|
|
972
849
|
|
|
@@ -996,17 +873,13 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
996
873
|
dtype_str = schema_columns[col]["dtype"]
|
|
997
874
|
try:
|
|
998
875
|
if "Int" in dtype_str:
|
|
999
|
-
self.ms1_df = self.ms1_df.with_columns(
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
],
|
|
1003
|
-
)
|
|
876
|
+
self.ms1_df = self.ms1_df.with_columns([
|
|
877
|
+
pl.col(col).cast(pl.Int64, strict=False),
|
|
878
|
+
])
|
|
1004
879
|
elif "Float" in dtype_str:
|
|
1005
|
-
self.ms1_df = self.ms1_df.with_columns(
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
],
|
|
1009
|
-
)
|
|
880
|
+
self.ms1_df = self.ms1_df.with_columns([
|
|
881
|
+
pl.col(col).cast(pl.Float64, strict=False),
|
|
882
|
+
])
|
|
1010
883
|
except Exception as e:
|
|
1011
884
|
self.logger.warning(
|
|
1012
885
|
f"Failed to apply schema type {dtype_str} to column {col}: {e}",
|
|
@@ -1075,28 +948,22 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1075
948
|
# Load metadata
|
|
1076
949
|
if "metadata" in f:
|
|
1077
950
|
metadata_group = f["metadata"]
|
|
1078
|
-
self.file_path = decode_metadata_attr(
|
|
1079
|
-
metadata_group.attrs.get("file_path", ""),
|
|
1080
|
-
)
|
|
951
|
+
self.file_path = decode_metadata_attr(metadata_group.attrs.get("file_path", ""))
|
|
1081
952
|
|
|
1082
953
|
# Load file_source if it exists, otherwise set it equal to file_path
|
|
1083
954
|
if "file_source" in metadata_group.attrs:
|
|
1084
|
-
self.file_source = decode_metadata_attr(
|
|
1085
|
-
metadata_group.attrs.get("file_source", ""),
|
|
1086
|
-
)
|
|
955
|
+
self.file_source = decode_metadata_attr(metadata_group.attrs.get("file_source", ""))
|
|
1087
956
|
else:
|
|
1088
957
|
self.file_source = self.file_path
|
|
1089
958
|
|
|
1090
|
-
self.file_type = decode_metadata_attr(
|
|
1091
|
-
metadata_group.attrs.get("file_type", ""),
|
|
1092
|
-
)
|
|
959
|
+
self.file_type = decode_metadata_attr(metadata_group.attrs.get("file_type", ""))
|
|
1093
960
|
self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
|
|
1094
961
|
|
|
1095
962
|
# Load parameters from JSON in metadata
|
|
1096
963
|
loaded_data = load_parameters_from_metadata(metadata_group)
|
|
1097
964
|
|
|
1098
965
|
# Always create a fresh sample_defaults object
|
|
1099
|
-
from
|
|
966
|
+
from masster.sample.defaults.sample_def import sample_defaults
|
|
1100
967
|
|
|
1101
968
|
self.parameters = sample_defaults()
|
|
1102
969
|
|
|
@@ -1140,23 +1007,19 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1140
1007
|
# Convert "None" strings and NaN values to proper null values
|
|
1141
1008
|
for col in self.scans_df.columns:
|
|
1142
1009
|
if self.scans_df[col].dtype == pl.Utf8: # String columns
|
|
1143
|
-
self.scans_df = self.scans_df.with_columns(
|
|
1144
|
-
[
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
],
|
|
1150
|
-
)
|
|
1010
|
+
self.scans_df = self.scans_df.with_columns([
|
|
1011
|
+
pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
|
|
1012
|
+
.then(None)
|
|
1013
|
+
.otherwise(pl.col(col))
|
|
1014
|
+
.alias(col),
|
|
1015
|
+
])
|
|
1151
1016
|
elif self.scans_df[col].dtype in [
|
|
1152
1017
|
pl.Float64,
|
|
1153
1018
|
pl.Float32,
|
|
1154
1019
|
]: # Float columns
|
|
1155
|
-
self.scans_df = self.scans_df.with_columns(
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
],
|
|
1159
|
-
)
|
|
1020
|
+
self.scans_df = self.scans_df.with_columns([
|
|
1021
|
+
pl.col(col).fill_nan(None).alias(col),
|
|
1022
|
+
])
|
|
1160
1023
|
|
|
1161
1024
|
# update all columns with schema types
|
|
1162
1025
|
for col in self.scans_df.columns:
|
|
@@ -1174,9 +1037,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1174
1037
|
if self.scans_df[col].dtype == pl.Utf8:
|
|
1175
1038
|
# String data - convert to integer
|
|
1176
1039
|
self.scans_df = self.scans_df.with_columns(
|
|
1177
|
-
pl.col(col)
|
|
1178
|
-
.str.to_integer()
|
|
1179
|
-
.cast(eval(dtype_str)),
|
|
1040
|
+
pl.col(col).str.to_integer().cast(eval(dtype_str)),
|
|
1180
1041
|
)
|
|
1181
1042
|
elif self.scans_df[col].dtype in [
|
|
1182
1043
|
pl.Float64,
|
|
@@ -1196,9 +1057,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1196
1057
|
if self.scans_df[col].dtype == pl.Utf8:
|
|
1197
1058
|
# String data - convert to float
|
|
1198
1059
|
self.scans_df = self.scans_df.with_columns(
|
|
1199
|
-
pl.col(col)
|
|
1200
|
-
.str.to_decimal()
|
|
1201
|
-
.cast(eval(dtype_str)),
|
|
1060
|
+
pl.col(col).str.to_decimal().cast(eval(dtype_str)),
|
|
1202
1061
|
)
|
|
1203
1062
|
else:
|
|
1204
1063
|
# Try direct casting
|
|
@@ -1222,9 +1081,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1222
1081
|
self.scans_df = self.scans_df.with_columns(
|
|
1223
1082
|
pl.col(col)
|
|
1224
1083
|
.map_elements(
|
|
1225
|
-
lambda x: x.decode("utf-8")
|
|
1226
|
-
if isinstance(x, bytes)
|
|
1227
|
-
else str(x),
|
|
1084
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1228
1085
|
return_dtype=pl.Utf8,
|
|
1229
1086
|
)
|
|
1230
1087
|
.cast(target_dtype),
|
|
@@ -1233,9 +1090,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1233
1090
|
self.scans_df = self.scans_df.with_columns(
|
|
1234
1091
|
pl.col(col)
|
|
1235
1092
|
.map_elements(
|
|
1236
|
-
lambda x: x.decode("utf-8")
|
|
1237
|
-
if isinstance(x, bytes)
|
|
1238
|
-
else str(x),
|
|
1093
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1239
1094
|
return_dtype=pl.Utf8,
|
|
1240
1095
|
)
|
|
1241
1096
|
.str.to_integer()
|
|
@@ -1245,9 +1100,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1245
1100
|
self.scans_df = self.scans_df.with_columns(
|
|
1246
1101
|
pl.col(col)
|
|
1247
1102
|
.map_elements(
|
|
1248
|
-
lambda x: x.decode("utf-8")
|
|
1249
|
-
if isinstance(x, bytes)
|
|
1250
|
-
else str(x),
|
|
1103
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1251
1104
|
return_dtype=pl.Utf8,
|
|
1252
1105
|
)
|
|
1253
1106
|
.str.to_decimal()
|
|
@@ -1276,9 +1129,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1276
1129
|
if "scans_df" in schema and "columns" in schema["scans_df"]:
|
|
1277
1130
|
schema_column_order = list(schema["scans_df"]["columns"].keys())
|
|
1278
1131
|
# Only reorder columns that exist in both schema and DataFrame
|
|
1279
|
-
existing_columns = [
|
|
1280
|
-
col for col in schema_column_order if col in self.scans_df.columns
|
|
1281
|
-
]
|
|
1132
|
+
existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
|
|
1282
1133
|
if existing_columns:
|
|
1283
1134
|
self.scans_df = self.scans_df.select(existing_columns)
|
|
1284
1135
|
|
|
@@ -1357,18 +1208,12 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1357
1208
|
spectrum_list = []
|
|
1358
1209
|
for spec_data in json.loads(item):
|
|
1359
1210
|
if spec_data is not None:
|
|
1360
|
-
spectrum = Spectrum.from_json(
|
|
1361
|
-
spec_data,
|
|
1362
|
-
)
|
|
1211
|
+
spectrum = Spectrum.from_json(spec_data)
|
|
1363
1212
|
spectrum_list.append(spectrum)
|
|
1364
1213
|
else:
|
|
1365
1214
|
spectrum_list.append(None)
|
|
1366
1215
|
reconstructed_data.append(spectrum_list)
|
|
1367
|
-
except (
|
|
1368
|
-
json.JSONDecodeError,
|
|
1369
|
-
ValueError,
|
|
1370
|
-
TypeError,
|
|
1371
|
-
):
|
|
1216
|
+
except (json.JSONDecodeError, ValueError, TypeError):
|
|
1372
1217
|
reconstructed_data.append(None)
|
|
1373
1218
|
|
|
1374
1219
|
data[col] = reconstructed_data
|
|
@@ -1384,13 +1229,10 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1384
1229
|
# Separate Object columns from regular columns to avoid astuple issues
|
|
1385
1230
|
object_columns = {}
|
|
1386
1231
|
regular_columns = {}
|
|
1387
|
-
|
|
1232
|
+
|
|
1388
1233
|
for col, values in data.items():
|
|
1389
1234
|
if col in schema.get("features_df", {}).get("columns", {}):
|
|
1390
|
-
if "Object" in schema["features_df"]["columns"][col].get(
|
|
1391
|
-
"dtype",
|
|
1392
|
-
"",
|
|
1393
|
-
):
|
|
1235
|
+
if "Object" in schema["features_df"]["columns"][col].get("dtype", ""):
|
|
1394
1236
|
object_columns[col] = values
|
|
1395
1237
|
else:
|
|
1396
1238
|
regular_columns[col] = values
|
|
@@ -1403,48 +1245,38 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1403
1245
|
else:
|
|
1404
1246
|
# If no regular columns, create empty DataFrame
|
|
1405
1247
|
self.features_df = pl.DataFrame()
|
|
1406
|
-
|
|
1248
|
+
|
|
1407
1249
|
# Add Object columns one by one
|
|
1408
1250
|
for col, values in object_columns.items():
|
|
1409
1251
|
if not self.features_df.is_empty():
|
|
1410
1252
|
self.features_df = self.features_df.with_columns(
|
|
1411
|
-
pl.Series(col, values, dtype=pl.Object).alias(col)
|
|
1253
|
+
pl.Series(col, values, dtype=pl.Object).alias(col)
|
|
1412
1254
|
)
|
|
1413
1255
|
else:
|
|
1414
1256
|
# Create DataFrame with just this Object column
|
|
1415
|
-
self.features_df = pl.DataFrame(
|
|
1416
|
-
{col: values},
|
|
1417
|
-
schema={col: pl.Object},
|
|
1418
|
-
)
|
|
1257
|
+
self.features_df = pl.DataFrame({col: values}, schema={col: pl.Object})
|
|
1419
1258
|
|
|
1420
1259
|
# Convert "None" strings and NaN values to proper null values for regular columns first
|
|
1421
1260
|
for col in self.features_df.columns:
|
|
1422
1261
|
# Skip Object columns - they're already properly reconstructed
|
|
1423
1262
|
if col in schema.get("features_df", {}).get("columns", {}):
|
|
1424
|
-
if "Object" in schema["features_df"]["columns"][col].get(
|
|
1425
|
-
"dtype",
|
|
1426
|
-
"",
|
|
1427
|
-
):
|
|
1263
|
+
if "Object" in schema["features_df"]["columns"][col].get("dtype", ""):
|
|
1428
1264
|
continue
|
|
1429
1265
|
|
|
1430
1266
|
if self.features_df[col].dtype == pl.Utf8: # String columns
|
|
1431
|
-
self.features_df = self.features_df.with_columns(
|
|
1432
|
-
[
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
],
|
|
1438
|
-
)
|
|
1267
|
+
self.features_df = self.features_df.with_columns([
|
|
1268
|
+
pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
|
|
1269
|
+
.then(None)
|
|
1270
|
+
.otherwise(pl.col(col))
|
|
1271
|
+
.alias(col),
|
|
1272
|
+
])
|
|
1439
1273
|
elif self.features_df[col].dtype in [
|
|
1440
1274
|
pl.Float64,
|
|
1441
1275
|
pl.Float32,
|
|
1442
1276
|
]: # Float columns
|
|
1443
|
-
self.features_df = self.features_df.with_columns(
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
],
|
|
1447
|
-
)
|
|
1277
|
+
self.features_df = self.features_df.with_columns([
|
|
1278
|
+
pl.col(col).fill_nan(None).alias(col),
|
|
1279
|
+
])
|
|
1448
1280
|
|
|
1449
1281
|
# update all columns with schema types
|
|
1450
1282
|
for col in self.features_df.columns:
|
|
@@ -1461,25 +1293,16 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1461
1293
|
# Convert to numeric first, handling different input types
|
|
1462
1294
|
if self.features_df[col].dtype == pl.Utf8:
|
|
1463
1295
|
# String data - convert to integer
|
|
1464
|
-
self.features_df = (
|
|
1465
|
-
|
|
1466
|
-
pl.col(col)
|
|
1467
|
-
.str.to_integer()
|
|
1468
|
-
.cast(eval(dtype_str)),
|
|
1469
|
-
)
|
|
1296
|
+
self.features_df = self.features_df.with_columns(
|
|
1297
|
+
pl.col(col).str.to_integer().cast(eval(dtype_str)),
|
|
1470
1298
|
)
|
|
1471
1299
|
elif self.features_df[col].dtype in [
|
|
1472
1300
|
pl.Float64,
|
|
1473
1301
|
pl.Float32,
|
|
1474
1302
|
]:
|
|
1475
1303
|
# Float data - cast to integer with null handling for NaN values
|
|
1476
|
-
self.features_df = (
|
|
1477
|
-
|
|
1478
|
-
pl.col(col).cast(
|
|
1479
|
-
eval(dtype_str),
|
|
1480
|
-
strict=False,
|
|
1481
|
-
),
|
|
1482
|
-
)
|
|
1304
|
+
self.features_df = self.features_df.with_columns(
|
|
1305
|
+
pl.col(col).cast(eval(dtype_str), strict=False),
|
|
1483
1306
|
)
|
|
1484
1307
|
else:
|
|
1485
1308
|
# Handle special cases and try direct casting for other types
|
|
@@ -1490,70 +1313,50 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1490
1313
|
if "Binary" in str(current_dtype):
|
|
1491
1314
|
# Convert binary to string first, then to target type
|
|
1492
1315
|
if target_dtype == pl.Utf8:
|
|
1493
|
-
self.features_df = (
|
|
1494
|
-
|
|
1495
|
-
|
|
1496
|
-
.
|
|
1497
|
-
|
|
1498
|
-
if isinstance(x, bytes)
|
|
1499
|
-
else str(x),
|
|
1500
|
-
return_dtype=pl.Utf8,
|
|
1501
|
-
)
|
|
1502
|
-
.cast(target_dtype),
|
|
1316
|
+
self.features_df = self.features_df.with_columns(
|
|
1317
|
+
pl.col(col)
|
|
1318
|
+
.map_elements(
|
|
1319
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1320
|
+
return_dtype=pl.Utf8,
|
|
1503
1321
|
)
|
|
1322
|
+
.cast(target_dtype),
|
|
1504
1323
|
)
|
|
1505
1324
|
elif "Int" in str(target_dtype):
|
|
1506
|
-
self.features_df = (
|
|
1507
|
-
|
|
1508
|
-
|
|
1509
|
-
.
|
|
1510
|
-
|
|
1511
|
-
if isinstance(x, bytes)
|
|
1512
|
-
else str(x),
|
|
1513
|
-
return_dtype=pl.Utf8,
|
|
1514
|
-
)
|
|
1515
|
-
.str.to_integer()
|
|
1516
|
-
.cast(target_dtype),
|
|
1325
|
+
self.features_df = self.features_df.with_columns(
|
|
1326
|
+
pl.col(col)
|
|
1327
|
+
.map_elements(
|
|
1328
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1329
|
+
return_dtype=pl.Utf8,
|
|
1517
1330
|
)
|
|
1331
|
+
.str.to_integer()
|
|
1332
|
+
.cast(target_dtype),
|
|
1518
1333
|
)
|
|
1519
1334
|
elif "Float" in str(target_dtype):
|
|
1520
|
-
self.features_df = (
|
|
1521
|
-
|
|
1522
|
-
|
|
1523
|
-
.
|
|
1524
|
-
|
|
1525
|
-
if isinstance(x, bytes)
|
|
1526
|
-
else str(x),
|
|
1527
|
-
return_dtype=pl.Utf8,
|
|
1528
|
-
)
|
|
1529
|
-
.str.to_decimal()
|
|
1530
|
-
.cast(target_dtype),
|
|
1335
|
+
self.features_df = self.features_df.with_columns(
|
|
1336
|
+
pl.col(col)
|
|
1337
|
+
.map_elements(
|
|
1338
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1339
|
+
return_dtype=pl.Utf8,
|
|
1531
1340
|
)
|
|
1341
|
+
.str.to_decimal()
|
|
1342
|
+
.cast(target_dtype),
|
|
1532
1343
|
)
|
|
1533
1344
|
else:
|
|
1534
1345
|
# Try direct casting
|
|
1535
|
-
self.features_df = (
|
|
1536
|
-
|
|
1537
|
-
pl.col(col).cast(target_dtype),
|
|
1538
|
-
)
|
|
1346
|
+
self.features_df = self.features_df.with_columns(
|
|
1347
|
+
pl.col(col).cast(target_dtype),
|
|
1539
1348
|
)
|
|
1540
1349
|
else:
|
|
1541
1350
|
# Try direct casting for non-binary types
|
|
1542
|
-
self.features_df = (
|
|
1543
|
-
|
|
1544
|
-
pl.col(col).cast(target_dtype),
|
|
1545
|
-
)
|
|
1351
|
+
self.features_df = self.features_df.with_columns(
|
|
1352
|
+
pl.col(col).cast(target_dtype),
|
|
1546
1353
|
)
|
|
1547
1354
|
elif "Float" in dtype_str:
|
|
1548
1355
|
# Convert to float, handling different input types
|
|
1549
1356
|
if self.features_df[col].dtype == pl.Utf8:
|
|
1550
1357
|
# String data - convert to float
|
|
1551
|
-
self.features_df = (
|
|
1552
|
-
|
|
1553
|
-
pl.col(col)
|
|
1554
|
-
.str.to_decimal()
|
|
1555
|
-
.cast(eval(dtype_str)),
|
|
1556
|
-
)
|
|
1358
|
+
self.features_df = self.features_df.with_columns(
|
|
1359
|
+
pl.col(col).str.to_decimal().cast(eval(dtype_str)),
|
|
1557
1360
|
)
|
|
1558
1361
|
else:
|
|
1559
1362
|
# Handle special cases and try direct casting for other types
|
|
@@ -1564,59 +1367,43 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1564
1367
|
if "Binary" in str(current_dtype):
|
|
1565
1368
|
# Convert binary to string first, then to target type
|
|
1566
1369
|
if target_dtype == pl.Utf8:
|
|
1567
|
-
self.features_df = (
|
|
1568
|
-
|
|
1569
|
-
|
|
1570
|
-
.
|
|
1571
|
-
|
|
1572
|
-
if isinstance(x, bytes)
|
|
1573
|
-
else str(x),
|
|
1574
|
-
return_dtype=pl.Utf8,
|
|
1575
|
-
)
|
|
1576
|
-
.cast(target_dtype),
|
|
1370
|
+
self.features_df = self.features_df.with_columns(
|
|
1371
|
+
pl.col(col)
|
|
1372
|
+
.map_elements(
|
|
1373
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1374
|
+
return_dtype=pl.Utf8,
|
|
1577
1375
|
)
|
|
1376
|
+
.cast(target_dtype),
|
|
1578
1377
|
)
|
|
1579
1378
|
elif "Int" in str(target_dtype):
|
|
1580
|
-
self.features_df = (
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
.
|
|
1584
|
-
|
|
1585
|
-
if isinstance(x, bytes)
|
|
1586
|
-
else str(x),
|
|
1587
|
-
return_dtype=pl.Utf8,
|
|
1588
|
-
)
|
|
1589
|
-
.str.to_integer()
|
|
1590
|
-
.cast(target_dtype),
|
|
1379
|
+
self.features_df = self.features_df.with_columns(
|
|
1380
|
+
pl.col(col)
|
|
1381
|
+
.map_elements(
|
|
1382
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1383
|
+
return_dtype=pl.Utf8,
|
|
1591
1384
|
)
|
|
1385
|
+
.str.to_integer()
|
|
1386
|
+
.cast(target_dtype),
|
|
1592
1387
|
)
|
|
1593
1388
|
elif "Float" in str(target_dtype):
|
|
1594
|
-
self.features_df = (
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
.
|
|
1598
|
-
|
|
1599
|
-
if isinstance(x, bytes)
|
|
1600
|
-
else str(x),
|
|
1601
|
-
return_dtype=pl.Utf8,
|
|
1602
|
-
)
|
|
1603
|
-
.str.to_decimal()
|
|
1604
|
-
.cast(target_dtype),
|
|
1389
|
+
self.features_df = self.features_df.with_columns(
|
|
1390
|
+
pl.col(col)
|
|
1391
|
+
.map_elements(
|
|
1392
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1393
|
+
return_dtype=pl.Utf8,
|
|
1605
1394
|
)
|
|
1395
|
+
.str.to_decimal()
|
|
1396
|
+
.cast(target_dtype),
|
|
1606
1397
|
)
|
|
1607
1398
|
else:
|
|
1608
1399
|
# Try direct casting
|
|
1609
|
-
self.features_df = (
|
|
1610
|
-
|
|
1611
|
-
pl.col(col).cast(target_dtype),
|
|
1612
|
-
)
|
|
1400
|
+
self.features_df = self.features_df.with_columns(
|
|
1401
|
+
pl.col(col).cast(target_dtype),
|
|
1613
1402
|
)
|
|
1614
1403
|
else:
|
|
1615
1404
|
# Try direct casting for non-binary types
|
|
1616
|
-
self.features_df = (
|
|
1617
|
-
|
|
1618
|
-
pl.col(col).cast(target_dtype),
|
|
1619
|
-
)
|
|
1405
|
+
self.features_df = self.features_df.with_columns(
|
|
1406
|
+
pl.col(col).cast(target_dtype),
|
|
1620
1407
|
)
|
|
1621
1408
|
elif "Utf8" in dtype_str:
|
|
1622
1409
|
# Ensure it's string type
|
|
@@ -1632,59 +1419,43 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1632
1419
|
if "Binary" in str(current_dtype):
|
|
1633
1420
|
# Convert binary to string first, then to target type
|
|
1634
1421
|
if target_dtype == pl.Utf8:
|
|
1635
|
-
self.features_df = (
|
|
1636
|
-
|
|
1637
|
-
|
|
1638
|
-
.
|
|
1639
|
-
|
|
1640
|
-
if isinstance(x, bytes)
|
|
1641
|
-
else str(x),
|
|
1642
|
-
return_dtype=pl.Utf8,
|
|
1643
|
-
)
|
|
1644
|
-
.cast(target_dtype),
|
|
1422
|
+
self.features_df = self.features_df.with_columns(
|
|
1423
|
+
pl.col(col)
|
|
1424
|
+
.map_elements(
|
|
1425
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1426
|
+
return_dtype=pl.Utf8,
|
|
1645
1427
|
)
|
|
1428
|
+
.cast(target_dtype),
|
|
1646
1429
|
)
|
|
1647
1430
|
elif "Int" in str(target_dtype):
|
|
1648
|
-
self.features_df = (
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
.
|
|
1652
|
-
|
|
1653
|
-
if isinstance(x, bytes)
|
|
1654
|
-
else str(x),
|
|
1655
|
-
return_dtype=pl.Utf8,
|
|
1656
|
-
)
|
|
1657
|
-
.str.to_integer()
|
|
1658
|
-
.cast(target_dtype),
|
|
1431
|
+
self.features_df = self.features_df.with_columns(
|
|
1432
|
+
pl.col(col)
|
|
1433
|
+
.map_elements(
|
|
1434
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1435
|
+
return_dtype=pl.Utf8,
|
|
1659
1436
|
)
|
|
1437
|
+
.str.to_integer()
|
|
1438
|
+
.cast(target_dtype),
|
|
1660
1439
|
)
|
|
1661
1440
|
elif "Float" in str(target_dtype):
|
|
1662
|
-
self.features_df = (
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
.
|
|
1666
|
-
|
|
1667
|
-
if isinstance(x, bytes)
|
|
1668
|
-
else str(x),
|
|
1669
|
-
return_dtype=pl.Utf8,
|
|
1670
|
-
)
|
|
1671
|
-
.str.to_decimal()
|
|
1672
|
-
.cast(target_dtype),
|
|
1441
|
+
self.features_df = self.features_df.with_columns(
|
|
1442
|
+
pl.col(col)
|
|
1443
|
+
.map_elements(
|
|
1444
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1445
|
+
return_dtype=pl.Utf8,
|
|
1673
1446
|
)
|
|
1447
|
+
.str.to_decimal()
|
|
1448
|
+
.cast(target_dtype),
|
|
1674
1449
|
)
|
|
1675
1450
|
else:
|
|
1676
1451
|
# Try direct casting
|
|
1677
|
-
self.features_df = (
|
|
1678
|
-
|
|
1679
|
-
pl.col(col).cast(target_dtype),
|
|
1680
|
-
)
|
|
1452
|
+
self.features_df = self.features_df.with_columns(
|
|
1453
|
+
pl.col(col).cast(target_dtype),
|
|
1681
1454
|
)
|
|
1682
1455
|
else:
|
|
1683
1456
|
# Try direct casting for non-binary types
|
|
1684
|
-
self.features_df = (
|
|
1685
|
-
|
|
1686
|
-
pl.col(col).cast(target_dtype),
|
|
1687
|
-
)
|
|
1457
|
+
self.features_df = self.features_df.with_columns(
|
|
1458
|
+
pl.col(col).cast(target_dtype),
|
|
1688
1459
|
)
|
|
1689
1460
|
except Exception as e:
|
|
1690
1461
|
self.logger.warning(
|
|
@@ -1699,31 +1470,23 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1699
1470
|
# This ensures "None" strings introduced by failed conversions are properly handled
|
|
1700
1471
|
for col in self.features_df.columns:
|
|
1701
1472
|
if self.features_df[col].dtype == pl.Utf8: # String columns
|
|
1702
|
-
self.features_df = self.features_df.with_columns(
|
|
1703
|
-
[
|
|
1704
|
-
|
|
1705
|
-
|
|
1706
|
-
|
|
1707
|
-
|
|
1708
|
-
],
|
|
1709
|
-
)
|
|
1473
|
+
self.features_df = self.features_df.with_columns([
|
|
1474
|
+
pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
|
|
1475
|
+
.then(None)
|
|
1476
|
+
.otherwise(pl.col(col))
|
|
1477
|
+
.alias(col),
|
|
1478
|
+
])
|
|
1710
1479
|
# Float columns
|
|
1711
1480
|
elif self.features_df[col].dtype in [pl.Float64, pl.Float32]:
|
|
1712
|
-
self.features_df = self.features_df.with_columns(
|
|
1713
|
-
|
|
1714
|
-
|
|
1715
|
-
],
|
|
1716
|
-
)
|
|
1481
|
+
self.features_df = self.features_df.with_columns([
|
|
1482
|
+
pl.col(col).fill_nan(None).alias(col),
|
|
1483
|
+
])
|
|
1717
1484
|
|
|
1718
1485
|
# Ensure column order matches schema order
|
|
1719
1486
|
if "features_df" in schema and "columns" in schema["features_df"]:
|
|
1720
1487
|
schema_column_order = list(schema["features_df"]["columns"].keys())
|
|
1721
1488
|
# Only reorder columns that exist in both schema and DataFrame
|
|
1722
|
-
existing_columns = [
|
|
1723
|
-
col
|
|
1724
|
-
for col in schema_column_order
|
|
1725
|
-
if col in self.features_df.columns
|
|
1726
|
-
]
|
|
1489
|
+
existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
|
|
1727
1490
|
if existing_columns:
|
|
1728
1491
|
self.features_df = self.features_df.select(existing_columns)
|
|
1729
1492
|
|
|
@@ -1753,9 +1516,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1753
1516
|
# set self.label to basename without extension
|
|
1754
1517
|
if self.label is None or self.label == "":
|
|
1755
1518
|
self.label = os.path.splitext(os.path.basename(filename))[0]
|
|
1756
|
-
self.logger.info(
|
|
1757
|
-
f"Sample loaded successfully from {filename} (optimized for study)",
|
|
1758
|
-
)
|
|
1519
|
+
self.logger.info(f"Sample loaded successfully from {filename} (optimized for study)")
|
|
1759
1520
|
|
|
1760
1521
|
|
|
1761
1522
|
def load_schema(schema_path: str) -> Dict[str, Any]:
|
|
@@ -1803,20 +1564,13 @@ def clean_null_values_polars(df: pl.DataFrame) -> pl.DataFrame:
|
|
|
1803
1564
|
cleaned_df = df
|
|
1804
1565
|
for col in df.columns:
|
|
1805
1566
|
if df[col].dtype == pl.Utf8: # String columns
|
|
1806
|
-
cleaned_df = cleaned_df.with_columns(
|
|
1807
|
-
[
|
|
1808
|
-
|
|
1809
|
-
.then(None)
|
|
1810
|
-
.otherwise(pl.col(col))
|
|
1811
|
-
.alias(col),
|
|
1812
|
-
],
|
|
1813
|
-
)
|
|
1567
|
+
cleaned_df = cleaned_df.with_columns([
|
|
1568
|
+
pl.when(pl.col(col).is_in(["None", "", "null", "NULL"])).then(None).otherwise(pl.col(col)).alias(col),
|
|
1569
|
+
])
|
|
1814
1570
|
elif df[col].dtype in [pl.Float64, pl.Float32]: # Float columns
|
|
1815
|
-
cleaned_df = cleaned_df.with_columns(
|
|
1816
|
-
|
|
1817
|
-
|
|
1818
|
-
],
|
|
1819
|
-
)
|
|
1571
|
+
cleaned_df = cleaned_df.with_columns([
|
|
1572
|
+
pl.col(col).fill_nan(None).alias(col),
|
|
1573
|
+
])
|
|
1820
1574
|
return cleaned_df
|
|
1821
1575
|
|
|
1822
1576
|
|
|
@@ -1852,12 +1606,7 @@ def cast_column_by_dtype(df: pl.DataFrame, col: str, dtype_str: str) -> pl.DataF
|
|
|
1852
1606
|
return df
|
|
1853
1607
|
|
|
1854
1608
|
|
|
1855
|
-
def _cast_to_int(
|
|
1856
|
-
df: pl.DataFrame,
|
|
1857
|
-
col: str,
|
|
1858
|
-
current_dtype: pl.DataType,
|
|
1859
|
-
target_dtype: pl.DataType,
|
|
1860
|
-
) -> pl.DataFrame:
|
|
1609
|
+
def _cast_to_int(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
|
|
1861
1610
|
"""Helper function to cast column to integer type."""
|
|
1862
1611
|
if current_dtype == pl.Utf8:
|
|
1863
1612
|
return df.with_columns(
|
|
@@ -1869,12 +1618,7 @@ def _cast_to_int(
|
|
|
1869
1618
|
return _cast_with_binary_handling(df, col, current_dtype, target_dtype)
|
|
1870
1619
|
|
|
1871
1620
|
|
|
1872
|
-
def _cast_to_float(
|
|
1873
|
-
df: pl.DataFrame,
|
|
1874
|
-
col: str,
|
|
1875
|
-
current_dtype: pl.DataType,
|
|
1876
|
-
target_dtype: pl.DataType,
|
|
1877
|
-
) -> pl.DataFrame:
|
|
1621
|
+
def _cast_to_float(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
|
|
1878
1622
|
"""Helper function to cast column to float type."""
|
|
1879
1623
|
if current_dtype == pl.Utf8:
|
|
1880
1624
|
return df.with_columns(
|
|
@@ -1895,29 +1639,20 @@ def _cast_with_binary_handling(
|
|
|
1895
1639
|
if target_dtype == pl.Utf8:
|
|
1896
1640
|
return df.with_columns(
|
|
1897
1641
|
pl.col(col)
|
|
1898
|
-
.map_elements(
|
|
1899
|
-
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1900
|
-
return_dtype=pl.Utf8,
|
|
1901
|
-
)
|
|
1642
|
+
.map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
|
|
1902
1643
|
.cast(target_dtype),
|
|
1903
1644
|
)
|
|
1904
1645
|
elif "Int" in str(target_dtype):
|
|
1905
1646
|
return df.with_columns(
|
|
1906
1647
|
pl.col(col)
|
|
1907
|
-
.map_elements(
|
|
1908
|
-
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1909
|
-
return_dtype=pl.Utf8,
|
|
1910
|
-
)
|
|
1648
|
+
.map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
|
|
1911
1649
|
.str.to_integer()
|
|
1912
1650
|
.cast(target_dtype),
|
|
1913
1651
|
)
|
|
1914
1652
|
elif "Float" in str(target_dtype):
|
|
1915
1653
|
return df.with_columns(
|
|
1916
1654
|
pl.col(col)
|
|
1917
|
-
.map_elements(
|
|
1918
|
-
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1919
|
-
return_dtype=pl.Utf8,
|
|
1920
|
-
)
|
|
1655
|
+
.map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
|
|
1921
1656
|
.str.to_decimal()
|
|
1922
1657
|
.cast(target_dtype),
|
|
1923
1658
|
)
|
|
@@ -1926,11 +1661,7 @@ def _cast_with_binary_handling(
|
|
|
1926
1661
|
return df.with_columns(pl.col(col).cast(target_dtype))
|
|
1927
1662
|
|
|
1928
1663
|
|
|
1929
|
-
def apply_schema_to_dataframe(
|
|
1930
|
-
df: pl.DataFrame,
|
|
1931
|
-
schema: Dict[str, Any],
|
|
1932
|
-
df_name: str,
|
|
1933
|
-
) -> pl.DataFrame:
|
|
1664
|
+
def apply_schema_to_dataframe(df: pl.DataFrame, schema: Dict[str, Any], df_name: str) -> pl.DataFrame:
|
|
1934
1665
|
"""
|
|
1935
1666
|
Apply schema type casting to a Polars DataFrame.
|
|
1936
1667
|
|
|
@@ -2088,9 +1819,7 @@ def _create_dataframe_with_object_columns(
|
|
|
2088
1819
|
schema_columns = schema.get(df_name, {}).get("columns", {})
|
|
2089
1820
|
|
|
2090
1821
|
object_columns = {
|
|
2091
|
-
k: v
|
|
2092
|
-
for k, v in data.items()
|
|
2093
|
-
if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
|
|
1822
|
+
k: v for k, v in data.items() if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
|
|
2094
1823
|
}
|
|
2095
1824
|
regular_columns = {k: v for k, v in data.items() if k not in object_columns}
|
|
2096
1825
|
|
|
@@ -2145,17 +1874,13 @@ def load_ms1_dataframe_from_h5_group(
|
|
|
2145
1874
|
dtype_str = schema_columns[col]["dtype"]
|
|
2146
1875
|
try:
|
|
2147
1876
|
if "Int" in dtype_str:
|
|
2148
|
-
ms1_df = ms1_df.with_columns(
|
|
2149
|
-
|
|
2150
|
-
|
|
2151
|
-
],
|
|
2152
|
-
)
|
|
1877
|
+
ms1_df = ms1_df.with_columns([
|
|
1878
|
+
pl.col(col).cast(pl.Int64, strict=False),
|
|
1879
|
+
])
|
|
2153
1880
|
elif "Float" in dtype_str:
|
|
2154
|
-
ms1_df = ms1_df.with_columns(
|
|
2155
|
-
|
|
2156
|
-
|
|
2157
|
-
],
|
|
2158
|
-
)
|
|
1881
|
+
ms1_df = ms1_df.with_columns([
|
|
1882
|
+
pl.col(col).cast(pl.Float64, strict=False),
|
|
1883
|
+
])
|
|
2159
1884
|
except Exception as e:
|
|
2160
1885
|
if logger:
|
|
2161
1886
|
logger.warning(
|
|
@@ -2166,9 +1891,7 @@ def load_ms1_dataframe_from_h5_group(
|
|
|
2166
1891
|
return clean_null_values_polars(ms1_df)
|
|
2167
1892
|
|
|
2168
1893
|
|
|
2169
|
-
def load_parameters_from_metadata(
|
|
2170
|
-
metadata_group: h5py.Group,
|
|
2171
|
-
) -> Optional[Dict[str, Any]]:
|
|
1894
|
+
def load_parameters_from_metadata(metadata_group: h5py.Group) -> Optional[Dict[str, Any]]:
|
|
2172
1895
|
"""
|
|
2173
1896
|
Load parameters from HDF5 metadata group.
|
|
2174
1897
|
|
|
@@ -2215,8 +1938,6 @@ def create_h5_metadata_group(
|
|
|
2215
1938
|
metadata_group = f.create_group("metadata")
|
|
2216
1939
|
metadata_group.attrs["format"] = "master-sample5-1"
|
|
2217
1940
|
metadata_group.attrs["file_path"] = str(file_path) if file_path is not None else ""
|
|
2218
|
-
metadata_group.attrs["file_source"] = (
|
|
2219
|
-
str(file_source) if file_source is not None else ""
|
|
2220
|
-
)
|
|
1941
|
+
metadata_group.attrs["file_source"] = str(file_source) if file_source is not None else ""
|
|
2221
1942
|
metadata_group.attrs["file_type"] = str(file_type) if file_type is not None else ""
|
|
2222
1943
|
metadata_group.attrs["label"] = str(label) if label is not None else ""
|