masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/logger.py +35 -19
- masster/sample/adducts.py +15 -29
- masster/sample/defaults/find_adducts_def.py +1 -3
- masster/sample/defaults/sample_def.py +4 -4
- masster/sample/h5.py +203 -361
- masster/sample/helpers.py +14 -30
- masster/sample/lib.py +3 -3
- masster/sample/load.py +21 -29
- masster/sample/plot.py +222 -132
- masster/sample/processing.py +42 -55
- masster/sample/sample.py +37 -46
- masster/sample/save.py +37 -61
- masster/sample/sciex.py +13 -11
- masster/sample/thermo.py +69 -74
- masster/spectrum.py +15 -15
- masster/study/analysis.py +650 -586
- masster/study/defaults/identify_def.py +1 -3
- masster/study/defaults/merge_def.py +6 -7
- masster/study/defaults/study_def.py +1 -5
- masster/study/export.py +35 -96
- masster/study/h5.py +134 -211
- masster/study/helpers.py +385 -459
- masster/study/id.py +239 -290
- masster/study/importers.py +84 -93
- masster/study/load.py +159 -178
- masster/study/merge.py +1112 -1098
- masster/study/plot.py +195 -149
- masster/study/processing.py +144 -191
- masster/study/save.py +14 -13
- masster/study/study.py +89 -130
- masster/wizard/wizard.py +764 -714
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/METADATA +27 -1
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/RECORD +37 -37
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/WHEEL +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/entry_points.txt +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/licenses/LICENSE +0 -0
masster/sample/h5.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
import json
|
|
2
2
|
import os
|
|
3
3
|
|
|
4
4
|
import h5py
|
|
@@ -62,7 +62,7 @@ def _save_sample5(
|
|
|
62
62
|
return
|
|
63
63
|
|
|
64
64
|
# synchronize feature_map if it exists
|
|
65
|
-
#if hasattr(self, "_feature_map") and self._feature_map is not None:
|
|
65
|
+
# if hasattr(self, "_feature_map") and self._feature_map is not None:
|
|
66
66
|
# self._features_sync()
|
|
67
67
|
|
|
68
68
|
# if no extension is given, add .sample5
|
|
@@ -94,7 +94,7 @@ def _save_sample5(
|
|
|
94
94
|
metadata_group.attrs["file_source"] = str(self.file_source)
|
|
95
95
|
else:
|
|
96
96
|
metadata_group.attrs["file_source"] = ""
|
|
97
|
-
if hasattr(self,
|
|
97
|
+
if hasattr(self, "type") and self.type is not None:
|
|
98
98
|
metadata_group.attrs["file_type"] = str(self.type)
|
|
99
99
|
else:
|
|
100
100
|
metadata_group.attrs["file_type"] = ""
|
|
@@ -127,11 +127,7 @@ def _save_sample5(
|
|
|
127
127
|
numeric_data = np.array(
|
|
128
128
|
[
|
|
129
129
|
float(x)
|
|
130
|
-
if x is not None
|
|
131
|
-
and str(x)
|
|
132
|
-
.replace(".", "")
|
|
133
|
-
.replace("-", "")
|
|
134
|
-
.isdigit()
|
|
130
|
+
if x is not None and str(x).replace(".", "").replace("-", "").isdigit()
|
|
135
131
|
else np.nan
|
|
136
132
|
for x in data
|
|
137
133
|
],
|
|
@@ -289,21 +285,21 @@ def _save_sample5(
|
|
|
289
285
|
|
|
290
286
|
# Store parameters/history as JSON
|
|
291
287
|
# Always ensure we sync instance attributes to parameters before saving
|
|
292
|
-
if hasattr(self,
|
|
293
|
-
if hasattr(self,
|
|
288
|
+
if hasattr(self, "parameters") and self.parameters is not None:
|
|
289
|
+
if hasattr(self, "polarity") and self.polarity is not None:
|
|
294
290
|
self.parameters.polarity = self.polarity
|
|
295
|
-
if hasattr(self,
|
|
291
|
+
if hasattr(self, "type") and self.type is not None:
|
|
296
292
|
self.parameters.type = self.type
|
|
297
|
-
|
|
293
|
+
|
|
298
294
|
# Prepare save data
|
|
299
295
|
save_data = {}
|
|
300
|
-
|
|
296
|
+
|
|
301
297
|
# Add parameters as a dictionary
|
|
302
|
-
if hasattr(self,
|
|
298
|
+
if hasattr(self, "parameters") and self.parameters is not None:
|
|
303
299
|
save_data["sample"] = self.parameters.to_dict()
|
|
304
|
-
|
|
300
|
+
|
|
305
301
|
# Add history data (but ensure it's JSON serializable)
|
|
306
|
-
if hasattr(self,
|
|
302
|
+
if hasattr(self, "history") and self.history is not None:
|
|
307
303
|
# Convert any non-JSON-serializable objects to strings/dicts
|
|
308
304
|
serializable_history = {}
|
|
309
305
|
for key, value in self.history.items():
|
|
@@ -318,7 +314,7 @@ def _save_sample5(
|
|
|
318
314
|
# Convert to string if not serializable
|
|
319
315
|
serializable_history[key] = str(value)
|
|
320
316
|
save_data.update(serializable_history)
|
|
321
|
-
|
|
317
|
+
|
|
322
318
|
# Save as JSON
|
|
323
319
|
params_json = json.dumps(save_data, indent=2)
|
|
324
320
|
metadata_group.attrs["parameters"] = params_json
|
|
@@ -480,9 +476,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
480
476
|
if self.scans_df[col].dtype == pl.Utf8:
|
|
481
477
|
# String data - convert to integer
|
|
482
478
|
self.scans_df = self.scans_df.with_columns(
|
|
483
|
-
pl.col(col)
|
|
484
|
-
.str.to_integer()
|
|
485
|
-
.cast(eval(dtype_str)),
|
|
479
|
+
pl.col(col).str.to_integer().cast(eval(dtype_str)),
|
|
486
480
|
)
|
|
487
481
|
elif self.scans_df[col].dtype in [
|
|
488
482
|
pl.Float64,
|
|
@@ -502,9 +496,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
502
496
|
if self.scans_df[col].dtype == pl.Utf8:
|
|
503
497
|
# String data - convert to float
|
|
504
498
|
self.scans_df = self.scans_df.with_columns(
|
|
505
|
-
pl.col(col)
|
|
506
|
-
.str.to_decimal()
|
|
507
|
-
.cast(eval(dtype_str)),
|
|
499
|
+
pl.col(col).str.to_decimal().cast(eval(dtype_str)),
|
|
508
500
|
)
|
|
509
501
|
else:
|
|
510
502
|
# Try direct casting
|
|
@@ -528,9 +520,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
528
520
|
self.scans_df = self.scans_df.with_columns(
|
|
529
521
|
pl.col(col)
|
|
530
522
|
.map_elements(
|
|
531
|
-
lambda x: x.decode("utf-8")
|
|
532
|
-
if isinstance(x, bytes)
|
|
533
|
-
else str(x),
|
|
523
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
534
524
|
return_dtype=pl.Utf8,
|
|
535
525
|
)
|
|
536
526
|
.cast(target_dtype),
|
|
@@ -539,9 +529,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
539
529
|
self.scans_df = self.scans_df.with_columns(
|
|
540
530
|
pl.col(col)
|
|
541
531
|
.map_elements(
|
|
542
|
-
lambda x: x.decode("utf-8")
|
|
543
|
-
if isinstance(x, bytes)
|
|
544
|
-
else str(x),
|
|
532
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
545
533
|
return_dtype=pl.Utf8,
|
|
546
534
|
)
|
|
547
535
|
.str.to_integer()
|
|
@@ -551,9 +539,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
551
539
|
self.scans_df = self.scans_df.with_columns(
|
|
552
540
|
pl.col(col)
|
|
553
541
|
.map_elements(
|
|
554
|
-
lambda x: x.decode("utf-8")
|
|
555
|
-
if isinstance(x, bytes)
|
|
556
|
-
else str(x),
|
|
542
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
557
543
|
return_dtype=pl.Utf8,
|
|
558
544
|
)
|
|
559
545
|
.str.to_decimal()
|
|
@@ -582,9 +568,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
582
568
|
if "scans_df" in schema and "columns" in schema["scans_df"]:
|
|
583
569
|
schema_column_order = list(schema["scans_df"]["columns"].keys())
|
|
584
570
|
# Only reorder columns that exist in both schema and DataFrame
|
|
585
|
-
existing_columns = [
|
|
586
|
-
col for col in schema_column_order if col in self.scans_df.columns
|
|
587
|
-
]
|
|
571
|
+
existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
|
|
588
572
|
if existing_columns:
|
|
589
573
|
self.scans_df = self.scans_df.select(existing_columns)
|
|
590
574
|
|
|
@@ -730,9 +714,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
730
714
|
if k in schema.get("features_df", {}).get("columns", {})
|
|
731
715
|
and schema["features_df"]["columns"][k]["dtype"] == "pl.Object"
|
|
732
716
|
}
|
|
733
|
-
regular_columns = {
|
|
734
|
-
k: v for k, v in data.items() if k not in object_columns
|
|
735
|
-
}
|
|
717
|
+
regular_columns = {k: v for k, v in data.items() if k not in object_columns}
|
|
736
718
|
|
|
737
719
|
# Create DataFrame with regular columns first
|
|
738
720
|
if regular_columns:
|
|
@@ -769,25 +751,19 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
769
751
|
# Convert to numeric first, handling different input types
|
|
770
752
|
if self.features_df[col].dtype == pl.Utf8:
|
|
771
753
|
# String data - convert to integer
|
|
772
|
-
self.features_df = (
|
|
773
|
-
|
|
774
|
-
pl.col(col)
|
|
775
|
-
.str.to_integer()
|
|
776
|
-
.cast(eval(dtype_str)),
|
|
777
|
-
)
|
|
754
|
+
self.features_df = self.features_df.with_columns(
|
|
755
|
+
pl.col(col).str.to_integer().cast(eval(dtype_str)),
|
|
778
756
|
)
|
|
779
757
|
elif self.features_df[col].dtype in [
|
|
780
758
|
pl.Float64,
|
|
781
759
|
pl.Float32,
|
|
782
760
|
]:
|
|
783
761
|
# Float data - cast to integer with null handling for NaN values
|
|
784
|
-
self.features_df = (
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
),
|
|
790
|
-
)
|
|
762
|
+
self.features_df = self.features_df.with_columns(
|
|
763
|
+
pl.col(col).cast(
|
|
764
|
+
eval(dtype_str),
|
|
765
|
+
strict=False,
|
|
766
|
+
),
|
|
791
767
|
)
|
|
792
768
|
else:
|
|
793
769
|
# Handle special cases and try direct casting for other types
|
|
@@ -798,70 +774,50 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
798
774
|
if "Binary" in str(current_dtype):
|
|
799
775
|
# Convert binary to string first, then to target type
|
|
800
776
|
if target_dtype == pl.Utf8:
|
|
801
|
-
self.features_df = (
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
.
|
|
805
|
-
|
|
806
|
-
if isinstance(x, bytes)
|
|
807
|
-
else str(x),
|
|
808
|
-
return_dtype=pl.Utf8,
|
|
809
|
-
)
|
|
810
|
-
.cast(target_dtype),
|
|
777
|
+
self.features_df = self.features_df.with_columns(
|
|
778
|
+
pl.col(col)
|
|
779
|
+
.map_elements(
|
|
780
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
781
|
+
return_dtype=pl.Utf8,
|
|
811
782
|
)
|
|
783
|
+
.cast(target_dtype),
|
|
812
784
|
)
|
|
813
785
|
elif "Int" in str(target_dtype):
|
|
814
|
-
self.features_df = (
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
.
|
|
818
|
-
|
|
819
|
-
if isinstance(x, bytes)
|
|
820
|
-
else str(x),
|
|
821
|
-
return_dtype=pl.Utf8,
|
|
822
|
-
)
|
|
823
|
-
.str.to_integer()
|
|
824
|
-
.cast(target_dtype),
|
|
786
|
+
self.features_df = self.features_df.with_columns(
|
|
787
|
+
pl.col(col)
|
|
788
|
+
.map_elements(
|
|
789
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
790
|
+
return_dtype=pl.Utf8,
|
|
825
791
|
)
|
|
792
|
+
.str.to_integer()
|
|
793
|
+
.cast(target_dtype),
|
|
826
794
|
)
|
|
827
795
|
elif "Float" in str(target_dtype):
|
|
828
|
-
self.features_df = (
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
.
|
|
832
|
-
|
|
833
|
-
if isinstance(x, bytes)
|
|
834
|
-
else str(x),
|
|
835
|
-
return_dtype=pl.Utf8,
|
|
836
|
-
)
|
|
837
|
-
.str.to_decimal()
|
|
838
|
-
.cast(target_dtype),
|
|
796
|
+
self.features_df = self.features_df.with_columns(
|
|
797
|
+
pl.col(col)
|
|
798
|
+
.map_elements(
|
|
799
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
800
|
+
return_dtype=pl.Utf8,
|
|
839
801
|
)
|
|
802
|
+
.str.to_decimal()
|
|
803
|
+
.cast(target_dtype),
|
|
840
804
|
)
|
|
841
805
|
else:
|
|
842
806
|
# Try direct casting
|
|
843
|
-
self.features_df = (
|
|
844
|
-
|
|
845
|
-
pl.col(col).cast(target_dtype),
|
|
846
|
-
)
|
|
807
|
+
self.features_df = self.features_df.with_columns(
|
|
808
|
+
pl.col(col).cast(target_dtype),
|
|
847
809
|
)
|
|
848
810
|
else:
|
|
849
811
|
# Try direct casting for non-binary types
|
|
850
|
-
self.features_df = (
|
|
851
|
-
|
|
852
|
-
pl.col(col).cast(target_dtype),
|
|
853
|
-
)
|
|
812
|
+
self.features_df = self.features_df.with_columns(
|
|
813
|
+
pl.col(col).cast(target_dtype),
|
|
854
814
|
)
|
|
855
815
|
elif "Float" in dtype_str:
|
|
856
816
|
# Convert to float, handling different input types
|
|
857
817
|
if self.features_df[col].dtype == pl.Utf8:
|
|
858
818
|
# String data - convert to float
|
|
859
|
-
self.features_df = (
|
|
860
|
-
|
|
861
|
-
pl.col(col)
|
|
862
|
-
.str.to_decimal()
|
|
863
|
-
.cast(eval(dtype_str)),
|
|
864
|
-
)
|
|
819
|
+
self.features_df = self.features_df.with_columns(
|
|
820
|
+
pl.col(col).str.to_decimal().cast(eval(dtype_str)),
|
|
865
821
|
)
|
|
866
822
|
else:
|
|
867
823
|
# Handle special cases and try direct casting for other types
|
|
@@ -872,59 +828,43 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
872
828
|
if "Binary" in str(current_dtype):
|
|
873
829
|
# Convert binary to string first, then to target type
|
|
874
830
|
if target_dtype == pl.Utf8:
|
|
875
|
-
self.features_df = (
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
.
|
|
879
|
-
|
|
880
|
-
if isinstance(x, bytes)
|
|
881
|
-
else str(x),
|
|
882
|
-
return_dtype=pl.Utf8,
|
|
883
|
-
)
|
|
884
|
-
.cast(target_dtype),
|
|
831
|
+
self.features_df = self.features_df.with_columns(
|
|
832
|
+
pl.col(col)
|
|
833
|
+
.map_elements(
|
|
834
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
835
|
+
return_dtype=pl.Utf8,
|
|
885
836
|
)
|
|
837
|
+
.cast(target_dtype),
|
|
886
838
|
)
|
|
887
839
|
elif "Int" in str(target_dtype):
|
|
888
|
-
self.features_df = (
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
.
|
|
892
|
-
|
|
893
|
-
if isinstance(x, bytes)
|
|
894
|
-
else str(x),
|
|
895
|
-
return_dtype=pl.Utf8,
|
|
896
|
-
)
|
|
897
|
-
.str.to_integer()
|
|
898
|
-
.cast(target_dtype),
|
|
840
|
+
self.features_df = self.features_df.with_columns(
|
|
841
|
+
pl.col(col)
|
|
842
|
+
.map_elements(
|
|
843
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
844
|
+
return_dtype=pl.Utf8,
|
|
899
845
|
)
|
|
846
|
+
.str.to_integer()
|
|
847
|
+
.cast(target_dtype),
|
|
900
848
|
)
|
|
901
849
|
elif "Float" in str(target_dtype):
|
|
902
|
-
self.features_df = (
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
.
|
|
906
|
-
|
|
907
|
-
if isinstance(x, bytes)
|
|
908
|
-
else str(x),
|
|
909
|
-
return_dtype=pl.Utf8,
|
|
910
|
-
)
|
|
911
|
-
.str.to_decimal()
|
|
912
|
-
.cast(target_dtype),
|
|
850
|
+
self.features_df = self.features_df.with_columns(
|
|
851
|
+
pl.col(col)
|
|
852
|
+
.map_elements(
|
|
853
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
854
|
+
return_dtype=pl.Utf8,
|
|
913
855
|
)
|
|
856
|
+
.str.to_decimal()
|
|
857
|
+
.cast(target_dtype),
|
|
914
858
|
)
|
|
915
859
|
else:
|
|
916
860
|
# Try direct casting
|
|
917
|
-
self.features_df = (
|
|
918
|
-
|
|
919
|
-
pl.col(col).cast(target_dtype),
|
|
920
|
-
)
|
|
861
|
+
self.features_df = self.features_df.with_columns(
|
|
862
|
+
pl.col(col).cast(target_dtype),
|
|
921
863
|
)
|
|
922
864
|
else:
|
|
923
865
|
# Try direct casting for non-binary types
|
|
924
|
-
self.features_df = (
|
|
925
|
-
|
|
926
|
-
pl.col(col).cast(target_dtype),
|
|
927
|
-
)
|
|
866
|
+
self.features_df = self.features_df.with_columns(
|
|
867
|
+
pl.col(col).cast(target_dtype),
|
|
928
868
|
)
|
|
929
869
|
elif "Utf8" in dtype_str:
|
|
930
870
|
# Ensure it's string type
|
|
@@ -940,59 +880,43 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
940
880
|
if "Binary" in str(current_dtype):
|
|
941
881
|
# Convert binary to string first, then to target type
|
|
942
882
|
if target_dtype == pl.Utf8:
|
|
943
|
-
self.features_df = (
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
.
|
|
947
|
-
|
|
948
|
-
if isinstance(x, bytes)
|
|
949
|
-
else str(x),
|
|
950
|
-
return_dtype=pl.Utf8,
|
|
951
|
-
)
|
|
952
|
-
.cast(target_dtype),
|
|
883
|
+
self.features_df = self.features_df.with_columns(
|
|
884
|
+
pl.col(col)
|
|
885
|
+
.map_elements(
|
|
886
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
887
|
+
return_dtype=pl.Utf8,
|
|
953
888
|
)
|
|
889
|
+
.cast(target_dtype),
|
|
954
890
|
)
|
|
955
891
|
elif "Int" in str(target_dtype):
|
|
956
|
-
self.features_df = (
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
.
|
|
960
|
-
|
|
961
|
-
if isinstance(x, bytes)
|
|
962
|
-
else str(x),
|
|
963
|
-
return_dtype=pl.Utf8,
|
|
964
|
-
)
|
|
965
|
-
.str.to_integer()
|
|
966
|
-
.cast(target_dtype),
|
|
892
|
+
self.features_df = self.features_df.with_columns(
|
|
893
|
+
pl.col(col)
|
|
894
|
+
.map_elements(
|
|
895
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
896
|
+
return_dtype=pl.Utf8,
|
|
967
897
|
)
|
|
898
|
+
.str.to_integer()
|
|
899
|
+
.cast(target_dtype),
|
|
968
900
|
)
|
|
969
901
|
elif "Float" in str(target_dtype):
|
|
970
|
-
self.features_df = (
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
.
|
|
974
|
-
|
|
975
|
-
if isinstance(x, bytes)
|
|
976
|
-
else str(x),
|
|
977
|
-
return_dtype=pl.Utf8,
|
|
978
|
-
)
|
|
979
|
-
.str.to_decimal()
|
|
980
|
-
.cast(target_dtype),
|
|
902
|
+
self.features_df = self.features_df.with_columns(
|
|
903
|
+
pl.col(col)
|
|
904
|
+
.map_elements(
|
|
905
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
906
|
+
return_dtype=pl.Utf8,
|
|
981
907
|
)
|
|
908
|
+
.str.to_decimal()
|
|
909
|
+
.cast(target_dtype),
|
|
982
910
|
)
|
|
983
911
|
else:
|
|
984
912
|
# Try direct casting
|
|
985
|
-
self.features_df = (
|
|
986
|
-
|
|
987
|
-
pl.col(col).cast(target_dtype),
|
|
988
|
-
)
|
|
913
|
+
self.features_df = self.features_df.with_columns(
|
|
914
|
+
pl.col(col).cast(target_dtype),
|
|
989
915
|
)
|
|
990
916
|
else:
|
|
991
917
|
# Try direct casting for non-binary types
|
|
992
|
-
self.features_df = (
|
|
993
|
-
|
|
994
|
-
pl.col(col).cast(target_dtype),
|
|
995
|
-
)
|
|
918
|
+
self.features_df = self.features_df.with_columns(
|
|
919
|
+
pl.col(col).cast(target_dtype),
|
|
996
920
|
)
|
|
997
921
|
except Exception as e:
|
|
998
922
|
self.logger.warning(
|
|
@@ -1027,11 +951,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
1027
951
|
if "features_df" in schema and "columns" in schema["features_df"]:
|
|
1028
952
|
schema_column_order = list(schema["features_df"]["columns"].keys())
|
|
1029
953
|
# Only reorder columns that exist in both schema and DataFrame
|
|
1030
|
-
existing_columns = [
|
|
1031
|
-
col
|
|
1032
|
-
for col in schema_column_order
|
|
1033
|
-
if col in self.features_df.columns
|
|
1034
|
-
]
|
|
954
|
+
existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
|
|
1035
955
|
if existing_columns:
|
|
1036
956
|
self.features_df = self.features_df.select(existing_columns)
|
|
1037
957
|
|
|
@@ -1087,7 +1007,7 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
1087
1007
|
# Parameters are now loaded from metadata JSON (see above)
|
|
1088
1008
|
# Lib and lib_match are no longer saved/loaded
|
|
1089
1009
|
|
|
1090
|
-
#if map:
|
|
1010
|
+
# if map:
|
|
1091
1011
|
# featureXML = filename.replace(".sample5", ".featureXML")
|
|
1092
1012
|
# if os.path.exists(featureXML):
|
|
1093
1013
|
# self._load_featureXML(featureXML)
|
|
@@ -1102,14 +1022,14 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
1102
1022
|
# set self.label to basename without extension
|
|
1103
1023
|
if self.label is None or self.label == "":
|
|
1104
1024
|
self.label = os.path.splitext(os.path.basename(filename))[0]
|
|
1105
|
-
|
|
1025
|
+
|
|
1106
1026
|
# Sync instance attributes from loaded parameters
|
|
1107
|
-
if hasattr(self,
|
|
1108
|
-
if hasattr(self.parameters,
|
|
1027
|
+
if hasattr(self, "parameters") and self.parameters is not None:
|
|
1028
|
+
if hasattr(self.parameters, "polarity") and self.parameters.polarity is not None:
|
|
1109
1029
|
self.polarity = self.parameters.polarity
|
|
1110
|
-
if hasattr(self.parameters,
|
|
1030
|
+
if hasattr(self.parameters, "type") and self.parameters.type is not None:
|
|
1111
1031
|
self.type = self.parameters.type
|
|
1112
|
-
|
|
1032
|
+
|
|
1113
1033
|
self.logger.info(f"Sample loaded from {filename}")
|
|
1114
1034
|
|
|
1115
1035
|
|
|
@@ -1247,9 +1167,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1247
1167
|
if self.scans_df[col].dtype == pl.Utf8:
|
|
1248
1168
|
# String data - convert to integer
|
|
1249
1169
|
self.scans_df = self.scans_df.with_columns(
|
|
1250
|
-
pl.col(col)
|
|
1251
|
-
.str.to_integer()
|
|
1252
|
-
.cast(eval(dtype_str)),
|
|
1170
|
+
pl.col(col).str.to_integer().cast(eval(dtype_str)),
|
|
1253
1171
|
)
|
|
1254
1172
|
elif self.scans_df[col].dtype in [
|
|
1255
1173
|
pl.Float64,
|
|
@@ -1269,9 +1187,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1269
1187
|
if self.scans_df[col].dtype == pl.Utf8:
|
|
1270
1188
|
# String data - convert to float
|
|
1271
1189
|
self.scans_df = self.scans_df.with_columns(
|
|
1272
|
-
pl.col(col)
|
|
1273
|
-
.str.to_decimal()
|
|
1274
|
-
.cast(eval(dtype_str)),
|
|
1190
|
+
pl.col(col).str.to_decimal().cast(eval(dtype_str)),
|
|
1275
1191
|
)
|
|
1276
1192
|
else:
|
|
1277
1193
|
# Try direct casting
|
|
@@ -1295,9 +1211,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1295
1211
|
self.scans_df = self.scans_df.with_columns(
|
|
1296
1212
|
pl.col(col)
|
|
1297
1213
|
.map_elements(
|
|
1298
|
-
lambda x: x.decode("utf-8")
|
|
1299
|
-
if isinstance(x, bytes)
|
|
1300
|
-
else str(x),
|
|
1214
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1301
1215
|
return_dtype=pl.Utf8,
|
|
1302
1216
|
)
|
|
1303
1217
|
.cast(target_dtype),
|
|
@@ -1306,9 +1220,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1306
1220
|
self.scans_df = self.scans_df.with_columns(
|
|
1307
1221
|
pl.col(col)
|
|
1308
1222
|
.map_elements(
|
|
1309
|
-
lambda x: x.decode("utf-8")
|
|
1310
|
-
if isinstance(x, bytes)
|
|
1311
|
-
else str(x),
|
|
1223
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1312
1224
|
return_dtype=pl.Utf8,
|
|
1313
1225
|
)
|
|
1314
1226
|
.str.to_integer()
|
|
@@ -1318,9 +1230,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1318
1230
|
self.scans_df = self.scans_df.with_columns(
|
|
1319
1231
|
pl.col(col)
|
|
1320
1232
|
.map_elements(
|
|
1321
|
-
lambda x: x.decode("utf-8")
|
|
1322
|
-
if isinstance(x, bytes)
|
|
1323
|
-
else str(x),
|
|
1233
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1324
1234
|
return_dtype=pl.Utf8,
|
|
1325
1235
|
)
|
|
1326
1236
|
.str.to_decimal()
|
|
@@ -1349,9 +1259,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1349
1259
|
if "scans_df" in schema and "columns" in schema["scans_df"]:
|
|
1350
1260
|
schema_column_order = list(schema["scans_df"]["columns"].keys())
|
|
1351
1261
|
# Only reorder columns that exist in both schema and DataFrame
|
|
1352
|
-
existing_columns = [
|
|
1353
|
-
col for col in schema_column_order if col in self.scans_df.columns
|
|
1354
|
-
]
|
|
1262
|
+
existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
|
|
1355
1263
|
if existing_columns:
|
|
1356
1264
|
self.scans_df = self.scans_df.select(existing_columns)
|
|
1357
1265
|
|
|
@@ -1556,25 +1464,19 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1556
1464
|
# Convert to numeric first, handling different input types
|
|
1557
1465
|
if self.features_df[col].dtype == pl.Utf8:
|
|
1558
1466
|
# String data - convert to integer
|
|
1559
|
-
self.features_df = (
|
|
1560
|
-
|
|
1561
|
-
pl.col(col)
|
|
1562
|
-
.str.to_integer()
|
|
1563
|
-
.cast(eval(dtype_str)),
|
|
1564
|
-
)
|
|
1467
|
+
self.features_df = self.features_df.with_columns(
|
|
1468
|
+
pl.col(col).str.to_integer().cast(eval(dtype_str)),
|
|
1565
1469
|
)
|
|
1566
1470
|
elif self.features_df[col].dtype in [
|
|
1567
1471
|
pl.Float64,
|
|
1568
1472
|
pl.Float32,
|
|
1569
1473
|
]:
|
|
1570
1474
|
# Float data - cast to integer with null handling for NaN values
|
|
1571
|
-
self.features_df = (
|
|
1572
|
-
|
|
1573
|
-
|
|
1574
|
-
|
|
1575
|
-
|
|
1576
|
-
),
|
|
1577
|
-
)
|
|
1475
|
+
self.features_df = self.features_df.with_columns(
|
|
1476
|
+
pl.col(col).cast(
|
|
1477
|
+
eval(dtype_str),
|
|
1478
|
+
strict=False,
|
|
1479
|
+
),
|
|
1578
1480
|
)
|
|
1579
1481
|
else:
|
|
1580
1482
|
# Handle special cases and try direct casting for other types
|
|
@@ -1585,70 +1487,50 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1585
1487
|
if "Binary" in str(current_dtype):
|
|
1586
1488
|
# Convert binary to string first, then to target type
|
|
1587
1489
|
if target_dtype == pl.Utf8:
|
|
1588
|
-
self.features_df = (
|
|
1589
|
-
|
|
1590
|
-
|
|
1591
|
-
.
|
|
1592
|
-
|
|
1593
|
-
if isinstance(x, bytes)
|
|
1594
|
-
else str(x),
|
|
1595
|
-
return_dtype=pl.Utf8,
|
|
1596
|
-
)
|
|
1597
|
-
.cast(target_dtype),
|
|
1490
|
+
self.features_df = self.features_df.with_columns(
|
|
1491
|
+
pl.col(col)
|
|
1492
|
+
.map_elements(
|
|
1493
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1494
|
+
return_dtype=pl.Utf8,
|
|
1598
1495
|
)
|
|
1496
|
+
.cast(target_dtype),
|
|
1599
1497
|
)
|
|
1600
1498
|
elif "Int" in str(target_dtype):
|
|
1601
|
-
self.features_df = (
|
|
1602
|
-
|
|
1603
|
-
|
|
1604
|
-
.
|
|
1605
|
-
|
|
1606
|
-
if isinstance(x, bytes)
|
|
1607
|
-
else str(x),
|
|
1608
|
-
return_dtype=pl.Utf8,
|
|
1609
|
-
)
|
|
1610
|
-
.str.to_integer()
|
|
1611
|
-
.cast(target_dtype),
|
|
1499
|
+
self.features_df = self.features_df.with_columns(
|
|
1500
|
+
pl.col(col)
|
|
1501
|
+
.map_elements(
|
|
1502
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1503
|
+
return_dtype=pl.Utf8,
|
|
1612
1504
|
)
|
|
1505
|
+
.str.to_integer()
|
|
1506
|
+
.cast(target_dtype),
|
|
1613
1507
|
)
|
|
1614
1508
|
elif "Float" in str(target_dtype):
|
|
1615
|
-
self.features_df = (
|
|
1616
|
-
|
|
1617
|
-
|
|
1618
|
-
.
|
|
1619
|
-
|
|
1620
|
-
if isinstance(x, bytes)
|
|
1621
|
-
else str(x),
|
|
1622
|
-
return_dtype=pl.Utf8,
|
|
1623
|
-
)
|
|
1624
|
-
.str.to_decimal()
|
|
1625
|
-
.cast(target_dtype),
|
|
1509
|
+
self.features_df = self.features_df.with_columns(
|
|
1510
|
+
pl.col(col)
|
|
1511
|
+
.map_elements(
|
|
1512
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1513
|
+
return_dtype=pl.Utf8,
|
|
1626
1514
|
)
|
|
1515
|
+
.str.to_decimal()
|
|
1516
|
+
.cast(target_dtype),
|
|
1627
1517
|
)
|
|
1628
1518
|
else:
|
|
1629
1519
|
# Try direct casting
|
|
1630
|
-
self.features_df = (
|
|
1631
|
-
|
|
1632
|
-
pl.col(col).cast(target_dtype),
|
|
1633
|
-
)
|
|
1520
|
+
self.features_df = self.features_df.with_columns(
|
|
1521
|
+
pl.col(col).cast(target_dtype),
|
|
1634
1522
|
)
|
|
1635
1523
|
else:
|
|
1636
1524
|
# Try direct casting for non-binary types
|
|
1637
|
-
self.features_df = (
|
|
1638
|
-
|
|
1639
|
-
pl.col(col).cast(target_dtype),
|
|
1640
|
-
)
|
|
1525
|
+
self.features_df = self.features_df.with_columns(
|
|
1526
|
+
pl.col(col).cast(target_dtype),
|
|
1641
1527
|
)
|
|
1642
1528
|
elif "Float" in dtype_str:
|
|
1643
1529
|
# Convert to float, handling different input types
|
|
1644
1530
|
if self.features_df[col].dtype == pl.Utf8:
|
|
1645
1531
|
# String data - convert to float
|
|
1646
|
-
self.features_df = (
|
|
1647
|
-
|
|
1648
|
-
pl.col(col)
|
|
1649
|
-
.str.to_decimal()
|
|
1650
|
-
.cast(eval(dtype_str)),
|
|
1651
|
-
)
|
|
1532
|
+
self.features_df = self.features_df.with_columns(
|
|
1533
|
+
pl.col(col).str.to_decimal().cast(eval(dtype_str)),
|
|
1652
1534
|
)
|
|
1653
1535
|
else:
|
|
1654
1536
|
# Handle special cases and try direct casting for other types
|
|
@@ -1659,59 +1541,43 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1659
1541
|
if "Binary" in str(current_dtype):
|
|
1660
1542
|
# Convert binary to string first, then to target type
|
|
1661
1543
|
if target_dtype == pl.Utf8:
|
|
1662
|
-
self.features_df = (
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
.
|
|
1666
|
-
|
|
1667
|
-
if isinstance(x, bytes)
|
|
1668
|
-
else str(x),
|
|
1669
|
-
return_dtype=pl.Utf8,
|
|
1670
|
-
)
|
|
1671
|
-
.cast(target_dtype),
|
|
1544
|
+
self.features_df = self.features_df.with_columns(
|
|
1545
|
+
pl.col(col)
|
|
1546
|
+
.map_elements(
|
|
1547
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1548
|
+
return_dtype=pl.Utf8,
|
|
1672
1549
|
)
|
|
1550
|
+
.cast(target_dtype),
|
|
1673
1551
|
)
|
|
1674
1552
|
elif "Int" in str(target_dtype):
|
|
1675
|
-
self.features_df = (
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
.
|
|
1679
|
-
|
|
1680
|
-
if isinstance(x, bytes)
|
|
1681
|
-
else str(x),
|
|
1682
|
-
return_dtype=pl.Utf8,
|
|
1683
|
-
)
|
|
1684
|
-
.str.to_integer()
|
|
1685
|
-
.cast(target_dtype),
|
|
1553
|
+
self.features_df = self.features_df.with_columns(
|
|
1554
|
+
pl.col(col)
|
|
1555
|
+
.map_elements(
|
|
1556
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1557
|
+
return_dtype=pl.Utf8,
|
|
1686
1558
|
)
|
|
1559
|
+
.str.to_integer()
|
|
1560
|
+
.cast(target_dtype),
|
|
1687
1561
|
)
|
|
1688
1562
|
elif "Float" in str(target_dtype):
|
|
1689
|
-
self.features_df = (
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
.
|
|
1693
|
-
|
|
1694
|
-
if isinstance(x, bytes)
|
|
1695
|
-
else str(x),
|
|
1696
|
-
return_dtype=pl.Utf8,
|
|
1697
|
-
)
|
|
1698
|
-
.str.to_decimal()
|
|
1699
|
-
.cast(target_dtype),
|
|
1563
|
+
self.features_df = self.features_df.with_columns(
|
|
1564
|
+
pl.col(col)
|
|
1565
|
+
.map_elements(
|
|
1566
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1567
|
+
return_dtype=pl.Utf8,
|
|
1700
1568
|
)
|
|
1569
|
+
.str.to_decimal()
|
|
1570
|
+
.cast(target_dtype),
|
|
1701
1571
|
)
|
|
1702
1572
|
else:
|
|
1703
1573
|
# Try direct casting
|
|
1704
|
-
self.features_df = (
|
|
1705
|
-
|
|
1706
|
-
pl.col(col).cast(target_dtype),
|
|
1707
|
-
)
|
|
1574
|
+
self.features_df = self.features_df.with_columns(
|
|
1575
|
+
pl.col(col).cast(target_dtype),
|
|
1708
1576
|
)
|
|
1709
1577
|
else:
|
|
1710
1578
|
# Try direct casting for non-binary types
|
|
1711
|
-
self.features_df = (
|
|
1712
|
-
|
|
1713
|
-
pl.col(col).cast(target_dtype),
|
|
1714
|
-
)
|
|
1579
|
+
self.features_df = self.features_df.with_columns(
|
|
1580
|
+
pl.col(col).cast(target_dtype),
|
|
1715
1581
|
)
|
|
1716
1582
|
elif "Utf8" in dtype_str:
|
|
1717
1583
|
# Ensure it's string type
|
|
@@ -1727,59 +1593,43 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1727
1593
|
if "Binary" in str(current_dtype):
|
|
1728
1594
|
# Convert binary to string first, then to target type
|
|
1729
1595
|
if target_dtype == pl.Utf8:
|
|
1730
|
-
self.features_df = (
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
.
|
|
1734
|
-
|
|
1735
|
-
if isinstance(x, bytes)
|
|
1736
|
-
else str(x),
|
|
1737
|
-
return_dtype=pl.Utf8,
|
|
1738
|
-
)
|
|
1739
|
-
.cast(target_dtype),
|
|
1596
|
+
self.features_df = self.features_df.with_columns(
|
|
1597
|
+
pl.col(col)
|
|
1598
|
+
.map_elements(
|
|
1599
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1600
|
+
return_dtype=pl.Utf8,
|
|
1740
1601
|
)
|
|
1602
|
+
.cast(target_dtype),
|
|
1741
1603
|
)
|
|
1742
1604
|
elif "Int" in str(target_dtype):
|
|
1743
|
-
self.features_df = (
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
.
|
|
1747
|
-
|
|
1748
|
-
if isinstance(x, bytes)
|
|
1749
|
-
else str(x),
|
|
1750
|
-
return_dtype=pl.Utf8,
|
|
1751
|
-
)
|
|
1752
|
-
.str.to_integer()
|
|
1753
|
-
.cast(target_dtype),
|
|
1605
|
+
self.features_df = self.features_df.with_columns(
|
|
1606
|
+
pl.col(col)
|
|
1607
|
+
.map_elements(
|
|
1608
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1609
|
+
return_dtype=pl.Utf8,
|
|
1754
1610
|
)
|
|
1611
|
+
.str.to_integer()
|
|
1612
|
+
.cast(target_dtype),
|
|
1755
1613
|
)
|
|
1756
1614
|
elif "Float" in str(target_dtype):
|
|
1757
|
-
self.features_df = (
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
.
|
|
1761
|
-
|
|
1762
|
-
if isinstance(x, bytes)
|
|
1763
|
-
else str(x),
|
|
1764
|
-
return_dtype=pl.Utf8,
|
|
1765
|
-
)
|
|
1766
|
-
.str.to_decimal()
|
|
1767
|
-
.cast(target_dtype),
|
|
1615
|
+
self.features_df = self.features_df.with_columns(
|
|
1616
|
+
pl.col(col)
|
|
1617
|
+
.map_elements(
|
|
1618
|
+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
|
|
1619
|
+
return_dtype=pl.Utf8,
|
|
1768
1620
|
)
|
|
1621
|
+
.str.to_decimal()
|
|
1622
|
+
.cast(target_dtype),
|
|
1769
1623
|
)
|
|
1770
1624
|
else:
|
|
1771
1625
|
# Try direct casting
|
|
1772
|
-
self.features_df = (
|
|
1773
|
-
|
|
1774
|
-
pl.col(col).cast(target_dtype),
|
|
1775
|
-
)
|
|
1626
|
+
self.features_df = self.features_df.with_columns(
|
|
1627
|
+
pl.col(col).cast(target_dtype),
|
|
1776
1628
|
)
|
|
1777
1629
|
else:
|
|
1778
1630
|
# Try direct casting for non-binary types
|
|
1779
|
-
self.features_df = (
|
|
1780
|
-
|
|
1781
|
-
pl.col(col).cast(target_dtype),
|
|
1782
|
-
)
|
|
1631
|
+
self.features_df = self.features_df.with_columns(
|
|
1632
|
+
pl.col(col).cast(target_dtype),
|
|
1783
1633
|
)
|
|
1784
1634
|
except Exception as e:
|
|
1785
1635
|
self.logger.warning(
|
|
@@ -1814,11 +1664,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1814
1664
|
if "features_df" in schema and "columns" in schema["features_df"]:
|
|
1815
1665
|
schema_column_order = list(schema["features_df"]["columns"].keys())
|
|
1816
1666
|
# Only reorder columns that exist in both schema and DataFrame
|
|
1817
|
-
existing_columns = [
|
|
1818
|
-
col
|
|
1819
|
-
for col in schema_column_order
|
|
1820
|
-
if col in self.features_df.columns
|
|
1821
|
-
]
|
|
1667
|
+
existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
|
|
1822
1668
|
if existing_columns:
|
|
1823
1669
|
self.features_df = self.features_df.select(existing_columns)
|
|
1824
1670
|
|
|
@@ -1848,14 +1694,14 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1848
1694
|
# set self.label to basename without extension
|
|
1849
1695
|
if self.label is None or self.label == "":
|
|
1850
1696
|
self.label = os.path.splitext(os.path.basename(filename))[0]
|
|
1851
|
-
|
|
1697
|
+
|
|
1852
1698
|
# Sync instance attributes from loaded parameters
|
|
1853
|
-
if hasattr(self,
|
|
1854
|
-
if hasattr(self.parameters,
|
|
1699
|
+
if hasattr(self, "parameters") and self.parameters is not None:
|
|
1700
|
+
if hasattr(self.parameters, "polarity") and self.parameters.polarity is not None:
|
|
1855
1701
|
self.polarity = self.parameters.polarity
|
|
1856
|
-
if hasattr(self.parameters,
|
|
1702
|
+
if hasattr(self.parameters, "type") and self.parameters.type is not None:
|
|
1857
1703
|
self.type = self.parameters.type
|
|
1858
|
-
|
|
1704
|
+
|
|
1859
1705
|
self.logger.info(
|
|
1860
1706
|
f"Sample loaded successfully from {filename} (optimized for study)",
|
|
1861
1707
|
)
|
|
@@ -2191,9 +2037,7 @@ def _create_dataframe_with_object_columns(
|
|
|
2191
2037
|
schema_columns = schema.get(df_name, {}).get("columns", {})
|
|
2192
2038
|
|
|
2193
2039
|
object_columns = {
|
|
2194
|
-
k: v
|
|
2195
|
-
for k, v in data.items()
|
|
2196
|
-
if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
|
|
2040
|
+
k: v for k, v in data.items() if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
|
|
2197
2041
|
}
|
|
2198
2042
|
regular_columns = {k: v for k, v in data.items() if k not in object_columns}
|
|
2199
2043
|
|
|
@@ -2318,8 +2162,6 @@ def create_h5_metadata_group(
|
|
|
2318
2162
|
metadata_group = f.create_group("metadata")
|
|
2319
2163
|
metadata_group.attrs["format"] = "masster-sample5-1"
|
|
2320
2164
|
metadata_group.attrs["file_path"] = str(file_path) if file_path is not None else ""
|
|
2321
|
-
metadata_group.attrs["file_source"] = (
|
|
2322
|
-
str(file_source) if file_source is not None else ""
|
|
2323
|
-
)
|
|
2165
|
+
metadata_group.attrs["file_source"] = str(file_source) if file_source is not None else ""
|
|
2324
2166
|
metadata_group.attrs["file_type"] = str(type) if type is not None else ""
|
|
2325
2167
|
metadata_group.attrs["label"] = str(label) if label is not None else ""
|