masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

masster/sample/h5.py CHANGED
@@ -1,4 +1,4 @@
- import json
+ import json
  import os

  import h5py
@@ -62,7 +62,7 @@ def _save_sample5(
  return

  # synchronize feature_map if it exists
- #if hasattr(self, "_feature_map") and self._feature_map is not None:
+ # if hasattr(self, "_feature_map") and self._feature_map is not None:
  # self._features_sync()

  # if no extension is given, add .sample5
@@ -94,7 +94,7 @@ def _save_sample5(
  metadata_group.attrs["file_source"] = str(self.file_source)
  else:
  metadata_group.attrs["file_source"] = ""
- if hasattr(self, 'type') and self.type is not None:
+ if hasattr(self, "type") and self.type is not None:
  metadata_group.attrs["file_type"] = str(self.type)
  else:
  metadata_group.attrs["file_type"] = ""
@@ -127,11 +127,7 @@ def _save_sample5(
  numeric_data = np.array(
  [
  float(x)
- if x is not None
- and str(x)
- .replace(".", "")
- .replace("-", "")
- .isdigit()
+ if x is not None and str(x).replace(".", "").replace("-", "").isdigit()
  else np.nan
  for x in data
  ],
@@ -289,21 +285,21 @@ def _save_sample5(

  # Store parameters/history as JSON
  # Always ensure we sync instance attributes to parameters before saving
- if hasattr(self, 'parameters') and self.parameters is not None:
- if hasattr(self, 'polarity') and self.polarity is not None:
+ if hasattr(self, "parameters") and self.parameters is not None:
+ if hasattr(self, "polarity") and self.polarity is not None:
  self.parameters.polarity = self.polarity
- if hasattr(self, 'type') and self.type is not None:
+ if hasattr(self, "type") and self.type is not None:
  self.parameters.type = self.type
-
+
  # Prepare save data
  save_data = {}
-
+
  # Add parameters as a dictionary
- if hasattr(self, 'parameters') and self.parameters is not None:
+ if hasattr(self, "parameters") and self.parameters is not None:
  save_data["sample"] = self.parameters.to_dict()
-
+
  # Add history data (but ensure it's JSON serializable)
- if hasattr(self, 'history') and self.history is not None:
+ if hasattr(self, "history") and self.history is not None:
  # Convert any non-JSON-serializable objects to strings/dicts
  serializable_history = {}
  for key, value in self.history.items():
@@ -318,7 +314,7 @@ def _save_sample5(
  # Convert to string if not serializable
  serializable_history[key] = str(value)
  save_data.update(serializable_history)
-
+
  # Save as JSON
  params_json = json.dumps(save_data, indent=2)
  metadata_group.attrs["parameters"] = params_json
@@ -480,9 +476,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  if self.scans_df[col].dtype == pl.Utf8:
  # String data - convert to integer
  self.scans_df = self.scans_df.with_columns(
- pl.col(col)
- .str.to_integer()
- .cast(eval(dtype_str)),
+ pl.col(col).str.to_integer().cast(eval(dtype_str)),
  )
  elif self.scans_df[col].dtype in [
  pl.Float64,
@@ -502,9 +496,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  if self.scans_df[col].dtype == pl.Utf8:
  # String data - convert to float
  self.scans_df = self.scans_df.with_columns(
- pl.col(col)
- .str.to_decimal()
- .cast(eval(dtype_str)),
+ pl.col(col).str.to_decimal().cast(eval(dtype_str)),
  )
  else:
  # Try direct casting
@@ -528,9 +520,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  self.scans_df = self.scans_df.with_columns(
  pl.col(col)
  .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
  return_dtype=pl.Utf8,
  )
  .cast(target_dtype),
@@ -539,9 +529,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  self.scans_df = self.scans_df.with_columns(
  pl.col(col)
  .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
  return_dtype=pl.Utf8,
  )
  .str.to_integer()
@@ -551,9 +539,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  self.scans_df = self.scans_df.with_columns(
  pl.col(col)
  .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
  return_dtype=pl.Utf8,
  )
  .str.to_decimal()
@@ -582,9 +568,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  if "scans_df" in schema and "columns" in schema["scans_df"]:
  schema_column_order = list(schema["scans_df"]["columns"].keys())
  # Only reorder columns that exist in both schema and DataFrame
- existing_columns = [
- col for col in schema_column_order if col in self.scans_df.columns
- ]
+ existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
  if existing_columns:
  self.scans_df = self.scans_df.select(existing_columns)

@@ -730,9 +714,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  if k in schema.get("features_df", {}).get("columns", {})
  and schema["features_df"]["columns"][k]["dtype"] == "pl.Object"
  }
- regular_columns = {
- k: v for k, v in data.items() if k not in object_columns
- }
+ regular_columns = {k: v for k, v in data.items() if k not in object_columns}

  # Create DataFrame with regular columns first
  if regular_columns:
@@ -769,25 +751,19 @@ def _load_sample5(self, filename: str, map: bool = False):
  # Convert to numeric first, handling different input types
  if self.features_df[col].dtype == pl.Utf8:
  # String data - convert to integer
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .str.to_integer()
- .cast(eval(dtype_str)),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).str.to_integer().cast(eval(dtype_str)),
  )
  elif self.features_df[col].dtype in [
  pl.Float64,
  pl.Float32,
  ]:
  # Float data - cast to integer with null handling for NaN values
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(
- eval(dtype_str),
- strict=False,
- ),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(
+ eval(dtype_str),
+ strict=False,
+ ),
  )
  else:
  # Handle special cases and try direct casting for other types
@@ -798,70 +774,50 @@ def _load_sample5(self, filename: str, map: bool = False):
  if "Binary" in str(current_dtype):
  # Convert binary to string first, then to target type
  if target_dtype == pl.Utf8:
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .cast(target_dtype),
  )
  elif "Int" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_integer()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_integer()
+ .cast(target_dtype),
  )
  elif "Float" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_decimal()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_decimal()
+ .cast(target_dtype),
  )
  else:
  # Try direct casting
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  else:
  # Try direct casting for non-binary types
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  elif "Float" in dtype_str:
  # Convert to float, handling different input types
  if self.features_df[col].dtype == pl.Utf8:
  # String data - convert to float
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .str.to_decimal()
- .cast(eval(dtype_str)),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).str.to_decimal().cast(eval(dtype_str)),
  )
  else:
  # Handle special cases and try direct casting for other types
@@ -872,59 +828,43 @@ def _load_sample5(self, filename: str, map: bool = False):
  if "Binary" in str(current_dtype):
  # Convert binary to string first, then to target type
  if target_dtype == pl.Utf8:
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .cast(target_dtype),
  )
  elif "Int" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_integer()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_integer()
+ .cast(target_dtype),
  )
  elif "Float" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_decimal()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_decimal()
+ .cast(target_dtype),
  )
  else:
  # Try direct casting
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  else:
  # Try direct casting for non-binary types
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  elif "Utf8" in dtype_str:
  # Ensure it's string type
@@ -940,59 +880,43 @@ def _load_sample5(self, filename: str, map: bool = False):
  if "Binary" in str(current_dtype):
  # Convert binary to string first, then to target type
  if target_dtype == pl.Utf8:
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .cast(target_dtype),
  )
  elif "Int" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_integer()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_integer()
+ .cast(target_dtype),
  )
  elif "Float" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_decimal()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_decimal()
+ .cast(target_dtype),
  )
  else:
  # Try direct casting
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  else:
  # Try direct casting for non-binary types
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  except Exception as e:
  self.logger.warning(
@@ -1027,11 +951,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  if "features_df" in schema and "columns" in schema["features_df"]:
  schema_column_order = list(schema["features_df"]["columns"].keys())
  # Only reorder columns that exist in both schema and DataFrame
- existing_columns = [
- col
- for col in schema_column_order
- if col in self.features_df.columns
- ]
+ existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
  if existing_columns:
  self.features_df = self.features_df.select(existing_columns)

@@ -1087,7 +1007,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  # Parameters are now loaded from metadata JSON (see above)
  # Lib and lib_match are no longer saved/loaded

- #if map:
+ # if map:
  # featureXML = filename.replace(".sample5", ".featureXML")
  # if os.path.exists(featureXML):
  # self._load_featureXML(featureXML)
@@ -1102,14 +1022,14 @@ def _load_sample5(self, filename: str, map: bool = False):
  # set self.label to basename without extension
  if self.label is None or self.label == "":
  self.label = os.path.splitext(os.path.basename(filename))[0]
-
+
  # Sync instance attributes from loaded parameters
- if hasattr(self, 'parameters') and self.parameters is not None:
- if hasattr(self.parameters, 'polarity') and self.parameters.polarity is not None:
+ if hasattr(self, "parameters") and self.parameters is not None:
+ if hasattr(self.parameters, "polarity") and self.parameters.polarity is not None:
  self.polarity = self.parameters.polarity
- if hasattr(self.parameters, 'type') and self.parameters.type is not None:
+ if hasattr(self.parameters, "type") and self.parameters.type is not None:
  self.type = self.parameters.type
-
+
  self.logger.info(f"Sample loaded from {filename}")


@@ -1247,9 +1167,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  if self.scans_df[col].dtype == pl.Utf8:
  # String data - convert to integer
  self.scans_df = self.scans_df.with_columns(
- pl.col(col)
- .str.to_integer()
- .cast(eval(dtype_str)),
+ pl.col(col).str.to_integer().cast(eval(dtype_str)),
  )
  elif self.scans_df[col].dtype in [
  pl.Float64,
@@ -1269,9 +1187,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  if self.scans_df[col].dtype == pl.Utf8:
  # String data - convert to float
  self.scans_df = self.scans_df.with_columns(
- pl.col(col)
- .str.to_decimal()
- .cast(eval(dtype_str)),
+ pl.col(col).str.to_decimal().cast(eval(dtype_str)),
  )
  else:
  # Try direct casting
@@ -1295,9 +1211,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  self.scans_df = self.scans_df.with_columns(
  pl.col(col)
  .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
  return_dtype=pl.Utf8,
  )
  .cast(target_dtype),
@@ -1306,9 +1220,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  self.scans_df = self.scans_df.with_columns(
  pl.col(col)
  .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
  return_dtype=pl.Utf8,
  )
  .str.to_integer()
@@ -1318,9 +1230,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  self.scans_df = self.scans_df.with_columns(
  pl.col(col)
  .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
  return_dtype=pl.Utf8,
  )
  .str.to_decimal()
@@ -1349,9 +1259,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  if "scans_df" in schema and "columns" in schema["scans_df"]:
  schema_column_order = list(schema["scans_df"]["columns"].keys())
  # Only reorder columns that exist in both schema and DataFrame
- existing_columns = [
- col for col in schema_column_order if col in self.scans_df.columns
- ]
+ existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
  if existing_columns:
  self.scans_df = self.scans_df.select(existing_columns)

@@ -1556,25 +1464,19 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  # Convert to numeric first, handling different input types
  if self.features_df[col].dtype == pl.Utf8:
  # String data - convert to integer
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .str.to_integer()
- .cast(eval(dtype_str)),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).str.to_integer().cast(eval(dtype_str)),
  )
  elif self.features_df[col].dtype in [
  pl.Float64,
  pl.Float32,
  ]:
  # Float data - cast to integer with null handling for NaN values
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(
- eval(dtype_str),
- strict=False,
- ),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(
+ eval(dtype_str),
+ strict=False,
+ ),
  )
  else:
  # Handle special cases and try direct casting for other types
@@ -1585,70 +1487,50 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  if "Binary" in str(current_dtype):
  # Convert binary to string first, then to target type
  if target_dtype == pl.Utf8:
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .cast(target_dtype),
  )
  elif "Int" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_integer()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_integer()
+ .cast(target_dtype),
  )
  elif "Float" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_decimal()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_decimal()
+ .cast(target_dtype),
  )
  else:
  # Try direct casting
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  else:
  # Try direct casting for non-binary types
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  elif "Float" in dtype_str:
  # Convert to float, handling different input types
  if self.features_df[col].dtype == pl.Utf8:
  # String data - convert to float
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .str.to_decimal()
- .cast(eval(dtype_str)),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).str.to_decimal().cast(eval(dtype_str)),
  )
  else:
  # Handle special cases and try direct casting for other types
@@ -1659,59 +1541,43 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  if "Binary" in str(current_dtype):
  # Convert binary to string first, then to target type
  if target_dtype == pl.Utf8:
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .cast(target_dtype),
  )
  elif "Int" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_integer()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_integer()
+ .cast(target_dtype),
  )
  elif "Float" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_decimal()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_decimal()
+ .cast(target_dtype),
  )
  else:
  # Try direct casting
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  else:
  # Try direct casting for non-binary types
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  elif "Utf8" in dtype_str:
  # Ensure it's string type
@@ -1727,59 +1593,43 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  if "Binary" in str(current_dtype):
  # Convert binary to string first, then to target type
  if target_dtype == pl.Utf8:
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .cast(target_dtype),
  )
  elif "Int" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_integer()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_integer()
+ .cast(target_dtype),
  )
  elif "Float" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_decimal()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_decimal()
+ .cast(target_dtype),
  )
  else:
  # Try direct casting
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  else:
  # Try direct casting for non-binary types
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  except Exception as e:
  self.logger.warning(
@@ -1814,11 +1664,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  if "features_df" in schema and "columns" in schema["features_df"]:
  schema_column_order = list(schema["features_df"]["columns"].keys())
  # Only reorder columns that exist in both schema and DataFrame
- existing_columns = [
- col
- for col in schema_column_order
- if col in self.features_df.columns
- ]
+ existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
  if existing_columns:
  self.features_df = self.features_df.select(existing_columns)

@@ -1848,14 +1694,14 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  # set self.label to basename without extension
  if self.label is None or self.label == "":
  self.label = os.path.splitext(os.path.basename(filename))[0]
-
+
  # Sync instance attributes from loaded parameters
- if hasattr(self, 'parameters') and self.parameters is not None:
- if hasattr(self.parameters, 'polarity') and self.parameters.polarity is not None:
+ if hasattr(self, "parameters") and self.parameters is not None:
+ if hasattr(self.parameters, "polarity") and self.parameters.polarity is not None:
  self.polarity = self.parameters.polarity
- if hasattr(self.parameters, 'type') and self.parameters.type is not None:
+ if hasattr(self.parameters, "type") and self.parameters.type is not None:
  self.type = self.parameters.type
-
+
  self.logger.info(
  f"Sample loaded successfully from {filename} (optimized for study)",
  )
@@ -2191,9 +2037,7 @@ def _create_dataframe_with_object_columns(
  schema_columns = schema.get(df_name, {}).get("columns", {})

  object_columns = {
- k: v
- for k, v in data.items()
- if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
+ k: v for k, v in data.items() if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
  }
  regular_columns = {k: v for k, v in data.items() if k not in object_columns}

@@ -2318,8 +2162,6 @@ def create_h5_metadata_group(
  metadata_group = f.create_group("metadata")
  metadata_group.attrs["format"] = "masster-sample5-1"
  metadata_group.attrs["file_path"] = str(file_path) if file_path is not None else ""
- metadata_group.attrs["file_source"] = (
- str(file_source) if file_source is not None else ""
- )
+ metadata_group.attrs["file_source"] = str(file_source) if file_source is not None else ""
  metadata_group.attrs["file_type"] = str(type) if type is not None else ""
  metadata_group.attrs["label"] = str(label) if label is not None else ""
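
For readers unfamiliar with the Polars idiom that most of these hunks reformat, the following is a minimal, self-contained sketch of the binary-to-string conversion pattern the loader applies when restoring HDF5-backed columns. The column name, sample values, and target dtype below are illustrative assumptions, not code taken from the masster package, and it targets recent Polars versions that provide str.to_decimal.

import polars as pl

# Hypothetical data: HDF5 string attributes often round-trip as a Binary column,
# so values are decoded to UTF-8 before casting to the dtype recorded in a schema.
df = pl.DataFrame({"mz": [b"100.5", b"200.25", None]})
target_dtype = pl.Float64  # assumed schema dtype for this column

df = df.with_columns(
    pl.col("mz")
    .map_elements(
        # Decode bytes to str; fall back to str() for any other value.
        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
        return_dtype=pl.Utf8,
    )
    .str.to_decimal()
    .cast(target_dtype),
)
print(df)  # "mz" is now Float64, with nulls preserved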