masster 0.3.15__tar.gz → 0.3.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. (The link to further details was not preserved in this text export.)

Files changed (78)
  1. {masster-0.3.15 → masster-0.3.16}/PKG-INFO +1 -1
  2. {masster-0.3.15 → masster-0.3.16}/pyproject.toml +1 -1
  3. {masster-0.3.15 → masster-0.3.16}/src/masster/_version.py +1 -1
  4. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/h5.py +577 -0
  5. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/load.py +57 -0
  6. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/sample.py +4 -0
  7. {masster-0.3.15 → masster-0.3.16}/src/masster/spectrum.py +3 -0
  8. {masster-0.3.15 → masster-0.3.16}/src/masster/study/defaults/fill_def.py +3 -3
  9. {masster-0.3.15 → masster-0.3.16}/src/masster/study/export.py +3 -0
  10. {masster-0.3.15 → masster-0.3.16}/src/masster/study/load.py +565 -215
  11. {masster-0.3.15 → masster-0.3.16}/src/masster/study/study.py +8 -0
  12. {masster-0.3.15 → masster-0.3.16}/src/masster/study/study5_schema.json +3 -0
  13. {masster-0.3.15 → masster-0.3.16}/uv.lock +1 -1
  14. {masster-0.3.15 → masster-0.3.16}/.github/workflows/publish.yml +0 -0
  15. {masster-0.3.15 → masster-0.3.16}/.github/workflows/security.yml +0 -0
  16. {masster-0.3.15 → masster-0.3.16}/.github/workflows/test.yml +0 -0
  17. {masster-0.3.15 → masster-0.3.16}/.gitignore +0 -0
  18. {masster-0.3.15 → masster-0.3.16}/.pre-commit-config.yaml +0 -0
  19. {masster-0.3.15 → masster-0.3.16}/LICENSE +0 -0
  20. {masster-0.3.15 → masster-0.3.16}/Makefile +0 -0
  21. {masster-0.3.15 → masster-0.3.16}/README.md +0 -0
  22. {masster-0.3.15 → masster-0.3.16}/TESTING.md +0 -0
  23. {masster-0.3.15 → masster-0.3.16}/demo/example_batch_process.py +0 -0
  24. {masster-0.3.15 → masster-0.3.16}/demo/example_sample_process.py +0 -0
  25. {masster-0.3.15 → masster-0.3.16}/src/masster/__init__.py +0 -0
  26. {masster-0.3.15 → masster-0.3.16}/src/masster/chromatogram.py +0 -0
  27. {masster-0.3.15 → masster-0.3.16}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +0 -0
  28. {masster-0.3.15 → masster-0.3.16}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  29. {masster-0.3.15 → masster-0.3.16}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  30. {masster-0.3.15 → masster-0.3.16}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  31. {masster-0.3.15 → masster-0.3.16}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  32. {masster-0.3.15 → masster-0.3.16}/src/masster/logger.py +0 -0
  33. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/__init__.py +0 -0
  34. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/defaults/__init__.py +0 -0
  35. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/defaults/find_adducts_def.py +0 -0
  36. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/defaults/find_features_def.py +0 -0
  37. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/defaults/find_ms2_def.py +0 -0
  38. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
  39. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/defaults/sample_def.py +0 -0
  40. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/helpers.py +0 -0
  41. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/lib.py +0 -0
  42. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/parameters.py +0 -0
  43. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/plot.py +0 -0
  44. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/processing.py +0 -0
  45. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/quant.py +0 -0
  46. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/sample5_schema.json +0 -0
  47. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/save.py +0 -0
  48. {masster-0.3.15 → masster-0.3.16}/src/masster/sample/sciex.py +0 -0
  49. {masster-0.3.15 → masster-0.3.16}/src/masster/study/__init__.py +0 -0
  50. {masster-0.3.15 → masster-0.3.16}/src/masster/study/defaults/__init__.py +0 -0
  51. {masster-0.3.15 → masster-0.3.16}/src/masster/study/defaults/align_def.py +0 -0
  52. {masster-0.3.15 → masster-0.3.16}/src/masster/study/defaults/export_def.py +0 -0
  53. {masster-0.3.15 → masster-0.3.16}/src/masster/study/defaults/fill_chrom_def.py +0 -0
  54. {masster-0.3.15 → masster-0.3.16}/src/masster/study/defaults/find_consensus_def.py +0 -0
  55. {masster-0.3.15 → masster-0.3.16}/src/masster/study/defaults/find_ms2_def.py +0 -0
  56. {masster-0.3.15 → masster-0.3.16}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
  57. {masster-0.3.15 → masster-0.3.16}/src/masster/study/defaults/integrate_def.py +0 -0
  58. {masster-0.3.15 → masster-0.3.16}/src/masster/study/defaults/merge_def.py +0 -0
  59. {masster-0.3.15 → masster-0.3.16}/src/masster/study/defaults/study_def.py +0 -0
  60. {masster-0.3.15 → masster-0.3.16}/src/masster/study/h5.py +0 -0
  61. {masster-0.3.15 → masster-0.3.16}/src/masster/study/helpers.py +0 -0
  62. {masster-0.3.15 → masster-0.3.16}/src/masster/study/helpers_optimized.py +0 -0
  63. {masster-0.3.15 → masster-0.3.16}/src/masster/study/parameters.py +0 -0
  64. {masster-0.3.15 → masster-0.3.16}/src/masster/study/plot.py +0 -0
  65. {masster-0.3.15 → masster-0.3.16}/src/masster/study/processing.py +0 -0
  66. {masster-0.3.15 → masster-0.3.16}/src/masster/study/save.py +0 -0
  67. {masster-0.3.15 → masster-0.3.16}/tests/conftest.py +0 -0
  68. {masster-0.3.15 → masster-0.3.16}/tests/test_chromatogram.py +0 -0
  69. {masster-0.3.15 → masster-0.3.16}/tests/test_defaults.py +0 -0
  70. {masster-0.3.15 → masster-0.3.16}/tests/test_imports.py +0 -0
  71. {masster-0.3.15 → masster-0.3.16}/tests/test_integration.py +0 -0
  72. {masster-0.3.15 → masster-0.3.16}/tests/test_logger.py +0 -0
  73. {masster-0.3.15 → masster-0.3.16}/tests/test_parameters.py +0 -0
  74. {masster-0.3.15 → masster-0.3.16}/tests/test_sample.py +0 -0
  75. {masster-0.3.15 → masster-0.3.16}/tests/test_spectrum.py +0 -0
  76. {masster-0.3.15 → masster-0.3.16}/tests/test_study.py +0 -0
  77. {masster-0.3.15 → masster-0.3.16}/tests/test_version.py +0 -0
  78. {masster-0.3.15 → masster-0.3.16}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: masster
3
- Version: 0.3.15
3
+ Version: 0.3.16
4
4
  Summary: Mass spectrometry data analysis package
5
5
  Project-URL: homepage, https://github.com/zamboni-lab/masster
6
6
  Project-URL: repository, https://github.com/zamboni-lab/masster
@@ -1,7 +1,7 @@
1
1
 
2
2
  [project]
3
3
  name = "masster"
4
- version = "0.3.15"
4
+ version = "0.3.16"
5
5
  description = "Mass spectrometry data analysis package"
6
6
  authors = [
7
7
  { name = "Zamboni Lab" }
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
 
4
- __version__ = "0.5.7"
4
+ __version__ = "0.3.16"
5
5
 
6
6
 
7
7
  def get_version():
@@ -897,6 +897,583 @@ def _load_sample5(self, filename: str, map: bool = True):
897
897
  self.logger.info(f"Sample loaded successfully from {filename}")
898
898
 
899
899
 
900
+ def _load_sample5_study(self, filename: str, map: bool = True):
901
+ """
902
+ Optimized variant of _load_sample5 for study loading that skips reading ms1_df.
903
+
904
+ This is used when adding samples to studies where ms1_df data is not needed,
905
+ improving loading throughput by skipping the potentially large ms1_df dataset.
906
+
907
+ Args:
908
+ filename (str): Path to the sample5 HDF5 file to load.
909
+ map (bool, optional): Whether to map featureXML file if available. Defaults to True.
910
+
911
+ Returns:
912
+ None (modifies self in place)
913
+
914
+ Notes:
915
+ - Same as _load_sample5 but skips ms1_df loading for better performance
916
+ - Sets ms1_df = None explicitly
917
+ - Suitable for study workflows where MS1 spectral data is not required
918
+ """
919
+ # Load schema for proper DataFrame reconstruction
920
+ schema_path = os.path.join(os.path.dirname(__file__), "sample5_schema.json")
921
+ try:
922
+ with open(schema_path) as f:
923
+ schema = json.load(f)
924
+ except FileNotFoundError:
925
+ self.logger.warning(
926
+ f"Schema file {schema_path} not found. Using default types.",
927
+ )
928
+ schema = {}
929
+
930
+ with h5py.File(filename, "r") as f:
931
+ # Load metadata
932
+ if "metadata" in f:
933
+ metadata_group = f["metadata"]
934
+ self.file_path = decode_metadata_attr(metadata_group.attrs.get("file_path", ""))
935
+
936
+ # Load file_source if it exists, otherwise set it equal to file_path
937
+ if "file_source" in metadata_group.attrs:
938
+ self.file_source = decode_metadata_attr(metadata_group.attrs.get("file_source", ""))
939
+ else:
940
+ self.file_source = self.file_path
941
+
942
+ self.file_type = decode_metadata_attr(metadata_group.attrs.get("file_type", ""))
943
+ self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
944
+
945
+ # Load parameters from JSON in metadata
946
+ loaded_data = load_parameters_from_metadata(metadata_group)
947
+
948
+ # Always create a fresh sample_defaults object
949
+ from masster.sample.defaults.sample_def import sample_defaults
950
+
951
+ self.parameters = sample_defaults()
952
+
953
+ # Initialize history and populate from loaded data
954
+ self.history = {}
955
+ if loaded_data is not None and isinstance(loaded_data, dict):
956
+ # Store the loaded data in history
957
+ self.history = loaded_data
958
+ # If there are sample parameters in the history, use them to update defaults
959
+ if "sample" in loaded_data:
960
+ sample_params = loaded_data["sample"]
961
+ if isinstance(sample_params, dict):
962
+ self.parameters.set_from_dict(sample_params, validate=False)
963
+
964
+ # Load scans_df
965
+ if "scans" in f:
966
+ scans_group = f["scans"]
967
+ data: dict[str, Any] = {}
968
+ missing_columns = []
969
+ for col in schema.get("scans_df", {}).get("columns", []):
970
+ if col not in scans_group:
971
+ self.logger.debug(f"Column '{col}' not found in sample5/scans.")
972
+ data[col] = None
973
+ missing_columns.append(col)
974
+ continue
975
+
976
+ dtype = schema["scans_df"]["columns"][col].get("dtype", "native")
977
+ match dtype:
978
+ case "pl.Object":
979
+ self.logger.debug(f"Unexpected Object column '{col}'")
980
+ data[col] = None
981
+ missing_columns.append(col)
982
+
983
+ case _:
984
+ data[col] = scans_group[col][:]
985
+
986
+ # create polars DataFrame from data
987
+ if data:
988
+ self.scans_df = pl.DataFrame(data)
989
+
990
+ # Convert "None" strings and NaN values to proper null values
991
+ for col in self.scans_df.columns:
992
+ if self.scans_df[col].dtype == pl.Utf8: # String columns
993
+ self.scans_df = self.scans_df.with_columns([
994
+ pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
995
+ .then(None)
996
+ .otherwise(pl.col(col))
997
+ .alias(col),
998
+ ])
999
+ elif self.scans_df[col].dtype in [
1000
+ pl.Float64,
1001
+ pl.Float32,
1002
+ ]: # Float columns
1003
+ self.scans_df = self.scans_df.with_columns([
1004
+ pl.col(col).fill_nan(None).alias(col),
1005
+ ])
1006
+
1007
+ # update all columns with schema types
1008
+ for col in self.scans_df.columns:
1009
+ if col in schema.get("scans_df", {}).get("columns", {}):
1010
+ try:
1011
+ dtype_str = schema["scans_df"]["columns"][col]["dtype"]
1012
+ # Convert dtype string to actual polars dtype
1013
+ if dtype_str.startswith("pl."):
1014
+ # Skip Object columns - they're already properly reconstructed
1015
+ if "Object" in dtype_str:
1016
+ continue
1017
+ # Handle different polars data types
1018
+ if "Int" in dtype_str:
1019
+ # Convert to numeric first, handling different input types
1020
+ if self.scans_df[col].dtype == pl.Utf8:
1021
+ # String data - convert to integer
1022
+ self.scans_df = self.scans_df.with_columns(
1023
+ pl.col(col).str.to_integer().cast(eval(dtype_str)),
1024
+ )
1025
+ elif self.scans_df[col].dtype in [
1026
+ pl.Float64,
1027
+ pl.Float32,
1028
+ ]:
1029
+ # Float data - cast to integer
1030
+ self.scans_df = self.scans_df.with_columns(
1031
+ pl.col(col).cast(eval(dtype_str)),
1032
+ )
1033
+ else:
1034
+ # Try direct casting
1035
+ self.scans_df = self.scans_df.with_columns(
1036
+ pl.col(col).cast(eval(dtype_str)),
1037
+ )
1038
+ elif "Float" in dtype_str:
1039
+ # Convert to float, handling different input types
1040
+ if self.scans_df[col].dtype == pl.Utf8:
1041
+ # String data - convert to float
1042
+ self.scans_df = self.scans_df.with_columns(
1043
+ pl.col(col).str.to_decimal().cast(eval(dtype_str)),
1044
+ )
1045
+ else:
1046
+ # Try direct casting
1047
+ self.scans_df = self.scans_df.with_columns(
1048
+ pl.col(col).cast(eval(dtype_str)),
1049
+ )
1050
+ elif "Utf8" in dtype_str:
1051
+ # Ensure it's string type
1052
+ self.scans_df = self.scans_df.with_columns(
1053
+ pl.col(col).cast(pl.Utf8),
1054
+ )
1055
+ else:
1056
+ # Handle special cases and try direct casting for other types
1057
+ current_dtype = self.scans_df[col].dtype
1058
+ target_dtype = eval(dtype_str)
1059
+
1060
+ # Handle binary data that might need string conversion first
1061
+ if "Binary" in str(current_dtype):
1062
+ # Convert binary to string first, then to target type
1063
+ if target_dtype == pl.Utf8:
1064
+ self.scans_df = self.scans_df.with_columns(
1065
+ pl.col(col)
1066
+ .map_elements(
1067
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
1068
+ return_dtype=pl.Utf8,
1069
+ )
1070
+ .cast(target_dtype),
1071
+ )
1072
+ elif "Int" in str(target_dtype):
1073
+ self.scans_df = self.scans_df.with_columns(
1074
+ pl.col(col)
1075
+ .map_elements(
1076
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
1077
+ return_dtype=pl.Utf8,
1078
+ )
1079
+ .str.to_integer()
1080
+ .cast(target_dtype),
1081
+ )
1082
+ elif "Float" in str(target_dtype):
1083
+ self.scans_df = self.scans_df.with_columns(
1084
+ pl.col(col)
1085
+ .map_elements(
1086
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
1087
+ return_dtype=pl.Utf8,
1088
+ )
1089
+ .str.to_decimal()
1090
+ .cast(target_dtype),
1091
+ )
1092
+ else:
1093
+ # Try direct casting
1094
+ self.scans_df = self.scans_df.with_columns(
1095
+ pl.col(col).cast(target_dtype),
1096
+ )
1097
+ else:
1098
+ # Try direct casting for non-binary types
1099
+ self.scans_df = self.scans_df.with_columns(
1100
+ pl.col(col).cast(target_dtype),
1101
+ )
1102
+ except Exception as e:
1103
+ self.logger.warning(
1104
+ f"Failed to cast column '{col}' in scans_df: {e}",
1105
+ )
1106
+ else:
1107
+ self.logger.warning(
1108
+ f"Column '{col}' in scans_df not found in schema, keeping original type.",
1109
+ )
1110
+
1111
+ # Ensure column order matches schema order
1112
+ if "scans_df" in schema and "columns" in schema["scans_df"]:
1113
+ schema_column_order = list(schema["scans_df"]["columns"].keys())
1114
+ # Only reorder columns that exist in both schema and DataFrame
1115
+ existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
1116
+ if existing_columns:
1117
+ self.scans_df = self.scans_df.select(existing_columns)
1118
+
1119
+ else:
1120
+ self.scans_df = None
1121
+ else:
1122
+ self.scans_df = None
1123
+
1124
+ # Load features_df
1125
+ if "features" in f:
1126
+ features_group = f["features"]
1127
+ # columns = list(features_group.attrs.get('columns', []))
1128
+ data = {}
1129
+ missing_columns = []
1130
+ for col in schema.get("features_df", {}).get("columns", []):
1131
+ if col not in features_group:
1132
+ self.logger.debug(
1133
+ f"Column '{col}' not found in sample5/features.",
1134
+ )
1135
+ data[col] = None
1136
+ missing_columns.append(col)
1137
+ continue
1138
+
1139
+ dtype = schema["features_df"]["columns"][col].get("dtype", "native")
1140
+ match dtype:
1141
+ case "pl.Object":
1142
+ match col:
1143
+ case "chrom":
1144
+ data_col = features_group[col][:]
1145
+ # Convert JSON strings back to Chromatogram objects
1146
+ reconstructed_data: list[Any] = []
1147
+ for item in data_col:
1148
+ if isinstance(item, bytes):
1149
+ item = item.decode("utf-8")
1150
+
1151
+ if item == "None" or item == "":
1152
+ reconstructed_data.append(None)
1153
+ else:
1154
+ try:
1155
+ reconstructed_data.append(
1156
+ Chromatogram.from_json(item),
1157
+ )
1158
+ except (json.JSONDecodeError, ValueError):
1159
+ reconstructed_data.append(None)
1160
+
1161
+ data[col] = reconstructed_data
1162
+ case "ms2_scans":
1163
+ data_col = features_group[col][:]
1164
+ # Convert JSON strings back to list objects
1165
+ reconstructed_data = []
1166
+ for item in data_col:
1167
+ if isinstance(item, bytes):
1168
+ item = item.decode("utf-8")
1169
+
1170
+ if item == "None" or item == "":
1171
+ reconstructed_data.append(None)
1172
+ else:
1173
+ try:
1174
+ reconstructed_data.append(json.loads(item))
1175
+ except json.JSONDecodeError:
1176
+ reconstructed_data.append(None)
1177
+
1178
+ data[col] = reconstructed_data
1179
+ case "ms2_specs":
1180
+ data_col = features_group[col][:]
1181
+ # Convert JSON strings back to list of Spectrum objects
1182
+ reconstructed_data = []
1183
+ for item in data_col:
1184
+ if isinstance(item, bytes):
1185
+ item = item.decode("utf-8")
1186
+
1187
+ if item == "None" or item == "":
1188
+ reconstructed_data.append(None)
1189
+ else:
1190
+ try:
1191
+ spectrum_list = []
1192
+ for spec_data in json.loads(item):
1193
+ if spec_data is not None:
1194
+ spectrum = Spectrum.from_json(spec_data)
1195
+ spectrum_list.append(spectrum)
1196
+ else:
1197
+ spectrum_list.append(None)
1198
+ reconstructed_data.append(spectrum_list)
1199
+ except (json.JSONDecodeError, ValueError, TypeError):
1200
+ reconstructed_data.append(None)
1201
+
1202
+ data[col] = reconstructed_data
1203
+ case _:
1204
+ # Handle other Object columns as raw data
1205
+ data[col] = features_group[col][:]
1206
+
1207
+ case _:
1208
+ data[col] = features_group[col][:]
1209
+
1210
+ # create polars DataFrame from data
1211
+ if data:
1212
+ self.features_df = pl.DataFrame(data, strict=False)
1213
+
1214
+ # Convert "None" strings and NaN values to proper null values for regular columns first
1215
+ for col in self.features_df.columns:
1216
+ # Skip Object columns - they're already properly reconstructed
1217
+ if col in schema.get("features_df", {}).get("columns", {}):
1218
+ if "Object" in schema["features_df"]["columns"][col].get("dtype", ""):
1219
+ continue
1220
+
1221
+ if self.features_df[col].dtype == pl.Utf8: # String columns
1222
+ self.features_df = self.features_df.with_columns([
1223
+ pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
1224
+ .then(None)
1225
+ .otherwise(pl.col(col))
1226
+ .alias(col),
1227
+ ])
1228
+ elif self.features_df[col].dtype in [
1229
+ pl.Float64,
1230
+ pl.Float32,
1231
+ ]: # Float columns
1232
+ self.features_df = self.features_df.with_columns([
1233
+ pl.col(col).fill_nan(None).alias(col),
1234
+ ])
1235
+
1236
+ # update all columns with schema types
1237
+ for col in self.features_df.columns:
1238
+ if col in schema.get("features_df", {}).get("columns", {}):
1239
+ try:
1240
+ dtype_str = schema["features_df"]["columns"][col]["dtype"]
1241
+ # Convert dtype string to actual polars dtype
1242
+ if dtype_str.startswith("pl."):
1243
+ # Skip Object columns - they're already properly reconstructed
1244
+ if "Object" in dtype_str:
1245
+ continue
1246
+ # Handle different polars data types
1247
+ if "Int" in dtype_str:
1248
+ # Convert to numeric first, handling different input types
1249
+ if self.features_df[col].dtype == pl.Utf8:
1250
+ # String data - convert to integer
1251
+ self.features_df = self.features_df.with_columns(
1252
+ pl.col(col).str.to_integer().cast(eval(dtype_str)),
1253
+ )
1254
+ elif self.features_df[col].dtype in [
1255
+ pl.Float64,
1256
+ pl.Float32,
1257
+ ]:
1258
+ # Float data - cast to integer with null handling for NaN values
1259
+ self.features_df = self.features_df.with_columns(
1260
+ pl.col(col).cast(eval(dtype_str), strict=False),
1261
+ )
1262
+ else:
1263
+ # Handle special cases and try direct casting for other types
1264
+ current_dtype = self.features_df[col].dtype
1265
+ target_dtype = eval(dtype_str)
1266
+
1267
+ # Handle binary data that might need string conversion first
1268
+ if "Binary" in str(current_dtype):
1269
+ # Convert binary to string first, then to target type
1270
+ if target_dtype == pl.Utf8:
1271
+ self.features_df = self.features_df.with_columns(
1272
+ pl.col(col)
1273
+ .map_elements(
1274
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
1275
+ return_dtype=pl.Utf8,
1276
+ )
1277
+ .cast(target_dtype),
1278
+ )
1279
+ elif "Int" in str(target_dtype):
1280
+ self.features_df = self.features_df.with_columns(
1281
+ pl.col(col)
1282
+ .map_elements(
1283
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
1284
+ return_dtype=pl.Utf8,
1285
+ )
1286
+ .str.to_integer()
1287
+ .cast(target_dtype),
1288
+ )
1289
+ elif "Float" in str(target_dtype):
1290
+ self.features_df = self.features_df.with_columns(
1291
+ pl.col(col)
1292
+ .map_elements(
1293
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
1294
+ return_dtype=pl.Utf8,
1295
+ )
1296
+ .str.to_decimal()
1297
+ .cast(target_dtype),
1298
+ )
1299
+ else:
1300
+ # Try direct casting
1301
+ self.features_df = self.features_df.with_columns(
1302
+ pl.col(col).cast(target_dtype),
1303
+ )
1304
+ else:
1305
+ # Try direct casting for non-binary types
1306
+ self.features_df = self.features_df.with_columns(
1307
+ pl.col(col).cast(target_dtype),
1308
+ )
1309
+ elif "Float" in dtype_str:
1310
+ # Convert to float, handling different input types
1311
+ if self.features_df[col].dtype == pl.Utf8:
1312
+ # String data - convert to float
1313
+ self.features_df = self.features_df.with_columns(
1314
+ pl.col(col).str.to_decimal().cast(eval(dtype_str)),
1315
+ )
1316
+ else:
1317
+ # Handle special cases and try direct casting for other types
1318
+ current_dtype = self.features_df[col].dtype
1319
+ target_dtype = eval(dtype_str)
1320
+
1321
+ # Handle binary data that might need string conversion first
1322
+ if "Binary" in str(current_dtype):
1323
+ # Convert binary to string first, then to target type
1324
+ if target_dtype == pl.Utf8:
1325
+ self.features_df = self.features_df.with_columns(
1326
+ pl.col(col)
1327
+ .map_elements(
1328
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
1329
+ return_dtype=pl.Utf8,
1330
+ )
1331
+ .cast(target_dtype),
1332
+ )
1333
+ elif "Int" in str(target_dtype):
1334
+ self.features_df = self.features_df.with_columns(
1335
+ pl.col(col)
1336
+ .map_elements(
1337
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
1338
+ return_dtype=pl.Utf8,
1339
+ )
1340
+ .str.to_integer()
1341
+ .cast(target_dtype),
1342
+ )
1343
+ elif "Float" in str(target_dtype):
1344
+ self.features_df = self.features_df.with_columns(
1345
+ pl.col(col)
1346
+ .map_elements(
1347
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
1348
+ return_dtype=pl.Utf8,
1349
+ )
1350
+ .str.to_decimal()
1351
+ .cast(target_dtype),
1352
+ )
1353
+ else:
1354
+ # Try direct casting
1355
+ self.features_df = self.features_df.with_columns(
1356
+ pl.col(col).cast(target_dtype),
1357
+ )
1358
+ else:
1359
+ # Try direct casting for non-binary types
1360
+ self.features_df = self.features_df.with_columns(
1361
+ pl.col(col).cast(target_dtype),
1362
+ )
1363
+ elif "Utf8" in dtype_str:
1364
+ # Ensure it's string type
1365
+ self.features_df = self.features_df.with_columns(
1366
+ pl.col(col).cast(pl.Utf8),
1367
+ )
1368
+ else:
1369
+ # Handle special cases and try direct casting for other types
1370
+ current_dtype = self.features_df[col].dtype
1371
+ target_dtype = eval(dtype_str)
1372
+
1373
+ # Handle binary data that might need string conversion first
1374
+ if "Binary" in str(current_dtype):
1375
+ # Convert binary to string first, then to target type
1376
+ if target_dtype == pl.Utf8:
1377
+ self.features_df = self.features_df.with_columns(
1378
+ pl.col(col)
1379
+ .map_elements(
1380
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
1381
+ return_dtype=pl.Utf8,
1382
+ )
1383
+ .cast(target_dtype),
1384
+ )
1385
+ elif "Int" in str(target_dtype):
1386
+ self.features_df = self.features_df.with_columns(
1387
+ pl.col(col)
1388
+ .map_elements(
1389
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
1390
+ return_dtype=pl.Utf8,
1391
+ )
1392
+ .str.to_integer()
1393
+ .cast(target_dtype),
1394
+ )
1395
+ elif "Float" in str(target_dtype):
1396
+ self.features_df = self.features_df.with_columns(
1397
+ pl.col(col)
1398
+ .map_elements(
1399
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
1400
+ return_dtype=pl.Utf8,
1401
+ )
1402
+ .str.to_decimal()
1403
+ .cast(target_dtype),
1404
+ )
1405
+ else:
1406
+ # Try direct casting
1407
+ self.features_df = self.features_df.with_columns(
1408
+ pl.col(col).cast(target_dtype),
1409
+ )
1410
+ else:
1411
+ # Try direct casting for non-binary types
1412
+ self.features_df = self.features_df.with_columns(
1413
+ pl.col(col).cast(target_dtype),
1414
+ )
1415
+ except Exception as e:
1416
+ self.logger.warning(
1417
+ f"Failed to cast column '{col}' in features_df: {e}",
1418
+ )
1419
+ else:
1420
+ self.logger.warning(
1421
+ f"Column '{col}' in features_df not found in schema, keeping original type.",
1422
+ )
1423
+
1424
+ # FINAL null conversion pass - after all type casting is done
1425
+ # This ensures "None" strings introduced by failed conversions are properly handled
1426
+ for col in self.features_df.columns:
1427
+ if self.features_df[col].dtype == pl.Utf8: # String columns
1428
+ self.features_df = self.features_df.with_columns([
1429
+ pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
1430
+ .then(None)
1431
+ .otherwise(pl.col(col))
1432
+ .alias(col),
1433
+ ])
1434
+ # Float columns
1435
+ elif self.features_df[col].dtype in [pl.Float64, pl.Float32]:
1436
+ self.features_df = self.features_df.with_columns([
1437
+ pl.col(col).fill_nan(None).alias(col),
1438
+ ])
1439
+
1440
+ # Ensure column order matches schema order
1441
+ if "features_df" in schema and "columns" in schema["features_df"]:
1442
+ schema_column_order = list(schema["features_df"]["columns"].keys())
1443
+ # Only reorder columns that exist in both schema and DataFrame
1444
+ existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
1445
+ if existing_columns:
1446
+ self.features_df = self.features_df.select(existing_columns)
1447
+
1448
+ else:
1449
+ self.features_df = None
1450
+ else:
1451
+ self.features_df = None
1452
+
1453
+ # OPTIMIZED: Skip loading ms1_df for study use - set to None for performance
1454
+ self.ms1_df = None
1455
+
1456
+ # Parameters are now loaded from metadata JSON (see above)
1457
+ # Lib and lib_match are no longer saved/loaded
1458
+
1459
+ if map:
1460
+ featureXML = filename.replace(".sample5", ".featureXML")
1461
+ if os.path.exists(featureXML):
1462
+ self._load_featureXML(featureXML)
1463
+ self._features_sync()
1464
+ else:
1465
+ self.logger.warning(
1466
+ f"Feature XML file {featureXML} not found, skipping loading.",
1467
+ )
1468
+
1469
+ # set self.file_path to *.sample5
1470
+ self.file_path = filename
1471
+ # set self.label to basename without extension
1472
+ if self.label is None or self.label == "":
1473
+ self.label = os.path.splitext(os.path.basename(filename))[0]
1474
+ self.logger.info(f"Sample loaded successfully from {filename} (optimized for study)")
1475
+
1476
+
900
1477
  def load_schema(schema_path: str) -> Dict[str, Any]:
901
1478
  """
902
1479
  Load schema from JSON file with error handling.