masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of masster might be problematic.
- masster/_version.py +1 -1
- masster/logger.py +35 -19
- masster/sample/adducts.py +15 -29
- masster/sample/defaults/find_adducts_def.py +1 -3
- masster/sample/defaults/sample_def.py +4 -4
- masster/sample/h5.py +203 -361
- masster/sample/helpers.py +14 -30
- masster/sample/lib.py +3 -3
- masster/sample/load.py +21 -29
- masster/sample/plot.py +222 -132
- masster/sample/processing.py +42 -55
- masster/sample/sample.py +37 -46
- masster/sample/save.py +37 -61
- masster/sample/sciex.py +13 -11
- masster/sample/thermo.py +69 -74
- masster/spectrum.py +15 -15
- masster/study/analysis.py +650 -586
- masster/study/defaults/identify_def.py +1 -3
- masster/study/defaults/merge_def.py +6 -7
- masster/study/defaults/study_def.py +1 -5
- masster/study/export.py +35 -96
- masster/study/h5.py +134 -211
- masster/study/helpers.py +385 -459
- masster/study/id.py +239 -290
- masster/study/importers.py +84 -93
- masster/study/load.py +159 -178
- masster/study/merge.py +1112 -1098
- masster/study/plot.py +195 -149
- masster/study/processing.py +144 -191
- masster/study/save.py +14 -13
- masster/study/study.py +89 -130
- masster/wizard/wizard.py +764 -714
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/METADATA +27 -1
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/RECORD +37 -37
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/WHEEL +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/entry_points.txt +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/licenses/LICENSE +0 -0
masster/study/h5.py
CHANGED
@@ -61,18 +61,18 @@ def _create_empty_dataframe_from_schema(df_name: str, schema: dict) -> pl.DataFr
     if df_name not in schema:
         # Fallback to basic empty DataFrame if schema not found
         return pl.DataFrame()
-
+
     df_schema = schema[df_name]["columns"]
     empty_data = {}
     polars_schema = {}
-
+
     for col_name, col_info in df_schema.items():
         dtype_str = col_info["dtype"]
         # Convert string representation to actual Polars dtype
         if dtype_str == "pl.Int64":
             polars_dtype = pl.Int64
         elif dtype_str == "pl.Int32":
-            polars_dtype = pl.Int32
+            polars_dtype = pl.Int32
         elif dtype_str == "pl.Float64":
             polars_dtype = pl.Float64
         elif dtype_str == "pl.Utf8":
@@ -88,10 +88,10 @@ def _create_empty_dataframe_from_schema(df_name: str, schema: dict) -> pl.DataFr
         else:
             # Fallback to string if unknown type
             polars_dtype = pl.String
-
+
         empty_data[col_name] = []
         polars_schema[col_name] = polars_dtype
-
+
     return pl.DataFrame(empty_data, schema=polars_schema)
 
 
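The elif chain above maps schema dtype strings such as "pl.Int64" onto Polars types one branch at a time. A minimal sketch of the same mapping as a lookup table — a hypothetical helper, not code from the package:

    import polars as pl

    # Assumed: schema values carry a "dtype" string in the "pl.<Type>" convention.
    _DTYPES = {
        "pl.Int64": pl.Int64,
        "pl.Int32": pl.Int32,
        "pl.Float64": pl.Float64,
        "pl.Utf8": pl.Utf8,
    }

    def empty_df_from_schema(columns: dict) -> pl.DataFrame:
        # Unknown dtype strings fall back to string, matching the diff's behavior.
        schema = {name: _DTYPES.get(info["dtype"], pl.String) for name, info in columns.items()}
        return pl.DataFrame({name: [] for name in schema}, schema=schema)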
@@ -313,7 +313,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                     serialized_chunk.append(json.dumps(item.tolist()))
                 except (AttributeError, TypeError):
                     # Fallback for non-numpy data
-                    serialized_chunk.append(json.dumps(list(item) if hasattr(item,
+                    serialized_chunk.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
             else:
                 serialized_chunk.append("None")
         elif col_name == "ms1_spec":
@@ -325,7 +325,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                     serialized_chunk.append(json.dumps(item.tolist()))
                 except (AttributeError, TypeError):
                     # Fallback for non-numpy data
-                    serialized_chunk.append(json.dumps(list(item) if hasattr(item,
+                    serialized_chunk.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
             else:
                 serialized_chunk.append("None")
         else:
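Both hunks above serialize array-like cells to JSON strings before writing them into the HDF5 group, falling back to a plain list for non-numpy iterables. A self-contained sketch of that serialization pattern (hypothetical function name):

    import json
    import numpy as np

    def to_json_cell(item) -> str:
        # None becomes the literal string "None", as in the diff.
        if item is None:
            return "None"
        try:
            return json.dumps(item.tolist())  # numpy arrays expose tolist()
        except (AttributeError, TypeError):
            # Non-numpy fallback: any iterable becomes a list, anything else an empty list.
            return json.dumps(list(item) if hasattr(item, "__iter__") else [])

    print(to_json_cell(np.array([[875.7865, 447675.0], [876.7902, 168819.0]])))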
@@ -392,10 +392,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                 )
                 # Fallback to simple string conversion for this chunk
                 chunk = data_list[chunk_start : chunk_start + chunk_size]
-                results[chunk_start] = [
-                    str(item) if item is not None else "None"
-                    for item in chunk
-                ]
+                results[chunk_start] = [str(item) if item is not None else "None" for item in chunk]
 
     # Reassemble in correct order
     for i in range(0, total_items, chunk_size):
@@ -598,7 +595,7 @@ def _save_dataframe_column_legacy(
                 data_as_json_strings.append(json.dumps(item.tolist()))
             except (AttributeError, TypeError):
                 # Fallback for non-numpy data
-                data_as_json_strings.append(json.dumps(list(item) if hasattr(item,
+                data_as_json_strings.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
         else:
             data_as_json_strings.append("None")
     group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
@@ -612,7 +609,7 @@ def _save_dataframe_column_legacy(
                 data_as_json_strings.append(json.dumps(item.tolist()))
             except (AttributeError, TypeError):
                 # Fallback for non-numpy data
-                data_as_json_strings.append(json.dumps(list(item) if hasattr(item,
+                data_as_json_strings.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
         else:
             data_as_json_strings.append("None")
     group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
@@ -712,9 +709,7 @@ def _reconstruct_object_column(data_col, col_name: str):
                         "adduct": str(adduct_row[0]),
                         "count": int(float(adduct_row[1])),
                         "percentage": float(adduct_row[2]),
-                        "mass": float(adduct_row[3])
-                        if len(adduct_row) > 3
-                        else 0.0,
+                        "mass": float(adduct_row[3]) if len(adduct_row) > 3 else 0.0,
                     },
                 )
             reconstructed_data.append(converted_adducts)
@@ -722,15 +717,39 @@ def _reconstruct_object_column(data_col, col_name: str):
             # Handle isotope patterns (numpy arrays with [mz, intensity] data)
             try:
                 import numpy as np
-
-                #
-
-
+
+                # Try JSON parsing first (new format)
+                try:
+                    iso_data = json.loads(item)
+                    # Convert back to numpy array
+                    reconstructed_data.append(np.array(iso_data) if iso_data else None)
+                except json.JSONDecodeError:
+                    # Handle numpy array string representation (old format)
+                    # This handles strings like "[[ 875.7865 447675. ]\n [ 876.7902 168819. ]]"
+                    try:
+                        # Use numpy's string representation parser
+                        iso_array = np.fromstring(item.replace('[', '').replace(']', '').replace('\n', ' '), sep=' ')
+                        # Reshape to 2D array (pairs of mz, intensity)
+                        if len(iso_array) % 2 == 0:
+                            iso_array = iso_array.reshape(-1, 2)
+                            reconstructed_data.append(iso_array)
+                        else:
+                            reconstructed_data.append(None)
+                    except (ValueError, AttributeError):
+                        # If all else fails, try to evaluate the string as a literal
+                        try:
+                            import ast
+                            iso_data = ast.literal_eval(item)
+                            reconstructed_data.append(np.array(iso_data) if iso_data else None)
+                        except (ValueError, SyntaxError):
+                            reconstructed_data.append(None)
             except (ValueError, ImportError):
                 reconstructed_data.append(None)
         elif col_name == "ms1_spec":
             # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
             try:
                 import numpy as np
+
                 ms1_spec_data = json.loads(item)
                 # Convert back to numpy array
                 reconstructed_data.append(np.array(ms1_spec_data) if ms1_spec_data else None)
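The new fallback path parses the string repr that numpy arrays were stored with in older files. A standalone sketch of that round-trip, assuming the same "[[ mz inty ]\n ...]" shape described in the comment above:

    import numpy as np

    def parse_legacy_iso(text: str):
        # Strip brackets and newlines, then let numpy parse the whitespace-separated floats.
        flat = np.fromstring(text.replace("[", "").replace("]", "").replace("\n", " "), sep=" ")
        # Pairs of (mz, intensity); an odd count means the string was not a valid pattern.
        return flat.reshape(-1, 2) if flat.size and flat.size % 2 == 0 else None

    print(parse_legacy_iso("[[ 875.7865 447675. ]\n [ 876.7902 168819. ]]"))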
@@ -821,25 +840,25 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
     # First check all data for numpy object arrays and move them to object columns
     additional_object_cols = []
     for k, v in data.items():
-        if k not in object_columns and hasattr(v,
+        if k not in object_columns and hasattr(v, "dtype") and str(v.dtype) == "object":
             # This is a numpy object array that should be treated as object
             additional_object_cols.append(k)
             object_columns.append(k)
-
+
     if additional_object_cols:
         # Re-run reconstruction for these columns
         for col in additional_object_cols:
             data[col] = _reconstruct_object_column(data[col], col)
-
+
     object_data = {k: v for k, v in data.items() if k in object_columns}
     regular_data = {k: v for k, v in data.items() if k not in object_columns}
 
     # Final check: ensure no numpy object arrays in regular_data
     problematic_cols = []
     for k, v in regular_data.items():
-        if hasattr(v,
+        if hasattr(v, "dtype") and str(v.dtype) == "object":
             problematic_cols.append(k)
-
+
     if problematic_cols:
         # Move these to object_data
         for col in problematic_cols:
@@ -878,7 +897,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
     # and handle numpy scalars within lists
     safe_regular_data = {}
     import numpy as np
-
+
     def convert_numpy_scalars(value):
         """Convert numpy scalars to Python native types recursively."""
         if isinstance(value, np.generic):
@@ -887,17 +906,19 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
             return [convert_numpy_scalars(item) for item in value]
         else:
             return value
-
+
     for k, v in regular_data.items():
-        if hasattr(v,
+        if hasattr(v, "dtype") and str(v.dtype) == "object":
             # Convert numpy object array to Python list
-            safe_regular_data[k] = [
+            safe_regular_data[k] = [
+                convert_numpy_scalars(item) for item in (v.tolist() if hasattr(v, "tolist") else list(v))
+            ]
         elif isinstance(v, list):
             # Handle lists that might contain numpy scalars
             safe_regular_data[k] = [convert_numpy_scalars(item) for item in v]
         else:
             safe_regular_data[k] = convert_numpy_scalars(v)
-
+
     # Create DataFrame with proper error handling
     try:
         df = pl.DataFrame(safe_regular_data)
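convert_numpy_scalars above walks nested lists and unwraps numpy scalars so Polars receives native Python values. The same idea in isolation (hypothetical name to_native):

    import numpy as np

    def to_native(value):
        # np.generic covers all numpy scalar types (np.int64, np.float64, ...).
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, (list, tuple)):
            return [to_native(v) for v in value]
        return value

    print(to_native([np.int64(3), (np.float64(2.5), "x")]))  # [3, [2.5, 'x']]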
@@ -914,7 +935,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
         except Exception:
             # Last resort: skip the column entirely
             continue
-
+
     # Add Object columns one by one
     for col, values in object_data.items():
         # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
@@ -993,9 +1014,7 @@ def _load_dataframe_from_group(
     )
     schema_section = schema.get(df_name, {}) if isinstance(schema, dict) else {}
     logger.debug(f"Schema section for {df_name}: {schema_section}")
-    schema_columns = (
-        schema_section.get("columns", []) if isinstance(schema_section, dict) else []
-    )
+    schema_columns = schema_section.get("columns", []) if isinstance(schema_section, dict) else []
     logger.debug(f"Schema columns for {df_name}: {schema_columns}")
     if schema_columns is None:
         schema_columns = []
@@ -1158,11 +1177,7 @@ def _load_dataframe_from_group(
     }
     migrated_old_names = set(column_migrations.keys())
 
-    extra_columns = [
-        col
-        for col in hdf5_columns
-        if col not in (schema_columns or []) and col not in migrated_old_names
-    ]
+    extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]
 
     for col in extra_columns:
         logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
@@ -1188,10 +1203,7 @@ def _load_dataframe_from_group(
                     object_columns.append(col)
                 else:
                     # Regular string data
-                    data[col] = [
-                        item.decode("utf-8") if isinstance(item, bytes) else item
-                        for item in column_data
-                    ]
+                    data[col] = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
             except Exception:
                 # If decoding fails, treat as regular data
                 data[col] = column_data
@@ -1204,19 +1216,10 @@ def _load_dataframe_from_group(
     # Handle byte string conversion for non-object columns
     # Only convert to strings for columns that should actually be strings
     for col, values in data.items():
-        if (
-            col not in object_columns
-            and values is not None
-            and len(values) > 0
-            and isinstance(values[0], bytes)
-        ):
+        if col not in object_columns and values is not None and len(values) > 0 and isinstance(values[0], bytes):
             # Check schema to see if this should be a string column
             should_be_string = False
-            if (
-                df_name in schema
-                and "columns" in schema[df_name]
-                and col in schema[df_name]["columns"]
-            ):
+            if df_name in schema and "columns" in schema[df_name] and col in schema[df_name]["columns"]:
                 dtype_str = schema[df_name]["columns"][col]["dtype"]
                 should_be_string = dtype_str == "pl.Utf8"
 
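The flattened conditions decode HDF5 byte strings only when the schema marks the column as pl.Utf8. A minimal sketch of that schema-gated decode (hypothetical helper):

    def decode_if_string(values, dtype_str):
        # Leave non-string columns untouched; bytes in pl.Utf8 columns get decoded.
        if dtype_str != "pl.Utf8":
            return values
        return [v.decode("utf-8") if isinstance(v, bytes) else v for v in values]

    print(decode_if_string([b"sample_1", "sample_2"], "pl.Utf8"))  # ['sample_1', 'sample_2']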
@@ -1237,25 +1240,25 @@ def _load_dataframe_from_group(
             logger.debug(
                 f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}",
             )
-
+
         # Debug: check for problematic data types in all columns before DataFrame creation
         for col, values in data.items():
-            if hasattr(values,
+            if hasattr(values, "dtype") and str(values.dtype) == "object":
                 logger.warning(f"Column '{col}' has numpy object dtype but is not in object_columns: {object_columns}")
                 if col not in object_columns:
                     object_columns.append(col)
-
+
         df = _create_dataframe_with_objects(data, object_columns)
     else:
         # Debug: check for problematic data types when no object columns are expected
        for col, values in data.items():
-            if hasattr(values,
+            if hasattr(values, "dtype") and str(values.dtype) == "object":
                 logger.warning(f"Column '{col}' has numpy object dtype but no object_columns specified!")
                 # Treat as object column
                 if object_columns is None:
                     object_columns = []
                 object_columns.append(col)
-
+
         if object_columns:
             df = _create_dataframe_with_objects(data, object_columns)
         else:
@@ -1302,34 +1305,21 @@ def _save_study5_compressed(self, filename):
         dataframes_to_save.append(("features", len(self.features_df)))
     if self.consensus_df is not None and not self.consensus_df.is_empty():
         dataframes_to_save.append(("consensus", len(self.consensus_df)))
-    if (
-        self.consensus_mapping_df is not None
-        and not self.consensus_mapping_df.is_empty()
-    ):
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         dataframes_to_save.append(
             ("consensus_mapping", len(self.consensus_mapping_df)),
         )
     if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
         dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
-    if (
-        hasattr(self, "lib_df")
-        and self.lib_df is not None
-        and not self.lib_df.is_empty()
-    ):
+    if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
         dataframes_to_save.append(("lib", len(self.lib_df)))
-    if (
-        hasattr(self, "id_df")
-        and self.id_df is not None
-        and not self.id_df.is_empty()
-    ):
+    if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
         dataframes_to_save.append(("id", len(self.id_df)))
 
     total_steps = len(dataframes_to_save) + 1  # +1 for metadata
 
     # Show progress for large saves
-    tdqm_disable = (
-        self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
-    )
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
 
     with tqdm(
         total=total_steps,
@@ -1347,14 +1337,8 @@ def _save_study5_compressed(self, filename):
 
         # Store metadata
         metadata_group.attrs["format"] = "masster-study-1"
-        metadata_group.attrs["folder"] = (
-
-        )
-        metadata_group.attrs["label"] = (
-            str(self.label)
-            if hasattr(self, "label") and self.label is not None
-            else ""
-        )
+        metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
+        metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""
 
         # Store parameters as JSON
         if hasattr(self, "parameters") and self.history is not None:
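The collapsed attrs assignments store study metadata as plain HDF5 attributes. A tiny runnable sketch of the same calls, with a hypothetical filename:

    import h5py

    with h5py.File("demo.study5", "w") as f:  # hypothetical file
        meta = f.create_group("metadata")
        meta.attrs["format"] = "masster-study-1"
        meta.attrs["folder"] = ""  # one-line conditional expressions, as in the hunk above
        meta.attrs["label"] = ""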
@@ -1419,10 +1403,7 @@ def _save_study5_compressed(self, filename):
             pbar.update(1)
 
             # Store consensus_mapping_df - keep existing fast method
-            if (
-                self.consensus_mapping_df is not None
-                and not self.consensus_mapping_df.is_empty()
-            ):
+            if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
                 consensus_mapping = self.consensus_mapping_df.clone()
                 self.logger.debug(
                     f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
@@ -1458,11 +1439,7 @@ def _save_study5_compressed(self, filename):
             pbar.update(1)
 
             # Store lib_df - library data
-            if (
-                hasattr(self, "lib_df")
-                and self.lib_df is not None
-                and not self.lib_df.is_empty()
-            ):
+            if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
                 self.logger.debug(
                     f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
                 )
@@ -1476,11 +1453,7 @@ def _save_study5_compressed(self, filename):
             pbar.update(1)
 
             # Store id_df - identification results
-            if (
-                hasattr(self, "id_df")
-                and self.id_df is not None
-                and not self.id_df.is_empty()
-            ):
+            if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
                 self.logger.debug(
                     f"Saving id_df with {len(self.id_df)} rows using optimized method",
                 )
@@ -1636,34 +1609,21 @@ def _save_study5(self, filename):
         dataframes_to_save.append(("features", len(self.features_df)))
     if self.consensus_df is not None and not self.consensus_df.is_empty():
         dataframes_to_save.append(("consensus", len(self.consensus_df)))
-    if (
-        self.consensus_mapping_df is not None
-        and not self.consensus_mapping_df.is_empty()
-    ):
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         dataframes_to_save.append(
             ("consensus_mapping", len(self.consensus_mapping_df)),
         )
     if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
         dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
-    if (
-        hasattr(self, "lib_df")
-        and self.lib_df is not None
-        and not self.lib_df.is_empty()
-    ):
+    if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
         dataframes_to_save.append(("lib", len(self.lib_df)))
-    if (
-        hasattr(self, "id_df")
-        and self.id_df is not None
-        and not self.id_df.is_empty()
-    ):
+    if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
         dataframes_to_save.append(("id", len(self.id_df)))
 
     total_steps = len(dataframes_to_save) + 1  # +1 for metadata
 
     # Show progress for large saves
-    tdqm_disable = (
-        self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
-    )
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
 
     with tqdm(
         total=total_steps,
@@ -1681,14 +1641,8 @@ def _save_study5(self, filename):
 
         # Store metadata
         metadata_group.attrs["format"] = "masster-study-1"
-        metadata_group.attrs["folder"] = (
-
-        )
-        metadata_group.attrs["label"] = (
-            str(self.label)
-            if hasattr(self, "label") and self.label is not None
-            else ""
-        )
+        metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
+        metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""
 
         # Store parameters as JSON
         if hasattr(self, "parameters") and self.history is not None:
@@ -1756,10 +1710,7 @@ def _save_study5(self, filename):
             pbar.update(1)
 
             # Store consensus_mapping_df - keep existing fast method
-            if (
-                self.consensus_mapping_df is not None
-                and not self.consensus_mapping_df.is_empty()
-            ):
+            if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
                 consensus_mapping = self.consensus_mapping_df.clone()
                 self.logger.debug(
                     f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
@@ -1795,11 +1746,7 @@ def _save_study5(self, filename):
             pbar.update(1)
 
             # Store lib_df - library data
-            if (
-                hasattr(self, "lib_df")
-                and self.lib_df is not None
-                and not self.lib_df.is_empty()
-            ):
+            if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
                 self.logger.debug(
                     f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
                 )
@@ -1813,11 +1760,7 @@ def _save_study5(self, filename):
             pbar.update(1)
 
             # Store id_df - identification results
-            if (
-                hasattr(self, "id_df")
-                and self.id_df is not None
-                and not self.id_df.is_empty()
-            ):
+            if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
                 self.logger.debug(
                     f"Saving id_df with {len(self.id_df)} rows using optimized method",
                 )
@@ -1896,12 +1839,7 @@ def _load_study5(self, filename=None):
 
         with h5py.File(filename, "r") as f:
             # Use progress bar to show loading progress
-            with tqdm(
-                total=len(loading_steps),
-                desc="Loading study",
-                disable=tdqm_disable,
-                unit="step"
-            ) as pbar:
+            with tqdm(total=len(loading_steps), desc="Loading study", disable=tdqm_disable, unit="step") as pbar:
                 # Load metadata
                 pbar.set_description(
                     f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata",
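The tqdm call collapsed into one line drives a step-wise progress bar whose description is rewritten per loading stage. A minimal sketch of that usage:

    from tqdm import tqdm

    steps = ["metadata", "features", "consensus"]
    with tqdm(total=len(steps), desc="Loading study", disable=False, unit="step") as pbar:
        for step in steps:
            pbar.set_description(f"Loading {step}")
            pbar.update(1)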
@@ -1963,24 +1901,14 @@ def _load_study5(self, filename=None):
                 # Synchronize instance attributes with parameters (similar to __init__)
                 # Note: folder and label are already loaded from metadata attributes above
                 # but we ensure they match the parameters for consistency
-                if (
-                    hasattr(self.parameters, "folder")
-                    and self.parameters.folder is not None
-                ):
+                if hasattr(self.parameters, "folder") and self.parameters.folder is not None:
                     self.folder = self.parameters.folder
-                if (
-                    hasattr(self.parameters, "label")
-                    and self.parameters.label is not None
-                ):
+                if hasattr(self.parameters, "label") and self.parameters.label is not None:
                     self.label = self.parameters.label
                 if hasattr(self.parameters, "log_level"):
                     self.log_level = self.parameters.log_level
                 if hasattr(self.parameters, "log_label"):
-                    self.log_label = (
-                        self.parameters.log_label
-                        if self.parameters.log_label is not None
-                        else ""
-                    )
+                    self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
                 if hasattr(self.parameters, "log_sink"):
                     self.log_sink = self.parameters.log_sink
                 pbar.update(1)
@@ -2017,7 +1945,7 @@ def _load_study5(self, filename=None):
                     self.logger,
                     object_columns,
                 )
-
+
                 # Sanity check: replace any missing rt_original with rt values
                 if self.features_df is not None and not self.features_df.is_empty():
                     if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
@@ -2046,10 +1974,15 @@ def _load_study5(self, filename=None):
                     f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus",
                 )
                 if "consensus" in f and len(f["consensus"].keys()) > 0:
-                    # Only include
+                    # Only include object columns if they actually exist in the file
                     object_columns = []
-
-
+                    try:
+                        if "adducts" in f["consensus"]:
+                            object_columns.append("adducts")
+                        if "iso" in f["consensus"]:
+                            object_columns.append("iso")
+                    except (KeyError, TypeError):
+                        pass
 
                     self.consensus_df = _load_dataframe_from_group(
                         f["consensus"],
@@ -2061,16 +1994,11 @@ def _load_study5(self, filename=None):
 
                     # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
                     if self.consensus_df is not None:
-                        if (
-                            "adducts" not in self.consensus_df.columns
-                            or self.consensus_df["adducts"].dtype == pl.Null
-                        ):
+                        if "adducts" not in self.consensus_df.columns or self.consensus_df["adducts"].dtype == pl.Null:
                             self.logger.info(
                                 "Adding missing 'adducts' column for backward compatibility",
                             )
-                            empty_adducts: list[list] = [
-                                [] for _ in range(len(self.consensus_df))
-                            ]
+                            empty_adducts: list[list] = [[] for _ in range(len(self.consensus_df))]
 
                             # If column exists but is Null, drop it first
                             if "adducts" in self.consensus_df.columns:
@@ -2163,11 +2091,7 @@ def _load_study5(self, filename=None):
                     pbar.update(1)
 
                 # Check and migrate old string-based map_id to integer indices
-                if (
-                    self.samples_df is not None
-                    and not self.samples_df.is_empty()
-                    and self.samples_df["map_id"].dtype == pl.Utf8
-                ):
+                if self.samples_df is not None and not self.samples_df.is_empty() and self.samples_df["map_id"].dtype == pl.Utf8:
                     self.logger.info(
                         "Detected old string-based map_id format, migrating to integer indices",
                     )
@@ -2191,26 +2115,26 @@ def _load_study5(self, filename=None):
     _sanitize_nulls(self)
 
     self.logger.debug("Study loaded")
-
+
 
 def _load_ms1(self, filename: str) -> pl.DataFrame:
     """
     Optimized method to load only MS1 data from a sample5 file for isotope detection.
-
+
     This method efficiently loads only the ms1_df from a sample5 HDF5 file without
     loading other potentially large datasets like features_df, scans_df, etc.
-
+
     Args:
         sample_path (str): Path to the sample5 HDF5 file
-
+
     Returns:
-        pl.DataFrame: MS1 data with columns [cycle, scan_uid, rt, mz, inty]
+        pl.DataFrame: MS1 data with columns [cycle, scan_uid, rt, mz, inty]
                      Returns empty DataFrame if no MS1 data found or file cannot be read
-
+
     Note:
         Used by find_iso() for efficient isotope pattern detection without full sample loading
     """
-    #try:
+    # try:
     # add .sample5 extension if not provided
     if not filename.endswith(".sample5"):
         filename += ".sample5"
@@ -2219,45 +2143,46 @@ def _load_ms1(self, filename: str) -> pl.DataFrame:
         if "ms1" not in f:
             self.logger.debug(f"No MS1 data found in {filename}")
             return pl.DataFrame()
-
+
         ms1_group = f["ms1"]
-
+
         # Load MS1 data efficiently
         ms1_data = {}
         for col in ms1_group.keys():
             ms1_data[col] = ms1_group[col][:]
-
+
         if not ms1_data:
             self.logger.debug(f"Empty MS1 data in {filename}")
             return pl.DataFrame()
-
+
         # Create DataFrame with proper schema
         ms1_df = pl.DataFrame(ms1_data)
-
+
         # Apply expected schema for MS1 data
         expected_schema = {
             "cycle": pl.Int64,
-            "scan_uid": pl.Int64,
+            "scan_uid": pl.Int64,
             "rt": pl.Float64,
             "mz": pl.Float64,
-            "inty": pl.Float64
+            "inty": pl.Float64,
         }
-
+
         # Cast columns to expected types if they exist
         cast_expressions = []
         for col, dtype in expected_schema.items():
             if col in ms1_df.columns:
                 cast_expressions.append(pl.col(col).cast(dtype))
-
+
         if cast_expressions:
             ms1_df = ms1_df.with_columns(cast_expressions)
-
+
         self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {filename}")
         return ms1_df
-
-
-
-
+
+
+    # except Exception as e:
+    #     self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+    #     return pl.DataFrame()
 
 
 def _sanitize_nulls(self):
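_load_ms1 reads every dataset in the ms1 group into a dict, builds a Polars frame, and casts the known columns. A condensed, self-contained sketch of that flow (hypothetical function name; file layout assumed from the hunk above):

    import h5py
    import polars as pl

    def load_ms1_like(path: str) -> pl.DataFrame:
        with h5py.File(path, "r") as f:
            if "ms1" not in f:
                return pl.DataFrame()
            data = {col: f["ms1"][col][:] for col in f["ms1"].keys()}
        if not data:
            return pl.DataFrame()
        df = pl.DataFrame(data)
        wanted = {"cycle": pl.Int64, "scan_uid": pl.Int64, "rt": pl.Float64, "mz": pl.Float64, "inty": pl.Float64}
        # Cast only columns actually present in the file.
        casts = [pl.col(c).cast(t) for c, t in wanted.items() if c in df.columns]
        return df.with_columns(casts) if casts else df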
@@ -2269,14 +2194,14 @@ def _sanitize_nulls(self):
     import uuid
     import polars as pl
     import time
-
+
     # Sanitize features_df feature_id column
-    if hasattr(self,
+    if hasattr(self, "features_df") and self.features_df is not None and not self.features_df.is_empty():
         # Check for null feature_ids
         null_feature_ids = self.features_df.filter(pl.col("feature_id").is_null()).shape[0]
         if null_feature_ids > 0:
             self.logger.debug(f"Sanitizing {null_feature_ids} null feature_id values with new integer IDs")
-
+
             # Find the maximum existing feature_id (convert strings to int if possible)
             max_existing_id = 0
             existing_ids = self.features_df.filter(pl.col("feature_id").is_not_null())["feature_id"].to_list()
@@ -2287,13 +2212,13 @@ def _sanitize_nulls(self):
                 except (ValueError, TypeError):
                     # Skip non-integer IDs
                     pass
-
+
             # Generate new sequential integer IDs starting from max + timestamp offset
             # Use timestamp to ensure uniqueness across different sanitization runs
             base_id = max(max_existing_id + 1, int(time.time() * 1000000))  # Microsecond timestamp
             new_int_ids = [str(base_id + i) for i in range(null_feature_ids)]
             uid_index = 0
-
+
             # Create a list to store all feature_ids
             feature_ids = []
             for feature_id in self.features_df["feature_id"].to_list():
@@ -2302,25 +2227,23 @@ def _sanitize_nulls(self):
                     uid_index += 1
                 else:
                     feature_ids.append(feature_id)
-
+
             # Update the DataFrame with sanitized feature_ids
-            self.features_df = self.features_df.with_columns(
-
-            )
-
+            self.features_df = self.features_df.with_columns(pl.Series("feature_id", feature_ids, dtype=pl.Utf8))
+
             self.logger.debug(f"Sanitized {null_feature_ids} feature_id values")
-
+
     # Sanitize consensus_df consensus_id column
-    if hasattr(self,
+    if hasattr(self, "consensus_df") and self.consensus_df is not None and not self.consensus_df.is_empty():
         if "consensus_id" in self.consensus_df.columns:
             null_consensus_ids = self.consensus_df.filter(pl.col("consensus_id").is_null()).shape[0]
             if null_consensus_ids > 0:
                 self.logger.debug(f"Sanitizing {null_consensus_ids} null consensus_id values with new UIDs")
-
+
                 # Generate new UIDs for null values using the same method as merge()
-                new_uids = [str(uuid.uuid4()).replace(
+                new_uids = [str(uuid.uuid4()).replace("-", "")[:16] for _ in range(null_consensus_ids)]
                 uid_index = 0
-
+
                 # Create a list to store all consensus_ids
                 consensus_ids = []
                 for consensus_id in self.consensus_df["consensus_id"].to_list():
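The consensus_id backfill generates 16-character hex IDs by stripping dashes from a UUID4. The expression in isolation:

    import uuid

    # Three 16-character hex IDs, e.g. ['3f2c9a1d8b7e4c50', ...]
    print([str(uuid.uuid4()).replace("-", "")[:16] for _ in range(3)])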
@@ -2329,7 +2252,7 @@ def _sanitize_nulls(self):
                     uid_index += 1
                 else:
                     consensus_ids.append(consensus_id)
-
+
                 # Update the DataFrame with sanitized consensus_ids
                 self.consensus_df = self.consensus_df.with_columns(
                     pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
@@ -2338,7 +2261,7 @@ def _sanitize_nulls(self):
                 self.logger.debug(f"Sanitized {null_consensus_ids} consensus_id values")
 
     # Sanitize rt_original in features_df by replacing null or NaN values with rt values
-    if hasattr(self,
+    if hasattr(self, "features_df") and self.features_df is not None and not self.features_df.is_empty():
         if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
             # Check for null or NaN values in rt_original
             null_or_nan_rt_original = self.features_df.filter(
@@ -2352,4 +2275,4 @@ def _sanitize_nulls(self):
                     .otherwise(pl.col("rt_original"))
                     .alias("rt_original")
                 )
-            self.logger.debug(f"Sanitized {null_or_nan_rt_original} rt_original values")
+            self.logger.debug(f"Sanitized {null_or_nan_rt_original} rt_original values")