masster 0.5.22-py3-none-any.whl → 0.5.23-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/logger.py +35 -19
- masster/sample/adducts.py +15 -29
- masster/sample/defaults/find_adducts_def.py +1 -3
- masster/sample/defaults/sample_def.py +4 -4
- masster/sample/h5.py +203 -361
- masster/sample/helpers.py +14 -30
- masster/sample/lib.py +3 -3
- masster/sample/load.py +21 -29
- masster/sample/plot.py +222 -132
- masster/sample/processing.py +42 -55
- masster/sample/sample.py +37 -46
- masster/sample/save.py +37 -61
- masster/sample/sciex.py +13 -11
- masster/sample/thermo.py +69 -74
- masster/spectrum.py +15 -15
- masster/study/analysis.py +650 -586
- masster/study/defaults/identify_def.py +1 -3
- masster/study/defaults/merge_def.py +6 -7
- masster/study/defaults/study_def.py +1 -5
- masster/study/export.py +35 -96
- masster/study/h5.py +100 -204
- masster/study/helpers.py +385 -459
- masster/study/id.py +239 -290
- masster/study/importers.py +84 -93
- masster/study/load.py +159 -178
- masster/study/merge.py +1112 -1098
- masster/study/plot.py +195 -149
- masster/study/processing.py +144 -191
- masster/study/save.py +14 -13
- masster/study/study.py +89 -130
- masster/wizard/wizard.py +763 -713
- {masster-0.5.22.dist-info → masster-0.5.23.dist-info}/METADATA +27 -1
- {masster-0.5.22.dist-info → masster-0.5.23.dist-info}/RECORD +37 -37
- {masster-0.5.22.dist-info → masster-0.5.23.dist-info}/WHEEL +0 -0
- {masster-0.5.22.dist-info → masster-0.5.23.dist-info}/entry_points.txt +0 -0
- {masster-0.5.22.dist-info → masster-0.5.23.dist-info}/licenses/LICENSE +0 -0
masster/study/h5.py
CHANGED
@@ -61,18 +61,18 @@ def _create_empty_dataframe_from_schema(df_name: str, schema: dict) -> pl.DataFr
     if df_name not in schema:
         # Fallback to basic empty DataFrame if schema not found
         return pl.DataFrame()
-
+
     df_schema = schema[df_name]["columns"]
     empty_data = {}
     polars_schema = {}
-
+
     for col_name, col_info in df_schema.items():
         dtype_str = col_info["dtype"]
         # Convert string representation to actual Polars dtype
         if dtype_str == "pl.Int64":
             polars_dtype = pl.Int64
         elif dtype_str == "pl.Int32":
-            polars_dtype = pl.Int32
+            polars_dtype = pl.Int32
         elif dtype_str == "pl.Float64":
             polars_dtype = pl.Float64
         elif dtype_str == "pl.Utf8":
@@ -88,10 +88,10 @@ def _create_empty_dataframe_from_schema(df_name: str, schema: dict) -> pl.DataFr
         else:
             # Fallback to string if unknown type
             polars_dtype = pl.String
-
+
         empty_data[col_name] = []
         polars_schema[col_name] = polars_dtype
-
+
     return pl.DataFrame(empty_data, schema=polars_schema)
 
 
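Note on the hunk above: _create_empty_dataframe_from_schema maps dtype strings such as "pl.Int64" to Polars dtypes and builds an empty frame with that schema. A minimal standalone sketch of the same idea (the dict-based lookup and the helper name below are illustrative, not masster's API):

    import polars as pl

    # Illustrative lookup from dtype strings to Polars dtypes; the diff above uses
    # an if/elif chain with the same mappings and the same String fallback.
    DTYPE_MAP = {
        "pl.Int64": pl.Int64,
        "pl.Int32": pl.Int32,
        "pl.Float64": pl.Float64,
        "pl.Utf8": pl.Utf8,
    }

    def empty_frame_from_schema(df_name: str, schema: dict) -> pl.DataFrame:
        """Build an empty DataFrame whose columns follow a stored schema dict."""
        if df_name not in schema:
            return pl.DataFrame()  # fallback when no schema entry exists
        columns = schema[df_name]["columns"]
        polars_schema = {col: DTYPE_MAP.get(info["dtype"], pl.String) for col, info in columns.items()}
        return pl.DataFrame({col: [] for col in polars_schema}, schema=polars_schema)

    # Example with a made-up schema entry:
    # empty_frame_from_schema("features", {"features": {"columns": {"rt": {"dtype": "pl.Float64"}}}})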
@@ -313,7 +313,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                         serialized_chunk.append(json.dumps(item.tolist()))
                     except (AttributeError, TypeError):
                         # Fallback for non-numpy data
-                        serialized_chunk.append(json.dumps(list(item) if hasattr(item,
+                        serialized_chunk.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
                 else:
                     serialized_chunk.append("None")
         elif col_name == "ms1_spec":
@@ -325,7 +325,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                         serialized_chunk.append(json.dumps(item.tolist()))
                     except (AttributeError, TypeError):
                         # Fallback for non-numpy data
-                        serialized_chunk.append(json.dumps(list(item) if hasattr(item,
+                        serialized_chunk.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
                 else:
                     serialized_chunk.append("None")
         else:
@@ -392,10 +392,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                 )
                 # Fallback to simple string conversion for this chunk
                 chunk = data_list[chunk_start : chunk_start + chunk_size]
-                results[chunk_start] = [
-                    str(item) if item is not None else "None"
-                    for item in chunk
-                ]
+                results[chunk_start] = [str(item) if item is not None else "None" for item in chunk]
 
     # Reassemble in correct order
     for i in range(0, total_items, chunk_size):
@@ -598,7 +595,7 @@ def _save_dataframe_column_legacy(
                     data_as_json_strings.append(json.dumps(item.tolist()))
                 except (AttributeError, TypeError):
                     # Fallback for non-numpy data
-                    data_as_json_strings.append(json.dumps(list(item) if hasattr(item,
+                    data_as_json_strings.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
             else:
                 data_as_json_strings.append("None")
         group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
@@ -612,7 +609,7 @@ def _save_dataframe_column_legacy(
                     data_as_json_strings.append(json.dumps(item.tolist()))
                 except (AttributeError, TypeError):
                     # Fallback for non-numpy data
-                    data_as_json_strings.append(json.dumps(list(item) if hasattr(item,
+                    data_as_json_strings.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
             else:
                 data_as_json_strings.append("None")
         group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
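Note on the serialization hunks above: object columns (isotope patterns, MS1 spectra) are JSON-encoded before being written as HDF5 string datasets, with a fallback chain for non-numpy values. A simplified per-cell sketch of that fallback, assuming the cell holds a numpy array, a plain iterable, or None:

    import json
    import numpy as np

    def serialize_cell(item):
        """JSON-encode one cell of an object column for storage as an HDF5 string dataset.

        Mirrors the fallback chain in the hunks above: numpy arrays via .tolist(),
        other iterables via list(), and None stored as the literal string "None".
        """
        if item is None:
            return "None"
        try:
            return json.dumps(item.tolist())  # numpy array -> nested list -> JSON
        except (AttributeError, TypeError):
            return json.dumps(list(item) if hasattr(item, "__iter__") else [])

    print(serialize_cell(np.array([[100.05, 1200.0], [101.05, 300.0]])))  # JSON text
    print(serialize_cell(None))                                           # "None"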
@@ -712,9 +709,7 @@ def _reconstruct_object_column(data_col, col_name: str):
                             "adduct": str(adduct_row[0]),
                             "count": int(float(adduct_row[1])),
                             "percentage": float(adduct_row[2]),
-                            "mass": float(adduct_row[3])
-                            if len(adduct_row) > 3
-                            else 0.0,
+                            "mass": float(adduct_row[3]) if len(adduct_row) > 3 else 0.0,
                         },
                     )
                 reconstructed_data.append(converted_adducts)
@@ -722,6 +717,7 @@ def _reconstruct_object_column(data_col, col_name: str):
             # Handle isotope patterns (numpy arrays with [mz, intensity] data)
             try:
                 import numpy as np
+
                 iso_data = json.loads(item)
                 # Convert back to numpy array
                 reconstructed_data.append(np.array(iso_data) if iso_data else None)
@@ -731,6 +727,7 @@ def _reconstruct_object_column(data_col, col_name: str):
             # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
             try:
                 import numpy as np
+
                 ms1_spec_data = json.loads(item)
                 # Convert back to numpy array
                 reconstructed_data.append(np.array(ms1_spec_data) if ms1_spec_data else None)
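Note on _reconstruct_object_column: on load, the stored JSON strings are turned back into numpy arrays (or None). A minimal sketch of the iso/ms1_spec branch shown above; error handling and the adduct branch are omitted:

    import json
    import numpy as np

    def reconstruct_spectrum_cell(item):
        """Turn a stored JSON string back into a numpy array, or None for empty cells."""
        if item is None or item == "None":
            return None
        data = json.loads(item)
        return np.array(data) if data else None

    spec = reconstruct_spectrum_cell("[[100.05, 1200.0], [101.05, 300.0]]")
    print(spec.shape)  # (2, 2): rows of [mz, intensity]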
@@ -821,25 +818,25 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
     # First check all data for numpy object arrays and move them to object columns
     additional_object_cols = []
     for k, v in data.items():
-        if k not in object_columns and hasattr(v,
+        if k not in object_columns and hasattr(v, "dtype") and str(v.dtype) == "object":
             # This is a numpy object array that should be treated as object
             additional_object_cols.append(k)
             object_columns.append(k)
-
+
     if additional_object_cols:
         # Re-run reconstruction for these columns
         for col in additional_object_cols:
             data[col] = _reconstruct_object_column(data[col], col)
-
+
     object_data = {k: v for k, v in data.items() if k in object_columns}
     regular_data = {k: v for k, v in data.items() if k not in object_columns}
 
     # Final check: ensure no numpy object arrays in regular_data
     problematic_cols = []
     for k, v in regular_data.items():
-        if hasattr(v,
+        if hasattr(v, "dtype") and str(v.dtype) == "object":
             problematic_cols.append(k)
-
+
     if problematic_cols:
         # Move these to object_data
         for col in problematic_cols:
@@ -878,7 +875,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
     # and handle numpy scalars within lists
     safe_regular_data = {}
     import numpy as np
-
+
     def convert_numpy_scalars(value):
         """Convert numpy scalars to Python native types recursively."""
         if isinstance(value, np.generic):
@@ -887,17 +884,19 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
             return [convert_numpy_scalars(item) for item in value]
         else:
             return value
-
+
     for k, v in regular_data.items():
-        if hasattr(v,
+        if hasattr(v, "dtype") and str(v.dtype) == "object":
             # Convert numpy object array to Python list
-            safe_regular_data[k] = [
+            safe_regular_data[k] = [
+                convert_numpy_scalars(item) for item in (v.tolist() if hasattr(v, "tolist") else list(v))
+            ]
         elif isinstance(v, list):
             # Handle lists that might contain numpy scalars
             safe_regular_data[k] = [convert_numpy_scalars(item) for item in v]
         else:
             safe_regular_data[k] = convert_numpy_scalars(v)
-
+
     # Create DataFrame with proper error handling
     try:
         df = pl.DataFrame(safe_regular_data)
@@ -914,7 +913,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
         except Exception:
             # Last resort: skip the column entirely
             continue
-
+
     # Add Object columns one by one
     for col, values in object_data.items():
         # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
@@ -993,9 +992,7 @@ def _load_dataframe_from_group(
     )
     schema_section = schema.get(df_name, {}) if isinstance(schema, dict) else {}
     logger.debug(f"Schema section for {df_name}: {schema_section}")
-    schema_columns = (
-        schema_section.get("columns", []) if isinstance(schema_section, dict) else []
-    )
+    schema_columns = schema_section.get("columns", []) if isinstance(schema_section, dict) else []
     logger.debug(f"Schema columns for {df_name}: {schema_columns}")
     if schema_columns is None:
         schema_columns = []
@@ -1158,11 +1155,7 @@ def _load_dataframe_from_group(
     }
     migrated_old_names = set(column_migrations.keys())
 
-    extra_columns = [
-        col
-        for col in hdf5_columns
-        if col not in (schema_columns or []) and col not in migrated_old_names
-    ]
+    extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]
 
     for col in extra_columns:
         logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
@@ -1188,10 +1181,7 @@ def _load_dataframe_from_group(
                     object_columns.append(col)
                 else:
                     # Regular string data
-                    data[col] = [
-                        item.decode("utf-8") if isinstance(item, bytes) else item
-                        for item in column_data
-                    ]
+                    data[col] = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
             except Exception:
                 # If decoding fails, treat as regular data
                 data[col] = column_data
@@ -1204,19 +1194,10 @@ def _load_dataframe_from_group(
     # Handle byte string conversion for non-object columns
     # Only convert to strings for columns that should actually be strings
     for col, values in data.items():
-        if (
-            col not in object_columns
-            and values is not None
-            and len(values) > 0
-            and isinstance(values[0], bytes)
-        ):
+        if col not in object_columns and values is not None and len(values) > 0 and isinstance(values[0], bytes):
             # Check schema to see if this should be a string column
             should_be_string = False
-            if (
-                df_name in schema
-                and "columns" in schema[df_name]
-                and col in schema[df_name]["columns"]
-            ):
+            if df_name in schema and "columns" in schema[df_name] and col in schema[df_name]["columns"]:
                 dtype_str = schema[df_name]["columns"][col]["dtype"]
                 should_be_string = dtype_str == "pl.Utf8"
 
@@ -1237,25 +1218,25 @@ def _load_dataframe_from_group(
             logger.debug(
                 f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}",
             )
-
+
         # Debug: check for problematic data types in all columns before DataFrame creation
         for col, values in data.items():
-            if hasattr(values,
+            if hasattr(values, "dtype") and str(values.dtype) == "object":
                 logger.warning(f"Column '{col}' has numpy object dtype but is not in object_columns: {object_columns}")
                 if col not in object_columns:
                     object_columns.append(col)
-
+
         df = _create_dataframe_with_objects(data, object_columns)
     else:
         # Debug: check for problematic data types when no object columns are expected
        for col, values in data.items():
-            if hasattr(values,
+            if hasattr(values, "dtype") and str(values.dtype) == "object":
                 logger.warning(f"Column '{col}' has numpy object dtype but no object_columns specified!")
                 # Treat as object column
                 if object_columns is None:
                     object_columns = []
                 object_columns.append(col)
-
+
         if object_columns:
             df = _create_dataframe_with_objects(data, object_columns)
         else:
@@ -1302,34 +1283,21 @@ def _save_study5_compressed(self, filename):
         dataframes_to_save.append(("features", len(self.features_df)))
     if self.consensus_df is not None and not self.consensus_df.is_empty():
         dataframes_to_save.append(("consensus", len(self.consensus_df)))
-    if (
-        self.consensus_mapping_df is not None
-        and not self.consensus_mapping_df.is_empty()
-    ):
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         dataframes_to_save.append(
             ("consensus_mapping", len(self.consensus_mapping_df)),
         )
     if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
         dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
-    if (
-        hasattr(self, "lib_df")
-        and self.lib_df is not None
-        and not self.lib_df.is_empty()
-    ):
+    if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
         dataframes_to_save.append(("lib", len(self.lib_df)))
-    if (
-        hasattr(self, "id_df")
-        and self.id_df is not None
-        and not self.id_df.is_empty()
-    ):
+    if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
         dataframes_to_save.append(("id", len(self.id_df)))
 
     total_steps = len(dataframes_to_save) + 1  # +1 for metadata
 
     # Show progress for large saves
-    tdqm_disable = (
-        self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
-    )
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
 
     with tqdm(
         total=total_steps,
@@ -1347,14 +1315,8 @@ def _save_study5_compressed(self, filename):
 
         # Store metadata
         metadata_group.attrs["format"] = "masster-study-1"
-        metadata_group.attrs["folder"] = (
-
-        )
-        metadata_group.attrs["label"] = (
-            str(self.label)
-            if hasattr(self, "label") and self.label is not None
-            else ""
-        )
+        metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
+        metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""
 
         # Store parameters as JSON
         if hasattr(self, "parameters") and self.history is not None:
@@ -1419,10 +1381,7 @@ def _save_study5_compressed(self, filename):
             pbar.update(1)
 
         # Store consensus_mapping_df - keep existing fast method
-        if (
-            self.consensus_mapping_df is not None
-            and not self.consensus_mapping_df.is_empty()
-        ):
+        if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
             consensus_mapping = self.consensus_mapping_df.clone()
             self.logger.debug(
                 f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
@@ -1458,11 +1417,7 @@ def _save_study5_compressed(self, filename):
             pbar.update(1)
 
         # Store lib_df - library data
-        if (
-            hasattr(self, "lib_df")
-            and self.lib_df is not None
-            and not self.lib_df.is_empty()
-        ):
+        if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
             self.logger.debug(
                 f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
             )
@@ -1476,11 +1431,7 @@ def _save_study5_compressed(self, filename):
             pbar.update(1)
 
         # Store id_df - identification results
-        if (
-            hasattr(self, "id_df")
-            and self.id_df is not None
-            and not self.id_df.is_empty()
-        ):
+        if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
             self.logger.debug(
                 f"Saving id_df with {len(self.id_df)} rows using optimized method",
             )
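Note on the save hunks above: the saver first collects the non-empty DataFrames, then writes them under a tqdm progress bar that is disabled unless the log level is verbose. A rough sketch of that bookkeeping (the study object, its attribute access, and the helper name here are stand-ins, not the packaged class):

    from tqdm import tqdm

    def save_plan(study, log_level="WARNING"):
        """Collect (name, row_count) pairs for non-empty frames, then iterate under tqdm."""
        plan = []
        for name, attr in [
            ("features", "features_df"),
            ("consensus", "consensus_df"),
            ("consensus_mapping", "consensus_mapping_df"),
            ("consensus_ms2", "consensus_ms2"),
            ("lib", "lib_df"),
            ("id", "id_df"),
        ]:
            df = getattr(study, attr, None)
            if df is not None and not df.is_empty():
                plan.append((name, len(df)))

        total_steps = len(plan) + 1  # +1 for metadata, as in the diff
        disable = log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
        with tqdm(total=total_steps, desc="Saving study", disable=disable, unit="step") as pbar:
            for name, n_rows in plan:
                # ... write the corresponding HDF5 group here ...
                pbar.update(1)
            pbar.update(1)  # metadata step
        return plan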
@@ -1636,34 +1587,21 @@ def _save_study5(self, filename):
         dataframes_to_save.append(("features", len(self.features_df)))
     if self.consensus_df is not None and not self.consensus_df.is_empty():
         dataframes_to_save.append(("consensus", len(self.consensus_df)))
-    if (
-        self.consensus_mapping_df is not None
-        and not self.consensus_mapping_df.is_empty()
-    ):
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         dataframes_to_save.append(
             ("consensus_mapping", len(self.consensus_mapping_df)),
         )
     if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
         dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
-    if (
-        hasattr(self, "lib_df")
-        and self.lib_df is not None
-        and not self.lib_df.is_empty()
-    ):
+    if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
         dataframes_to_save.append(("lib", len(self.lib_df)))
-    if (
-        hasattr(self, "id_df")
-        and self.id_df is not None
-        and not self.id_df.is_empty()
-    ):
+    if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
         dataframes_to_save.append(("id", len(self.id_df)))
 
     total_steps = len(dataframes_to_save) + 1  # +1 for metadata
 
     # Show progress for large saves
-    tdqm_disable = (
-        self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
-    )
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
 
     with tqdm(
         total=total_steps,
@@ -1681,14 +1619,8 @@ def _save_study5(self, filename):
 
         # Store metadata
         metadata_group.attrs["format"] = "masster-study-1"
-        metadata_group.attrs["folder"] = (
-
-        )
-        metadata_group.attrs["label"] = (
-            str(self.label)
-            if hasattr(self, "label") and self.label is not None
-            else ""
-        )
+        metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
+        metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""
 
         # Store parameters as JSON
         if hasattr(self, "parameters") and self.history is not None:
@@ -1756,10 +1688,7 @@ def _save_study5(self, filename):
             pbar.update(1)
 
         # Store consensus_mapping_df - keep existing fast method
-        if (
-            self.consensus_mapping_df is not None
-            and not self.consensus_mapping_df.is_empty()
-        ):
+        if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
             consensus_mapping = self.consensus_mapping_df.clone()
             self.logger.debug(
                 f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
@@ -1795,11 +1724,7 @@ def _save_study5(self, filename):
             pbar.update(1)
 
         # Store lib_df - library data
-        if (
-            hasattr(self, "lib_df")
-            and self.lib_df is not None
-            and not self.lib_df.is_empty()
-        ):
+        if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
             self.logger.debug(
                 f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
             )
@@ -1813,11 +1738,7 @@ def _save_study5(self, filename):
             pbar.update(1)
 
         # Store id_df - identification results
-        if (
-            hasattr(self, "id_df")
-            and self.id_df is not None
-            and not self.id_df.is_empty()
-        ):
+        if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
             self.logger.debug(
                 f"Saving id_df with {len(self.id_df)} rows using optimized method",
             )
@@ -1896,12 +1817,7 @@ def _load_study5(self, filename=None):
 
     with h5py.File(filename, "r") as f:
         # Use progress bar to show loading progress
-        with tqdm(
-            total=len(loading_steps),
-            desc="Loading study",
-            disable=tdqm_disable,
-            unit="step"
-        ) as pbar:
+        with tqdm(total=len(loading_steps), desc="Loading study", disable=tdqm_disable, unit="step") as pbar:
             # Load metadata
             pbar.set_description(
                 f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata",
@@ -1963,24 +1879,14 @@ def _load_study5(self, filename=None):
             # Synchronize instance attributes with parameters (similar to __init__)
             # Note: folder and label are already loaded from metadata attributes above
             # but we ensure they match the parameters for consistency
-            if (
-                hasattr(self.parameters, "folder")
-                and self.parameters.folder is not None
-            ):
+            if hasattr(self.parameters, "folder") and self.parameters.folder is not None:
                 self.folder = self.parameters.folder
-            if (
-                hasattr(self.parameters, "label")
-                and self.parameters.label is not None
-            ):
+            if hasattr(self.parameters, "label") and self.parameters.label is not None:
                 self.label = self.parameters.label
             if hasattr(self.parameters, "log_level"):
                 self.log_level = self.parameters.log_level
             if hasattr(self.parameters, "log_label"):
-                self.log_label = (
-                    self.parameters.log_label
-                    if self.parameters.log_label is not None
-                    else ""
-                )
+                self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
             if hasattr(self.parameters, "log_sink"):
                 self.log_sink = self.parameters.log_sink
             pbar.update(1)
@@ -2017,7 +1923,7 @@ def _load_study5(self, filename=None):
                 self.logger,
                 object_columns,
             )
-
+
             # Sanity check: replace any missing rt_original with rt values
             if self.features_df is not None and not self.features_df.is_empty():
                 if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
@@ -2061,16 +1967,11 @@ def _load_study5(self, filename=None):
 
             # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
             if self.consensus_df is not None:
-                if (
-                    "adducts" not in self.consensus_df.columns
-                    or self.consensus_df["adducts"].dtype == pl.Null
-                ):
+                if "adducts" not in self.consensus_df.columns or self.consensus_df["adducts"].dtype == pl.Null:
                     self.logger.info(
                         "Adding missing 'adducts' column for backward compatibility",
                     )
-                    empty_adducts: list[list] = [
-                        [] for _ in range(len(self.consensus_df))
-                    ]
+                    empty_adducts: list[list] = [[] for _ in range(len(self.consensus_df))]
 
                     # If column exists but is Null, drop it first
                     if "adducts" in self.consensus_df.columns:
@@ -2163,11 +2064,7 @@ def _load_study5(self, filename=None):
             pbar.update(1)
 
         # Check and migrate old string-based map_id to integer indices
-        if (
-            self.samples_df is not None
-            and not self.samples_df.is_empty()
-            and self.samples_df["map_id"].dtype == pl.Utf8
-        ):
+        if self.samples_df is not None and not self.samples_df.is_empty() and self.samples_df["map_id"].dtype == pl.Utf8:
             self.logger.info(
                 "Detected old string-based map_id format, migrating to integer indices",
             )
@@ -2191,26 +2088,26 @@ def _load_study5(self, filename=None):
     _sanitize_nulls(self)
 
     self.logger.debug("Study loaded")
-
+
 
 def _load_ms1(self, filename: str) -> pl.DataFrame:
     """
     Optimized method to load only MS1 data from a sample5 file for isotope detection.
-
+
     This method efficiently loads only the ms1_df from a sample5 HDF5 file without
     loading other potentially large datasets like features_df, scans_df, etc.
-
+
     Args:
         sample_path (str): Path to the sample5 HDF5 file
-
+
     Returns:
-        pl.DataFrame: MS1 data with columns [cycle, scan_uid, rt, mz, inty]
+        pl.DataFrame: MS1 data with columns [cycle, scan_uid, rt, mz, inty]
                       Returns empty DataFrame if no MS1 data found or file cannot be read
-
+
     Note:
         Used by find_iso() for efficient isotope pattern detection without full sample loading
     """
-    #try:
+    # try:
    # add .sample5 extension if not provided
    if not filename.endswith(".sample5"):
        filename += ".sample5"
@@ -2219,45 +2116,46 @@ def _load_ms1(self, filename: str) -> pl.DataFrame:
         if "ms1" not in f:
             self.logger.debug(f"No MS1 data found in {filename}")
             return pl.DataFrame()
-
+
         ms1_group = f["ms1"]
-
+
         # Load MS1 data efficiently
         ms1_data = {}
         for col in ms1_group.keys():
             ms1_data[col] = ms1_group[col][:]
-
+
         if not ms1_data:
             self.logger.debug(f"Empty MS1 data in {filename}")
             return pl.DataFrame()
-
+
         # Create DataFrame with proper schema
         ms1_df = pl.DataFrame(ms1_data)
-
+
         # Apply expected schema for MS1 data
         expected_schema = {
             "cycle": pl.Int64,
-            "scan_uid": pl.Int64,
+            "scan_uid": pl.Int64,
             "rt": pl.Float64,
             "mz": pl.Float64,
-            "inty": pl.Float64
+            "inty": pl.Float64,
         }
-
+
         # Cast columns to expected types if they exist
         cast_expressions = []
         for col, dtype in expected_schema.items():
             if col in ms1_df.columns:
                 cast_expressions.append(pl.col(col).cast(dtype))
-
+
         if cast_expressions:
             ms1_df = ms1_df.with_columns(cast_expressions)
-
+
         self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {filename}")
         return ms1_df
-
-
-
-
+
+
+    # except Exception as e:
+    #     self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+    #     return pl.DataFrame()
 
 
 def _sanitize_nulls(self):
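Note on _load_ms1: only the "ms1" group of a .sample5 (HDF5) file is read, and the columns are cast to the expected Polars types. A compact sketch under the same assumptions (column names and dtypes are taken from the hunk above; the standalone function itself is illustrative):

    import h5py
    import polars as pl

    def load_ms1(path: str) -> pl.DataFrame:
        """Read only the "ms1" group from a .sample5 (HDF5) file and cast column types."""
        expected = {"cycle": pl.Int64, "scan_uid": pl.Int64, "rt": pl.Float64, "mz": pl.Float64, "inty": pl.Float64}
        with h5py.File(path, "r") as f:
            if "ms1" not in f:
                return pl.DataFrame()
            data = {col: f["ms1"][col][:] for col in f["ms1"].keys()}
        if not data:
            return pl.DataFrame()
        ms1_df = pl.DataFrame(data)
        casts = [pl.col(c).cast(t) for c, t in expected.items() if c in ms1_df.columns]
        return ms1_df.with_columns(casts) if casts else ms1_df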
@@ -2269,14 +2167,14 @@ def _sanitize_nulls(self):
     import uuid
     import polars as pl
     import time
-
+
     # Sanitize features_df feature_id column
-    if hasattr(self,
+    if hasattr(self, "features_df") and self.features_df is not None and not self.features_df.is_empty():
         # Check for null feature_ids
         null_feature_ids = self.features_df.filter(pl.col("feature_id").is_null()).shape[0]
         if null_feature_ids > 0:
             self.logger.debug(f"Sanitizing {null_feature_ids} null feature_id values with new integer IDs")
-
+
             # Find the maximum existing feature_id (convert strings to int if possible)
             max_existing_id = 0
             existing_ids = self.features_df.filter(pl.col("feature_id").is_not_null())["feature_id"].to_list()
@@ -2287,13 +2185,13 @@ def _sanitize_nulls(self):
                 except (ValueError, TypeError):
                     # Skip non-integer IDs
                     pass
-
+
             # Generate new sequential integer IDs starting from max + timestamp offset
             # Use timestamp to ensure uniqueness across different sanitization runs
             base_id = max(max_existing_id + 1, int(time.time() * 1000000))  # Microsecond timestamp
             new_int_ids = [str(base_id + i) for i in range(null_feature_ids)]
             uid_index = 0
-
+
             # Create a list to store all feature_ids
             feature_ids = []
             for feature_id in self.features_df["feature_id"].to_list():
@@ -2302,25 +2200,23 @@ def _sanitize_nulls(self):
                     uid_index += 1
                 else:
                     feature_ids.append(feature_id)
-
+
             # Update the DataFrame with sanitized feature_ids
-            self.features_df = self.features_df.with_columns(
-
-            )
-
+            self.features_df = self.features_df.with_columns(pl.Series("feature_id", feature_ids, dtype=pl.Utf8))
+
             self.logger.debug(f"Sanitized {null_feature_ids} feature_id values")
-
+
     # Sanitize consensus_df consensus_id column
-    if hasattr(self,
+    if hasattr(self, "consensus_df") and self.consensus_df is not None and not self.consensus_df.is_empty():
         if "consensus_id" in self.consensus_df.columns:
             null_consensus_ids = self.consensus_df.filter(pl.col("consensus_id").is_null()).shape[0]
             if null_consensus_ids > 0:
                 self.logger.debug(f"Sanitizing {null_consensus_ids} null consensus_id values with new UIDs")
-
+
                 # Generate new UIDs for null values using the same method as merge()
-                new_uids = [str(uuid.uuid4()).replace(
+                new_uids = [str(uuid.uuid4()).replace("-", "")[:16] for _ in range(null_consensus_ids)]
                 uid_index = 0
-
+
                 # Create a list to store all consensus_ids
                 consensus_ids = []
                 for consensus_id in self.consensus_df["consensus_id"].to_list():
@@ -2329,7 +2225,7 @@ def _sanitize_nulls(self):
                     uid_index += 1
                 else:
                     consensus_ids.append(consensus_id)
-
+
                 # Update the DataFrame with sanitized consensus_ids
                 self.consensus_df = self.consensus_df.with_columns(
                     pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
@@ -2338,7 +2234,7 @@ def _sanitize_nulls(self):
                 self.logger.debug(f"Sanitized {null_consensus_ids} consensus_id values")
 
     # Sanitize rt_original in features_df by replacing null or NaN values with rt values
-    if hasattr(self,
+    if hasattr(self, "features_df") and self.features_df is not None and not self.features_df.is_empty():
         if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
             # Check for null or NaN values in rt_original
             null_or_nan_rt_original = self.features_df.filter(
@@ -2352,4 +2248,4 @@ def _sanitize_nulls(self):
                 .otherwise(pl.col("rt_original"))
                 .alias("rt_original")
             )
-            self.logger.debug(f"Sanitized {null_or_nan_rt_original} rt_original values")
+            self.logger.debug(f"Sanitized {null_or_nan_rt_original} rt_original values")
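Note on _sanitize_nulls: null IDs are back-filled with freshly generated identifiers while existing ones are preserved. A simplified sketch of the consensus_id branch (the helper below is illustrative; the packaged code also handles feature_id integer IDs and rt_original back-filling):

    import uuid
    import polars as pl

    def fill_null_ids(df: pl.DataFrame, id_col: str = "consensus_id") -> pl.DataFrame:
        """Replace null IDs with fresh 16-character UIDs, leaving existing IDs untouched."""
        n_null = df.filter(pl.col(id_col).is_null()).height
        if n_null == 0:
            return df
        new_uids = iter(str(uuid.uuid4()).replace("-", "")[:16] for _ in range(n_null))
        ids = [v if v is not None else next(new_uids) for v in df[id_col].to_list()]
        return df.with_columns(pl.Series(id_col, ids, dtype=pl.Utf8))

    print(fill_null_ids(pl.DataFrame({"consensus_id": ["abc123", None, None]})))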
|