masster 0.5.22-py3-none-any.whl → 0.5.24-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release has been flagged as potentially problematic.

masster/study/h5.py CHANGED
@@ -61,18 +61,18 @@ def _create_empty_dataframe_from_schema(df_name: str, schema: dict) -> pl.DataFr
  if df_name not in schema:
  # Fallback to basic empty DataFrame if schema not found
  return pl.DataFrame()
-
+
  df_schema = schema[df_name]["columns"]
  empty_data = {}
  polars_schema = {}
-
+
  for col_name, col_info in df_schema.items():
  dtype_str = col_info["dtype"]
  # Convert string representation to actual Polars dtype
  if dtype_str == "pl.Int64":
  polars_dtype = pl.Int64
  elif dtype_str == "pl.Int32":
- polars_dtype = pl.Int32
+ polars_dtype = pl.Int32
  elif dtype_str == "pl.Float64":
  polars_dtype = pl.Float64
  elif dtype_str == "pl.Utf8":
@@ -88,10 +88,10 @@ def _create_empty_dataframe_from_schema(df_name: str, schema: dict) -> pl.DataFr
  else:
  # Fallback to string if unknown type
  polars_dtype = pl.String
-
+
  empty_data[col_name] = []
  polars_schema[col_name] = polars_dtype
-
+
  return pl.DataFrame(empty_data, schema=polars_schema)


@@ -313,7 +313,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  serialized_chunk.append(json.dumps(item.tolist()))
  except (AttributeError, TypeError):
  # Fallback for non-numpy data
- serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+ serialized_chunk.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
  else:
  serialized_chunk.append("None")
  elif col_name == "ms1_spec":
@@ -325,7 +325,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  serialized_chunk.append(json.dumps(item.tolist()))
  except (AttributeError, TypeError):
  # Fallback for non-numpy data
- serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+ serialized_chunk.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
  else:
  serialized_chunk.append("None")
  else:
@@ -392,10 +392,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  )
  # Fallback to simple string conversion for this chunk
  chunk = data_list[chunk_start : chunk_start + chunk_size]
- results[chunk_start] = [
- str(item) if item is not None else "None"
- for item in chunk
- ]
+ results[chunk_start] = [str(item) if item is not None else "None" for item in chunk]

  # Reassemble in correct order
  for i in range(0, total_items, chunk_size):
@@ -598,7 +595,7 @@ def _save_dataframe_column_legacy(
  data_as_json_strings.append(json.dumps(item.tolist()))
  except (AttributeError, TypeError):
  # Fallback for non-numpy data
- data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+ data_as_json_strings.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
  else:
  data_as_json_strings.append("None")
  group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
@@ -612,7 +609,7 @@ def _save_dataframe_column_legacy(
  data_as_json_strings.append(json.dumps(item.tolist()))
  except (AttributeError, TypeError):
  # Fallback for non-numpy data
- data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+ data_as_json_strings.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
  else:
  data_as_json_strings.append("None")
  group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
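The hunks above all apply the same serialization pattern when writing object columns to HDF5: numpy arrays are JSON-encoded via tolist(), other iterables via list(), and anything else becomes an empty list. A minimal, self-contained sketch of that pattern (the helper name serialize_spectrum is ours, not part of masster's API):

import json
import numpy as np

def serialize_spectrum(item) -> str:
    """JSON-encode a spectrum-like value the way the diff's fallback chain does."""
    if item is None:
        return "None"
    try:
        return json.dumps(item.tolist())  # numpy arrays serialize via tolist()
    except (AttributeError, TypeError):
        # Fallback for non-numpy data: any iterable becomes a list, anything else an empty list
        return json.dumps(list(item) if hasattr(item, "__iter__") else [])

print(serialize_spectrum(np.array([[100.0, 2.5e4], [101.0, 1.1e4]])))  # [[100.0, 25000.0], [101.0, 11000.0]]
print(serialize_spectrum(None))  # None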
@@ -712,9 +709,7 @@ def _reconstruct_object_column(data_col, col_name: str):
  "adduct": str(adduct_row[0]),
  "count": int(float(adduct_row[1])),
  "percentage": float(adduct_row[2]),
- "mass": float(adduct_row[3])
- if len(adduct_row) > 3
- else 0.0,
+ "mass": float(adduct_row[3]) if len(adduct_row) > 3 else 0.0,
  },
  )
  reconstructed_data.append(converted_adducts)
@@ -722,15 +717,39 @@ def _reconstruct_object_column(data_col, col_name: str):
  # Handle isotope patterns (numpy arrays with [mz, intensity] data)
  try:
  import numpy as np
- iso_data = json.loads(item)
- # Convert back to numpy array
- reconstructed_data.append(np.array(iso_data) if iso_data else None)
- except (json.JSONDecodeError, ValueError, ImportError):
+
+ # Try JSON parsing first (new format)
+ try:
+ iso_data = json.loads(item)
+ # Convert back to numpy array
+ reconstructed_data.append(np.array(iso_data) if iso_data else None)
+ except json.JSONDecodeError:
+ # Handle numpy array string representation (old format)
+ # This handles strings like "[[ 875.7865 447675. ]\n [ 876.7902 168819. ]]"
+ try:
+ # Use numpy's string representation parser
+ iso_array = np.fromstring(item.replace('[', '').replace(']', '').replace('\n', ' '), sep=' ')
+ # Reshape to 2D array (pairs of mz, intensity)
+ if len(iso_array) % 2 == 0:
+ iso_array = iso_array.reshape(-1, 2)
+ reconstructed_data.append(iso_array)
+ else:
+ reconstructed_data.append(None)
+ except (ValueError, AttributeError):
+ # If all else fails, try to evaluate the string as a literal
+ try:
+ import ast
+ iso_data = ast.literal_eval(item)
+ reconstructed_data.append(np.array(iso_data) if iso_data else None)
+ except (ValueError, SyntaxError):
+ reconstructed_data.append(None)
+ except (ValueError, ImportError):
  reconstructed_data.append(None)
  elif col_name == "ms1_spec":
  # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
  try:
  import numpy as np
+
  ms1_spec_data = json.loads(item)
  # Convert back to numpy array
  reconstructed_data.append(np.array(ms1_spec_data) if ms1_spec_data else None)
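The new fallback chain for the iso column reads in three stages: JSON first (new format), then the old numpy string representation, then a Python literal. A condensed sketch of the same logic as a standalone helper (parse_iso is a hypothetical name; the sample strings come from the comment in the diff):

import ast
import json
import numpy as np

def parse_iso(item: str):
    """Recover an (n, 2) [mz, intensity] array from JSON, an old numpy repr string, or a literal."""
    try:
        data = json.loads(item)  # new format: JSON-encoded nested list
        return np.array(data) if data else None
    except json.JSONDecodeError:
        pass
    try:
        # old format, e.g. "[[ 875.7865 447675. ]\n [ 876.7902 168819. ]]"
        flat = np.fromstring(item.replace("[", "").replace("]", "").replace("\n", " "), sep=" ")
        return flat.reshape(-1, 2) if flat.size and flat.size % 2 == 0 else None
    except (ValueError, AttributeError):
        pass
    try:
        data = ast.literal_eval(item)  # last resort: Python literal syntax
        return np.array(data) if data else None
    except (ValueError, SyntaxError):
        return None

parse_iso("[[875.7865, 447675.0], [876.7902, 168819.0]]")   # JSON path
parse_iso("[[ 875.7865 447675. ]\n [ 876.7902 168819. ]]")   # numpy-repr path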
@@ -821,25 +840,25 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  # First check all data for numpy object arrays and move them to object columns
  additional_object_cols = []
  for k, v in data.items():
- if k not in object_columns and hasattr(v, 'dtype') and str(v.dtype) == 'object':
+ if k not in object_columns and hasattr(v, "dtype") and str(v.dtype) == "object":
  # This is a numpy object array that should be treated as object
  additional_object_cols.append(k)
  object_columns.append(k)
-
+
  if additional_object_cols:
  # Re-run reconstruction for these columns
  for col in additional_object_cols:
  data[col] = _reconstruct_object_column(data[col], col)
-
+
  object_data = {k: v for k, v in data.items() if k in object_columns}
  regular_data = {k: v for k, v in data.items() if k not in object_columns}

  # Final check: ensure no numpy object arrays in regular_data
  problematic_cols = []
  for k, v in regular_data.items():
- if hasattr(v, 'dtype') and str(v.dtype) == 'object':
+ if hasattr(v, "dtype") and str(v.dtype) == "object":
  problematic_cols.append(k)
-
+
  if problematic_cols:
  # Move these to object_data
  for col in problematic_cols:
@@ -878,7 +897,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  # and handle numpy scalars within lists
  safe_regular_data = {}
  import numpy as np
-
+
  def convert_numpy_scalars(value):
  """Convert numpy scalars to Python native types recursively."""
  if isinstance(value, np.generic):
@@ -887,17 +906,19 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  return [convert_numpy_scalars(item) for item in value]
  else:
  return value
-
+
  for k, v in regular_data.items():
- if hasattr(v, 'dtype') and str(v.dtype) == 'object':
+ if hasattr(v, "dtype") and str(v.dtype) == "object":
  # Convert numpy object array to Python list
- safe_regular_data[k] = [convert_numpy_scalars(item) for item in (v.tolist() if hasattr(v, 'tolist') else list(v))]
+ safe_regular_data[k] = [
+ convert_numpy_scalars(item) for item in (v.tolist() if hasattr(v, "tolist") else list(v))
+ ]
  elif isinstance(v, list):
  # Handle lists that might contain numpy scalars
  safe_regular_data[k] = [convert_numpy_scalars(item) for item in v]
  else:
  safe_regular_data[k] = convert_numpy_scalars(v)
-
+
  # Create DataFrame with proper error handling
  try:
  df = pl.DataFrame(safe_regular_data)
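The convert_numpy_scalars helper shown above recursively unwraps numpy scalars before DataFrame construction. A minimal sketch of that idea, assuming .item() is an acceptable conversion (the diff does not show the full helper body, so this is an illustration rather than the packaged code):

import numpy as np

def to_native(value):
    """Recursively convert numpy scalars (np.generic) and nested lists/tuples to plain Python types."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, (list, tuple)):
        return [to_native(v) for v in value]
    return value

to_native([np.float64(1.5), [np.int32(7), "x"]])  # [1.5, [7, 'x']]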
@@ -914,7 +935,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  except Exception:
  # Last resort: skip the column entirely
  continue
-
+
  # Add Object columns one by one
  for col, values in object_data.items():
  # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
@@ -993,9 +1014,7 @@ def _load_dataframe_from_group(
  )
  schema_section = schema.get(df_name, {}) if isinstance(schema, dict) else {}
  logger.debug(f"Schema section for {df_name}: {schema_section}")
- schema_columns = (
- schema_section.get("columns", []) if isinstance(schema_section, dict) else []
- )
+ schema_columns = schema_section.get("columns", []) if isinstance(schema_section, dict) else []
  logger.debug(f"Schema columns for {df_name}: {schema_columns}")
  if schema_columns is None:
  schema_columns = []
@@ -1158,11 +1177,7 @@ def _load_dataframe_from_group(
  }
  migrated_old_names = set(column_migrations.keys())

- extra_columns = [
- col
- for col in hdf5_columns
- if col not in (schema_columns or []) and col not in migrated_old_names
- ]
+ extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]

  for col in extra_columns:
  logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
@@ -1188,10 +1203,7 @@ def _load_dataframe_from_group(
  object_columns.append(col)
  else:
  # Regular string data
- data[col] = [
- item.decode("utf-8") if isinstance(item, bytes) else item
- for item in column_data
- ]
+ data[col] = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
  except Exception:
  # If decoding fails, treat as regular data
  data[col] = column_data
@@ -1204,19 +1216,10 @@ def _load_dataframe_from_group(
  # Handle byte string conversion for non-object columns
  # Only convert to strings for columns that should actually be strings
  for col, values in data.items():
- if (
- col not in object_columns
- and values is not None
- and len(values) > 0
- and isinstance(values[0], bytes)
- ):
+ if col not in object_columns and values is not None and len(values) > 0 and isinstance(values[0], bytes):
  # Check schema to see if this should be a string column
  should_be_string = False
- if (
- df_name in schema
- and "columns" in schema[df_name]
- and col in schema[df_name]["columns"]
- ):
+ if df_name in schema and "columns" in schema[df_name] and col in schema[df_name]["columns"]:
  dtype_str = schema[df_name]["columns"][col]["dtype"]
  should_be_string = dtype_str == "pl.Utf8"

@@ -1237,25 +1240,25 @@ def _load_dataframe_from_group(
  logger.debug(
  f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}",
  )
-
+
  # Debug: check for problematic data types in all columns before DataFrame creation
  for col, values in data.items():
- if hasattr(values, 'dtype') and str(values.dtype) == 'object':
+ if hasattr(values, "dtype") and str(values.dtype) == "object":
  logger.warning(f"Column '{col}' has numpy object dtype but is not in object_columns: {object_columns}")
  if col not in object_columns:
  object_columns.append(col)
-
+
  df = _create_dataframe_with_objects(data, object_columns)
  else:
  # Debug: check for problematic data types when no object columns are expected
  for col, values in data.items():
- if hasattr(values, 'dtype') and str(values.dtype) == 'object':
+ if hasattr(values, "dtype") and str(values.dtype) == "object":
  logger.warning(f"Column '{col}' has numpy object dtype but no object_columns specified!")
  # Treat as object column
  if object_columns is None:
  object_columns = []
  object_columns.append(col)
-
+
  if object_columns:
  df = _create_dataframe_with_objects(data, object_columns)
  else:
@@ -1302,34 +1305,21 @@ def _save_study5_compressed(self, filename):
  dataframes_to_save.append(("features", len(self.features_df)))
  if self.consensus_df is not None and not self.consensus_df.is_empty():
  dataframes_to_save.append(("consensus", len(self.consensus_df)))
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
  dataframes_to_save.append(
  ("consensus_mapping", len(self.consensus_mapping_df)),
  )
  if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
  dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
- if (
- hasattr(self, "lib_df")
- and self.lib_df is not None
- and not self.lib_df.is_empty()
- ):
+ if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
  dataframes_to_save.append(("lib", len(self.lib_df)))
- if (
- hasattr(self, "id_df")
- and self.id_df is not None
- and not self.id_df.is_empty()
- ):
+ if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
  dataframes_to_save.append(("id", len(self.id_df)))

  total_steps = len(dataframes_to_save) + 1 # +1 for metadata

  # Show progress for large saves
- tdqm_disable = (
- self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
- )
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2

  with tqdm(
  total=total_steps,
@@ -1347,14 +1337,8 @@ def _save_study5_compressed(self, filename):

  # Store metadata
  metadata_group.attrs["format"] = "masster-study-1"
- metadata_group.attrs["folder"] = (
- str(self.folder) if self.folder is not None else ""
- )
- metadata_group.attrs["label"] = (
- str(self.label)
- if hasattr(self, "label") and self.label is not None
- else ""
- )
+ metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
+ metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""

  # Store parameters as JSON
  if hasattr(self, "parameters") and self.history is not None:
@@ -1419,10 +1403,7 @@ def _save_study5_compressed(self, filename):
  pbar.update(1)

  # Store consensus_mapping_df - keep existing fast method
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
  consensus_mapping = self.consensus_mapping_df.clone()
  self.logger.debug(
  f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
@@ -1458,11 +1439,7 @@ def _save_study5_compressed(self, filename):
  pbar.update(1)

  # Store lib_df - library data
- if (
- hasattr(self, "lib_df")
- and self.lib_df is not None
- and not self.lib_df.is_empty()
- ):
+ if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
  self.logger.debug(
  f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
  )
@@ -1476,11 +1453,7 @@ def _save_study5_compressed(self, filename):
  pbar.update(1)

  # Store id_df - identification results
- if (
- hasattr(self, "id_df")
- and self.id_df is not None
- and not self.id_df.is_empty()
- ):
+ if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
  self.logger.debug(
  f"Saving id_df with {len(self.id_df)} rows using optimized method",
  )
@@ -1636,34 +1609,21 @@ def _save_study5(self, filename):
  dataframes_to_save.append(("features", len(self.features_df)))
  if self.consensus_df is not None and not self.consensus_df.is_empty():
  dataframes_to_save.append(("consensus", len(self.consensus_df)))
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
  dataframes_to_save.append(
  ("consensus_mapping", len(self.consensus_mapping_df)),
  )
  if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
  dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
- if (
- hasattr(self, "lib_df")
- and self.lib_df is not None
- and not self.lib_df.is_empty()
- ):
+ if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
  dataframes_to_save.append(("lib", len(self.lib_df)))
- if (
- hasattr(self, "id_df")
- and self.id_df is not None
- and not self.id_df.is_empty()
- ):
+ if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
  dataframes_to_save.append(("id", len(self.id_df)))

  total_steps = len(dataframes_to_save) + 1 # +1 for metadata

  # Show progress for large saves
- tdqm_disable = (
- self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
- )
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2

  with tqdm(
  total=total_steps,
@@ -1681,14 +1641,8 @@ def _save_study5(self, filename):

  # Store metadata
  metadata_group.attrs["format"] = "masster-study-1"
- metadata_group.attrs["folder"] = (
- str(self.folder) if self.folder is not None else ""
- )
- metadata_group.attrs["label"] = (
- str(self.label)
- if hasattr(self, "label") and self.label is not None
- else ""
- )
+ metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
+ metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""

  # Store parameters as JSON
  if hasattr(self, "parameters") and self.history is not None:
@@ -1756,10 +1710,7 @@ def _save_study5(self, filename):
  pbar.update(1)

  # Store consensus_mapping_df - keep existing fast method
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
  consensus_mapping = self.consensus_mapping_df.clone()
  self.logger.debug(
  f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
@@ -1795,11 +1746,7 @@ def _save_study5(self, filename):
  pbar.update(1)

  # Store lib_df - library data
- if (
- hasattr(self, "lib_df")
- and self.lib_df is not None
- and not self.lib_df.is_empty()
- ):
+ if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
  self.logger.debug(
  f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
  )
@@ -1813,11 +1760,7 @@ def _save_study5(self, filename):
  pbar.update(1)

  # Store id_df - identification results
- if (
- hasattr(self, "id_df")
- and self.id_df is not None
- and not self.id_df.is_empty()
- ):
+ if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
  self.logger.debug(
  f"Saving id_df with {len(self.id_df)} rows using optimized method",
  )
@@ -1896,12 +1839,7 @@ def _load_study5(self, filename=None):

  with h5py.File(filename, "r") as f:
  # Use progress bar to show loading progress
- with tqdm(
- total=len(loading_steps),
- desc="Loading study",
- disable=tdqm_disable,
- unit="step"
- ) as pbar:
+ with tqdm(total=len(loading_steps), desc="Loading study", disable=tdqm_disable, unit="step") as pbar:
  # Load metadata
  pbar.set_description(
  f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata",
@@ -1963,24 +1901,14 @@ def _load_study5(self, filename=None):
  # Synchronize instance attributes with parameters (similar to __init__)
  # Note: folder and label are already loaded from metadata attributes above
  # but we ensure they match the parameters for consistency
- if (
- hasattr(self.parameters, "folder")
- and self.parameters.folder is not None
- ):
+ if hasattr(self.parameters, "folder") and self.parameters.folder is not None:
  self.folder = self.parameters.folder
- if (
- hasattr(self.parameters, "label")
- and self.parameters.label is not None
- ):
+ if hasattr(self.parameters, "label") and self.parameters.label is not None:
  self.label = self.parameters.label
  if hasattr(self.parameters, "log_level"):
  self.log_level = self.parameters.log_level
  if hasattr(self.parameters, "log_label"):
- self.log_label = (
- self.parameters.log_label
- if self.parameters.log_label is not None
- else ""
- )
+ self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
  if hasattr(self.parameters, "log_sink"):
  self.log_sink = self.parameters.log_sink
  pbar.update(1)
@@ -2017,7 +1945,7 @@ def _load_study5(self, filename=None):
  self.logger,
  object_columns,
  )
-
+
  # Sanity check: replace any missing rt_original with rt values
  if self.features_df is not None and not self.features_df.is_empty():
  if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
@@ -2046,10 +1974,15 @@ def _load_study5(self, filename=None):
  f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus",
  )
  if "consensus" in f and len(f["consensus"].keys()) > 0:
- # Only include adducts in object_columns if it actually exists in the file
+ # Only include object columns if they actually exist in the file
  object_columns = []
- if "adducts" in f["consensus"]:
- object_columns.append("adducts")
+ try:
+ if "adducts" in f["consensus"]:
+ object_columns.append("adducts")
+ if "iso" in f["consensus"]:
+ object_columns.append("iso")
+ except (KeyError, TypeError):
+ pass

  self.consensus_df = _load_dataframe_from_group(
  f["consensus"],
@@ -2061,16 +1994,11 @@ def _load_study5(self, filename=None):

  # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
  if self.consensus_df is not None:
- if (
- "adducts" not in self.consensus_df.columns
- or self.consensus_df["adducts"].dtype == pl.Null
- ):
+ if "adducts" not in self.consensus_df.columns or self.consensus_df["adducts"].dtype == pl.Null:
  self.logger.info(
  "Adding missing 'adducts' column for backward compatibility",
  )
- empty_adducts: list[list] = [
- [] for _ in range(len(self.consensus_df))
- ]
+ empty_adducts: list[list] = [[] for _ in range(len(self.consensus_df))]

  # If column exists but is Null, drop it first
  if "adducts" in self.consensus_df.columns:
@@ -2163,11 +2091,7 @@ def _load_study5(self, filename=None):
  pbar.update(1)

  # Check and migrate old string-based map_id to integer indices
- if (
- self.samples_df is not None
- and not self.samples_df.is_empty()
- and self.samples_df["map_id"].dtype == pl.Utf8
- ):
+ if self.samples_df is not None and not self.samples_df.is_empty() and self.samples_df["map_id"].dtype == pl.Utf8:
  self.logger.info(
  "Detected old string-based map_id format, migrating to integer indices",
  )
@@ -2191,26 +2115,26 @@ def _load_study5(self, filename=None):
  _sanitize_nulls(self)

  self.logger.debug("Study loaded")
-
+

  def _load_ms1(self, filename: str) -> pl.DataFrame:
  """
  Optimized method to load only MS1 data from a sample5 file for isotope detection.
-
+
  This method efficiently loads only the ms1_df from a sample5 HDF5 file without
  loading other potentially large datasets like features_df, scans_df, etc.
-
+
  Args:
  sample_path (str): Path to the sample5 HDF5 file
-
+
  Returns:
- pl.DataFrame: MS1 data with columns [cycle, scan_uid, rt, mz, inty]
+ pl.DataFrame: MS1 data with columns [cycle, scan_uid, rt, mz, inty]
  Returns empty DataFrame if no MS1 data found or file cannot be read
-
+
  Note:
  Used by find_iso() for efficient isotope pattern detection without full sample loading
  """
- #try:
+ # try:
  # add .sample5 extension if not provided
  if not filename.endswith(".sample5"):
  filename += ".sample5"
@@ -2219,45 +2143,46 @@ def _load_ms1(self, filename: str) -> pl.DataFrame:
  if "ms1" not in f:
  self.logger.debug(f"No MS1 data found in {filename}")
  return pl.DataFrame()
-
+
  ms1_group = f["ms1"]
-
+
  # Load MS1 data efficiently
  ms1_data = {}
  for col in ms1_group.keys():
  ms1_data[col] = ms1_group[col][:]
-
+
  if not ms1_data:
  self.logger.debug(f"Empty MS1 data in {filename}")
  return pl.DataFrame()
-
+
  # Create DataFrame with proper schema
  ms1_df = pl.DataFrame(ms1_data)
-
+
  # Apply expected schema for MS1 data
  expected_schema = {
  "cycle": pl.Int64,
- "scan_uid": pl.Int64,
+ "scan_uid": pl.Int64,
  "rt": pl.Float64,
  "mz": pl.Float64,
- "inty": pl.Float64
+ "inty": pl.Float64,
  }
-
+
  # Cast columns to expected types if they exist
  cast_expressions = []
  for col, dtype in expected_schema.items():
  if col in ms1_df.columns:
  cast_expressions.append(pl.col(col).cast(dtype))
-
+
  if cast_expressions:
  ms1_df = ms1_df.with_columns(cast_expressions)
-
+
  self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {filename}")
  return ms1_df
-
- #except Exception as e:
- # self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
- # return pl.DataFrame()
+
+
+ # except Exception as e:
+ # self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+ # return pl.DataFrame()


  def _sanitize_nulls(self):
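For context on the _load_ms1 hunk above: the method opens the .sample5 file, reads only the "ms1" group, and casts columns to the documented schema. A condensed sketch of that flow, with the group layout and column names taken from the diff (this is an illustration, not the packaged implementation, and it omits masster's logging):

import h5py
import polars as pl

MS1_SCHEMA = {"cycle": pl.Int64, "scan_uid": pl.Int64, "rt": pl.Float64, "mz": pl.Float64, "inty": pl.Float64}

def load_ms1(path: str) -> pl.DataFrame:
    """Read only the 'ms1' group from a sample5 file; return an empty frame if it is absent."""
    if not path.endswith(".sample5"):
        path += ".sample5"
    with h5py.File(path, "r") as f:
        if "ms1" not in f:
            return pl.DataFrame()
        data = {col: f["ms1"][col][:] for col in f["ms1"].keys()}  # read each dataset into memory
    df = pl.DataFrame(data)
    casts = [pl.col(c).cast(t) for c, t in MS1_SCHEMA.items() if c in df.columns]
    return df.with_columns(casts) if casts else df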
@@ -2269,14 +2194,14 @@ def _sanitize_nulls(self):
  import uuid
  import polars as pl
  import time
-
+
  # Sanitize features_df feature_id column
- if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+ if hasattr(self, "features_df") and self.features_df is not None and not self.features_df.is_empty():
  # Check for null feature_ids
  null_feature_ids = self.features_df.filter(pl.col("feature_id").is_null()).shape[0]
  if null_feature_ids > 0:
  self.logger.debug(f"Sanitizing {null_feature_ids} null feature_id values with new integer IDs")
-
+
  # Find the maximum existing feature_id (convert strings to int if possible)
  max_existing_id = 0
  existing_ids = self.features_df.filter(pl.col("feature_id").is_not_null())["feature_id"].to_list()
@@ -2287,13 +2212,13 @@ def _sanitize_nulls(self):
  except (ValueError, TypeError):
  # Skip non-integer IDs
  pass
-
+
  # Generate new sequential integer IDs starting from max + timestamp offset
  # Use timestamp to ensure uniqueness across different sanitization runs
  base_id = max(max_existing_id + 1, int(time.time() * 1000000)) # Microsecond timestamp
  new_int_ids = [str(base_id + i) for i in range(null_feature_ids)]
  uid_index = 0
-
+
  # Create a list to store all feature_ids
  feature_ids = []
  for feature_id in self.features_df["feature_id"].to_list():
@@ -2302,25 +2227,23 @@ def _sanitize_nulls(self):
  uid_index += 1
  else:
  feature_ids.append(feature_id)
-
+
  # Update the DataFrame with sanitized feature_ids
- self.features_df = self.features_df.with_columns(
- pl.Series("feature_id", feature_ids, dtype=pl.Utf8)
- )
-
+ self.features_df = self.features_df.with_columns(pl.Series("feature_id", feature_ids, dtype=pl.Utf8))
+
  self.logger.debug(f"Sanitized {null_feature_ids} feature_id values")
-
+
  # Sanitize consensus_df consensus_id column
- if hasattr(self, 'consensus_df') and self.consensus_df is not None and not self.consensus_df.is_empty():
+ if hasattr(self, "consensus_df") and self.consensus_df is not None and not self.consensus_df.is_empty():
  if "consensus_id" in self.consensus_df.columns:
  null_consensus_ids = self.consensus_df.filter(pl.col("consensus_id").is_null()).shape[0]
  if null_consensus_ids > 0:
  self.logger.debug(f"Sanitizing {null_consensus_ids} null consensus_id values with new UIDs")
-
+
  # Generate new UIDs for null values using the same method as merge()
- new_uids = [str(uuid.uuid4()).replace('-', '')[:16] for _ in range(null_consensus_ids)]
+ new_uids = [str(uuid.uuid4()).replace("-", "")[:16] for _ in range(null_consensus_ids)]
  uid_index = 0
-
+
  # Create a list to store all consensus_ids
  consensus_ids = []
  for consensus_id in self.consensus_df["consensus_id"].to_list():
@@ -2329,7 +2252,7 @@ def _sanitize_nulls(self):
  uid_index += 1
  else:
  consensus_ids.append(consensus_id)
-
+
  # Update the DataFrame with sanitized consensus_ids
  self.consensus_df = self.consensus_df.with_columns(
  pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
@@ -2338,7 +2261,7 @@ def _sanitize_nulls(self):
  self.logger.debug(f"Sanitized {null_consensus_ids} consensus_id values")

  # Sanitize rt_original in features_df by replacing null or NaN values with rt values
- if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+ if hasattr(self, "features_df") and self.features_df is not None and not self.features_df.is_empty():
  if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
  # Check for null or NaN values in rt_original
  null_or_nan_rt_original = self.features_df.filter(
@@ -2352,4 +2275,4 @@ def _sanitize_nulls(self):
  .otherwise(pl.col("rt_original"))
  .alias("rt_original")
  )
- self.logger.debug(f"Sanitized {null_or_nan_rt_original} rt_original values")
+ self.logger.debug(f"Sanitized {null_or_nan_rt_original} rt_original values")
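The _sanitize_nulls hunks above generate fresh IDs for null feature_id and consensus_id values. A small standalone sketch of the consensus-ID case, assuming a hypothetical helper named fill_null_ids; the 16-character hex UID scheme matches the diff, the rest is illustrative:

import uuid
import polars as pl

def fill_null_ids(df: pl.DataFrame, col: str = "consensus_id") -> pl.DataFrame:
    """Replace null IDs with fresh 16-character hex UIDs, one per null row."""
    ids = [
        value if value is not None else str(uuid.uuid4()).replace("-", "")[:16]
        for value in df[col].to_list()
    ]
    return df.with_columns(pl.Series(col, ids, dtype=pl.Utf8))

df = pl.DataFrame({"consensus_id": ["abc123", None, None]})
print(fill_null_ids(df))  # nulls replaced with unique hex strings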