masster 0.5.21__py3-none-any.whl → 0.5.23__py3-none-any.whl

This diff shows the content changes between two package versions that have been publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registry.

Potentially problematic release.


This version of masster might be problematic.

masster/study/h5.py CHANGED
@@ -61,18 +61,18 @@ def _create_empty_dataframe_from_schema(df_name: str, schema: dict) -> pl.DataFr
   if df_name not in schema:
   # Fallback to basic empty DataFrame if schema not found
   return pl.DataFrame()
-
+
   df_schema = schema[df_name]["columns"]
   empty_data = {}
   polars_schema = {}
-
+
   for col_name, col_info in df_schema.items():
   dtype_str = col_info["dtype"]
   # Convert string representation to actual Polars dtype
   if dtype_str == "pl.Int64":
   polars_dtype = pl.Int64
   elif dtype_str == "pl.Int32":
- polars_dtype = pl.Int32
+ polars_dtype = pl.Int32
   elif dtype_str == "pl.Float64":
   polars_dtype = pl.Float64
   elif dtype_str == "pl.Utf8":
@@ -88,10 +88,10 @@ def _create_empty_dataframe_from_schema(df_name: str, schema: dict) -> pl.DataFr
   else:
   # Fallback to string if unknown type
   polars_dtype = pl.String
-
+
   empty_data[col_name] = []
   polars_schema[col_name] = polars_dtype
-
+
   return pl.DataFrame(empty_data, schema=polars_schema)
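Both hunks above only strip trailing whitespace in `_create_empty_dataframe_from_schema`. For readers skimming the diff, the function's job is to map the stored dtype strings onto Polars dtypes and build an empty, correctly typed DataFrame. A minimal, self-contained sketch of that pattern (the schema layout and function name here are illustrative, not masster's exact code):

```python
import polars as pl

# Assumed schema shape, mirroring what the diff suggests:
# {"features": {"columns": {"mz": {"dtype": "pl.Float64"}, ...}}}
_DTYPE_MAP = {
    "pl.Int64": pl.Int64,
    "pl.Int32": pl.Int32,
    "pl.Float64": pl.Float64,
    "pl.Utf8": pl.Utf8,
}


def empty_df_from_schema(df_name: str, schema: dict) -> pl.DataFrame:
    """Build an empty DataFrame whose columns follow the stored schema."""
    if df_name not in schema:
        return pl.DataFrame()
    columns = schema[df_name]["columns"]
    polars_schema = {
        # Unknown dtype strings fall back to a plain string column.
        name: _DTYPE_MAP.get(info["dtype"], pl.String)
        for name, info in columns.items()
    }
    return pl.DataFrame({name: [] for name in polars_schema}, schema=polars_schema)
```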
@@ -313,7 +313,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
   serialized_chunk.append(json.dumps(item.tolist()))
   except (AttributeError, TypeError):
   # Fallback for non-numpy data
- serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+ serialized_chunk.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
   else:
   serialized_chunk.append("None")
   elif col_name == "ms1_spec":
@@ -325,7 +325,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
   serialized_chunk.append(json.dumps(item.tolist()))
   except (AttributeError, TypeError):
   # Fallback for non-numpy data
- serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+ serialized_chunk.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
   else:
   serialized_chunk.append("None")
   else:
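These two hunks (and the matching ones in `_save_dataframe_column_legacy` further down) only change quote style inside the serialization fallback. What that fallback does: each object cell is stored as JSON text, using `tolist()` for numpy arrays, a plain list for other iterables, an empty list otherwise, and the literal string `"None"` for missing values. A hedged, standalone sketch of that behavior (the function name is illustrative):

```python
import json

import numpy as np


def serialize_cell(item) -> str:
    """JSON-encode a single object cell the way the fallback path above does."""
    if item is None:
        return "None"
    try:
        return json.dumps(item.tolist())  # numpy arrays and numpy scalars
    except (AttributeError, TypeError):
        # Fallback for non-numpy data: any iterable becomes a list, else [].
        return json.dumps(list(item) if hasattr(item, "__iter__") else [])


print(serialize_cell(np.array([[100.05, 1200.0]])))  # [[100.05, 1200.0]]
print(serialize_cell((3, 4)))                        # [3, 4]
print(serialize_cell(None))                          # None
```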
@@ -392,10 +392,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
   )
   # Fallback to simple string conversion for this chunk
   chunk = data_list[chunk_start : chunk_start + chunk_size]
- results[chunk_start] = [
- str(item) if item is not None else "None"
- for item in chunk
- ]
+ results[chunk_start] = [str(item) if item is not None else "None" for item in chunk]

   # Reassemble in correct order
   for i in range(0, total_items, chunk_size):
@@ -598,7 +595,7 @@ def _save_dataframe_column_legacy(
   data_as_json_strings.append(json.dumps(item.tolist()))
   except (AttributeError, TypeError):
   # Fallback for non-numpy data
- data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+ data_as_json_strings.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
   else:
   data_as_json_strings.append("None")
   group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
@@ -612,7 +609,7 @@ def _save_dataframe_column_legacy(
   data_as_json_strings.append(json.dumps(item.tolist()))
   except (AttributeError, TypeError):
   # Fallback for non-numpy data
- data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+ data_as_json_strings.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
   else:
   data_as_json_strings.append("None")
   group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
@@ -712,9 +709,7 @@ def _reconstruct_object_column(data_col, col_name: str):
   "adduct": str(adduct_row[0]),
   "count": int(float(adduct_row[1])),
   "percentage": float(adduct_row[2]),
- "mass": float(adduct_row[3])
- if len(adduct_row) > 3
- else 0.0,
+ "mass": float(adduct_row[3]) if len(adduct_row) > 3 else 0.0,
   },
   )
   reconstructed_data.append(converted_adducts)
@@ -722,6 +717,7 @@ def _reconstruct_object_column(data_col, col_name: str):
   # Handle isotope patterns (numpy arrays with [mz, intensity] data)
   try:
   import numpy as np
+
   iso_data = json.loads(item)
   # Convert back to numpy array
   reconstructed_data.append(np.array(iso_data) if iso_data else None)
@@ -731,6 +727,7 @@ def _reconstruct_object_column(data_col, col_name: str):
   # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
   try:
   import numpy as np
+
   ms1_spec_data = json.loads(item)
   # Convert back to numpy array
   reconstructed_data.append(np.array(ms1_spec_data) if ms1_spec_data else None)
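The added blank lines above land in the reconstruction path, which reverses the serializer: JSON strings read back from HDF5 are decoded and, for spectrum-like columns such as `iso` and `ms1_spec`, turned back into numpy arrays. A minimal round-trip sketch (the helper name is illustrative):

```python
import json

import numpy as np


def reconstruct_spectrum(serialized: str):
    """Inverse of the fallback serializer: JSON text -> numpy array (or None)."""
    if serialized == "None":
        return None
    data = json.loads(serialized)
    return np.array(data) if data else None


spec = reconstruct_spectrum(json.dumps([[100.05, 1200.0], [101.05, 300.0]]))
print(spec.shape)  # (2, 2): rows of [mz, intensity]
```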
@@ -821,25 +818,25 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
   # First check all data for numpy object arrays and move them to object columns
   additional_object_cols = []
   for k, v in data.items():
- if k not in object_columns and hasattr(v, 'dtype') and str(v.dtype) == 'object':
+ if k not in object_columns and hasattr(v, "dtype") and str(v.dtype) == "object":
   # This is a numpy object array that should be treated as object
   additional_object_cols.append(k)
   object_columns.append(k)
-
+
   if additional_object_cols:
   # Re-run reconstruction for these columns
   for col in additional_object_cols:
   data[col] = _reconstruct_object_column(data[col], col)
-
+
   object_data = {k: v for k, v in data.items() if k in object_columns}
   regular_data = {k: v for k, v in data.items() if k not in object_columns}

   # Final check: ensure no numpy object arrays in regular_data
   problematic_cols = []
   for k, v in regular_data.items():
- if hasattr(v, 'dtype') and str(v.dtype) == 'object':
+ if hasattr(v, "dtype") and str(v.dtype) == "object":
   problematic_cols.append(k)
-
+
   if problematic_cols:
   # Move these to object_data
   for col in problematic_cols:
@@ -878,7 +875,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
   # and handle numpy scalars within lists
   safe_regular_data = {}
   import numpy as np
-
+
   def convert_numpy_scalars(value):
   """Convert numpy scalars to Python native types recursively."""
   if isinstance(value, np.generic):
@@ -887,17 +884,19 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
   return [convert_numpy_scalars(item) for item in value]
   else:
   return value
-
+
   for k, v in regular_data.items():
- if hasattr(v, 'dtype') and str(v.dtype) == 'object':
+ if hasattr(v, "dtype") and str(v.dtype) == "object":
   # Convert numpy object array to Python list
- safe_regular_data[k] = [convert_numpy_scalars(item) for item in (v.tolist() if hasattr(v, 'tolist') else list(v))]
+ safe_regular_data[k] = [
+ convert_numpy_scalars(item) for item in (v.tolist() if hasattr(v, "tolist") else list(v))
+ ]
   elif isinstance(v, list):
   # Handle lists that might contain numpy scalars
   safe_regular_data[k] = [convert_numpy_scalars(item) for item in v]
   else:
   safe_regular_data[k] = convert_numpy_scalars(v)
-
+
   # Create DataFrame with proper error handling
   try:
   df = pl.DataFrame(safe_regular_data)
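The `convert_numpy_scalars` helper visible in these hunks unwraps numpy scalar types before the data is handed to Polars. A self-contained sketch of the idea, assuming `.item()` for scalars and recursion over lists (the real helper may differ in detail):

```python
import numpy as np


def convert_numpy_scalars(value):
    """Recursively turn numpy scalars (np.generic) into native Python values."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, (list, tuple)):
        return [convert_numpy_scalars(item) for item in value]
    return value


print(convert_numpy_scalars([np.int64(3), [np.float32(1.5)], "x"]))  # [3, [1.5], 'x']
```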
@@ -914,7 +913,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
   except Exception:
   # Last resort: skip the column entirely
   continue
-
+
   # Add Object columns one by one
   for col, values in object_data.items():
   # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
@@ -993,9 +992,7 @@ def _load_dataframe_from_group(
   )
   schema_section = schema.get(df_name, {}) if isinstance(schema, dict) else {}
   logger.debug(f"Schema section for {df_name}: {schema_section}")
- schema_columns = (
- schema_section.get("columns", []) if isinstance(schema_section, dict) else []
- )
+ schema_columns = schema_section.get("columns", []) if isinstance(schema_section, dict) else []
   logger.debug(f"Schema columns for {df_name}: {schema_columns}")
   if schema_columns is None:
   schema_columns = []
@@ -1158,11 +1155,7 @@ def _load_dataframe_from_group(
   }
   migrated_old_names = set(column_migrations.keys())

- extra_columns = [
- col
- for col in hdf5_columns
- if col not in (schema_columns or []) and col not in migrated_old_names
- ]
+ extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]

   for col in extra_columns:
   logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
@@ -1188,10 +1181,7 @@ def _load_dataframe_from_group(
   object_columns.append(col)
   else:
   # Regular string data
- data[col] = [
- item.decode("utf-8") if isinstance(item, bytes) else item
- for item in column_data
- ]
+ data[col] = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
   except Exception:
   # If decoding fails, treat as regular data
   data[col] = column_data
@@ -1204,19 +1194,10 @@ def _load_dataframe_from_group(
   # Handle byte string conversion for non-object columns
   # Only convert to strings for columns that should actually be strings
   for col, values in data.items():
- if (
- col not in object_columns
- and values is not None
- and len(values) > 0
- and isinstance(values[0], bytes)
- ):
+ if col not in object_columns and values is not None and len(values) > 0 and isinstance(values[0], bytes):
   # Check schema to see if this should be a string column
   should_be_string = False
- if (
- df_name in schema
- and "columns" in schema[df_name]
- and col in schema[df_name]["columns"]
- ):
+ if df_name in schema and "columns" in schema[df_name] and col in schema[df_name]["columns"]:
   dtype_str = schema[df_name]["columns"][col]["dtype"]
   should_be_string = dtype_str == "pl.Utf8"
@@ -1237,25 +1218,25 @@ def _load_dataframe_from_group(
   logger.debug(
   f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}",
   )
-
+
   # Debug: check for problematic data types in all columns before DataFrame creation
   for col, values in data.items():
- if hasattr(values, 'dtype') and str(values.dtype) == 'object':
+ if hasattr(values, "dtype") and str(values.dtype) == "object":
   logger.warning(f"Column '{col}' has numpy object dtype but is not in object_columns: {object_columns}")
   if col not in object_columns:
   object_columns.append(col)
-
+
   df = _create_dataframe_with_objects(data, object_columns)
   else:
   # Debug: check for problematic data types when no object columns are expected
   for col, values in data.items():
- if hasattr(values, 'dtype') and str(values.dtype) == 'object':
+ if hasattr(values, "dtype") and str(values.dtype) == "object":
   logger.warning(f"Column '{col}' has numpy object dtype but no object_columns specified!")
   # Treat as object column
   if object_columns is None:
   object_columns = []
   object_columns.append(col)
-
+
   if object_columns:
   df = _create_dataframe_with_objects(data, object_columns)
   else:
@@ -1302,34 +1283,21 @@ def _save_study5_compressed(self, filename):
   dataframes_to_save.append(("features", len(self.features_df)))
   if self.consensus_df is not None and not self.consensus_df.is_empty():
   dataframes_to_save.append(("consensus", len(self.consensus_df)))
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
   dataframes_to_save.append(
   ("consensus_mapping", len(self.consensus_mapping_df)),
   )
   if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
   dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
- if (
- hasattr(self, "lib_df")
- and self.lib_df is not None
- and not self.lib_df.is_empty()
- ):
+ if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
   dataframes_to_save.append(("lib", len(self.lib_df)))
- if (
- hasattr(self, "id_df")
- and self.id_df is not None
- and not self.id_df.is_empty()
- ):
+ if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
   dataframes_to_save.append(("id", len(self.id_df)))

   total_steps = len(dataframes_to_save) + 1 # +1 for metadata

   # Show progress for large saves
- tdqm_disable = (
- self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
- )
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2

   with tqdm(
   total=total_steps,
@@ -1347,14 +1315,8 @@ def _save_study5_compressed(self, filename):

   # Store metadata
   metadata_group.attrs["format"] = "masster-study-1"
- metadata_group.attrs["folder"] = (
- str(self.folder) if self.folder is not None else ""
- )
- metadata_group.attrs["label"] = (
- str(self.label)
- if hasattr(self, "label") and self.label is not None
- else ""
- )
+ metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
+ metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""

   # Store parameters as JSON
   if hasattr(self, "parameters") and self.history is not None:
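The metadata hunk above collapses multi-line conditional expressions into single lines; the surrounding save path stores metadata as HDF5 attributes and each DataFrame column as a compressed dataset. A rough, illustrative sketch of that kind of layout with h5py and Polars (the file name, group names, and compression settings are assumptions, not masster's exact choices):

```python
import json

import h5py
import polars as pl

# Toy stand-in for one of the study DataFrames.
df = pl.DataFrame({"mz": [100.05, 101.05], "inty": [1200.0, 300.0]})

with h5py.File("example_study.h5", "w") as f:
    meta = f.create_group("metadata")
    meta.attrs["format"] = "masster-study-1"
    meta.attrs["label"] = "demo"
    meta.create_dataset("parameters", data=json.dumps({"log_level": "INFO"}))

    grp = f.create_group("features")
    for col in df.columns:
        # One compressed dataset per column, mirroring the column-wise save.
        grp.create_dataset(col, data=df[col].to_numpy(), compression="gzip")
```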
@@ -1419,10 +1381,7 @@ def _save_study5_compressed(self, filename):
   pbar.update(1)

   # Store consensus_mapping_df - keep existing fast method
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
   consensus_mapping = self.consensus_mapping_df.clone()
   self.logger.debug(
   f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
@@ -1458,11 +1417,7 @@ def _save_study5_compressed(self, filename):
   pbar.update(1)

   # Store lib_df - library data
- if (
- hasattr(self, "lib_df")
- and self.lib_df is not None
- and not self.lib_df.is_empty()
- ):
+ if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
   self.logger.debug(
   f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
   )
@@ -1476,11 +1431,7 @@ def _save_study5_compressed(self, filename):
   pbar.update(1)

   # Store id_df - identification results
- if (
- hasattr(self, "id_df")
- and self.id_df is not None
- and not self.id_df.is_empty()
- ):
+ if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
   self.logger.debug(
   f"Saving id_df with {len(self.id_df)} rows using optimized method",
   )
@@ -1636,34 +1587,21 @@ def _save_study5(self, filename):
   dataframes_to_save.append(("features", len(self.features_df)))
   if self.consensus_df is not None and not self.consensus_df.is_empty():
   dataframes_to_save.append(("consensus", len(self.consensus_df)))
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
   dataframes_to_save.append(
   ("consensus_mapping", len(self.consensus_mapping_df)),
   )
   if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
   dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
- if (
- hasattr(self, "lib_df")
- and self.lib_df is not None
- and not self.lib_df.is_empty()
- ):
+ if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
   dataframes_to_save.append(("lib", len(self.lib_df)))
- if (
- hasattr(self, "id_df")
- and self.id_df is not None
- and not self.id_df.is_empty()
- ):
+ if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
   dataframes_to_save.append(("id", len(self.id_df)))

   total_steps = len(dataframes_to_save) + 1 # +1 for metadata

   # Show progress for large saves
- tdqm_disable = (
- self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
- )
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2

   with tqdm(
   total=total_steps,
@@ -1681,14 +1619,8 @@ def _save_study5(self, filename):

   # Store metadata
   metadata_group.attrs["format"] = "masster-study-1"
- metadata_group.attrs["folder"] = (
- str(self.folder) if self.folder is not None else ""
- )
- metadata_group.attrs["label"] = (
- str(self.label)
- if hasattr(self, "label") and self.label is not None
- else ""
- )
+ metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
+ metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""

   # Store parameters as JSON
   if hasattr(self, "parameters") and self.history is not None:
@@ -1756,10 +1688,7 @@ def _save_study5(self, filename):
   pbar.update(1)

   # Store consensus_mapping_df - keep existing fast method
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
   consensus_mapping = self.consensus_mapping_df.clone()
   self.logger.debug(
   f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
@@ -1795,11 +1724,7 @@ def _save_study5(self, filename):
   pbar.update(1)

   # Store lib_df - library data
- if (
- hasattr(self, "lib_df")
- and self.lib_df is not None
- and not self.lib_df.is_empty()
- ):
+ if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
   self.logger.debug(
   f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
   )
@@ -1813,11 +1738,7 @@ def _save_study5(self, filename):
   pbar.update(1)

   # Store id_df - identification results
- if (
- hasattr(self, "id_df")
- and self.id_df is not None
- and not self.id_df.is_empty()
- ):
+ if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
   self.logger.debug(
   f"Saving id_df with {len(self.id_df)} rows using optimized method",
   )
@@ -1896,12 +1817,7 @@ def _load_study5(self, filename=None):

   with h5py.File(filename, "r") as f:
   # Use progress bar to show loading progress
- with tqdm(
- total=len(loading_steps),
- desc="Loading study",
- disable=tdqm_disable,
- unit="step"
- ) as pbar:
+ with tqdm(total=len(loading_steps), desc="Loading study", disable=tdqm_disable, unit="step") as pbar:
   # Load metadata
   pbar.set_description(
   f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata",
@@ -1963,24 +1879,14 @@ def _load_study5(self, filename=None):
   # Synchronize instance attributes with parameters (similar to __init__)
   # Note: folder and label are already loaded from metadata attributes above
   # but we ensure they match the parameters for consistency
- if (
- hasattr(self.parameters, "folder")
- and self.parameters.folder is not None
- ):
+ if hasattr(self.parameters, "folder") and self.parameters.folder is not None:
   self.folder = self.parameters.folder
- if (
- hasattr(self.parameters, "label")
- and self.parameters.label is not None
- ):
+ if hasattr(self.parameters, "label") and self.parameters.label is not None:
   self.label = self.parameters.label
   if hasattr(self.parameters, "log_level"):
   self.log_level = self.parameters.log_level
   if hasattr(self.parameters, "log_label"):
- self.log_label = (
- self.parameters.log_label
- if self.parameters.log_label is not None
- else ""
- )
+ self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
   if hasattr(self.parameters, "log_sink"):
   self.log_sink = self.parameters.log_sink
   pbar.update(1)
@@ -2017,7 +1923,7 @@ def _load_study5(self, filename=None):
   self.logger,
   object_columns,
   )
-
+
   # Sanity check: replace any missing rt_original with rt values
   if self.features_df is not None and not self.features_df.is_empty():
   if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
@@ -2061,16 +1967,11 @@ def _load_study5(self, filename=None):

   # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
   if self.consensus_df is not None:
- if (
- "adducts" not in self.consensus_df.columns
- or self.consensus_df["adducts"].dtype == pl.Null
- ):
+ if "adducts" not in self.consensus_df.columns or self.consensus_df["adducts"].dtype == pl.Null:
   self.logger.info(
   "Adding missing 'adducts' column for backward compatibility",
   )
- empty_adducts: list[list] = [
- [] for _ in range(len(self.consensus_df))
- ]
+ empty_adducts: list[list] = [[] for _ in range(len(self.consensus_df))]

   # If column exists but is Null, drop it first
   if "adducts" in self.consensus_df.columns:
@@ -2163,11 +2064,7 @@ def _load_study5(self, filename=None):
   pbar.update(1)

   # Check and migrate old string-based map_id to integer indices
- if (
- self.samples_df is not None
- and not self.samples_df.is_empty()
- and self.samples_df["map_id"].dtype == pl.Utf8
- ):
+ if self.samples_df is not None and not self.samples_df.is_empty() and self.samples_df["map_id"].dtype == pl.Utf8:
   self.logger.info(
   "Detected old string-based map_id format, migrating to integer indices",
   )
@@ -2191,26 +2088,26 @@ def _load_study5(self, filename=None):
   _sanitize_nulls(self)

   self.logger.debug("Study loaded")
-
+

   def _load_ms1(self, filename: str) -> pl.DataFrame:
   """
   Optimized method to load only MS1 data from a sample5 file for isotope detection.
-
+
   This method efficiently loads only the ms1_df from a sample5 HDF5 file without
   loading other potentially large datasets like features_df, scans_df, etc.
-
+
   Args:
   sample_path (str): Path to the sample5 HDF5 file
-
+
   Returns:
- pl.DataFrame: MS1 data with columns [cycle, scan_uid, rt, mz, inty]
+ pl.DataFrame: MS1 data with columns [cycle, scan_uid, rt, mz, inty]
   Returns empty DataFrame if no MS1 data found or file cannot be read
-
+
   Note:
   Used by find_iso() for efficient isotope pattern detection without full sample loading
   """
- #try:
+ # try:
   # add .sample5 extension if not provided
   if not filename.endswith(".sample5"):
   filename += ".sample5"
@@ -2219,45 +2116,46 @@ def _load_ms1(self, filename: str) -> pl.DataFrame:
   if "ms1" not in f:
   self.logger.debug(f"No MS1 data found in {filename}")
   return pl.DataFrame()
-
+
   ms1_group = f["ms1"]
-
+
   # Load MS1 data efficiently
   ms1_data = {}
   for col in ms1_group.keys():
   ms1_data[col] = ms1_group[col][:]
-
+
   if not ms1_data:
   self.logger.debug(f"Empty MS1 data in {filename}")
   return pl.DataFrame()
-
+
   # Create DataFrame with proper schema
   ms1_df = pl.DataFrame(ms1_data)
-
+
   # Apply expected schema for MS1 data
   expected_schema = {
   "cycle": pl.Int64,
- "scan_uid": pl.Int64,
+ "scan_uid": pl.Int64,
   "rt": pl.Float64,
   "mz": pl.Float64,
- "inty": pl.Float64
+ "inty": pl.Float64,
   }
-
+
   # Cast columns to expected types if they exist
   cast_expressions = []
   for col, dtype in expected_schema.items():
   if col in ms1_df.columns:
   cast_expressions.append(pl.col(col).cast(dtype))
-
+
   if cast_expressions:
   ms1_df = ms1_df.with_columns(cast_expressions)
-
+
   self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {filename}")
   return ms1_df
-
- #except Exception as e:
- # self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
- # return pl.DataFrame()
+
+
+ # except Exception as e:
+ # self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+ # return pl.DataFrame()


   def _sanitize_nulls(self):
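`_load_ms1` reads only the `ms1` group from a sample5 file and casts the columns to the expected schema. A condensed, illustrative sketch of the same pattern as a standalone function (not masster's API):

```python
import h5py
import polars as pl

EXPECTED = {
    "cycle": pl.Int64,
    "scan_uid": pl.Int64,
    "rt": pl.Float64,
    "mz": pl.Float64,
    "inty": pl.Float64,
}


def load_ms1(path: str) -> pl.DataFrame:
    """Load only the 'ms1' group from an HDF5 file into a typed Polars DataFrame."""
    with h5py.File(path, "r") as f:
        if "ms1" not in f:
            return pl.DataFrame()
        data = {col: f["ms1"][col][:] for col in f["ms1"].keys()}
    if not data:
        return pl.DataFrame()
    df = pl.DataFrame(data)
    casts = [pl.col(c).cast(t) for c, t in EXPECTED.items() if c in df.columns]
    return df.with_columns(casts) if casts else df
```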
@@ -2269,14 +2167,14 @@ def _sanitize_nulls(self):
   import uuid
   import polars as pl
   import time
-
+
   # Sanitize features_df feature_id column
- if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+ if hasattr(self, "features_df") and self.features_df is not None and not self.features_df.is_empty():
   # Check for null feature_ids
   null_feature_ids = self.features_df.filter(pl.col("feature_id").is_null()).shape[0]
   if null_feature_ids > 0:
   self.logger.debug(f"Sanitizing {null_feature_ids} null feature_id values with new integer IDs")
-
+
   # Find the maximum existing feature_id (convert strings to int if possible)
   max_existing_id = 0
   existing_ids = self.features_df.filter(pl.col("feature_id").is_not_null())["feature_id"].to_list()
@@ -2287,13 +2185,13 @@ def _sanitize_nulls(self):
   except (ValueError, TypeError):
   # Skip non-integer IDs
   pass
-
+
   # Generate new sequential integer IDs starting from max + timestamp offset
   # Use timestamp to ensure uniqueness across different sanitization runs
   base_id = max(max_existing_id + 1, int(time.time() * 1000000)) # Microsecond timestamp
   new_int_ids = [str(base_id + i) for i in range(null_feature_ids)]
   uid_index = 0
-
+
   # Create a list to store all feature_ids
   feature_ids = []
   for feature_id in self.features_df["feature_id"].to_list():
@@ -2302,25 +2200,23 @@ def _sanitize_nulls(self):
   uid_index += 1
   else:
   feature_ids.append(feature_id)
-
+
   # Update the DataFrame with sanitized feature_ids
- self.features_df = self.features_df.with_columns(
- pl.Series("feature_id", feature_ids, dtype=pl.Utf8)
- )
-
+ self.features_df = self.features_df.with_columns(pl.Series("feature_id", feature_ids, dtype=pl.Utf8))
+
   self.logger.debug(f"Sanitized {null_feature_ids} feature_id values")
-
+
   # Sanitize consensus_df consensus_id column
- if hasattr(self, 'consensus_df') and self.consensus_df is not None and not self.consensus_df.is_empty():
+ if hasattr(self, "consensus_df") and self.consensus_df is not None and not self.consensus_df.is_empty():
   if "consensus_id" in self.consensus_df.columns:
   null_consensus_ids = self.consensus_df.filter(pl.col("consensus_id").is_null()).shape[0]
   if null_consensus_ids > 0:
   self.logger.debug(f"Sanitizing {null_consensus_ids} null consensus_id values with new UIDs")
-
+
   # Generate new UIDs for null values using the same method as merge()
- new_uids = [str(uuid.uuid4()).replace('-', '')[:16] for _ in range(null_consensus_ids)]
+ new_uids = [str(uuid.uuid4()).replace("-", "")[:16] for _ in range(null_consensus_ids)]
   uid_index = 0
-
+
   # Create a list to store all consensus_ids
   consensus_ids = []
   for consensus_id in self.consensus_df["consensus_id"].to_list():
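The consensus branch of `_sanitize_nulls` backfills null IDs with fresh 16-character UIDs, generated exactly as the changed line shows. A small, hedged example of that backfill on a toy DataFrame:

```python
import uuid

import polars as pl

df = pl.DataFrame({"consensus_id": ["abc123", None, None]})

# One new 16-character UID per null entry; existing IDs are left untouched.
new_uids = iter(
    str(uuid.uuid4()).replace("-", "")[:16]
    for _ in range(df["consensus_id"].null_count())
)
ids = [cid if cid is not None else next(new_uids) for cid in df["consensus_id"].to_list()]
df = df.with_columns(pl.Series("consensus_id", ids, dtype=pl.Utf8))
print(df)
```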
@@ -2329,7 +2225,7 @@ def _sanitize_nulls(self):
   uid_index += 1
   else:
   consensus_ids.append(consensus_id)
-
+
   # Update the DataFrame with sanitized consensus_ids
   self.consensus_df = self.consensus_df.with_columns(
   pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
@@ -2338,7 +2234,7 @@ def _sanitize_nulls(self):
   self.logger.debug(f"Sanitized {null_consensus_ids} consensus_id values")

   # Sanitize rt_original in features_df by replacing null or NaN values with rt values
- if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+ if hasattr(self, "features_df") and self.features_df is not None and not self.features_df.is_empty():
   if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
   # Check for null or NaN values in rt_original
   null_or_nan_rt_original = self.features_df.filter(
@@ -2352,4 +2248,4 @@ def _sanitize_nulls(self):
   .otherwise(pl.col("rt_original"))
   .alias("rt_original")
   )
- self.logger.debug(f"Sanitized {null_or_nan_rt_original} rt_original values")
+ self.logger.debug(f"Sanitized {null_or_nan_rt_original} rt_original values")