masster 0.4.3-py3-none-any.whl → 0.4.5-py3-none-any.whl

This diff shows the contents of publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the changes between the two published versions.

Potentially problematic release.


This version of masster might be problematic.

Files changed (54)
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +1 -1
  4. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
  5. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
  6. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
  7. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
  8. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
  9. masster/data/libs/__pycache__/ccm.cpython-312.pyc +0 -0
  10. masster/data/libs/__pycache__/urine.cpython-312.pyc +0 -0
  11. masster/data/libs/ccm.csv +120 -0
  12. masster/data/libs/urine.csv +4693 -0
  13. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  14. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  15. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  16. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  17. masster/logger.py +11 -11
  18. masster/sample/__init__.py +1 -1
  19. masster/sample/adducts.py +338 -264
  20. masster/sample/defaults/find_adducts_def.py +21 -8
  21. masster/sample/h5.py +561 -282
  22. masster/sample/helpers.py +131 -75
  23. masster/sample/lib.py +4 -4
  24. masster/sample/load.py +31 -17
  25. masster/sample/parameters.py +1 -1
  26. masster/sample/plot.py +7 -7
  27. masster/sample/processing.py +117 -87
  28. masster/sample/sample.py +103 -90
  29. masster/sample/sample5_schema.json +196 -0
  30. masster/sample/save.py +35 -12
  31. masster/spectrum.py +1 -1
  32. masster/study/__init__.py +1 -1
  33. masster/study/defaults/align_def.py +5 -1
  34. masster/study/defaults/identify_def.py +3 -1
  35. masster/study/defaults/study_def.py +58 -25
  36. masster/study/export.py +360 -210
  37. masster/study/h5.py +560 -158
  38. masster/study/helpers.py +496 -203
  39. masster/study/helpers_optimized.py +1 -1
  40. masster/study/id.py +538 -349
  41. masster/study/load.py +233 -143
  42. masster/study/plot.py +71 -71
  43. masster/study/processing.py +456 -254
  44. masster/study/save.py +15 -5
  45. masster/study/study.py +213 -131
  46. masster/study/study5_schema.json +360 -0
  47. masster-0.4.5.dist-info/METADATA +131 -0
  48. masster-0.4.5.dist-info/RECORD +71 -0
  49. masster-0.4.3.dist-info/METADATA +0 -791
  50. masster-0.4.3.dist-info/RECORD +0 -56
  51. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/WHEEL +0 -0
  52. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/entry_points.txt +0 -0
  53. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/licenses/LICENSE +0 -0
  54. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/top_level.txt +0 -0
masster/study/helpers.py CHANGED
@@ -22,7 +22,7 @@ import pandas as pd
22
22
  import polars as pl
23
23
 
24
24
  from tqdm import tqdm
25
- from masster.chromatogram import Chromatogram
25
+ from master.chromatogram import Chromatogram
26
26
 
27
27
 
28
28
  # =====================================================================================
@@ -71,7 +71,12 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
71
71
  # fallback to pandas
72
72
  try:
73
73
  bpc_pd = s.ms1_df.to_pandas()[["rt", "inty"]]
74
- bpc_pd = bpc_pd.groupby("rt").agg({"inty": "max"}).reset_index().sort_values("rt")
74
+ bpc_pd = (
75
+ bpc_pd.groupby("rt")
76
+ .agg({"inty": "max"})
77
+ .reset_index()
78
+ .sort_values("rt")
79
+ )
75
80
  except Exception:
76
81
  raise
77
82
 
@@ -113,11 +118,16 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
113
118
  mapping_rows = pl.DataFrame()
114
119
 
115
120
  # If we still have no sample selector, try to infer sample from the Sample object s
116
- if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(s, "sample_path"):
121
+ if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(
122
+ s,
123
+ "sample_path",
124
+ ):
117
125
  # attempt to match by sample_path or file name
118
126
  try:
119
127
  # find row where sample_path matches
120
- mapping_rows = feats.filter(pl.col("sample_path") == getattr(s, "file", None))
128
+ mapping_rows = feats.filter(
129
+ pl.col("sample_path") == getattr(s, "file", None),
130
+ )
121
131
  except Exception:
122
132
  mapping_rows = pl.DataFrame()
123
133
 
@@ -204,7 +214,9 @@ def get_tic(owner, sample=None, label=None):
204
214
  except Exception:
205
215
  raise
206
216
  else:
207
- raise ValueError("Neither ms1_df nor scans_df available for TIC computation")
217
+ raise ValueError(
218
+ "Neither ms1_df nor scans_df available for TIC computation",
219
+ )
208
220
 
209
221
  if tic_pd.empty:
210
222
  raise ValueError("Computed TIC is empty")
@@ -367,14 +379,17 @@ def get_chrom(self, uids=None, samples=None):
367
379
  )
368
380
  # Pre-filter features_df to only relevant features and samples
369
381
  filtered_features = self.features_df.filter(
370
- pl.col("feature_uid").is_in(relevant_feature_uids) & pl.col("sample_uid").is_in(sample_uids),
371
- ).select([
372
- "feature_uid",
373
- "chrom",
374
- "rt",
375
- "rt_original",
376
- "sample_uid",
377
- ])
382
+ pl.col("feature_uid").is_in(relevant_feature_uids)
383
+ & pl.col("sample_uid").is_in(sample_uids),
384
+ ).select(
385
+ [
386
+ "feature_uid",
387
+ "chrom",
388
+ "rt",
389
+ "rt_original",
390
+ "sample_uid",
391
+ ],
392
+ )
378
393
 
379
394
  # Pre-filter samples_df
380
395
  filtered_samples = self.samples_df.filter(
@@ -409,11 +424,13 @@ def get_chrom(self, uids=None, samples=None):
409
424
  # Create a mapping dictionary for O(1) lookup instead of O(n) filtering
410
425
  self.logger.debug("Creating lookup dictionary for chromatogram objects.")
411
426
  chrom_lookup = {}
412
- for row in df_combined.select([
413
- "consensus_uid",
414
- "sample_name",
415
- "chrom",
416
- ]).iter_rows():
427
+ for row in df_combined.select(
428
+ [
429
+ "consensus_uid",
430
+ "sample_name",
431
+ "chrom",
432
+ ],
433
+ ).iter_rows():
417
434
  key = (row[0], row[1]) # (consensus_uid, sample_name)
418
435
  chrom_lookup[key] = row[2] # chrom object
419
436
 
@@ -532,7 +549,9 @@ def get_consensus_matrix(self, quant="chrom_area"):
532
549
 
533
550
  # Build consensus matrix directly using the consensus_mapping_df
534
551
  matrix_dict = {}
535
- sample_mapping = dict(self.samples_df.select(["sample_uid", "sample_name"]).iter_rows())
552
+ sample_mapping = dict(
553
+ self.samples_df.select(["sample_uid", "sample_name"]).iter_rows(),
554
+ )
536
555
 
537
556
  for row in self.consensus_mapping_df.iter_rows(named=True):
538
557
  consensus_uid = row["consensus_uid"]
@@ -550,7 +569,10 @@ def get_consensus_matrix(self, quant="chrom_area"):
550
569
 
551
570
  # Take max if multiple features map to same consensus/sample combination
552
571
  if sample_name in matrix_dict[consensus_uid]:
553
- matrix_dict[consensus_uid][sample_name] = max(matrix_dict[consensus_uid][sample_name], value)
572
+ matrix_dict[consensus_uid][sample_name] = max(
573
+ matrix_dict[consensus_uid][sample_name],
574
+ value,
575
+ )
554
576
  else:
555
577
  matrix_dict[consensus_uid][sample_name] = value
556
578
 
@@ -569,10 +591,12 @@ def get_consensus_matrix(self, quant="chrom_area"):
569
591
 
570
592
  # Fill null values with 0 and round numeric columns
571
593
  numeric_cols = [col for col in df2.columns if col != "consensus_uid"]
572
- df2 = df2.with_columns([
573
- pl.col("consensus_uid").cast(pl.UInt64),
574
- *[pl.col(col).fill_null(0).round(0) for col in numeric_cols],
575
- ])
594
+ df2 = df2.with_columns(
595
+ [
596
+ pl.col("consensus_uid").cast(pl.UInt64),
597
+ *[pl.col(col).fill_null(0).round(0) for col in numeric_cols],
598
+ ],
599
+ )
576
600
 
577
601
  return df2
578
602
 
@@ -792,7 +816,7 @@ def get_sample(self, sample):
792
816
 
793
817
  This helper mirrors the original Study.get_sample method but lives in helpers for reuse.
794
818
  """
795
- from masster.sample.sample import Sample
819
+ from master.sample.sample import Sample
796
820
 
797
821
  if isinstance(sample, Sample):
798
822
  return sample
@@ -802,7 +826,9 @@ def get_sample(self, sample):
802
826
  elif isinstance(sample, str):
803
827
  rows = self.samples_df.filter(pl.col("sample_name") == sample)
804
828
  else:
805
- raise ValueError("sample must be an int (sample_uid), str (sample_name) or a Sample instance")
829
+ raise ValueError(
830
+ "sample must be an int (sample_uid), str (sample_name) or a Sample instance",
831
+ )
806
832
 
807
833
  if rows.is_empty():
808
834
  raise KeyError(f"Sample not found: {sample}")
@@ -836,7 +862,9 @@ def get_orphans(self):
836
862
  Get all features that are not in the consensus mapping.
837
863
  """
838
864
  not_in_consensus = self.features_df.filter(
839
- ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list()),
865
+ ~self.features_df["feature_uid"].is_in(
866
+ self.consensus_mapping_df["feature_uid"].to_list(),
867
+ ),
840
868
  )
841
869
  return not_in_consensus
842
870
 
@@ -914,7 +942,7 @@ def restore_features(self, samples=None, maps=False):
914
942
  maps (bool, optional): If True, also load featureXML data and update study.feature_maps.
915
943
  """
916
944
  import datetime
917
- from masster.sample.sample import Sample
945
+ from master.sample.sample import Sample
918
946
 
919
947
  if self.features_df is None or self.features_df.is_empty():
920
948
  self.logger.error("No features_df found in study.")
@@ -934,7 +962,9 @@ def restore_features(self, samples=None, maps=False):
934
962
  # Columns to update from sample data
935
963
  columns_to_update = ["chrom", "chrom_area", "ms2_scans", "ms2_specs"]
936
964
 
937
- self.logger.info(f"Restoring columns {columns_to_update} from {len(sample_uids)} samples...")
965
+ self.logger.info(
966
+ f"Restoring columns {columns_to_update} from {len(sample_uids)} samples...",
967
+ )
938
968
 
939
969
  # Create a mapping of (sample_uid, feature_id) to feature_uid from study.features_df
940
970
  study_feature_mapping = {}
@@ -954,7 +984,9 @@ def restore_features(self, samples=None, maps=False):
954
984
  # Get sample info
955
985
  sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
956
986
  if sample_row.is_empty():
957
- self.logger.warning(f"Sample with uid {sample_uid} not found in samples_df.")
987
+ self.logger.warning(
988
+ f"Sample with uid {sample_uid} not found in samples_df.",
989
+ )
958
990
  continue
959
991
 
960
992
  sample_info = sample_row.row(0, named=True)
@@ -962,7 +994,9 @@ def restore_features(self, samples=None, maps=False):
962
994
  sample_name = sample_info.get("sample_name")
963
995
 
964
996
  if not sample_path or not os.path.exists(sample_path):
965
- self.logger.warning(f"Sample file not found for {sample_name}: {sample_path}")
997
+ self.logger.warning(
998
+ f"Sample file not found for {sample_name}: {sample_path}",
999
+ )
966
1000
  continue
967
1001
 
968
1002
  try:
@@ -978,7 +1012,9 @@ def restore_features(self, samples=None, maps=False):
978
1012
  continue
979
1013
 
980
1014
  # Check which columns are actually available in the sample
981
- available_columns = [col for col in columns_to_update if col in sample.features_df.columns]
1015
+ available_columns = [
1016
+ col for col in columns_to_update if col in sample.features_df.columns
1017
+ ]
982
1018
  if not available_columns:
983
1019
  self.logger.debug(f"No target columns found in sample {sample_name}")
984
1020
  continue
@@ -1001,13 +1037,21 @@ def restore_features(self, samples=None, maps=False):
1001
1037
  original_dtype = self.features_df[col].dtype
1002
1038
 
1003
1039
  # Update the specific row and column, preserving dtype
1004
- mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
1040
+ mask = (pl.col("feature_uid") == feature_uid) & (
1041
+ pl.col("sample_uid") == sample_uid
1042
+ )
1005
1043
 
1006
1044
  # Handle object columns (like Chromatogram) differently
1007
1045
  if original_dtype == pl.Object:
1008
1046
  self.features_df = self.features_df.with_columns(
1009
1047
  pl.when(mask)
1010
- .then(pl.lit(row[col], dtype=original_dtype, allow_object=True))
1048
+ .then(
1049
+ pl.lit(
1050
+ row[col],
1051
+ dtype=original_dtype,
1052
+ allow_object=True,
1053
+ ),
1054
+ )
1011
1055
  .otherwise(pl.col(col))
1012
1056
  .alias(col),
1013
1057
  )
@@ -1021,7 +1065,9 @@ def restore_features(self, samples=None, maps=False):
1021
1065
  updates_made += 1
1022
1066
 
1023
1067
  if updates_made > 0:
1024
- self.logger.debug(f"Updated {updates_made} features from sample {sample_name}")
1068
+ self.logger.debug(
1069
+ f"Updated {updates_made} features from sample {sample_name}",
1070
+ )
1025
1071
 
1026
1072
  # If maps is True, load featureXML data
1027
1073
  if maps:
@@ -1032,7 +1078,9 @@ def restore_features(self, samples=None, maps=False):
1032
1078
  self.logger.error(f"Failed to load sample {sample_name}: {e}")
1033
1079
  continue
1034
1080
 
1035
- self.logger.info(f"Completed restoring columns {columns_to_update} from {len(sample_uids)} samples")
1081
+ self.logger.info(
1082
+ f"Completed restoring columns {columns_to_update} from {len(sample_uids)} samples",
1083
+ )
1036
1084
 
1037
1085
 
1038
1086
  def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
@@ -1052,8 +1100,8 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
1052
1100
  """
1053
1101
  import datetime
1054
1102
  import numpy as np
1055
- from masster.sample.sample import Sample
1056
- from masster.chromatogram import Chromatogram
1103
+ from master.sample.sample import Sample
1104
+ from master.chromatogram import Chromatogram
1057
1105
 
1058
1106
  if self.features_df is None or self.features_df.is_empty():
1059
1107
  self.logger.error("No features_df found in study.")
@@ -1129,7 +1177,9 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
1129
1177
  feature_uid = study_feature_mapping[key]
1130
1178
 
1131
1179
  # Update only the chrom column
1132
- mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
1180
+ mask = (pl.col("feature_uid") == feature_uid) & (
1181
+ pl.col("sample_uid") == sample_uid
1182
+ )
1133
1183
  self.features_df = self.features_df.with_columns(
1134
1184
  pl.when(mask)
1135
1185
  .then(pl.lit(chrom, dtype=pl.Object, allow_object=True))
@@ -1142,7 +1192,9 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
1142
1192
  self.logger.error(f"Failed to load sample {sample_name}: {e}")
1143
1193
  continue
1144
1194
 
1145
- self.logger.info(f"Phase 1 complete: Restored {restored_count} chromatograms from .sample5 files")
1195
+ self.logger.info(
1196
+ f"Phase 1 complete: Restored {restored_count} chromatograms from .sample5 files",
1197
+ )
1146
1198
 
1147
1199
  # Phase 2: Gap-fill remaining empty chromatograms (like fill_chrom)
1148
1200
  self.logger.info("Phase 2: Gap-filling remaining empty chromatograms...")
@@ -1156,7 +1208,9 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
1156
1208
  )
1157
1209
 
1158
1210
  if empty_chroms == 0:
1159
- self.logger.info("All chromatograms restored from .sample5 files. No gap-filling needed.")
1211
+ self.logger.info(
1212
+ "All chromatograms restored from .sample5 files. No gap-filling needed.",
1213
+ )
1160
1214
  return
1161
1215
 
1162
1216
  # Get consensus info for gap filling
@@ -1200,7 +1254,11 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
1200
1254
  sample = Sample(log_level="ERROR")
1201
1255
  sample._load_sample5(sample_path, map=False)
1202
1256
 
1203
- if not hasattr(sample, "ms1_df") or sample.ms1_df is None or sample.ms1_df.is_empty():
1257
+ if (
1258
+ not hasattr(sample, "ms1_df")
1259
+ or sample.ms1_df is None
1260
+ or sample.ms1_df.is_empty()
1261
+ ):
1204
1262
  continue
1205
1263
 
1206
1264
  # Process each missing feature
@@ -1285,7 +1343,9 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
1285
1343
  self.logger.info(
1286
1344
  f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)",
1287
1345
  )
1288
- self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")
1346
+ self.logger.info(
1347
+ f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}",
1348
+ )
1289
1349
 
1290
1350
 
1291
1351
  def compress_ms2(self, max_replicates=5):
@@ -1305,17 +1365,28 @@ def compress_ms2(self, max_replicates=5):
1305
1365
 
1306
1366
  # Create a ranking score based on number_frags * prec_inty
1307
1367
  # Handle None values by treating them as 0
1308
- self.consensus_ms2 = self.consensus_ms2.with_columns([
1309
- (pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)).alias("ranking_score"),
1310
- ])
1368
+ self.consensus_ms2 = self.consensus_ms2.with_columns(
1369
+ [
1370
+ (
1371
+ pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)
1372
+ ).alias("ranking_score"),
1373
+ ],
1374
+ )
1311
1375
 
1312
1376
  # Group by consensus_uid and energy, then rank by score and keep top max_replicates
1313
1377
  compressed_ms2 = (
1314
- self.consensus_ms2.with_row_count("row_id") # Add row numbers for stable sorting
1315
- .sort(["consensus_uid", "energy", "ranking_score", "row_id"], descending=[False, False, True, False])
1316
- .with_columns([
1317
- pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"),
1318
- ])
1378
+ self.consensus_ms2.with_row_count(
1379
+ "row_id",
1380
+ ) # Add row numbers for stable sorting
1381
+ .sort(
1382
+ ["consensus_uid", "energy", "ranking_score", "row_id"],
1383
+ descending=[False, False, True, False],
1384
+ )
1385
+ .with_columns(
1386
+ [
1387
+ pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"),
1388
+ ],
1389
+ )
1319
1390
  .filter(pl.col("rank") < max_replicates)
1320
1391
  .drop(["ranking_score", "row_id", "rank"])
1321
1392
  )
@@ -1351,7 +1422,9 @@ def compress_chrom(self):
1351
1422
  pl.lit(None, dtype=pl.Object).alias("chrom"),
1352
1423
  )
1353
1424
 
1354
- self.logger.info(f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df")
1425
+ self.logger.info(
1426
+ f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df",
1427
+ )
1355
1428
 
1356
1429
 
1357
1430
  # =====================================================================================
@@ -1402,7 +1475,9 @@ def sample_name_replace(self, replace_dict):
1402
1475
  if name in replace_dict:
1403
1476
  new_names.append(replace_dict[name])
1404
1477
  replaced_count += 1
1405
- self.logger.debug(f"Replacing sample name: '{name}' -> '{replace_dict[name]}'")
1478
+ self.logger.debug(
1479
+ f"Replacing sample name: '{name}' -> '{replace_dict[name]}'",
1480
+ )
1406
1481
  else:
1407
1482
  new_names.append(name)
1408
1483
 
@@ -1415,7 +1490,9 @@ def sample_name_replace(self, replace_dict):
1415
1490
  duplicates.append(name)
1416
1491
  else:
1417
1492
  seen.add(name)
1418
- raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
1493
+ raise ValueError(
1494
+ f"Resulting sample names are not unique. Duplicates found: {duplicates}",
1495
+ )
1419
1496
 
1420
1497
  # If we get here, all names are unique - apply the changes
1421
1498
  self.samples_df = self.samples_df.with_columns(
@@ -1464,7 +1541,9 @@ def sample_name_reset(self):
1464
1541
  name_without_ext = os.path.splitext(name_without_ext)[0]
1465
1542
 
1466
1543
  new_names.append(name_without_ext)
1467
- self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")
1544
+ self.logger.debug(
1545
+ f"Resetting sample name from path: '{path}' -> '{name_without_ext}'",
1546
+ )
1468
1547
 
1469
1548
  # Check that all new names are unique
1470
1549
  if len(set(new_names)) != len(new_names):
@@ -1475,14 +1554,18 @@ def sample_name_reset(self):
1475
1554
  duplicates.append(name)
1476
1555
  else:
1477
1556
  seen.add(name)
1478
- raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
1557
+ raise ValueError(
1558
+ f"Resulting sample names are not unique. Duplicates found: {duplicates}",
1559
+ )
1479
1560
 
1480
1561
  # If we get here, all names are unique - apply the changes
1481
1562
  self.samples_df = self.samples_df.with_columns(
1482
1563
  pl.Series("sample_name", new_names).alias("sample_name"),
1483
1564
  )
1484
1565
 
1485
- self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
1566
+ self.logger.info(
1567
+ f"Successfully reset {len(new_names)} sample names from sample paths",
1568
+ )
1486
1569
 
1487
1570
 
1488
1571
  def set_source(self, filename):
@@ -1512,11 +1595,15 @@ def set_source(self, filename):
1512
1595
 
1513
1596
  new_sources = []
1514
1597
 
1515
- for i, (current_source, sample_name) in enumerate(zip(current_sources, sample_names)):
1598
+ for i, (current_source, sample_name) in enumerate(
1599
+ zip(current_sources, sample_names),
1600
+ ):
1516
1601
  # Check if filename is just a directory path
1517
1602
  if os.path.isdir(filename):
1518
1603
  if current_source is None or current_source == "":
1519
- self.logger.warning(f"Cannot build path for sample '{sample_name}': no current file_source available")
1604
+ self.logger.warning(
1605
+ f"Cannot build path for sample '{sample_name}': no current file_source available",
1606
+ )
1520
1607
  new_sources.append(current_source)
1521
1608
  failed_count += 1
1522
1609
  continue
@@ -1531,7 +1618,9 @@ def set_source(self, filename):
1531
1618
 
1532
1619
  # Check if the new file exists
1533
1620
  if not os.path.exists(new_file_path):
1534
- self.logger.warning(f"File does not exist for sample '{sample_name}': {new_file_path}")
1621
+ self.logger.warning(
1622
+ f"File does not exist for sample '{sample_name}': {new_file_path}",
1623
+ )
1535
1624
  new_sources.append(current_source)
1536
1625
  failed_count += 1
1537
1626
  continue
@@ -1541,7 +1630,9 @@ def set_source(self, filename):
1541
1630
  updated_count += 1
1542
1631
 
1543
1632
  # Log individual updates at debug level
1544
- self.logger.debug(f"Updated file_source for sample '{sample_name}': {current_source} -> {new_file_path}")
1633
+ self.logger.debug(
1634
+ f"Updated file_source for sample '{sample_name}': {current_source} -> {new_file_path}",
1635
+ )
1545
1636
 
1546
1637
  # Update the samples_df with new file_source values
1547
1638
  self.samples_df = self.samples_df.with_columns(
@@ -1636,7 +1727,9 @@ def features_select(
1636
1727
  if mz is not None:
1637
1728
  if isinstance(mz, tuple) and len(mz) == 2:
1638
1729
  min_mz, max_mz = mz
1639
- filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
1730
+ filter_conditions.append(
1731
+ (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
1732
+ )
1640
1733
  else:
1641
1734
  filter_conditions.append(pl.col("mz") >= mz)
1642
1735
 
@@ -1644,7 +1737,9 @@ def features_select(
1644
1737
  if rt is not None:
1645
1738
  if isinstance(rt, tuple) and len(rt) == 2:
1646
1739
  min_rt, max_rt = rt
1647
- filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
1740
+ filter_conditions.append(
1741
+ (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
1742
+ )
1648
1743
  else:
1649
1744
  filter_conditions.append(pl.col("rt") >= rt)
1650
1745
 
@@ -1652,7 +1747,9 @@ def features_select(
1652
1747
  if inty is not None:
1653
1748
  if isinstance(inty, tuple) and len(inty) == 2:
1654
1749
  min_inty, max_inty = inty
1655
- filter_conditions.append((pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty))
1750
+ filter_conditions.append(
1751
+ (pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty),
1752
+ )
1656
1753
  else:
1657
1754
  filter_conditions.append(pl.col("inty") >= inty)
1658
1755
 
@@ -1662,7 +1759,10 @@ def features_select(
1662
1759
  if len(sample_uid) == 2 and not isinstance(sample_uid, list):
1663
1760
  # Treat as range
1664
1761
  min_uid, max_uid = sample_uid
1665
- filter_conditions.append((pl.col("sample_uid") >= min_uid) & (pl.col("sample_uid") <= max_uid))
1762
+ filter_conditions.append(
1763
+ (pl.col("sample_uid") >= min_uid)
1764
+ & (pl.col("sample_uid") <= max_uid),
1765
+ )
1666
1766
  else:
1667
1767
  # Treat as list
1668
1768
  filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
@@ -1692,7 +1792,10 @@ def features_select(
1692
1792
  if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
1693
1793
  # Treat as range
1694
1794
  min_uid, max_uid = consensus_uid
1695
- filter_conditions.append((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
1795
+ filter_conditions.append(
1796
+ (pl.col("consensus_uid") >= min_uid)
1797
+ & (pl.col("consensus_uid") <= max_uid),
1798
+ )
1696
1799
  else:
1697
1800
  # Treat as list
1698
1801
  filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
@@ -1705,7 +1808,10 @@ def features_select(
1705
1808
  if len(feature_uid) == 2 and not isinstance(feature_uid, list):
1706
1809
  # Treat as range
1707
1810
  min_uid, max_uid = feature_uid
1708
- filter_conditions.append((pl.col("feature_uid") >= min_uid) & (pl.col("feature_uid") <= max_uid))
1811
+ filter_conditions.append(
1812
+ (pl.col("feature_uid") >= min_uid)
1813
+ & (pl.col("feature_uid") <= max_uid),
1814
+ )
1709
1815
  else:
1710
1816
  # Treat as list
1711
1817
  filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
@@ -1727,7 +1833,10 @@ def features_select(
1727
1833
  if "quality" in available_columns:
1728
1834
  if isinstance(quality, tuple) and len(quality) == 2:
1729
1835
  min_quality, max_quality = quality
1730
- filter_conditions.append((pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality))
1836
+ filter_conditions.append(
1837
+ (pl.col("quality") >= min_quality)
1838
+ & (pl.col("quality") <= max_quality),
1839
+ )
1731
1840
  else:
1732
1841
  filter_conditions.append(pl.col("quality") >= quality)
1733
1842
  else:
@@ -1739,7 +1848,8 @@ def features_select(
1739
1848
  if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
1740
1849
  min_coherence, max_coherence = chrom_coherence
1741
1850
  filter_conditions.append(
1742
- (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence),
1851
+ (pl.col("chrom_coherence") >= min_coherence)
1852
+ & (pl.col("chrom_coherence") <= max_coherence),
1743
1853
  )
1744
1854
  else:
1745
1855
  filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
@@ -1752,7 +1862,8 @@ def features_select(
1752
1862
  if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
1753
1863
  min_prominence, max_prominence = chrom_prominence
1754
1864
  filter_conditions.append(
1755
- (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence),
1865
+ (pl.col("chrom_prominence") >= min_prominence)
1866
+ & (pl.col("chrom_prominence") <= max_prominence),
1756
1867
  )
1757
1868
  else:
1758
1869
  filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
@@ -1762,14 +1873,19 @@ def features_select(
1762
1873
  # Filter by scaled chromatogram prominence
1763
1874
  if chrom_prominence_scaled is not None:
1764
1875
  if "chrom_prominence_scaled" in available_columns:
1765
- if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
1876
+ if (
1877
+ isinstance(chrom_prominence_scaled, tuple)
1878
+ and len(chrom_prominence_scaled) == 2
1879
+ ):
1766
1880
  min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
1767
1881
  filter_conditions.append(
1768
1882
  (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
1769
1883
  & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
1770
1884
  )
1771
1885
  else:
1772
- filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
1886
+ filter_conditions.append(
1887
+ pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled,
1888
+ )
1773
1889
  else:
1774
1890
  warnings.append("'chrom_prominence_scaled' column not found in features_df")
1775
1891
 
@@ -1783,7 +1899,9 @@ def features_select(
1783
1899
  & (pl.col("chrom_height_scaled") <= max_height_scaled),
1784
1900
  )
1785
1901
  else:
1786
- filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
1902
+ filter_conditions.append(
1903
+ pl.col("chrom_height_scaled") >= chrom_height_scaled,
1904
+ )
1787
1905
  else:
1788
1906
  warnings.append("'chrom_height_scaled' column not found in features_df")
1789
1907
 
@@ -1896,7 +2014,7 @@ def monkey_patch_study():
1896
2014
  as `features_select_original` if not already set, then replaces Study.features_select
1897
2015
  with the optimized `features_select` defined above. This function is idempotent.
1898
2016
  """
1899
- from masster.study.study import Study
2017
+ from master.study.study import Study
1900
2018
 
1901
2019
  # Only set original if it doesn't exist yet
1902
2020
  if not hasattr(Study, "features_select_original"):
@@ -1969,9 +2087,14 @@ def features_filter(self, features):
1969
2087
 
1970
2088
  # Apply filter to consensus_mapping_df if it exists - batch operation
1971
2089
  mapping_removed_count = 0
1972
- if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
2090
+ if (
2091
+ self.consensus_mapping_df is not None
2092
+ and not self.consensus_mapping_df.is_empty()
2093
+ ):
1973
2094
  initial_mapping_count = len(self.consensus_mapping_df)
1974
- self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
2095
+ self.consensus_mapping_df = (
2096
+ self.consensus_mapping_df.lazy().filter(filter_condition).collect()
2097
+ )
1975
2098
  mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
1976
2099
 
1977
2100
  # Calculate results once and log efficiently
@@ -1984,7 +2107,9 @@ def features_filter(self, features):
1984
2107
  f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features.",
1985
2108
  )
1986
2109
  else:
1987
- self.logger.info(f"Kept {final_count} features. Filtered out {removed_count} features.")
2110
+ self.logger.info(
2111
+ f"Kept {final_count} features. Filtered out {removed_count} features.",
2112
+ )
1988
2113
 
1989
2114
 
1990
2115
  def features_delete(self, features):
@@ -2046,9 +2171,14 @@ def features_delete(self, features):
2046
2171
 
2047
2172
  # Apply filter to consensus_mapping_df if it exists - batch operation
2048
2173
  mapping_removed_count = 0
2049
- if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
2174
+ if (
2175
+ self.consensus_mapping_df is not None
2176
+ and not self.consensus_mapping_df.is_empty()
2177
+ ):
2050
2178
  initial_mapping_count = len(self.consensus_mapping_df)
2051
- self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
2179
+ self.consensus_mapping_df = (
2180
+ self.consensus_mapping_df.lazy().filter(filter_condition).collect()
2181
+ )
2052
2182
  mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
2053
2183
 
2054
2184
  # Calculate results once and log efficiently
@@ -2061,7 +2191,9 @@ def features_delete(self, features):
2061
2191
  f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}",
2062
2192
  )
2063
2193
  else:
2064
- self.logger.info(f"Deleted {removed_count} features. Remaining features: {final_count}")
2194
+ self.logger.info(
2195
+ f"Deleted {removed_count} features. Remaining features: {final_count}",
2196
+ )
2065
2197
 
2066
2198
 
2067
2199
  def consensus_select(
@@ -2134,7 +2266,9 @@ def consensus_select(
2134
2266
  else:
2135
2267
  # Standard (min_mz, max_mz) format
2136
2268
  min_mz, max_mz = mz
2137
- consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
2269
+ consensus = consensus.filter(
2270
+ (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
2271
+ )
2138
2272
  else:
2139
2273
  # Single float value - use default mz tolerance from study parameters
2140
2274
  default_mz_tol = getattr(self, "parameters", None)
@@ -2142,13 +2276,15 @@ def consensus_select(
2142
2276
  default_mz_tol = default_mz_tol.eic_mz_tol
2143
2277
  else:
2144
2278
  # Fallback to align_defaults if study parameters not available
2145
- from masster.study.defaults.align_def import align_defaults
2279
+ from master.study.defaults.align_def import align_defaults
2146
2280
 
2147
2281
  default_mz_tol = align_defaults().mz_max_diff
2148
2282
 
2149
2283
  min_mz = mz - default_mz_tol
2150
2284
  max_mz = mz + default_mz_tol
2151
- consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
2285
+ consensus = consensus.filter(
2286
+ (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
2287
+ )
2152
2288
 
2153
2289
  self.logger.debug(
2154
2290
  f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
@@ -2168,7 +2304,9 @@ def consensus_select(
2168
2304
  else:
2169
2305
  # Standard (min_rt, max_rt) format
2170
2306
  min_rt, max_rt = rt
2171
- consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
2307
+ consensus = consensus.filter(
2308
+ (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
2309
+ )
2172
2310
  else:
2173
2311
  # Single float value - use default rt tolerance from study parameters
2174
2312
  default_rt_tol = getattr(self, "parameters", None)
@@ -2176,13 +2314,15 @@ def consensus_select(
2176
2314
  default_rt_tol = default_rt_tol.eic_rt_tol
2177
2315
  else:
2178
2316
  # Fallback to align_defaults if study parameters not available
2179
- from masster.study.defaults.align_def import align_defaults
2317
+ from master.study.defaults.align_def import align_defaults
2180
2318
 
2181
2319
  default_rt_tol = align_defaults().rt_tol
2182
2320
 
2183
2321
  min_rt = rt - default_rt_tol
2184
2322
  max_rt = rt + default_rt_tol
2185
- consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
2323
+ consensus = consensus.filter(
2324
+ (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
2325
+ )
2186
2326
 
2187
2327
  self.logger.debug(
2188
2328
  f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
@@ -2193,7 +2333,9 @@ def consensus_select(
2193
2333
  consensus_len_before_filter = len(consensus)
2194
2334
  if isinstance(inty_mean, tuple) and len(inty_mean) == 2:
2195
2335
  min_inty, max_inty = inty_mean
2196
- consensus = consensus.filter((pl.col("inty_mean") >= min_inty) & (pl.col("inty_mean") <= max_inty))
2336
+ consensus = consensus.filter(
2337
+ (pl.col("inty_mean") >= min_inty) & (pl.col("inty_mean") <= max_inty),
2338
+ )
2197
2339
  else:
2198
2340
  consensus = consensus.filter(pl.col("inty_mean") >= inty_mean)
2199
2341
  self.logger.debug(
@@ -2208,11 +2350,14 @@ def consensus_select(
2208
2350
  # Treat as range
2209
2351
  min_uid, max_uid = consensus_uid
2210
2352
  consensus = consensus.filter(
2211
- (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid),
2353
+ (pl.col("consensus_uid") >= min_uid)
2354
+ & (pl.col("consensus_uid") <= max_uid),
2212
2355
  )
2213
2356
  else:
2214
2357
  # Treat as list
2215
- consensus = consensus.filter(pl.col("consensus_uid").is_in(consensus_uid))
2358
+ consensus = consensus.filter(
2359
+ pl.col("consensus_uid").is_in(consensus_uid),
2360
+ )
2216
2361
  else:
2217
2362
  consensus = consensus.filter(pl.col("consensus_uid") == consensus_uid)
2218
2363
  self.logger.debug(
@@ -2236,7 +2381,8 @@ def consensus_select(
2236
2381
  if isinstance(number_samples, tuple) and len(number_samples) == 2:
2237
2382
  min_samples, max_samples = number_samples
2238
2383
  consensus = consensus.filter(
2239
- (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples),
2384
+ (pl.col("number_samples") >= min_samples)
2385
+ & (pl.col("number_samples") <= max_samples),
2240
2386
  )
2241
2387
  else:
2242
2388
  consensus = consensus.filter(pl.col("number_samples") >= number_samples)
@@ -2250,7 +2396,10 @@ def consensus_select(
2250
2396
  if "number_ms2" in consensus.columns:
2251
2397
  if isinstance(number_ms2, tuple) and len(number_ms2) == 2:
2252
2398
  min_ms2, max_ms2 = number_ms2
2253
- consensus = consensus.filter((pl.col("number_ms2") >= min_ms2) & (pl.col("number_ms2") <= max_ms2))
2399
+ consensus = consensus.filter(
2400
+ (pl.col("number_ms2") >= min_ms2)
2401
+ & (pl.col("number_ms2") <= max_ms2),
2402
+ )
2254
2403
  else:
2255
2404
  consensus = consensus.filter(pl.col("number_ms2") >= number_ms2)
2256
2405
  else:
@@ -2264,7 +2413,9 @@ def consensus_select(
2264
2413
  consensus_len_before_filter = len(consensus)
2265
2414
  if isinstance(quality, tuple) and len(quality) == 2:
2266
2415
  min_quality, max_quality = quality
2267
- consensus = consensus.filter((pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality))
2416
+ consensus = consensus.filter(
2417
+ (pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality),
2418
+ )
2268
2419
  else:
2269
2420
  consensus = consensus.filter(pl.col("quality") >= quality)
2270
2421
  self.logger.debug(
@@ -2277,7 +2428,9 @@ def consensus_select(
2277
2428
  if "bl" in consensus.columns:
2278
2429
  if isinstance(bl, tuple) and len(bl) == 2:
2279
2430
  min_bl, max_bl = bl
2280
- consensus = consensus.filter((pl.col("bl") >= min_bl) & (pl.col("bl") <= max_bl))
2431
+ consensus = consensus.filter(
2432
+ (pl.col("bl") >= min_bl) & (pl.col("bl") <= max_bl),
2433
+ )
2281
2434
  else:
2282
2435
  consensus = consensus.filter(pl.col("bl") >= bl)
2283
2436
  else:
@@ -2290,16 +2443,23 @@ def consensus_select(
2290
2443
  if chrom_coherence_mean is not None:
2291
2444
  consensus_len_before_filter = len(consensus)
2292
2445
  if "chrom_coherence_mean" in consensus.columns:
2293
- if isinstance(chrom_coherence_mean, tuple) and len(chrom_coherence_mean) == 2:
2446
+ if (
2447
+ isinstance(chrom_coherence_mean, tuple)
2448
+ and len(chrom_coherence_mean) == 2
2449
+ ):
2294
2450
  min_coherence, max_coherence = chrom_coherence_mean
2295
2451
  consensus = consensus.filter(
2296
2452
  (pl.col("chrom_coherence_mean") >= min_coherence)
2297
2453
  & (pl.col("chrom_coherence_mean") <= max_coherence),
2298
2454
  )
2299
2455
  else:
2300
- consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
2456
+ consensus = consensus.filter(
2457
+ pl.col("chrom_coherence_mean") >= chrom_coherence_mean,
2458
+ )
2301
2459
  else:
2302
- self.logger.warning("'chrom_coherence_mean' column not found in consensus_df")
2460
+ self.logger.warning(
2461
+ "'chrom_coherence_mean' column not found in consensus_df",
2462
+ )
2303
2463
  self.logger.debug(
2304
2464
  f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
2305
2465
  )
@@ -2308,16 +2468,23 @@ def consensus_select(
2308
2468
  if chrom_prominence_mean is not None:
2309
2469
  consensus_len_before_filter = len(consensus)
2310
2470
  if "chrom_prominence_mean" in consensus.columns:
2311
- if isinstance(chrom_prominence_mean, tuple) and len(chrom_prominence_mean) == 2:
2471
+ if (
2472
+ isinstance(chrom_prominence_mean, tuple)
2473
+ and len(chrom_prominence_mean) == 2
2474
+ ):
2312
2475
  min_prominence, max_prominence = chrom_prominence_mean
2313
2476
  consensus = consensus.filter(
2314
2477
  (pl.col("chrom_prominence_mean") >= min_prominence)
2315
2478
  & (pl.col("chrom_prominence_mean") <= max_prominence),
2316
2479
  )
2317
2480
  else:
2318
- consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
2481
+ consensus = consensus.filter(
2482
+ pl.col("chrom_prominence_mean") >= chrom_prominence_mean,
2483
+ )
2319
2484
  else:
2320
- self.logger.warning("'chrom_prominence_mean' column not found in consensus_df")
2485
+ self.logger.warning(
2486
+ "'chrom_prominence_mean' column not found in consensus_df",
2487
+ )
2321
2488
  self.logger.debug(
2322
2489
  f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
2323
2490
  )
@@ -2326,16 +2493,26 @@ def consensus_select(
2326
2493
  if chrom_prominence_scaled_mean is not None:
2327
2494
  consensus_len_before_filter = len(consensus)
2328
2495
  if "chrom_prominence_scaled_mean" in consensus.columns:
2329
- if isinstance(chrom_prominence_scaled_mean, tuple) and len(chrom_prominence_scaled_mean) == 2:
2330
- min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
2496
+ if (
2497
+ isinstance(chrom_prominence_scaled_mean, tuple)
2498
+ and len(chrom_prominence_scaled_mean) == 2
2499
+ ):
2500
+ min_prominence_scaled, max_prominence_scaled = (
2501
+ chrom_prominence_scaled_mean
2502
+ )
2331
2503
  consensus = consensus.filter(
2332
2504
  (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
2333
2505
  & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
2334
2506
  )
2335
2507
  else:
2336
- consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
2508
+ consensus = consensus.filter(
2509
+ pl.col("chrom_prominence_scaled_mean")
2510
+ >= chrom_prominence_scaled_mean,
2511
+ )
2337
2512
  else:
2338
- self.logger.warning("'chrom_prominence_scaled_mean' column not found in consensus_df")
2513
+ self.logger.warning(
2514
+ "'chrom_prominence_scaled_mean' column not found in consensus_df",
2515
+ )
2339
2516
  self.logger.debug(
2340
2517
  f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
2341
2518
  )
@@ -2344,16 +2521,23 @@ def consensus_select(
2344
2521
  if chrom_height_scaled_mean is not None:
2345
2522
  consensus_len_before_filter = len(consensus)
2346
2523
  if "chrom_height_scaled_mean" in consensus.columns:
2347
- if isinstance(chrom_height_scaled_mean, tuple) and len(chrom_height_scaled_mean) == 2:
2524
+ if (
2525
+ isinstance(chrom_height_scaled_mean, tuple)
2526
+ and len(chrom_height_scaled_mean) == 2
2527
+ ):
2348
2528
  min_height_scaled, max_height_scaled = chrom_height_scaled_mean
2349
2529
  consensus = consensus.filter(
2350
2530
  (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
2351
2531
  & (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
2352
2532
  )
2353
2533
  else:
2354
- consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
2534
+ consensus = consensus.filter(
2535
+ pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean,
2536
+ )
2355
2537
  else:
2356
- self.logger.warning("'chrom_height_scaled_mean' column not found in consensus_df")
2538
+ self.logger.warning(
2539
+ "'chrom_height_scaled_mean' column not found in consensus_df",
2540
+ )
2357
2541
  self.logger.debug(
2358
2542
  f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
2359
2543
  )
@@ -2365,7 +2549,8 @@ def consensus_select(
2365
2549
  if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
2366
2550
  min_rt_delta, max_rt_delta = rt_delta_mean
2367
2551
  consensus = consensus.filter(
2368
- (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta),
2552
+ (pl.col("rt_delta_mean") >= min_rt_delta)
2553
+ & (pl.col("rt_delta_mean") <= max_rt_delta),
2369
2554
  )
2370
2555
  else:
2371
2556
  consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
@@ -2376,9 +2561,13 @@ def consensus_select(
2376
2561
  )
2377
2562
 
2378
2563
  if len(consensus) == 0:
2379
- self.logger.warning("No consensus features remaining after applying selection criteria.")
2564
+ self.logger.warning(
2565
+ "No consensus features remaining after applying selection criteria.",
2566
+ )
2380
2567
  else:
2381
- self.logger.info(f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})")
2568
+ self.logger.info(
2569
+ f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})",
2570
+ )
2382
2571
 
2383
2572
  # Sort the results if sortby is specified
2384
2573
  if sortby is not None:
@@ -2387,19 +2576,25 @@ def consensus_select(
2387
2576
  if sortby in consensus.columns:
2388
2577
  consensus = consensus.sort(sortby, descending=descending)
2389
2578
  else:
2390
- self.logger.warning(f"Sort column '{sortby}' not found in consensus DataFrame")
2579
+ self.logger.warning(
2580
+ f"Sort column '{sortby}' not found in consensus DataFrame",
2581
+ )
2391
2582
  elif isinstance(sortby, (list, tuple)):
2392
2583
  # Multiple columns
2393
2584
  valid_columns = [col for col in sortby if col in consensus.columns]
2394
2585
  invalid_columns = [col for col in sortby if col not in consensus.columns]
2395
2586
 
2396
2587
  if invalid_columns:
2397
- self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
2588
+ self.logger.warning(
2589
+ f"Sort columns not found in consensus DataFrame: {invalid_columns}",
2590
+ )
2398
2591
 
2399
2592
  if valid_columns:
2400
2593
  consensus = consensus.sort(valid_columns, descending=descending)
2401
2594
  else:
2402
- self.logger.warning(f"Invalid sortby parameter type: {type(sortby)}. Expected str, list, or tuple.")
2595
+ self.logger.warning(
2596
+ f"Invalid sortby parameter type: {type(sortby)}. Expected str, list, or tuple.",
2597
+ )
2403
2598
 
2404
2599
  return consensus
2405
2600
 
@@ -2444,7 +2639,10 @@ def consensus_filter(self, consensus):
2444
2639
 
2445
2640
  # Get feature_uids that need to be removed from features_df
2446
2641
  feature_uids_to_remove = []
2447
- if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
2642
+ if (
2643
+ self.consensus_mapping_df is not None
2644
+ and not self.consensus_mapping_df.is_empty()
2645
+ ):
2448
2646
  feature_uids_to_remove = self.consensus_mapping_df.filter(
2449
2647
  pl.col("consensus_uid").is_in(consensus_uids_to_remove),
2450
2648
  )["feature_uid"].to_list()
@@ -2455,27 +2653,42 @@ def consensus_filter(self, consensus):
2455
2653
  )
2456
2654
 
2457
2655
  # Remove from consensus_mapping_df
2458
- if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
2656
+ if (
2657
+ self.consensus_mapping_df is not None
2658
+ and not self.consensus_mapping_df.is_empty()
2659
+ ):
2459
2660
  initial_mapping_count = len(self.consensus_mapping_df)
2460
2661
  self.consensus_mapping_df = self.consensus_mapping_df.filter(
2461
2662
  ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
2462
2663
  )
2463
2664
  removed_mapping_count = initial_mapping_count - len(self.consensus_mapping_df)
2464
2665
  if removed_mapping_count > 0:
2465
- self.logger.debug(f"Removed {removed_mapping_count} entries from consensus_mapping_df")
2666
+ self.logger.debug(
2667
+ f"Removed {removed_mapping_count} entries from consensus_mapping_df",
2668
+ )
2466
2669
 
2467
2670
  # Remove corresponding features from features_df
2468
- if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
2671
+ if (
2672
+ feature_uids_to_remove
2673
+ and self.features_df is not None
2674
+ and not self.features_df.is_empty()
2675
+ ):
2469
2676
  initial_features_count = len(self.features_df)
2470
2677
  self.features_df = self.features_df.filter(
2471
2678
  ~pl.col("feature_uid").is_in(feature_uids_to_remove),
2472
2679
  )
2473
2680
  removed_features_count = initial_features_count - len(self.features_df)
2474
2681
  if removed_features_count > 0:
2475
- self.logger.debug(f"Removed {removed_features_count} entries from features_df")
2682
+ self.logger.debug(
2683
+ f"Removed {removed_features_count} entries from features_df",
2684
+ )
2476
2685
 
2477
2686
  # Remove from consensus_ms2 if it exists
2478
- if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
2687
+ if (
2688
+ hasattr(self, "consensus_ms2")
2689
+ and self.consensus_ms2 is not None
2690
+ and not self.consensus_ms2.is_empty()
2691
+ ):
2479
2692
  initial_ms2_count = len(self.consensus_ms2)
2480
2693
  self.consensus_ms2 = self.consensus_ms2.filter(
2481
2694
  ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
@@ -2575,7 +2788,10 @@ def samples_select(
2575
2788
  if len(sample_uid) == 2 and not isinstance(sample_uid, list):
2576
2789
  # Treat as range
2577
2790
  min_uid, max_uid = sample_uid
2578
- filter_conditions.append((pl.col("sample_uid") >= min_uid) & (pl.col("sample_uid") <= max_uid))
2791
+ filter_conditions.append(
2792
+ (pl.col("sample_uid") >= min_uid)
2793
+ & (pl.col("sample_uid") <= max_uid),
2794
+ )
2579
2795
  else:
2580
2796
  # Treat as list
2581
2797
  filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
@@ -2617,7 +2833,8 @@ def samples_select(
2617
2833
  # Treat as range
2618
2834
  min_batch, max_batch = sample_batch
2619
2835
  filter_conditions.append(
2620
- (pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch),
2836
+ (pl.col("sample_batch") >= min_batch)
2837
+ & (pl.col("sample_batch") <= max_batch),
2621
2838
  )
2622
2839
  else:
2623
2840
  # Treat as list
@@ -2635,11 +2852,14 @@ def samples_select(
2635
2852
  # Treat as range
2636
2853
  min_seq, max_seq = sample_sequence
2637
2854
  filter_conditions.append(
2638
- (pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq),
2855
+ (pl.col("sample_sequence") >= min_seq)
2856
+ & (pl.col("sample_sequence") <= max_seq),
2639
2857
  )
2640
2858
  else:
2641
2859
  # Treat as list
2642
- filter_conditions.append(pl.col("sample_sequence").is_in(sample_sequence))
2860
+ filter_conditions.append(
2861
+ pl.col("sample_sequence").is_in(sample_sequence),
2862
+ )
2643
2863
  else:
2644
2864
  filter_conditions.append(pl.col("sample_sequence") == sample_sequence)
2645
2865
  else:
@@ -2651,7 +2871,8 @@ def samples_select(
2651
2871
  if isinstance(num_features, tuple) and len(num_features) == 2:
2652
2872
  min_features, max_features = num_features
2653
2873
  filter_conditions.append(
2654
- (pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features),
2874
+ (pl.col("num_features") >= min_features)
2875
+ & (pl.col("num_features") <= max_features),
2655
2876
  )
2656
2877
  else:
2657
2878
  filter_conditions.append(pl.col("num_features") >= num_features)
@@ -2663,7 +2884,9 @@ def samples_select(
2663
2884
  if "num_ms1" in available_columns:
2664
2885
  if isinstance(num_ms1, tuple) and len(num_ms1) == 2:
2665
2886
  min_ms1, max_ms1 = num_ms1
2666
- filter_conditions.append((pl.col("num_ms1") >= min_ms1) & (pl.col("num_ms1") <= max_ms1))
2887
+ filter_conditions.append(
2888
+ (pl.col("num_ms1") >= min_ms1) & (pl.col("num_ms1") <= max_ms1),
2889
+ )
2667
2890
  else:
2668
2891
  filter_conditions.append(pl.col("num_ms1") >= num_ms1)
2669
2892
  else:
@@ -2674,7 +2897,9 @@ def samples_select(
2674
2897
  if "num_ms2" in available_columns:
2675
2898
  if isinstance(num_ms2, tuple) and len(num_ms2) == 2:
2676
2899
  min_ms2, max_ms2 = num_ms2
2677
- filter_conditions.append((pl.col("num_ms2") >= min_ms2) & (pl.col("num_ms2") <= max_ms2))
2900
+ filter_conditions.append(
2901
+ (pl.col("num_ms2") >= min_ms2) & (pl.col("num_ms2") <= max_ms2),
2902
+ )
2678
2903
  else:
2679
2904
  filter_conditions.append(pl.col("num_ms2") >= num_ms2)
2680
2905
  else:
@@ -2766,7 +2991,9 @@ def samples_delete(self, samples):
2766
2991
  if len(sample_uids_set) < len(sample_uids_to_remove) * 0.8:
2767
2992
  sample_uids_to_remove = list(sample_uids_set)
2768
2993
 
2769
- self.logger.info(f"Deleting {len(sample_uids_to_remove)} samples and all related data...")
2994
+ self.logger.info(
2995
+ f"Deleting {len(sample_uids_to_remove)} samples and all related data...",
2996
+ )
2770
2997
 
2771
2998
  # Get feature_uids that need to be removed from features_df
2772
2999
  feature_uids_to_remove = []
@@ -2794,7 +3021,11 @@ def samples_delete(self, samples):
2794
3021
 
2795
3022
  # 2. Remove corresponding features from features_df
2796
3023
  removed_features_count = 0
2797
- if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
3024
+ if (
3025
+ feature_uids_to_remove
3026
+ and self.features_df is not None
3027
+ and not self.features_df.is_empty()
3028
+ ):
2798
3029
  self.features_df = self.features_df.filter(
2799
3030
  ~pl.col("sample_uid").is_in(sample_uids_to_remove),
2800
3031
  )
@@ -2802,7 +3033,11 @@ def samples_delete(self, samples):
2802
3033
 
2803
3034
  # 3. Remove from consensus_mapping_df
2804
3035
  removed_mapping_count = 0
2805
- if feature_uids_to_remove and self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
3036
+ if (
3037
+ feature_uids_to_remove
3038
+ and self.consensus_mapping_df is not None
3039
+ and not self.consensus_mapping_df.is_empty()
3040
+ ):
2806
3041
  initial_mapping_count = len(self.consensus_mapping_df)
2807
3042
  self.consensus_mapping_df = self.consensus_mapping_df.filter(
2808
3043
  ~pl.col("feature_uid").is_in(feature_uids_to_remove),
@@ -2811,7 +3046,11 @@ def samples_delete(self, samples):
2811
3046
 
2812
3047
  # 4. Remove from consensus_ms2 if it exists
2813
3048
  removed_ms2_count = 0
2814
- if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
3049
+ if (
3050
+ hasattr(self, "consensus_ms2")
3051
+ and self.consensus_ms2 is not None
3052
+ and not self.consensus_ms2.is_empty()
3053
+ ):
2815
3054
  initial_ms2_count = len(self.consensus_ms2)
2816
3055
  self.consensus_ms2 = self.consensus_ms2.filter(
2817
3056
  ~pl.col("sample_uid").is_in(sample_uids_to_remove),
@@ -2820,7 +3059,11 @@ def samples_delete(self, samples):
2820
3059
 
2821
3060
  # 5. Remove from feature_maps and update map_id
2822
3061
  removed_maps_count = 0
2823
- if hasattr(self, "feature_maps") and self.feature_maps is not None and map_ids_to_remove:
3062
+ if (
3063
+ hasattr(self, "feature_maps")
3064
+ and self.feature_maps is not None
3065
+ and map_ids_to_remove
3066
+ ):
2824
3067
  # Remove feature maps in reverse order to maintain indices
2825
3068
  for map_id in sorted(map_ids_to_remove, reverse=True):
2826
3069
  if 0 <= map_id < len(self.feature_maps):
@@ -2861,7 +3104,9 @@ def samples_delete(self, samples):
2861
3104
 
2862
3105
  # Update map_id indices if needed
2863
3106
  if removed_maps_count > 0 and final_sample_count > 0:
2864
- self.logger.debug(f"Updated map_id values to range from 0 to {final_sample_count - 1}")
3107
+ self.logger.debug(
3108
+ f"Updated map_id values to range from 0 to {final_sample_count - 1}",
3109
+ )
2865
3110
 
2866
3111
 
2867
3112
  # =====================================================================================
@@ -3032,7 +3277,9 @@ def sample_color(self, by=None, palette="Turbo256"):
  )
 
  if isinstance(by, list):
- self.logger.debug(f"Set sample colors using provided color list ({len(by)} colors)")
+ self.logger.debug(
+ f"Set sample colors using provided color list ({len(by)} colors)",
+ )
  elif by is None:
  self.logger.debug(f"Set sequential sample colors using {palette} palette")
  else:
@@ -3067,7 +3314,9 @@ def sample_color_reset(self):
  # Distribute samples evenly across the full colormap range
  for i in range(n_samples):
  # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
- normalized_value = (i + 0.5) / n_samples # +0.5 to center samples in their bins
+ normalized_value = (
+ i + 0.5
+ ) / n_samples # +0.5 to center samples in their bins
  # Optionally, map to a subset of colormap to avoid extreme colors
  # Use 10% to 90% of colormap range for better color diversity
  normalized_value = 0.1 + (normalized_value * 0.8)
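
The normalization above (repeated in _sample_colors_from_colormap further down) places sample i of n at 0.1 + ((i + 0.5) / n) * 0.8, i.e. centered in its bin and confined to the middle 80% of the colormap so the extreme ends are never sampled. Worked through for four samples:

n_samples = 4
positions = [0.1 + ((i + 0.5) / n_samples) * 0.8 for i in range(n_samples)]
# -> [0.2, 0.4, 0.6, 0.8] (up to float rounding): evenly spaced, clear of both endpoints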
@@ -3088,10 +3337,14 @@ def sample_color_reset(self):
  pl.Series("sample_color", colors).alias("sample_color"),
  )
 
- self.logger.debug(f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)")
+ self.logger.debug(
+ f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)",
+ )
 
  except ImportError:
- self.logger.error("cmap library is required for sample color reset. Install with: uv add cmap")
+ self.logger.error(
+ "cmap library is required for sample color reset. Install with: uv add cmap",
+ )
  except Exception as e:
  self.logger.error(f"Failed to reset sample colors: {e}")
 
@@ -3112,7 +3365,9 @@ def _get_color_palette(palette_name):
  try:
  from cmap import Colormap
  except ImportError:
- raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
+ raise ValueError(
+ "cmap library is required for color palettes. Install with: pip install cmap",
+ )
 
  # Map common palette names to cmap names
  palette_mapping = {
@@ -3207,7 +3462,9 @@ def _sample_colors_from_colormap(palette_name, n_colors):
  try:
  from cmap import Colormap
  except ImportError:
- raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
+ raise ValueError(
+ "cmap library is required for color palettes. Install with: pip install cmap",
+ )
 
  # Map common palette names to cmap names (same as _get_color_palette)
  palette_mapping = {
@@ -3245,7 +3502,9 @@ def _sample_colors_from_colormap(palette_name, n_colors):
  # Distribute samples evenly across the full colormap range (same approach as sample_color_reset)
  for i in range(n_colors):
  # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
- normalized_value = (i + 0.5) / n_colors # +0.5 to center samples in their bins
+ normalized_value = (
+ i + 0.5
+ ) / n_colors # +0.5 to center samples in their bins
  # Map to a subset of colormap to avoid extreme colors (use 10% to 90% range)
  normalized_value = 0.1 + (normalized_value * 0.8)
 
@@ -3290,7 +3549,7 @@ def _ensure_features_df_schema_order(self):
  try:
  import os
  import json
- from masster.study.h5 import _reorder_columns_by_schema
+ from master.study.h5 import _reorder_columns_by_schema
 
  # Load schema
  schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
@@ -3298,7 +3557,11 @@ def _ensure_features_df_schema_order(self):
  schema = json.load(f)
 
  # Reorder columns to match schema
- self.features_df = _reorder_columns_by_schema(self.features_df, schema, "features_df")
+ self.features_df = _reorder_columns_by_schema(
+ self.features_df,
+ schema,
+ "features_df",
+ )
 
  except Exception as e:
  self.logger.warning(f"Failed to reorder features_df columns: {e}")
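
_reorder_columns_by_schema is imported from the package's h5 module and is not shown in this diff. A rough sketch of what schema-driven column ordering can look like in polars (a hypothetical helper written only to illustrate the idea, not the packaged implementation):

import polars as pl

def reorder_columns_by_schema(df: pl.DataFrame, schema_cols: list[str]) -> pl.DataFrame:
    # Schema columns first, in schema order; any remaining columns keep their current order
    ordered = [c for c in schema_cols if c in df.columns]
    extras = [c for c in df.columns if c not in ordered]
    return df.select(ordered + extras)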
@@ -3340,17 +3603,19 @@ def migrate_map_id_to_index(self):
  # Ensure the column is Int64 type
  self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
 
- self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
+ self.logger.info(
+ f"Successfully migrated {sample_count} samples to indexed map_id format",
+ )
  self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")
 
 
  def restore_ms2(self, samples=None, **kwargs):
  """
  Restore MS2 data by re-running find_ms2 on specified samples.
-
+
  This function rebuilds the consensus_ms2 DataFrame by re-extracting MS2 spectra
  from the original sample files. Use this to reverse the effects of compress_ms2().
-
+
  Parameters:
  samples (list, optional): List of sample_uids or sample_names to process.
  If None, processes all samples.
@@ -3360,31 +3625,37 @@ def restore_ms2(self, samples=None, **kwargs):
  if self.features_df is None or self.features_df.is_empty():
  self.logger.error("No features_df found in study.")
  return
-
+
  if self.samples_df is None or self.samples_df.is_empty():
  self.logger.error("No samples_df found in study.")
  return
-
- # Get sample_uids to process
+
+ # Get sample_uids to process
  sample_uids = self._get_sample_uids(samples)
  if not sample_uids:
  self.logger.warning("No valid samples specified.")
  return
-
+
  self.logger.info(f"Restoring MS2 data from {len(sample_uids)} samples...")
-
+
  # Clear existing consensus_ms2 to rebuild from scratch
- initial_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+ initial_ms2_count = (
+ len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+ )
  self.consensus_ms2 = pl.DataFrame()
-
+
  # Re-run find_ms2 which will rebuild consensus_ms2
  try:
  self.find_ms2(**kwargs)
-
- final_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
-
- self.logger.info(f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra")
-
+
+ final_ms2_count = (
+ len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+ )
+
+ self.logger.info(
+ f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra",
+ )
+
  except Exception as e:
  self.logger.error(f"Failed to restore MS2 data: {e}")
  raise
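
A typical call for restore_ms2 as shown above; the sample names are illustrative, and mz_tol is one of the find_ms2 keywords listed in the decompress docstring below:

# Rebuild consensus_ms2, forwarding mz_tol to find_ms2
study.restore_ms2(samples=["sample1", "sample2"], mz_tol=0.01)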
@@ -3393,51 +3664,51 @@ def restore_ms2(self, samples=None, **kwargs):
 
  def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs):
  """
  Reverse any compression effects by restoring compressed data adaptively.
-
+
  This function restores data that was compressed using compress(), compress_features(),
  compress_ms2(), compress_chrom(), or study.save(compress=True). It optimizes the
  decompression process for speed by only processing what actually needs restoration.
-
+
  Parameters:
  features (bool): Restore features data (ms2_specs, ms2_scans, chrom_area)
- ms2 (bool): Restore MS2 spectra by re-running find_ms2()
+ ms2 (bool): Restore MS2 spectra by re-running find_ms2()
  chrom (bool): Restore chromatogram objects
  samples (list, optional): List of sample_uids or sample_names to process.
  If None, processes all samples.
  **kwargs: Additional keyword arguments for restoration functions:
  - For restore_chrom: mz_tol (default: 0.010), rt_tol (default: 10.0)
  - For restore_ms2/find_ms2: mz_tol, centroid, deisotope, etc.
-
+
  Performance Optimizations:
  - Adaptive processing: Only restores what actually needs restoration
  - Processes features and chromatograms together when possible (shared file I/O)
  - Uses cached sample instances to avoid repeated file loading
  - Processes MS2 restoration last as it's the most computationally expensive
  - Provides detailed progress information for long-running operations
-
+
  Example:
  # Restore everything (but only what needs restoration)
  study.decompress()
-
+
  # Restore only chromatograms with custom tolerances
  study.decompress(features=False, ms2=False, chrom=True, mz_tol=0.005, rt_tol=5.0)
-
+
  # Restore specific samples only
  study.decompress(samples=["sample1", "sample2"])
  """
  if not any([features, ms2, chrom]):
  self.logger.warning("No decompression operations specified.")
  return
-
+
  # Get sample_uids to process
  sample_uids = self._get_sample_uids(samples)
  if not sample_uids:
  self.logger.warning("No valid samples specified.")
  return
-
+
  # Adaptively check what actually needs to be done
  import polars as pl
-
+
  # Check if features need restoration (more sophisticated logic)
  features_need_restoration = False
@@ -3446,7 +3717,7 @@ def decompress(features=True, ms2=True, chrom=True, samples=None, **kwargs
  for col in ["ms2_scans", "ms2_specs"]:
  if col not in self.features_df.columns:
  missing_cols.append(col)
-
+
  # If columns are missing entirely, we likely need restoration
  if missing_cols:
  features_need_restoration = True
@@ -3455,13 +3726,15 @@ def decompress(features=True, ms2=True, chrom=True, samples=None, **kwargs
  # But be smart about it - only check if we have consensus features with MS2
  if not self.consensus_ms2.is_empty():
  # We have MS2 data, so ms2_specs should have some content
- null_ms2_specs = self.features_df.filter(pl.col("ms2_specs").is_null()).height
+ null_ms2_specs = self.features_df.filter(
+ pl.col("ms2_specs").is_null(),
+ ).height
  total_features = len(self.features_df)
  # If more than 90% are null but we have MS2 data, likely compressed
  if null_ms2_specs > (total_features * 0.9):
  features_need_restoration = True
-
- # Check if chromatograms need restoration
+
+ # Check if chromatograms need restoration
  chrom_need_restoration = False
  if chrom and not self.features_df.is_empty():
  if "chrom" not in self.features_df.columns:
@@ -3472,22 +3745,26 @@ def decompress(features=True, ms2=True, chrom=True, samples=None, **kwargs
  total_features = len(self.features_df)
  # If more than 50% are null, likely need restoration
  chrom_need_restoration = null_chroms > (total_features * 0.5)
-
+
  # Check if MS2 data might need restoration (compare expected vs actual)
  ms2_need_restoration = False
  if ms2:
- current_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
- consensus_count = len(self.consensus_df) if not self.consensus_df.is_empty() else 0
-
+ current_ms2_count = (
+ len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+ )
+ consensus_count = (
+ len(self.consensus_df) if not self.consensus_df.is_empty() else 0
+ )
+
  if consensus_count > 0:
  # Calculate expected MS2 count based on consensus features with MS2 potential
  # This is a heuristic - if we have very few MS2 compared to consensus, likely compressed
  expected_ratio = 3.0 # Expect at least 3 MS2 per consensus on average
  expected_ms2 = consensus_count * expected_ratio
-
+
  if current_ms2_count < min(expected_ms2 * 0.3, consensus_count * 0.8):
  ms2_need_restoration = True
-
+
  # Build list of operations that actually need to be done
  operations_needed = []
  if features and features_need_restoration:
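
The MS2 heuristic above asks for restoration when current_ms2_count < min(expected_ms2 * 0.3, consensus_count * 0.8), with expected_ms2 = 3.0 * consensus_count. Worked through with invented counts:

consensus_count = 1000                                      # invented for illustration
expected_ms2 = consensus_count * 3.0                        # 3000.0
threshold = min(expected_ms2 * 0.3, consensus_count * 0.8)  # min(900.0, 800.0) -> 800.0

print(500 < threshold)   # True  -> only 500 stored spectra, looks compressed
print(2500 < threshold)  # False -> MS2 table already looks complete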
@@ -3496,59 +3773,75 @@ def decompress(features=True, ms2=True, chrom=True, samples=None, **kwargs
  operations_needed.append("chromatograms")
  if ms2 and ms2_need_restoration:
  operations_needed.append("MS2 spectra")
-
+
  # Early exit if nothing needs to be done
  if not operations_needed:
- self.logger.info("All data appears to be already decompressed. No operations needed.")
+ self.logger.info(
+ "All data appears to be already decompressed. No operations needed.",
+ )
  return
-
- self.logger.info(f"Starting adaptive decompression: {', '.join(operations_needed)} from {len(sample_uids)} samples")
-
+
+ self.logger.info(
+ f"Starting adaptive decompression: {', '.join(operations_needed)} from {len(sample_uids)} samples",
+ )
+
  try:
  # Phase 1: Restore features and chromatograms together (shared file I/O)
- if ("features" in operations_needed and "chromatograms" in operations_needed):
- self.logger.info("Phase 1: Restoring features and chromatograms together...")
-
+ if "features" in operations_needed and "chromatograms" in operations_needed:
+ self.logger.info(
+ "Phase 1: Restoring features and chromatograms together...",
+ )
+
  # Extract relevant kwargs for restore_features and restore_chrom
  restore_kwargs = {}
- if 'mz_tol' in kwargs:
- restore_kwargs['mz_tol'] = kwargs['mz_tol']
- if 'rt_tol' in kwargs:
- restore_kwargs['rt_tol'] = kwargs['rt_tol']
-
+ if "mz_tol" in kwargs:
+ restore_kwargs["mz_tol"] = kwargs["mz_tol"]
+ if "rt_tol" in kwargs:
+ restore_kwargs["rt_tol"] = kwargs["rt_tol"]
+
  # Restore features first (includes chrom column)
  self.restore_features(samples=samples)
-
+
  # Then do additional chrom gap-filling if needed
  self.restore_chrom(samples=samples, **restore_kwargs)
-
- elif ("features" in operations_needed and "chromatograms" not in operations_needed):
+
+ elif (
+ "features" in operations_needed and "chromatograms" not in operations_needed
+ ):
  self.logger.info("Phase 1: Restoring features data...")
  self.restore_features(samples=samples)
-
- elif ("chromatograms" in operations_needed and "features" not in operations_needed):
+
+ elif (
+ "chromatograms" in operations_needed and "features" not in operations_needed
+ ):
  self.logger.info("Phase 1: Restoring chromatograms...")
  restore_kwargs = {}
- if 'mz_tol' in kwargs:
- restore_kwargs['mz_tol'] = kwargs['mz_tol']
- if 'rt_tol' in kwargs:
- restore_kwargs['rt_tol'] = kwargs['rt_tol']
+ if "mz_tol" in kwargs:
+ restore_kwargs["mz_tol"] = kwargs["mz_tol"]
+ if "rt_tol" in kwargs:
+ restore_kwargs["rt_tol"] = kwargs["rt_tol"]
  self.restore_chrom(samples=samples, **restore_kwargs)
-
+
  # Phase 2: Restore MS2 data (most computationally expensive, done last)
  if "MS2 spectra" in operations_needed:
  self.logger.info("Phase 2: Restoring MS2 spectra...")
-
+
  # Extract MS2-specific kwargs
  ms2_kwargs = {}
  for key, value in kwargs.items():
- if key in ['mz_tol', 'centroid', 'deisotope', 'dia_stats', 'feature_uid']:
+ if key in [
+ "mz_tol",
+ "centroid",
+ "deisotope",
+ "dia_stats",
+ "feature_uid",
+ ]:
  ms2_kwargs[key] = value
-
+
  self.restore_ms2(samples=samples, **ms2_kwargs)
-
+
  self.logger.info("Adaptive decompression completed successfully")
-
+
  except Exception as e:
  self.logger.error(f"Decompression failed: {e}")
  raise
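
The Phase 1/Phase 2 branches above forward only the keywords each restoration step understands. An equivalent standalone sketch of that filtering (dict comprehensions instead of the explicit loop used in the package; key names mirror the hunk, values are invented):

kwargs = {"mz_tol": 0.005, "rt_tol": 5.0, "centroid": True, "verbose": True}

restore_kwargs = {k: kwargs[k] for k in ("mz_tol", "rt_tol") if k in kwargs}
ms2_kwargs = {
    k: v
    for k, v in kwargs.items()
    if k in {"mz_tol", "centroid", "deisotope", "dia_stats", "feature_uid"}
}
# restore_kwargs -> {"mz_tol": 0.005, "rt_tol": 5.0}
# ms2_kwargs -> {"mz_tol": 0.005, "centroid": True}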