masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +3 -9
  4. masster/data/libs/README.md +1 -1
  5. masster/data/libs/ccm.csv +120 -120
  6. masster/data/libs/ccm.py +116 -62
  7. masster/data/libs/central_carbon_README.md +1 -1
  8. masster/data/libs/urine.py +161 -65
  9. masster/data/libs/urine_metabolites.csv +4693 -4693
  10. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
  11. masster/logger.py +43 -78
  12. masster/sample/__init__.py +1 -1
  13. masster/sample/adducts.py +264 -338
  14. masster/sample/defaults/find_adducts_def.py +8 -21
  15. masster/sample/defaults/find_features_def.py +1 -6
  16. masster/sample/defaults/get_spectrum_def.py +1 -5
  17. masster/sample/defaults/sample_def.py +1 -5
  18. masster/sample/h5.py +282 -561
  19. masster/sample/helpers.py +75 -131
  20. masster/sample/lib.py +17 -42
  21. masster/sample/load.py +17 -31
  22. masster/sample/parameters.py +2 -6
  23. masster/sample/plot.py +27 -88
  24. masster/sample/processing.py +87 -117
  25. masster/sample/quant.py +51 -57
  26. masster/sample/sample.py +90 -103
  27. masster/sample/sample5_schema.json +44 -44
  28. masster/sample/save.py +12 -35
  29. masster/sample/sciex.py +19 -66
  30. masster/spectrum.py +20 -58
  31. masster/study/__init__.py +1 -1
  32. masster/study/defaults/align_def.py +1 -5
  33. masster/study/defaults/fill_chrom_def.py +1 -5
  34. masster/study/defaults/fill_def.py +1 -5
  35. masster/study/defaults/integrate_chrom_def.py +1 -5
  36. masster/study/defaults/integrate_def.py +1 -5
  37. masster/study/defaults/study_def.py +25 -58
  38. masster/study/export.py +207 -233
  39. masster/study/h5.py +136 -470
  40. masster/study/helpers.py +202 -495
  41. masster/study/helpers_optimized.py +13 -40
  42. masster/study/id.py +110 -213
  43. masster/study/load.py +143 -230
  44. masster/study/plot.py +257 -518
  45. masster/study/processing.py +257 -469
  46. masster/study/save.py +5 -15
  47. masster/study/study.py +276 -379
  48. masster/study/study5_schema.json +96 -96
  49. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
  50. masster-0.4.1.dist-info/RECORD +67 -0
  51. masster-0.4.0.dist-info/RECORD +0 -67
  52. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
  53. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
  54. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py CHANGED
@@ -22,7 +22,7 @@ import pandas as pd
 import polars as pl
 
 from tqdm import tqdm
-from master.chromatogram import Chromatogram
+from masster.chromatogram import Chromatogram
 
 
 # =====================================================================================
@@ -71,12 +71,7 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
         # fallback to pandas
         try:
             bpc_pd = s.ms1_df.to_pandas()[["rt", "inty"]]
-            bpc_pd = (
-                bpc_pd.groupby("rt")
-                .agg({"inty": "max"})
-                .reset_index()
-                .sort_values("rt")
-            )
+            bpc_pd = bpc_pd.groupby("rt").agg({"inty": "max"}).reset_index().sort_values("rt")
         except Exception:
             raise
 
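The pandas fallback above is the classic base-peak-chromatogram reduction: group all MS1 data points by retention time and keep the maximum intensity per scan. A minimal, self-contained sketch of the same idea on invented toy data (not masster's API):

```python
import pandas as pd

# Toy MS1 points: three scans, several peaks per scan
ms1 = pd.DataFrame({
    "rt":   [10.0, 10.0, 12.5, 12.5, 15.0],
    "inty": [100.0, 350.0, 80.0, 40.0, 500.0],
})

# Base peak chromatogram: max intensity per retention time, ordered by rt
bpc = ms1.groupby("rt").agg({"inty": "max"}).reset_index().sort_values("rt")
print(bpc)  # one row per scan: rt, inty
```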
@@ -118,16 +113,11 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
             mapping_rows = pl.DataFrame()
 
     # If we still have no sample selector, try to infer sample from the Sample object s
-    if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(
-        s,
-        "sample_path",
-    ):
+    if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(s, "sample_path"):
         # attempt to match by sample_path or file name
         try:
             # find row where sample_path matches
-            mapping_rows = feats.filter(
-                pl.col("sample_path") == getattr(s, "file", None),
-            )
+            mapping_rows = feats.filter(pl.col("sample_path") == getattr(s, "file", None))
         except Exception:
             mapping_rows = pl.DataFrame()
 
@@ -214,9 +204,7 @@ def get_tic(owner, sample=None, label=None):
         except Exception:
             raise
     else:
-        raise ValueError(
-            "Neither ms1_df nor scans_df available for TIC computation",
-        )
+        raise ValueError("Neither ms1_df nor scans_df available for TIC computation")
 
     if tic_pd.empty:
         raise ValueError("Computed TIC is empty")
@@ -379,17 +367,14 @@ def get_chrom(self, uids=None, samples=None):
     )
     # Pre-filter features_df to only relevant features and samples
     filtered_features = self.features_df.filter(
-        pl.col("feature_uid").is_in(relevant_feature_uids)
-        & pl.col("sample_uid").is_in(sample_uids),
-    ).select(
-        [
-            "feature_uid",
-            "chrom",
-            "rt",
-            "rt_original",
-            "sample_uid",
-        ],
-    )
+        pl.col("feature_uid").is_in(relevant_feature_uids) & pl.col("sample_uid").is_in(sample_uids),
+    ).select([
+        "feature_uid",
+        "chrom",
+        "rt",
+        "rt_original",
+        "sample_uid",
+    ])
 
     # Pre-filter samples_df
     filtered_samples = self.samples_df.filter(
@@ -424,13 +409,11 @@ def get_chrom(self, uids=None, samples=None):
     # Create a mapping dictionary for O(1) lookup instead of O(n) filtering
     self.logger.debug("Creating lookup dictionary for chromatogram objects.")
     chrom_lookup = {}
-    for row in df_combined.select(
-        [
-            "consensus_uid",
-            "sample_name",
-            "chrom",
-        ],
-    ).iter_rows():
+    for row in df_combined.select([
+        "consensus_uid",
+        "sample_name",
+        "chrom",
+    ]).iter_rows():
         key = (row[0], row[1])  # (consensus_uid, sample_name)
         chrom_lookup[key] = row[2]  # chrom object
 
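The lookup-dictionary pattern in this hunk replaces repeated DataFrame filtering (O(n) per query) with a single pass that builds a hash map keyed by (consensus_uid, sample_name). A minimal sketch of the pattern on a toy polars frame (column names follow the hunk; the data and string stand-ins are invented):

```python
import polars as pl

df = pl.DataFrame({
    "consensus_uid": [1, 1, 2],
    "sample_name":   ["a", "b", "a"],
    "chrom":         ["eic-1a", "eic-1b", "eic-2a"],  # stand-ins for Chromatogram objects
})

# One pass over the rows builds the index...
lookup = {(uid, name): chrom for uid, name, chrom in df.iter_rows()}

# ...after which each access is O(1) instead of an O(n) filter
print(lookup[(1, "b")])  # -> "eic-1b"
```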
@@ -549,9 +532,7 @@ def get_consensus_matrix(self, quant="chrom_area"):
 
     # Build consensus matrix directly using the consensus_mapping_df
     matrix_dict = {}
-    sample_mapping = dict(
-        self.samples_df.select(["sample_uid", "sample_name"]).iter_rows(),
-    )
+    sample_mapping = dict(self.samples_df.select(["sample_uid", "sample_name"]).iter_rows())
 
     for row in self.consensus_mapping_df.iter_rows(named=True):
         consensus_uid = row["consensus_uid"]
@@ -569,10 +550,7 @@ def get_consensus_matrix(self, quant="chrom_area"):
 
         # Take max if multiple features map to same consensus/sample combination
         if sample_name in matrix_dict[consensus_uid]:
-            matrix_dict[consensus_uid][sample_name] = max(
-                matrix_dict[consensus_uid][sample_name],
-                value,
-            )
+            matrix_dict[consensus_uid][sample_name] = max(matrix_dict[consensus_uid][sample_name], value)
         else:
             matrix_dict[consensus_uid][sample_name] = value
 
@@ -591,12 +569,10 @@ def get_consensus_matrix(self, quant="chrom_area"):
 
     # Fill null values with 0 and round numeric columns
     numeric_cols = [col for col in df2.columns if col != "consensus_uid"]
-    df2 = df2.with_columns(
-        [
-            pl.col("consensus_uid").cast(pl.UInt64),
-            *[pl.col(col).fill_null(0).round(0) for col in numeric_cols],
-        ],
-    )
+    df2 = df2.with_columns([
+        pl.col("consensus_uid").cast(pl.UInt64),
+        *[pl.col(col).fill_null(0).round(0) for col in numeric_cols],
+    ])
 
     return df2
 
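Taken together, the three get_consensus_matrix hunks build a wide feature-by-sample matrix: accumulate values into a dict of dicts (resolving collisions with max, per the comment), then densify by filling nulls with 0. A hedged sketch of that flow on toy data, assuming the same column roles as the hunks:

```python
import polars as pl

# (consensus_uid, sample_name, value) observations; the duplicate (1, "s1") exercises the max rule
rows = [(1, "s1", 100.0), (1, "s2", 80.0), (2, "s1", 40.0), (1, "s1", 120.0)]

matrix = {}
for uid, sample, value in rows:
    cell = matrix.setdefault(uid, {})
    # keep the larger value when several features map to one consensus/sample cell
    cell[sample] = max(cell.get(sample, value), value)

df = pl.DataFrame([{"consensus_uid": uid, **cells} for uid, cells in matrix.items()])
df = df.with_columns([
    pl.col("consensus_uid").cast(pl.UInt64),
    *[pl.col(c).fill_null(0).round(0) for c in df.columns if c != "consensus_uid"],
])
print(df)  # one row per consensus_uid, one column per sample, nulls -> 0
```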
@@ -816,7 +792,7 @@ def get_sample(self, sample):
 
     This helper mirrors the original Study.get_sample method but lives in helpers for reuse.
     """
-    from master.sample.sample import Sample
+    from masster.sample.sample import Sample
 
     if isinstance(sample, Sample):
         return sample
@@ -826,9 +802,7 @@ def get_sample(self, sample):
     elif isinstance(sample, str):
         rows = self.samples_df.filter(pl.col("sample_name") == sample)
     else:
-        raise ValueError(
-            "sample must be an int (sample_uid), str (sample_name) or a Sample instance",
-        )
+        raise ValueError("sample must be an int (sample_uid), str (sample_name) or a Sample instance")
 
     if rows.is_empty():
         raise KeyError(f"Sample not found: {sample}")
@@ -862,9 +836,7 @@ def get_orphans(self):
     Get all features that are not in the consensus mapping.
     """
     not_in_consensus = self.features_df.filter(
-        ~self.features_df["feature_uid"].is_in(
-            self.consensus_mapping_df["feature_uid"].to_list(),
-        ),
+        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list()),
     )
     return not_in_consensus
 
@@ -942,7 +914,7 @@ def restore_features(self, samples=None, maps=False):
         maps (bool, optional): If True, also load featureXML data and update study.feature_maps.
     """
     import datetime
-    from master.sample.sample import Sample
+    from masster.sample.sample import Sample
 
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
@@ -962,9 +934,7 @@ def restore_features(self, samples=None, maps=False):
     # Columns to update from sample data
     columns_to_update = ["chrom", "chrom_area", "ms2_scans", "ms2_specs"]
 
-    self.logger.info(
-        f"Restoring columns {columns_to_update} from {len(sample_uids)} samples...",
-    )
+    self.logger.info(f"Restoring columns {columns_to_update} from {len(sample_uids)} samples...")
 
     # Create a mapping of (sample_uid, feature_id) to feature_uid from study.features_df
     study_feature_mapping = {}
@@ -984,9 +954,7 @@ def restore_features(self, samples=None, maps=False):
         # Get sample info
         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
         if sample_row.is_empty():
-            self.logger.warning(
-                f"Sample with uid {sample_uid} not found in samples_df.",
-            )
+            self.logger.warning(f"Sample with uid {sample_uid} not found in samples_df.")
             continue
 
         sample_info = sample_row.row(0, named=True)
@@ -994,9 +962,7 @@ def restore_features(self, samples=None, maps=False):
         sample_name = sample_info.get("sample_name")
 
         if not sample_path or not os.path.exists(sample_path):
-            self.logger.warning(
-                f"Sample file not found for {sample_name}: {sample_path}",
-            )
+            self.logger.warning(f"Sample file not found for {sample_name}: {sample_path}")
             continue
 
         try:
@@ -1012,9 +978,7 @@ def restore_features(self, samples=None, maps=False):
             continue
 
         # Check which columns are actually available in the sample
-        available_columns = [
-            col for col in columns_to_update if col in sample.features_df.columns
-        ]
+        available_columns = [col for col in columns_to_update if col in sample.features_df.columns]
         if not available_columns:
             self.logger.debug(f"No target columns found in sample {sample_name}")
             continue
@@ -1037,21 +1001,13 @@ def restore_features(self, samples=None, maps=False):
             original_dtype = self.features_df[col].dtype
 
             # Update the specific row and column, preserving dtype
-            mask = (pl.col("feature_uid") == feature_uid) & (
-                pl.col("sample_uid") == sample_uid
-            )
+            mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
 
             # Handle object columns (like Chromatogram) differently
             if original_dtype == pl.Object:
                 self.features_df = self.features_df.with_columns(
                     pl.when(mask)
-                    .then(
-                        pl.lit(
-                            row[col],
-                            dtype=original_dtype,
-                            allow_object=True,
-                        ),
-                    )
+                    .then(pl.lit(row[col], dtype=original_dtype, allow_object=True))
                     .otherwise(pl.col(col))
                    .alias(col),
                 )
@@ -1065,9 +1021,7 @@ def restore_features(self, samples=None, maps=False):
             updates_made += 1
 
         if updates_made > 0:
-            self.logger.debug(
-                f"Updated {updates_made} features from sample {sample_name}",
-            )
+            self.logger.debug(f"Updated {updates_made} features from sample {sample_name}")
 
         # If maps is True, load featureXML data
         if maps:
@@ -1078,9 +1032,7 @@ def restore_features(self, samples=None, maps=False):
             self.logger.error(f"Failed to load sample {sample_name}: {e}")
             continue
 
-    self.logger.info(
-        f"Completed restoring columns {columns_to_update} from {len(sample_uids)} samples",
-    )
+    self.logger.info(f"Completed restoring columns {columns_to_update} from {len(sample_uids)} samples")
 
 
 def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
@@ -1100,8 +1052,8 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     """
     import datetime
     import numpy as np
-    from master.sample.sample import Sample
-    from master.chromatogram import Chromatogram
+    from masster.sample.sample import Sample
+    from masster.chromatogram import Chromatogram
 
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
@@ -1177,9 +1129,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                 feature_uid = study_feature_mapping[key]
 
                 # Update only the chrom column
-                mask = (pl.col("feature_uid") == feature_uid) & (
-                    pl.col("sample_uid") == sample_uid
-                )
+                mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
                 self.features_df = self.features_df.with_columns(
                     pl.when(mask)
                     .then(pl.lit(chrom, dtype=pl.Object, allow_object=True))
@@ -1192,9 +1142,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             self.logger.error(f"Failed to load sample {sample_name}: {e}")
             continue
 
-    self.logger.info(
-        f"Phase 1 complete: Restored {restored_count} chromatograms from .sample5 files",
-    )
+    self.logger.info(f"Phase 1 complete: Restored {restored_count} chromatograms from .sample5 files")
 
     # Phase 2: Gap-fill remaining empty chromatograms (like fill_chrom)
     self.logger.info("Phase 2: Gap-filling remaining empty chromatograms...")
@@ -1208,9 +1156,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     )
 
     if empty_chroms == 0:
-        self.logger.info(
-            "All chromatograms restored from .sample5 files. No gap-filling needed.",
-        )
+        self.logger.info("All chromatograms restored from .sample5 files. No gap-filling needed.")
         return
 
     # Get consensus info for gap filling
@@ -1254,11 +1200,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             sample = Sample(log_level="ERROR")
             sample._load_sample5(sample_path, map=False)
 
-            if (
-                not hasattr(sample, "ms1_df")
-                or sample.ms1_df is None
-                or sample.ms1_df.is_empty()
-            ):
+            if not hasattr(sample, "ms1_df") or sample.ms1_df is None or sample.ms1_df.is_empty():
                 continue
 
             # Process each missing feature
@@ -1343,9 +1285,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     self.logger.info(
        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)",
    )
-    self.logger.info(
-        f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}",
-    )
+    self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")
 
 
 def compress_ms2(self, max_replicates=5):
@@ -1365,28 +1305,17 @@ def compress_ms2(self, max_replicates=5):
 
     # Create a ranking score based on number_frags * prec_inty
     # Handle None values by treating them as 0
-    self.consensus_ms2 = self.consensus_ms2.with_columns(
-        [
-            (
-                pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)
-            ).alias("ranking_score"),
-        ],
-    )
+    self.consensus_ms2 = self.consensus_ms2.with_columns([
+        (pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)).alias("ranking_score"),
+    ])
 
     # Group by consensus_uid and energy, then rank by score and keep top max_replicates
     compressed_ms2 = (
-        self.consensus_ms2.with_row_count(
-            "row_id",
-        )  # Add row numbers for stable sorting
-        .sort(
-            ["consensus_uid", "energy", "ranking_score", "row_id"],
-            descending=[False, False, True, False],
-        )
-        .with_columns(
-            [
-                pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"),
-            ],
-        )
+        self.consensus_ms2.with_row_count("row_id")  # Add row numbers for stable sorting
+        .sort(["consensus_uid", "energy", "ranking_score", "row_id"], descending=[False, False, True, False])
+        .with_columns([
+            pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"),
+        ])
        .filter(pl.col("rank") < max_replicates)
        .drop(["ranking_score", "row_id", "rank"])
    )
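The reformatted compress_ms2 chain is a standard polars top-k-per-group idiom: rank rows inside each group with pl.int_range(pl.len()).over(...) after a stable sort, then keep ranks below k. A self-contained sketch with toy columns mirroring the hunk (with_row_index is the newer name for with_row_count used in the diff):

```python
import polars as pl

df = pl.DataFrame({
    "consensus_uid": [1, 1, 1, 2, 2],
    "energy":        [20, 20, 20, 20, 20],
    "score":         [5.0, 9.0, 7.0, 3.0, 8.0],
})

top2 = (
    df.with_row_index("row_id")  # row numbers make the sort a stable tie-break
    .sort(["consensus_uid", "energy", "score", "row_id"], descending=[False, False, True, False])
    .with_columns(pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"))
    .filter(pl.col("rank") < 2)   # keep the top 2 rows per group
    .drop(["row_id", "rank"])
)
print(top2)  # the two highest-scoring rows per (consensus_uid, energy)
```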
@@ -1422,9 +1351,7 @@ def compress_chrom(self):
         pl.lit(None, dtype=pl.Object).alias("chrom"),
     )
 
-    self.logger.info(
-        f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df",
-    )
+    self.logger.info(f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df")
 
 
 # =====================================================================================
@@ -1475,9 +1402,7 @@ def sample_name_replace(self, replace_dict):
         if name in replace_dict:
             new_names.append(replace_dict[name])
             replaced_count += 1
-            self.logger.debug(
-                f"Replacing sample name: '{name}' -> '{replace_dict[name]}'",
-            )
+            self.logger.debug(f"Replacing sample name: '{name}' -> '{replace_dict[name]}'")
         else:
             new_names.append(name)
 
@@ -1490,9 +1415,7 @@ def sample_name_replace(self, replace_dict):
                 duplicates.append(name)
             else:
                 seen.add(name)
-        raise ValueError(
-            f"Resulting sample names are not unique. Duplicates found: {duplicates}",
-        )
+        raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
 
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
@@ -1541,9 +1464,7 @@ def sample_name_reset(self):
             name_without_ext = os.path.splitext(name_without_ext)[0]
 
         new_names.append(name_without_ext)
-        self.logger.debug(
-            f"Resetting sample name from path: '{path}' -> '{name_without_ext}'",
-        )
+        self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")
 
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
@@ -1554,18 +1475,14 @@ def sample_name_reset(self):
                 duplicates.append(name)
             else:
                 seen.add(name)
-        raise ValueError(
-            f"Resulting sample names are not unique. Duplicates found: {duplicates}",
-        )
+        raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
 
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
 
-    self.logger.info(
-        f"Successfully reset {len(new_names)} sample names from sample paths",
-    )
+    self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
 
 
 def set_source(self, filename):
@@ -1595,15 +1512,11 @@ def set_source(self, filename):
 
     new_sources = []
 
-    for i, (current_source, sample_name) in enumerate(
-        zip(current_sources, sample_names),
-    ):
+    for i, (current_source, sample_name) in enumerate(zip(current_sources, sample_names)):
         # Check if filename is just a directory path
         if os.path.isdir(filename):
             if current_source is None or current_source == "":
-                self.logger.warning(
-                    f"Cannot build path for sample '{sample_name}': no current file_source available",
-                )
+                self.logger.warning(f"Cannot build path for sample '{sample_name}': no current file_source available")
                 new_sources.append(current_source)
                 failed_count += 1
                 continue
@@ -1618,9 +1531,7 @@ def set_source(self, filename):
 
             # Check if the new file exists
             if not os.path.exists(new_file_path):
-                self.logger.warning(
-                    f"File does not exist for sample '{sample_name}': {new_file_path}",
-                )
+                self.logger.warning(f"File does not exist for sample '{sample_name}': {new_file_path}")
                 new_sources.append(current_source)
                 failed_count += 1
                 continue
@@ -1630,9 +1541,7 @@ def set_source(self, filename):
             updated_count += 1
 
             # Log individual updates at debug level
-            self.logger.debug(
-                f"Updated file_source for sample '{sample_name}': {current_source} -> {new_file_path}",
-            )
+            self.logger.debug(f"Updated file_source for sample '{sample_name}': {current_source} -> {new_file_path}")
 
     # Update the samples_df with new file_source values
     self.samples_df = self.samples_df.with_columns(
@@ -1726,9 +1635,7 @@ def features_select(
     if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
             min_mz, max_mz = mz
-            filter_conditions.append(
-                (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
-            )
+            filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             filter_conditions.append(pl.col("mz") >= mz)
 
@@ -1736,9 +1643,7 @@ def features_select(
     if rt is not None:
         if isinstance(rt, tuple) and len(rt) == 2:
             min_rt, max_rt = rt
-            filter_conditions.append(
-                (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
-            )
+            filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             filter_conditions.append(pl.col("rt") >= rt)
 
@@ -1746,9 +1651,7 @@ def features_select(
     if inty is not None:
         if isinstance(inty, tuple) and len(inty) == 2:
             min_inty, max_inty = inty
-            filter_conditions.append(
-                (pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty),
-            )
+            filter_conditions.append((pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty))
         else:
            filter_conditions.append(pl.col("inty") >= inty)
 
@@ -1758,10 +1661,7 @@ def features_select(
         if len(sample_uid) == 2 and not isinstance(sample_uid, list):
             # Treat as range
             min_uid, max_uid = sample_uid
-            filter_conditions.append(
-                (pl.col("sample_uid") >= min_uid)
-                & (pl.col("sample_uid") <= max_uid),
-            )
+            filter_conditions.append((pl.col("sample_uid") >= min_uid) & (pl.col("sample_uid") <= max_uid))
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
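All of these selector hunks share one dispatch rule: a 2-tuple means an inclusive range, a list means explicit membership, and a bare scalar means a lower bound (or exact match for identifiers). A compact sketch of that convention as a standalone helper (hypothetical name, not part of masster):

```python
import polars as pl

def selector_condition(col: str, value) -> pl.Expr:
    """2-tuple -> inclusive range; list -> membership; scalar -> lower bound."""
    if isinstance(value, tuple) and len(value) == 2:
        lo, hi = value
        return (pl.col(col) >= lo) & (pl.col(col) <= hi)
    if isinstance(value, list):
        return pl.col(col).is_in(value)
    return pl.col(col) >= value

df = pl.DataFrame({"mz": [100.0, 250.0, 400.0]})
print(df.filter(selector_condition("mz", (200.0, 300.0))))  # keeps only 250.0
```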
@@ -1791,10 +1691,7 @@ def features_select(
         if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
             # Treat as range
             min_uid, max_uid = consensus_uid
-            filter_conditions.append(
-                (pl.col("consensus_uid") >= min_uid)
-                & (pl.col("consensus_uid") <= max_uid),
-            )
+            filter_conditions.append((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
         else:
             # Treat as list
             filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
@@ -1807,10 +1704,7 @@ def features_select(
         if len(feature_uid) == 2 and not isinstance(feature_uid, list):
             # Treat as range
             min_uid, max_uid = feature_uid
-            filter_conditions.append(
-                (pl.col("feature_uid") >= min_uid)
-                & (pl.col("feature_uid") <= max_uid),
-            )
+            filter_conditions.append((pl.col("feature_uid") >= min_uid) & (pl.col("feature_uid") <= max_uid))
         else:
             # Treat as list
             filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
@@ -1832,10 +1726,7 @@ def features_select(
     if "quality" in available_columns:
         if isinstance(quality, tuple) and len(quality) == 2:
             min_quality, max_quality = quality
-            filter_conditions.append(
-                (pl.col("quality") >= min_quality)
-                & (pl.col("quality") <= max_quality),
-            )
+            filter_conditions.append((pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality))
         else:
             filter_conditions.append(pl.col("quality") >= quality)
     else:
@@ -1847,8 +1738,7 @@ def features_select(
         if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
             min_coherence, max_coherence = chrom_coherence
             filter_conditions.append(
-                (pl.col("chrom_coherence") >= min_coherence)
-                & (pl.col("chrom_coherence") <= max_coherence),
+                (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence),
             )
         else:
             filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
@@ -1861,8 +1751,7 @@ def features_select(
         if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
             min_prominence, max_prominence = chrom_prominence
             filter_conditions.append(
-                (pl.col("chrom_prominence") >= min_prominence)
-                & (pl.col("chrom_prominence") <= max_prominence),
+                (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence),
             )
         else:
             filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
@@ -1872,19 +1761,14 @@ def features_select(
     # Filter by scaled chromatogram prominence
     if chrom_prominence_scaled is not None:
         if "chrom_prominence_scaled" in available_columns:
-            if (
-                isinstance(chrom_prominence_scaled, tuple)
-                and len(chrom_prominence_scaled) == 2
-            ):
+            if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
                 min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
                 filter_conditions.append(
                     (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
                     & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
                 )
             else:
-                filter_conditions.append(
-                    pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled,
-                )
+                filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
         else:
             warnings.append("'chrom_prominence_scaled' column not found in features_df")
 
@@ -1898,9 +1782,7 @@ def features_select(
                     & (pl.col("chrom_height_scaled") <= max_height_scaled),
                 )
             else:
-                filter_conditions.append(
-                    pl.col("chrom_height_scaled") >= chrom_height_scaled,
-                )
+                filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
         else:
             warnings.append("'chrom_height_scaled' column not found in features_df")
 
@@ -1992,14 +1874,9 @@ def features_filter(self, features):
 
     # Apply filter to consensus_mapping_df if it exists - batch operation
     mapping_removed_count = 0
-    if (
-        self.consensus_mapping_df is not None
-        and not self.consensus_mapping_df.is_empty()
-    ):
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
-        self.consensus_mapping_df = (
-            self.consensus_mapping_df.lazy().filter(filter_condition).collect()
-        )
+        self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
         mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
 
     # Calculate results once and log efficiently
@@ -2012,9 +1889,7 @@ def features_filter(self, features):
             f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features.",
         )
     else:
-        self.logger.info(
-            f"Kept {final_count} features. Filtered out {removed_count} features.",
-        )
+        self.logger.info(f"Kept {final_count} features. Filtered out {removed_count} features.")
 
 
 def features_delete(self, features):
@@ -2076,14 +1951,9 @@ def features_delete(self, features):
 
     # Apply filter to consensus_mapping_df if it exists - batch operation
     mapping_removed_count = 0
-    if (
-        self.consensus_mapping_df is not None
-        and not self.consensus_mapping_df.is_empty()
-    ):
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
-        self.consensus_mapping_df = (
-            self.consensus_mapping_df.lazy().filter(filter_condition).collect()
-        )
+        self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
         mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
 
     # Calculate results once and log efficiently
@@ -2096,9 +1966,7 @@ def features_delete(self, features):
             f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}",
         )
     else:
-        self.logger.info(
-            f"Deleted {removed_count} features. Remaining features: {final_count}",
-        )
+        self.logger.info(f"Deleted {removed_count} features. Remaining features: {final_count}")
 
 
 def consensus_select(
@@ -2171,9 +2039,7 @@ def consensus_select(
             else:
                 # Standard (min_mz, max_mz) format
                 min_mz, max_mz = mz
-                consensus = consensus.filter(
-                    (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
-                )
+                consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             # Single float value - use default mz tolerance from study parameters
             default_mz_tol = getattr(self, "parameters", None)
@@ -2181,15 +2047,13 @@ def consensus_select(
                 default_mz_tol = default_mz_tol.eic_mz_tol
             else:
                 # Fallback to align_defaults if study parameters not available
-                from master.study.defaults.align_def import align_defaults
+                from masster.study.defaults.align_def import align_defaults
 
                 default_mz_tol = align_defaults().mz_max_diff
 
             min_mz = mz - default_mz_tol
             max_mz = mz + default_mz_tol
-            consensus = consensus.filter(
-                (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
-            )
+            consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
 
         self.logger.debug(
             f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
@@ -2209,9 +2073,7 @@ def consensus_select(
             else:
                 # Standard (min_rt, max_rt) format
                 min_rt, max_rt = rt
-                consensus = consensus.filter(
-                    (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
-                )
+                consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             # Single float value - use default rt tolerance from study parameters
             default_rt_tol = getattr(self, "parameters", None)
@@ -2219,15 +2081,13 @@ def consensus_select(
                 default_rt_tol = default_rt_tol.eic_rt_tol
             else:
                 # Fallback to align_defaults if study parameters not available
-                from master.study.defaults.align_def import align_defaults
+                from masster.study.defaults.align_def import align_defaults
 
                 default_rt_tol = align_defaults().rt_max_diff
 
             min_rt = rt - default_rt_tol
             max_rt = rt + default_rt_tol
-            consensus = consensus.filter(
-                (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
-            )
+            consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
 
         self.logger.debug(
             f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
@@ -2238,9 +2098,7 @@ def consensus_select(
         consensus_len_before_filter = len(consensus)
         if isinstance(inty_mean, tuple) and len(inty_mean) == 2:
             min_inty, max_inty = inty_mean
-            consensus = consensus.filter(
-                (pl.col("inty_mean") >= min_inty) & (pl.col("inty_mean") <= max_inty),
-            )
+            consensus = consensus.filter((pl.col("inty_mean") >= min_inty) & (pl.col("inty_mean") <= max_inty))
         else:
             consensus = consensus.filter(pl.col("inty_mean") >= inty_mean)
         self.logger.debug(
@@ -2255,14 +2113,11 @@ def consensus_select(
                 # Treat as range
                 min_uid, max_uid = consensus_uid
                 consensus = consensus.filter(
-                    (pl.col("consensus_uid") >= min_uid)
-                    & (pl.col("consensus_uid") <= max_uid),
+                    (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid),
                 )
             else:
                 # Treat as list
-                consensus = consensus.filter(
-                    pl.col("consensus_uid").is_in(consensus_uid),
-                )
+                consensus = consensus.filter(pl.col("consensus_uid").is_in(consensus_uid))
         else:
             consensus = consensus.filter(pl.col("consensus_uid") == consensus_uid)
         self.logger.debug(
@@ -2286,8 +2141,7 @@ def consensus_select(
         if isinstance(number_samples, tuple) and len(number_samples) == 2:
             min_samples, max_samples = number_samples
             consensus = consensus.filter(
-                (pl.col("number_samples") >= min_samples)
-                & (pl.col("number_samples") <= max_samples),
+                (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples),
             )
         else:
             consensus = consensus.filter(pl.col("number_samples") >= number_samples)
@@ -2301,10 +2155,7 @@ def consensus_select(
         if "number_ms2" in consensus.columns:
             if isinstance(number_ms2, tuple) and len(number_ms2) == 2:
                 min_ms2, max_ms2 = number_ms2
-                consensus = consensus.filter(
-                    (pl.col("number_ms2") >= min_ms2)
-                    & (pl.col("number_ms2") <= max_ms2),
-                )
+                consensus = consensus.filter((pl.col("number_ms2") >= min_ms2) & (pl.col("number_ms2") <= max_ms2))
             else:
                 consensus = consensus.filter(pl.col("number_ms2") >= number_ms2)
         else:
@@ -2318,9 +2169,7 @@ def consensus_select(
         consensus_len_before_filter = len(consensus)
         if isinstance(quality, tuple) and len(quality) == 2:
             min_quality, max_quality = quality
-            consensus = consensus.filter(
-                (pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality),
-            )
+            consensus = consensus.filter((pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality))
         else:
             consensus = consensus.filter(pl.col("quality") >= quality)
         self.logger.debug(
@@ -2333,9 +2182,7 @@ def consensus_select(
         if "bl" in consensus.columns:
             if isinstance(bl, tuple) and len(bl) == 2:
                 min_bl, max_bl = bl
-                consensus = consensus.filter(
-                    (pl.col("bl") >= min_bl) & (pl.col("bl") <= max_bl),
-                )
+                consensus = consensus.filter((pl.col("bl") >= min_bl) & (pl.col("bl") <= max_bl))
             else:
                 consensus = consensus.filter(pl.col("bl") >= bl)
         else:
@@ -2348,23 +2195,16 @@ def consensus_select(
     if chrom_coherence_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_coherence_mean" in consensus.columns:
-            if (
-                isinstance(chrom_coherence_mean, tuple)
-                and len(chrom_coherence_mean) == 2
-            ):
+            if isinstance(chrom_coherence_mean, tuple) and len(chrom_coherence_mean) == 2:
                 min_coherence, max_coherence = chrom_coherence_mean
                 consensus = consensus.filter(
                     (pl.col("chrom_coherence_mean") >= min_coherence)
                     & (pl.col("chrom_coherence_mean") <= max_coherence),
                 )
             else:
-                consensus = consensus.filter(
-                    pl.col("chrom_coherence_mean") >= chrom_coherence_mean,
-                )
+                consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
         else:
-            self.logger.warning(
-                "'chrom_coherence_mean' column not found in consensus_df",
-            )
+            self.logger.warning("'chrom_coherence_mean' column not found in consensus_df")
         self.logger.debug(
             f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2373,23 +2213,16 @@ def consensus_select(
     if chrom_prominence_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_prominence_mean" in consensus.columns:
-            if (
-                isinstance(chrom_prominence_mean, tuple)
-                and len(chrom_prominence_mean) == 2
-            ):
+            if isinstance(chrom_prominence_mean, tuple) and len(chrom_prominence_mean) == 2:
                 min_prominence, max_prominence = chrom_prominence_mean
                 consensus = consensus.filter(
                     (pl.col("chrom_prominence_mean") >= min_prominence)
                     & (pl.col("chrom_prominence_mean") <= max_prominence),
                 )
             else:
-                consensus = consensus.filter(
-                    pl.col("chrom_prominence_mean") >= chrom_prominence_mean,
-                )
+                consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
         else:
-            self.logger.warning(
-                "'chrom_prominence_mean' column not found in consensus_df",
-            )
+            self.logger.warning("'chrom_prominence_mean' column not found in consensus_df")
         self.logger.debug(
             f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2398,26 +2231,16 @@ def consensus_select(
     if chrom_prominence_scaled_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_prominence_scaled_mean" in consensus.columns:
-            if (
-                isinstance(chrom_prominence_scaled_mean, tuple)
-                and len(chrom_prominence_scaled_mean) == 2
-            ):
-                min_prominence_scaled, max_prominence_scaled = (
-                    chrom_prominence_scaled_mean
-                )
+            if isinstance(chrom_prominence_scaled_mean, tuple) and len(chrom_prominence_scaled_mean) == 2:
+                min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
                 consensus = consensus.filter(
                     (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
                     & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
                 )
             else:
-                consensus = consensus.filter(
-                    pl.col("chrom_prominence_scaled_mean")
-                    >= chrom_prominence_scaled_mean,
-                )
+                consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
         else:
-            self.logger.warning(
-                "'chrom_prominence_scaled_mean' column not found in consensus_df",
-            )
+            self.logger.warning("'chrom_prominence_scaled_mean' column not found in consensus_df")
         self.logger.debug(
             f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
        )
@@ -2426,23 +2249,16 @@ def consensus_select(
     if chrom_height_scaled_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_height_scaled_mean" in consensus.columns:
-            if (
-                isinstance(chrom_height_scaled_mean, tuple)
-                and len(chrom_height_scaled_mean) == 2
-            ):
+            if isinstance(chrom_height_scaled_mean, tuple) and len(chrom_height_scaled_mean) == 2:
                 min_height_scaled, max_height_scaled = chrom_height_scaled_mean
                 consensus = consensus.filter(
                     (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
                     & (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
                 )
             else:
-                consensus = consensus.filter(
-                    pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean,
-                )
+                consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
         else:
-            self.logger.warning(
-                "'chrom_height_scaled_mean' column not found in consensus_df",
-            )
+            self.logger.warning("'chrom_height_scaled_mean' column not found in consensus_df")
         self.logger.debug(
             f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2454,8 +2270,7 @@ def consensus_select(
         if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
             min_rt_delta, max_rt_delta = rt_delta_mean
             consensus = consensus.filter(
-                (pl.col("rt_delta_mean") >= min_rt_delta)
-                & (pl.col("rt_delta_mean") <= max_rt_delta),
+                (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta),
             )
         else:
             consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
@@ -2466,13 +2281,9 @@ def consensus_select(
     )
 
     if len(consensus) == 0:
-        self.logger.warning(
-            "No consensus features remaining after applying selection criteria.",
-        )
+        self.logger.warning("No consensus features remaining after applying selection criteria.")
     else:
-        self.logger.info(
-            f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})",
-        )
+        self.logger.info(f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})")
 
     # Sort the results if sortby is specified
     if sortby is not None:
@@ -2481,25 +2292,19 @@ def consensus_select(
         if sortby in consensus.columns:
             consensus = consensus.sort(sortby, descending=descending)
         else:
-            self.logger.warning(
-                f"Sort column '{sortby}' not found in consensus DataFrame",
-            )
+            self.logger.warning(f"Sort column '{sortby}' not found in consensus DataFrame")
     elif isinstance(sortby, (list, tuple)):
         # Multiple columns
         valid_columns = [col for col in sortby if col in consensus.columns]
         invalid_columns = [col for col in sortby if col not in consensus.columns]
 
         if invalid_columns:
-            self.logger.warning(
-                f"Sort columns not found in consensus DataFrame: {invalid_columns}",
-            )
+            self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
 
         if valid_columns:
             consensus = consensus.sort(valid_columns, descending=descending)
     else:
-        self.logger.warning(
-            f"Invalid sortby parameter type: {type(sortby)}. Expected str, list, or tuple.",
-        )
+        self.logger.warning(f"Invalid sortby parameter type: {type(sortby)}. Expected str, list, or tuple.")
 
     return consensus
 
@@ -2544,10 +2349,7 @@ def consensus_filter(self, consensus):
 
     # Get feature_uids that need to be removed from features_df
     feature_uids_to_remove = []
-    if (
-        self.consensus_mapping_df is not None
-        and not self.consensus_mapping_df.is_empty()
-    ):
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         feature_uids_to_remove = self.consensus_mapping_df.filter(
             pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )["feature_uid"].to_list()
@@ -2558,42 +2360,27 @@ def consensus_filter(self, consensus):
     )
 
     # Remove from consensus_mapping_df
-    if (
-        self.consensus_mapping_df is not None
-        and not self.consensus_mapping_df.is_empty()
-    ):
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
         self.consensus_mapping_df = self.consensus_mapping_df.filter(
             ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )
         removed_mapping_count = initial_mapping_count - len(self.consensus_mapping_df)
         if removed_mapping_count > 0:
-            self.logger.debug(
-                f"Removed {removed_mapping_count} entries from consensus_mapping_df",
-            )
+            self.logger.debug(f"Removed {removed_mapping_count} entries from consensus_mapping_df")
 
     # Remove corresponding features from features_df
-    if (
-        feature_uids_to_remove
-        and self.features_df is not None
-        and not self.features_df.is_empty()
-    ):
+    if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
         initial_features_count = len(self.features_df)
         self.features_df = self.features_df.filter(
             ~pl.col("feature_uid").is_in(feature_uids_to_remove),
         )
         removed_features_count = initial_features_count - len(self.features_df)
         if removed_features_count > 0:
-            self.logger.debug(
-                f"Removed {removed_features_count} entries from features_df",
-            )
+            self.logger.debug(f"Removed {removed_features_count} entries from features_df")
 
     # Remove from consensus_ms2 if it exists
-    if (
-        hasattr(self, "consensus_ms2")
-        and self.consensus_ms2 is not None
-        and not self.consensus_ms2.is_empty()
-    ):
+    if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
         initial_ms2_count = len(self.consensus_ms2)
         self.consensus_ms2 = self.consensus_ms2.filter(
             ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
@@ -2693,10 +2480,7 @@ def samples_select(
         if len(sample_uid) == 2 and not isinstance(sample_uid, list):
             # Treat as range
             min_uid, max_uid = sample_uid
-            filter_conditions.append(
-                (pl.col("sample_uid") >= min_uid)
-                & (pl.col("sample_uid") <= max_uid),
-            )
+            filter_conditions.append((pl.col("sample_uid") >= min_uid) & (pl.col("sample_uid") <= max_uid))
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
@@ -2738,8 +2522,7 @@ def samples_select(
             # Treat as range
             min_batch, max_batch = sample_batch
             filter_conditions.append(
-                (pl.col("sample_batch") >= min_batch)
-                & (pl.col("sample_batch") <= max_batch),
+                (pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch),
             )
         else:
             # Treat as list
@@ -2757,14 +2540,11 @@ def samples_select(
                 # Treat as range
                 min_seq, max_seq = sample_sequence
                 filter_conditions.append(
-                    (pl.col("sample_sequence") >= min_seq)
-                    & (pl.col("sample_sequence") <= max_seq),
+                    (pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq),
                 )
             else:
                 # Treat as list
-                filter_conditions.append(
-                    pl.col("sample_sequence").is_in(sample_sequence),
-                )
+                filter_conditions.append(pl.col("sample_sequence").is_in(sample_sequence))
         else:
             filter_conditions.append(pl.col("sample_sequence") == sample_sequence)
     else:
@@ -2776,8 +2556,7 @@ def samples_select(
         if isinstance(num_features, tuple) and len(num_features) == 2:
             min_features, max_features = num_features
             filter_conditions.append(
-                (pl.col("num_features") >= min_features)
-                & (pl.col("num_features") <= max_features),
+                (pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features),
             )
         else:
             filter_conditions.append(pl.col("num_features") >= num_features)
@@ -2789,9 +2568,7 @@ def samples_select(
     if "num_ms1" in available_columns:
         if isinstance(num_ms1, tuple) and len(num_ms1) == 2:
             min_ms1, max_ms1 = num_ms1
-            filter_conditions.append(
-                (pl.col("num_ms1") >= min_ms1) & (pl.col("num_ms1") <= max_ms1),
-            )
+            filter_conditions.append((pl.col("num_ms1") >= min_ms1) & (pl.col("num_ms1") <= max_ms1))
         else:
             filter_conditions.append(pl.col("num_ms1") >= num_ms1)
     else:
@@ -2802,9 +2579,7 @@ def samples_select(
     if "num_ms2" in available_columns:
         if isinstance(num_ms2, tuple) and len(num_ms2) == 2:
             min_ms2, max_ms2 = num_ms2
-            filter_conditions.append(
-                (pl.col("num_ms2") >= min_ms2) & (pl.col("num_ms2") <= max_ms2),
-            )
+            filter_conditions.append((pl.col("num_ms2") >= min_ms2) & (pl.col("num_ms2") <= max_ms2))
        else:
            filter_conditions.append(pl.col("num_ms2") >= num_ms2)
    else:
@@ -2896,9 +2671,7 @@ def samples_delete(self, samples):
     if len(sample_uids_set) < len(sample_uids_to_remove) * 0.8:
         sample_uids_to_remove = list(sample_uids_set)
 
-    self.logger.info(
-        f"Deleting {len(sample_uids_to_remove)} samples and all related data...",
-    )
+    self.logger.info(f"Deleting {len(sample_uids_to_remove)} samples and all related data...")
 
     # Get feature_uids that need to be removed from features_df
     feature_uids_to_remove = []
@@ -2926,11 +2699,7 @@ def samples_delete(self, samples):
 
     # 2. Remove corresponding features from features_df
     removed_features_count = 0
-    if (
-        feature_uids_to_remove
-        and self.features_df is not None
-        and not self.features_df.is_empty()
-    ):
+    if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
         self.features_df = self.features_df.filter(
             ~pl.col("sample_uid").is_in(sample_uids_to_remove),
         )
@@ -2938,11 +2707,7 @@ def samples_delete(self, samples):
 
     # 3. Remove from consensus_mapping_df
     removed_mapping_count = 0
-    if (
-        feature_uids_to_remove
-        and self.consensus_mapping_df is not None
-        and not self.consensus_mapping_df.is_empty()
-    ):
+    if feature_uids_to_remove and self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
         self.consensus_mapping_df = self.consensus_mapping_df.filter(
             ~pl.col("feature_uid").is_in(feature_uids_to_remove),
@@ -2951,11 +2716,7 @@ def samples_delete(self, samples):
 
     # 4. Remove from consensus_ms2 if it exists
    removed_ms2_count = 0
-    if (
-        hasattr(self, "consensus_ms2")
-        and self.consensus_ms2 is not None
-        and not self.consensus_ms2.is_empty()
-    ):
+    if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
         initial_ms2_count = len(self.consensus_ms2)
         self.consensus_ms2 = self.consensus_ms2.filter(
             ~pl.col("sample_uid").is_in(sample_uids_to_remove),
@@ -2964,11 +2725,7 @@ def samples_delete(self, samples):
 
     # 5. Remove from feature_maps and update map_id
     removed_maps_count = 0
-    if (
-        hasattr(self, "feature_maps")
-        and self.feature_maps is not None
-        and map_ids_to_remove
-    ):
+    if hasattr(self, "feature_maps") and self.feature_maps is not None and map_ids_to_remove:
         # Remove feature maps in reverse order to maintain indices
         for map_id in sorted(map_ids_to_remove, reverse=True):
             if 0 <= map_id < len(self.feature_maps):
@@ -3009,9 +2766,7 @@ def samples_delete(self, samples):
 
     # Update map_id indices if needed
     if removed_maps_count > 0 and final_sample_count > 0:
-        self.logger.debug(
-            f"Updated map_id values to range from 0 to {final_sample_count - 1}",
-        )
+        self.logger.debug(f"Updated map_id values to range from 0 to {final_sample_count - 1}")
 
 
 # =====================================================================================
@@ -3182,9 +2937,7 @@ def sample_color(self, by=None, palette="Turbo256"):
     )
 
     if isinstance(by, list):
-        self.logger.debug(
-            f"Set sample colors using provided color list ({len(by)} colors)",
-        )
+        self.logger.debug(f"Set sample colors using provided color list ({len(by)} colors)")
     elif by is None:
         self.logger.debug(f"Set sequential sample colors using {palette} palette")
     else:
@@ -3219,9 +2972,7 @@ def sample_color_reset(self):
         # Distribute samples evenly across the full colormap range
         for i in range(n_samples):
             # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
-            normalized_value = (
-                i + 0.5
-            ) / n_samples  # +0.5 to center samples in their bins
+            normalized_value = (i + 0.5) / n_samples  # +0.5 to center samples in their bins
             # Optionally, map to a subset of colormap to avoid extreme colors
             # Use 10% to 90% of colormap range for better color diversity
             normalized_value = 0.1 + (normalized_value * 0.8)
@@ -3242,14 +2993,10 @@ def sample_color_reset(self):
             pl.Series("sample_color", colors).alias("sample_color"),
         )
 
-        self.logger.debug(
-            f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)",
-        )
+        self.logger.debug(f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)")
 
     except ImportError:
-        self.logger.error(
-            "cmap library is required for sample color reset. Install with: uv add cmap",
-        )
+        self.logger.error("cmap library is required for sample color reset. Install with: uv add cmap")
     except Exception as e:
         self.logger.error(f"Failed to reset sample colors: {e}")
 
@@ -3270,9 +3017,7 @@ def _get_color_palette(palette_name):
  try:
  from cmap import Colormap
  except ImportError:
- raise ValueError(
- "cmap library is required for color palettes. Install with: pip install cmap",
- )
+ raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")

  # Map common palette names to cmap names
  palette_mapping = {
@@ -3367,9 +3112,7 @@ def _sample_colors_from_colormap(palette_name, n_colors):
  try:
  from cmap import Colormap
  except ImportError:
- raise ValueError(
- "cmap library is required for color palettes. Install with: pip install cmap",
- )
+ raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")

  # Map common palette names to cmap names (same as _get_color_palette)
  palette_mapping = {
@@ -3407,9 +3150,7 @@ def _sample_colors_from_colormap(palette_name, n_colors):
  # Distribute samples evenly across the full colormap range (same approach as sample_color_reset)
  for i in range(n_colors):
  # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
- normalized_value = (
- i + 0.5
- ) / n_colors # +0.5 to center samples in their bins
+ normalized_value = (i + 0.5) / n_colors # +0.5 to center samples in their bins
  # Map to a subset of colormap to avoid extreme colors (use 10% to 90% range)
  normalized_value = 0.1 + (normalized_value * 0.8)

@@ -3454,7 +3195,7 @@ def _ensure_features_df_schema_order(self):
  try:
  import os
  import json
- from master.study.h5 import _reorder_columns_by_schema
+ from masster.study.h5 import _reorder_columns_by_schema

  # Load schema
  schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
@@ -3462,11 +3203,7 @@ def _ensure_features_df_schema_order(self):
  schema = json.load(f)

  # Reorder columns to match schema
- self.features_df = _reorder_columns_by_schema(
- self.features_df,
- schema,
- "features_df",
- )
+ self.features_df = _reorder_columns_by_schema(self.features_df, schema, "features_df")

  except Exception as e:
  self.logger.warning(f"Failed to reorder features_df columns: {e}")
@@ -3508,19 +3245,17 @@ def migrate_map_id_to_index(self):
  # Ensure the column is Int64 type
  self.samples_df = self.samples_df.cast({"map_id": pl.Int64})

- self.logger.info(
- f"Successfully migrated {sample_count} samples to indexed map_id format",
- )
+ self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
  self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")


  def restore_ms2(self, samples=None, **kwargs):
  """
  Restore MS2 data by re-running find_ms2 on specified samples.
-
+
  This function rebuilds the consensus_ms2 DataFrame by re-extracting MS2 spectra
  from the original sample files. Use this to reverse the effects of compress_ms2().
-
+
  Parameters:
  samples (list, optional): List of sample_uids or sample_names to process.
  If None, processes all samples.
@@ -3530,37 +3265,31 @@ def restore_ms2(self, samples=None, **kwargs):
  if self.features_df is None or self.features_df.is_empty():
  self.logger.error("No features_df found in study.")
  return
-
+
  if self.samples_df is None or self.samples_df.is_empty():
  self.logger.error("No samples_df found in study.")
  return
-
- # Get sample_uids to process
+
+ # Get sample_uids to process
  sample_uids = self._get_sample_uids(samples)
  if not sample_uids:
  self.logger.warning("No valid samples specified.")
  return
-
+
  self.logger.info(f"Restoring MS2 data from {len(sample_uids)} samples...")
-
+
  # Clear existing consensus_ms2 to rebuild from scratch
- initial_ms2_count = (
- len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
- )
+ initial_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
  self.consensus_ms2 = pl.DataFrame()
-
+
  # Re-run find_ms2 which will rebuild consensus_ms2
  try:
  self.find_ms2(**kwargs)
-
- final_ms2_count = (
- len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
- )
-
- self.logger.info(
- f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra",
- )
-
+
+ final_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+
+ self.logger.info(f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra")
+
  except Exception as e:
  self.logger.error(f"Failed to restore MS2 data: {e}")
  raise
@@ -3569,51 +3298,51 @@ def restore_ms2(self, samples=None, **kwargs):
  def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs):
  """
  Reverse any compression effects by restoring compressed data adaptively.
-
+
  This function restores data that was compressed using compress(), compress_features(),
  compress_ms2(), compress_chrom(), or study.save(compress=True). It optimizes the
  decompression process for speed by only processing what actually needs restoration.
-
+
  Parameters:
  features (bool): Restore features data (ms2_specs, ms2_scans, chrom_area)
- ms2 (bool): Restore MS2 spectra by re-running find_ms2()
+ ms2 (bool): Restore MS2 spectra by re-running find_ms2()
  chrom (bool): Restore chromatogram objects
  samples (list, optional): List of sample_uids or sample_names to process.
  If None, processes all samples.
  **kwargs: Additional keyword arguments for restoration functions:
  - For restore_chrom: mz_tol (default: 0.010), rt_tol (default: 10.0)
  - For restore_ms2/find_ms2: mz_tol, centroid, deisotope, etc.
-
+
  Performance Optimizations:
  - Adaptive processing: Only restores what actually needs restoration
  - Processes features and chromatograms together when possible (shared file I/O)
  - Uses cached sample instances to avoid repeated file loading
  - Processes MS2 restoration last as it's the most computationally expensive
  - Provides detailed progress information for long-running operations
-
+
  Example:
  # Restore everything (but only what needs restoration)
  study.decompress()
-
+
  # Restore only chromatograms with custom tolerances
  study.decompress(features=False, ms2=False, chrom=True, mz_tol=0.005, rt_tol=5.0)
-
+
  # Restore specific samples only
  study.decompress(samples=["sample1", "sample2"])
  """
  if not any([features, ms2, chrom]):
  self.logger.warning("No decompression operations specified.")
  return
-
+
  # Get sample_uids to process
  sample_uids = self._get_sample_uids(samples)
  if not sample_uids:
  self.logger.warning("No valid samples specified.")
  return
-
+
  # Adaptively check what actually needs to be done
  import polars as pl
-
+
  # Check if features need restoration (more sophisticated logic)
  features_need_restoration = False
  if features and not self.features_df.is_empty():
@@ -3622,7 +3351,7 @@ def decompress(features=True, ms2=True, chrom=True, samples=None, **kwargs
  for col in ["ms2_scans", "ms2_specs"]:
  if col not in self.features_df.columns:
  missing_cols.append(col)
-
+
  # If columns are missing entirely, we likely need restoration
  if missing_cols:
  features_need_restoration = True
@@ -3631,15 +3360,13 @@ def decompress(features=True, ms2=True, chrom=True, samples=None, **kwargs
  # But be smart about it - only check if we have consensus features with MS2
  if not self.consensus_ms2.is_empty():
  # We have MS2 data, so ms2_specs should have some content
- null_ms2_specs = self.features_df.filter(
- pl.col("ms2_specs").is_null(),
- ).height
+ null_ms2_specs = self.features_df.filter(pl.col("ms2_specs").is_null()).height
  total_features = len(self.features_df)
  # If more than 90% are null but we have MS2 data, likely compressed
  if null_ms2_specs > (total_features * 0.9):
  features_need_restoration = True
-
- # Check if chromatograms need restoration
+
+ # Check if chromatograms need restoration
  chrom_need_restoration = False
  if chrom and not self.features_df.is_empty():
  if "chrom" not in self.features_df.columns:
@@ -3650,26 +3377,22 @@ def decompress(features=True, ms2=True, chrom=True, samples=None, **kwargs
  total_features = len(self.features_df)
  # If more than 50% are null, likely need restoration
  chrom_need_restoration = null_chroms > (total_features * 0.5)
-
+
  # Check if MS2 data might need restoration (compare expected vs actual)
  ms2_need_restoration = False
  if ms2:
- current_ms2_count = (
- len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
- )
- consensus_count = (
- len(self.consensus_df) if not self.consensus_df.is_empty() else 0
- )
-
+ current_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+ consensus_count = len(self.consensus_df) if not self.consensus_df.is_empty() else 0
+
  if consensus_count > 0:
  # Calculate expected MS2 count based on consensus features with MS2 potential
  # This is a heuristic - if we have very few MS2 compared to consensus, likely compressed
  expected_ratio = 3.0 # Expect at least 3 MS2 per consensus on average
  expected_ms2 = consensus_count * expected_ratio
-
+
  if current_ms2_count < min(expected_ms2 * 0.3, consensus_count * 0.8):
  ms2_need_restoration = True
-
+
  # Build list of operations that actually need to be done
  operations_needed = []
  if features and features_need_restoration:
@@ -3678,75 +3401,59 @@ def decompress(features=True, ms2=True, chrom=True, samples=None, **kwargs
  operations_needed.append("chromatograms")
  if ms2 and ms2_need_restoration:
  operations_needed.append("MS2 spectra")
-
+
  # Early exit if nothing needs to be done
  if not operations_needed:
- self.logger.info(
- "All data appears to be already decompressed. No operations needed.",
- )
+ self.logger.info("All data appears to be already decompressed. No operations needed.")
  return
-
- self.logger.info(
- f"Starting adaptive decompression: {', '.join(operations_needed)} from {len(sample_uids)} samples",
- )
-
+
+ self.logger.info(f"Starting adaptive decompression: {', '.join(operations_needed)} from {len(sample_uids)} samples")
+
  try:
  # Phase 1: Restore features and chromatograms together (shared file I/O)
- if "features" in operations_needed and "chromatograms" in operations_needed:
- self.logger.info(
- "Phase 1: Restoring features and chromatograms together...",
- )
-
+ if ("features" in operations_needed and "chromatograms" in operations_needed):
+ self.logger.info("Phase 1: Restoring features and chromatograms together...")
+
  # Extract relevant kwargs for restore_features and restore_chrom
  restore_kwargs = {}
- if "mz_tol" in kwargs:
- restore_kwargs["mz_tol"] = kwargs["mz_tol"]
- if "rt_tol" in kwargs:
- restore_kwargs["rt_tol"] = kwargs["rt_tol"]
-
+ if 'mz_tol' in kwargs:
+ restore_kwargs['mz_tol'] = kwargs['mz_tol']
+ if 'rt_tol' in kwargs:
+ restore_kwargs['rt_tol'] = kwargs['rt_tol']
+
  # Restore features first (includes chrom column)
  self.restore_features(samples=samples)
-
+
  # Then do additional chrom gap-filling if needed
  self.restore_chrom(samples=samples, **restore_kwargs)
-
- elif (
- "features" in operations_needed and "chromatograms" not in operations_needed
- ):
+
+ elif ("features" in operations_needed and "chromatograms" not in operations_needed):
  self.logger.info("Phase 1: Restoring features data...")
  self.restore_features(samples=samples)
-
- elif (
- "chromatograms" in operations_needed and "features" not in operations_needed
- ):
+
+ elif ("chromatograms" in operations_needed and "features" not in operations_needed):
  self.logger.info("Phase 1: Restoring chromatograms...")
  restore_kwargs = {}
- if "mz_tol" in kwargs:
- restore_kwargs["mz_tol"] = kwargs["mz_tol"]
- if "rt_tol" in kwargs:
- restore_kwargs["rt_tol"] = kwargs["rt_tol"]
+ if 'mz_tol' in kwargs:
+ restore_kwargs['mz_tol'] = kwargs['mz_tol']
+ if 'rt_tol' in kwargs:
+ restore_kwargs['rt_tol'] = kwargs['rt_tol']
  self.restore_chrom(samples=samples, **restore_kwargs)
-
+
  # Phase 2: Restore MS2 data (most computationally expensive, done last)
  if "MS2 spectra" in operations_needed:
  self.logger.info("Phase 2: Restoring MS2 spectra...")
-
+
  # Extract MS2-specific kwargs
  ms2_kwargs = {}
  for key, value in kwargs.items():
- if key in [
- "mz_tol",
- "centroid",
- "deisotope",
- "dia_stats",
- "feature_uid",
- ]:
+ if key in ['mz_tol', 'centroid', 'deisotope', 'dia_stats', 'feature_uid']:
  ms2_kwargs[key] = value
-
+
  self.restore_ms2(samples=samples, **ms2_kwargs)
-
+
  self.logger.info("Adaptive decompression completed successfully")
-
+
  except Exception as e:
  self.logger.error(f"Decompression failed: {e}")
  raise