masster 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

Files changed (54)
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +1 -1
  4. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
  5. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
  6. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
  7. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
  8. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
  9. masster/data/libs/__pycache__/ccm.cpython-312.pyc +0 -0
  10. masster/data/libs/__pycache__/urine.cpython-312.pyc +0 -0
  11. masster/data/libs/ccm.csv +120 -0
  12. masster/data/libs/urine.csv +4693 -0
  13. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  14. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  15. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  16. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  17. masster/logger.py +11 -11
  18. masster/sample/__init__.py +1 -1
  19. masster/sample/adducts.py +338 -264
  20. masster/sample/defaults/find_adducts_def.py +21 -8
  21. masster/sample/h5.py +561 -282
  22. masster/sample/helpers.py +131 -75
  23. masster/sample/lib.py +4 -4
  24. masster/sample/load.py +31 -17
  25. masster/sample/parameters.py +1 -1
  26. masster/sample/plot.py +7 -7
  27. masster/sample/processing.py +117 -87
  28. masster/sample/sample.py +103 -90
  29. masster/sample/sample5_schema.json +196 -0
  30. masster/sample/save.py +35 -12
  31. masster/spectrum.py +1 -1
  32. masster/study/__init__.py +1 -1
  33. masster/study/defaults/align_def.py +5 -1
  34. masster/study/defaults/identify_def.py +3 -1
  35. masster/study/defaults/study_def.py +58 -25
  36. masster/study/export.py +360 -210
  37. masster/study/h5.py +560 -158
  38. masster/study/helpers.py +496 -203
  39. masster/study/helpers_optimized.py +1 -1
  40. masster/study/id.py +538 -349
  41. masster/study/load.py +233 -143
  42. masster/study/plot.py +71 -71
  43. masster/study/processing.py +456 -254
  44. masster/study/save.py +15 -5
  45. masster/study/study.py +213 -131
  46. masster/study/study5_schema.json +360 -0
  47. masster-0.4.5.dist-info/METADATA +131 -0
  48. masster-0.4.5.dist-info/RECORD +71 -0
  49. masster-0.4.3.dist-info/METADATA +0 -791
  50. masster-0.4.3.dist-info/RECORD +0 -56
  51. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/WHEEL +0 -0
  52. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/entry_points.txt +0 -0
  53. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/licenses/LICENSE +0 -0
  54. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/top_level.txt +0 -0
masster/study/id.py CHANGED
@@ -3,14 +3,19 @@
 Identification helpers for Study: load a Lib and identify consensus features
 by matching m/z (and optionally RT).
 """
+
 from __future__ import annotations
 
-from typing import Optional
 
 import polars as pl
 
 
-def lib_load(study, lib_source, polarity: Optional[str] = None, adducts: Optional[list] = None):
+def lib_load(
+    study,
+    lib_source,
+    polarity: str | None = None,
+    adducts: list | None = None,
+):
     """Load a co # Add compound and formula count columns
     if "consensus_uid" in result_df.columns:
         # Calculate counts per consensus_uid
@@ -31,7 +36,7 @@ def lib_load(study, lib_source, polarity: Optional[str] = None, adducts: Optiona
     """
     # Lazy import to avoid circular imports at module import time
     try:
-        from masster.lib.lib import Lib
+        from master.lib.lib import Lib
     except Exception:
         Lib = None
 
@@ -40,72 +45,93 @@ def lib_load(study, lib_source, polarity: Optional[str] = None, adducts: Optiona
 
     # Use study polarity if not explicitly provided
     if polarity is None:
-        study_polarity = getattr(study, 'polarity', 'positive')
+        study_polarity = getattr(study, "polarity", "positive")
         # Normalize polarity names
-        if study_polarity in ['pos', 'positive']:
-            polarity = 'positive'
-        elif study_polarity in ['neg', 'negative']:
-            polarity = 'negative'
+        if study_polarity in ["pos", "positive"]:
+            polarity = "positive"
+        elif study_polarity in ["neg", "negative"]:
+            polarity = "negative"
         else:
-            polarity = 'positive'  # Default fallback
+            polarity = "positive"  # Default fallback
 
     # Handle string input (CSV file path)
     if isinstance(lib_source, str):
         if Lib is None:
-            raise ImportError("Could not import masster.lib.lib.Lib - required for CSV loading")
-
+            raise ImportError(
+                "Could not import master.lib.lib.Lib - required for CSV loading",
+            )
+
         lib_obj = Lib()
         lib_obj.import_csv(lib_source, polarity=polarity, adducts=adducts)
-
+
     # Handle Lib instance
     elif Lib is not None and isinstance(lib_source, Lib):
         lib_obj = lib_source
-
+
    # Handle other objects with lib_df attribute
    elif hasattr(lib_source, "lib_df"):
        lib_obj = lib_source
-
+
    else:
-        raise TypeError("lib_source must be a CSV file path (str), a masster.lib.Lib instance, or have a 'lib_df' attribute")
+        raise TypeError(
+            "lib_source must be a CSV file path (str), a master.lib.Lib instance, or have a 'lib_df' attribute",
+        )
 
     # Ensure lib_df is populated
     lf = getattr(lib_obj, "lib_df", None)
-    if lf is None or (hasattr(lf, 'is_empty') and lf.is_empty()):
+    if lf is None or (hasattr(lf, "is_empty") and lf.is_empty()):
         raise ValueError("Library has no data populated in lib_df")
 
     # Filter by polarity to match study
     # Map polarity to charge signs
-    if polarity == 'positive':
+    if polarity == "positive":
         target_charges = [1, 2]  # positive charges
-    elif polarity == 'negative':
+    elif polarity == "negative":
         target_charges = [-1, -2]  # negative charges
     else:
         target_charges = [-2, -1, 1, 2]  # all charges
 
     # Filter library entries by charge sign (which corresponds to polarity)
     filtered_lf = lf.filter(pl.col("z").is_in(target_charges))
-
+
     if filtered_lf.is_empty():
-        print(f"Warning: No library entries found for polarity '{polarity}'. Using all entries.")
+        print(
+            f"Warning: No library entries found for polarity '{polarity}'. Using all entries.",
+        )
         filtered_lf = lf
 
     # Store pointer and DataFrame on study
     study._lib = lib_obj
-
+
     # Add to existing lib_df instead of replacing
-    if hasattr(study, 'lib_df') and study.lib_df is not None and not study.lib_df.is_empty():
+    if (
+        hasattr(study, "lib_df")
+        and study.lib_df is not None
+        and not study.lib_df.is_empty()
+    ):
         # Concatenate with existing data
         study.lib_df = pl.concat([study.lib_df, filtered_lf])
     else:
         # First time loading - create new
         try:
-            study.lib_df = filtered_lf.clone() if hasattr(filtered_lf, "clone") else pl.DataFrame(filtered_lf)
+            study.lib_df = (
+                filtered_lf.clone()
+                if hasattr(filtered_lf, "clone")
+                else pl.DataFrame(filtered_lf)
+            )
         except Exception:
-            study.lib_df = pl.from_pandas(filtered_lf) if hasattr(filtered_lf, "to_pandas") else pl.DataFrame(filtered_lf)
+            study.lib_df = (
+                pl.from_pandas(filtered_lf)
+                if hasattr(filtered_lf, "to_pandas")
+                else pl.DataFrame(filtered_lf)
+            )
 
     # Store this operation in history
-    if hasattr(study, 'store_history'):
-        study.store_history(["lib_load"], {"lib_source": str(lib_source), "polarity": polarity, "adducts": adducts})
+    if hasattr(study, "store_history"):
+        study.store_history(
+            ["lib_load"],
+            {"lib_source": str(lib_source), "polarity": polarity, "adducts": adducts},
+        )
 
 
 def identify(study, features=None, params=None, **kwargs):
@@ -121,7 +147,7 @@ def identify(study, features=None, params=None, **kwargs):
             If None, identifies all consensus features.
         params: Optional identify_defaults instance with matching tolerances and scoring parameters.
             If None, uses default parameters.
-        **kwargs: Individual parameter overrides (mz_tol, rt_tol, heteroatom_penalty, 
+        **kwargs: Individual parameter overrides (mz_tol, rt_tol, heteroatom_penalty,
             multiple_formulas_penalty, multiple_compounds_penalty, heteroatoms)
 
     The resulting DataFrame is stored as study.id_df. Columns:
@@ -133,10 +159,10 @@ def identify(study, features=None, params=None, **kwargs):
     """
     # Import defaults class
     try:
-        from masster.study.defaults.identify_def import identify_defaults
+        from master.study.defaults.identify_def import identify_defaults
     except ImportError:
         identify_defaults = None
-
+
     # Use provided params or create defaults
     if params is None:
         if identify_defaults is not None:
@@ -150,52 +176,66 @@ def identify(study, features=None, params=None, **kwargs):
                 multiple_formulas_penalty = 0.8
                 multiple_compounds_penalty = 0.8
                 heteroatoms = ["Cl", "Br", "F", "I"]
+
             params = FallbackParams()
-
+
     # Override parameters with any provided kwargs
     if kwargs:
         for param_name, value in kwargs.items():
             if hasattr(params, param_name):
                 setattr(params, param_name, value)
-
+
     # Get effective tolerances from params (now possibly overridden)
-    effective_mz_tol = getattr(params, 'mz_tol', 0.01)
-    effective_rt_tol = getattr(params, 'rt_tol', 2.0)
+    effective_mz_tol = getattr(params, "mz_tol", 0.01)
+    effective_rt_tol = getattr(params, "rt_tol", 2.0)
     # Get logger from study if available
-    logger = getattr(study, 'logger', None)
-
+    logger = getattr(study, "logger", None)
+
     if logger:
-        logger.debug(f"Starting identification with mz_tolerance={effective_mz_tol}, rt_tolerance={effective_rt_tol}")
+        logger.debug(
+            f"Starting identification with mz_tolerance={effective_mz_tol}, rt_tolerance={effective_rt_tol}",
+        )
 
     # Determine which features to process
     target_uids = None
     if features is not None:
-        if hasattr(features, 'columns'):  # DataFrame-like
-            if 'consensus_uid' in features.columns:
-                target_uids = features['consensus_uid'].unique().to_list()
+        if hasattr(features, "columns"):  # DataFrame-like
+            if "consensus_uid" in features.columns:
+                target_uids = features["consensus_uid"].unique().to_list()
             else:
-                raise ValueError("features DataFrame must contain 'consensus_uid' column")
-        elif hasattr(features, '__iter__') and not isinstance(features, str):  # List-like
+                raise ValueError(
+                    "features DataFrame must contain 'consensus_uid' column",
+                )
+        elif hasattr(features, "__iter__") and not isinstance(
+            features,
+            str,
+        ):  # List-like
             target_uids = list(features)
         else:
-            raise ValueError("features must be a DataFrame with 'consensus_uid' column or a list of UIDs")
-
+            raise ValueError(
+                "features must be a DataFrame with 'consensus_uid' column or a list of UIDs",
+            )
+
         if logger:
             logger.debug(f"Identifying {len(target_uids)} specified features")
 
     # Clear previous identification results for target features only
-    if hasattr(study, 'id_df') and not study.id_df.is_empty():
+    if hasattr(study, "id_df") and not study.id_df.is_empty():
         if target_uids is not None:
             # Keep results for features NOT being re-identified
-            study.id_df = study.id_df.filter(~pl.col("consensus_uid").is_in(target_uids))
+            study.id_df = study.id_df.filter(
+                ~pl.col("consensus_uid").is_in(target_uids),
+            )
             if logger:
-                logger.debug(f"Cleared previous identification results for {len(target_uids)} features")
+                logger.debug(
+                    f"Cleared previous identification results for {len(target_uids)} features",
+                )
         else:
             # Clear all results if no specific features specified
             study.id_df = pl.DataFrame()
             if logger:
                 logger.debug("Cleared all previous identification results")
-    elif not hasattr(study, 'id_df'):
+    elif not hasattr(study, "id_df"):
         study.id_df = pl.DataFrame()
         if logger:
             logger.debug("Initialized empty id_df")
@@ -214,20 +254,28 @@ def identify(study, features=None, params=None, **kwargs):
     # Filter consensus features if target_uids specified
     consensus_to_process = study.consensus_df
     if target_uids is not None:
-        consensus_to_process = study.consensus_df.filter(pl.col("consensus_uid").is_in(target_uids))
+        consensus_to_process = study.consensus_df.filter(
+            pl.col("consensus_uid").is_in(target_uids),
+        )
         if consensus_to_process.is_empty():
             if logger:
-                logger.warning("No consensus features found matching specified features")
+                logger.warning(
+                    "No consensus features found matching specified features",
+                )
             return
 
     consensus_count = len(consensus_to_process)
     lib_count = len(study.lib_df)
-
+
     if logger:
         if target_uids is not None:
-            logger.debug(f"Identifying {consensus_count} specified consensus features against {lib_count} library entries")
+            logger.debug(
+                f"Identifying {consensus_count} specified consensus features against {lib_count} library entries",
+            )
         else:
-            logger.debug(f"Identifying {consensus_count} consensus features against {lib_count} library entries")
+            logger.debug(
+                f"Identifying {consensus_count} consensus features against {lib_count} library entries",
+            )
 
     # Get adduct probabilities
     adducts_df = study._get_adducts()
@@ -235,7 +283,7 @@ def identify(study, features=None, params=None, **kwargs):
     if not adducts_df.is_empty():
         for row in adducts_df.iter_rows(named=True):
             adduct_prob_map[row.get("name")] = row.get("probability", 1.0)
-
+
     results = []
     features_with_matches = 0
     total_matches = 0
@@ -247,7 +295,7 @@ def identify(study, features=None, params=None, **kwargs):
         cons_mz = cons.get("mz")
         cons_rt = cons.get("rt")
         cons_uid = cons.get("consensus_uid")
-
+
         if cons_mz is None:
             if logger:
                 logger.debug(f"Skipping consensus feature {cons_uid} - no m/z value")
@@ -255,7 +303,8 @@ def identify(study, features=None, params=None, **kwargs):
 
         # Filter lib by mz window
         matches = study.lib_df.filter(
-            (pl.col("mz") >= cons_mz - effective_mz_tol) & (pl.col("mz") <= cons_mz + effective_mz_tol)
+            (pl.col("mz") >= cons_mz - effective_mz_tol)
+            & (pl.col("mz") <= cons_mz + effective_mz_tol),
        )
 
         initial_matches = len(matches)
@@ -263,15 +312,21 @@ def identify(study, features=None, params=None, **kwargs):
         # If rt_tol provided and consensus RT present, prefer rt-filtered hits
         if effective_rt_tol is not None and cons_rt is not None:
             rt_matches = matches.filter(
-                pl.col("rt").is_not_null() & (pl.col("rt") >= cons_rt - effective_rt_tol) & (pl.col("rt") <= cons_rt + effective_rt_tol)
+                pl.col("rt").is_not_null()
+                & (pl.col("rt") >= cons_rt - effective_rt_tol)
+                & (pl.col("rt") <= cons_rt + effective_rt_tol),
             )
             if not rt_matches.is_empty():
                 matches = rt_matches
                 if logger:
-                    logger.debug(f"Consensus {cons_uid}: {initial_matches} m/z matches, {len(matches)} after RT filter")
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_matches} m/z matches, {len(matches)} after RT filter",
+                    )
             else:
                 if logger:
-                    logger.debug(f"Consensus {cons_uid}: {initial_matches} m/z matches, 0 after RT filter - using m/z matches only")
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_matches} m/z matches, 0 after RT filter - using m/z matches only",
+                    )
 
         # Apply scoring-based filtering system
         if not matches.is_empty():
@@ -283,19 +338,25 @@ def identify(study, features=None, params=None, **kwargs):
            features_with_matches += 1
            feature_match_count = len(filtered_matches)
            total_matches += feature_match_count
-
+
            if logger:
-                logger.debug(f"Consensus {cons_uid} (mz={cons_mz:.5f}): {feature_match_count} library matches")
+                logger.debug(
+                    f"Consensus {cons_uid} (mz={cons_mz:.5f}): {feature_match_count} library matches",
+                )
 
            for m in filtered_matches.iter_rows(named=True):
                mz_delta = abs(cons_mz - m.get("mz")) if m.get("mz") is not None else None
                lib_rt = m.get("rt")
-                rt_delta = abs(cons_rt - lib_rt) if (cons_rt is not None and lib_rt is not None) else None
-
+                rt_delta = (
+                    abs(cons_rt - lib_rt)
+                    if (cons_rt is not None and lib_rt is not None)
+                    else None
+                )
+
                # Get adduct probability from _get_adducts() results
                adduct = m.get("adduct")
                score = adduct_prob_map.get(adduct, 1.0) if adduct else 1.0
-
+
                results.append(
                    {
                        "consensus_uid": cons.get("consensus_uid"),
@@ -304,51 +365,60 @@ def identify(study, features=None, params=None, **kwargs):
                        "rt_delta": rt_delta,
                        "matcher": "ms1",
                        "score": score,
-                    }
+                    },
                )
 
     # Merge new results with existing results
     new_results_df = pl.DataFrame(results) if results else pl.DataFrame()
-
+
     if not new_results_df.is_empty():
-        if hasattr(study, 'id_df') and not study.id_df.is_empty():
+        if hasattr(study, "id_df") and not study.id_df.is_empty():
             # Concatenate new results with existing results
             study.id_df = pl.concat([study.id_df, new_results_df])
         else:
             # First results
             study.id_df = new_results_df
-
+
     # Apply scoring adjustments based on compound and formula counts
-    if not study.id_df.is_empty() and hasattr(study, 'lib_df') and not study.lib_df.is_empty():
+    if (
+        not study.id_df.is_empty()
+        and hasattr(study, "lib_df")
+        and not study.lib_df.is_empty()
+    ):
         # Join with lib_df to get compound and formula information
         id_with_lib = study.id_df.join(
             study.lib_df.select(["lib_uid", "cmpd_uid", "formula"]),
             on="lib_uid",
-            how="left"
+            how="left",
         )
-
+
         # Calculate counts per consensus_uid
-        count_stats = id_with_lib.group_by("consensus_uid").agg([
-            pl.col("cmpd_uid").n_unique().alias("num_cmpds"),
-            pl.col("formula").filter(pl.col("formula").is_not_null()).n_unique().alias("num_formulas")
-        ])
-
+        count_stats = id_with_lib.group_by("consensus_uid").agg(
+            [
+                pl.col("cmpd_uid").n_unique().alias("num_cmpds"),
+                pl.col("formula")
+                .filter(pl.col("formula").is_not_null())
+                .n_unique()
+                .alias("num_formulas"),
+            ],
+        )
+
         # Join counts back to id_df
         id_with_counts = study.id_df.join(count_stats, on="consensus_uid", how="left")
-
+
         # Join with lib_df again to get formula information for heteroatom penalty
         id_with_formula = id_with_counts.join(
             study.lib_df.select(["lib_uid", "formula"]),
             on="lib_uid",
-            how="left"
+            how="left",
         )
-
+
         # Apply scoring penalties
-        heteroatoms = getattr(params, 'heteroatoms', ['Cl', 'Br', 'F', 'I'])
-        heteroatom_penalty = getattr(params, 'heteroatom_penalty', 0.7)
-        formulas_penalty = getattr(params, 'multiple_formulas_penalty', 0.8)
-        compounds_penalty = getattr(params, 'multiple_compounds_penalty', 0.8)
-
+        heteroatoms = getattr(params, "heteroatoms", ["Cl", "Br", "F", "I"])
+        heteroatom_penalty = getattr(params, "heteroatom_penalty", 0.7)
+        formulas_penalty = getattr(params, "multiple_formulas_penalty", 0.8)
+        compounds_penalty = getattr(params, "multiple_compounds_penalty", 0.8)
+
         # Build heteroatom condition
         heteroatom_condition = None
         for atom in heteroatoms:
@@ -357,76 +427,104 @@ def identify(study, features=None, params=None, **kwargs):
                heteroatom_condition = atom_condition
            else:
                heteroatom_condition = heteroatom_condition | atom_condition
-
+
        # Apply penalties
-        study.id_df = id_with_formula.with_columns([
-            # Heteroatom penalty: if formula contains specified heteroatoms, apply penalty
-            pl.when(
-                pl.col("formula").is_not_null() & heteroatom_condition
+        study.id_df = (
+            id_with_formula.with_columns(
+                [
+                    # Heteroatom penalty: if formula contains specified heteroatoms, apply penalty
+                    pl.when(
+                        pl.col("formula").is_not_null() & heteroatom_condition,
+                    )
+                    .then(pl.col("score") * heteroatom_penalty)
+                    .otherwise(pl.col("score"))
+                    .alias("score_temp0"),
+                ],
+            )
+            .with_columns(
+                [
+                    # If num_formulas > 1, apply multiple formulas penalty
+                    pl.when(pl.col("num_formulas") > 1)
+                    .then(pl.col("score_temp0") * formulas_penalty)
+                    .otherwise(pl.col("score_temp0"))
+                    .alias("score_temp1"),
+                ],
            )
-            .then(pl.col("score") * heteroatom_penalty)
-            .otherwise(pl.col("score"))
-            .alias("score_temp0")
-        ]).with_columns([
-            # If num_formulas > 1, apply multiple formulas penalty
-            pl.when(pl.col("num_formulas") > 1)
-            .then(pl.col("score_temp0") * formulas_penalty)
-            .otherwise(pl.col("score_temp0"))
-            .alias("score_temp1")
-        ]).with_columns([
-            # If num_cmpds > 1, apply multiple compounds penalty
-            pl.when(pl.col("num_cmpds") > 1)
-            .then(pl.col("score_temp1") * compounds_penalty)
-            .otherwise(pl.col("score_temp1"))
-            .round(4) # Round to 4 decimal places
-            .alias("score")
-        ]).select([
-            "consensus_uid", "lib_uid", "mz_delta", "rt_delta", "matcher", "score"
-        ])
-
+            .with_columns(
+                [
+                    # If num_cmpds > 1, apply multiple compounds penalty
+                    pl.when(pl.col("num_cmpds") > 1)
+                    .then(pl.col("score_temp1") * compounds_penalty)
+                    .otherwise(pl.col("score_temp1"))
+                    .round(4)  # Round to 4 decimal places
+                    .alias("score"),
+                ],
+            )
+            .select(
+                [
+                    "consensus_uid",
+                    "lib_uid",
+                    "mz_delta",
+                    "rt_delta",
+                    "matcher",
+                    "score",
+                ],
+            )
+        )
+
     # Store this operation in history
-    if hasattr(study, 'store_history'):
+    if hasattr(study, "store_history"):
         history_params = {"mz_tol": effective_mz_tol, "rt_tol": effective_rt_tol}
         if features is not None:
             history_params["features"] = target_uids
-        if params is not None and hasattr(params, 'to_dict'):
+        if params is not None and hasattr(params, "to_dict"):
             history_params["params"] = params.to_dict()
         if kwargs:
             history_params["kwargs"] = kwargs
         study.store_history(["identify"], history_params)
-
+
     if logger:
         if rt_filtered_compounds > 0:
-            logger.debug(f"RT consistency filtering applied to {rt_filtered_compounds} compound groups")
-
+            logger.debug(
+                f"RT consistency filtering applied to {rt_filtered_compounds} compound groups",
+            )
+
         if multiply_charged_filtered > 0:
-            logger.debug(f"Excluded {multiply_charged_filtered} multiply charged adducts (no [M+H]+ or [M-H]- coeluting)")
-
-        logger.info(f"Identification completed: {features_with_matches}/{consensus_count} features matched, {total_matches} total identifications")
-
-
+            logger.debug(
+                f"Excluded {multiply_charged_filtered} multiply charged adducts (no [M+H]+ or [M-H]- coeluting)",
+            )
+
+        logger.info(
+            f"Identification completed: {features_with_matches}/{consensus_count} features matched, {total_matches} total identifications",
+        )
+
        if total_matches > 0:
            # Calculate some statistics
            mz_deltas = [r["mz_delta"] for r in results if r["mz_delta"] is not None]
            rt_deltas = [r["rt_delta"] for r in results if r["rt_delta"] is not None]
            scores = [r["score"] for r in results if r["score"] is not None]
-
+
            if mz_deltas:
                avg_mz_delta = sum(mz_deltas) / len(mz_deltas)
                max_mz_delta = max(mz_deltas)
-                logger.debug(f"m/z accuracy: average Δ={avg_mz_delta:.5f} Da, max Δ={max_mz_delta:.5f} Da")
-
+                logger.debug(
+                    f"m/z accuracy: average Δ={avg_mz_delta:.5f} Da, max Δ={max_mz_delta:.5f} Da",
+                )
+
            if rt_deltas:
                avg_rt_delta = sum(rt_deltas) / len(rt_deltas)
                max_rt_delta = max(rt_deltas)
-                logger.debug(f"RT accuracy: average Δ={avg_rt_delta:.2f} min, max Δ={max_rt_delta:.2f} min")
-
+                logger.debug(
+                    f"RT accuracy: average Δ={avg_rt_delta:.2f} min, max Δ={max_rt_delta:.2f} min",
+                )
+
            if scores:
                avg_score = sum(scores) / len(scores)
                min_score = min(scores)
                max_score = max(scores)
-                logger.debug(f"Adduct probability scores: average={avg_score:.3f}, min={min_score:.3f}, max={max_score:.3f}")
-
+                logger.debug(
+                    f"Adduct probability scores: average={avg_score:.3f}, min={min_score:.3f}, max={max_score:.3f}",
+                )
+
 
 
 def get_id(study, features=None) -> pl.DataFrame:
@@ -443,7 +541,7 @@ def get_id(study, features=None) -> pl.DataFrame:
     Returns:
         Polars DataFrame with columns:
         - consensus_uid
-        - lib_uid 
+        - lib_uid
         - mz (consensus feature m/z)
         - rt (consensus feature RT)
         - name (compound name from library)
@@ -459,7 +557,9 @@ def get_id(study, features=None) -> pl.DataFrame:
     """
     # Validate inputs
     if getattr(study, "id_df", None) is None or study.id_df.is_empty():
-        raise ValueError("Identification results (study.id_df) are empty; call identify() first")
+        raise ValueError(
+            "Identification results (study.id_df) are empty; call identify() first",
+        )
 
     if getattr(study, "lib_df", None) is None or study.lib_df.is_empty():
         raise ValueError("Library (study.lib_df) is empty; call lib_load() first")
@@ -472,35 +572,52 @@ def get_id(study, features=None) -> pl.DataFrame:
 
     # Filter by features if provided
     if features is not None:
-        if hasattr(features, 'columns'):  # DataFrame-like
-            if 'consensus_uid' in features.columns:
-                uids = features['consensus_uid'].unique().to_list()
+        if hasattr(features, "columns"):  # DataFrame-like
+            if "consensus_uid" in features.columns:
+                uids = features["consensus_uid"].unique().to_list()
             else:
-                raise ValueError("features DataFrame must contain 'consensus_uid' column")
-        elif hasattr(features, '__iter__') and not isinstance(features, str):  # List-like
+                raise ValueError(
+                    "features DataFrame must contain 'consensus_uid' column",
+                )
+        elif hasattr(features, "__iter__") and not isinstance(
+            features,
+            str,
+        ):  # List-like
             uids = list(features)
         else:
-            raise ValueError("features must be a DataFrame with 'consensus_uid' column or a list of UIDs")
-
+            raise ValueError(
+                "features must be a DataFrame with 'consensus_uid' column or a list of UIDs",
+            )
+
         result_df = result_df.filter(pl.col("consensus_uid").is_in(uids))
-
+
     if result_df.is_empty():
         return pl.DataFrame()
 
     # Join with consensus_df to get consensus feature m/z and RT
     consensus_cols = ["consensus_uid", "mz", "rt"]
     # Only select columns that exist in consensus_df
-    available_consensus_cols = [col for col in consensus_cols if col in study.consensus_df.columns]
-
+    available_consensus_cols = [
+        col for col in consensus_cols if col in study.consensus_df.columns
+    ]
+
     result_df = result_df.join(
         study.consensus_df.select(available_consensus_cols),
         on="consensus_uid",
         how="left",
-        suffix="_consensus"
+        suffix="_consensus",
     )
 
     # Join with lib_df to get library information
-    lib_cols = ["lib_uid", "name", "formula", "adduct", "smiles", "cmpd_uid", "inchikey"]
+    lib_cols = [
+        "lib_uid",
+        "name",
+        "formula",
+        "adduct",
+        "smiles",
+        "cmpd_uid",
+        "inchikey",
+    ]
     # Add optional columns if they exist
     optional_lib_cols = ["inchi", "db_id", "db"]
     for col in optional_lib_cols:
@@ -509,19 +626,19 @@ def get_id(study, features=None) -> pl.DataFrame:
 
     # Only select columns that exist in lib_df
     available_lib_cols = [col for col in lib_cols if col in study.lib_df.columns]
-
+
     result_df = result_df.join(
         study.lib_df.select(available_lib_cols),
-        on="lib_uid", 
+        on="lib_uid",
         how="left",
-        suffix="_lib"
+        suffix="_lib",
     )
 
     # Reorder columns for better readability
     column_order = [
         "consensus_uid",
         "cmpd_uid" if "cmpd_uid" in result_df.columns else None,
-        "lib_uid", 
+        "lib_uid",
         "name" if "name" in result_df.columns else None,
         "formula" if "formula" in result_df.columns else None,
         "adduct" if "adduct" in result_df.columns else None,
@@ -532,34 +649,57 @@ def get_id(study, features=None) -> pl.DataFrame:
         "matcher" if "matcher" in result_df.columns else None,
         "score" if "score" in result_df.columns else None,
         "smiles" if "smiles" in result_df.columns else None,
-        "inchikey" if "inchikey" in result_df.columns else None
+        "inchikey" if "inchikey" in result_df.columns else None,
     ]
-
+
     # Add any remaining columns
     remaining_cols = [col for col in result_df.columns if col not in column_order]
     column_order.extend(remaining_cols)
-
+
     # Filter out None values and select existing columns
-    final_column_order = [col for col in column_order if col is not None and col in result_df.columns]
-
+    final_column_order = [
+        col for col in column_order if col is not None and col in result_df.columns
+    ]
+
     result_df = result_df.select(final_column_order)
-
+
     # Add compound and formula count columns
     if "consensus_uid" in result_df.columns:
         # Calculate counts per consensus_uid
-        count_stats = result_df.group_by("consensus_uid").agg([
-            pl.col("cmpd_uid").n_unique().alias("num_cmpds") if "cmpd_uid" in result_df.columns else pl.lit(None).alias("num_cmpds"),
-            pl.col("formula").filter(pl.col("formula").is_not_null()).n_unique().alias("num_formulas") if "formula" in result_df.columns else pl.lit(None).alias("num_formulas")
-        ])
-
+        count_stats = result_df.group_by("consensus_uid").agg(
+            [
+                pl.col("cmpd_uid").n_unique().alias("num_cmpds")
+                if "cmpd_uid" in result_df.columns
+                else pl.lit(None).alias("num_cmpds"),
+                pl.col("formula")
+                .filter(pl.col("formula").is_not_null())
+                .n_unique()
+                .alias("num_formulas")
+                if "formula" in result_df.columns
+                else pl.lit(None).alias("num_formulas"),
+            ],
+        )
+
         # Join the counts back to the main dataframe
         result_df = result_df.join(count_stats, on="consensus_uid", how="left")
-
+
         # Reorder columns to put count columns in the right position
         final_columns = []
         for col in result_df.columns:
-            if col in ["consensus_uid", "cmpd_uid", "lib_uid", "name", "formula", "adduct",
-                       "mz", "mz_delta", "rt", "rt_delta", "matcher", "score"]:
+            if col in [
+                "consensus_uid",
+                "cmpd_uid",
+                "lib_uid",
+                "name",
+                "formula",
+                "adduct",
+                "mz",
+                "mz_delta",
+                "rt",
+                "rt_delta",
+                "matcher",
+                "score",
+            ]:
                 final_columns.append(col)
         # Add count columns
         if "num_cmpds" in result_df.columns:
@@ -570,143 +710,160 @@ def get_id(study, features=None) -> pl.DataFrame:
        for col in result_df.columns:
            if col not in final_columns:
                final_columns.append(col)
-
+
        result_df = result_df.select(final_columns)
-
+
    # Apply filtering logic (scores are already final from identify())
    if "consensus_uid" in result_df.columns and len(result_df) > 0:
        # (v) Rank by score, assume that highest score has the correct rt
        # (vi) Remove all lower-scoring ids with a different rt (group by cmpd_uid)
        # (vii) Remove multiply charged ids if not in line with [M+H]+ or [M-H]- (group by cmpd_uid)
-
+
        # Group by cmpd_uid and apply filtering logic
        if "cmpd_uid" in result_df.columns:
            filtered_dfs = []
            for cmpd_uid, group_df in result_df.group_by("cmpd_uid"):
                # Sort by score descending to get highest score first
                group_df = group_df.sort("score", descending=True)
-
+
                if len(group_df) == 0:
                    continue
-
+
                # Get the highest scoring entry's RT as reference
-                reference_rt = group_df["rt"][0] if "rt" in group_df.columns and group_df["rt"][0] is not None else None
-
+                reference_rt = (
+                    group_df["rt"][0]
+                    if "rt" in group_df.columns and group_df["rt"][0] is not None
+                    else None
+                )
+
                # Filter entries: keep those with same RT as highest scoring entry
                if reference_rt is not None and "rt" in group_df.columns:
                    # Keep entries with the same RT or null RT
                    rt_filtered = group_df.filter(
-                        (pl.col("rt") == reference_rt) | pl.col("rt").is_null()
+                        (pl.col("rt") == reference_rt) | pl.col("rt").is_null(),
                    )
                else:
                    # No reference RT, keep all
                    rt_filtered = group_df
-
+
                # Check multiply charged constraint
-                if "z" in rt_filtered.columns and "adduct" in rt_filtered.columns and len(rt_filtered) > 0:
+                if (
+                    "z" in rt_filtered.columns
+                    and "adduct" in rt_filtered.columns
+                    and len(rt_filtered) > 0
+                ):
                    # Check if there are multiply charged adducts
-                    multiply_charged = rt_filtered.filter((pl.col("z") > 1) | (pl.col("z") < -1))
-                    singly_charged = rt_filtered.filter((pl.col("z") == 1) | (pl.col("z") == -1))
-
+                    multiply_charged = rt_filtered.filter(
+                        (pl.col("z") > 1) | (pl.col("z") < -1),
+                    )
+                    singly_charged = rt_filtered.filter(
+                        (pl.col("z") == 1) | (pl.col("z") == -1),
+                    )
+
                    if not multiply_charged.is_empty():
                        # Check if [M+H]+ or [M-H]- are present
                        reference_adducts = ["[M+H]+", "[M-H]-"]
-                        has_reference = any(singly_charged.filter(pl.col("adduct").is_in(reference_adducts)).height > 0)
-
+                        has_reference = any(
+                            singly_charged.filter(
+                                pl.col("adduct").is_in(reference_adducts),
+                            ).height
+                            > 0,
+                        )
+
                        if not has_reference:
                            # Remove multiply charged adducts
                            rt_filtered = singly_charged
-
+
                if len(rt_filtered) > 0:
                    filtered_dfs.append(rt_filtered)
-
+
            if filtered_dfs:
                result_df = pl.concat(filtered_dfs)
            else:
                result_df = pl.DataFrame()
-
+
    # Sort by cmpd_uid if available
    if "cmpd_uid" in result_df.columns:
        result_df = result_df.sort("cmpd_uid")
-
+
    return result_df
 
 
 def id_reset(study):
    """Reset identification data and remove from history.
-
+
    Removes:
    - study.id_df (identification results DataFrame)
    - 'identify' from study.history
-
+
    Args:
        study: Study instance to reset
    """
    # Get logger from study if available
-    logger = getattr(study, 'logger', None)
-
+    logger = getattr(study, "logger", None)
+
    # Remove id_df
-    if hasattr(study, 'id_df'):
+    if hasattr(study, "id_df"):
        if logger:
            logger.debug("Removing id_df")
-        delattr(study, 'id_df')
-
+        delattr(study, "id_df")
+
    # Remove identify from history
-    if hasattr(study, 'history') and 'identify' in study.history:
+    if hasattr(study, "history") and "identify" in study.history:
        if logger:
            logger.debug("Removing 'identify' from history")
-        del study.history['identify']
-
+        del study.history["identify"]
+
    if logger:
        logger.info("Identification data reset completed")
 
 
 def lib_reset(study):
    """Reset library and identification data and remove from history.
-
+
    Removes:
-    - study.id_df (identification results DataFrame) 
+    - study.id_df (identification results DataFrame)
    - study.lib_df (library DataFrame)
    - study._lib (library object reference)
    - 'identify' from study.history
    - 'lib_load' from study.history (if exists)
-
+
    Args:
        study: Study instance to reset
    """
    # Get logger from study if available
-    logger = getattr(study, 'logger', None)
-
+    logger = getattr(study, "logger", None)
+
    # Remove id_df
-    if hasattr(study, 'id_df'):
+    if hasattr(study, "id_df"):
        if logger:
            logger.debug("Removing id_df")
-        delattr(study, 'id_df')
-
-    # Remove lib_df
-    if hasattr(study, 'lib_df'):
+        delattr(study, "id_df")
+
+    # Remove lib_df
+    if hasattr(study, "lib_df"):
        if logger:
            logger.debug("Removing lib_df")
-        delattr(study, 'lib_df')
-
+        delattr(study, "lib_df")
+
    # Remove lib object reference
-    if hasattr(study, '_lib'):
+    if hasattr(study, "_lib"):
        if logger:
            logger.debug("Removing _lib reference")
-        delattr(study, '_lib')
-
+        delattr(study, "_lib")
+
    # Remove from history
-    if hasattr(study, 'history'):
-        if 'identify' in study.history:
+    if hasattr(study, "history"):
+        if "identify" in study.history:
            if logger:
                logger.debug("Removing 'identify' from history")
-            del study.history['identify']
-
-        if 'lib_load' in study.history:
+            del study.history["identify"]
+
+        if "lib_load" in study.history:
            if logger:
                logger.debug("Removing 'lib_load' from history")
-            del study.history['lib_load']
-
+            del study.history["lib_load"]
+
    if logger:
        logger.info("Library and identification data reset completed")
 
@@ -714,11 +871,11 @@ def lib_reset(study):
 def _get_adducts(self, adducts_list: list = None, **kwargs):
     """
     Generate comprehensive adduct specifications for study-level adduct filtering.
-
+
     This method creates a DataFrame of adduct combinations that will be used to filter
     and score adducts at the study level. Similar to sample._get_adducts() but uses
     study-level parameters and constraints.
-
+
     Parameters
     ----------
     adducts_list : List[str], optional
@@ -727,10 +884,10 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
     **kwargs : dict
         Override parameters, including:
         - charge_min: Minimum charge to consider (default 1)
-        - charge_max: Maximum charge to consider (default 3) 
+        - charge_max: Maximum charge to consider (default 3)
         - max_combinations: Maximum number of adduct components to combine (default 3)
         - min_probability: Minimum probability threshold (default from study parameters)
-
+
     Returns
     -------
     pl.DataFrame
@@ -742,270 +899,302 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
         - complexity: Number of adduct components (1-3)
     """
     # Import required modules
-    from collections import Counter
-    from itertools import combinations
-    import numpy as np
-
+
     # Use provided adducts list or get from study parameters
     if adducts_list is None:
-        adducts_list = self.parameters.adducts if hasattr(self.parameters, 'adducts') and self.parameters.adducts else []
-
+        adducts_list = (
+            self.parameters.adducts
+            if hasattr(self.parameters, "adducts") and self.parameters.adducts
+            else []
+        )
+
     # Get parameters with study-specific defaults
-    charge_min = kwargs.get('charge_min', -3) # Allow negative charges
-    charge_max = kwargs.get('charge_max', 3) # Study uses up to charge ±3
-    max_combinations = kwargs.get('max_combinations', 3) # Up to 3 combinations
-    min_probability = kwargs.get('min_probability', getattr(self.parameters, 'adduct_min_probability', 0.04))
-
+    charge_min = kwargs.get("charge_min", -3)  # Allow negative charges
+    charge_max = kwargs.get("charge_max", 3)  # Study uses up to charge ±3
+    max_combinations = kwargs.get("max_combinations", 3)  # Up to 3 combinations
+    min_probability = kwargs.get(
+        "min_probability",
+        getattr(self.parameters, "adduct_min_probability", 0.04),
+    )
+
     # Parse base adduct specifications
     base_specs = []
-
+
     for adduct_str in adducts_list:
-        if not isinstance(adduct_str, str) or ':' not in adduct_str:
+        if not isinstance(adduct_str, str) or ":" not in adduct_str:
             continue
-
+
         try:
-            parts = adduct_str.split(':')
+            parts = adduct_str.split(":")
             if len(parts) != 3:
                 continue
-
+
             formula_part = parts[0]
-            charge = int(parts[1]) 
+            charge = int(parts[1])
             probability = float(parts[2])
-
+
             # Calculate mass shift from formula
             mass_shift = self._calculate_formula_mass_shift(formula_part)
-
-            base_specs.append({
-                'formula': formula_part,
-                'charge': charge,
-                'mass_shift': mass_shift,
-                'probability': probability,
-                'raw_string': adduct_str
-            })
-
+
+            base_specs.append(
+                {
+                    "formula": formula_part,
+                    "charge": charge,
+                    "mass_shift": mass_shift,
+                    "probability": probability,
+                    "raw_string": adduct_str,
+                },
+            )
+
         except (ValueError, IndexError):
             continue
-
+
     if not base_specs:
         # Return empty DataFrame with correct schema
-        return pl.DataFrame({
-            'name': [],
-            'charge': [],
-            'mass_shift': [],
-            'probability': [],
-            'complexity': []
-        })
-
+        return pl.DataFrame(
+            {
+                "name": [],
+                "charge": [],
+                "mass_shift": [],
+                "probability": [],
+                "complexity": [],
+            },
+        )
+
     # Generate all valid combinations
     combinations_list = []
-
+
     # Separate specs by charge type
-    positive_specs = [spec for spec in base_specs if spec['charge'] > 0]
-    negative_specs = [spec for spec in base_specs if spec['charge'] < 0]
-    neutral_specs = [spec for spec in base_specs if spec['charge'] == 0]
-
+    positive_specs = [spec for spec in base_specs if spec["charge"] > 0]
+    negative_specs = [spec for spec in base_specs if spec["charge"] < 0]
+    neutral_specs = [spec for spec in base_specs if spec["charge"] == 0]
+
     # 1. Single adducts (filter out neutral adducts with charge == 0)
     for spec in base_specs:
-        if charge_min <= spec['charge'] <= charge_max and spec['charge'] != 0:
+        if charge_min <= spec["charge"] <= charge_max and spec["charge"] != 0:
             formatted_name = self._format_adduct_name([spec])
-            combinations_list.append({
-                'components': [spec],
-                'formatted_name': formatted_name,
-                'total_mass_shift': spec['mass_shift'],
-                'total_charge': spec['charge'],
-                'combined_probability': spec['probability'],
-                'complexity': 1
-            })
-
+            combinations_list.append(
+                {
+                    "components": [spec],
+                    "formatted_name": formatted_name,
+                    "total_mass_shift": spec["mass_shift"],
+                    "total_charge": spec["charge"],
+                    "combined_probability": spec["probability"],
+                    "complexity": 1,
+                },
+            )
+
     # 2. Generate multiply charged versions (2H+, 3H+, etc.) - already excludes charge==0
     for spec in positive_specs + negative_specs:
-        base_charge = spec['charge']
+        base_charge = spec["charge"]
         for multiplier in range(2, min(max_combinations + 1, 4)):  # Up to 3x multiplier
             total_charge = base_charge * multiplier
             if charge_min <= total_charge <= charge_max and total_charge != 0:
                 components = [spec] * multiplier
                 formatted_name = self._format_adduct_name(components)
-
-                combinations_list.append({
-                    'components': components,
-                    'formatted_name': formatted_name,
-                    'total_mass_shift': spec['mass_shift'] * multiplier,
-                    'total_charge': total_charge,
-                    'combined_probability': spec['probability'] ** multiplier,
-                    'complexity': multiplier
-                })
-
+
+                combinations_list.append(
+                    {
+                        "components": components,
+                        "formatted_name": formatted_name,
+                        "total_mass_shift": spec["mass_shift"] * multiplier,
+                        "total_charge": total_charge,
+                        "combined_probability": spec["probability"] ** multiplier,
+                        "complexity": multiplier,
+                    },
+                )
+
     # 3. Mixed combinations (2-component) - limited for study level, filter out charge==0
     if max_combinations >= 2:
         # Positive + Neutral (1 neutral loss only) - but exclude if total charge == 0
         for pos_spec in positive_specs[:2]:  # Limit to first 2 positive specs
             for neut_spec in neutral_specs[:1]:  # Only 1 neutral loss
-                total_charge = pos_spec['charge'] + neut_spec['charge']
+                total_charge = pos_spec["charge"] + neut_spec["charge"]
                 if charge_min <= total_charge <= charge_max and total_charge != 0:
                     components = [pos_spec, neut_spec]
                     formatted_name = self._format_adduct_name(components)
-                    combinations_list.append({
-                        'components': components,
-                        'formatted_name': formatted_name,
-                        'total_mass_shift': pos_spec['mass_shift'] + neut_spec['mass_shift'],
-                        'total_charge': total_charge,
-                        'combined_probability': pos_spec['probability'] * neut_spec['probability'],
-                        'complexity': 2
-                    })
-
+                    combinations_list.append(
+                        {
+                            "components": components,
+                            "formatted_name": formatted_name,
+                            "total_mass_shift": pos_spec["mass_shift"]
+                            + neut_spec["mass_shift"],
+                            "total_charge": total_charge,
+                            "combined_probability": pos_spec["probability"]
+                            * neut_spec["probability"],
+                            "complexity": 2,
+                        },
+                    )
+
     # Convert to polars DataFrame
     if combinations_list:
-        combinations_list.sort(key=lambda x: (-x['combined_probability'], x['complexity']))
-
-        adducts_df = pl.DataFrame([
-            {
-                'name': combo['formatted_name'],
-                'charge': combo['total_charge'],
-                'mass_shift': combo['total_mass_shift'],
-                'probability': combo['combined_probability'],
-                'complexity': combo['complexity']
-            }
-            for combo in combinations_list
-        ])
-
+        combinations_list.sort(
+            key=lambda x: (-x["combined_probability"], x["complexity"]),
+        )
+
+        adducts_df = pl.DataFrame(
+            [
+                {
+                    "name": combo["formatted_name"],
+                    "charge": combo["total_charge"],
+                    "mass_shift": combo["total_mass_shift"],
+                    "probability": combo["combined_probability"],
+                    "complexity": combo["complexity"],
+                }
+                for combo in combinations_list
+            ],
+        )
+
         # Filter by minimum probability threshold
         if min_probability > 0.0:
             adducts_before_filter = len(adducts_df)
             adducts_df = adducts_df.filter(pl.col("probability") >= min_probability)
             adducts_after_filter = len(adducts_df)
-
-            self.logger.debug(f"Study adducts: generated {adducts_before_filter}, filtered to {adducts_after_filter} (min_prob={min_probability})")
-
+
+            self.logger.debug(
+                f"Study adducts: generated {adducts_before_filter}, filtered to {adducts_after_filter} (min_prob={min_probability})",
+            )
+
     else:
         # Return empty DataFrame with correct schema
-        adducts_df = pl.DataFrame({
-            'name': [],
-            'charge': [],
-            'mass_shift': [],
-            'probability': [],
-            'complexity': []
-        })
-
+        adducts_df = pl.DataFrame(
+            {
+                "name": [],
+                "charge": [],
+                "mass_shift": [],
+                "probability": [],
+                "complexity": [],
+            },
+        )
+
     return adducts_df
 
+
 def _calculate_formula_mass_shift(self, formula: str) -> float:
     """Calculate mass shift from formula string like "+H", "-H2O", "+Na-H", etc."""
     # Standard atomic masses
     atomic_masses = {
-        'H': 1.007825,
-        'C': 12.0,
-        'N': 14.003074,
-        'O': 15.994915,
-        'Na': 22.989769,
-        'K': 38.963707,
-        'Li': 7.016003,
-        'Ca': 39.962591,
-        'Mg': 23.985042,
-        'Fe': 55.934938,
-        'Cl': 34.968853,
-        'Br': 78.918336,
-        'I': 126.904473,
-        'P': 30.973762,
-        'S': 31.972071
+        "H": 1.007825,
+        "C": 12.0,
+        "N": 14.003074,
+        "O": 15.994915,
+        "Na": 22.989769,
+        "K": 38.963707,
+        "Li": 7.016003,
+        "Ca": 39.962591,
+        "Mg": 23.985042,
+        "Fe": 55.934938,
+        "Cl": 34.968853,
+        "Br": 78.918336,
+        "I": 126.904473,
+        "P": 30.973762,
+        "S": 31.972071,
     }
-
+
     total_mass = 0.0
-
+
     # Parse formula by splitting on + and - while preserving the operators
     parts = []
     current_part = ""
     current_sign = 1
-
+
     for char in formula:
-        if char == '+':
+        if char == "+":
             if current_part:
                 parts.append((current_sign, current_part))
                 current_part = ""
             current_sign = 1
-        elif char == '-':
+        elif char == "-":
             if current_part:
                 parts.append((current_sign, current_part))
                 current_part = ""
             current_sign = -1
         else:
             current_part += char
-
+
     if current_part:
         parts.append((current_sign, current_part))
-
+
     # Process each part
     for sign, part in parts:
         if not part:
             continue
-
+
         # Parse element and count (e.g., "H2O" -> H:2, O:1)
         elements = self._parse_element_counts(part)
-
+
         for element, count in elements.items():
             if element in atomic_masses:
                 total_mass += sign * atomic_masses[element] * count
-
+
     return total_mass
 
+
 def _parse_element_counts(self, formula_part: str) -> dict[str, int]:
     """Parse element counts from a formula part like 'H2O' -> {'H': 2, 'O': 1}"""
     elements = {}
     i = 0
-
+
     while i < len(formula_part):
         # Get element (uppercase letter, possibly followed by lowercase)
         element = formula_part[i]
         i += 1
-
+
         while i < len(formula_part) and formula_part[i].islower():
             element += formula_part[i]
             i += 1
-
+
         # Get count (digits following element)
         count_str = ""
         while i < len(formula_part) and formula_part[i].isdigit():
             count_str += formula_part[i]
             i += 1
-
+
         count = int(count_str) if count_str else 1
         elements[element] = elements.get(element, 0) + count
-
+
     return elements
 
+
 def _format_adduct_name(self, components: list[dict]) -> str:
     """Format adduct name from components like [M+H]1+ or [M+2H]2+"""
     if not components:
         return "[M]"
-
+
     # Count occurrences of each formula
     from collections import Counter
-    formula_counts = Counter(comp['formula'] for comp in components)
-    total_charge = sum(comp['charge'] for comp in components)
-
+
+    formula_counts = Counter(comp["formula"] for comp in components)
+    total_charge = sum(comp["charge"] for comp in components)
+
     # Build formula part with proper multipliers
     formula_parts = []
-    for formula, count in sorted(formula_counts.items()):  # Sort for consistent ordering
+    for formula, count in sorted(
+        formula_counts.items(),
+    ):  # Sort for consistent ordering
         if count == 1:
             formula_parts.append(formula)
         else:
             # For multiple occurrences, use count prefix (e.g., 2H, 3Na)
             # Handle special case where formula might already start with + or -
-            if formula.startswith(('+', '-')):
+            if formula.startswith(("+", "-")):
                 sign = formula[0]
                 base_formula = formula[1:]
                 formula_parts.append(f"{sign}{count}{base_formula}")
             else:
                 formula_parts.append(f"{count}{formula}")
-
+
     # Combine formula parts
     formula = "".join(formula_parts)
-
+
     # Format charge
     if total_charge == 0:
         charge_str = ""
     elif abs(total_charge) == 1:
         charge_str = "1+" if total_charge > 0 else "1-"
     else:
-        charge_str = f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
-
-    return f"[M{formula}]{charge_str}"
+        charge_str = (
+            f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
+        )
+
+    return f"[M{formula}]{charge_str}"