masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +3 -9
  4. masster/data/libs/README.md +1 -1
  5. masster/data/libs/ccm.csv +120 -120
  6. masster/data/libs/ccm.py +116 -62
  7. masster/data/libs/central_carbon_README.md +1 -1
  8. masster/data/libs/urine.py +161 -65
  9. masster/data/libs/urine_metabolites.csv +4693 -4693
  10. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
  11. masster/logger.py +43 -78
  12. masster/sample/__init__.py +1 -1
  13. masster/sample/adducts.py +264 -338
  14. masster/sample/defaults/find_adducts_def.py +8 -21
  15. masster/sample/defaults/find_features_def.py +1 -6
  16. masster/sample/defaults/get_spectrum_def.py +1 -5
  17. masster/sample/defaults/sample_def.py +1 -5
  18. masster/sample/h5.py +282 -561
  19. masster/sample/helpers.py +75 -131
  20. masster/sample/lib.py +17 -42
  21. masster/sample/load.py +17 -31
  22. masster/sample/parameters.py +2 -6
  23. masster/sample/plot.py +27 -88
  24. masster/sample/processing.py +87 -117
  25. masster/sample/quant.py +51 -57
  26. masster/sample/sample.py +90 -103
  27. masster/sample/sample5_schema.json +44 -44
  28. masster/sample/save.py +12 -35
  29. masster/sample/sciex.py +19 -66
  30. masster/spectrum.py +20 -58
  31. masster/study/__init__.py +1 -1
  32. masster/study/defaults/align_def.py +1 -5
  33. masster/study/defaults/fill_chrom_def.py +1 -5
  34. masster/study/defaults/fill_def.py +1 -5
  35. masster/study/defaults/integrate_chrom_def.py +1 -5
  36. masster/study/defaults/integrate_def.py +1 -5
  37. masster/study/defaults/study_def.py +25 -58
  38. masster/study/export.py +207 -233
  39. masster/study/h5.py +136 -470
  40. masster/study/helpers.py +202 -495
  41. masster/study/helpers_optimized.py +13 -40
  42. masster/study/id.py +110 -213
  43. masster/study/load.py +143 -230
  44. masster/study/plot.py +257 -518
  45. masster/study/processing.py +257 -469
  46. masster/study/save.py +5 -15
  47. masster/study/study.py +276 -379
  48. masster/study/study5_schema.json +96 -96
  49. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
  50. masster-0.4.1.dist-info/RECORD +67 -0
  51. masster-0.4.0.dist-info/RECORD +0 -67
  52. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
  53. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
  54. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/study/id.py CHANGED
@@ -3,19 +3,14 @@
 Identification helpers for Study: load a Lib and identify consensus features
 by matching m/z (and optionally RT).
 """
-
 from __future__ import annotations

+from typing import Optional

 import polars as pl


-def lib_load(
-    study,
-    lib_source,
-    polarity: str | None = None,
-    adducts: list | None = None,
-):
+def lib_load(study, lib_source, polarity: Optional[str] = None, adducts: Optional[list] = None):
     """Load a compound library into the study.

     Args:
@@ -30,7 +25,7 @@ def lib_load(
     """
     # Lazy import to avoid circular imports at module import time
     try:
-        from master.lib.lib import Lib
+        from masster.lib.lib import Lib
     except Exception:
         Lib = None

@@ -39,89 +34,71 @@ def lib_load(

     # Use study polarity if not explicitly provided
     if polarity is None:
-        study_polarity = getattr(study, "polarity", "positive")
+        study_polarity = getattr(study, 'polarity', 'positive')
         # Normalize polarity names
-        if study_polarity in ["pos", "positive"]:
-            polarity = "positive"
-        elif study_polarity in ["neg", "negative"]:
-            polarity = "negative"
+        if study_polarity in ['pos', 'positive']:
+            polarity = 'positive'
+        elif study_polarity in ['neg', 'negative']:
+            polarity = 'negative'
         else:
-            polarity = "positive"  # Default fallback
+            polarity = 'positive'  # Default fallback

     # Handle string input (CSV file path)
     if isinstance(lib_source, str):
         if Lib is None:
-            raise ImportError(
-                "Could not import master.lib.lib.Lib - required for CSV loading",
-            )
-
+            raise ImportError("Could not import masster.lib.lib.Lib - required for CSV loading")
+
         lib_obj = Lib()
         lib_obj.import_csv(lib_source, polarity=polarity, adducts=adducts)
-
+
     # Handle Lib instance
     elif Lib is not None and isinstance(lib_source, Lib):
         lib_obj = lib_source
-
+
     # Handle other objects with lib_df attribute
     elif hasattr(lib_source, "lib_df"):
         lib_obj = lib_source
-
+
     else:
-        raise TypeError(
-            "lib_source must be a CSV file path (str), a master.lib.Lib instance, or have a 'lib_df' attribute",
-        )
+        raise TypeError("lib_source must be a CSV file path (str), a masster.lib.Lib instance, or have a 'lib_df' attribute")

     # Ensure lib_df is populated
     lf = getattr(lib_obj, "lib_df", None)
-    if lf is None or (hasattr(lf, "is_empty") and lf.is_empty()):
+    if lf is None or (hasattr(lf, 'is_empty') and lf.is_empty()):
         raise ValueError("Library has no data populated in lib_df")

     # Filter by polarity to match study
     # Map polarity to charge signs
-    if polarity == "positive":
+    if polarity == 'positive':
         target_charges = [1, 2]  # positive charges
-    elif polarity == "negative":
+    elif polarity == 'negative':
         target_charges = [-1, -2]  # negative charges
     else:
         target_charges = [-2, -1, 1, 2]  # all charges

     # Filter library entries by charge sign (which corresponds to polarity)
     filtered_lf = lf.filter(pl.col("z").is_in(target_charges))
-
+
     if filtered_lf.is_empty():
-        print(
-            f"Warning: No library entries found for polarity '{polarity}'. Using all entries.",
-        )
+        print(f"Warning: No library entries found for polarity '{polarity}'. Using all entries.")
         filtered_lf = lf

     # Store pointer and DataFrame on study
     study._lib = lib_obj
-
+
     # Add to existing lib_df instead of replacing
-    if (
-        hasattr(study, "lib_df")
-        and study.lib_df is not None
-        and not study.lib_df.is_empty()
-    ):
+    if hasattr(study, 'lib_df') and study.lib_df is not None and not study.lib_df.is_empty():
         # Concatenate with existing data
         study.lib_df = pl.concat([study.lib_df, filtered_lf])
     else:
         # First time loading - create new
         try:
-            study.lib_df = (
-                filtered_lf.clone()
-                if hasattr(filtered_lf, "clone")
-                else pl.DataFrame(filtered_lf)
-            )
+            study.lib_df = filtered_lf.clone() if hasattr(filtered_lf, "clone") else pl.DataFrame(filtered_lf)
         except Exception:
-            study.lib_df = (
-                pl.from_pandas(filtered_lf)
-                if hasattr(filtered_lf, "to_pandas")
-                else pl.DataFrame(filtered_lf)
-            )
+            study.lib_df = pl.from_pandas(filtered_lf) if hasattr(filtered_lf, "to_pandas") else pl.DataFrame(filtered_lf)


-def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):
+def identify(study, mz_tol: float = 0.01, rt_tol: Optional[float] = None):
     """Identify consensus features against the loaded library.

     Matches consensus_df.mz against lib_df.mz within mz_tolerance. If rt_tolerance
@@ -135,12 +112,10 @@ def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):
     - rt_delta (nullable)
     """
     # Get logger from study if available
-    logger = getattr(study, "logger", None)
-
+    logger = getattr(study, 'logger', None)
+
     if logger:
-        logger.debug(
-            f"Starting identification with mz_tolerance={mz_tol}, rt_tolerance={rt_tol}",
-        )
+        logger.debug(f"Starting identification with mz_tolerance={mz_tol}, rt_tolerance={rt_tol}")

     # Validate inputs
     if getattr(study, "consensus_df", None) is None or study.consensus_df.is_empty():
@@ -156,11 +131,9 @@ def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):

     consensus_count = len(study.consensus_df)
     lib_count = len(study.lib_df)
-
+
     if logger:
-        logger.debug(
-            f"Identifying {consensus_count} consensus features against {lib_count} library entries",
-        )
+        logger.debug(f"Identifying {consensus_count} consensus features against {lib_count} library entries")

     results = []
     features_with_matches = 0
@@ -173,7 +146,7 @@ def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):
         cons_mz = cons.get("mz")
         cons_rt = cons.get("rt")
         cons_uid = cons.get("consensus_uid")
-
+
         if cons_mz is None:
             if logger:
                 logger.debug(f"Skipping consensus feature {cons_uid} - no m/z value")
@@ -181,7 +154,7 @@ def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):

         # Filter lib by mz window
         matches = study.lib_df.filter(
-            (pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol),
+            (pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol)
         )

         initial_matches = len(matches)
@@ -189,21 +162,15 @@ def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):
         # If rt_tol provided and consensus RT present, prefer rt-filtered hits
         if rt_tol is not None and cons_rt is not None:
             rt_matches = matches.filter(
-                pl.col("rt").is_not_null()
-                & (pl.col("rt") >= cons_rt - rt_tol)
-                & (pl.col("rt") <= cons_rt + rt_tol),
+                pl.col("rt").is_not_null() & (pl.col("rt") >= cons_rt - rt_tol) & (pl.col("rt") <= cons_rt + rt_tol)
             )
             if not rt_matches.is_empty():
                 matches = rt_matches
                 if logger:
-                    logger.debug(
-                        f"Consensus {cons_uid}: {initial_matches} m/z matches, {len(matches)} after RT filter",
-                    )
+                    logger.debug(f"Consensus {cons_uid}: {initial_matches} m/z matches, {len(matches)} after RT filter")
             else:
                 if logger:
-                    logger.debug(
-                        f"Consensus {cons_uid}: {initial_matches} m/z matches, 0 after RT filter - using m/z matches only",
-                    )
+                    logger.debug(f"Consensus {cons_uid}: {initial_matches} m/z matches, 0 after RT filter - using m/z matches only")

         # Apply scoring-based filtering system
         if not matches.is_empty():
@@ -215,20 +182,14 @@ def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):
             features_with_matches += 1
             feature_match_count = len(filtered_matches)
             total_matches += feature_match_count
-
+
             if logger:
-                logger.debug(
-                    f"Consensus {cons_uid} (mz={cons_mz:.5f}): {feature_match_count} library matches",
-                )
+                logger.debug(f"Consensus {cons_uid} (mz={cons_mz:.5f}): {feature_match_count} library matches")

             for m in filtered_matches.iter_rows(named=True):
                 mz_delta = abs(cons_mz - m.get("mz")) if m.get("mz") is not None else None
                 lib_rt = m.get("rt")
-                rt_delta = (
-                    abs(cons_rt - lib_rt)
-                    if (cons_rt is not None and lib_rt is not None)
-                    else None
-                )
+                rt_delta = abs(cons_rt - lib_rt) if (cons_rt is not None and lib_rt is not None) else None
                 results.append(
                     {
                         "consensus_uid": cons.get("consensus_uid"),
@@ -237,44 +198,36 @@ def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):
                         "rt_delta": rt_delta,
                         "matcher": "ms1",
                         "score": 1.0,
-                    },
+                    }
                 )

     study.id_df = pl.DataFrame(results) if results else pl.DataFrame()
-
+
     if logger:
         if rt_filtered_compounds > 0:
-            logger.debug(
-                f"RT consistency filtering applied to {rt_filtered_compounds} compound groups",
-            )
-
+            logger.debug(f"RT consistency filtering applied to {rt_filtered_compounds} compound groups")
+
         if multiply_charged_filtered > 0:
-            logger.debug(
-                f"Excluded {multiply_charged_filtered} multiply charged adducts (no [M+H]+ or [M-H]- coeluting)",
-            )
-
-        logger.info(
-            f"Identification completed: {features_with_matches}/{consensus_count} features matched, {total_matches} total identifications",
-        )
-
+            logger.debug(f"Excluded {multiply_charged_filtered} multiply charged adducts (no [M+H]+ or [M-H]- coeluting)")
+
+        logger.info(f"Identification completed: {features_with_matches}/{consensus_count} features matched, {total_matches} total identifications")
+
+
         if total_matches > 0:
             # Calculate some statistics
             mz_deltas = [r["mz_delta"] for r in results if r["mz_delta"] is not None]
             rt_deltas = [r["rt_delta"] for r in results if r["rt_delta"] is not None]
-
+
             if mz_deltas:
                 avg_mz_delta = sum(mz_deltas) / len(mz_deltas)
                 max_mz_delta = max(mz_deltas)
-                logger.debug(
-                    f"m/z accuracy: average Δ={avg_mz_delta:.5f} Da, max Δ={max_mz_delta:.5f} Da",
-                )
-
+                logger.debug(f"m/z accuracy: average Δ={avg_mz_delta:.5f} Da, max Δ={max_mz_delta:.5f} Da")
+
             if rt_deltas:
                 avg_rt_delta = sum(rt_deltas) / len(rt_deltas)
                 max_rt_delta = max(rt_deltas)
-                logger.debug(
-                    f"RT accuracy: average Δ={avg_rt_delta:.2f} min, max Δ={max_rt_delta:.2f} min",
-                )
+                logger.debug(f"RT accuracy: average Δ={avg_rt_delta:.2f} min, max Δ={max_rt_delta:.2f} min")
+


 def get_id(study, features=None) -> pl.DataFrame:
@@ -291,7 +244,7 @@ def get_id(study, features=None) -> pl.DataFrame:
     Returns:
         Polars DataFrame with columns:
         - consensus_uid
-        - lib_uid
+        - lib_uid 
         - mz (consensus feature m/z)
         - rt (consensus feature RT)
         - name (compound name from library)
@@ -307,9 +260,7 @@ def get_id(study, features=None) -> pl.DataFrame:
     """
     # Validate inputs
     if getattr(study, "id_df", None) is None or study.id_df.is_empty():
-        raise ValueError(
-            "Identification results (study.id_df) are empty; call identify() first",
-        )
+        raise ValueError("Identification results (study.id_df) are empty; call identify() first")

     if getattr(study, "lib_df", None) is None or study.lib_df.is_empty():
         raise ValueError("Library (study.lib_df) is empty; call lib_load() first")
@@ -322,52 +273,35 @@ def get_id(study, features=None) -> pl.DataFrame:

     # Filter by features if provided
     if features is not None:
-        if hasattr(features, "columns"):  # DataFrame-like
-            if "consensus_uid" in features.columns:
-                uids = features["consensus_uid"].unique().to_list()
+        if hasattr(features, 'columns'):  # DataFrame-like
+            if 'consensus_uid' in features.columns:
+                uids = features['consensus_uid'].unique().to_list()
             else:
-                raise ValueError(
-                    "features DataFrame must contain 'consensus_uid' column",
-                )
-        elif hasattr(features, "__iter__") and not isinstance(
-            features,
-            str,
-        ):  # List-like
+                raise ValueError("features DataFrame must contain 'consensus_uid' column")
+        elif hasattr(features, '__iter__') and not isinstance(features, str):  # List-like
             uids = list(features)
         else:
-            raise ValueError(
-                "features must be a DataFrame with 'consensus_uid' column or a list of UIDs",
-            )
-
+            raise ValueError("features must be a DataFrame with 'consensus_uid' column or a list of UIDs")
+
         result_df = result_df.filter(pl.col("consensus_uid").is_in(uids))
-
+
     if result_df.is_empty():
         return pl.DataFrame()

     # Join with consensus_df to get consensus feature m/z and RT
     consensus_cols = ["consensus_uid", "mz", "rt"]
     # Only select columns that exist in consensus_df
-    available_consensus_cols = [
-        col for col in consensus_cols if col in study.consensus_df.columns
-    ]
-
+    available_consensus_cols = [col for col in consensus_cols if col in study.consensus_df.columns]
+
     result_df = result_df.join(
         study.consensus_df.select(available_consensus_cols),
         on="consensus_uid",
         how="left",
-        suffix="_consensus",
+        suffix="_consensus"
     )

     # Join with lib_df to get library information
-    lib_cols = [
-        "lib_uid",
-        "name",
-        "formula",
-        "adduct",
-        "smiles",
-        "cmpd_uid",
-        "inchikey",
-    ]
+    lib_cols = ["lib_uid", "name", "formula", "adduct", "smiles", "cmpd_uid", "inchikey"]
     # Add optional columns if they exist
     optional_lib_cols = ["inchi"]
     for col in optional_lib_cols:
@@ -376,19 +310,19 @@ def get_id(study, features=None) -> pl.DataFrame:

     # Only select columns that exist in lib_df
     available_lib_cols = [col for col in lib_cols if col in study.lib_df.columns]
-
+
     result_df = result_df.join(
         study.lib_df.select(available_lib_cols),
-        on="lib_uid",
+        on="lib_uid", 
         how="left",
-        suffix="_lib",
+        suffix="_lib"
     )

     # Reorder columns for better readability
     column_order = [
         "consensus_uid",
         "cmpd_uid" if "cmpd_uid" in result_df.columns else None,
-        "lib_uid",
+        "lib_uid", 
         "name" if "name" in result_df.columns else None,
         "formula" if "formula" in result_df.columns else None,
         "adduct" if "adduct" in result_df.columns else None,
@@ -399,54 +333,34 @@ def get_id(study, features=None) -> pl.DataFrame:
         "matcher" if "matcher" in result_df.columns else None,
         "score" if "score" in result_df.columns else None,
         "smiles" if "smiles" in result_df.columns else None,
-        "inchikey" if "inchikey" in result_df.columns else None,
+        "inchikey" if "inchikey" in result_df.columns else None
     ]
-
+
     # Add any remaining columns
     remaining_cols = [col for col in result_df.columns if col not in column_order]
     column_order.extend(remaining_cols)
-
+
     # Filter out None values and select existing columns
-    final_column_order = [
-        col for col in column_order if col is not None and col in result_df.columns
-    ]
-
+    final_column_order = [col for col in column_order if col is not None and col in result_df.columns]
+
     result_df = result_df.select(final_column_order)
-
+
     # Add compound and formula count columns
     if "consensus_uid" in result_df.columns:
         # Calculate counts per consensus_uid
-        count_stats = result_df.group_by("consensus_uid").agg(
-            [
-                pl.col("cmpd_uid").n_unique().alias("num_cmpds")
-                if "cmpd_uid" in result_df.columns
-                else pl.lit(None).alias("num_cmpds"),
-                pl.col("formula").n_unique().alias("num_formulas")
-                if "formula" in result_df.columns
-                else pl.lit(None).alias("num_formulas"),
-            ],
-        )
-
+        count_stats = result_df.group_by("consensus_uid").agg([
+            pl.col("cmpd_uid").n_unique().alias("num_cmpds") if "cmpd_uid" in result_df.columns else pl.lit(None).alias("num_cmpds"),
+            pl.col("formula").n_unique().alias("num_formulas") if "formula" in result_df.columns else pl.lit(None).alias("num_formulas")
+        ])
+
         # Join the counts back to the main dataframe
         result_df = result_df.join(count_stats, on="consensus_uid", how="left")
-
+
         # Reorder columns to put count columns in the right position
         final_columns = []
         for col in result_df.columns:
-            if col in [
-                "consensus_uid",
-                "cmpd_uid",
-                "lib_uid",
-                "name",
-                "formula",
-                "adduct",
-                "mz",
-                "mz_delta",
-                "rt",
-                "rt_delta",
-                "matcher",
-                "score",
-            ]:
+            if col in ["consensus_uid", "cmpd_uid", "lib_uid", "name", "formula", "adduct",
+                       "mz", "mz_delta", "rt", "rt_delta", "matcher", "score"]:
                 final_columns.append(col)
         # Add count columns
         if "num_cmpds" in result_df.columns:
@@ -457,14 +371,14 @@ def get_id(study, features=None) -> pl.DataFrame:
         for col in result_df.columns:
             if col not in final_columns:
                 final_columns.append(col)
-
+
         result_df = result_df.select(final_columns)
-
+
     # Apply scoring-based filtering system
     if "consensus_uid" in result_df.columns and len(result_df) > 0:
         # (i) Start with score 1.0 for all
         result_df = result_df.with_columns(pl.lit(1.0).alias("score"))
-
+
         # (ii) If not [M+H]+ or [M-H]-, score *= 0.7
         if "adduct" in result_df.columns:
             preferred_adducts = ["[M+H]+", "[M-H]-"]
@@ -472,96 +386,79 @@ def get_id(study, features=None) -> pl.DataFrame:
                 pl.when(pl.col("adduct").is_in(preferred_adducts))
                 .then(pl.col("score"))
                 .otherwise(pl.col("score") * 0.7)
-                .alias("score"),
+                .alias("score")
             )
-
+
         # (iii) If num_formulas > 1, score *= 0.7
         if "num_formulas" in result_df.columns:
             result_df = result_df.with_columns(
                 pl.when(pl.col("num_formulas") > 1)
                 .then(pl.col("score") * 0.7)
                 .otherwise(pl.col("score"))
-                .alias("score"),
+                .alias("score")
             )
-
+
         # (iv) If num_cmpds > 1, score *= 0.7
         if "num_cmpds" in result_df.columns:
             result_df = result_df.with_columns(
                 pl.when(pl.col("num_cmpds") > 1)
                 .then(pl.col("score") * 0.7)
                 .otherwise(pl.col("score"))
-                .alias("score"),
+                .alias("score")
             )
-
+
         # (v) Rank by score, assume that highest score has the correct rt
         # (vi) Remove all lower-scoring ids with a different rt (group by cmpd_uid)
         # (vii) Remove multiply charged ids if not in line with [M+H]+ or [M-H]- (group by cmpd_uid)
-
+
         # Group by cmpd_uid and apply filtering logic
         if "cmpd_uid" in result_df.columns:
             filtered_dfs = []
             for cmpd_uid, group_df in result_df.group_by("cmpd_uid"):
                 # Sort by score descending to get highest score first
                 group_df = group_df.sort("score", descending=True)
-
+
                 if len(group_df) == 0:
                     continue
-
+
                 # Get the highest scoring entry's RT as reference
-                reference_rt = (
-                    group_df["rt"][0]
-                    if "rt" in group_df.columns and group_df["rt"][0] is not None
-                    else None
-                )
-
+                reference_rt = group_df["rt"][0] if "rt" in group_df.columns and group_df["rt"][0] is not None else None
+
                 # Filter entries: keep those with same RT as highest scoring entry
                 if reference_rt is not None and "rt" in group_df.columns:
                     # Keep entries with the same RT or null RT
                     rt_filtered = group_df.filter(
-                        (pl.col("rt") == reference_rt) | pl.col("rt").is_null(),
+                        (pl.col("rt") == reference_rt) | pl.col("rt").is_null()
                     )
                 else:
                     # No reference RT, keep all
                     rt_filtered = group_df
-
+
                 # Check multiply charged constraint
-                if (
-                    "z" in rt_filtered.columns
-                    and "adduct" in rt_filtered.columns
-                    and len(rt_filtered) > 0
-                ):
+                if "z" in rt_filtered.columns and "adduct" in rt_filtered.columns and len(rt_filtered) > 0:
                     # Check if there are multiply charged adducts
-                    multiply_charged = rt_filtered.filter(
-                        (pl.col("z") > 1) | (pl.col("z") < -1),
-                    )
-                    singly_charged = rt_filtered.filter(
-                        (pl.col("z") == 1) | (pl.col("z") == -1),
-                    )
-
+                    multiply_charged = rt_filtered.filter((pl.col("z") > 1) | (pl.col("z") < -1))
+                    singly_charged = rt_filtered.filter((pl.col("z") == 1) | (pl.col("z") == -1))
+
                     if not multiply_charged.is_empty():
                         # Check if [M+H]+ or [M-H]- are present
                         reference_adducts = ["[M+H]+", "[M-H]-"]
-                        has_reference = any(
-                            singly_charged.filter(
-                                pl.col("adduct").is_in(reference_adducts),
-                            ).height
-                            > 0,
-                        )
-
+                        has_reference = any(singly_charged.filter(pl.col("adduct").is_in(reference_adducts)).height > 0)
+
                         if not has_reference:
                             # Remove multiply charged adducts
                             rt_filtered = singly_charged
-
+
                 if len(rt_filtered) > 0:
                     filtered_dfs.append(rt_filtered)
-
+
             if filtered_dfs:
                 result_df = pl.concat(filtered_dfs)
             else:
                 result_df = pl.DataFrame()
-
+
     # Sort by cmpd_uid if available
     if "cmpd_uid" in result_df.columns:
         result_df = result_df.sort("cmpd_uid")
-
+
     return result_df
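
For orientation, a minimal sketch of how the three helpers shown above might be chained. It is inferred only from the signatures and docstrings visible in this diff, not from masster documentation: the SimpleNamespace stand-ins, the example compound, m/z, and RT values are illustrative assumptions, and code in id.py outside the hunks shown (e.g. the match filtering inside identify()) may impose further requirements.

# Usage sketch for masster/study/id.py as changed in 0.4.1 (assumptions noted).
# The helpers duck-type their `study` argument: only the attributes set below
# are read or written in the hunks shown (polarity, logger, consensus_df,
# lib_df, _lib, id_df).
from types import SimpleNamespace

import polars as pl

from masster.study.id import get_id, identify, lib_load

study = SimpleNamespace(
    polarity="positive",
    logger=None,            # identify() only logs when a logger is present
    consensus_df=pl.DataFrame(
        {"consensus_uid": [1], "mz": [181.0707], "rt": [2.31]}
    ),
    lib_df=pl.DataFrame(),  # empty: lib_load() populates it on first load
    id_df=None,             # populated by identify()
)

# lib_load() accepts a CSV path, a masster.lib.Lib instance, or any object
# exposing a lib_df attribute; here a hand-built stand-in whose columns are
# assumed to match what identify()/get_id() read.
lib = SimpleNamespace(
    lib_df=pl.DataFrame(
        {
            "lib_uid": [1],
            "cmpd_uid": [1],
            "name": ["glucose"],
            "formula": ["C6H12O6"],
            "adduct": ["[M+H]+"],
            "smiles": [""],
            "inchikey": [""],
            "mz": [181.0712],  # within the 0.01 Da window of the consensus feature
            "rt": [2.30],      # within the 0.5 min RT window
            "z": [1],          # kept by the positive-polarity charge filter
        }
    )
)

lib_load(study, lib, polarity="positive")
identify(study, mz_tol=0.01, rt_tol=0.5)
ids = get_id(study)
# One row per (consensus feature, library hit). Per the scoring steps in
# get_id(), score starts at 1.0 and is multiplied by 0.7 for each penalty
# (non-reference adduct, num_formulas > 1, num_cmpds > 1); two penalties
# would give 1.0 * 0.7 * 0.7 = 0.49.
print(ids.select(["consensus_uid", "name", "adduct", "mz_delta", "score"]))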