masster 0.5.22-py3-none-any.whl → 0.5.24-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in the supported public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of masster might be problematic.

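For orientation before the diff: the first id.py hunk below changes lib_load so that the library source is dispatched on file extension (".json" → import_json, ".csv" → import_csv, anything else falls back to CSV), and later hunks add optional 13C isotopologue generation via iso="13C". The sketch below is purely illustrative — the full lib_load signature is not part of this diff, so the import path, the positional study argument, and the study object itself are assumptions inferred from the hunks:

    # Hypothetical usage sketch; names are inferred from the hunks below, not from masster's docs.
    from masster.study.id import lib_load  # module shown in this diff: masster/study/id.py

    # `study` is assumed to be an existing masster Study object (the functions in this
    # file expect it to expose lib_df, consensus_df, parameters, and a logger).
    lib_load(
        study,
        "standards.json",     # lib_source: ".json" -> import_json(), ".csv" -> import_csv(),
                              # any other extension falls back to CSV for backward compatibility
        polarity="positive",  # forwarded to the Lib import call
        adducts=["[M+H]+"],   # forwarded to the Lib import call
        iso="13C",            # also generate 13C isotopologue entries sharing one quant_group
    )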
masster/study/id.py CHANGED
@@ -60,11 +60,11 @@ def lib_load(
60
60
  )
61
61
 
62
62
  lib_obj = Lib()
63
-
63
+
64
64
  # Determine file type by extension
65
- if lib_source.lower().endswith('.json'):
65
+ if lib_source.lower().endswith(".json"):
66
66
  lib_obj.import_json(lib_source, polarity=polarity, adducts=adducts)
67
- elif lib_source.lower().endswith('.csv'):
67
+ elif lib_source.lower().endswith(".csv"):
68
68
  lib_obj.import_csv(lib_source, polarity=polarity, adducts=adducts)
69
69
  else:
70
70
  # Default to CSV behavior for backward compatibility
@@ -112,15 +112,13 @@ def lib_load(
112
112
  # Add source_id column with filename (without path) if loading from CSV/JSON
113
113
  if isinstance(lib_source, str):
114
114
  import os
115
+
115
116
  filename_only = os.path.basename(lib_source)
116
117
  filtered_lf = filtered_lf.with_columns(pl.lit(filename_only).alias("source_id"))
117
118
 
118
119
  # Ensure required columns exist and set correct values
119
- required_columns = {
120
- "quant_group": pl.Int64,
121
- "iso": pl.Int64
122
- }
123
-
120
+ required_columns = {"quant_group": pl.Int64, "iso": pl.Int64}
121
+
124
122
  for col_name, col_dtype in required_columns.items():
125
123
  if col_name == "quant_group":
126
124
  # Set quant_group using cmpd_uid (same for isotopomers of same compound)
@@ -133,21 +131,24 @@ def lib_load(
133
131
  if col_name not in filtered_lf.columns:
134
132
  # Default to zero for iso
135
133
  filtered_lf = filtered_lf.with_columns(pl.lit(0).cast(col_dtype).alias(col_name))
136
-
134
+
137
135
  # Generate 13C isotopes if requested
138
136
  original_count = len(filtered_lf)
139
- if iso == '13C':
137
+ if iso == "13C":
140
138
  filtered_lf = _generate_13c_isotopes(filtered_lf)
141
139
  # Update the log message to show the correct count after isotope generation
142
140
  if isinstance(lib_source, str):
143
141
  import os
142
+
144
143
  filename_only = os.path.basename(lib_source)
145
- print(f"Generated 13C isotopes: {len(filtered_lf)} total entries ({original_count} original + {len(filtered_lf) - original_count} isotopes) from {filename_only}")
144
+ print(
145
+ f"Generated 13C isotopes: {len(filtered_lf)} total entries ({original_count} original + {len(filtered_lf) - original_count} isotopes) from {filename_only}"
146
+ )
146
147
 
147
148
  # Reorder columns to place quant_group after rt and iso after formula
148
149
  column_order = []
149
150
  columns_list = list(filtered_lf.columns)
150
-
151
+
151
152
  for col in columns_list:
152
153
  if col not in column_order: # Only add if not already added
153
154
  column_order.append(col)
@@ -156,22 +157,17 @@ def lib_load(
156
157
  elif col == "formula" and "iso" in columns_list and "iso" not in column_order:
157
158
  column_order.append("iso")
158
159
 
159
-
160
160
  # Add to existing lib_df instead of replacing
161
- if (
162
- hasattr(study, "lib_df")
163
- and study.lib_df is not None
164
- and not study.lib_df.is_empty()
165
- ):
161
+ if hasattr(study, "lib_df") and study.lib_df is not None and not study.lib_df.is_empty():
166
162
  # Check for schema compatibility and handle mismatches
167
163
  existing_cols = set(study.lib_df.columns)
168
164
  new_cols = set(filtered_lf.columns)
169
-
165
+
170
166
  # If schemas don't match, we need to align them
171
167
  if existing_cols != new_cols:
172
168
  # Get union of all columns
173
169
  all_cols = existing_cols.union(new_cols)
174
-
170
+
175
171
  # Add missing columns to existing data with appropriate defaults
176
172
  for col in new_cols - existing_cols:
177
173
  if col == "probability":
@@ -180,10 +176,12 @@ def lib_load(
180
176
  try:
181
177
  adduct_prob_map = _get_adduct_probabilities(study)
182
178
  study.lib_df = study.lib_df.with_columns(
183
- pl.col("adduct").map_elements(
179
+ pl.col("adduct")
180
+ .map_elements(
184
181
  lambda adduct: adduct_prob_map.get(adduct, 1.0) if adduct is not None else 1.0,
185
- return_dtype=pl.Float64
186
- ).alias("probability")
182
+ return_dtype=pl.Float64,
183
+ )
184
+ .alias("probability")
187
185
  )
188
186
  except Exception:
189
187
  study.lib_df = study.lib_df.with_columns(pl.lit(1.0).alias("probability"))
@@ -200,16 +198,16 @@ def lib_load(
200
198
  else:
201
199
  # Default to null for other columns
202
200
  study.lib_df = study.lib_df.with_columns(pl.lit(None).alias(col))
203
-
201
+
204
202
  # Add missing columns to new data with appropriate defaults
205
203
  for col in existing_cols - new_cols:
206
204
  if col not in ["probability", "iso", "quant_group"]: # These should already be handled
207
205
  filtered_lf = filtered_lf.with_columns(pl.lit(None).alias(col))
208
-
206
+
209
207
  # Ensure column order matches for concatenation - use existing column order
210
208
  existing_column_order = list(study.lib_df.columns)
211
209
  filtered_lf = filtered_lf.select(existing_column_order)
212
-
210
+
213
211
  # Concatenate with existing data
214
212
  study.lib_df = pl.concat([study.lib_df, filtered_lf])
215
213
  else:
@@ -218,14 +216,14 @@ def lib_load(
218
216
  study.lib_df = (
219
217
  filtered_lf.clone()
220
218
  if hasattr(filtered_lf, "clone")
221
- else pl.DataFrame(filtered_lf.to_dict() if hasattr(filtered_lf, 'to_dict') else filtered_lf)
219
+ else pl.DataFrame(filtered_lf.to_dict() if hasattr(filtered_lf, "to_dict") else filtered_lf)
222
220
  )
223
221
  except Exception:
224
222
  try:
225
223
  study.lib_df = (
226
224
  pl.from_pandas(filtered_lf)
227
225
  if hasattr(filtered_lf, "to_pandas")
228
- else pl.DataFrame(filtered_lf.to_dict() if hasattr(filtered_lf, 'to_dict') else filtered_lf)
226
+ else pl.DataFrame(filtered_lf.to_dict() if hasattr(filtered_lf, "to_dict") else filtered_lf)
229
227
  )
230
228
  except Exception:
231
229
  study.lib_df = pl.DataFrame()
@@ -265,20 +263,17 @@ def _setup_identify_parameters(params, kwargs):
265
263
  # Override parameters with any provided kwargs
266
264
  if kwargs:
267
265
  # Handle parameter name mapping for backwards compatibility
268
- param_mapping = {
269
- 'rt_tolerance': 'rt_tol',
270
- 'mz_tolerance': 'mz_tol'
271
- }
272
-
266
+ param_mapping = {"rt_tolerance": "rt_tol", "mz_tolerance": "mz_tol"}
267
+
273
268
  for param_name, value in kwargs.items():
274
269
  # Check if we need to map the parameter name
275
270
  mapped_name = param_mapping.get(param_name, param_name)
276
-
271
+
277
272
  if hasattr(params, mapped_name):
278
273
  setattr(params, mapped_name, value)
279
274
  elif hasattr(params, param_name):
280
275
  setattr(params, param_name, value)
281
-
276
+
282
277
  return params
283
278
 
284
279
 
@@ -287,9 +282,7 @@ def _smart_reset_id_results(study, target_uids, logger):
287
282
  if target_uids is not None:
288
283
  # Selective reset: only clear results for features being re-identified
289
284
  if hasattr(study, "id_df") and study.id_df is not None and not study.id_df.is_empty():
290
- study.id_df = study.id_df.filter(
291
- ~pl.col("consensus_uid").is_in(target_uids)
292
- )
285
+ study.id_df = study.id_df.filter(~pl.col("consensus_uid").is_in(target_uids))
293
286
  if logger:
294
287
  logger.debug(f"Cleared previous results for {len(target_uids)} specific features")
295
288
  elif not hasattr(study, "id_df"):
@@ -305,21 +298,23 @@ def _get_cached_adduct_probabilities(study, logger):
305
298
  """Get adduct probabilities with caching to avoid repeated expensive computation."""
306
299
  # Check if we have cached results and cache key matches current parameters
307
300
  current_cache_key = _get_adduct_cache_key(study)
308
-
309
- if (hasattr(study, '_cached_adduct_probs') and
310
- hasattr(study, '_cached_adduct_key') and
311
- study._cached_adduct_key == current_cache_key):
301
+
302
+ if (
303
+ hasattr(study, "_cached_adduct_probs")
304
+ and hasattr(study, "_cached_adduct_key")
305
+ and study._cached_adduct_key == current_cache_key
306
+ ):
312
307
  if logger:
313
308
  logger.debug("Using cached adduct probabilities")
314
309
  return study._cached_adduct_probs
315
-
310
+
316
311
  # Compute and cache
317
312
  if logger:
318
313
  logger.debug("Computing adduct probabilities...")
319
314
  adduct_prob_map = _get_adduct_probabilities(study)
320
315
  study._cached_adduct_probs = adduct_prob_map
321
316
  study._cached_adduct_key = current_cache_key
322
-
317
+
323
318
  if logger:
324
319
  logger.debug(f"Computed and cached probabilities for {len(adduct_prob_map)} adducts")
325
320
  return adduct_prob_map
@@ -327,28 +322,30 @@ def _get_cached_adduct_probabilities(study, logger):
327
322
 
328
323
  def _get_adduct_cache_key(study):
329
324
  """Generate a cache key based on adduct-related parameters."""
330
- if hasattr(study, 'parameters') and hasattr(study.parameters, 'adducts'):
331
- adducts_str = '|'.join(sorted(study.parameters.adducts)) if study.parameters.adducts else ""
332
- min_prob = getattr(study.parameters, 'adduct_min_probability', 0.04)
325
+ if hasattr(study, "parameters") and hasattr(study.parameters, "adducts"):
326
+ adducts_str = "|".join(sorted(study.parameters.adducts)) if study.parameters.adducts else ""
327
+ min_prob = getattr(study.parameters, "adduct_min_probability", 0.04)
333
328
  return f"adducts:{adducts_str}:min_prob:{min_prob}"
334
329
  return "default"
335
330
 
336
331
 
337
332
  def clear_identification_cache(study):
338
333
  """Clear cached identification data (useful when parameters change)."""
339
- cache_attrs = ['_cached_adduct_probs', '_cached_adduct_key']
334
+ cache_attrs = ["_cached_adduct_probs", "_cached_adduct_key"]
340
335
  for attr in cache_attrs:
341
336
  if hasattr(study, attr):
342
337
  delattr(study, attr)
343
338
 
344
339
 
345
- def _perform_identification_matching(consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger):
340
+ def _perform_identification_matching(
341
+ consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger
342
+ ):
346
343
  """Perform optimized identification matching using vectorized operations where possible."""
347
344
  results = []
348
-
345
+
349
346
  # Get library data as arrays for faster access
350
347
  lib_df = study.lib_df
351
-
348
+
352
349
  if logger:
353
350
  consensus_count = len(consensus_to_process)
354
351
  lib_count = len(lib_df)
@@ -361,7 +358,7 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t
361
358
  cons_uid = cons_row.get("consensus_uid")
362
359
  cons_mz = cons_row.get("mz")
363
360
  cons_rt = cons_row.get("rt")
364
-
361
+
365
362
  if cons_mz is None:
366
363
  if logger:
367
364
  logger.debug(f"Skipping consensus feature {cons_uid} - no m/z value")
@@ -372,18 +369,14 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t
372
369
  matches = _find_matches_vectorized(
373
370
  lib_df, cons_mz, cons_rt, effective_mz_tol, effective_rt_tol, logger, cons_uid
374
371
  )
375
-
372
+
376
373
  # Convert matches to result format
377
374
  match_results = []
378
375
  if not matches.is_empty():
379
376
  for match_row in matches.iter_rows(named=True):
380
377
  mz_delta = abs(cons_mz - match_row.get("mz")) if match_row.get("mz") is not None else None
381
378
  lib_rt = match_row.get("rt")
382
- rt_delta = (
383
- abs(cons_rt - lib_rt)
384
- if (cons_rt is not None and lib_rt is not None)
385
- else None
386
- )
379
+ rt_delta = abs(cons_rt - lib_rt) if (cons_rt is not None and lib_rt is not None) else None
387
380
 
388
381
  # Get library probability as base score, then multiply by adduct probability
389
382
  lib_probability = match_row.get("probability", 1.0) if match_row.get("probability") is not None else 1.0
@@ -400,22 +393,20 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t
400
393
  "matcher": "ms1",
401
394
  "score": score,
402
395
  })
403
-
396
+
404
397
  results.append({"consensus_uid": cons_uid, "matches": match_results})
405
-
398
+
406
399
  return results
407
400
 
408
401
 
409
402
  def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, cons_uid):
410
403
  """
411
404
  Find library matches using optimized vectorized operations.
412
-
405
+
413
406
  FIXED VERSION: Prevents incorrect matching of same compound to different m/z values.
414
407
  """
415
408
  # Filter by m/z tolerance using vectorized operations
416
- matches = lib_df.filter(
417
- (pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol)
418
- )
409
+ matches = lib_df.filter((pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol))
419
410
 
420
411
  initial_match_count = len(matches)
421
412
 
@@ -423,14 +414,11 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
423
414
  if rt_tol is not None and cons_rt is not None and not matches.is_empty():
424
415
  # First, check if any m/z matches have RT data
425
416
  rt_candidates = matches.filter(pl.col("rt").is_not_null())
426
-
417
+
427
418
  if not rt_candidates.is_empty():
428
419
  # Apply RT filtering to candidates with RT data
429
- rt_matches = rt_candidates.filter(
430
- (pl.col("rt") >= cons_rt - rt_tol) &
431
- (pl.col("rt") <= cons_rt + rt_tol)
432
- )
433
-
420
+ rt_matches = rt_candidates.filter((pl.col("rt") >= cons_rt - rt_tol) & (pl.col("rt") <= cons_rt + rt_tol))
421
+
434
422
  if not rt_matches.is_empty():
435
423
  matches = rt_matches
436
424
  if logger:
@@ -458,12 +446,14 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
458
446
  strict_matches = matches.filter(
459
447
  (pl.col("mz") >= cons_mz - strict_mz_tol) & (pl.col("mz") <= cons_mz + strict_mz_tol)
460
448
  )
461
-
449
+
462
450
  if not strict_matches.is_empty():
463
451
  # Use strict matches if available
464
452
  matches = strict_matches
465
453
  if logger:
466
- logger.debug(f"Consensus {cons_uid}: Using {len(matches)} strict m/z matches (within {strict_mz_tol:.6f} Da)")
454
+ logger.debug(
455
+ f"Consensus {cons_uid}: Using {len(matches)} strict m/z matches (within {strict_mz_tol:.6f} Da)"
456
+ )
467
457
  else:
468
458
  if logger:
469
459
  logger.debug(f"Consensus {cons_uid}: No strict matches, using {len(matches)} loose matches")
@@ -472,21 +462,18 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
472
462
  if not matches.is_empty() and len(matches) > 1:
473
463
  if "formula" in matches.columns and "adduct" in matches.columns:
474
464
  pre_dedup_count = len(matches)
475
-
465
+
476
466
  # Calculate m/z error for sorting
477
- matches = matches.with_columns([
478
- (pl.col("mz") - cons_mz).abs().alias("mz_error_abs")
479
- ])
480
-
467
+ matches = matches.with_columns([(pl.col("mz") - cons_mz).abs().alias("mz_error_abs")])
468
+
481
469
  # Group by formula and adduct, but keep the most accurate m/z match
482
470
  matches = (
483
- matches
484
- .sort(["mz_error_abs", "lib_uid"]) # Sort by m/z accuracy first, then lib_uid for consistency
471
+ matches.sort(["mz_error_abs", "lib_uid"]) # Sort by m/z accuracy first, then lib_uid for consistency
485
472
  .group_by(["formula", "adduct"], maintain_order=True)
486
473
  .first()
487
474
  .drop("mz_error_abs") # Remove the temporary column
488
475
  )
489
-
476
+
490
477
  post_dedup_count = len(matches)
491
478
  if logger and post_dedup_count < pre_dedup_count:
492
479
  logger.debug(
@@ -512,10 +499,10 @@ def _update_identification_results(study, results, logger):
512
499
  "score": match["score"],
513
500
  "iso": 0, # Default to zero
514
501
  })
515
-
502
+
516
503
  # Convert to DataFrame and append to existing results
517
504
  new_results_df = pl.DataFrame(records) if records else pl.DataFrame()
518
-
505
+
519
506
  if not new_results_df.is_empty():
520
507
  if hasattr(study, "id_df") and study.id_df is not None and not study.id_df.is_empty():
521
508
  # Check if existing id_df has the iso column
@@ -524,11 +511,11 @@ def _update_identification_results(study, results, logger):
524
511
  study.id_df = study.id_df.with_columns(pl.lit(0).alias("iso"))
525
512
  if logger:
526
513
  logger.debug("Added 'iso' column to existing id_df for schema compatibility")
527
-
514
+
528
515
  study.id_df = pl.concat([study.id_df, new_results_df])
529
516
  else:
530
517
  study.id_df = new_results_df
531
-
518
+
532
519
  if logger:
533
520
  logger.debug(f"Added {len(records)} identification results to study.id_df")
534
521
  elif not hasattr(study, "id_df"):
@@ -539,7 +526,7 @@ def _finalize_identification_results(study, params, logger):
539
526
  """Apply final scoring adjustments and update consensus columns."""
540
527
  # Apply scoring adjustments based on compound and formula counts
541
528
  _apply_scoring_adjustments(study, params)
542
-
529
+
543
530
  # Update consensus_df with top-scoring identification results
544
531
  _update_consensus_id_columns(study, logger)
545
532
 
@@ -568,7 +555,7 @@ def _validate_identify_inputs(study, logger=None):
568
555
  if logger:
569
556
  logger.error("Library (study.lib_df) is empty; call lib_load() first")
570
557
  raise ValueError("Library (study.lib_df) is empty; call lib_load() first")
571
-
558
+
572
559
  return True
573
560
 
574
561
 
@@ -612,8 +599,6 @@ def _prepare_consensus_features(study, features, logger=None):
612
599
  return consensus_to_process, target_uids
613
600
 
614
601
 
615
-
616
-
617
602
  def _get_adduct_probabilities(study):
618
603
  """Get adduct probabilities from _get_adducts() results."""
619
604
  adducts_df = _get_adducts(study)
@@ -624,45 +609,42 @@ def _get_adduct_probabilities(study):
624
609
  return adduct_prob_map
625
610
 
626
611
 
627
-
628
- def _create_identification_results(consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger=None):
612
+ def _create_identification_results(
613
+ consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger=None
614
+ ):
629
615
  """Create identification results by matching consensus features against library (DEPRECATED - use optimized version)."""
630
616
  # This function is now deprecated in favor of _perform_identification_matching
631
617
  # Keep for backward compatibility but redirect to optimized version
632
618
  results = _perform_identification_matching(
633
619
  consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger
634
620
  )
635
-
621
+
636
622
  # Convert to legacy format for compatibility
637
623
  legacy_results = []
638
624
  features_with_matches = 0
639
625
  total_matches = 0
640
-
626
+
641
627
  for result in results:
642
628
  if result["matches"]:
643
629
  features_with_matches += 1
644
630
  total_matches += len(result["matches"])
645
-
631
+
646
632
  for match in result["matches"]:
647
633
  legacy_results.append({
648
634
  "consensus_uid": result["consensus_uid"],
649
635
  "lib_uid": match["lib_uid"],
650
- "mz_delta": match["mz_delta"],
636
+ "mz_delta": match["mz_delta"],
651
637
  "rt_delta": match["rt_delta"],
652
638
  "matcher": match["matcher"],
653
639
  "score": match["score"],
654
640
  })
655
-
641
+
656
642
  return legacy_results, features_with_matches, total_matches
657
643
 
658
644
 
659
645
  def _apply_scoring_adjustments(study, params):
660
646
  """Apply scoring adjustments based on compound and formula counts using optimized operations."""
661
- if (
662
- not study.id_df.is_empty()
663
- and hasattr(study, "lib_df")
664
- and not study.lib_df.is_empty()
665
- ):
647
+ if not study.id_df.is_empty() and hasattr(study, "lib_df") and not study.lib_df.is_empty():
666
648
  # Get penalty parameters
667
649
  heteroatoms = getattr(params, "heteroatoms", ["Cl", "Br", "F", "I"])
668
650
  heteroatom_penalty = getattr(params, "heteroatom_penalty", 0.7)
@@ -685,15 +667,14 @@ def _apply_scoring_adjustments(study, params):
685
667
 
686
668
  # Join stats back and apply all penalties in one with_columns operation
687
669
  heteroatom_conditions = [pl.col("formula").str.contains(atom) for atom in heteroatoms]
688
- has_heteroatoms = pl.fold(
689
- acc=pl.lit(False),
690
- function=lambda acc, x: acc | x,
691
- exprs=heteroatom_conditions
692
- ) if heteroatom_conditions else pl.lit(False)
670
+ has_heteroatoms = (
671
+ pl.fold(acc=pl.lit(False), function=lambda acc, x: acc | x, exprs=heteroatom_conditions)
672
+ if heteroatom_conditions
673
+ else pl.lit(False)
674
+ )
693
675
 
694
676
  study.id_df = (
695
- id_with_lib
696
- .join(stats, on="consensus_uid", how="left")
677
+ id_with_lib.join(stats, on="consensus_uid", how="left")
697
678
  .with_columns([
698
679
  # Apply all penalties in sequence using case-when chains
699
680
  pl.when(pl.col("formula").is_not_null() & has_heteroatoms)
@@ -716,7 +697,7 @@ def _apply_scoring_adjustments(study, params):
716
697
  ])
717
698
  .select([
718
699
  "consensus_uid",
719
- "lib_uid",
700
+ "lib_uid",
720
701
  "mz_delta",
721
702
  "rt_delta",
722
703
  "matcher",
@@ -728,7 +709,7 @@ def _apply_scoring_adjustments(study, params):
728
709
  def _update_consensus_id_columns(study, logger=None):
729
710
  """
730
711
  Update consensus_df with top-scoring identification results using safe in-place updates.
731
-
712
+
732
713
  FIXED VERSION: Prevents same compound from being assigned to vastly different m/z values.
733
714
  """
734
715
  try:
@@ -736,15 +717,15 @@ def _update_consensus_id_columns(study, logger=None):
736
717
  if logger:
737
718
  logger.debug("No identification results to process")
738
719
  return
739
-
720
+
740
721
  if not hasattr(study, "lib_df") or study.lib_df is None or study.lib_df.is_empty():
741
722
  if logger:
742
723
  logger.debug("No library data available")
743
724
  return
744
-
725
+
745
726
  if not hasattr(study, "consensus_df") or study.consensus_df is None or study.consensus_df.is_empty():
746
727
  if logger:
747
- logger.debug("No consensus data available")
728
+ logger.debug("No consensus data available")
748
729
  return
749
730
 
750
731
  # Get library columns we need (include mz for validation)
@@ -754,50 +735,45 @@ def _update_consensus_id_columns(study, logger=None):
754
735
 
755
736
  # FIX 1: Join identification results with consensus m/z for validation
756
737
  id_with_consensus = study.id_df.join(
757
- study.consensus_df.select(["consensus_uid", "mz"]),
758
- on="consensus_uid",
759
- how="left",
760
- suffix="_consensus"
738
+ study.consensus_df.select(["consensus_uid", "mz"]), on="consensus_uid", how="left", suffix="_consensus"
761
739
  )
762
740
 
763
741
  # FIX 2: Validate m/z accuracy - filter out poor matches
764
742
  id_with_lib = id_with_consensus.join(
765
- study.lib_df.select(["lib_uid", "mz"]),
766
- on="lib_uid",
767
- how="left",
768
- suffix="_lib"
743
+ study.lib_df.select(["lib_uid", "mz"]), on="lib_uid", how="left", suffix="_lib"
769
744
  )
770
-
745
+
771
746
  # Calculate actual m/z error and filter out excessive errors
772
- id_validated = id_with_lib.with_columns([
773
- (pl.col("mz") - pl.col("mz_lib")).abs().alias("actual_mz_error")
774
- ])
775
-
747
+ id_validated = id_with_lib.with_columns([(pl.col("mz") - pl.col("mz_lib")).abs().alias("actual_mz_error")])
748
+
776
749
  # Filter out matches with excessive m/z error
777
750
  max_reasonable_error = 0.02 # 20 millidalton maximum error
778
751
  id_validated = id_validated.filter(
779
752
  (pl.col("actual_mz_error") <= max_reasonable_error) | pl.col("actual_mz_error").is_null()
780
753
  )
781
-
754
+
782
755
  if logger:
783
756
  original_count = len(id_with_consensus)
784
757
  validated_count = len(id_validated)
785
758
  if validated_count < original_count:
786
- logger.warning(f"Filtered out {original_count - validated_count} identifications with excessive m/z error (>{max_reasonable_error:.3f} Da)")
759
+ logger.warning(
760
+ f"Filtered out {original_count - validated_count} identifications with excessive m/z error (>{max_reasonable_error:.3f} Da)"
761
+ )
787
762
 
788
763
  # Get top-scoring identification for each consensus feature (from validated results)
789
764
  top_ids = (
790
- id_validated
791
- .sort(["consensus_uid", "score"], descending=[False, True])
765
+ id_validated.sort(["consensus_uid", "score"], descending=[False, True])
792
766
  .group_by("consensus_uid", maintain_order=True)
793
767
  .first()
794
768
  .join(study.lib_df.select(lib_columns), on="lib_uid", how="left")
795
769
  .select([
796
770
  "consensus_uid",
797
771
  "name",
798
- pl.col("class").alias("id_top_class") if "class" in lib_columns else pl.lit(None, dtype=pl.String).alias("id_top_class"),
772
+ pl.col("class").alias("id_top_class")
773
+ if "class" in lib_columns
774
+ else pl.lit(None, dtype=pl.String).alias("id_top_class"),
799
775
  pl.col("adduct").alias("id_top_adduct"),
800
- pl.col("score").alias("id_top_score")
776
+ pl.col("score").alias("id_top_score"),
801
777
  ])
802
778
  .rename({"name": "id_top_name"})
803
779
  )
@@ -805,28 +781,23 @@ def _update_consensus_id_columns(study, logger=None):
805
781
  # FIX 3: Check for conflicts where same compound+adduct assigned to very different m/z
806
782
  if not top_ids.is_empty():
807
783
  compound_groups = (
808
- top_ids
809
- .join(study.consensus_df.select(["consensus_uid", "mz"]), on="consensus_uid", how="left")
784
+ top_ids.join(study.consensus_df.select(["consensus_uid", "mz"]), on="consensus_uid", how="left")
810
785
  .group_by(["id_top_name", "id_top_adduct"])
811
786
  .agg([
812
787
  pl.col("consensus_uid").count().alias("count"),
813
788
  pl.col("mz").min().alias("mz_min"),
814
- pl.col("mz").max().alias("mz_max")
815
- ])
816
- .with_columns([
817
- (pl.col("mz_max") - pl.col("mz_min")).alias("mz_range")
789
+ pl.col("mz").max().alias("mz_max"),
818
790
  ])
791
+ .with_columns([(pl.col("mz_max") - pl.col("mz_min")).alias("mz_range")])
819
792
  )
820
-
793
+
821
794
  # Find problematic assignments (same compound+adduct with >0.1 Da m/z range)
822
- problematic = compound_groups.filter(
823
- (pl.col("count") > 1) & (pl.col("mz_range") > 0.1)
824
- )
825
-
795
+ problematic = compound_groups.filter((pl.col("count") > 1) & (pl.col("mz_range") > 0.1))
796
+
826
797
  if not problematic.is_empty() and logger:
827
798
  for row in problematic.iter_rows(named=True):
828
799
  name = row["id_top_name"]
829
- adduct = row["id_top_adduct"]
800
+ adduct = row["id_top_adduct"]
830
801
  count = row["count"]
831
802
  mz_range = row["mz_range"]
832
803
  logger.warning(
@@ -836,15 +807,13 @@ def _update_consensus_id_columns(study, logger=None):
836
807
  # Ensure we have the id_top columns in consensus_df
837
808
  for col_name, dtype in [
838
809
  ("id_top_name", pl.String),
839
- ("id_top_class", pl.String),
810
+ ("id_top_class", pl.String),
840
811
  ("id_top_adduct", pl.String),
841
812
  ("id_top_score", pl.Float64),
842
- ("id_source", pl.String)
813
+ ("id_source", pl.String),
843
814
  ]:
844
815
  if col_name not in study.consensus_df.columns:
845
- study.consensus_df = study.consensus_df.with_columns(
846
- pl.lit(None, dtype=dtype).alias(col_name)
847
- )
816
+ study.consensus_df = study.consensus_df.with_columns(pl.lit(None, dtype=dtype).alias(col_name))
848
817
 
849
818
  # Create a mapping dictionary for efficient updates
850
819
  id_mapping = {}
@@ -854,42 +823,36 @@ def _update_consensus_id_columns(study, logger=None):
854
823
  "id_top_name": row["id_top_name"],
855
824
  "id_top_class": row["id_top_class"],
856
825
  "id_top_adduct": row["id_top_adduct"],
857
- "id_top_score": row["id_top_score"]
826
+ "id_top_score": row["id_top_score"],
858
827
  }
859
828
 
860
829
  # Update consensus_df using map_elements (safer than join for avoiding duplicates)
861
830
  if id_mapping:
862
831
  study.consensus_df = study.consensus_df.with_columns([
863
- pl.col("consensus_uid").map_elements(
864
- lambda uid: id_mapping.get(uid, {}).get("id_top_name"),
865
- return_dtype=pl.String
866
- ).alias("id_top_name"),
867
- pl.col("consensus_uid").map_elements(
868
- lambda uid: id_mapping.get(uid, {}).get("id_top_class"),
869
- return_dtype=pl.String
870
- ).alias("id_top_class"),
871
- pl.col("consensus_uid").map_elements(
872
- lambda uid: id_mapping.get(uid, {}).get("id_top_adduct"),
873
- return_dtype=pl.String
874
- ).alias("id_top_adduct"),
875
- pl.col("consensus_uid").map_elements(
876
- lambda uid: id_mapping.get(uid, {}).get("id_top_score"),
877
- return_dtype=pl.Float64
878
- ).alias("id_top_score")
832
+ pl.col("consensus_uid")
833
+ .map_elements(lambda uid: id_mapping.get(uid, {}).get("id_top_name"), return_dtype=pl.String)
834
+ .alias("id_top_name"),
835
+ pl.col("consensus_uid")
836
+ .map_elements(lambda uid: id_mapping.get(uid, {}).get("id_top_class"), return_dtype=pl.String)
837
+ .alias("id_top_class"),
838
+ pl.col("consensus_uid")
839
+ .map_elements(lambda uid: id_mapping.get(uid, {}).get("id_top_adduct"), return_dtype=pl.String)
840
+ .alias("id_top_adduct"),
841
+ pl.col("consensus_uid")
842
+ .map_elements(lambda uid: id_mapping.get(uid, {}).get("id_top_score"), return_dtype=pl.Float64)
843
+ .alias("id_top_score"),
879
844
  ])
880
845
 
881
846
  if logger:
882
847
  num_updated = len(id_mapping)
883
848
  logger.debug(f"Updated consensus_df with top identifications for {num_updated} features")
884
-
849
+
885
850
  except Exception as e:
886
851
  if logger:
887
852
  logger.error(f"Error updating consensus_df with identification results: {e}")
888
853
  # Don't re-raise to avoid breaking the identification process
889
854
 
890
855
 
891
-
892
-
893
856
  def identify(study, features=None, params=None, **kwargs):
894
857
  """Identify consensus features against the loaded library.
895
858
 
@@ -915,12 +878,12 @@ def identify(study, features=None, params=None, **kwargs):
915
878
  """
916
879
  # Get logger from study if available
917
880
  logger = getattr(study, "logger", None)
918
-
881
+
919
882
  # Setup parameters early
920
883
  params = _setup_identify_parameters(params, kwargs)
921
884
  effective_mz_tol = getattr(params, "mz_tol", 0.01)
922
885
  effective_rt_tol = getattr(params, "rt_tol", 2.0)
923
-
886
+
924
887
  if logger:
925
888
  logger.debug(
926
889
  f"Starting identification with mz_tolerance={effective_mz_tol}, rt_tolerance={effective_rt_tol}",
@@ -937,7 +900,7 @@ def identify(study, features=None, params=None, **kwargs):
937
900
 
938
901
  # Smart reset of id_df: only clear results for features being re-identified
939
902
  _smart_reset_id_results(study, target_uids, logger)
940
-
903
+
941
904
  # Cache adduct probabilities (expensive operation)
942
905
  adduct_prob_map = _get_cached_adduct_probabilities(study, logger)
943
906
 
@@ -1037,9 +1000,7 @@ def get_id(study, features=None) -> pl.DataFrame:
1037
1000
  # Join with consensus_df to get consensus feature m/z and RT
1038
1001
  consensus_cols = ["consensus_uid", "mz", "rt"]
1039
1002
  # Only select columns that exist in consensus_df
1040
- available_consensus_cols = [
1041
- col for col in consensus_cols if col in study.consensus_df.columns
1042
- ]
1003
+ available_consensus_cols = [col for col in consensus_cols if col in study.consensus_df.columns]
1043
1004
 
1044
1005
  result_df = result_df.join(
1045
1006
  study.consensus_df.select(available_consensus_cols),
@@ -1101,9 +1062,7 @@ def get_id(study, features=None) -> pl.DataFrame:
1101
1062
  column_order.extend(remaining_cols)
1102
1063
 
1103
1064
  # Filter out None values and select existing columns
1104
- final_column_order = [
1105
- col for col in column_order if col is not None and col in result_df.columns
1106
- ]
1065
+ final_column_order = [col for col in column_order if col is not None and col in result_df.columns]
1107
1066
 
1108
1067
  result_df = result_df.select(final_column_order)
1109
1068
 
@@ -1115,10 +1074,7 @@ def get_id(study, features=None) -> pl.DataFrame:
1115
1074
  pl.col("cmpd_uid").n_unique().alias("num_cmpds")
1116
1075
  if "cmpd_uid" in result_df.columns
1117
1076
  else pl.lit(None).alias("num_cmpds"),
1118
- pl.col("formula")
1119
- .filter(pl.col("formula").is_not_null())
1120
- .n_unique()
1121
- .alias("num_formulas")
1077
+ pl.col("formula").filter(pl.col("formula").is_not_null()).n_unique().alias("num_formulas")
1122
1078
  if "formula" in result_df.columns
1123
1079
  else pl.lit(None).alias("num_formulas"),
1124
1080
  ],
@@ -1177,9 +1133,7 @@ def get_id(study, features=None) -> pl.DataFrame:
1177
1133
 
1178
1134
  # Get the highest scoring entry's RT as reference
1179
1135
  reference_rt = (
1180
- group_df["rt"][0]
1181
- if "rt" in group_df.columns and group_df["rt"][0] is not None
1182
- else None
1136
+ group_df["rt"][0] if "rt" in group_df.columns and group_df["rt"][0] is not None else None
1183
1137
  )
1184
1138
 
1185
1139
  # Filter entries: keep those with same RT as highest scoring entry
@@ -1193,11 +1147,7 @@ def get_id(study, features=None) -> pl.DataFrame:
1193
1147
  rt_filtered = group_df
1194
1148
 
1195
1149
  # Check multiply charged constraint
1196
- if (
1197
- "z" in rt_filtered.columns
1198
- and "adduct" in rt_filtered.columns
1199
- and len(rt_filtered) > 0
1200
- ):
1150
+ if "z" in rt_filtered.columns and "adduct" in rt_filtered.columns and len(rt_filtered) > 0:
1201
1151
  # Check if there are multiply charged adducts
1202
1152
  multiply_charged = rt_filtered.filter(
1203
1153
  (pl.col("z") > 1) | (pl.col("z") < -1),
@@ -1259,7 +1209,7 @@ def id_reset(study):
1259
1209
  if hasattr(study, "consensus_df") and not study.consensus_df.is_empty():
1260
1210
  if logger:
1261
1211
  logger.debug("Resetting id_top_* columns in consensus_df")
1262
-
1212
+
1263
1213
  # Check which columns exist before trying to update them
1264
1214
  id_columns_to_reset = []
1265
1215
  for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score", "id_source"]:
@@ -1268,7 +1218,7 @@ def id_reset(study):
1268
1218
  id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
1269
1219
  else:
1270
1220
  id_columns_to_reset.append(pl.lit(None, dtype=pl.String).alias(col))
1271
-
1221
+
1272
1222
  if id_columns_to_reset:
1273
1223
  study.consensus_df = study.consensus_df.with_columns(id_columns_to_reset)
1274
1224
 
@@ -1306,24 +1256,24 @@ def lib_reset(study):
1306
1256
  if hasattr(study, "consensus_df") and not study.consensus_df.is_empty():
1307
1257
  if logger:
1308
1258
  logger.debug("Checking for consensus features created by lib_to_consensus()")
1309
-
1259
+
1310
1260
  try:
1311
1261
  # Filter for features created by lib_to_consensus()
1312
1262
  # These can be identified by:
1313
1263
  # 1. number_samples < 1 (set to 0.0 by lib_to_consensus)
1314
1264
  # 2. AND have corresponding entries in consensus_mapping_df with sample_uid = 0 (virtual sample)
1315
-
1265
+
1316
1266
  # First check if we have any features with number_samples < 1
1317
1267
  potential_lib_features = study.consensus_df.filter(pl.col("number_samples") < 1)
1318
-
1268
+
1319
1269
  if potential_lib_features is not None and not potential_lib_features.is_empty():
1320
1270
  # Further filter by checking if they have sample_uid = 0 in consensus_mapping_df
1321
1271
  # This ensures we only remove library-derived features, not legitimate features with 0 samples
1322
1272
  if hasattr(study, "consensus_mapping_df") and not study.consensus_mapping_df.is_empty():
1323
- lib_consensus_uids = study.consensus_mapping_df.filter(
1324
- pl.col("sample_uid") == 0
1325
- )["consensus_uid"].unique().to_list()
1326
-
1273
+ lib_consensus_uids = (
1274
+ study.consensus_mapping_df.filter(pl.col("sample_uid") == 0)["consensus_uid"].unique().to_list()
1275
+ )
1276
+
1327
1277
  if lib_consensus_uids:
1328
1278
  lib_consensus_features = potential_lib_features.filter(
1329
1279
  pl.col("consensus_uid").is_in(lib_consensus_uids)
@@ -1335,15 +1285,15 @@ def lib_reset(study):
1335
1285
  lib_consensus_features = potential_lib_features
1336
1286
  else:
1337
1287
  lib_consensus_features = pl.DataFrame() # No features with number_samples < 1
1338
-
1288
+
1339
1289
  if lib_consensus_features is not None and not lib_consensus_features.is_empty():
1340
1290
  num_lib_features = len(lib_consensus_features)
1341
1291
  if logger:
1342
1292
  logger.info(f"Removing {num_lib_features} consensus features created by lib_to_consensus()")
1343
-
1293
+
1344
1294
  # Use consensus_delete to remove these features and all dependent data
1345
1295
  study.consensus_delete(lib_consensus_features)
1346
-
1296
+
1347
1297
  if logger:
1348
1298
  logger.debug("Successfully removed library-derived consensus features")
1349
1299
  else:
@@ -1375,7 +1325,7 @@ def lib_reset(study):
1375
1325
  if hasattr(study, "consensus_df") and not study.consensus_df.is_empty():
1376
1326
  if logger:
1377
1327
  logger.debug("Resetting id_top_* columns in consensus_df")
1378
-
1328
+
1379
1329
  # Check which columns exist before trying to update them
1380
1330
  id_columns_to_reset = []
1381
1331
  for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score", "id_source"]:
@@ -1384,7 +1334,7 @@ def lib_reset(study):
1384
1334
  id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
1385
1335
  else:
1386
1336
  id_columns_to_reset.append(pl.lit(None, dtype=pl.String).alias(col))
1387
-
1337
+
1388
1338
  if id_columns_to_reset:
1389
1339
  study.consensus_df = study.consensus_df.with_columns(id_columns_to_reset)
1390
1340
 
@@ -1399,7 +1349,7 @@ def lib_reset(study):
1399
1349
  if logger:
1400
1350
  logger.debug("Removing 'lib_load' from history")
1401
1351
  del study.history["lib_load"]
1402
-
1352
+
1403
1353
  if "lib_to_consensus" in study.history:
1404
1354
  if logger:
1405
1355
  logger.debug("Removing 'lib_to_consensus' from history")
@@ -1445,9 +1395,7 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
1445
1395
  adducts_list_to_use = adducts_list
1446
1396
  if adducts_list_to_use is None:
1447
1397
  adducts_list_to_use = (
1448
- study.parameters.adducts
1449
- if hasattr(study.parameters, "adducts") and study.parameters.adducts
1450
- else []
1398
+ study.parameters.adducts if hasattr(study.parameters, "adducts") and study.parameters.adducts else []
1451
1399
  )
1452
1400
 
1453
1401
  # Get parameters with study-specific defaults
@@ -1561,11 +1509,9 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
1561
1509
  {
1562
1510
  "components": components,
1563
1511
  "formatted_name": formatted_name,
1564
- "total_mass_shift": float(pos_spec["mass_shift"])
1565
- + float(neut_spec["mass_shift"]),
1512
+ "total_mass_shift": float(pos_spec["mass_shift"]) + float(neut_spec["mass_shift"]),
1566
1513
  "total_charge": total_charge,
1567
- "combined_probability": float(pos_spec["probability"])
1568
- * float(neut_spec["probability"]),
1514
+ "combined_probability": float(pos_spec["probability"]) * float(neut_spec["probability"]),
1569
1515
  "complexity": 2,
1570
1516
  },
1571
1517
  )
@@ -1739,9 +1685,7 @@ def _format_adduct_name(components: list[dict]) -> str:
1739
1685
  elif abs(total_charge) == 1:
1740
1686
  charge_str = "1+" if total_charge > 0 else "1-"
1741
1687
  else:
1742
- charge_str = (
1743
- f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
1744
- )
1688
+ charge_str = f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
1745
1689
 
1746
1690
  return f"[M{formula}]{charge_str}"
1747
1691
 
@@ -1749,53 +1693,53 @@ def _format_adduct_name(components: list[dict]) -> str:
1749
1693
  def _generate_13c_isotopes(lib_df):
1750
1694
  """
1751
1695
  Generate 13C isotope variants for library entries.
1752
-
1696
+
1753
1697
  For each compound with n carbon atoms, creates n+1 entries:
1754
1698
  - iso=0: original compound (no 13C)
1755
1699
  - iso=1: one 13C isotope (+1.00335 Da)
1756
1700
  - iso=2: two 13C isotopes (+2.00670 Da)
1757
1701
  - ...
1758
1702
  - iso=n: n 13C isotopes (+n*1.00335 Da)
1759
-
1703
+
1760
1704
  All isotopomers share the same quant_group.
1761
-
1705
+
1762
1706
  Args:
1763
1707
  lib_df: Polars DataFrame with library entries
1764
-
1708
+
1765
1709
  Returns:
1766
1710
  Polars DataFrame with additional 13C isotope entries
1767
1711
  """
1768
1712
  if lib_df.is_empty():
1769
1713
  return lib_df
1770
-
1714
+
1771
1715
  # First, ensure all original entries have iso=0
1772
1716
  original_df = lib_df.with_columns(pl.lit(0).alias("iso"))
1773
-
1717
+
1774
1718
  isotope_entries = []
1775
1719
  next_lib_uid = lib_df["lib_uid"].max() + 1 if len(lib_df) > 0 else 1
1776
-
1720
+
1777
1721
  # Mass difference for one 13C isotope
1778
1722
  c13_mass_shift = 1.00335 # Mass difference between 13C and 12C
1779
-
1723
+
1780
1724
  for row in original_df.iter_rows(named=True):
1781
1725
  formula = row.get("formula", "")
1782
1726
  if not formula:
1783
1727
  continue
1784
-
1728
+
1785
1729
  # Count carbon atoms in the formula
1786
1730
  carbon_count = _count_carbon_atoms(formula)
1787
1731
  if carbon_count == 0:
1788
1732
  continue
1789
-
1733
+
1790
1734
  # Get the original quant_group to keep it consistent across isotopes
1791
1735
  # All isotopomers of the same compound should have the same quant_group
1792
1736
  quant_group = row.get("quant_group", row.get("cmpd_uid", row.get("lib_uid", 1)))
1793
-
1737
+
1794
1738
  # Generate isotope variants (1 to n 13C atoms)
1795
1739
  for iso_num in range(1, carbon_count + 1):
1796
1740
  # Calculate mass shift for this number of 13C isotopes
1797
1741
  mass_shift = iso_num * c13_mass_shift
1798
-
1742
+
1799
1743
  # Create new entry
1800
1744
  isotope_entry = dict(row) # Copy all fields
1801
1745
  isotope_entry["lib_uid"] = next_lib_uid
@@ -1803,10 +1747,10 @@ def _generate_13c_isotopes(lib_df):
1803
1747
  isotope_entry["m"] = row["m"] + mass_shift
1804
1748
  isotope_entry["mz"] = (row["m"] + mass_shift) / abs(row["z"]) if row["z"] != 0 else row["m"] + mass_shift
1805
1749
  isotope_entry["quant_group"] = quant_group # Keep same quant_group
1806
-
1750
+
1807
1751
  isotope_entries.append(isotope_entry)
1808
1752
  next_lib_uid += 1
1809
-
1753
+
1810
1754
  # Combine original entries (now with iso=0) with isotope entries
1811
1755
  if isotope_entries:
1812
1756
  isotope_df = pl.DataFrame(isotope_entries)
@@ -1818,7 +1762,7 @@ def _generate_13c_isotopes(lib_df):
1818
1762
  # Get common schema
1819
1763
  original_schema = original_df.schema
1820
1764
  isotope_schema = isotope_df.schema
1821
-
1765
+
1822
1766
  # Cast isotope_df columns to match original_df schema where possible
1823
1767
  cast_exprs = []
1824
1768
  for col_name in isotope_df.columns:
@@ -1827,7 +1771,7 @@ def _generate_13c_isotopes(lib_df):
1827
1771
  cast_exprs.append(pl.col(col_name).cast(target_dtype, strict=False))
1828
1772
  else:
1829
1773
  cast_exprs.append(pl.col(col_name))
1830
-
1774
+
1831
1775
  isotope_df_cast = isotope_df.select(cast_exprs)
1832
1776
  return pl.concat([original_df, isotope_df_cast])
1833
1777
  else:
@@ -1837,75 +1781,75 @@ def _generate_13c_isotopes(lib_df):
1837
1781
  def _count_carbon_atoms(formula: str) -> int:
1838
1782
  """
1839
1783
  Count the number of carbon atoms in a molecular formula.
1840
-
1784
+
1841
1785
  Args:
1842
1786
  formula: Molecular formula string like "C6H12O6"
1843
-
1787
+
1844
1788
  Returns:
1845
1789
  Number of carbon atoms
1846
1790
  """
1847
1791
  import re
1848
-
1792
+
1849
1793
  if not formula or not isinstance(formula, str):
1850
1794
  return 0
1851
-
1795
+
1852
1796
  # Look for carbon followed by optional number
1853
1797
  # C followed by digits, or just C (which means 1)
1854
- carbon_matches = re.findall(r'C(\d*)', formula)
1855
-
1798
+ carbon_matches = re.findall(r"C(\d*)", formula)
1799
+
1856
1800
  total_carbons = 0
1857
1801
  for match in carbon_matches:
1858
- if match == '':
1802
+ if match == "":
1859
1803
  # Just 'C' without number means 1 carbon
1860
1804
  total_carbons += 1
1861
1805
  else:
1862
1806
  # 'C' followed by number
1863
1807
  total_carbons += int(match)
1864
-
1808
+
1865
1809
  return total_carbons
1866
1810
 
1867
1811
 
1868
1812
  def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_tol: float = 2.0):
1869
1813
  """Create consensus features from library entries instead of features_df.
1870
-
1814
+
1871
1815
  This method takes all rows from lib_df and creates corresponding entries in
1872
- consensus_df with the same columns as merge(). Instead of relying on
1816
+ consensus_df with the same columns as merge(). Instead of relying on
1873
1817
  features_df, it populates consensus features directly from library data.
1874
-
1818
+
1875
1819
  Before creating new features, it checks for pre-existing consensus features:
1876
1820
  - If rt in lib_df is null: picks consensus feature with matching mz and largest inty_mean
1877
1821
  - If rt is not null: picks consensus feature with matching mz and rt within tolerance
1878
1822
  - If a match is found, skips to the next library entry
1879
-
1823
+
1880
1824
  Args:
1881
1825
  study: Study instance with lib_df populated
1882
- chrom_fhwm: Chromatographic full width at half maximum in seconds
1826
+ chrom_fhwm: Chromatographic full width at half maximum in seconds
1883
1827
  to infer rt_start_mean and rt_end_mean (default: 5.0)
1884
1828
  mz_tol: m/z tolerance for matching existing consensus features (default: 0.01)
1885
1829
  rt_tol: RT tolerance for matching existing consensus features (default: 2.0)
1886
-
1830
+
1887
1831
  Side effects:
1888
1832
  Adds rows to study.consensus_df and study.consensus_mapping_df
1889
1833
  Calls study.find_ms2() at the end
1890
1834
  """
1891
1835
  # Get logger from study if available
1892
1836
  logger = getattr(study, "logger", None)
1893
-
1837
+
1894
1838
  # Validate inputs
1895
1839
  if getattr(study, "lib_df", None) is None or study.lib_df.is_empty():
1896
1840
  if logger:
1897
1841
  logger.error("Library (study.lib_df) is empty; call lib_load() first")
1898
1842
  raise ValueError("Library (study.lib_df) is empty; call lib_load() first")
1899
-
1843
+
1900
1844
  if logger:
1901
1845
  logger.info(f"Creating consensus features from {len(study.lib_df)} library entries")
1902
-
1846
+
1903
1847
  # Initialize consensus DataFrames if they don't exist
1904
1848
  if not hasattr(study, "consensus_df") or study.consensus_df is None:
1905
1849
  study.consensus_df = pl.DataFrame()
1906
1850
  if not hasattr(study, "consensus_mapping_df") or study.consensus_mapping_df is None:
1907
1851
  study.consensus_mapping_df = pl.DataFrame()
1908
-
1852
+
1909
1853
  # Get cached adducts for consistent adduct handling
1910
1854
  cached_adducts_df = None
1911
1855
  cached_valid_adducts = None
@@ -1919,26 +1863,26 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
1919
1863
  if logger:
1920
1864
  logger.warning(f"Could not retrieve study adducts: {e}")
1921
1865
  cached_valid_adducts = set()
1922
-
1866
+
1923
1867
  # Always allow '?' adducts
1924
1868
  cached_valid_adducts.add("?")
1925
-
1869
+
1926
1870
  # Get starting consensus_uid counter
1927
1871
  if not study.consensus_df.is_empty():
1928
1872
  max_existing_uid = study.consensus_df["consensus_uid"].max()
1929
1873
  consensus_uid_counter = int(max_existing_uid) + 1 if max_existing_uid is not None else 0
1930
1874
  else:
1931
1875
  consensus_uid_counter = 0
1932
-
1876
+
1933
1877
  # Track [M+H] iso=0 and [M-H] iso=0 entries for adduct grouping
1934
1878
  base_adduct_groups = {} # key: (mz, adduct_base), value: adduct_group
1935
-
1879
+
1936
1880
  # Process each library entry
1937
1881
  consensus_metadata = []
1938
1882
  consensus_mapping_list = []
1939
1883
  matched_count = 0
1940
1884
  skipped_count = 0
1941
-
1885
+
1942
1886
  for lib_row in study.lib_df.iter_rows(named=True):
1943
1887
  # Extract basic library data
1944
1888
  lib_uid = lib_row.get("lib_uid")
@@ -1947,21 +1891,19 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
1947
1891
  iso = lib_row.get("iso", 0)
1948
1892
  adduct = lib_row.get("adduct")
1949
1893
  z = lib_row.get("z", 1) # charge
1950
-
1894
+
1951
1895
  # Skip entries without essential data
1952
1896
  if mz is None:
1953
1897
  if logger:
1954
1898
  logger.warning(f"Skipping library entry {lib_uid} - no m/z value")
1955
1899
  continue
1956
-
1900
+
1957
1901
  # Check for pre-existing consensus features
1958
1902
  existing_match = None
1959
1903
  if not study.consensus_df.is_empty():
1960
1904
  # Filter by m/z tolerance first
1961
- mz_matches = study.consensus_df.filter(
1962
- (pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol)
1963
- )
1964
-
1905
+ mz_matches = study.consensus_df.filter((pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol))
1906
+
1965
1907
  if not mz_matches.is_empty():
1966
1908
  if rt is None:
1967
1909
  # If rt is null, pick the consensus feature with largest inty_mean
@@ -1974,7 +1916,7 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
1974
1916
  )
1975
1917
  if not rt_matches.is_empty():
1976
1918
  existing_match = rt_matches.sort("inty_mean", descending=True).head(1)
1977
-
1919
+
1978
1920
  if existing_match is not None and len(existing_match) > 0:
1979
1921
  # Found a matching consensus feature, skip this library entry
1980
1922
  matched_count += 1
@@ -1982,27 +1924,29 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
1982
1924
  match_uid = existing_match["consensus_uid"][0]
1983
1925
  match_mz = existing_match["mz"][0]
1984
1926
  match_rt = existing_match["rt"][0]
1985
- logger.debug(f"Library entry {lib_uid} (mz={mz:.4f}, rt={rt}) matched existing consensus {match_uid} (mz={match_mz:.4f}, rt={match_rt})")
1927
+ logger.debug(
1928
+ f"Library entry {lib_uid} (mz={mz:.4f}, rt={rt}) matched existing consensus {match_uid} (mz={match_mz:.4f}, rt={match_rt})"
1929
+ )
1986
1930
  continue
1987
-
1931
+
1988
1932
  # No match found, create new consensus feature
1989
1933
  # Handle missing RT - use 0 as placeholder
1990
1934
  if rt is None:
1991
1935
  rt = 0.0
1992
1936
  if logger and skipped_count < 5: # Log first few
1993
1937
  logger.debug(f"Library entry {lib_uid} has no RT, using 0.0")
1994
-
1938
+
1995
1939
  # Calculate RT range based on chrom_fhwm
1996
1940
  half_width = chrom_fhwm / 2.0
1997
1941
  rt_start = rt - half_width
1998
1942
  rt_end = rt + half_width
1999
-
1943
+
2000
1944
  # Get adduct information
2001
1945
  adduct_top = adduct if adduct else "?"
2002
1946
  adduct_charge_top = None
2003
1947
  adduct_mass_shift_top = None
2004
1948
  adduct_mass_neutral_top = None
2005
-
1949
+
2006
1950
  # Parse adduct to get charge and mass shift
2007
1951
  if adduct_top and cached_adducts_df is not None and not cached_adducts_df.is_empty():
2008
1952
  # Look for exact match in study adducts
@@ -2011,7 +1955,7 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2011
1955
  adduct_row = matching_adduct.row(0, named=True)
2012
1956
  adduct_charge_top = adduct_row["charge"]
2013
1957
  adduct_mass_shift_top = adduct_row["mass_shift"]
2014
-
1958
+
2015
1959
  # Fallback to default values if not found
2016
1960
  if adduct_charge_top is None:
2017
1961
  adduct_charge_top = int(z) if z else 1
@@ -2029,15 +1973,15 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2029
1973
  adduct_mass_shift_top = 1.007825
2030
1974
  if adduct_top == "?":
2031
1975
  adduct_top = "[M+?]1+"
2032
-
1976
+
2033
1977
  # Calculate neutral mass
2034
1978
  if adduct_charge_top and adduct_mass_shift_top is not None:
2035
1979
  adduct_mass_neutral_top = mz * abs(adduct_charge_top) - adduct_mass_shift_top
2036
-
1980
+
2037
1981
  # Determine adduct group for isotopologues and related adducts
2038
1982
  adduct_group = consensus_uid_counter # Default: each entry gets its own group
2039
1983
  adduct_of = 0 # Default: this is the base adduct
2040
-
1984
+
2041
1985
  # Track base adducts ([M+H] iso=0 or [M-H] iso=0) for grouping
2042
1986
  base_adduct_key = None
2043
1987
  if iso == 0 and adduct_top in ["[M+H]+", "[M+H]1+", "[M-H]-", "[M-H]1-"]:
@@ -2049,21 +1993,22 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2049
1993
  # Calculate the base m/z (subtract isotope mass shifts)
2050
1994
  c13_mass_shift = 1.00335
2051
1995
  base_mz = mz - (iso * c13_mass_shift / abs(adduct_charge_top))
2052
-
1996
+
2053
1997
  # Look for matching base adduct
2054
1998
  for (stored_mz, stored_adduct), stored_group in base_adduct_groups.items():
2055
1999
  if abs(stored_mz - base_mz) < mz_tol and stored_adduct == adduct_top:
2056
2000
  adduct_group = stored_group
2057
2001
  adduct_of = stored_group
2058
2002
  break
2059
-
2003
+
2060
2004
  # Create adduct values list with proper structure (format: structured data with fields: adduct, count, percentage, mass)
2061
2005
  adduct_values = [{"adduct": adduct_top, "count": 1, "percentage": 100.0, "mass": 0.0}]
2062
-
2006
+
2063
2007
  # Generate unique consensus_id string
2064
2008
  import uuid
2065
- consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
2066
-
2009
+
2010
+ consensus_id_str = str(uuid.uuid4()).replace("-", "")[:16]
2011
+
2067
2012
  # Build consensus metadata with requested modifications for new entries
2068
2013
  metadata = {
2069
2014
  "consensus_uid": consensus_uid_counter,
@@ -2096,7 +2041,9 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2096
2041
  "adducts": adduct_values,
2097
2042
  "adduct_charge_top": adduct_charge_top,
2098
2043
  "adduct_group": adduct_group, # Use calculated adduct group
2099
- "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6) if adduct_mass_neutral_top is not None else None,
2044
+ "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
2045
+ if adduct_mass_neutral_top is not None
2046
+ else None,
2100
2047
  "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
2101
2048
  "adduct_of": adduct_of, # Use calculated adduct_of
2102
2049
  "adduct_top": adduct_top,
@@ -2105,9 +2052,9 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2105
2052
  "id_top_adduct": None, # Set to null as requested
2106
2053
  "id_top_score": None, # Set to null as requested
2107
2054
  }
2108
-
2055
+
2109
2056
  consensus_metadata.append(metadata)
2110
-
2057
+
2111
2058
  # Create mapping entry (maps to library entry as "virtual" feature)
2112
2059
  # Use lib_uid as the feature_uid and a virtual sample_uid of 0
2113
2060
  # Match existing consensus_mapping_df column order: consensus_uid, feature_uid, sample_uid
@@ -2116,18 +2063,20 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2116
2063
  "feature_uid": lib_uid, # Use lib_uid as feature reference
2117
2064
  "sample_uid": 0, # Virtual sample for library entries
2118
2065
  })
2119
-
2066
+
2120
2067
  consensus_uid_counter += 1
2121
-
2068
+
2122
2069
  # Log matching statistics
2123
2070
  if logger:
2124
2071
  total_processed = matched_count + len(consensus_metadata)
2125
- logger.info(f"Processed {total_processed} library entries: {matched_count} matched existing consensus features, {len(consensus_metadata)} created new features")
2126
-
2072
+ logger.info(
2073
+ f"Processed {total_processed} library entries: {matched_count} matched existing consensus features, {len(consensus_metadata)} created new features"
2074
+ )
2075
+
2127
2076
  # Convert to DataFrames with proper schema alignment
2128
2077
  if consensus_metadata:
2129
2078
  new_consensus_df = pl.DataFrame(consensus_metadata, strict=False)
2130
-
2079
+
2131
2080
  # Ensure schema compatibility with existing consensus_df
2132
2081
  if not study.consensus_df.is_empty():
2133
2082
  # Cast columns to match existing schema
@@ -2143,36 +2092,36 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2143
2092
  cast_exprs.append(pl.col(col_name).cast(target_dtype, strict=False))
2144
2093
  else:
2145
2094
  cast_exprs.append(pl.col(col_name))
2146
-
2095
+
2147
2096
  new_consensus_df = new_consensus_df.select(cast_exprs)
2148
-
2097
+
2149
2098
  new_consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)
2150
-
2099
+
2151
2100
  # Append to existing DataFrames
2152
2101
  if not study.consensus_df.is_empty():
2153
2102
  study.consensus_df = pl.concat([study.consensus_df, new_consensus_df])
2154
2103
  else:
2155
2104
  study.consensus_df = new_consensus_df
2156
-
2105
+
2157
2106
  if not study.consensus_mapping_df.is_empty():
2158
2107
  study.consensus_mapping_df = pl.concat([study.consensus_mapping_df, new_consensus_mapping_df])
2159
2108
  else:
2160
2109
  study.consensus_mapping_df = new_consensus_mapping_df
2161
-
2110
+
2162
2111
  if logger:
2163
2112
  logger.info(f"Added {len(consensus_metadata)} consensus features from library")
2164
2113
  else:
2165
2114
  if logger:
2166
2115
  logger.warning("No valid consensus features created from library")
2167
2116
  return
2168
-
2117
+
2169
2118
  # Store operation in history
2170
2119
  if hasattr(study, "update_history"):
2171
2120
  study.update_history(
2172
2121
  ["lib_to_consensus"],
2173
2122
  {"chrom_fhwm": chrom_fhwm, "lib_entries": len(study.lib_df)},
2174
2123
  )
2175
-
2124
+
2176
2125
  # Perform find_ms2 at the end
2177
2126
  try:
2178
2127
  if hasattr(study, "find_ms2"):
@@ -2185,6 +2134,6 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2185
2134
  except Exception as e:
2186
2135
  if logger:
2187
2136
  logger.warning(f"find_ms2 failed: {e}")
2188
-
2137
+
2189
2138
  if logger:
2190
2139
  logger.success(f"lib_to_consensus completed: {len(consensus_metadata)} features added")