masster 0.5.13__py3-none-any.whl → 0.5.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster has been flagged as possibly problematic.

masster/study/id.py CHANGED
@@ -145,16 +145,61 @@ def lib_load(
             column_order.append("quant_group")
         elif col == "formula" and "iso" in columns_list and "iso" not in column_order:
             column_order.append("iso")
-
-    # Apply the column ordering
-    filtered_lf = filtered_lf.select(column_order)
 
+
     # Add to existing lib_df instead of replacing
     if (
         hasattr(study, "lib_df")
         and study.lib_df is not None
         and not study.lib_df.is_empty()
     ):
+        # Check for schema compatibility and handle mismatches
+        existing_cols = set(study.lib_df.columns)
+        new_cols = set(filtered_lf.columns)
+
+        # If schemas don't match, we need to align them
+        if existing_cols != new_cols:
+            # Get union of all columns
+            all_cols = existing_cols.union(new_cols)
+
+            # Add missing columns to existing data with appropriate defaults
+            for col in new_cols - existing_cols:
+                if col == "probability":
+                    # Add probability column to existing data - try to calculate from adduct
+                    if "adduct" in study.lib_df.columns:
+                        try:
+                            adduct_prob_map = _get_adduct_probabilities(study)
+                            study.lib_df = study.lib_df.with_columns(
+                                pl.col("adduct").map_elements(
+                                    lambda adduct: adduct_prob_map.get(adduct, 1.0) if adduct is not None else 1.0,
+                                    return_dtype=pl.Float64
+                                ).alias("probability")
+                            )
+                        except Exception:
+                            study.lib_df = study.lib_df.with_columns(pl.lit(1.0).alias("probability"))
+                    else:
+                        study.lib_df = study.lib_df.with_columns(pl.lit(1.0).alias("probability"))
+                elif col == "iso":
+                    study.lib_df = study.lib_df.with_columns(pl.lit(0).cast(pl.Int64).alias("iso"))
+                elif col == "quant_group":
+                    # Set quant_group using cmpd_uid or lib_uid
+                    if "cmpd_uid" in study.lib_df.columns:
+                        study.lib_df = study.lib_df.with_columns(pl.col("cmpd_uid").cast(pl.Int64).alias("quant_group"))
+                    else:
+                        study.lib_df = study.lib_df.with_columns(pl.col("lib_uid").cast(pl.Int64).alias("quant_group"))
+                else:
+                    # Default to null for other columns
+                    study.lib_df = study.lib_df.with_columns(pl.lit(None).alias(col))
+
+            # Add missing columns to new data with appropriate defaults
+            for col in existing_cols - new_cols:
+                if col not in ["probability", "iso", "quant_group"]:  # These should already be handled
+                    filtered_lf = filtered_lf.with_columns(pl.lit(None).alias(col))
+
+            # Ensure column order matches for concatenation - use existing column order
+            existing_column_order = list(study.lib_df.columns)
+            filtered_lf = filtered_lf.select(existing_column_order)
+
         # Concatenate with existing data
         study.lib_df = pl.concat([study.lib_df, filtered_lf])
     else:
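
The added block aligns the two schemas before concatenation. A minimal standalone sketch of that pattern, with toy frames standing in for study.lib_df and filtered_lf (the 1.0 default mirrors the probability fallback above; column names are illustrative):

import polars as pl

# Toy stand-ins for the existing library table and the newly loaded rows.
existing = pl.DataFrame({"lib_uid": [1, 2], "name": ["A", "B"], "formula": ["H2O", "CO2"]})
incoming = pl.DataFrame({"lib_uid": [3], "name": ["C"], "probability": [0.8]})

# Columns present only in the incoming frame get a default on the existing side.
for col in set(incoming.columns) - set(existing.columns):
    existing = existing.with_columns(pl.lit(1.0).alias(col))

# Columns present only in the existing frame get typed nulls on the incoming side.
for col in set(existing.columns) - set(incoming.columns):
    incoming = incoming.with_columns(pl.lit(None).cast(existing.schema[col]).alias(col))

# Reorder to the existing column order, then concatenate vertically.
combined = pl.concat([existing, incoming.select(existing.columns)])
print(combined)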
@@ -209,8 +254,19 @@ def _setup_identify_parameters(params, kwargs):
 
     # Override parameters with any provided kwargs
     if kwargs:
+        # Handle parameter name mapping for backwards compatibility
+        param_mapping = {
+            'rt_tolerance': 'rt_tol',
+            'mz_tolerance': 'mz_tol'
+        }
+
         for param_name, value in kwargs.items():
-            if hasattr(params, param_name):
+            # Check if we need to map the parameter name
+            mapped_name = param_mapping.get(param_name, param_name)
+
+            if hasattr(params, mapped_name):
+                setattr(params, mapped_name, value)
+            elif hasattr(params, param_name):
                 setattr(params, param_name, value)
 
     return params
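
A small self-contained sketch of the kwarg aliasing added here; the _Params class is a hypothetical stand-in for the real identify parameter object, and only the mapping logic comes from the diff:

# Hypothetical stand-in for the identify parameter object.
class _Params:
    rt_tol = 30.0
    mz_tol = 0.01

param_mapping = {"rt_tolerance": "rt_tol", "mz_tolerance": "mz_tol"}
params = _Params()

for name, value in {"rt_tolerance": 10.0, "mz_tol": 0.005}.items():
    mapped = param_mapping.get(name, name)
    if hasattr(params, mapped):
        setattr(params, mapped, value)
    elif hasattr(params, name):
        setattr(params, name, value)

print(params.rt_tol, params.mz_tol)  # 10.0 0.005: old and new spellings both land on rt_tol / mz_tol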
@@ -319,9 +375,13 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t
                 else None
             )
 
-            # Get adduct probability from cached map
+            # Get library probability as base score, then multiply by adduct probability
+            lib_probability = match_row.get("probability", 1.0) if match_row.get("probability") is not None else 1.0
             adduct = match_row.get("adduct")
-            score = adduct_prob_map.get(adduct, 1.0) if adduct else 1.0
+            adduct_probability = adduct_prob_map.get(adduct, 1.0) if adduct else 1.0
+            score = lib_probability * adduct_probability
+            # Scale to 0-100 and round to 1 decimal place
+            score = round(score * 100.0, 1)
 
             match_results.append({
                 "lib_uid": match_row.get("lib_uid"),
@@ -337,7 +397,11 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t
 
 
 def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, cons_uid):
-    """Find library matches using optimized vectorized operations."""
+    """
+    Find library matches using optimized vectorized operations.
+
+    FIXED VERSION: Prevents incorrect matching of same compound to different m/z values.
+    """
     # Filter by m/z tolerance using vectorized operations
     matches = lib_df.filter(
         (pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol)
@@ -345,43 +409,78 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
 
     initial_match_count = len(matches)
 
-    # Apply RT filter if available
+    # Apply RT filter if available - STRICT VERSION (no fallback)
     if rt_tol is not None and cons_rt is not None and not matches.is_empty():
-        rt_matches = matches.filter(
-            pl.col("rt").is_not_null() &
-            (pl.col("rt") >= cons_rt - rt_tol) &
-            (pl.col("rt") <= cons_rt + rt_tol)
-        )
+        # First, check if any m/z matches have RT data
+        rt_candidates = matches.filter(pl.col("rt").is_not_null())
 
-        if not rt_matches.is_empty():
-            matches = rt_matches
+        if not rt_candidates.is_empty():
+            # Apply RT filtering to candidates with RT data
+            rt_matches = rt_candidates.filter(
+                (pl.col("rt") >= cons_rt - rt_tol) &
+                (pl.col("rt") <= cons_rt + rt_tol)
+            )
+
+            if not rt_matches.is_empty():
+                matches = rt_matches
+                if logger:
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, {len(matches)} after RT filter"
+                    )
+            else:
+                # NO FALLBACK - if RT filtering finds no matches, return empty
+                matches = rt_matches  # This is empty
+                if logger:
+                    logger.debug(
+                        f"Consensus {cons_uid}: RT filtering eliminated all {len(rt_candidates)} candidates (rt_tol={rt_tol}s) - no matches returned"
+                    )
+        else:
+            # No RT data in library matches - return empty if strict RT filtering requested
             if logger:
                 logger.debug(
-                    f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(matches)} after RT filter"
+                    f"Consensus {cons_uid}: {initial_match_count} m/z matches but none have library RT data - no matches returned due to RT filtering"
                 )
+            matches = pl.DataFrame()  # Return empty DataFrame
+
+    # FIX 1: Add stricter m/z validation - prioritize more accurate matches
+    if not matches.is_empty():
+        strict_mz_tol = mz_tol * 0.5  # Use 50% of tolerance as strict threshold
+        strict_matches = matches.filter(
+            (pl.col("mz") >= cons_mz - strict_mz_tol) & (pl.col("mz") <= cons_mz + strict_mz_tol)
+        )
+
+        if not strict_matches.is_empty():
+            # Use strict matches if available
+            matches = strict_matches
+            if logger:
+                logger.debug(f"Consensus {cons_uid}: Using {len(matches)} strict m/z matches (within {strict_mz_tol:.6f} Da)")
         else:
             if logger:
-                logger.debug(
-                    f"Consensus {cons_uid}: {initial_match_count} m/z matches, 0 after RT filter - using m/z matches only"
-                )
+                logger.debug(f"Consensus {cons_uid}: No strict matches, using {len(matches)} loose matches")
 
-    # Optimized deduplication using Polars operations
+    # FIX 2: Improved deduplication - prioritize by m/z accuracy
     if not matches.is_empty() and len(matches) > 1:
         if "formula" in matches.columns and "adduct" in matches.columns:
             pre_dedup_count = len(matches)
 
-            # Use Polars group_by with maintain_order for consistent results
+            # Calculate m/z error for sorting
+            matches = matches.with_columns([
+                (pl.col("mz") - cons_mz).abs().alias("mz_error_abs")
+            ])
+
+            # Group by formula and adduct, but keep the most accurate m/z match
             matches = (
                 matches
-                .sort("lib_uid")  # Ensure consistent ordering
+                .sort(["mz_error_abs", "lib_uid"])  # Sort by m/z accuracy first, then lib_uid for consistency
                 .group_by(["formula", "adduct"], maintain_order=True)
                 .first()
+                .drop("mz_error_abs")  # Remove the temporary column
             )
 
             post_dedup_count = len(matches)
             if logger and post_dedup_count < pre_dedup_count:
                 logger.debug(
-                    f"Consensus {cons_uid}: deduplicated {pre_dedup_count} to {post_dedup_count} matches"
+                    f"Consensus {cons_uid}: deduplicated {pre_dedup_count} to {post_dedup_count} matches (m/z accuracy prioritized)"
                 )
 
     return matches
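
A standalone sketch of the new deduplication step: within each (formula, adduct) group, keep the row whose library m/z is closest to the consensus m/z. The values below are illustrative toy data, not taken from a real library:

import polars as pl

cons_mz = 180.0634
matches = pl.DataFrame({
    "lib_uid": [10, 11, 12],
    "formula": ["C6H12O6", "C6H12O6", "C6H12O7"],
    "adduct": ["[M+H]+", "[M+H]+", "[M+H]+"],
    "mz": [180.0637, 180.0700, 180.0590],
})

deduped = (
    matches
    .with_columns((pl.col("mz") - cons_mz).abs().alias("mz_error_abs"))
    .sort(["mz_error_abs", "lib_uid"])  # most accurate m/z first, lib_uid as tie-breaker
    .group_by(["formula", "adduct"], maintain_order=True)
    .first()
    .drop("mz_error_abs")
)
print(deduped)  # lib_uid 10 wins the C6H12O6 group; lib_uid 12 keeps its own group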
@@ -617,7 +716,11 @@ def _apply_scoring_adjustments(study, params):
 
 
 def _update_consensus_id_columns(study, logger=None):
-    """Update consensus_df with top-scoring identification results using safe in-place updates."""
+    """
+    Update consensus_df with top-scoring identification results using safe in-place updates.
+
+    FIXED VERSION: Prevents same compound from being assigned to vastly different m/z values.
+    """
     try:
         if not hasattr(study, "id_df") or study.id_df is None or study.id_df.is_empty():
             if logger:
@@ -634,14 +737,47 @@ def _update_consensus_id_columns(study, logger=None):
                 logger.debug("No consensus data available")
             return
 
-        # Get library columns we need
-        lib_columns = ["lib_uid", "name", "adduct"]
+        # Get library columns we need (include mz for validation)
+        lib_columns = ["lib_uid", "name", "adduct", "mz"]
         if "class" in study.lib_df.columns:
             lib_columns.append("class")
 
-        # Get top-scoring identification for each consensus feature
+        # FIX 1: Join identification results with consensus m/z for validation
+        id_with_consensus = study.id_df.join(
+            study.consensus_df.select(["consensus_uid", "mz"]),
+            on="consensus_uid",
+            how="left",
+            suffix="_consensus"
+        )
+
+        # FIX 2: Validate m/z accuracy - filter out poor matches
+        id_with_lib = id_with_consensus.join(
+            study.lib_df.select(["lib_uid", "mz"]),
+            on="lib_uid",
+            how="left",
+            suffix="_lib"
+        )
+
+        # Calculate actual m/z error and filter out excessive errors
+        id_validated = id_with_lib.with_columns([
+            (pl.col("mz") - pl.col("mz_lib")).abs().alias("actual_mz_error")
+        ])
+
+        # Filter out matches with excessive m/z error
+        max_reasonable_error = 0.02  # 20 millidalton maximum error
+        id_validated = id_validated.filter(
+            (pl.col("actual_mz_error") <= max_reasonable_error) | pl.col("actual_mz_error").is_null()
+        )
+
+        if logger:
+            original_count = len(id_with_consensus)
+            validated_count = len(id_validated)
+            if validated_count < original_count:
+                logger.warning(f"Filtered out {original_count - validated_count} identifications with excessive m/z error (>{max_reasonable_error:.3f} Da)")
+
+        # Get top-scoring identification for each consensus feature (from validated results)
         top_ids = (
-            study.id_df
+            id_validated
             .sort(["consensus_uid", "score"], descending=[False, True])
            .group_by("consensus_uid", maintain_order=True)
            .first()
@@ -656,6 +792,37 @@ def _update_consensus_id_columns(study, logger=None):
             .rename({"name": "id_top_name"})
         )
 
+        # FIX 3: Check for conflicts where same compound+adduct assigned to very different m/z
+        if not top_ids.is_empty():
+            compound_groups = (
+                top_ids
+                .join(study.consensus_df.select(["consensus_uid", "mz"]), on="consensus_uid", how="left")
+                .group_by(["id_top_name", "id_top_adduct"])
+                .agg([
+                    pl.col("consensus_uid").count().alias("count"),
+                    pl.col("mz").min().alias("mz_min"),
+                    pl.col("mz").max().alias("mz_max")
+                ])
+                .with_columns([
+                    (pl.col("mz_max") - pl.col("mz_min")).alias("mz_range")
+                ])
+            )
+
+            # Find problematic assignments (same compound+adduct with >0.1 Da m/z range)
+            problematic = compound_groups.filter(
+                (pl.col("count") > 1) & (pl.col("mz_range") > 0.1)
+            )
+
+            if not problematic.is_empty() and logger:
+                for row in problematic.iter_rows(named=True):
+                    name = row["id_top_name"]
+                    adduct = row["id_top_adduct"]
+                    count = row["count"]
+                    mz_range = row["mz_range"]
+                    logger.warning(
+                        f"Identification conflict detected: '{name}' ({adduct}) assigned to {count} features with {mz_range:.4f} Da m/z range"
+                    )
+
         # Ensure we have the id_top columns in consensus_df
         for col_name, dtype in [
             ("id_top_name", pl.String),
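
The three fixes above can be exercised in isolation. A sketch with toy frames standing in for study.id_df, consensus_df, and lib_df; the 0.02 Da and 0.1 Da thresholds are the ones hard-coded in the diff:

import polars as pl

id_df = pl.DataFrame({"consensus_uid": [1, 2], "lib_uid": [10, 11], "score": [95.0, 80.0]})
consensus_df = pl.DataFrame({"consensus_uid": [1, 2], "mz": [180.0634, 255.2319]})
lib_df = pl.DataFrame({"lib_uid": [10, 11], "mz": [180.0639, 255.3000]})

# FIX 1/2: attach consensus and library m/z, compute the absolute error, drop matches over 0.02 Da.
id_validated = (
    id_df
    .join(consensus_df, on="consensus_uid", how="left")
    .join(lib_df, on="lib_uid", how="left", suffix="_lib")
    .with_columns((pl.col("mz") - pl.col("mz_lib")).abs().alias("actual_mz_error"))
    .filter((pl.col("actual_mz_error") <= 0.02) | pl.col("actual_mz_error").is_null())
)
print(id_validated)  # the lib_uid 11 match (about 0.068 Da off) is filtered out

# FIX 3: flag a compound+adduct assigned to features whose m/z values span more than 0.1 Da.
assigned = pl.DataFrame({
    "id_top_name": ["CompoundA", "CompoundA", "CompoundB"],
    "id_top_adduct": ["[M+H]+", "[M+H]+", "[M+H]+"],
    "mz": [181.0707, 181.4200, 195.0877],
})
conflicts = (
    assigned
    .group_by(["id_top_name", "id_top_adduct"])
    .agg([
        pl.col("mz").count().alias("count"),
        pl.col("mz").min().alias("mz_min"),
        pl.col("mz").max().alias("mz_max"),
    ])
    .with_columns((pl.col("mz_max") - pl.col("mz_min")).alias("mz_range"))
    .filter((pl.col("count") > 1) & (pl.col("mz_range") > 0.1))
)
print(conflicts)  # CompoundA / [M+H]+ is reported with a 0.3493 Da spread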
@@ -783,7 +950,7 @@ def identify(study, features=None, params=None, **kwargs):
     if logger:
         features_with_matches = len([r for r in results if len(r["matches"]) > 0])
         total_matches = sum(len(r["matches"]) for r in results)
-        logger.info(
+        logger.success(
             f"Identification completed: {features_with_matches}/{consensus_count} features matched, {total_matches} total identifications",
         )
 
@@ -806,6 +973,8 @@ def get_id(study, features=None) -> pl.DataFrame:
         - mz (consensus feature m/z)
         - rt (consensus feature RT)
         - name (compound name from library)
+        - shortname (short name from library, if available)
+        - class (compound class from library, if available)
         - formula (molecular formula from library)
         - adduct (adduct type from library)
         - smiles (SMILES notation from library)
@@ -873,6 +1042,8 @@ def get_id(study, features=None) -> pl.DataFrame:
     lib_cols = [
         "lib_uid",
         "name",
+        "shortname",
+        "class",
         "formula",
         "adduct",
         "smiles",
@@ -901,6 +1072,8 @@ def get_id(study, features=None) -> pl.DataFrame:
         "cmpd_uid" if "cmpd_uid" in result_df.columns else None,
         "lib_uid",
         "name" if "name" in result_df.columns else None,
+        "shortname" if "shortname" in result_df.columns else None,
+        "class" if "class" in result_df.columns else None,
         "formula" if "formula" in result_df.columns else None,
         "adduct" if "adduct" in result_df.columns else None,
         "mz" if "mz" in result_df.columns else None,
@@ -952,6 +1125,8 @@ def get_id(study, features=None) -> pl.DataFrame:
         "cmpd_uid",
         "lib_uid",
         "name",
+        "shortname",
+        "class",
         "formula",
         "adduct",
         "mz",
@@ -1094,7 +1269,7 @@ def id_reset(study):
         del study.history["identify"]
 
     if logger:
-        logger.success("Identification data reset completed")
+        logger.info("Identification data reset completed")
 
 
 def lib_reset(study):
@@ -1123,11 +1298,33 @@ def lib_reset(study):
         logger.debug("Checking for consensus features created by lib_to_consensus()")
 
     try:
-        # Filter for features with number_samples = -1 or 0
-        # Since consensus_select doesn't support list of discrete values, use direct filtering
-        lib_consensus_features = study.consensus_df.filter(
-            (pl.col("number_samples") == -1) | (pl.col("number_samples") == 0)
-        )
+        # Filter for features created by lib_to_consensus()
+        # These can be identified by:
+        # 1. number_samples < 1 (set to 0.0 by lib_to_consensus)
+        # 2. AND have corresponding entries in consensus_mapping_df with sample_uid = 0 (virtual sample)
+
+        # First check if we have any features with number_samples < 1
+        potential_lib_features = study.consensus_df.filter(pl.col("number_samples") < 1)
+
+        if potential_lib_features is not None and not potential_lib_features.is_empty():
+            # Further filter by checking if they have sample_uid = 0 in consensus_mapping_df
+            # This ensures we only remove library-derived features, not legitimate features with 0 samples
+            if hasattr(study, "consensus_mapping_df") and not study.consensus_mapping_df.is_empty():
+                lib_consensus_uids = study.consensus_mapping_df.filter(
+                    pl.col("sample_uid") == 0
+                )["consensus_uid"].unique().to_list()
+
+                if lib_consensus_uids:
+                    lib_consensus_features = potential_lib_features.filter(
+                        pl.col("consensus_uid").is_in(lib_consensus_uids)
+                    )
+                else:
+                    lib_consensus_features = pl.DataFrame()  # No library features found
+            else:
+                # If no consensus_mapping_df, fall back to number_samples < 1 only
+                lib_consensus_features = potential_lib_features
+        else:
+            lib_consensus_features = pl.DataFrame()  # No features with number_samples < 1
 
         if lib_consensus_features is not None and not lib_consensus_features.is_empty():
             num_lib_features = len(lib_consensus_features)
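
A sketch of the stricter selection lib_reset now uses, with toy frames in place of study.consensus_df and consensus_mapping_df (sample_uid 0 plays the role of the virtual library sample):

import polars as pl

consensus_df = pl.DataFrame({"consensus_uid": [1, 2, 3], "number_samples": [0, 0, 5]})
consensus_mapping_df = pl.DataFrame({"consensus_uid": [1, 3], "sample_uid": [0, 7]})

potential = consensus_df.filter(pl.col("number_samples") < 1)
lib_uids = consensus_mapping_df.filter(pl.col("sample_uid") == 0)["consensus_uid"].unique().to_list()
lib_features = potential.filter(pl.col("consensus_uid").is_in(lib_uids))
print(lib_features)  # only consensus_uid 1; uid 2 has no virtual-sample mapping, so it is kept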
@@ -1199,7 +1396,7 @@ def lib_reset(study):
         del study.history["lib_to_consensus"]
 
     if logger:
-        logger.success("Library and identification data reset completed")
+        logger.info("Library and identification data reset completed")
 
 
 def _get_adducts(study, adducts_list: list | None = None, **kwargs):
@@ -1306,7 +1503,8 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
 
     # 1. Single adducts (filter out neutral adducts with charge == 0)
     for spec in base_specs:
-        if charge_min <= spec["charge"] <= charge_max and spec["charge"] != 0:
+        # Study-level filtering: exclude neutral adducts (charge=0) but use abs() for charged adducts
+        if spec["charge"] != 0 and charge_min <= abs(spec["charge"]) <= charge_max:
             formatted_name = _format_adduct_name([spec])
             combinations_list.append(
                 {
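
A quick check of why the abs() matters (the charge_min / charge_max values are illustrative): a negative-mode adduct with charge -1 used to fail the range test and is now kept.

charge_min, charge_max = 1, 2
charge = -1  # e.g. an [M-H]- style adduct

old_keep = charge_min <= charge <= charge_max and charge != 0       # False: -1 is below charge_min
new_keep = charge != 0 and charge_min <= abs(charge) <= charge_max  # True: the magnitude is in range
print(old_keep, new_keep)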
@@ -1324,7 +1522,7 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
         base_charge = spec["charge"]
         for multiplier in range(2, min(max_combinations + 1, 4)):  # Up to 3x multiplier
             total_charge = base_charge * multiplier
-            if charge_min <= total_charge <= charge_max and total_charge != 0:
+            if charge_min <= abs(total_charge) <= charge_max and total_charge != 0:
                 components = [spec] * multiplier
                 formatted_name = _format_adduct_name(components)
                 probability_multiplied = float(spec["probability"]) ** multiplier
@@ -1346,7 +1544,7 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
     for pos_spec in positive_specs[:2]:  # Limit to first 2 positive specs
         for neut_spec in neutral_specs[:1]:  # Only 1 neutral loss
             total_charge = pos_spec["charge"] + neut_spec["charge"]
-            if charge_min <= total_charge <= charge_max and total_charge != 0:
+            if charge_min <= abs(total_charge) <= charge_max and total_charge != 0:
                 components = [pos_spec, neut_spec]
                 formatted_name = _format_adduct_name(components)
                 combinations_list.append(