masster 0.4.22__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic; see the registry's advisory listing for details.

masster/study/id.py CHANGED
@@ -15,6 +15,7 @@ def lib_load(
15
15
  lib_source,
16
16
  polarity: str | None = None,
17
17
  adducts: list | None = None,
18
+ iso: str | None = None,
18
19
  ):
19
20
  """Load a compound library into the study.
20
21
 
@@ -23,6 +24,7 @@ def lib_load(
23
24
  lib_source: either a CSV file path (str) or a Lib instance
24
25
  polarity: ionization polarity ("positive" or "negative") - used when lib_source is a CSV path
25
26
  adducts: specific adducts to generate - used when lib_source is a CSV path
27
+ iso: isotope generation mode ("13C" to generate 13C isotopes, None for no isotopes)
26
28
 
27
29
  Side effects:
28
30
  sets study.lib_df to a Polars DataFrame and stores the lib object on
@@ -97,6 +99,56 @@ def lib_load(
97
99
  # Store pointer and DataFrame on study
98
100
  study._lib = lib_obj
99
101
 
102
+ # Add source_id column with filename (without path) if loading from CSV
103
+ if isinstance(lib_source, str):
104
+ import os
105
+ filename_only = os.path.basename(lib_source)
106
+ filtered_lf = filtered_lf.with_columns(pl.lit(filename_only).alias("source_id"))
107
+
108
+ # Ensure required columns exist and set correct values
109
+ required_columns = {
110
+ "quant_group": pl.Int64,
111
+ "iso": pl.Int64
112
+ }
113
+
114
+ for col_name, col_dtype in required_columns.items():
115
+ if col_name == "quant_group":
116
+ # Set quant_group using cmpd_uid (same for isotopomers of same compound)
117
+ if "cmpd_uid" in filtered_lf.columns:
118
+ filtered_lf = filtered_lf.with_columns(pl.col("cmpd_uid").cast(col_dtype).alias("quant_group"))
119
+ else:
120
+ # Fallback to lib_uid if cmpd_uid doesn't exist
121
+ filtered_lf = filtered_lf.with_columns(pl.col("lib_uid").cast(col_dtype).alias("quant_group"))
122
+ elif col_name == "iso":
123
+ if col_name not in filtered_lf.columns:
124
+ # Default to zero for iso
125
+ filtered_lf = filtered_lf.with_columns(pl.lit(0).cast(col_dtype).alias(col_name))
126
+
127
+ # Generate 13C isotopes if requested
128
+ original_count = len(filtered_lf)
129
+ if iso == '13C':
130
+ filtered_lf = _generate_13c_isotopes(filtered_lf)
131
+ # Update the log message to show the correct count after isotope generation
132
+ if isinstance(lib_source, str):
133
+ import os
134
+ filename_only = os.path.basename(lib_source)
135
+ print(f"Generated 13C isotopes: {len(filtered_lf)} total entries ({original_count} original + {len(filtered_lf) - original_count} isotopes) from {filename_only}")
136
+
137
+ # Reorder columns to place quant_group after rt and iso after formula
138
+ column_order = []
139
+ columns_list = list(filtered_lf.columns)
140
+
141
+ for col in columns_list:
142
+ if col not in column_order: # Only add if not already added
143
+ column_order.append(col)
144
+ if col == "rt" and "quant_group" in columns_list and "quant_group" not in column_order:
145
+ column_order.append("quant_group")
146
+ elif col == "formula" and "iso" in columns_list and "iso" not in column_order:
147
+ column_order.append("iso")
148
+
149
+ # Apply the column ordering
150
+ filtered_lf = filtered_lf.select(column_order)
151
+
100
152
  # Add to existing lib_df instead of replacing
101
153
  if (
102
154
  hasattr(study, "lib_df")
@@ -124,10 +176,10 @@ def lib_load(
124
176
  study.lib_df = pl.DataFrame()
125
177
 
126
178
  # Store this operation in history
127
- if hasattr(study, "store_history"):
128
- study.store_history(
179
+ if hasattr(study, "update_history"):
180
+ study.update_history(
129
181
  ["lib_load"],
130
- {"lib_source": str(lib_source), "polarity": polarity, "adducts": adducts},
182
+ {"lib_source": str(lib_source), "polarity": polarity, "adducts": adducts, "iso": iso},
131
183
  )
132
184
 
133
185
 
@@ -349,6 +401,7 @@ def _update_identification_results(study, results, logger):
349
401
  "rt_delta": match["rt_delta"],
350
402
  "matcher": match["matcher"],
351
403
  "score": match["score"],
404
+ "iso": 0, # Default to zero
352
405
  })
353
406
 
354
407
  # Convert to DataFrame and append to existing results
@@ -356,6 +409,13 @@ def _update_identification_results(study, results, logger):
356
409
 
357
410
  if not new_results_df.is_empty():
358
411
  if hasattr(study, "id_df") and study.id_df is not None and not study.id_df.is_empty():
412
+ # Check if existing id_df has the iso column
413
+ if "iso" not in study.id_df.columns:
414
+ # Add iso column to existing id_df with default value 0
415
+ study.id_df = study.id_df.with_columns(pl.lit(0).alias("iso"))
416
+ if logger:
417
+ logger.debug("Added 'iso' column to existing id_df for schema compatibility")
418
+
359
419
  study.id_df = pl.concat([study.id_df, new_results_df])
360
420
  else:
361
421
  study.id_df = new_results_df
@@ -385,7 +445,7 @@ def _store_identification_history(study, effective_mz_tol, effective_rt_tol, tar
385
445
  history_params["params"] = params.to_dict()
386
446
  if kwargs:
387
447
  history_params["kwargs"] = kwargs
388
- study.store_history(["identify"], history_params)
448
+ study.update_history(["identify"], history_params)
389
449
 
390
450
 
391
451
  def _validate_identify_inputs(study, logger=None):
@@ -1043,8 +1103,10 @@ def lib_reset(study):
1043
1103
  - study.id_df (identification results DataFrame)
1044
1104
  - study.lib_df (library DataFrame)
1045
1105
  - study._lib (library object reference)
1106
+ - Consensus features created by lib_to_consensus() (number_samples = -1 or 0)
1046
1107
  - 'identify' from study.history
1047
1108
  - 'lib_load' from study.history (if exists)
1109
+ - 'lib_to_consensus' from study.history (if exists)
1048
1110
  - Resets id_top_* columns in consensus_df to null
1049
1111
 
1050
1112
  Args:
@@ -1053,6 +1115,36 @@ def lib_reset(study):
1053
1115
  # Get logger from study if available
1054
1116
  logger = getattr(study, "logger", None)
1055
1117
 
1118
+ # Remove consensus features created by lib_to_consensus()
1119
+ # These are identified by number_samples = -1 or 0
1120
+ if hasattr(study, "consensus_df") and not study.consensus_df.is_empty():
1121
+ if logger:
1122
+ logger.debug("Checking for consensus features created by lib_to_consensus()")
1123
+
1124
+ try:
1125
+ # Filter for features with number_samples = -1 or 0
1126
+ # Since consensus_select doesn't support list of discrete values, use direct filtering
1127
+ lib_consensus_features = study.consensus_df.filter(
1128
+ (pl.col("number_samples") == -1) | (pl.col("number_samples") == 0)
1129
+ )
1130
+
1131
+ if lib_consensus_features is not None and not lib_consensus_features.is_empty():
1132
+ num_lib_features = len(lib_consensus_features)
1133
+ if logger:
1134
+ logger.info(f"Removing {num_lib_features} consensus features created by lib_to_consensus()")
1135
+
1136
+ # Use consensus_delete to remove these features and all dependent data
1137
+ study.consensus_delete(lib_consensus_features)
1138
+
1139
+ if logger:
1140
+ logger.debug("Successfully removed library-derived consensus features")
1141
+ else:
1142
+ if logger:
1143
+ logger.debug("No library-derived consensus features found to remove")
1144
+ except Exception as e:
1145
+ if logger:
1146
+ logger.warning(f"Error removing library-derived consensus features: {e}")
1147
+
1056
1148
  # Remove id_df
1057
1149
  if hasattr(study, "id_df"):
1058
1150
  if logger:
@@ -1099,6 +1191,11 @@ def lib_reset(study):
1099
1191
  if logger:
1100
1192
  logger.debug("Removing 'lib_load' from history")
1101
1193
  del study.history["lib_load"]
1194
+
1195
+ if "lib_to_consensus" in study.history:
1196
+ if logger:
1197
+ logger.debug("Removing 'lib_to_consensus' from history")
1198
+ del study.history["lib_to_consensus"]
1102
1199
 
1103
1200
  if logger:
1104
1201
  logger.info("Library and identification data reset completed")
@@ -1438,3 +1535,447 @@ def _format_adduct_name(components: list[dict]) -> str:
1438
1535
  )
1439
1536
 
1440
1537
  return f"[M{formula}]{charge_str}"
1538
+
1539
+
1540
def _generate_13c_isotopes(lib_df):
    """
    Generate 13C isotope variants for library entries.

    For each compound with n carbon atoms, creates n+1 entries:
    - iso=0: original compound (no 13C)
    - iso=1: one 13C isotope (+1.00335 Da)
    - iso=2: two 13C isotopes (+2.00670 Da)
    - ...
    - iso=n: n 13C isotopes (+n*1.00335 Da)

    All isotopomers share the same quant_group.

    Args:
        lib_df: Polars DataFrame with library entries; expected columns
            include "lib_uid", "formula", "m", "z", and optionally
            "quant_group" / "cmpd_uid".

    Returns:
        Polars DataFrame with additional 13C isotope entries appended after
        the originals (originals are re-stamped with iso=0).
    """
    # Nothing to expand for an empty library.
    if lib_df.is_empty():
        return lib_df

    # First, ensure all original entries have iso=0
    original_df = lib_df.with_columns(pl.lit(0).alias("iso"))

    isotope_entries = []
    # New rows get fresh lib_uids continuing after the current maximum.
    next_lib_uid = lib_df["lib_uid"].max() + 1 if len(lib_df) > 0 else 1

    # Mass difference for one 13C isotope
    c13_mass_shift = 1.00335  # Mass difference between 13C and 12C

    for row in original_df.iter_rows(named=True):
        formula = row.get("formula", "")
        if not formula:
            # No formula -> cannot count carbons; skip this entry.
            continue

        # Count carbon atoms in the formula
        carbon_count = _count_carbon_atoms(formula)
        if carbon_count == 0:
            continue

        # Get the original quant_group to keep it consistent across isotopes
        # All isotopomers of the same compound should have the same quant_group
        quant_group = row.get("quant_group", row.get("cmpd_uid", row.get("lib_uid", 1)))

        # Generate isotope variants (1 to n 13C atoms)
        for iso_num in range(1, carbon_count + 1):
            # Calculate mass shift for this number of 13C isotopes
            mass_shift = iso_num * c13_mass_shift

            # Create new entry
            isotope_entry = dict(row)  # Copy all fields
            isotope_entry["lib_uid"] = next_lib_uid
            isotope_entry["iso"] = iso_num
            isotope_entry["m"] = row["m"] + mass_shift
            # NOTE(review): recomputing mz as (m + shift) / |z| assumes "m"
            # is already the full ion mass (adduct included) — TODO confirm
            # against the lib schema; z == 0 falls back to the shifted mass.
            isotope_entry["mz"] = (row["m"] + mass_shift) / abs(row["z"]) if row["z"] != 0 else row["m"] + mass_shift
            isotope_entry["quant_group"] = quant_group  # Keep same quant_group

            isotope_entries.append(isotope_entry)
            next_lib_uid += 1

    # Combine original entries (now with iso=0) with isotope entries
    if isotope_entries:
        isotope_df = pl.DataFrame(isotope_entries)
        # Ensure schema compatibility by aligning data types
        try:
            return pl.concat([original_df, isotope_df])
        except Exception as e:  # noqa: F841 — exception detail currently unused
            # If concat fails due to schema mismatch, convert to compatible types
            # Get common schema
            original_schema = original_df.schema
            isotope_schema = isotope_df.schema  # currently unused; kept for parity

            # Cast isotope_df columns to match original_df schema where possible
            cast_exprs = []
            for col_name in isotope_df.columns:
                if col_name in original_schema:
                    target_dtype = original_schema[col_name]
                    # strict=False: unconvertible values become null rather than raising.
                    cast_exprs.append(pl.col(col_name).cast(target_dtype, strict=False))
                else:
                    cast_exprs.append(pl.col(col_name))

            isotope_df_cast = isotope_df.select(cast_exprs)
            return pl.concat([original_df, isotope_df_cast])
    else:
        # No carbon-bearing entries: return originals (re-stamped iso=0) unchanged.
        return original_df
1626
+
1627
+
1628
+ def _count_carbon_atoms(formula: str) -> int:
1629
+ """
1630
+ Count the number of carbon atoms in a molecular formula.
1631
+
1632
+ Args:
1633
+ formula: Molecular formula string like "C6H12O6"
1634
+
1635
+ Returns:
1636
+ Number of carbon atoms
1637
+ """
1638
+ import re
1639
+
1640
+ if not formula or not isinstance(formula, str):
1641
+ return 0
1642
+
1643
+ # Look for carbon followed by optional number
1644
+ # C followed by digits, or just C (which means 1)
1645
+ carbon_matches = re.findall(r'C(\d*)', formula)
1646
+
1647
+ total_carbons = 0
1648
+ for match in carbon_matches:
1649
+ if match == '':
1650
+ # Just 'C' without number means 1 carbon
1651
+ total_carbons += 1
1652
+ else:
1653
+ # 'C' followed by number
1654
+ total_carbons += int(match)
1655
+
1656
+ return total_carbons
1657
+
1658
+
1659
def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_tol: float = 2.0):
    """Create consensus features from library entries instead of features_df.

    This method takes all rows from lib_df and creates corresponding entries in
    consensus_df with the same columns as merge(). Instead of relying on
    features_df, it populates consensus features directly from library data.

    Before creating new features, it checks for pre-existing consensus features:
    - If rt in lib_df is null: picks consensus feature with matching mz and largest inty_mean
    - If rt is not null: picks consensus feature with matching mz and rt within tolerance
    - If a match is found, skips to the next library entry

    Args:
        study: Study instance with lib_df populated
        chrom_fhwm: Chromatographic full width at half maximum in seconds
            to infer rt_start_mean and rt_end_mean (default: 5.0)
        mz_tol: m/z tolerance for matching existing consensus features (default: 0.01)
        rt_tol: RT tolerance for matching existing consensus features (default: 2.0)

    Side effects:
        Adds rows to study.consensus_df and study.consensus_mapping_df
        Calls study.find_ms2() at the end
    """
    # Hoisted out of the per-row loop (was re-imported on every iteration).
    import uuid

    # Get logger from study if available
    logger = getattr(study, "logger", None)

    # Validate inputs
    if getattr(study, "lib_df", None) is None or study.lib_df.is_empty():
        if logger:
            logger.error("Library (study.lib_df) is empty; call lib_load() first")
        raise ValueError("Library (study.lib_df) is empty; call lib_load() first")

    if logger:
        logger.info(f"Creating consensus features from {len(study.lib_df)} library entries")

    # Initialize consensus DataFrames if they don't exist
    if not hasattr(study, "consensus_df") or study.consensus_df is None:
        study.consensus_df = pl.DataFrame()
    if not hasattr(study, "consensus_mapping_df") or study.consensus_mapping_df is None:
        study.consensus_mapping_df = pl.DataFrame()

    # Get cached adducts for consistent adduct handling
    cached_adducts_df = None
    cached_valid_adducts = None
    try:
        cached_adducts_df = _get_adducts(study)
        if not cached_adducts_df.is_empty():
            cached_valid_adducts = set(cached_adducts_df["name"].to_list())
        else:
            cached_valid_adducts = set()
    except Exception as e:
        if logger:
            logger.warning(f"Could not retrieve study adducts: {e}")
        cached_valid_adducts = set()

    # Always allow '?' adducts
    cached_valid_adducts.add("?")

    # Get starting consensus_uid counter
    if not study.consensus_df.is_empty():
        max_existing_uid = study.consensus_df["consensus_uid"].max()
        consensus_uid_counter = int(max_existing_uid) + 1 if max_existing_uid is not None else 0
    else:
        consensus_uid_counter = 0

    # Track [M+H] iso=0 and [M-H] iso=0 entries for adduct grouping
    base_adduct_groups = {}  # key: (mz, adduct_base), value: adduct_group

    # Process each library entry
    consensus_metadata = []
    consensus_mapping_list = []
    matched_count = 0
    skipped_count = 0  # entries with no RT (logged for the first few only)

    for lib_row in study.lib_df.iter_rows(named=True):
        # Extract basic library data
        lib_uid = lib_row.get("lib_uid")
        mz = lib_row.get("mz")
        rt = lib_row.get("rt")
        iso = lib_row.get("iso", 0)
        adduct = lib_row.get("adduct")
        z = lib_row.get("z", 1)  # charge

        # Skip entries without essential data
        if mz is None:
            if logger:
                logger.warning(f"Skipping library entry {lib_uid} - no m/z value")
            continue

        # Check for pre-existing consensus features
        existing_match = None
        if not study.consensus_df.is_empty():
            # Filter by m/z tolerance first
            mz_matches = study.consensus_df.filter(
                (pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol)
            )

            if not mz_matches.is_empty():
                if rt is None:
                    # If rt is null, pick the consensus feature with largest inty_mean
                    existing_match = mz_matches.sort("inty_mean", descending=True).head(1)
                else:
                    # If rt is not null, filter by RT tolerance and pick largest inty_mean.
                    # FIX: use the rt_tol parameter (previously chrom_fhwm was
                    # substituted here, leaving rt_tol documented but unused).
                    rt_matches = mz_matches.filter(
                        (pl.col("rt") >= rt - rt_tol) & (pl.col("rt") <= rt + rt_tol)
                    )
                    if not rt_matches.is_empty():
                        existing_match = rt_matches.sort("inty_mean", descending=True).head(1)

        if existing_match is not None and len(existing_match) > 0:
            # Found a matching consensus feature, skip this library entry
            matched_count += 1
            if logger and matched_count <= 5:  # Log first few matches
                match_uid = existing_match["consensus_uid"][0]
                match_mz = existing_match["mz"][0]
                match_rt = existing_match["rt"][0]
                logger.debug(f"Library entry {lib_uid} (mz={mz:.4f}, rt={rt}) matched existing consensus {match_uid} (mz={match_mz:.4f}, rt={match_rt})")
            continue

        # No match found, create new consensus feature
        # Handle missing RT - use 0 as placeholder
        if rt is None:
            rt = 0.0
            # FIX: counter is now actually incremented so only the first few
            # missing-RT entries are logged (it was never incremented before).
            skipped_count += 1
            if logger and skipped_count < 5:  # Log first few
                logger.debug(f"Library entry {lib_uid} has no RT, using 0.0")

        # Calculate RT range based on chrom_fhwm
        half_width = chrom_fhwm / 2.0
        rt_start = rt - half_width
        rt_end = rt + half_width

        # Get adduct information
        adduct_top = adduct if adduct else "?"
        adduct_charge_top = None
        adduct_mass_shift_top = None
        adduct_mass_neutral_top = None

        # Parse adduct to get charge and mass shift
        if adduct_top and cached_adducts_df is not None and not cached_adducts_df.is_empty():
            # Look for exact match in study adducts
            matching_adduct = cached_adducts_df.filter(pl.col("name") == adduct_top)
            if not matching_adduct.is_empty():
                adduct_row = matching_adduct.row(0, named=True)
                adduct_charge_top = adduct_row["charge"]
                adduct_mass_shift_top = adduct_row["mass_shift"]

        # Fallback to default values if not found
        if adduct_charge_top is None:
            adduct_charge_top = int(z) if z else 1
            # Default based on study polarity
            study_polarity = getattr(study, "polarity", "positive")
            if study_polarity in ["negative", "neg"]:
                if adduct_charge_top > 0:
                    adduct_charge_top = -adduct_charge_top
                adduct_mass_shift_top = -1.007825
                if adduct_top == "?":
                    adduct_top = "[M-?]1-"
            else:
                if adduct_charge_top < 0:
                    adduct_charge_top = -adduct_charge_top
                adduct_mass_shift_top = 1.007825
                if adduct_top == "?":
                    adduct_top = "[M+?]1+"

        # Calculate neutral mass
        if adduct_charge_top and adduct_mass_shift_top is not None:
            adduct_mass_neutral_top = mz * abs(adduct_charge_top) - adduct_mass_shift_top

        # Determine adduct group for isotopologues and related adducts
        adduct_group = consensus_uid_counter  # Default: each entry gets its own group
        adduct_of = 0  # Default: this is the base adduct

        # Track base adducts ([M+H] iso=0 or [M-H] iso=0) for grouping
        base_adduct_key = None
        if iso == 0 and adduct_top in ["[M+H]+", "[M+H]1+", "[M-H]-", "[M-H]1-"]:
            # This is a base adduct with iso=0
            base_adduct_key = (round(mz, 4), adduct_top)
            base_adduct_groups[base_adduct_key] = consensus_uid_counter
        elif iso > 0:
            # This is an isotopologue, try to find the base adduct
            # Calculate the base m/z (subtract isotope mass shifts)
            c13_mass_shift = 1.00335
            base_mz = mz - (iso * c13_mass_shift / abs(adduct_charge_top))

            # Look for matching base adduct
            for (stored_mz, stored_adduct), stored_group in base_adduct_groups.items():
                if abs(stored_mz - base_mz) < mz_tol and stored_adduct == adduct_top:
                    adduct_group = stored_group
                    adduct_of = stored_group
                    break

        # Create adduct values list with proper structure (format: structured data with fields: adduct, count, percentage, mass)
        adduct_values = [{"adduct": adduct_top, "count": 1, "percentage": 100.0, "mass": 0.0}]

        # Generate unique consensus_id string
        consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]

        # Build consensus metadata with requested modifications for new entries
        metadata = {
            "consensus_uid": consensus_uid_counter,
            "consensus_id": consensus_id_str,
            "quality": 1.0,
            "number_samples": 0.0,  # Set to 0.0 for library entries
            "rt": float(rt),
            "mz": float(mz),
            "rt_min": float(rt),  # Set to rt as requested
            "rt_max": float(rt),  # Set to rt as requested
            "rt_mean": float(rt),  # Set to rt as requested
            "rt_start_mean": float(rt_start),
            "rt_end_mean": float(rt_end),
            "rt_delta_mean": 0.0,  # Set to 0.0 as requested
            "mz_min": float(mz),  # Set to mz as requested
            "mz_max": float(mz),  # Set to mz as requested
            "mz_mean": float(mz),  # Set to mz as requested
            "mz_start_mean": float(mz),  # Set to mz as requested
            "mz_end_mean": float(mz),  # Set to mz as requested
            "inty_mean": -1.0,  # Set to -1.0 as requested
            "bl": -1.0,
            "chrom_coherence_mean": -1.0,  # Set to -1.0 as requested
            "chrom_prominence_mean": -1.0,  # Set to -1.0 as requested
            "chrom_prominence_scaled_mean": -1.0,  # Set to -1.0 as requested
            "chrom_height_scaled_mean": -1.0,  # Set to -1.0 as requested
            "iso": iso,  # Set to iso from lib_df as requested
            "iso_mean": float(iso),  # Set to iso from lib_df as requested
            "charge_mean": float(abs(z)) if z else 1.0,  # Set to z as requested
            "number_ms2": 0,  # Will be updated by find_ms2
            "adducts": adduct_values,
            "adduct_charge_top": adduct_charge_top,
            "adduct_group": adduct_group,  # Use calculated adduct group
            "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6) if adduct_mass_neutral_top is not None else None,
            "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
            "adduct_of": adduct_of,  # Use calculated adduct_of
            "adduct_top": adduct_top,
            "id_top_name": None,  # Set to null as requested
            "id_top_class": None,  # Set to null as requested
            "id_top_adduct": None,  # Set to null as requested
            "id_top_score": None,  # Set to null as requested
        }

        consensus_metadata.append(metadata)

        # Create mapping entry (maps to library entry as "virtual" feature)
        # Use lib_uid as the feature_uid and a virtual sample_uid of 0
        # Match existing consensus_mapping_df column order: consensus_uid, feature_uid, sample_uid
        consensus_mapping_list.append({
            "consensus_uid": consensus_uid_counter,
            "feature_uid": lib_uid,  # Use lib_uid as feature reference
            "sample_uid": 0,  # Virtual sample for library entries
        })

        consensus_uid_counter += 1

    # Log matching statistics
    if logger:
        total_processed = matched_count + len(consensus_metadata)
        logger.info(f"Processed {total_processed} library entries: {matched_count} matched existing consensus features, {len(consensus_metadata)} created new features")

    # Convert to DataFrames with proper schema alignment
    if consensus_metadata:
        new_consensus_df = pl.DataFrame(consensus_metadata, strict=False)

        # Ensure schema compatibility with existing consensus_df
        if not study.consensus_df.is_empty():
            # Cast columns to match existing schema
            existing_schema = study.consensus_df.schema
            cast_exprs = []
            for col_name in new_consensus_df.columns:
                if col_name in existing_schema:
                    target_dtype = existing_schema[col_name]
                    if target_dtype == pl.Null:
                        # For Null columns, use lit(None) to maintain Null type
                        cast_exprs.append(pl.lit(None).alias(col_name))
                    else:
                        cast_exprs.append(pl.col(col_name).cast(target_dtype, strict=False))
                else:
                    cast_exprs.append(pl.col(col_name))

            new_consensus_df = new_consensus_df.select(cast_exprs)

        new_consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)

        # Append to existing DataFrames
        if not study.consensus_df.is_empty():
            study.consensus_df = pl.concat([study.consensus_df, new_consensus_df])
        else:
            study.consensus_df = new_consensus_df

        if not study.consensus_mapping_df.is_empty():
            study.consensus_mapping_df = pl.concat([study.consensus_mapping_df, new_consensus_mapping_df])
        else:
            study.consensus_mapping_df = new_consensus_mapping_df

        if logger:
            logger.info(f"Added {len(consensus_metadata)} consensus features from library")
    else:
        if logger:
            logger.warning("No valid consensus features created from library")
        return

    # Store operation in history (now also records the matching tolerances)
    if hasattr(study, "update_history"):
        study.update_history(
            ["lib_to_consensus"],
            {"chrom_fhwm": chrom_fhwm, "mz_tol": mz_tol, "rt_tol": rt_tol, "lib_entries": len(study.lib_df)},
        )

    # Perform find_ms2 at the end
    try:
        if hasattr(study, "find_ms2"):
            if logger:
                logger.info("Running find_ms2 to link MS2 spectra to library-derived consensus features")
            study.find_ms2()
        else:
            if logger:
                logger.warning("find_ms2 method not available on study object")
    except Exception as e:
        if logger:
            logger.warning(f"find_ms2 failed: {e}")

    if logger:
        logger.info(f"lib_to_consensus completed: {len(consensus_metadata)} features added")