masster 0.4.22__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/data/libs/aa.csv +22 -0
- masster/lib/lib.py +6 -0
- masster/sample/adducts.py +1 -1
- masster/sample/load.py +10 -9
- masster/sample/plot.py +1 -1
- masster/sample/processing.py +4 -4
- masster/sample/sample.py +29 -32
- masster/study/analysis.py +1762 -0
- masster/study/defaults/fill_def.py +1 -1
- masster/study/export.py +5 -3
- masster/study/h5.py +3 -0
- masster/study/helpers.py +153 -80
- masster/study/id.py +545 -4
- masster/study/load.py +33 -59
- masster/study/merge.py +413 -315
- masster/study/parameters.py +3 -3
- masster/study/plot.py +398 -43
- masster/study/processing.py +6 -14
- masster/study/save.py +8 -4
- masster/study/study.py +179 -139
- masster/study/study5_schema.json +9 -0
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/METADATA +54 -14
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/RECORD +27 -25
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/WHEEL +0 -0
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/entry_points.txt +0 -0
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/licenses/LICENSE +0 -0
masster/study/id.py
CHANGED
|
@@ -15,6 +15,7 @@ def lib_load(
|
|
|
15
15
|
lib_source,
|
|
16
16
|
polarity: str | None = None,
|
|
17
17
|
adducts: list | None = None,
|
|
18
|
+
iso: str | None = None,
|
|
18
19
|
):
|
|
19
20
|
"""Load a compound library into the study.
|
|
20
21
|
|
|
@@ -23,6 +24,7 @@ def lib_load(
|
|
|
23
24
|
lib_source: either a CSV file path (str) or a Lib instance
|
|
24
25
|
polarity: ionization polarity ("positive" or "negative") - used when lib_source is a CSV path
|
|
25
26
|
adducts: specific adducts to generate - used when lib_source is a CSV path
|
|
27
|
+
iso: isotope generation mode ("13C" to generate 13C isotopes, None for no isotopes)
|
|
26
28
|
|
|
27
29
|
Side effects:
|
|
28
30
|
sets study.lib_df to a Polars DataFrame and stores the lib object on
|
|
@@ -97,6 +99,56 @@ def lib_load(
|
|
|
97
99
|
# Store pointer and DataFrame on study
|
|
98
100
|
study._lib = lib_obj
|
|
99
101
|
|
|
102
|
+
# Add source_id column with filename (without path) if loading from CSV
|
|
103
|
+
if isinstance(lib_source, str):
|
|
104
|
+
import os
|
|
105
|
+
filename_only = os.path.basename(lib_source)
|
|
106
|
+
filtered_lf = filtered_lf.with_columns(pl.lit(filename_only).alias("source_id"))
|
|
107
|
+
|
|
108
|
+
# Ensure required columns exist and set correct values
|
|
109
|
+
required_columns = {
|
|
110
|
+
"quant_group": pl.Int64,
|
|
111
|
+
"iso": pl.Int64
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
for col_name, col_dtype in required_columns.items():
|
|
115
|
+
if col_name == "quant_group":
|
|
116
|
+
# Set quant_group using cmpd_uid (same for isotopomers of same compound)
|
|
117
|
+
if "cmpd_uid" in filtered_lf.columns:
|
|
118
|
+
filtered_lf = filtered_lf.with_columns(pl.col("cmpd_uid").cast(col_dtype).alias("quant_group"))
|
|
119
|
+
else:
|
|
120
|
+
# Fallback to lib_uid if cmpd_uid doesn't exist
|
|
121
|
+
filtered_lf = filtered_lf.with_columns(pl.col("lib_uid").cast(col_dtype).alias("quant_group"))
|
|
122
|
+
elif col_name == "iso":
|
|
123
|
+
if col_name not in filtered_lf.columns:
|
|
124
|
+
# Default to zero for iso
|
|
125
|
+
filtered_lf = filtered_lf.with_columns(pl.lit(0).cast(col_dtype).alias(col_name))
|
|
126
|
+
|
|
127
|
+
# Generate 13C isotopes if requested
|
|
128
|
+
original_count = len(filtered_lf)
|
|
129
|
+
if iso == '13C':
|
|
130
|
+
filtered_lf = _generate_13c_isotopes(filtered_lf)
|
|
131
|
+
# Update the log message to show the correct count after isotope generation
|
|
132
|
+
if isinstance(lib_source, str):
|
|
133
|
+
import os
|
|
134
|
+
filename_only = os.path.basename(lib_source)
|
|
135
|
+
print(f"Generated 13C isotopes: {len(filtered_lf)} total entries ({original_count} original + {len(filtered_lf) - original_count} isotopes) from {filename_only}")
|
|
136
|
+
|
|
137
|
+
# Reorder columns to place quant_group after rt and iso after formula
|
|
138
|
+
column_order = []
|
|
139
|
+
columns_list = list(filtered_lf.columns)
|
|
140
|
+
|
|
141
|
+
for col in columns_list:
|
|
142
|
+
if col not in column_order: # Only add if not already added
|
|
143
|
+
column_order.append(col)
|
|
144
|
+
if col == "rt" and "quant_group" in columns_list and "quant_group" not in column_order:
|
|
145
|
+
column_order.append("quant_group")
|
|
146
|
+
elif col == "formula" and "iso" in columns_list and "iso" not in column_order:
|
|
147
|
+
column_order.append("iso")
|
|
148
|
+
|
|
149
|
+
# Apply the column ordering
|
|
150
|
+
filtered_lf = filtered_lf.select(column_order)
|
|
151
|
+
|
|
100
152
|
# Add to existing lib_df instead of replacing
|
|
101
153
|
if (
|
|
102
154
|
hasattr(study, "lib_df")
|
|
@@ -124,10 +176,10 @@ def lib_load(
|
|
|
124
176
|
study.lib_df = pl.DataFrame()
|
|
125
177
|
|
|
126
178
|
# Store this operation in history
|
|
127
|
-
if hasattr(study, "
|
|
128
|
-
study.
|
|
179
|
+
if hasattr(study, "update_history"):
|
|
180
|
+
study.update_history(
|
|
129
181
|
["lib_load"],
|
|
130
|
-
{"lib_source": str(lib_source), "polarity": polarity, "adducts": adducts},
|
|
182
|
+
{"lib_source": str(lib_source), "polarity": polarity, "adducts": adducts, "iso": iso},
|
|
131
183
|
)
|
|
132
184
|
|
|
133
185
|
|
|
@@ -349,6 +401,7 @@ def _update_identification_results(study, results, logger):
|
|
|
349
401
|
"rt_delta": match["rt_delta"],
|
|
350
402
|
"matcher": match["matcher"],
|
|
351
403
|
"score": match["score"],
|
|
404
|
+
"iso": 0, # Default to zero
|
|
352
405
|
})
|
|
353
406
|
|
|
354
407
|
# Convert to DataFrame and append to existing results
|
|
@@ -356,6 +409,13 @@ def _update_identification_results(study, results, logger):
|
|
|
356
409
|
|
|
357
410
|
if not new_results_df.is_empty():
|
|
358
411
|
if hasattr(study, "id_df") and study.id_df is not None and not study.id_df.is_empty():
|
|
412
|
+
# Check if existing id_df has the iso column
|
|
413
|
+
if "iso" not in study.id_df.columns:
|
|
414
|
+
# Add iso column to existing id_df with default value 0
|
|
415
|
+
study.id_df = study.id_df.with_columns(pl.lit(0).alias("iso"))
|
|
416
|
+
if logger:
|
|
417
|
+
logger.debug("Added 'iso' column to existing id_df for schema compatibility")
|
|
418
|
+
|
|
359
419
|
study.id_df = pl.concat([study.id_df, new_results_df])
|
|
360
420
|
else:
|
|
361
421
|
study.id_df = new_results_df
|
|
@@ -385,7 +445,7 @@ def _store_identification_history(study, effective_mz_tol, effective_rt_tol, tar
|
|
|
385
445
|
history_params["params"] = params.to_dict()
|
|
386
446
|
if kwargs:
|
|
387
447
|
history_params["kwargs"] = kwargs
|
|
388
|
-
study.
|
|
448
|
+
study.update_history(["identify"], history_params)
|
|
389
449
|
|
|
390
450
|
|
|
391
451
|
def _validate_identify_inputs(study, logger=None):
|
|
@@ -1043,8 +1103,10 @@ def lib_reset(study):
|
|
|
1043
1103
|
- study.id_df (identification results DataFrame)
|
|
1044
1104
|
- study.lib_df (library DataFrame)
|
|
1045
1105
|
- study._lib (library object reference)
|
|
1106
|
+
- Consensus features created by lib_to_consensus() (number_samples = -1 or 0)
|
|
1046
1107
|
- 'identify' from study.history
|
|
1047
1108
|
- 'lib_load' from study.history (if exists)
|
|
1109
|
+
- 'lib_to_consensus' from study.history (if exists)
|
|
1048
1110
|
- Resets id_top_* columns in consensus_df to null
|
|
1049
1111
|
|
|
1050
1112
|
Args:
|
|
@@ -1053,6 +1115,36 @@ def lib_reset(study):
|
|
|
1053
1115
|
# Get logger from study if available
|
|
1054
1116
|
logger = getattr(study, "logger", None)
|
|
1055
1117
|
|
|
1118
|
+
# Remove consensus features created by lib_to_consensus()
|
|
1119
|
+
# These are identified by number_samples = -1 or 0
|
|
1120
|
+
if hasattr(study, "consensus_df") and not study.consensus_df.is_empty():
|
|
1121
|
+
if logger:
|
|
1122
|
+
logger.debug("Checking for consensus features created by lib_to_consensus()")
|
|
1123
|
+
|
|
1124
|
+
try:
|
|
1125
|
+
# Filter for features with number_samples = -1 or 0
|
|
1126
|
+
# Since consensus_select doesn't support list of discrete values, use direct filtering
|
|
1127
|
+
lib_consensus_features = study.consensus_df.filter(
|
|
1128
|
+
(pl.col("number_samples") == -1) | (pl.col("number_samples") == 0)
|
|
1129
|
+
)
|
|
1130
|
+
|
|
1131
|
+
if lib_consensus_features is not None and not lib_consensus_features.is_empty():
|
|
1132
|
+
num_lib_features = len(lib_consensus_features)
|
|
1133
|
+
if logger:
|
|
1134
|
+
logger.info(f"Removing {num_lib_features} consensus features created by lib_to_consensus()")
|
|
1135
|
+
|
|
1136
|
+
# Use consensus_delete to remove these features and all dependent data
|
|
1137
|
+
study.consensus_delete(lib_consensus_features)
|
|
1138
|
+
|
|
1139
|
+
if logger:
|
|
1140
|
+
logger.debug("Successfully removed library-derived consensus features")
|
|
1141
|
+
else:
|
|
1142
|
+
if logger:
|
|
1143
|
+
logger.debug("No library-derived consensus features found to remove")
|
|
1144
|
+
except Exception as e:
|
|
1145
|
+
if logger:
|
|
1146
|
+
logger.warning(f"Error removing library-derived consensus features: {e}")
|
|
1147
|
+
|
|
1056
1148
|
# Remove id_df
|
|
1057
1149
|
if hasattr(study, "id_df"):
|
|
1058
1150
|
if logger:
|
|
@@ -1099,6 +1191,11 @@ def lib_reset(study):
|
|
|
1099
1191
|
if logger:
|
|
1100
1192
|
logger.debug("Removing 'lib_load' from history")
|
|
1101
1193
|
del study.history["lib_load"]
|
|
1194
|
+
|
|
1195
|
+
if "lib_to_consensus" in study.history:
|
|
1196
|
+
if logger:
|
|
1197
|
+
logger.debug("Removing 'lib_to_consensus' from history")
|
|
1198
|
+
del study.history["lib_to_consensus"]
|
|
1102
1199
|
|
|
1103
1200
|
if logger:
|
|
1104
1201
|
logger.info("Library and identification data reset completed")
|
|
@@ -1438,3 +1535,447 @@ def _format_adduct_name(components: list[dict]) -> str:
|
|
|
1438
1535
|
)
|
|
1439
1536
|
|
|
1440
1537
|
return f"[M{formula}]{charge_str}"
|
|
1538
|
+
|
|
1539
|
+
|
|
1540
|
+
def _generate_13c_isotopes(lib_df):
    """Expand a library table with 13C isotopologue rows.

    Every compound whose formula contains n carbons contributes n extra
    rows (iso = 1..n), each shifted by iso * 1.00335 Da relative to the
    original entry:
    - iso=0: original compound (no 13C)
    - iso=1: one 13C isotope (+1.00335 Da)
    - ...
    - iso=n: n 13C isotopes (+n*1.00335 Da)

    All isotopomers of one compound share the same quant_group.

    Args:
        lib_df: Polars DataFrame with library entries

    Returns:
        Polars DataFrame containing the originals (iso=0) plus the
        generated isotope rows
    """
    if lib_df.is_empty():
        return lib_df

    # Mass gap between a 13C and a 12C atom.
    delta_13c = 1.00335

    # Relabel the originals as the monoisotopic (iso = 0) species.
    base_df = lib_df.with_columns(pl.lit(0).alias("iso"))

    uid_counter = lib_df["lib_uid"].max() + 1 if len(lib_df) > 0 else 1
    variants = []

    for entry in base_df.iter_rows(named=True):
        formula = entry.get("formula", "")
        if not formula:
            continue

        n_carbons = _count_carbon_atoms(formula)
        if n_carbons == 0:
            continue

        # Keep the quant_group constant across all isotopomers of this
        # compound; fall back to cmpd_uid, then lib_uid, if absent.
        group = entry.get("quant_group", entry.get("cmpd_uid", entry.get("lib_uid", 1)))

        for n_labels in range(1, n_carbons + 1):
            shifted_mass = entry["m"] + n_labels * delta_13c

            variant = dict(entry)  # copy every field of the base row
            variant["lib_uid"] = uid_counter
            variant["iso"] = n_labels
            variant["m"] = shifted_mass
            variant["mz"] = shifted_mass / abs(entry["z"]) if entry["z"] != 0 else shifted_mass
            variant["quant_group"] = group

            variants.append(variant)
            uid_counter += 1

    if not variants:
        return base_df

    variant_df = pl.DataFrame(variants)
    try:
        return pl.concat([base_df, variant_df])
    except Exception:
        # Schema mismatch: coerce variant columns to the base schema where
        # a matching column exists, then retry the concat.
        base_schema = base_df.schema
        coerced = variant_df.select([
            pl.col(name).cast(base_schema[name], strict=False)
            if name in base_schema
            else pl.col(name)
            for name in variant_df.columns
        ])
        return pl.concat([base_df, coerced])
|
|
1626
|
+
|
|
1627
|
+
|
|
1628
|
+
def _count_carbon_atoms(formula: str) -> int:
|
|
1629
|
+
"""
|
|
1630
|
+
Count the number of carbon atoms in a molecular formula.
|
|
1631
|
+
|
|
1632
|
+
Args:
|
|
1633
|
+
formula: Molecular formula string like "C6H12O6"
|
|
1634
|
+
|
|
1635
|
+
Returns:
|
|
1636
|
+
Number of carbon atoms
|
|
1637
|
+
"""
|
|
1638
|
+
import re
|
|
1639
|
+
|
|
1640
|
+
if not formula or not isinstance(formula, str):
|
|
1641
|
+
return 0
|
|
1642
|
+
|
|
1643
|
+
# Look for carbon followed by optional number
|
|
1644
|
+
# C followed by digits, or just C (which means 1)
|
|
1645
|
+
carbon_matches = re.findall(r'C(\d*)', formula)
|
|
1646
|
+
|
|
1647
|
+
total_carbons = 0
|
|
1648
|
+
for match in carbon_matches:
|
|
1649
|
+
if match == '':
|
|
1650
|
+
# Just 'C' without number means 1 carbon
|
|
1651
|
+
total_carbons += 1
|
|
1652
|
+
else:
|
|
1653
|
+
# 'C' followed by number
|
|
1654
|
+
total_carbons += int(match)
|
|
1655
|
+
|
|
1656
|
+
return total_carbons
|
|
1657
|
+
|
|
1658
|
+
|
|
1659
|
+
def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_tol: float = 2.0):
    """Create consensus features from library entries instead of features_df.

    This method takes all rows from lib_df and creates corresponding entries in
    consensus_df with the same columns as merge(). Instead of relying on
    features_df, it populates consensus features directly from library data.

    Before creating new features, it checks for pre-existing consensus features:
    - If rt in lib_df is null: picks consensus feature with matching mz and largest inty_mean
    - If rt is not null: picks consensus feature with matching mz and rt within tolerance
    - If a match is found, skips to the next library entry

    Args:
        study: Study instance with lib_df populated
        chrom_fhwm: Chromatographic full width at half maximum in seconds
            to infer rt_start_mean and rt_end_mean (default: 5.0)
        mz_tol: m/z tolerance for matching existing consensus features (default: 0.01)
        rt_tol: RT tolerance for matching existing consensus features (default: 2.0)
            NOTE(review): this parameter is currently never read by the body;
            chrom_fhwm is used as the RT matching tolerance instead — confirm intent.

    Side effects:
        Adds rows to study.consensus_df and study.consensus_mapping_df
        Calls study.find_ms2() at the end
    """
    # Get logger from study if available
    logger = getattr(study, "logger", None)

    # Validate inputs
    if getattr(study, "lib_df", None) is None or study.lib_df.is_empty():
        if logger:
            logger.error("Library (study.lib_df) is empty; call lib_load() first")
        raise ValueError("Library (study.lib_df) is empty; call lib_load() first")

    if logger:
        logger.info(f"Creating consensus features from {len(study.lib_df)} library entries")

    # Initialize consensus DataFrames if they don't exist
    if not hasattr(study, "consensus_df") or study.consensus_df is None:
        study.consensus_df = pl.DataFrame()
    if not hasattr(study, "consensus_mapping_df") or study.consensus_mapping_df is None:
        study.consensus_mapping_df = pl.DataFrame()

    # Get cached adducts for consistent adduct handling
    cached_adducts_df = None
    cached_valid_adducts = None
    try:
        cached_adducts_df = _get_adducts(study)
        if not cached_adducts_df.is_empty():
            cached_valid_adducts = set(cached_adducts_df["name"].to_list())
        else:
            cached_valid_adducts = set()
    except Exception as e:
        if logger:
            logger.warning(f"Could not retrieve study adducts: {e}")
        cached_valid_adducts = set()

    # Always allow '?' adducts
    # NOTE(review): cached_valid_adducts is populated but never consulted
    # later in this function — possibly intended for future validation.
    cached_valid_adducts.add("?")

    # Get starting consensus_uid counter (continue after the current maximum)
    if not study.consensus_df.is_empty():
        max_existing_uid = study.consensus_df["consensus_uid"].max()
        consensus_uid_counter = int(max_existing_uid) + 1 if max_existing_uid is not None else 0
    else:
        consensus_uid_counter = 0

    # Track [M+H] iso=0 and [M-H] iso=0 entries for adduct grouping
    base_adduct_groups = {}  # key: (mz, adduct_base), value: adduct_group

    # Process each library entry
    consensus_metadata = []
    consensus_mapping_list = []
    matched_count = 0
    # NOTE(review): skipped_count is compared against below but never
    # incremented, so the "first few" RT-placeholder debug log always fires.
    skipped_count = 0

    for lib_row in study.lib_df.iter_rows(named=True):
        # Extract basic library data
        lib_uid = lib_row.get("lib_uid")
        mz = lib_row.get("mz")
        rt = lib_row.get("rt")
        iso = lib_row.get("iso", 0)
        adduct = lib_row.get("adduct")
        z = lib_row.get("z", 1)  # charge

        # Skip entries without essential data
        if mz is None:
            if logger:
                logger.warning(f"Skipping library entry {lib_uid} - no m/z value")
            continue

        # Check for pre-existing consensus features
        existing_match = None
        if not study.consensus_df.is_empty():
            # Filter by m/z tolerance first
            mz_matches = study.consensus_df.filter(
                (pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol)
            )

            if not mz_matches.is_empty():
                if rt is None:
                    # If rt is null, pick the consensus feature with largest inty_mean
                    existing_match = mz_matches.sort("inty_mean", descending=True).head(1)
                else:
                    # If rt is not null, filter by RT tolerance and pick largest inty_mean
                    rt_tolerance = chrom_fhwm  # Use chrom_fhwm as RT tolerance range
                    rt_matches = mz_matches.filter(
                        (pl.col("rt") >= rt - rt_tolerance) & (pl.col("rt") <= rt + rt_tolerance)
                    )
                    if not rt_matches.is_empty():
                        existing_match = rt_matches.sort("inty_mean", descending=True).head(1)

        if existing_match is not None and len(existing_match) > 0:
            # Found a matching consensus feature, skip this library entry
            matched_count += 1
            if logger and matched_count <= 5:  # Log first few matches
                match_uid = existing_match["consensus_uid"][0]
                match_mz = existing_match["mz"][0]
                match_rt = existing_match["rt"][0]
                logger.debug(f"Library entry {lib_uid} (mz={mz:.4f}, rt={rt}) matched existing consensus {match_uid} (mz={match_mz:.4f}, rt={match_rt})")
            continue

        # No match found, create new consensus feature
        # Handle missing RT - use 0 as placeholder
        if rt is None:
            rt = 0.0
            if logger and skipped_count < 5:  # Log first few
                logger.debug(f"Library entry {lib_uid} has no RT, using 0.0")

        # Calculate RT range based on chrom_fhwm
        half_width = chrom_fhwm / 2.0
        rt_start = rt - half_width
        rt_end = rt + half_width

        # Get adduct information
        adduct_top = adduct if adduct else "?"
        adduct_charge_top = None
        adduct_mass_shift_top = None
        adduct_mass_neutral_top = None

        # Parse adduct to get charge and mass shift
        if adduct_top and cached_adducts_df is not None and not cached_adducts_df.is_empty():
            # Look for exact match in study adducts
            matching_adduct = cached_adducts_df.filter(pl.col("name") == adduct_top)
            if not matching_adduct.is_empty():
                adduct_row = matching_adduct.row(0, named=True)
                adduct_charge_top = adduct_row["charge"]
                adduct_mass_shift_top = adduct_row["mass_shift"]

        # Fallback to default values if not found: derive charge sign and a
        # proton mass shift (+/-1.007825 Da) from the study polarity.
        if adduct_charge_top is None:
            adduct_charge_top = int(z) if z else 1
            # Default based on study polarity
            study_polarity = getattr(study, "polarity", "positive")
            if study_polarity in ["negative", "neg"]:
                if adduct_charge_top > 0:
                    adduct_charge_top = -adduct_charge_top
                adduct_mass_shift_top = -1.007825
                if adduct_top == "?":
                    adduct_top = "[M-?]1-"
            else:
                if adduct_charge_top < 0:
                    adduct_charge_top = -adduct_charge_top
                adduct_mass_shift_top = 1.007825
                if adduct_top == "?":
                    adduct_top = "[M+?]1+"

        # Calculate neutral mass from m/z, charge and adduct mass shift
        if adduct_charge_top and adduct_mass_shift_top is not None:
            adduct_mass_neutral_top = mz * abs(adduct_charge_top) - adduct_mass_shift_top

        # Determine adduct group for isotopologues and related adducts
        adduct_group = consensus_uid_counter  # Default: each entry gets its own group
        adduct_of = 0  # Default: this is the base adduct

        # Track base adducts ([M+H] iso=0 or [M-H] iso=0) for grouping
        base_adduct_key = None
        if iso == 0 and adduct_top in ["[M+H]+", "[M+H]1+", "[M-H]-", "[M-H]1-"]:
            # This is a base adduct with iso=0
            base_adduct_key = (round(mz, 4), adduct_top)
            base_adduct_groups[base_adduct_key] = consensus_uid_counter
        elif iso > 0:
            # This is an isotopologue, try to find the base adduct
            # Calculate the base m/z (subtract isotope mass shifts)
            c13_mass_shift = 1.00335
            base_mz = mz - (iso * c13_mass_shift / abs(adduct_charge_top))

            # Look for matching base adduct
            for (stored_mz, stored_adduct), stored_group in base_adduct_groups.items():
                if abs(stored_mz - base_mz) < mz_tol and stored_adduct == adduct_top:
                    adduct_group = stored_group
                    adduct_of = stored_group
                    break

        # Create adduct values list with proper structure (format: structured data with fields: adduct, count, percentage, mass)
        adduct_values = [{"adduct": adduct_top, "count": 1, "percentage": 100.0, "mass": 0.0}]

        # Generate unique consensus_id string (16 hex chars of a UUID4)
        import uuid
        consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]

        # Build consensus metadata with requested modifications for new entries.
        # Library-derived features use sentinel values: number_samples = 0.0,
        # intensity/chromatography stats = -1.0, id_top_* = None.
        metadata = {
            "consensus_uid": consensus_uid_counter,
            "consensus_id": consensus_id_str,
            "quality": 1.0,
            "number_samples": 0.0,  # Set to 0.0 for library entries
            "rt": float(rt),
            "mz": float(mz),
            "rt_min": float(rt),  # Set to rt as requested
            "rt_max": float(rt),  # Set to rt as requested
            "rt_mean": float(rt),  # Set to rt as requested
            "rt_start_mean": float(rt_start),
            "rt_end_mean": float(rt_end),
            "rt_delta_mean": 0.0,  # Set to 0.0 as requested
            "mz_min": float(mz),  # Set to mz as requested
            "mz_max": float(mz),  # Set to mz as requested
            "mz_mean": float(mz),  # Set to mz as requested
            "mz_start_mean": float(mz),  # Set to mz as requested
            "mz_end_mean": float(mz),  # Set to mz as requested
            "inty_mean": -1.0,  # Set to -1.0 as requested
            "bl": -1.0,
            "chrom_coherence_mean": -1.0,  # Set to -1.0 as requested
            "chrom_prominence_mean": -1.0,  # Set to -1.0 as requested
            "chrom_prominence_scaled_mean": -1.0,  # Set to -1.0 as requested
            "chrom_height_scaled_mean": -1.0,  # Set to -1.0 as requested
            "iso": iso,  # Set to iso from lib_df as requested
            "iso_mean": float(iso),  # Set to iso from lib_df as requested
            "charge_mean": float(abs(z)) if z else 1.0,  # Set to z as requested
            "number_ms2": 0,  # Will be updated by find_ms2
            "adducts": adduct_values,
            "adduct_charge_top": adduct_charge_top,
            "adduct_group": adduct_group,  # Use calculated adduct group
            "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6) if adduct_mass_neutral_top is not None else None,
            "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
            "adduct_of": adduct_of,  # Use calculated adduct_of
            "adduct_top": adduct_top,
            "id_top_name": None,  # Set to null as requested
            "id_top_class": None,  # Set to null as requested
            "id_top_adduct": None,  # Set to null as requested
            "id_top_score": None,  # Set to null as requested
        }

        consensus_metadata.append(metadata)

        # Create mapping entry (maps to library entry as "virtual" feature)
        # Use lib_uid as the feature_uid and a virtual sample_uid of 0
        # Match existing consensus_mapping_df column order: consensus_uid, feature_uid, sample_uid
        consensus_mapping_list.append({
            "consensus_uid": consensus_uid_counter,
            "feature_uid": lib_uid,  # Use lib_uid as feature reference
            "sample_uid": 0,  # Virtual sample for library entries
        })

        consensus_uid_counter += 1

    # Log matching statistics
    if logger:
        total_processed = matched_count + len(consensus_metadata)
        logger.info(f"Processed {total_processed} library entries: {matched_count} matched existing consensus features, {len(consensus_metadata)} created new features")

    # Convert to DataFrames with proper schema alignment
    if consensus_metadata:
        new_consensus_df = pl.DataFrame(consensus_metadata, strict=False)

        # Ensure schema compatibility with existing consensus_df
        if not study.consensus_df.is_empty():
            # Cast columns to match existing schema
            existing_schema = study.consensus_df.schema
            cast_exprs = []
            for col_name in new_consensus_df.columns:
                if col_name in existing_schema:
                    target_dtype = existing_schema[col_name]
                    if target_dtype == pl.Null:
                        # For Null columns, use lit(None) to maintain Null type
                        cast_exprs.append(pl.lit(None).alias(col_name))
                    else:
                        cast_exprs.append(pl.col(col_name).cast(target_dtype, strict=False))
                else:
                    cast_exprs.append(pl.col(col_name))

            new_consensus_df = new_consensus_df.select(cast_exprs)

        new_consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)

        # Append to existing DataFrames
        if not study.consensus_df.is_empty():
            study.consensus_df = pl.concat([study.consensus_df, new_consensus_df])
        else:
            study.consensus_df = new_consensus_df

        if not study.consensus_mapping_df.is_empty():
            study.consensus_mapping_df = pl.concat([study.consensus_mapping_df, new_consensus_mapping_df])
        else:
            study.consensus_mapping_df = new_consensus_mapping_df

        if logger:
            logger.info(f"Added {len(consensus_metadata)} consensus features from library")
    else:
        if logger:
            logger.warning("No valid consensus features created from library")
        return

    # Store operation in history
    if hasattr(study, "update_history"):
        study.update_history(
            ["lib_to_consensus"],
            {"chrom_fhwm": chrom_fhwm, "lib_entries": len(study.lib_df)},
        )

    # Perform find_ms2 at the end (best-effort: failures are logged, not raised)
    try:
        if hasattr(study, "find_ms2"):
            if logger:
                logger.info("Running find_ms2 to link MS2 spectra to library-derived consensus features")
            study.find_ms2()
        else:
            if logger:
                logger.warning("find_ms2 method not available on study object")
    except Exception as e:
        if logger:
            logger.warning(f"find_ms2 failed: {e}")

    if logger:
        logger.info(f"lib_to_consensus completed: {len(consensus_metadata)} features added")
|