masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +3 -9
- masster/data/libs/README.md +1 -1
- masster/data/libs/ccm.csv +120 -120
- masster/data/libs/ccm.py +116 -62
- masster/data/libs/central_carbon_README.md +1 -1
- masster/data/libs/urine.py +161 -65
- masster/data/libs/urine_metabolites.csv +4693 -4693
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
- masster/logger.py +43 -78
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +264 -338
- masster/sample/defaults/find_adducts_def.py +8 -21
- masster/sample/defaults/find_features_def.py +1 -6
- masster/sample/defaults/get_spectrum_def.py +1 -5
- masster/sample/defaults/sample_def.py +1 -5
- masster/sample/h5.py +282 -561
- masster/sample/helpers.py +75 -131
- masster/sample/lib.py +17 -42
- masster/sample/load.py +17 -31
- masster/sample/parameters.py +2 -6
- masster/sample/plot.py +27 -88
- masster/sample/processing.py +87 -117
- masster/sample/quant.py +51 -57
- masster/sample/sample.py +90 -103
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +12 -35
- masster/sample/sciex.py +19 -66
- masster/spectrum.py +20 -58
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +1 -5
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/fill_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/integrate_def.py +1 -5
- masster/study/defaults/study_def.py +25 -58
- masster/study/export.py +207 -233
- masster/study/h5.py +136 -470
- masster/study/helpers.py +202 -495
- masster/study/helpers_optimized.py +13 -40
- masster/study/id.py +110 -213
- masster/study/load.py +143 -230
- masster/study/plot.py +257 -518
- masster/study/processing.py +257 -469
- masster/study/save.py +5 -15
- masster/study/study.py +276 -379
- masster/study/study5_schema.json +96 -96
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
- masster-0.4.1.dist-info/RECORD +67 -0
- masster-0.4.0.dist-info/RECORD +0 -67
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/study/id.py
CHANGED
@@ -3,19 +3,14 @@
 Identification helpers for Study: load a Lib and identify consensus features
 by matching m/z (and optionally RT).
 """
-
 from __future__ import annotations
 
+from typing import Optional
 
 import polars as pl
 
 
-def lib_load(
-    study,
-    lib_source,
-    polarity: str | None = None,
-    adducts: list | None = None,
-):
+def lib_load(study, lib_source, polarity: Optional[str] = None, adducts: Optional[list] = None):
     """Load a compound library into the study.
 
     Args:
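Note: the new signature spells the optional parameters with `typing.Optional` instead of the PEP 604 union used in 0.4.0. The two are interchangeable at runtime on Python 3.10+; a quick standalone check (illustration only, not package code):

from typing import Optional

# Both spellings denote the same union type on Python 3.10+.
assert Optional[str] == (str | None)
assert Optional[list] == (list | None)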
@@ -30,7 +25,7 @@ def lib_load(
     """
     # Lazy import to avoid circular imports at module import time
     try:
-        from
+        from masster.lib.lib import Lib
     except Exception:
         Lib = None
 
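The try/except above is a lazy-import guard: if `masster.lib.lib` cannot be imported yet (for example during circular initialization), `Lib` degrades to `None` and the CSV branch re-checks it at call time. The pattern in isolation, using only what this hunk shows:

try:
    from masster.lib.lib import Lib  # may fail while the package is partially initialized
except Exception:
    Lib = None  # re-checked later; lib_load() raises ImportError for CSV input if still None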
@@ -39,89 +34,71 @@ def lib_load(
 
     # Use study polarity if not explicitly provided
     if polarity is None:
-        study_polarity = getattr(study, "polarity", "positive")
+        study_polarity = getattr(study, 'polarity', 'positive')
         # Normalize polarity names
-        if study_polarity in ["pos", "positive"]:
-            polarity = "positive"
-        elif study_polarity in ["neg", "negative"]:
-            polarity = "negative"
+        if study_polarity in ['pos', 'positive']:
+            polarity = 'positive'
+        elif study_polarity in ['neg', 'negative']:
+            polarity = 'negative'
         else:
-            polarity = "positive"  # Default fallback
+            polarity = 'positive'  # Default fallback
 
     # Handle string input (CSV file path)
     if isinstance(lib_source, str):
         if Lib is None:
-            raise ImportError(
-                "Could not import masster.lib.lib.Lib - required for CSV loading",
-            )
-
+            raise ImportError("Could not import masster.lib.lib.Lib - required for CSV loading")
+
         lib_obj = Lib()
         lib_obj.import_csv(lib_source, polarity=polarity, adducts=adducts)
-
+
     # Handle Lib instance
     elif Lib is not None and isinstance(lib_source, Lib):
         lib_obj = lib_source
-
+
     # Handle other objects with lib_df attribute
     elif hasattr(lib_source, "lib_df"):
         lib_obj = lib_source
-
+
     else:
-        raise TypeError(
-            "lib_source must be a CSV file path (str), a master.lib.Lib instance, or have a 'lib_df' attribute",
-        )
+        raise TypeError("lib_source must be a CSV file path (str), a masster.lib.Lib instance, or have a 'lib_df' attribute")
 
     # Ensure lib_df is populated
     lf = getattr(lib_obj, "lib_df", None)
-    if lf is None or (hasattr(lf, "is_empty") and lf.is_empty()):
+    if lf is None or (hasattr(lf, 'is_empty') and lf.is_empty()):
         raise ValueError("Library has no data populated in lib_df")
 
     # Filter by polarity to match study
     # Map polarity to charge signs
-    if polarity == "positive":
+    if polarity == 'positive':
         target_charges = [1, 2]  # positive charges
-    elif polarity == "negative":
+    elif polarity == 'negative':
         target_charges = [-1, -2]  # negative charges
     else:
         target_charges = [-2, -1, 1, 2]  # all charges
 
     # Filter library entries by charge sign (which corresponds to polarity)
     filtered_lf = lf.filter(pl.col("z").is_in(target_charges))
-
+
     if filtered_lf.is_empty():
-        print(
-            f"Warning: No library entries found for polarity '{polarity}'. Using all entries.",
-        )
+        print(f"Warning: No library entries found for polarity '{polarity}'. Using all entries.")
         filtered_lf = lf
 
     # Store pointer and DataFrame on study
     study._lib = lib_obj
-
+
     # Add to existing lib_df instead of replacing
-    if (
-        hasattr(study, "lib_df")
-        and study.lib_df is not None
-        and not study.lib_df.is_empty()
-    ):
+    if hasattr(study, 'lib_df') and study.lib_df is not None and not study.lib_df.is_empty():
         # Concatenate with existing data
         study.lib_df = pl.concat([study.lib_df, filtered_lf])
     else:
         # First time loading - create new
         try:
-            study.lib_df = (
-                filtered_lf.clone()
-                if hasattr(filtered_lf, "clone")
-                else pl.DataFrame(filtered_lf)
-            )
+            study.lib_df = filtered_lf.clone() if hasattr(filtered_lf, "clone") else pl.DataFrame(filtered_lf)
         except Exception:
-            study.lib_df = (
-                pl.from_pandas(filtered_lf)
-                if hasattr(filtered_lf, "to_pandas")
-                else pl.DataFrame(filtered_lf)
-            )
+            study.lib_df = pl.from_pandas(filtered_lf) if hasattr(filtered_lf, "to_pandas") else pl.DataFrame(filtered_lf)
 
 
-def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):
+def identify(study, mz_tol: float = 0.01, rt_tol: Optional[float] = None):
     """Identify consensus features against the loaded library.
 
     Matches consensus_df.mz against lib_df.mz within mz_tolerance. If rt_tolerance
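The `lib_load` hunks are mostly a mechanical reflow (wrapped calls collapsed to single lines, quote style switched), with one substantive fix: the `TypeError` message now names `masster.lib.Lib` rather than `master.lib.Lib`. A minimal usage sketch of the new signatures; `study` stands in for a masster Study with consensus features already built, and the CSV path is hypothetical:

from masster.study.id import lib_load, identify, get_id

lib_load(study, "metabolites.csv", polarity="positive")  # or pass a Lib instance
identify(study, mz_tol=0.01, rt_tol=0.5)                 # populates study.id_df
annotations = get_id(study)                               # joined, scored results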
@@ -135,12 +112,10 @@ def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):
     - rt_delta (nullable)
     """
     # Get logger from study if available
-    logger = getattr(study, "logger", None)
-
+    logger = getattr(study, 'logger', None)
+
     if logger:
-        logger.debug(
-            f"Starting identification with mz_tolerance={mz_tol}, rt_tolerance={rt_tol}",
-        )
+        logger.debug(f"Starting identification with mz_tolerance={mz_tol}, rt_tolerance={rt_tol}")
 
     # Validate inputs
     if getattr(study, "consensus_df", None) is None or study.consensus_df.is_empty():
@@ -156,11 +131,9 @@ def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):
 
     consensus_count = len(study.consensus_df)
     lib_count = len(study.lib_df)
-
+
     if logger:
-        logger.debug(
-            f"Identifying {consensus_count} consensus features against {lib_count} library entries",
-        )
+        logger.debug(f"Identifying {consensus_count} consensus features against {lib_count} library entries")
 
     results = []
     features_with_matches = 0
@@ -173,7 +146,7 @@ def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):
         cons_mz = cons.get("mz")
         cons_rt = cons.get("rt")
         cons_uid = cons.get("consensus_uid")
-
+
         if cons_mz is None:
             if logger:
                 logger.debug(f"Skipping consensus feature {cons_uid} - no m/z value")
@@ -181,7 +154,7 @@ def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):
 
         # Filter lib by mz window
         matches = study.lib_df.filter(
-            (pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol)
+            (pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol)
         )
 
         initial_matches = len(matches)
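The m/z window filter above is unchanged apart from indentation. On a toy frame (values invented for illustration) it keeps exactly the library rows within ±mz_tol of the consensus m/z:

import polars as pl

lib_df = pl.DataFrame({"mz": [180.0634, 181.0707, 203.0526]})
cons_mz, mz_tol = 181.0712, 0.01
matches = lib_df.filter(
    (pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol)
)
# keeps only 181.0707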
@@ -189,21 +162,15 @@ def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):
         # If rt_tol provided and consensus RT present, prefer rt-filtered hits
         if rt_tol is not None and cons_rt is not None:
             rt_matches = matches.filter(
-                pl.col("rt").is_not_null()
-                & (pl.col("rt") >= cons_rt - rt_tol)
-                & (pl.col("rt") <= cons_rt + rt_tol),
+                pl.col("rt").is_not_null() & (pl.col("rt") >= cons_rt - rt_tol) & (pl.col("rt") <= cons_rt + rt_tol)
             )
             if not rt_matches.is_empty():
                 matches = rt_matches
                 if logger:
-                    logger.debug(
-                        f"Consensus {cons_uid}: {initial_matches} m/z matches, {len(matches)} after RT filter",
-                    )
+                    logger.debug(f"Consensus {cons_uid}: {initial_matches} m/z matches, {len(matches)} after RT filter")
             else:
                 if logger:
-                    logger.debug(
-                        f"Consensus {cons_uid}: {initial_matches} m/z matches, 0 after RT filter - using m/z matches only",
-                    )
+                    logger.debug(f"Consensus {cons_uid}: {initial_matches} m/z matches, 0 after RT filter - using m/z matches only")
 
         # Apply scoring-based filtering system
         if not matches.is_empty():
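The reflow above preserves the fallback semantics: RT-filtered hits are preferred, but if the RT window empties the candidate set, the m/z-only matches survive. Condensed into a hypothetical helper (`rt_prefer` is not part of the package):

import polars as pl

def rt_prefer(matches: pl.DataFrame, cons_rt: float, rt_tol: float) -> pl.DataFrame:
    # Prefer hits inside the RT window; fall back to m/z-only matches if none remain.
    rt_matches = matches.filter(
        pl.col("rt").is_not_null()
        & (pl.col("rt") >= cons_rt - rt_tol)
        & (pl.col("rt") <= cons_rt + rt_tol)
    )
    return rt_matches if not rt_matches.is_empty() else matches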
@@ -215,20 +182,14 @@ def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):
             features_with_matches += 1
             feature_match_count = len(filtered_matches)
             total_matches += feature_match_count
-
+
             if logger:
-                logger.debug(
-                    f"Consensus {cons_uid} (mz={cons_mz:.5f}): {feature_match_count} library matches",
-                )
+                logger.debug(f"Consensus {cons_uid} (mz={cons_mz:.5f}): {feature_match_count} library matches")
 
             for m in filtered_matches.iter_rows(named=True):
                 mz_delta = abs(cons_mz - m.get("mz")) if m.get("mz") is not None else None
                 lib_rt = m.get("rt")
-                rt_delta = (
-                    abs(cons_rt - lib_rt)
-                    if (cons_rt is not None and lib_rt is not None)
-                    else None
-                )
+                rt_delta = abs(cons_rt - lib_rt) if (cons_rt is not None and lib_rt is not None) else None
                 results.append(
                     {
                         "consensus_uid": cons.get("consensus_uid"),
@@ -237,44 +198,36 @@ def identify(study, mz_tol: float = 0.01, rt_tol: float | None = None):
                         "rt_delta": rt_delta,
                         "matcher": "ms1",
                         "score": 1.0,
-                    }
+                    }
                 )
 
     study.id_df = pl.DataFrame(results) if results else pl.DataFrame()
-
+
     if logger:
         if rt_filtered_compounds > 0:
-            logger.debug(
-                f"RT consistency filtering applied to {rt_filtered_compounds} compound groups",
-            )
-
+            logger.debug(f"RT consistency filtering applied to {rt_filtered_compounds} compound groups")
+
         if multiply_charged_filtered > 0:
-            logger.debug(
-                f"Excluded {multiply_charged_filtered} multiply charged adducts (no [M+H]+ or [M-H]- coeluting)",
-            )
-
-        logger.debug(
-            f"Identification completed: {features_with_matches}/{consensus_count} features matched, {total_matches} total identifications",
-        )
-
+            logger.debug(f"Excluded {multiply_charged_filtered} multiply charged adducts (no [M+H]+ or [M-H]- coeluting)")
+
+        logger.info(f"Identification completed: {features_with_matches}/{consensus_count} features matched, {total_matches} total identifications")
+
+
         if total_matches > 0:
             # Calculate some statistics
             mz_deltas = [r["mz_delta"] for r in results if r["mz_delta"] is not None]
             rt_deltas = [r["rt_delta"] for r in results if r["rt_delta"] is not None]
-
+
             if mz_deltas:
                 avg_mz_delta = sum(mz_deltas) / len(mz_deltas)
                 max_mz_delta = max(mz_deltas)
-                logger.debug(
-                    f"m/z accuracy: average Δ={avg_mz_delta:.5f} Da, max Δ={max_mz_delta:.5f} Da",
-                )
-
+                logger.debug(f"m/z accuracy: average Δ={avg_mz_delta:.5f} Da, max Δ={max_mz_delta:.5f} Da")
+
             if rt_deltas:
                 avg_rt_delta = sum(rt_deltas) / len(rt_deltas)
                 max_rt_delta = max(rt_deltas)
-                logger.debug(
-                    f"RT accuracy: average Δ={avg_rt_delta:.2f} min, max Δ={max_rt_delta:.2f} min",
-                )
+                logger.debug(f"RT accuracy: average Δ={avg_rt_delta:.2f} min, max Δ={max_rt_delta:.2f} min")
+
 
 
 def get_id(study, features=None) -> pl.DataFrame:
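Besides the reflow, the behavioral change in this hunk is the completion summary being promoted from `logger.debug` to `logger.info`, so it now appears at default log levels. The row dicts built in the loop imply a `study.id_df` shaped roughly like this (toy values; `lib_uid` is inferred from `get_id`'s later join rather than visible in this hunk):

import polars as pl

id_df = pl.DataFrame([{
    "consensus_uid": 42,   # toy value
    "lib_uid": 7,          # assumed: get_id() joins id_df to lib_df on this key
    "mz_delta": 0.0021,
    "rt_delta": None,      # nullable when the library entry carries no RT
    "matcher": "ms1",
    "score": 1.0,
}])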
@@ -291,7 +244,7 @@ def get_id(study, features=None) -> pl.DataFrame:
     Returns:
         Polars DataFrame with columns:
         - consensus_uid
-        - lib_uid
+        - lib_uid
         - mz (consensus feature m/z)
         - rt (consensus feature RT)
         - name (compound name from library)
@@ -307,9 +260,7 @@ def get_id(study, features=None) -> pl.DataFrame:
     """
     # Validate inputs
    if getattr(study, "id_df", None) is None or study.id_df.is_empty():
-        raise ValueError(
-            "Identification results (study.id_df) are empty; call identify() first",
-        )
+        raise ValueError("Identification results (study.id_df) are empty; call identify() first")
 
     if getattr(study, "lib_df", None) is None or study.lib_df.is_empty():
         raise ValueError("Library (study.lib_df) is empty; call lib_load() first")
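The guards above enforce the call order lib_load() → identify() → get_id(). A runnable sketch of the fail-fast behavior, using a stand-in object rather than a real Study:

from types import SimpleNamespace
from masster.study.id import get_id

fresh = SimpleNamespace(id_df=None, lib_df=None)  # stand-in for a study with no results yet
try:
    get_id(fresh)
except ValueError as err:
    print(err)  # Identification results (study.id_df) are empty; call identify() first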
@@ -322,52 +273,35 @@ def get_id(study, features=None) -> pl.DataFrame:
 
     # Filter by features if provided
     if features is not None:
-        if hasattr(features, "columns"):  # DataFrame-like
-            if "consensus_uid" in features.columns:
-                uids = features["consensus_uid"].unique().to_list()
+        if hasattr(features, 'columns'):  # DataFrame-like
+            if 'consensus_uid' in features.columns:
+                uids = features['consensus_uid'].unique().to_list()
             else:
-                raise ValueError(
-                    "features DataFrame must contain 'consensus_uid' column",
-                )
-        elif hasattr(features, "__iter__") and not isinstance(
-            features,
-            str,
-        ):  # List-like
+                raise ValueError("features DataFrame must contain 'consensus_uid' column")
+        elif hasattr(features, '__iter__') and not isinstance(features, str):  # List-like
             uids = list(features)
         else:
-            raise ValueError(
-                "features must be a DataFrame with 'consensus_uid' column or a list of UIDs",
-            )
-
+            raise ValueError("features must be a DataFrame with 'consensus_uid' column or a list of UIDs")
+
         result_df = result_df.filter(pl.col("consensus_uid").is_in(uids))
-
+
     if result_df.is_empty():
         return pl.DataFrame()
 
     # Join with consensus_df to get consensus feature m/z and RT
     consensus_cols = ["consensus_uid", "mz", "rt"]
     # Only select columns that exist in consensus_df
-    available_consensus_cols = [
-        col for col in consensus_cols if col in study.consensus_df.columns
-    ]
-
+    available_consensus_cols = [col for col in consensus_cols if col in study.consensus_df.columns]
+
     result_df = result_df.join(
         study.consensus_df.select(available_consensus_cols),
         on="consensus_uid",
         how="left",
-        suffix="_consensus"
+        suffix="_consensus"
     )
 
     # Join with lib_df to get library information
-    lib_cols = [
-        "lib_uid",
-        "name",
-        "formula",
-        "adduct",
-        "smiles",
-        "cmpd_uid",
-        "inchikey",
-    ]
+    lib_cols = ["lib_uid", "name", "formula", "adduct", "smiles", "cmpd_uid", "inchikey"]
     # Add optional columns if they exist
     optional_lib_cols = ["inchi"]
     for col in optional_lib_cols:
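Per the branches above, `features` may be a DataFrame carrying a `consensus_uid` column or any non-string iterable of UIDs. Two equivalent calls (UID values hypothetical; `study` assumed populated):

subset = get_id(study, features=[101, 102, 103])        # plain list of consensus UIDs
subset = get_id(study, features=study.consensus_df)     # any frame with a consensus_uid column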
@@ -376,19 +310,19 @@ def get_id(study, features=None) -> pl.DataFrame:
 
     # Only select columns that exist in lib_df
     available_lib_cols = [col for col in lib_cols if col in study.lib_df.columns]
-
+
     result_df = result_df.join(
         study.lib_df.select(available_lib_cols),
-        on="lib_uid",
+        on="lib_uid",
         how="left",
-        suffix="_lib"
+        suffix="_lib"
     )
 
     # Reorder columns for better readability
     column_order = [
         "consensus_uid",
         "cmpd_uid" if "cmpd_uid" in result_df.columns else None,
-        "lib_uid",
+        "lib_uid",
         "name" if "name" in result_df.columns else None,
         "formula" if "formula" in result_df.columns else None,
         "adduct" if "adduct" in result_df.columns else None,
@@ -399,54 +333,34 @@ def get_id(study, features=None) -> pl.DataFrame:
         "matcher" if "matcher" in result_df.columns else None,
         "score" if "score" in result_df.columns else None,
         "smiles" if "smiles" in result_df.columns else None,
-        "inchikey" if "inchikey" in result_df.columns else None
+        "inchikey" if "inchikey" in result_df.columns else None
     ]
-
+
     # Add any remaining columns
     remaining_cols = [col for col in result_df.columns if col not in column_order]
     column_order.extend(remaining_cols)
-
+
     # Filter out None values and select existing columns
-    final_column_order = [
-        col for col in column_order if col is not None and col in result_df.columns
-    ]
-
+    final_column_order = [col for col in column_order if col is not None and col in result_df.columns]
+
     result_df = result_df.select(final_column_order)
-
+
     # Add compound and formula count columns
     if "consensus_uid" in result_df.columns:
         # Calculate counts per consensus_uid
-        count_stats = result_df.group_by("consensus_uid").agg(
-            [
-                pl.col("cmpd_uid").n_unique().alias("num_cmpds")
-                if "cmpd_uid" in result_df.columns
-                else pl.lit(None).alias("num_cmpds"),
-                pl.col("formula").n_unique().alias("num_formulas")
-                if "formula" in result_df.columns
-                else pl.lit(None).alias("num_formulas"),
-            ],
-        )
-
+        count_stats = result_df.group_by("consensus_uid").agg([
+            pl.col("cmpd_uid").n_unique().alias("num_cmpds") if "cmpd_uid" in result_df.columns else pl.lit(None).alias("num_cmpds"),
+            pl.col("formula").n_unique().alias("num_formulas") if "formula" in result_df.columns else pl.lit(None).alias("num_formulas")
+        ])
+
         # Join the counts back to the main dataframe
         result_df = result_df.join(count_stats, on="consensus_uid", how="left")
-
+
         # Reorder columns to put count columns in the right position
         final_columns = []
         for col in result_df.columns:
-            if col in [
-                "consensus_uid",
-                "cmpd_uid",
-                "lib_uid",
-                "name",
-                "formula",
-                "adduct",
-                "mz",
-                "mz_delta",
-                "rt",
-                "rt_delta",
-                "matcher",
-                "score",
-            ]:
+            if col in ["consensus_uid", "cmpd_uid", "lib_uid", "name", "formula", "adduct",
+                       "mz", "mz_delta", "rt", "rt_delta", "matcher", "score"]:
                 final_columns.append(col)
         # Add count columns
         if "num_cmpds" in result_df.columns:
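The rewrapped `agg` still counts distinct compounds and formulas per consensus feature, which feeds the num_cmpds/num_formulas score penalties below. On a toy frame (invented values):

import polars as pl

df = pl.DataFrame({
    "consensus_uid": [1, 1, 2],
    "cmpd_uid": ["a", "b", "c"],
    "formula": ["C6H12O6", "C6H12O6", "C5H10O5"],
})
counts = df.group_by("consensus_uid").agg([
    pl.col("cmpd_uid").n_unique().alias("num_cmpds"),
    pl.col("formula").n_unique().alias("num_formulas"),
])
# consensus 1 -> num_cmpds=2, num_formulas=1; consensus 2 -> 1 and 1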
@@ -457,14 +371,14 @@ def get_id(study, features=None) -> pl.DataFrame:
         for col in result_df.columns:
             if col not in final_columns:
                 final_columns.append(col)
-
+
         result_df = result_df.select(final_columns)
-
+
     # Apply scoring-based filtering system
     if "consensus_uid" in result_df.columns and len(result_df) > 0:
         # (i) Start with score 1.0 for all
         result_df = result_df.with_columns(pl.lit(1.0).alias("score"))
-
+
         # (ii) If not [M+H]+ or [M-H]-, score *= 0.7
         if "adduct" in result_df.columns:
             preferred_adducts = ["[M+H]+", "[M-H]-"]
@@ -472,96 +386,79 @@ def get_id(study, features=None) -> pl.DataFrame:
                 pl.when(pl.col("adduct").is_in(preferred_adducts))
                 .then(pl.col("score"))
                 .otherwise(pl.col("score") * 0.7)
-                .alias("score")
+                .alias("score")
             )
-
+
         # (iii) If num_formulas > 1, score *= 0.7
         if "num_formulas" in result_df.columns:
             result_df = result_df.with_columns(
                 pl.when(pl.col("num_formulas") > 1)
                 .then(pl.col("score") * 0.7)
                 .otherwise(pl.col("score"))
-                .alias("score")
+                .alias("score")
             )
-
+
         # (iv) If num_cmpds > 1, score *= 0.7
         if "num_cmpds" in result_df.columns:
             result_df = result_df.with_columns(
                 pl.when(pl.col("num_cmpds") > 1)
                 .then(pl.col("score") * 0.7)
                 .otherwise(pl.col("score"))
-                .alias("score")
+                .alias("score")
             )
-
+
         # (v) Rank by score, assume that highest score has the correct rt
         # (vi) Remove all lower-scoring ids with a different rt (group by cmpd_uid)
         # (vii) Remove multiply charged ids if not in line with [M+H]+ or [M-H]- (group by cmpd_uid)
-
+
         # Group by cmpd_uid and apply filtering logic
         if "cmpd_uid" in result_df.columns:
             filtered_dfs = []
             for cmpd_uid, group_df in result_df.group_by("cmpd_uid"):
                 # Sort by score descending to get highest score first
                 group_df = group_df.sort("score", descending=True)
-
+
                 if len(group_df) == 0:
                     continue
-
+
                 # Get the highest scoring entry's RT as reference
-                reference_rt = (
-                    group_df["rt"][0]
-                    if "rt" in group_df.columns and group_df["rt"][0] is not None
-                    else None
-                )
-
+                reference_rt = group_df["rt"][0] if "rt" in group_df.columns and group_df["rt"][0] is not None else None
+
                 # Filter entries: keep those with same RT as highest scoring entry
                 if reference_rt is not None and "rt" in group_df.columns:
                     # Keep entries with the same RT or null RT
                     rt_filtered = group_df.filter(
-                        (pl.col("rt") == reference_rt) | pl.col("rt").is_null()
+                        (pl.col("rt") == reference_rt) | pl.col("rt").is_null()
                     )
                 else:
                     # No reference RT, keep all
                     rt_filtered = group_df
-
+
                 # Check multiply charged constraint
-                if (
-                    "z" in rt_filtered.columns
-                    and "adduct" in rt_filtered.columns
-                    and len(rt_filtered) > 0
-                ):
+                if "z" in rt_filtered.columns and "adduct" in rt_filtered.columns and len(rt_filtered) > 0:
                     # Check if there are multiply charged adducts
-                    multiply_charged = rt_filtered.filter(
-                        (pl.col("z") > 1) | (pl.col("z") < -1),
-                    )
-                    singly_charged = rt_filtered.filter(
-                        (pl.col("z") == 1) | (pl.col("z") == -1),
-                    )
-
+                    multiply_charged = rt_filtered.filter((pl.col("z") > 1) | (pl.col("z") < -1))
+                    singly_charged = rt_filtered.filter((pl.col("z") == 1) | (pl.col("z") == -1))
+
                     if not multiply_charged.is_empty():
                         # Check if [M+H]+ or [M-H]- are present
                         reference_adducts = ["[M+H]+", "[M-H]-"]
-                        has_reference = any(
-                            singly_charged.filter(
-                                pl.col("adduct").is_in(reference_adducts),
-                            ).height
-                            > 0,
-                        )
-
+                        has_reference = any(singly_charged.filter(pl.col("adduct").is_in(reference_adducts)).height > 0)
+
                         if not has_reference:
                             # Remove multiply charged adducts
                             rt_filtered = singly_charged
-
+
                 if len(rt_filtered) > 0:
                     filtered_dfs.append(rt_filtered)
-
+
             if filtered_dfs:
                 result_df = pl.concat(filtered_dfs)
             else:
                 result_df = pl.DataFrame()
-
+
     # Sort by cmpd_uid if available
     if "cmpd_uid" in result_df.columns:
         result_df = result_df.sort("cmpd_uid")
-
+
     return result_df
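Taken together, the scoring rules in `get_id` are multiplicative 0.7 penalties on a starting score of 1.0: one for a non-preferred adduct, one for formula ambiguity, one for compound ambiguity. A worked example with hypothetical values:

score = 1.0
adduct, num_formulas, num_cmpds = "[M+2H]2+", 2, 1  # hypothetical hit

if adduct not in ("[M+H]+", "[M-H]-"):
    score *= 0.7  # (ii) non-preferred adduct
if num_formulas > 1:
    score *= 0.7  # (iii) more than one candidate formula
if num_cmpds > 1:
    score *= 0.7  # (iv) more than one candidate compound

print(round(score, 2))  # 0.49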