masster 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster has been flagged as potentially problematic; see the registry's advisory page for details.
- masster/docs/SCX_API_Documentation.md +0 -0
- masster/docs/SCX_DLL_Analysis.md +0 -0
- masster/logger.py +92 -78
- masster/sample/defaults/find_features_def.py +16 -6
- masster/sample/defaults/sample_def.py +1 -1
- masster/sample/h5.py +2 -2
- masster/sample/helpers.py +190 -140
- masster/sample/load.py +13 -9
- masster/sample/plot.py +256 -147
- masster/sample/processing.py +18 -12
- masster/sample/sample.py +10 -4
- masster/sample/sample5_schema.json +38 -29
- masster/sample/save.py +16 -13
- masster/sample/sciex.py +187 -176
- masster/study/defaults/align_def.py +231 -13
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/study_def.py +2 -2
- masster/study/export.py +144 -131
- masster/study/h5.py +193 -133
- masster/study/helpers.py +757 -246
- masster/study/helpers_optimized.py +99 -57
- masster/study/load.py +57 -25
- masster/study/plot.py +1244 -129
- masster/study/processing.py +194 -86
- masster/study/save.py +7 -7
- masster/study/study.py +154 -89
- masster/study/study5_schema.json +15 -15
- {masster-0.3.10.dist-info → masster-0.3.12.dist-info}/METADATA +1 -1
- {masster-0.3.10.dist-info → masster-0.3.12.dist-info}/RECORD +33 -31
- {masster-0.3.10.dist-info → masster-0.3.12.dist-info}/WHEEL +0 -0
- {masster-0.3.10.dist-info → masster-0.3.12.dist-info}/entry_points.txt +0 -0
- {masster-0.3.10.dist-info → masster-0.3.12.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py
CHANGED
@@ -7,7 +7,290 @@ import pandas as pd
 import polars as pl

 from tqdm import tqdm
+from masster.chromatogram import Chromatogram
+
+
+def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
+    """
+    Return a Chromatogram object containing the Base Peak Chromatogram (BPC).
+
+    The `owner` argument may be either a Study instance or a Sample-like object that
+    exposes `ms1_df` (Polars DataFrame) and optionally `scans_df`.
+
+    If `owner` is a Study, `sample` must be provided (int sample_uid, str sample_name or Sample instance)
+    and the Sample will be retrieved using `get_sample(owner, sample)`.
+
+    Returns:
+        Chromatogram
+    """
+    # resolve sample when owner is a Study-like object (has get_sample)
+    s = None
+    if hasattr(owner, "ms1_df"):
+        s = owner
+    else:
+        # owner is expected to be a Study
+        s = get_sample(owner, sample)
+
+    if s is None:
+        raise ValueError("Could not resolve sample for BPC computation")
+
+    # ensure ms1_df exists
+    if getattr(s, "ms1_df", None) is None:
+        raise ValueError("Sample has no ms1_df for BPC computation")
+
+    # try Polars aggregation first
+    try:
+        cols = s.ms1_df.columns
+        if not all(c in cols for c in ["rt", "inty"]):
+            raise RuntimeError("ms1_df missing required columns")
+
+        bpc = s.ms1_df.select([pl.col("rt"), pl.col("inty")])
+        bpc = bpc.groupby("rt").agg(pl.col("inty").max().alias("inty"))
+        bpc_pd = bpc.to_pandas().sort_values("rt")
+    except Exception:
+        # fallback to pandas
+        try:
+            bpc_pd = s.ms1_df.to_pandas()[["rt", "inty"]]
+            bpc_pd = bpc_pd.groupby("rt").agg({"inty": "max"}).reset_index().sort_values("rt")
+        except Exception:
+            raise
+
+    if bpc_pd.empty:
+        raise ValueError("Computed BPC is empty")
+
+    # If caller requests original RTs (original=True) and we were called from a Study
+    # we can obtain a per-sample mapping between current rt and rt_original from
+    # the study.features_df and apply it to the computed BPC rt values.
+    # Note: original parameter default is False (return current/aligned RTs).
+    if original is True:
+        try:
+            # Only proceed if owner is a Study-like object with features_df
+            study = None
+            if hasattr(owner, "features_df"):
+                study = owner
+            else:
+                # If owner is a Sample, try to find Study via attribute (not guaranteed)
+                study = getattr(owner, "study", None)
+
+            if study is not None and getattr(study, "features_df", None) is not None:
+                # Attempt to select mapping rows for this sample. Prefer matching by sample_uid,
+                # fall back to sample_name when necessary.
+                import numpy as _np
+
+                feats = study.features_df
+                # try filtering by sample identifier provided to this function
+                mapping_rows = None
+                if sample is not None:
+                    try:
+                        mapping_rows = feats.filter(pl.col("sample_uid") == sample)
+                    except Exception:
+                        mapping_rows = pl.DataFrame()
+
+                if mapping_rows is None or mapping_rows.is_empty():
+                    try:
+                        mapping_rows = feats.filter(pl.col("sample_name") == sample)
+                    except Exception:
+                        mapping_rows = pl.DataFrame()
+
+                # If we still have no sample selector, try to infer sample from the Sample object s
+                if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(s, "sample_path"):
+                    # attempt to match by sample_path or file name
+                    try:
+                        sample_paths = feats.select(["sample_uid", "sample_name", "sample_path"])  # type: ignore[arg-type]
+                        # find row where sample_path matches
+                        mapping_rows = feats.filter(pl.col("sample_path") == getattr(s, "file", None))
+                    except Exception:
+                        mapping_rows = pl.DataFrame()
+
+                # If still empty, give up mapping
+                if mapping_rows is not None and not mapping_rows.is_empty():
+                    # collect rt and rt_original pairs
+                    try:
+                        map_pd = mapping_rows.select(["rt", "rt_original"]).to_pandas()
+                    except Exception:
+                        map_pd = mapping_rows.to_pandas()[["rt", "rt_original"]]
+
+                    # drop NA and duplicates
+                    map_pd = map_pd.dropna()
+                    if not map_pd.empty:
+                        # sort by rt (current/aligned)
+                        map_pd = map_pd.sort_values("rt")
+                        x = map_pd["rt"].to_numpy()
+                        y = map_pd["rt_original"].to_numpy()
+                        # require at least 2 points to interpolate
+                        if x.size >= 2:
+                            # apply linear interpolation from current rt -> original rt
+                            # for values outside the known range, numpy.interp will clip to endpoints
+                            new_rt = _np.interp(bpc_pd["rt"].to_numpy(), x, y)
+                            bpc_pd = bpc_pd.copy()
+                            bpc_pd["rt"] = new_rt
+        except Exception:
+            # If mapping fails, silently continue and return the original computed BPC
+            pass
+
+    # build Chromatogram
+    ycol = "inty"
+    try:
+        chrom = Chromatogram(rt=bpc_pd["rt"].to_numpy(), inty=bpc_pd[ycol].to_numpy(), label=label or "Base Peak Chromatogram", rt_unit=rt_unit)
+    except Exception:
+        chrom = Chromatogram(rt=bpc_pd["rt"].values, inty=bpc_pd[ycol].values, label=label or "Base Peak Chromatogram", rt_unit=rt_unit)
+
+    return chrom
+
+
+def get_tic(owner, sample=None, label=None):
+    """
+    Return a Chromatogram object containing the Total Ion Chromatogram (TIC).
+
+    `owner` may be a Sample-like object (has `ms1_df`) or a Study (in which case `sample` selects the sample).
+    The function falls back to `scans_df` when `ms1_df` is not available.
+    """
+    # resolve sample object
+    s = None
+    if hasattr(owner, "ms1_df"):
+        s = owner
+    else:
+        s = get_sample(owner, sample)
+
+    if s is None:
+        raise ValueError("Could not resolve sample for TIC computation")
+
+    # prefer ms1_df
+    try:
+        cols = s.ms1_df.columns
+        if all(c in cols for c in ["rt", "inty"]):
+            tic = s.ms1_df.select([pl.col("rt"), pl.col("inty")])
+            tic = tic.groupby("rt").agg(pl.col("inty").sum().alias("inty_tot"))
+            tic_pd = tic.to_pandas().sort_values("rt")
+        else:
+            raise RuntimeError("ms1_df missing required columns")
+    except Exception:
+        # fallback to scans_df if present
+        if getattr(s, "scans_df", None) is not None:
+            try:
+                scans = s.scans_df.filter(pl.col("ms_level") == 1)
+                data = scans[["rt", "scan_uid", "inty_tot"]].to_pandas()
+                data = data.sort_values("rt")
+                tic_pd = data.rename(columns={"inty_tot": "inty_tot"})
+            except Exception:
+                raise
+        else:
+            raise ValueError("Neither ms1_df nor scans_df available for TIC computation")
+
+    if tic_pd.empty:
+        raise ValueError("Computed TIC is empty")
+
+    # ensure column name
+    if "inty_tot" not in tic_pd.columns:
+        tic_pd = tic_pd.rename(columns={tic_pd.columns[1]: "inty_tot"})
+
+    try:
+        chrom = Chromatogram(rt=tic_pd["rt"].to_numpy(), inty=tic_pd["inty_tot"].to_numpy(), label=label or "Total Ion Chromatogram")
+    except Exception:
+        chrom = Chromatogram(rt=tic_pd["rt"].values, inty=tic_pd["inty_tot"].values, label=label or "Total Ion Chromatogram")
+
+    return chrom
+

+def get_eic(owner, sample=None, mz=None, mz_tol=0.01, rt_unit="s", label=None):
+    """
+    Return a Chromatogram object containing the Extracted Ion Chromatogram (EIC) for a target m/z.
+
+    The `owner` argument may be either a Study instance or a Sample-like object that
+    exposes `ms1_df` (Polars DataFrame).
+
+    If `owner` is a Study, `sample` must be provided (int sample_uid, str sample_name or Sample instance)
+    and the Sample will be retrieved using `get_sample(owner, sample)`.
+
+    Parameters:
+        owner: Study or Sample instance
+        sample: Sample identifier (required if owner is Study)
+        mz (float): Target m/z value
+        mz_tol (float): m/z tolerance (default 0.01)
+        rt_unit (str): Retention time unit for the chromatogram
+        label (str): Optional label for the chromatogram
+
+    Returns:
+        Chromatogram
+    """
+    if mz is None:
+        raise ValueError("mz must be provided for EIC computation")
+
+    # resolve sample when owner is a Study-like object (has get_sample)
+    s = None
+    if hasattr(owner, "ms1_df"):
+        s = owner
+    else:
+        # owner is expected to be a Study
+        s = get_sample(owner, sample)
+
+    if s is None:
+        raise ValueError("Could not resolve sample for EIC computation")
+
+    # ensure ms1_df exists
+    if getattr(s, "ms1_df", None) is None:
+        raise ValueError("Sample has no ms1_df for EIC computation")
+
+    # Extract EIC from ms1_df using mz window
+    try:
+        cols = s.ms1_df.columns
+        if not all(c in cols for c in ["rt", "mz", "inty"]):
+            raise RuntimeError("ms1_df missing required columns")
+
+        # Filter by mz window
+        mz_min = mz - mz_tol
+        mz_max = mz + mz_tol
+        eic_data = s.ms1_df.filter(
+            (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+        )
+
+        if eic_data.is_empty():
+            # Return empty chromatogram if no data found
+            import numpy as _np
+            return Chromatogram(
+                rt=_np.array([0.0]),
+                inty=_np.array([0.0]),
+                label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
+                rt_unit=rt_unit
+            )
+
+        # Aggregate intensities per retention time (sum in case of multiple points per rt)
+        eic = eic_data.group_by("rt").agg(pl.col("inty").sum().alias("inty"))
+        eic_pd = eic.sort("rt").to_pandas()
+
+    except Exception:
+        raise RuntimeError("Failed to extract EIC from ms1_df")
+
+    if eic_pd.empty:
+        # Return empty chromatogram if no data found
+        import numpy as _np
+        return Chromatogram(
+            rt=_np.array([0.0]),
+            inty=_np.array([0.0]),
+            label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
+            rt_unit=rt_unit
+        )
+
+    # build Chromatogram
+    try:
+        chrom = Chromatogram(
+            rt=eic_pd["rt"].to_numpy(),
+            inty=eic_pd["inty"].to_numpy(),
+            label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
+            rt_unit=rt_unit
+        )
+    except Exception:
+        chrom = Chromatogram(
+            rt=eic_pd["rt"].values,
+            inty=eic_pd["inty"].values,
+            label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
+            rt_unit=rt_unit
+        )
+
+    return chrom
+
+
+

 def get_chrom(self, uids=None, samples=None):
     # Check if consensus_df is empty or doesn't have required columns
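For orientation (not part of the diff): a minimal usage sketch of the three chromatogram helpers added above. The `study` object and the sample name "QC_01" are hypothetical; the import path follows the file this diff modifies (masster/study/helpers.py).

```python
# Hypothetical usage of the helpers added in this diff; `study` is an already-loaded
# masster Study and "QC_01" is a made-up sample name.
from masster.study.helpers import get_bpc, get_eic, get_tic

bpc = get_bpc(study, sample="QC_01")                             # base peak chromatogram
tic = get_tic(study, sample="QC_01")                             # total ion chromatogram
eic = get_eic(study, sample="QC_01", mz=301.1416, mz_tol=0.01)   # EIC in a +/- 0.01 m/z window
```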
@@ -113,6 +396,7 @@ def get_chrom(self, uids=None, samples=None):
     # Return as Polars DataFrame (can handle complex objects like Chromatogram)
     return df2_pivoted

+
 def set_folder(self, folder):
     """
     Set the folder for saving and loading files.
@@ -123,8 +407,6 @@ def set_folder(self, folder):


 def align_reset(self):
-    if self.alignment_ref_index is None:
-        return
     self.logger.debug("Resetting alignment.")
     # iterate over all feature maps and set RT to original RT
     for feature_map in self.features_maps:
@@ -134,7 +416,13 @@ def align_reset(self):
             feature.setRT(rt)
             feature.removeMetaValue("original_RT")
     self.alignment_ref_index = None
-
+    # in self.features_df, set rt equal to rt_original
+    self.features_df = self.features_df.with_columns(
+        pl.col("rt_original").alias("rt")
+    )
+
+    # Ensure column order is maintained after with_columns operation
+    self._ensure_features_df_schema_order()

 # TODO I don't get this param
 def get_consensus(self, quant="chrom_area"):
@@ -408,17 +696,71 @@ def _get_sample_uids(self, samples=None, seed=42):
     sample_uids = list(set(sample_uids))
     return sample_uids

+
+def get_sample(self, sample):
+    """
+    Return a `Sample` object corresponding to the provided sample identifier.
+
+    Accepted `sample` values:
+    - int: interpreted as `sample_uid`
+    - str: interpreted as `sample_name`
+    - Sample instance: returned as-is
+
+    This helper mirrors the original Study.get_sample method but lives in helpers for reuse.
+    """
+    from masster.sample.sample import Sample
+
+    if isinstance(sample, Sample):
+        return sample
+
+    if isinstance(sample, int):
+        rows = self.samples_df.filter(pl.col("sample_uid") == sample)
+    elif isinstance(sample, str):
+        rows = self.samples_df.filter(pl.col("sample_name") == sample)
+    else:
+        raise ValueError("sample must be an int (sample_uid), str (sample_name) or a Sample instance")
+
+    if rows.is_empty():
+        raise KeyError(f"Sample not found: {sample}")
+
+    row = rows.row(0, named=True)
+    sample_uid = int(row["sample_uid"]) if row["sample_uid"] is not None else None
+
+    # Use a cache on the Study instance if available
+    cache = getattr(self, "_samples_cache", None)
+    if cache is not None and sample_uid in cache:
+        return cache[sample_uid]
+
+    sample_path = row.get("sample_path", None)
+    s = Sample(log_level='ERROR')
+    try:
+        if sample_path:
+            try:
+                s.load(sample_path)
+            except Exception:
+                s = Sample(file=sample_path)
+    except Exception:
+        pass
+
+    if cache is not None and sample_uid is not None:
+        cache[sample_uid] = s
+    return s
+
+
 def get_orphans(self):
-    """
+    """
     Get all features that are not in the consensus mapping.
     """
-    not_in_consensus = self.features_df.filter(
+    not_in_consensus = self.features_df.filter(
+        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list())
+    )
     return not_in_consensus

+
 def compress(self, features=True, ms2=True, chrom=False, ms2_max=5):
     """
     Perform compress_features, compress_ms2, and compress_chrom operations.
-
+
     Parameters:
         max_replicates (int): Maximum number of MS2 replicates to keep per consensus_uid and energy combination
     """
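A short sketch of the identifier forms get_sample accepts, per its docstring above. The study object, uid, and name are invented, and the helper is assumed to be reachable as a Study method, as the docstring suggests.

```python
# Hypothetical calls; 3 and "blank_02" are made-up identifiers.
s_by_uid = study.get_sample(3)              # int -> matched against samples_df["sample_uid"]
s_by_name = study.get_sample("blank_02")    # str -> matched against samples_df["sample_name"]
s_passthrough = study.get_sample(s_by_uid)  # Sample instance -> returned unchanged
```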
@@ -441,48 +783,50 @@ def compress_features(self):
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features_df found.")
         return
-
+
     if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
         self.logger.warning("No consensus_mapping_df found.")
         return
-
+
     initial_count = len(self.features_df)
-
+
     # Get feature_uids that are associated with consensus features
     consensus_feature_uids = self.consensus_mapping_df["feature_uid"].to_list()
-
+
     # Filter features_df to keep only features associated with consensus
     self.features_df = self.features_df.filter(
-        pl.col("feature_uid").is_in(consensus_feature_uids)
+        pl.col("feature_uid").is_in(consensus_feature_uids),
     )
-
+
     # Set ms2_specs column to None if it exists
     if "ms2_specs" in self.features_df.columns:
         # Create a list of None values with the same length as the dataframe
         # This preserves the Object dtype instead of converting to Null
         none_values = [None] * len(self.features_df)
         self.features_df = self.features_df.with_columns(
-            pl.Series("ms2_specs", none_values, dtype=pl.Object)
+            pl.Series("ms2_specs", none_values, dtype=pl.Object),
         )
-
+
     removed_count = initial_count - len(self.features_df)
-    self.logger.info(
+    self.logger.info(
+        f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column"
+    )


 def restore_features(self, samples=None, maps=False):
     """
-    Update specific columns (chrom, chrom_area, ms2_scans, ms2_specs) in features_df
+    Update specific columns (chrom, chrom_area, ms2_scans, ms2_specs) in features_df
     from the corresponding samples by reading features_df from the sample5 file.
     Use the feature_id for matching.

     Parameters:
-        samples (list, optional): List of sample_uids or sample_names to restore.
+        samples (list, optional): List of sample_uids or sample_names to restore.
             If None, restores all samples.
         maps (bool, optional): If True, also load featureXML data and update study.feature_maps.
     """
     import datetime
     from masster.sample.sample import Sample
-
+
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
         return
@@ -499,8 +843,8 @@ def restore_features(self, samples=None, maps=False):
         return

     # Columns to update from sample data
-    columns_to_update = [
-
+    columns_to_update = ["chrom", "chrom_area", "ms2_scans", "ms2_specs"]
+
     self.logger.info(f"Restoring columns {columns_to_update} from {len(sample_uids)} samples...")

     # Create a mapping of (sample_uid, feature_id) to feature_uid from study.features_df
@@ -512,10 +856,12 @@ def restore_features(self, samples=None, maps=False):

     # Process each sample
     tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-    for sample_uid in tqdm(
-
-
-
+    for sample_uid in tqdm(
+        sample_uids,
+        unit="sample",
+        disable=tqdm_disable,
+        desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring samples",
+    ):
         # Get sample info
         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
         if sample_row.is_empty():
@@ -534,7 +880,7 @@ def restore_features(self, samples=None, maps=False):
             # Load sample to get its features_df
             # Use a direct load call with map=False to prevent feature synchronization
             # which would remove filled features that don't exist in the original FeatureMap
-            sample = Sample(log_level=
+            sample = Sample(log_level="DEBUG")
             sample._load_sample5(sample_path, map=False)

             if sample.features_df is None or sample.features_df.is_empty():
@@ -547,34 +893,34 @@ def restore_features(self, samples=None, maps=False):
                 feature_id = row.get("feature_id")
                 if feature_id is None:
                     continue
-
+
                 key = (sample_uid, feature_id)
                 if key in study_feature_mapping:
                     feature_uid = study_feature_mapping[key]
-
+
                     # Update the specific columns in study.features_df
                     for col in columns_to_update:
                         if col in row and col in self.features_df.columns:
                             # Get the original column dtype to preserve it
                             original_dtype = self.features_df[col].dtype
-
+
                             # Update the specific row and column, preserving dtype
                             mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
-
+
                             # Handle object columns (like Chromatogram) differently
                             if original_dtype == pl.Object:
                                 self.features_df = self.features_df.with_columns(
                                     pl.when(mask)
                                     .then(pl.lit(row[col], dtype=original_dtype, allow_object=True))
                                     .otherwise(pl.col(col))
-                                    .alias(col)
+                                    .alias(col),
                                 )
                             else:
                                 self.features_df = self.features_df.with_columns(
                                     pl.when(mask)
                                     .then(pl.lit(row[col], dtype=original_dtype))
                                     .otherwise(pl.col(col))
-                                    .alias(col)
+                                    .alias(col),
                                 )
                             updates_made += 1

@@ -582,7 +928,7 @@ def restore_features(self, samples=None, maps=False):

             # If maps is True, load featureXML data
             if maps:
-                if hasattr(sample,
+                if hasattr(sample, "feature_maps"):
                     self.feature_maps.extend(sample.feature_maps)

         except Exception as e:
@@ -595,14 +941,14 @@ def restore_features(self, samples=None, maps=False):
 def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     """
     Restore chromatograms from individual .sample5 files and gap-fill missing ones.
-
+
     This function combines the functionality of restore_features() and fill_chrom():
     1. First restores chromatograms from individual .sample5 files (like restore_features)
     2. Then gap-fills any remaining empty chromatograms (like fill_chrom)
     3. ONLY updates the 'chrom' column, not chrom_area or other derived values
-
+
     Parameters:
-        samples (list, optional): List of sample_uids or sample_names to process.
+        samples (list, optional): List of sample_uids or sample_names to process.
             If None, processes all samples.
         mz_tol (float): m/z tolerance for gap filling (default: 0.010)
         rt_tol (float): RT tolerance for gap filling (default: 10.0)
@@ -611,7 +957,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     import numpy as np
     from masster.sample.sample import Sample
     from masster.chromatogram import Chromatogram
-
+
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
         return
@@ -627,7 +973,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
         return

     self.logger.info(f"Restoring chromatograms from {len(sample_uids)} samples...")
-
+
     # Create mapping of (sample_uid, feature_id) to feature_uid
     study_feature_mapping = {}
     for row in self.features_df.iter_rows(named=True):
@@ -638,12 +984,13 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     # Phase 1: Restore from individual .sample5 files (like restore_features)
     restored_count = 0
     tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
+
     self.logger.info("Phase 1: Restoring chromatograms from .sample5 files...")
-    for sample_uid in tqdm(
-
-
-
+    for sample_uid in tqdm(
+        sample_uids,
+        desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring from samples",
+        disable=tqdm_disable,
+    ):
         # Get sample info
         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
         if sample_row.is_empty():
@@ -660,7 +1007,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):

         try:
             # Load sample (with map=False to prevent feature synchronization)
-            sample = Sample(log_level=
+            sample = Sample(log_level="WARNING")
             sample._load_sample5(sample_path, map=False)

             if sample.features_df is None or sample.features_df.is_empty():
@@ -671,21 +1018,21 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             for row in sample.features_df.iter_rows(named=True):
                 feature_id = row.get("feature_id")
                 chrom = row.get("chrom")
-
+
                 if feature_id is None or chrom is None:
                     continue
-
+
                 key = (sample_uid, feature_id)
                 if key in study_feature_mapping:
                     feature_uid = study_feature_mapping[key]
-
+
                     # Update only the chrom column
                     mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
                     self.features_df = self.features_df.with_columns(
                         pl.when(mask)
                         .then(pl.lit(chrom, dtype=pl.Object, allow_object=True))
                         .otherwise(pl.col("chrom"))
-                        .alias("chrom")
+                        .alias("chrom"),
                     )
                     restored_count += 1

@@ -694,20 +1041,22 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             continue

     self.logger.info(f"Phase 1 complete: Restored {restored_count} chromatograms from .sample5 files")
-
+
     # Phase 2: Gap-fill remaining empty chromatograms (like fill_chrom)
     self.logger.info("Phase 2: Gap-filling remaining empty chromatograms...")
-
+
     # Count how many chromatograms are still missing
     empty_chroms = self.features_df.filter(pl.col("chrom").is_null()).height
     total_chroms = len(self.features_df)
-
-    self.logger.debug(
-
+
+    self.logger.debug(
+        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)"
+    )
+
     if empty_chroms == 0:
         self.logger.info("All chromatograms restored from .sample5 files. No gap-filling needed.")
         return
-
+
     # Get consensus info for gap filling
     consensus_info = {}
     for row in self.consensus_df.iter_rows(named=True):
@@ -717,23 +1066,23 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             "mz": row["mz"],
             "rt": row["rt"],
         }
-
+
     filled_count = 0
-
+
     # Process each sample that has missing chromatograms
-    for sample_uid in tqdm(
-
-
-
+    for sample_uid in tqdm(
+        sample_uids,
+        desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Gap-filling missing chromatograms",
+        disable=tqdm_disable,
+    ):
         # Get features with missing chromatograms for this sample
         missing_features = self.features_df.filter(
-            (pl.col("sample_uid") == sample_uid) &
-            (pl.col("chrom").is_null())
+            (pl.col("sample_uid") == sample_uid) & (pl.col("chrom").is_null()),
         )
-
+
         if missing_features.is_empty():
             continue
-
+
         # Get sample info
         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
         sample_info = sample_row.row(0, named=True)
@@ -745,10 +1094,10 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):

         try:
             # Load sample for MS1 data extraction
-            sample = Sample(log_level=
+            sample = Sample(log_level="WARNING")
             sample._load_sample5(sample_path, map=False)

-            if not hasattr(sample,
+            if not hasattr(sample, "ms1_df") or sample.ms1_df is None or sample.ms1_df.is_empty():
                 continue

             # Process each missing feature
@@ -758,15 +1107,15 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                 rt = feature_row["rt"]
                 rt_start = feature_row.get("rt_start", rt - rt_tol)
                 rt_end = feature_row.get("rt_end", rt + rt_tol)
-
+
                 # Extract EIC from MS1 data
                 d = sample.ms1_df.filter(
-                    (pl.col("mz") >= mz - mz_tol)
-                    (pl.col("mz") <= mz + mz_tol)
-                    (pl.col("rt") >= rt_start - rt_tol)
-                    (pl.col("rt") <= rt_end + rt_tol)
+                    (pl.col("mz") >= mz - mz_tol)
+                    & (pl.col("mz") <= mz + mz_tol)
+                    & (pl.col("rt") >= rt_start - rt_tol)
+                    & (pl.col("rt") <= rt_end + rt_tol),
                 )
-
+
                 # Create chromatogram
                 if d.is_empty():
                     # Create empty chromatogram
@@ -784,7 +1133,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                 else:
                     # Create real chromatogram from data
                     eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
-
+
                     if len(eic_rt) > 4:
                         eic = Chromatogram(
                             eic_rt["rt"].to_numpy(),
@@ -809,14 +1158,14 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                         feature_end=rt_end,
                         feature_apex=rt,
                     )
-
+
                 # Update the chromatogram in the study
                 mask = pl.col("feature_uid") == feature_uid
                 self.features_df = self.features_df.with_columns(
                     pl.when(mask)
                     .then(pl.lit(eic, dtype=pl.Object, allow_object=True))
                     .otherwise(pl.col("chrom"))
-                    .alias("chrom")
+                    .alias("chrom"),
                 )
                 filled_count += 1

@@ -825,12 +1174,14 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             continue

     self.logger.info(f"Phase 2 complete: Gap-filled {filled_count} chromatograms")
-
+
     # Final summary
     final_non_null = self.features_df.filter(pl.col("chrom").is_not_null()).height
     final_total = len(self.features_df)
-
-    self.logger.info(
+
+    self.logger.info(
+        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)"
+    )
     self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")


@@ -839,41 +1190,39 @@ def compress_ms2(self, max_replicates=5):
     Reduce the number of entries matching any pair of (consensus and energy) to max XY rows.
     Groups all rows by consensus_uid and energy. For each group, sort by number_frags * prec_inty,
     and then pick the top XY rows. Discard the others.
-
+
     Parameters:
         max_replicates (int): Maximum number of replicates to keep per consensus_uid and energy combination
     """
     if self.consensus_ms2 is None or self.consensus_ms2.is_empty():
         self.logger.warning("No consensus_ms2 found.")
         return
-
+
     initial_count = len(self.consensus_ms2)
-
+
     # Create a ranking score based on number_frags * prec_inty
     # Handle None values by treating them as 0
     self.consensus_ms2 = self.consensus_ms2.with_columns([
-        (
-            pl.col("number_frags").fill_null(0) *
-            pl.col("prec_inty").fill_null(0)
-        ).alias("ranking_score")
+        (pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)).alias("ranking_score"),
     ])
-
+
     # Group by consensus_uid and energy, then rank by score and keep top max_replicates
     compressed_ms2 = (
-        self.consensus_ms2
-        .with_row_count("row_id")  # Add row numbers for stable sorting
+        self.consensus_ms2.with_row_count("row_id")  # Add row numbers for stable sorting
         .sort(["consensus_uid", "energy", "ranking_score", "row_id"], descending=[False, False, True, False])
         .with_columns([
-            pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank")
+            pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"),
         ])
         .filter(pl.col("rank") < max_replicates)
         .drop(["ranking_score", "row_id", "rank"])
     )
-
+
     self.consensus_ms2 = compressed_ms2
-
+
     removed_count = initial_count - len(self.consensus_ms2)
-    self.logger.info(
+    self.logger.info(
+        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair"
+    )


 def compress_chrom(self):
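The compress_ms2 rewrite above relies on a generic Polars "top-N per group" pattern: sort the frame, rank rows within each group with int_range().over(...), then keep ranks below N. A self-contained sketch of that pattern with toy data (the column names mirror the diff, the values are invented):

```python
import polars as pl

df = pl.DataFrame({
    "consensus_uid": [1, 1, 1, 2, 2],
    "energy": [20, 20, 20, 20, 20],
    "score": [5.0, 9.0, 7.0, 3.0, 8.0],  # stand-in for number_frags * prec_inty
})
max_replicates = 2
top_n = (
    df.sort(["consensus_uid", "energy", "score"], descending=[False, False, True])
    .with_columns(pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"))
    .filter(pl.col("rank") < max_replicates)
    .drop("rank")
)
print(top_n)  # keeps the two highest-scoring rows per (consensus_uid, energy) pair
```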
@@ -886,49 +1235,175 @@ def compress_chrom(self):
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features_df found.")
         return
-
+
     if "chrom" not in self.features_df.columns:
         self.logger.warning("No 'chrom' column found in features_df.")
         return
-
+
     # Count non-null chromatograms before compression
     non_null_count = self.features_df.filter(pl.col("chrom").is_not_null()).height
-
+
     # Set chrom column to None while keeping dtype as object
     self.features_df = self.features_df.with_columns(
-        pl.lit(None, dtype=pl.Object).alias("chrom")
+        pl.lit(None, dtype=pl.Object).alias("chrom"),
     )
-
+
     self.logger.info(f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df")


-def
+def name_replace(self, replace_dict):
     """
-
-    keep the current basename and build an absolute path. Check that the new file exists
-    before overwriting the old file_source.
+    Replace sample names in samples_df based on a dictionary mapping.

+    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
+    all keys with their corresponding values from replace_dict. Checks that all
+    resulting sample names are unique. If unique, replaces the values in self.samples_df.
+
     Parameters:
-
+        replace_dict (dict): Dictionary mapping old names (keys) to new names (values).
+                             All keys found in sample names will be replaced with their
+                             corresponding values.
+                             e.g., {"old_name1": "new_name1", "old_name2": "new_name2"}
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If replace_dict is not a dictionary
+        ValueError: If resulting sample names are not unique
+    """
+    if not isinstance(replace_dict, dict):
+        raise ValueError("replace_dict must be a dictionary")
+
+    if self.samples_df is None or len(self.samples_df) == 0:
+        self.logger.warning("No samples found in study.")
+        return
+
+    if not replace_dict:
+        self.logger.warning("Empty replace_dict provided, no changes made.")
+        return
+
+    # Get current sample names
+    current_names = self.samples_df.get_column("sample_name").to_list()
+
+    # Create a copy and apply replacements
+    new_names = []
+    replaced_count = 0
+
+    for name in current_names:
+        if name in replace_dict:
+            new_names.append(replace_dict[name])
+            replaced_count += 1
+            self.logger.debug(f"Replacing sample name: '{name}' -> '{replace_dict[name]}'")
+        else:
+            new_names.append(name)
+
+    # Check that all new names are unique
+    if len(set(new_names)) != len(new_names):
+        duplicates = []
+        seen = set()
+        for name in new_names:
+            if name in seen:
+                duplicates.append(name)
+            else:
+                seen.add(name)
+        raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
+
+    # If we get here, all names are unique - apply the changes
+    self.samples_df = self.samples_df.with_columns(
+        pl.Series("sample_name", new_names).alias("sample_name"),
+    )

+    self.logger.info(f"Successfully replaced {replaced_count} sample names")
+
+
+def name_reset(self):
+    """
+    Reset sample names to the basename of sample_path without extensions.
+
+    Takes all paths in self.samples_df['sample_path'], extracts the basename,
+    removes file extensions, and checks that all resulting names are unique.
+    If unique, replaces the values in self.samples_df['sample_name'].
+
     Returns:
         None
+
+    Raises:
+        ValueError: If resulting sample names are not unique
+        RuntimeError: If any sample_path is None or empty
     """
     import os

     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
+
+    # Get current sample paths
+    sample_paths = self.samples_df.get_column("sample_path").to_list()
+
+    # Extract basenames without extensions
+    new_names = []
+
+    for i, path in enumerate(sample_paths):
+        if path is None or path == "":
+            raise RuntimeError(f"Sample at index {i} has no sample_path set")
+
+        # Get basename and remove extension(s)
+        basename = os.path.basename(path)
+        # Remove all extensions (handles cases like .tar.gz, .sample5.gz, etc.)
+        name_without_ext = basename
+        while '.' in name_without_ext:
+            name_without_ext = os.path.splitext(name_without_ext)[0]
+
+        new_names.append(name_without_ext)
+        self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")
+
+    # Check that all new names are unique
+    if len(set(new_names)) != len(new_names):
+        duplicates = []
+        seen = set()
+        for name in new_names:
+            if name in seen:
+                duplicates.append(name)
+            else:
+                seen.add(name)
+        raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")

+    # If we get here, all names are unique - apply the changes
+    self.samples_df = self.samples_df.with_columns(
+        pl.Series("sample_name", new_names).alias("sample_name"),
+    )
+
+    self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
+
+
+def set_source(self, filename):
+    """
+    Reassign file_source for all samples in samples_df. If filename contains only a path,
+    keep the current basename and build an absolute path. Check that the new file exists
+    before overwriting the old file_source.
+
+    Parameters:
+        filename (str): New file path or directory path for all samples
+
+    Returns:
+        None
+    """
+    import os
+
+    if self.samples_df is None or len(self.samples_df) == 0:
+        self.logger.warning("No samples found in study.")
+        return
+
     updated_count = 0
     failed_count = 0
-
+
     # Get all current file_source values
     current_sources = self.samples_df.get_column("file_source").to_list()
     sample_names = self.samples_df.get_column("sample_name").to_list()
-
+
     new_sources = []
-
+
     for i, (current_source, sample_name) in enumerate(zip(current_sources, sample_names)):
         # Check if filename is just a directory path
         if os.path.isdir(filename):
@@ -937,7 +1412,7 @@ def set_source(self, filename):
             new_sources.append(current_source)
             failed_count += 1
             continue
-
+
         # Get the basename from current file_source
         current_basename = os.path.basename(current_source)
         # Build new absolute path
@@ -945,26 +1420,26 @@ def set_source(self, filename):
         else:
             # filename is a full path, make it absolute
             new_file_path = os.path.abspath(filename)
-
+
         # Check if the new file exists
         if not os.path.exists(new_file_path):
             self.logger.warning(f"File does not exist for sample '{sample_name}': {new_file_path}")
             new_sources.append(current_source)
             failed_count += 1
             continue
-
+
         # File exists, update source
         new_sources.append(new_file_path)
         updated_count += 1
-
+
         # Log individual updates at debug level
         self.logger.debug(f"Updated file_source for sample '{sample_name}': {current_source} -> {new_file_path}")
-
+
     # Update the samples_df with new file_source values
     self.samples_df = self.samples_df.with_columns(
-        pl.Series("file_source", new_sources).alias("file_source")
+        pl.Series("file_source", new_sources).alias("file_source"),
     )
-
+
     # Log summary
     if updated_count > 0:
         self.logger.info(f"Updated file_source for {updated_count} samples")
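A hedged usage sketch for the sample-renaming and source-repointing helpers added above; names and paths are invented, and the helpers are assumed to be exposed as Study methods like the other helpers in this module.

```python
# All names/paths below are hypothetical.
study.name_replace({"20240101_run07": "QC_01"})  # old -> new names; result must stay unique
study.name_reset()                               # rebuild names from sample_path basenames
study.set_source("/data/raw")                    # re-point file_source; targets must exist on disk
```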
@@ -990,9 +1465,9 @@ def features_select(
|
|
|
990
1465
|
):
|
|
991
1466
|
"""
|
|
992
1467
|
Select features from features_df based on specified criteria and return the filtered DataFrame.
|
|
993
|
-
|
|
1468
|
+
|
|
994
1469
|
OPTIMIZED VERSION: Combines all filters into a single operation for better performance.
|
|
995
|
-
|
|
1470
|
+
|
|
996
1471
|
Parameters:
|
|
997
1472
|
mz: m/z range filter (tuple for range, single value for minimum)
|
|
998
1473
|
rt: retention time range filter (tuple for range, single value for minimum)
|
|
@@ -1007,30 +1482,42 @@ def features_select(
|
|
|
1007
1482
|
chrom_prominence: chromatogram prominence filter (tuple for range, single value for minimum)
|
|
1008
1483
|
chrom_prominence_scaled: scaled chromatogram prominence filter (tuple for range, single value for minimum)
|
|
1009
1484
|
chrom_height_scaled: scaled chromatogram height filter (tuple for range, single value for minimum)
|
|
1010
|
-
|
|
1485
|
+
|
|
1011
1486
|
Returns:
|
|
1012
1487
|
polars.DataFrame: Filtered features DataFrame
|
|
1013
1488
|
"""
|
|
1014
1489
|
if self.features_df is None or self.features_df.is_empty():
|
|
1015
1490
|
self.logger.warning("No features found in study.")
|
|
1016
1491
|
return pl.DataFrame()
|
|
1017
|
-
|
|
1492
|
+
|
|
1018
1493
|
# Early return if no filters provided - performance optimization
|
|
1019
|
-
filter_params = [
|
|
1020
|
-
|
|
1021
|
-
|
|
1494
|
+
filter_params = [
|
|
1495
|
+
mz,
|
|
1496
|
+
rt,
|
|
1497
|
+
inty,
|
|
1498
|
+
sample_uid,
|
|
1499
|
+
sample_name,
|
|
1500
|
+
consensus_uid,
|
|
1501
|
+
feature_uid,
|
|
1502
|
+
filled,
|
|
1503
|
+
quality,
|
|
1504
|
+
chrom_coherence,
|
|
1505
|
+
chrom_prominence,
|
|
1506
|
+
chrom_prominence_scaled,
|
|
1507
|
+
chrom_height_scaled,
|
|
1508
|
+
]
|
|
1022
1509
|
if all(param is None for param in filter_params):
|
|
1023
1510
|
return self.features_df.clone()
|
|
1024
|
-
|
|
1511
|
+
|
|
1025
1512
|
initial_count = len(self.features_df)
|
|
1026
|
-
|
|
1513
|
+
|
|
1027
1514
|
# Pre-check available columns once for efficiency
|
|
1028
1515
|
available_columns = set(self.features_df.columns)
|
|
1029
|
-
|
|
1516
|
+
|
|
1030
1517
|
# Build all filter conditions first, then apply them all at once
|
|
1031
1518
|
filter_conditions = []
|
|
1032
1519
|
warnings = []
|
|
1033
|
-
|
|
1520
|
+
|
|
1034
1521
|
# Filter by m/z
|
|
1035
1522
|
if mz is not None:
|
|
1036
1523
|
if isinstance(mz, tuple) and len(mz) == 2:
|
|
@@ -1038,7 +1525,7 @@ def features_select(
|
|
|
1038
1525
|
filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
|
|
1039
1526
|
else:
|
|
1040
1527
|
filter_conditions.append(pl.col("mz") >= mz)
|
|
1041
|
-
|
|
1528
|
+
|
|
1042
1529
|
# Filter by retention time
|
|
1043
1530
|
if rt is not None:
|
|
1044
1531
|
if isinstance(rt, tuple) and len(rt) == 2:
|
|
@@ -1046,7 +1533,7 @@ def features_select(
|
|
|
1046
1533
|
filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
|
|
1047
1534
|
else:
|
|
1048
1535
|
filter_conditions.append(pl.col("rt") >= rt)
|
|
1049
|
-
|
|
1536
|
+
|
|
1050
1537
|
# Filter by intensity
|
|
1051
1538
|
if inty is not None:
|
|
1052
1539
|
if isinstance(inty, tuple) and len(inty) == 2:
|
|
@@ -1054,7 +1541,7 @@ def features_select(
|
|
|
1054
1541
|
filter_conditions.append((pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty))
|
|
1055
1542
|
else:
|
|
1056
1543
|
filter_conditions.append(pl.col("inty") >= inty)
|
|
1057
|
-
|
|
1544
|
+
|
|
1058
1545
|
# Filter by sample_uid
|
|
1059
1546
|
if sample_uid is not None:
|
|
1060
1547
|
if isinstance(sample_uid, (list, tuple)):
|
|
@@ -1067,24 +1554,24 @@ def features_select(
|
|
|
1067
1554
|
filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
|
|
1068
1555
|
else:
|
|
1069
1556
|
filter_conditions.append(pl.col("sample_uid") == sample_uid)
|
|
1070
|
-
|
|
1557
|
+
|
|
1071
1558
|
# Filter by sample_name (requires pre-processing)
|
|
1072
1559
|
if sample_name is not None:
|
|
1073
1560
|
# Get sample_uids for the given sample names
|
|
1074
1561
|
if isinstance(sample_name, list):
|
|
1075
1562
|
sample_uids_for_names = self.samples_df.filter(
|
|
1076
|
-
pl.col("sample_name").is_in(sample_name)
|
|
1563
|
+
pl.col("sample_name").is_in(sample_name),
|
|
1077
1564
|
)["sample_uid"].to_list()
|
|
1078
1565
|
else:
|
|
1079
1566
|
sample_uids_for_names = self.samples_df.filter(
|
|
1080
|
-
pl.col("sample_name") == sample_name
|
|
1567
|
+
pl.col("sample_name") == sample_name,
|
|
1081
1568
|
)["sample_uid"].to_list()
|
|
1082
|
-
|
|
1569
|
+
|
|
1083
1570
|
if sample_uids_for_names:
|
|
1084
1571
|
filter_conditions.append(pl.col("sample_uid").is_in(sample_uids_for_names))
|
|
1085
1572
|
else:
|
|
1086
1573
|
filter_conditions.append(pl.lit(False)) # No matching samples
|
|
1087
|
-
|
|
1574
|
+
|
|
1088
1575
|
# Filter by consensus_uid
|
|
1089
1576
|
if consensus_uid is not None:
|
|
1090
1577
|
if isinstance(consensus_uid, (list, tuple)):
|
|
@@ -1097,7 +1584,7 @@ def features_select(
|
|
|
1097
1584
|
filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
|
|
1098
1585
|
else:
|
|
1099
1586
|
filter_conditions.append(pl.col("consensus_uid") == consensus_uid)
|
|
1100
|
-
|
|
1587
|
+
|
|
1101
1588
|
# Filter by feature_uid
|
|
1102
1589
|
if feature_uid is not None:
|
|
1103
1590
|
if isinstance(feature_uid, (list, tuple)):
|
|
@@ -1110,7 +1597,7 @@ def features_select(
|
|
|
1110
1597
|
filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
|
|
1111
1598
|
else:
|
|
1112
1599
|
filter_conditions.append(pl.col("feature_uid") == feature_uid)
|
|
1113
|
-
|
|
1600
|
+
|
|
1114
1601
|
# Filter by filled status
|
|
1115
1602
|
if filled is not None:
|
|
1116
1603
|
if "filled" in available_columns:
|
|
@@ -1120,7 +1607,7 @@ def features_select(
|
|
|
1120
1607
|
filter_conditions.append(~pl.col("filled") | pl.col("filled").is_null())
|
|
1121
1608
|
else:
|
|
1122
1609
|
warnings.append("'filled' column not found in features_df")
|
|
1123
|
-
|
|
1610
|
+
|
|
1124
1611
|
# Filter by quality
|
|
1125
1612
|
if quality is not None:
|
|
1126
1613
|
if "quality" in available_columns:
|
|
@@ -1131,73 +1618,83 @@ def features_select(
             filter_conditions.append(pl.col("quality") >= quality)
         else:
             warnings.append("'quality' column not found in features_df")
-
+
     # Filter by chromatogram coherence
     if chrom_coherence is not None:
         if "chrom_coherence" in available_columns:
             if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
                 min_coherence, max_coherence = chrom_coherence
-                filter_conditions.append(
+                filter_conditions.append(
+                    (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
         else:
             warnings.append("'chrom_coherence' column not found in features_df")
-
+
     # Filter by chromatogram prominence
     if chrom_prominence is not None:
         if "chrom_prominence" in available_columns:
             if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
                 min_prominence, max_prominence = chrom_prominence
-                filter_conditions.append(
+                filter_conditions.append(
+                    (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
         else:
             warnings.append("'chrom_prominence' column not found in features_df")
-
+
     # Filter by scaled chromatogram prominence
     if chrom_prominence_scaled is not None:
         if "chrom_prominence_scaled" in available_columns:
             if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
                 min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
-                filter_conditions.append(
+                filter_conditions.append(
+                    (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
+                    & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
         else:
             warnings.append("'chrom_prominence_scaled' column not found in features_df")
-
+
     # Filter by scaled chromatogram height
     if chrom_height_scaled is not None:
         if "chrom_height_scaled" in available_columns:
             if isinstance(chrom_height_scaled, tuple) and len(chrom_height_scaled) == 2:
                 min_height_scaled, max_height_scaled = chrom_height_scaled
-                filter_conditions.append(
+                filter_conditions.append(
+                    (pl.col("chrom_height_scaled") >= min_height_scaled)
+                    & (pl.col("chrom_height_scaled") <= max_height_scaled)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
         else:
             warnings.append("'chrom_height_scaled' column not found in features_df")
-
+
     # Log all warnings once at the end for efficiency
     for warning in warnings:
         self.logger.warning(warning)
-
+
     # Apply all filters at once using lazy evaluation for optimal performance
     if filter_conditions:
         # Combine all conditions with AND
         combined_filter = filter_conditions[0]
         for condition in filter_conditions[1:]:
             combined_filter = combined_filter & condition
-
+
         # Apply the combined filter using lazy evaluation
         feats = self.features_df.lazy().filter(combined_filter).collect()
     else:
         feats = self.features_df.clone()
-
+
     final_count = len(feats)
-
+
     if final_count == 0:
         self.logger.warning("No features remaining after applying selection criteria.")
     else:
-        #removed_count = initial_count - final_count
+        # removed_count = initial_count - final_count
         self.logger.info(f"Features selected: {final_count} (out of {initial_count})")

     return feats
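The features_select hunks above are formatting-only (multi-line `filter_conditions.append(...)` calls, a comment spacing fix, and trailing-whitespace cleanup); the selection semantics are unchanged: a 2-tuple is treated as an inclusive range, a single value as a minimum threshold, missing columns only produce a warning, and all conditions are AND-combined in one lazy Polars filter. A minimal usage sketch, assuming these module-level helpers are bound to a loaded Study object; the `study` variable and the thresholds below are illustrative, not part of this diff:

    # hypothetical call; returns a filtered polars DataFrame without modifying the study
    good = study.features_select(
        quality=0.5,                   # single value -> minimum threshold
        chrom_coherence=(0.3, 1.0),    # 2-tuple -> inclusive range
        chrom_height_scaled=0.1,
    )
    print(f"{len(good)} features pass the selection")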
@@ -1207,29 +1704,29 @@ def features_filter(self, features):
     """
     Filter features_df by keeping only features that match the given criteria.
     This keeps only the specified features and removes all others.
-
+
     OPTIMIZED VERSION: Batch operations and reduced overhead for better performance.
-
+
     Parameters:
         features: Features to keep. Can be:
             - polars.DataFrame: Features DataFrame (will use feature_uid column)
             - list: List of feature_uids to keep
             - int: Single feature_uid to keep
-
+
     Returns:
         None (modifies self.features_df in place)
     """
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features found in study.")
         return
-
+
     # Early return if no features provided
     if features is None:
         self.logger.warning("No features provided for filtering.")
         return
-
+
     initial_count = len(self.features_df)
-
+
     # Determine feature_uids to keep - optimized type checking
     if isinstance(features, pl.DataFrame):
         if "feature_uid" not in features.columns:
@@ -1243,44 +1740,41 @@ def features_filter(self, features):
     else:
         self.logger.error("features parameter must be a DataFrame, list, tuple, or int")
         return
-
+
     # Early return if no UIDs to keep
     if not feature_uids_to_keep:
         self.logger.warning("No feature UIDs provided for filtering.")
         return
-
+
     # Convert to set for faster lookup if list is large
     if len(feature_uids_to_keep) > 100:
         feature_uids_set = set(feature_uids_to_keep)
         # Use the set for filtering if it's significantly smaller
         if len(feature_uids_set) < len(feature_uids_to_keep) * 0.8:
             feature_uids_to_keep = list(feature_uids_set)
-
+
     # Create filter condition once - keep only the specified features
     filter_condition = pl.col("feature_uid").is_in(feature_uids_to_keep)
-
+
     # Apply filter to features_df using lazy evaluation for better performance
     self.features_df = self.features_df.lazy().filter(filter_condition).collect()
-
+
     # Apply filter to consensus_mapping_df if it exists - batch operation
     mapping_removed_count = 0
     if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
-        self.consensus_mapping_df = (
-            self.consensus_mapping_df
-            .lazy()
-            .filter(filter_condition)
-            .collect()
-        )
+        self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
         mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
-
+
     # Calculate results once and log efficiently
     final_count = len(self.features_df)
     removed_count = initial_count - final_count
-
+
     # Single comprehensive log message
     if mapping_removed_count > 0:
-        self.logger.info(
+        self.logger.info(
+            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features."
+        )
     else:
         self.logger.info(f"Kept {final_count} features. Filtered out {removed_count} features.")

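features_filter accepts the identifier types its docstring lists (a DataFrame with a feature_uid column, a list, or a single int) and, per the reformatted hunk above, now collapses the consensus_mapping_df pruning into a single lazy filter/collect chain. A short sketch of the intended round trip, under the same Study-bound call assumption as the earlier example; the names and uids are illustrative:

    # hypothetical: select by criteria, then keep only those features in place
    selected = study.features_select(quality=0.5)
    study.features_filter(selected)          # DataFrame input uses its feature_uid column
    study.features_filter([101, 102, 103])   # a plain list of feature_uids also works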
@@ -1289,27 +1783,27 @@ def features_delete(self, features):
     """
     Delete features from features_df based on feature identifiers.
     This removes the specified features and keeps all others (opposite of features_filter).
-
+
     Parameters:
         features: Features to delete. Can be:
             - polars.DataFrame: Features DataFrame (will use feature_uid column)
             - list: List of feature_uids to delete
             - int: Single feature_uid to delete
-
+
     Returns:
         None (modifies self.features_df in place)
     """
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features found in study.")
         return
-
+
     # Early return if no features provided
     if features is None:
         self.logger.warning("No features provided for deletion.")
         return
-
+
     initial_count = len(self.features_df)
-
+
     # Determine feature_uids to remove - optimized type checking
     if isinstance(features, pl.DataFrame):
         if "feature_uid" not in features.columns:
@@ -1323,44 +1817,41 @@ def features_delete(self, features):
     else:
         self.logger.error("features parameter must be a DataFrame, list, tuple, or int")
         return
-
+
     # Early return if no UIDs to remove
     if not feature_uids_to_remove:
         self.logger.warning("No feature UIDs provided for deletion.")
         return
-
+
     # Convert to set for faster lookup if list is large
     if len(feature_uids_to_remove) > 100:
         feature_uids_set = set(feature_uids_to_remove)
         # Use the set for filtering if it's significantly smaller
         if len(feature_uids_set) < len(feature_uids_to_remove) * 0.8:
             feature_uids_to_remove = list(feature_uids_set)
-
+
     # Create filter condition - remove specified features
     filter_condition = ~pl.col("feature_uid").is_in(feature_uids_to_remove)
-
+
     # Apply filter to features_df using lazy evaluation for better performance
     self.features_df = self.features_df.lazy().filter(filter_condition).collect()
-
+
     # Apply filter to consensus_mapping_df if it exists - batch operation
     mapping_removed_count = 0
     if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
-        self.consensus_mapping_df = (
-            self.consensus_mapping_df
-            .lazy()
-            .filter(filter_condition)
-            .collect()
-        )
+        self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
         mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
-
+
     # Calculate results once and log efficiently
     final_count = len(self.features_df)
     removed_count = initial_count - final_count
-
+
     # Single comprehensive log message
     if mapping_removed_count > 0:
-        self.logger.info(
+        self.logger.info(
+            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}"
+        )
     else:
         self.logger.info(f"Deleted {removed_count} features. Remaining features: {final_count}")

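features_delete mirrors features_filter with the filter condition negated: the listed feature_uids are dropped and everything else is kept, with the same consensus_mapping_df cleanup. A one-line sketch under the same assumptions as above:

    # hypothetical: remove features by uid (list or single int)
    study.features_delete([42, 77])
    study.features_delete(9001)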
@@ -1384,7 +1875,7 @@ def consensus_select(
 ):
     """
     Select consensus features from consensus_df based on specified criteria and return the filtered DataFrame.
-
+
     Parameters:
         mz: m/z range filter (tuple for range, single value for minimum)
         rt: retention time range filter (tuple for range, single value for minimum)
@@ -1400,17 +1891,17 @@ def consensus_select(
         chrom_prominence_scaled_mean: mean scaled chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_height_scaled_mean: mean scaled chromatogram height filter (tuple for range, single value for minimum)
         rt_delta_mean: mean RT delta filter (tuple for range, single value for minimum)
-
+
     Returns:
         polars.DataFrame: Filtered consensus DataFrame
     """
     if self.consensus_df is None or self.consensus_df.is_empty():
         self.logger.warning("No consensus features found in study.")
         return pl.DataFrame()
-
+
     consensus = self.consensus_df.clone()
     initial_count = len(consensus)
-
+
     # Filter by m/z
     if mz is not None:
         consensus_len_before_filter = len(consensus)
@@ -1420,9 +1911,9 @@ def consensus_select(
         else:
             consensus = consensus.filter(pl.col("mz") >= mz)
         self.logger.debug(
-            f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by retention time
     if rt is not None:
         consensus_len_before_filter = len(consensus)
@@ -1432,9 +1923,9 @@ def consensus_select(
         else:
             consensus = consensus.filter(pl.col("rt") >= rt)
         self.logger.debug(
-            f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean intensity
     if inty_mean is not None:
         consensus_len_before_filter = len(consensus)
@@ -1444,9 +1935,9 @@ def consensus_select(
         else:
             consensus = consensus.filter(pl.col("inty_mean") >= inty_mean)
         self.logger.debug(
-            f"Selected consensus by inty_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by inty_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by consensus_uid
     if consensus_uid is not None:
         consensus_len_before_filter = len(consensus)
@@ -1454,16 +1945,18 @@ def consensus_select(
             if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
                 # Treat as range
                 min_uid, max_uid = consensus_uid
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid)
+                )
             else:
                 # Treat as list
                 consensus = consensus.filter(pl.col("consensus_uid").is_in(consensus_uid))
         else:
             consensus = consensus.filter(pl.col("consensus_uid") == consensus_uid)
         self.logger.debug(
-            f"Selected consensus by consensus_uid. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by consensus_uid. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by consensus_id
     if consensus_id is not None:
         consensus_len_before_filter = len(consensus)
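Worth noting in the reformatted block above: a 2-element tuple passed as consensus_uid is interpreted as an inclusive (min, max) range, while a list, even of length 2, is treated as an explicit set of uids. A sketch of the distinction, under the same illustrative Study-bound call assumption as the earlier examples:

    study.consensus_select(consensus_uid=(100, 200))   # tuple -> uids 100 through 200
    study.consensus_select(consensus_uid=[100, 200])   # list  -> exactly uids 100 and 200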
@@ -1472,21 +1965,23 @@ def consensus_select(
         else:
             consensus = consensus.filter(pl.col("consensus_id") == consensus_id)
         self.logger.debug(
-            f"Selected consensus by consensus_id. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by consensus_id. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by number of samples
     if number_samples is not None:
         consensus_len_before_filter = len(consensus)
         if isinstance(number_samples, tuple) and len(number_samples) == 2:
             min_samples, max_samples = number_samples
-            consensus = consensus.filter(
+            consensus = consensus.filter(
+                (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples)
+            )
         else:
             consensus = consensus.filter(pl.col("number_samples") >= number_samples)
         self.logger.debug(
-            f"Selected consensus by number_samples. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by number_samples. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by number of MS2 spectra
     if number_ms2 is not None:
         consensus_len_before_filter = len(consensus)
@@ -1499,9 +1994,9 @@ def consensus_select(
         else:
             self.logger.warning("'number_ms2' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by number_ms2. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by number_ms2. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by quality
     if quality is not None:
         consensus_len_before_filter = len(consensus)
@@ -1511,9 +2006,9 @@ def consensus_select(
         else:
             consensus = consensus.filter(pl.col("quality") >= quality)
         self.logger.debug(
-            f"Selected consensus by quality. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by quality. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by baseline
     if bl is not None:
         consensus_len_before_filter = len(consensus)
@@ -1526,89 +2021,103 @@ def consensus_select(
         else:
             self.logger.warning("'bl' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by bl. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by bl. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean chromatogram coherence
     if chrom_coherence_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_coherence_mean" in consensus.columns:
             if isinstance(chrom_coherence_mean, tuple) and len(chrom_coherence_mean) == 2:
                 min_coherence, max_coherence = chrom_coherence_mean
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    (pl.col("chrom_coherence_mean") >= min_coherence)
+                    & (pl.col("chrom_coherence_mean") <= max_coherence)
+                )
             else:
                 consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
         else:
             self.logger.warning("'chrom_coherence_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean chromatogram prominence
     if chrom_prominence_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_prominence_mean" in consensus.columns:
             if isinstance(chrom_prominence_mean, tuple) and len(chrom_prominence_mean) == 2:
                 min_prominence, max_prominence = chrom_prominence_mean
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    (pl.col("chrom_prominence_mean") >= min_prominence)
+                    & (pl.col("chrom_prominence_mean") <= max_prominence)
+                )
             else:
                 consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
         else:
             self.logger.warning("'chrom_prominence_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean scaled chromatogram prominence
     if chrom_prominence_scaled_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_prominence_scaled_mean" in consensus.columns:
             if isinstance(chrom_prominence_scaled_mean, tuple) and len(chrom_prominence_scaled_mean) == 2:
                 min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
+                    & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled)
+                )
             else:
                 consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
         else:
             self.logger.warning("'chrom_prominence_scaled_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean scaled chromatogram height
     if chrom_height_scaled_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_height_scaled_mean" in consensus.columns:
             if isinstance(chrom_height_scaled_mean, tuple) and len(chrom_height_scaled_mean) == 2:
                 min_height_scaled, max_height_scaled = chrom_height_scaled_mean
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
+                    & (pl.col("chrom_height_scaled_mean") <= max_height_scaled)
+                )
             else:
                 consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
         else:
             self.logger.warning("'chrom_height_scaled_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean RT delta
     if rt_delta_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "rt_delta_mean" in consensus.columns:
             if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
                 min_rt_delta, max_rt_delta = rt_delta_mean
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta)
+                )
             else:
                 consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
         else:
             self.logger.warning("'rt_delta_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by rt_delta_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by rt_delta_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     if len(consensus) == 0:
         self.logger.warning("No consensus features remaining after applying selection criteria.")
     else:
         self.logger.info(f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})")
-
+
     return consensus

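Apart from the multi-line filter calls and whitespace cleanup, the only recurring change through the consensus_select hunks is the trailing comma added to each logger.debug f-string argument; behaviour is otherwise identical, and the function still returns a filtered copy rather than modifying consensus_df in place. A usage sketch under the same assumptions as the earlier examples (thresholds are illustrative):

    # hypothetical: keep well-supported, well-aligned consensus features
    hits = study.consensus_select(
        number_samples=(5, 50),    # present in 5 to 50 samples
        inty_mean=1e5,             # minimum mean intensity
        rt_delta_mean=(0.0, 2.0),  # tight RT agreement across samples
    )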
@@ -1616,22 +2125,22 @@ def consensus_filter(self, consensus):
     """
     Filter consensus_df by removing all consensus features that match the given criteria.
     This also removes related entries from consensus_mapping_df, features_df, and consensus_ms2.
-
+
     Parameters:
         consensus: Consensus features to remove. Can be:
             - polars.DataFrame: Consensus DataFrame (will use consensus_uid column)
             - list: List of consensus_uids to remove
             - int: Single consensus_uid to remove
-
+
     Returns:
         None (modifies self.consensus_df and related DataFrames in place)
     """
     if self.consensus_df is None or self.consensus_df.is_empty():
         self.logger.warning("No consensus features found in study.")
         return
-
+
     initial_consensus_count = len(self.consensus_df)
-
+
     # Determine consensus_uids to remove
     if isinstance(consensus, pl.DataFrame):
         if "consensus_uid" not in consensus.columns:
@@ -1645,68 +2154,70 @@ def consensus_filter(self, consensus):
     else:
         self.logger.error("consensus parameter must be a DataFrame, list, or int")
         return
-
+
     if not consensus_uids_to_remove:
         self.logger.warning("No consensus UIDs provided for filtering.")
         return
-
+
     # Get feature_uids that need to be removed from features_df
     feature_uids_to_remove = []
     if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         feature_uids_to_remove = self.consensus_mapping_df.filter(
-            pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+            pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )["feature_uid"].to_list()
-
+
     # Remove consensus features from consensus_df
     self.consensus_df = self.consensus_df.filter(
-        ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+        ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
     )
-
+
     # Remove from consensus_mapping_df
     if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
         self.consensus_mapping_df = self.consensus_mapping_df.filter(
-            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )
         removed_mapping_count = initial_mapping_count - len(self.consensus_mapping_df)
         if removed_mapping_count > 0:
             self.logger.debug(f"Removed {removed_mapping_count} entries from consensus_mapping_df")
-
+
     # Remove corresponding features from features_df
     if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
         initial_features_count = len(self.features_df)
         self.features_df = self.features_df.filter(
-            ~pl.col("feature_uid").is_in(feature_uids_to_remove)
+            ~pl.col("feature_uid").is_in(feature_uids_to_remove),
        )
         removed_features_count = initial_features_count - len(self.features_df)
         if removed_features_count > 0:
             self.logger.debug(f"Removed {removed_features_count} entries from features_df")
-
+
     # Remove from consensus_ms2 if it exists
-    if hasattr(self,
+    if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
         initial_ms2_count = len(self.consensus_ms2)
         self.consensus_ms2 = self.consensus_ms2.filter(
-            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )
         removed_ms2_count = initial_ms2_count - len(self.consensus_ms2)
         if removed_ms2_count > 0:
             self.logger.debug(f"Removed {removed_ms2_count} entries from consensus_ms2")
-
+
     removed_consensus_count = initial_consensus_count - len(self.consensus_df)
-    self.logger.info(
+    self.logger.info(
+        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}"
+    )


 def consensus_delete(self, consensus):
     """
     Delete consensus features from consensus_df based on consensus identifiers.
     This is an alias for consensus_filter for consistency with other delete methods.
-
+
     Parameters:
         consensus: Consensus features to delete. Can be:
             - polars.DataFrame: Consensus DataFrame (will use consensus_uid column)
             - list: List of consensus_uids to delete
             - int: Single consensus_uid to delete
-
+
     Returns:
         None (modifies self.consensus_df and related DataFrames in place)
     """
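consensus_filter removes the matching consensus features and cascades the removal through consensus_mapping_df, features_df, and consensus_ms2, while consensus_delete, whose docstring closes this hunk, is documented as an alias kept for naming consistency with the other delete helpers. A final sketch combining it with consensus_select, under the same illustrative assumptions as the earlier examples:

    # hypothetical: drop consensus features seen in only one or two samples
    weak = study.consensus_select(number_samples=(1, 2))
    study.consensus_filter(weak)   # also prunes mapping, feature, and MS2 rows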