masster 0.3.11__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/sample/helpers.py +53 -4
- masster/sample/plot.py +100 -16
- masster/sample/sample.py +6 -0
- masster/sample/sample5_schema.json +43 -34
- masster/study/defaults/align_def.py +10 -10
- masster/study/helpers.py +466 -3
- masster/study/load.py +6 -0
- masster/study/plot.py +809 -130
- masster/study/processing.py +35 -10
- masster/study/study.py +60 -4
- masster/study/study5_schema.json +83 -83
- {masster-0.3.11.dist-info → masster-0.3.12.dist-info}/METADATA +1 -1
- {masster-0.3.11.dist-info → masster-0.3.12.dist-info}/RECORD +16 -16
- {masster-0.3.11.dist-info → masster-0.3.12.dist-info}/WHEEL +0 -0
- {masster-0.3.11.dist-info → masster-0.3.12.dist-info}/entry_points.txt +0 -0
- {masster-0.3.11.dist-info → masster-0.3.12.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py
CHANGED
|
@@ -7,6 +7,289 @@ import pandas as pd
|
|
|
7
7
|
import polars as pl
|
|
8
8
|
|
|
9
9
|
from tqdm import tqdm
|
|
10
|
+
from masster.chromatogram import Chromatogram
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
    """
    Return a Chromatogram object containing the Base Peak Chromatogram (BPC).

    The BPC is computed from the sample's `ms1_df` as the maximum `inty`
    observed for each distinct `rt` value, sorted by `rt`.

    Parameters:
        owner: Either a Sample-like object (anything exposing `ms1_df`) or a
            Study-like object. In the latter case the sample is resolved with
            `get_sample(owner, sample)`.
        sample: Sample identifier (int sample_uid, str sample_name or Sample
            instance). Required when `owner` is a Study.
        rt_unit (str): Retention time unit forwarded to the Chromatogram.
        label (str): Optional label; defaults to "Base Peak Chromatogram".
        original (bool): When exactly True, try to map the BPC retention times
            back to the un-aligned values using the `rt` / `rt_original` pairs
            found in the study's `features_df`. The mapping is best-effort: if
            it cannot be built, the BPC is returned with its current RTs.

    Returns:
        Chromatogram

    Raises:
        ValueError: If the sample cannot be resolved, has no `ms1_df`, or the
            computed BPC is empty.
    """
    s = owner if hasattr(owner, "ms1_df") else get_sample(owner, sample)

    if s is None:
        raise ValueError("Could not resolve sample for BPC computation")

    if getattr(s, "ms1_df", None) is None:
        raise ValueError("Sample has no ms1_df for BPC computation")

    try:
        if not all(c in s.ms1_df.columns for c in ("rt", "inty")):
            raise RuntimeError("ms1_df missing required columns")

        # `group_by` is the current polars spelling (the old `groupby` alias is
        # gone in recent releases); get_eic in this module already relies on it.
        bpc = s.ms1_df.select([pl.col("rt"), pl.col("inty")])
        bpc = bpc.group_by("rt").agg(pl.col("inty").max().alias("inty"))
        bpc_pd = bpc.to_pandas().sort_values("rt")
    except Exception:
        # Fallback: do the aggregation in pandas. Errors raised here propagate.
        bpc_pd = s.ms1_df.to_pandas()[["rt", "inty"]]
        bpc_pd = bpc_pd.groupby("rt").agg({"inty": "max"}).reset_index().sort_values("rt")

    if bpc_pd.empty:
        raise ValueError("Computed BPC is empty")

    # Note: `original` defaults to False (return current/aligned RTs).
    if original is True:
        try:
            bpc_pd = _bpc_restore_original_rt(bpc_pd, owner, s, sample)
        except Exception:
            # Deliberately best-effort: if the mapping fails, continue and
            # return the BPC with the RTs as computed above.
            pass

    return Chromatogram(
        rt=bpc_pd["rt"].to_numpy(),
        inty=bpc_pd["inty"].to_numpy(),
        label=label or "Base Peak Chromatogram",
        rt_unit=rt_unit,
    )


def _bpc_filter_eq_or_empty(frame, column, value):
    """Return rows of `frame` where `column == value`, or an empty frame if the filter fails."""
    try:
        return frame.filter(pl.col(column) == value)
    except Exception:
        return pl.DataFrame()


def _bpc_restore_original_rt(bpc_pd, owner, s, sample):
    """Return `bpc_pd` with `rt` interpolated onto `rt_original`, or unchanged if no mapping exists."""
    import numpy as _np

    # The mapping lives in a Study's features_df; a Sample owner may expose its
    # Study through a `study` attribute (not guaranteed).
    study = owner if hasattr(owner, "features_df") else getattr(owner, "study", None)
    feats = getattr(study, "features_df", None) if study is not None else None
    if feats is None:
        return bpc_pd

    # Prefer matching by sample_uid, fall back to sample_name.
    mapping_rows = None
    if sample is not None:
        mapping_rows = _bpc_filter_eq_or_empty(feats, "sample_uid", sample)
        if mapping_rows.is_empty():
            mapping_rows = _bpc_filter_eq_or_empty(feats, "sample_name", sample)

    # Last resort: match the features' sample_path against the Sample's `file`.
    # NOTE(review): the guard checks `sample_path` but the value read is `file`;
    # kept as in the original - confirm which attribute is intended.
    if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(s, "sample_path"):
        mapping_rows = _bpc_filter_eq_or_empty(feats, "sample_path", getattr(s, "file", None))

    if mapping_rows is None or mapping_rows.is_empty():
        return bpc_pd

    # Collect (rt, rt_original) pairs, dropping rows with missing values.
    map_pd = mapping_rows.select(["rt", "rt_original"]).to_pandas().dropna()
    if map_pd.empty:
        return bpc_pd

    map_pd = map_pd.sort_values("rt")
    x = map_pd["rt"].to_numpy()
    y = map_pd["rt_original"].to_numpy()
    # At least two points are required to interpolate.
    if x.size < 2:
        return bpc_pd

    # Linear interpolation current rt -> original rt; values outside the known
    # range are clipped to the endpoint values by numpy.interp.
    remapped = bpc_pd.copy()
    remapped["rt"] = _np.interp(bpc_pd["rt"].to_numpy(), x, y)
    return remapped
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def get_tic(owner, sample=None, label=None):
    """
    Return a Chromatogram object containing the Total Ion Chromatogram (TIC).

    The TIC is the sum of `inty` per distinct `rt` value in the sample's
    `ms1_df`. When that computation is not possible (no `ms1_df`, missing
    columns, or any other error), the function falls back to the `inty_tot`
    column of the MS level 1 rows in `scans_df`.

    Parameters:
        owner: Either a Sample-like object (anything exposing `ms1_df`) or a
            Study-like object, in which case `sample` selects the sample via
            `get_sample(owner, sample)`.
        sample: Sample identifier; required when `owner` is a Study.
        label (str): Optional label; defaults to "Total Ion Chromatogram".

    Returns:
        Chromatogram

    Raises:
        ValueError: If the sample cannot be resolved, neither data source is
            usable, or the computed TIC is empty.
    """
    s = owner if hasattr(owner, "ms1_df") else get_sample(owner, sample)

    if s is None:
        raise ValueError("Could not resolve sample for TIC computation")

    # Prefer ms1_df.
    try:
        if not all(c in s.ms1_df.columns for c in ("rt", "inty")):
            raise RuntimeError("ms1_df missing required columns")

        # `group_by` is the current polars spelling. With the removed `groupby`
        # alias this branch always failed, which silently switched the data
        # source to scans_df.
        tic = s.ms1_df.select([pl.col("rt"), pl.col("inty")])
        tic = tic.group_by("rt").agg(pl.col("inty").sum().alias("inty_tot"))
        tic_pd = tic.to_pandas().sort_values("rt")
    except Exception:
        # Fallback to scans_df if present; errors raised here propagate.
        if getattr(s, "scans_df", None) is None:
            raise ValueError("Neither ms1_df nor scans_df available for TIC computation")
        scans = s.scans_df.filter(pl.col("ms_level") == 1)
        tic_pd = scans[["rt", "scan_uid", "inty_tot"]].to_pandas().sort_values("rt")

    if tic_pd.empty:
        raise ValueError("Computed TIC is empty")

    # Both branches above produce an `inty_tot` column, so no renaming is needed.
    return Chromatogram(
        rt=tic_pd["rt"].to_numpy(),
        inty=tic_pd["inty_tot"].to_numpy(),
        label=label or "Total Ion Chromatogram",
    )
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def get_eic(owner, sample=None, mz=None, mz_tol=0.01, rt_unit="s", label=None):
    """
    Return a Chromatogram object containing the Extracted Ion Chromatogram (EIC) for a target m/z.

    Rows of the sample's `ms1_df` whose `mz` lies in the closed window
    [mz - mz_tol, mz + mz_tol] are selected and their `inty` values are summed
    per distinct `rt`.

    Parameters:
        owner: Study or Sample instance. A Sample-like object is anything that
            exposes `ms1_df`; otherwise the sample is resolved with
            `get_sample(owner, sample)`.
        sample: Sample identifier (required if owner is Study)
        mz (float): Target m/z value
        mz_tol (float): m/z tolerance (default 0.01)
        rt_unit (str): Retention time unit for the chromatogram
        label (str): Optional label for the chromatogram

    Returns:
        Chromatogram. When no data point falls inside the m/z window, a
        placeholder chromatogram with a single point (rt=0.0, inty=0.0) and an
        "(empty)" label is returned.

    Raises:
        ValueError: If `mz` is None, the sample cannot be resolved, or the
            sample has no `ms1_df`.
        RuntimeError: If the extraction from `ms1_df` fails; the underlying
            error is attached as the cause.
    """
    if mz is None:
        raise ValueError("mz must be provided for EIC computation")

    s = owner if hasattr(owner, "ms1_df") else get_sample(owner, sample)

    if s is None:
        raise ValueError("Could not resolve sample for EIC computation")

    if getattr(s, "ms1_df", None) is None:
        raise ValueError("Sample has no ms1_df for EIC computation")

    try:
        if not all(c in s.ms1_df.columns for c in ("rt", "mz", "inty")):
            raise RuntimeError("ms1_df missing required columns")

        # Filter by m/z window.
        mz_min = mz - mz_tol
        mz_max = mz + mz_tol
        eic_data = s.ms1_df.filter(
            (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
        )

        if eic_data.is_empty():
            eic_pd = None
        else:
            # Sum intensities per retention time (several points may share an rt).
            eic = eic_data.group_by("rt").agg(pl.col("inty").sum().alias("inty"))
            eic_pd = eic.sort("rt").to_pandas()
    except Exception as err:
        # Keep the public exception type/message but preserve the real cause.
        raise RuntimeError("Failed to extract EIC from ms1_df") from err

    if eic_pd is None:
        # Nothing inside the window: return a single-point placeholder.
        import numpy as _np

        return Chromatogram(
            rt=_np.array([0.0]),
            inty=_np.array([0.0]),
            label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
            rt_unit=rt_unit,
        )

    return Chromatogram(
        rt=eic_pd["rt"].to_numpy(),
        inty=eic_pd["inty"].to_numpy(),
        label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
        rt_unit=rt_unit,
    )
|
|
291
|
+
|
|
292
|
+
|
|
10
293
|
|
|
11
294
|
|
|
12
295
|
def get_chrom(self, uids=None, samples=None):
|
|
@@ -124,8 +407,6 @@ def set_folder(self, folder):
|
|
|
124
407
|
|
|
125
408
|
|
|
126
409
|
def align_reset(self):
|
|
127
|
-
if self.alignment_ref_index is None:
|
|
128
|
-
return
|
|
129
410
|
self.logger.debug("Resetting alignment.")
|
|
130
411
|
# iterate over all feature maps and set RT to original RT
|
|
131
412
|
for feature_map in self.features_maps:
|
|
@@ -135,7 +416,13 @@ def align_reset(self):
|
|
|
135
416
|
feature.setRT(rt)
|
|
136
417
|
feature.removeMetaValue("original_RT")
|
|
137
418
|
self.alignment_ref_index = None
|
|
138
|
-
|
|
419
|
+
# in self.features_df, set rt equal to rt_original
|
|
420
|
+
self.features_df = self.features_df.with_columns(
|
|
421
|
+
pl.col("rt_original").alias("rt")
|
|
422
|
+
)
|
|
423
|
+
|
|
424
|
+
# Ensure column order is maintained after with_columns operation
|
|
425
|
+
self._ensure_features_df_schema_order()
|
|
139
426
|
|
|
140
427
|
# TODO I don't get this param
|
|
141
428
|
def get_consensus(self, quant="chrom_area"):
|
|
@@ -410,6 +697,56 @@ def _get_sample_uids(self, samples=None, seed=42):
|
|
|
410
697
|
return sample_uids
|
|
411
698
|
|
|
412
699
|
|
|
700
|
+
def get_sample(self, sample):
    """
    Resolve a sample identifier to a `Sample` object.

    Accepted `sample` values:
    - Sample instance: handed back unchanged
    - int: looked up in `self.samples_df` by `sample_uid`
    - str: looked up in `self.samples_df` by `sample_name`

    When the Study instance carries a `_samples_cache` mapping, a previously
    resolved Sample is returned from it and newly created ones are stored in
    it, keyed by sample_uid.

    Loading is best-effort: a fresh Sample is created and, if the row has a
    `sample_path`, loaded from it; if loading fails a Sample is constructed
    from the file instead, and if that fails too the Sample created first is
    returned as it is.

    Raises:
        ValueError: If `sample` is not an int, str or Sample instance.
        KeyError: If no row of `samples_df` matches the identifier.
    """
    from masster.sample.sample import Sample

    if isinstance(sample, Sample):
        return sample

    # Pick the lookup column from the identifier type.
    if isinstance(sample, int):
        lookup_column = "sample_uid"
    elif isinstance(sample, str):
        lookup_column = "sample_name"
    else:
        raise ValueError("sample must be an int (sample_uid), str (sample_name) or a Sample instance")

    matches = self.samples_df.filter(pl.col(lookup_column) == sample)
    if matches.is_empty():
        raise KeyError(f"Sample not found: {sample}")

    record = matches.row(0, named=True)
    sample_uid = None if record["sample_uid"] is None else int(record["sample_uid"])

    # Serve from the Study-level cache when one exists.
    cache = getattr(self, "_samples_cache", None)
    if cache is not None and sample_uid in cache:
        return cache[sample_uid]

    sample_path = record.get("sample_path", None)
    resolved = Sample(log_level='ERROR')
    if sample_path:
        try:
            resolved.load(sample_path)
        except Exception:
            # Loading failed: try constructing directly from the file and keep
            # the Sample created above if that fails as well.
            try:
                resolved = Sample(file=sample_path)
            except Exception:
                pass

    if cache is not None and sample_uid is not None:
        cache[sample_uid] = resolved
    return resolved
|
|
748
|
+
|
|
749
|
+
|
|
413
750
|
def get_orphans(self):
|
|
414
751
|
"""
|
|
415
752
|
Get all features that are not in the consensus mapping.
|
|
@@ -914,6 +1251,132 @@ def compress_chrom(self):
|
|
|
914
1251
|
self.logger.info(f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df")
|
|
915
1252
|
|
|
916
1253
|
|
|
1254
|
+
def name_replace(self, replace_dict):
    """
    Rename samples in samples_df according to an old-name -> new-name mapping.

    Every value of self.samples_df['sample_name'] that is exactly equal to a
    key of replace_dict is swapped for the corresponding value; all other
    names are kept. The new list of names is only written back when it
    contains no duplicates.

    Parameters:
        replace_dict (dict): Mapping of current sample names (keys) to their
                           replacements (values),
                           e.g., {"old_name1": "new_name1", "old_name2": "new_name2"}

    Returns:
        None

    Raises:
        ValueError: If replace_dict is not a dictionary
        ValueError: If resulting sample names are not unique
    """
    if not isinstance(replace_dict, dict):
        raise ValueError("replace_dict must be a dictionary")

    if self.samples_df is None or len(self.samples_df) == 0:
        self.logger.warning("No samples found in study.")
        return

    if not replace_dict:
        self.logger.warning("Empty replace_dict provided, no changes made.")
        return

    # Build the renamed list without touching samples_df yet.
    renamed = []
    replaced_count = 0
    for old_name in self.samples_df.get_column("sample_name").to_list():
        if old_name not in replace_dict:
            renamed.append(old_name)
            continue
        renamed.append(replace_dict[old_name])
        replaced_count += 1
        self.logger.debug(f"Replacing sample name: '{old_name}' -> '{replace_dict[old_name]}'")

    # Refuse to apply a mapping that would make two samples share a name.
    if len(renamed) != len(set(renamed)):
        already_seen = set()
        duplicates = []
        for candidate in renamed:
            if candidate in already_seen:
                duplicates.append(candidate)
            already_seen.add(candidate)
        raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")

    # All names are unique - apply the changes.
    self.samples_df = self.samples_df.with_columns(
        pl.Series("sample_name", renamed).alias("sample_name"),
    )

    self.logger.info(f"Successfully replaced {replaced_count} sample names")
|
|
1318
|
+
|
|
1319
|
+
|
|
1320
|
+
def name_reset(self):
    """
    Reset sample names to the basename of sample_path without extensions.

    Takes all paths in self.samples_df['sample_path'], extracts the basename,
    removes every trailing file extension (e.g. "run1.sample5.gz" -> "run1"),
    and checks that all resulting names are unique. If unique, replaces the
    values in self.samples_df['sample_name'].

    A leading dot is not an extension: ".hidden.raw" becomes ".hidden".

    Returns:
        None

    Raises:
        ValueError: If resulting sample names are not unique
        RuntimeError: If any sample_path is None or empty
    """
    import os

    if self.samples_df is None or len(self.samples_df) == 0:
        self.logger.warning("No samples found in study.")
        return

    # Get current sample paths
    sample_paths = self.samples_df.get_column("sample_path").to_list()

    # Extract basenames without extensions
    new_names = []
    for i, path in enumerate(sample_paths):
        if path is None or path == "":
            raise RuntimeError(f"Sample at index {i} has no sample_path set")

        # Strip extensions one at a time until none is left. Looping on
        # "'.' in name" would never terminate for names such as ".hidden",
        # because os.path.splitext treats leading dots as part of the root.
        name_without_ext = os.path.basename(path)
        while True:
            root, ext = os.path.splitext(name_without_ext)
            if not ext:
                break
            name_without_ext = root

        new_names.append(name_without_ext)
        self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")

    # Check that all new names are unique
    if len(set(new_names)) != len(new_names):
        duplicates = []
        seen = set()
        for name in new_names:
            if name in seen:
                duplicates.append(name)
            else:
                seen.add(name)
        raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")

    # If we get here, all names are unique - apply the changes
    self.samples_df = self.samples_df.with_columns(
        pl.Series("sample_name", new_names).alias("sample_name"),
    )

    self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
|
|
1378
|
+
|
|
1379
|
+
|
|
917
1380
|
def set_source(self, filename):
|
|
918
1381
|
"""
|
|
919
1382
|
Reassign file_source for all samples in samples_df. If filename contains only a path,
|
masster/study/load.py
CHANGED
|
@@ -268,6 +268,8 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
|
|
|
268
268
|
).select(
|
|
269
269
|
["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
|
|
270
270
|
)
|
|
271
|
+
# Ensure column order matches schema from the very beginning
|
|
272
|
+
self._ensure_features_df_schema_order()
|
|
271
273
|
else:
|
|
272
274
|
offset = self.features_df["feature_uid"].max() + 1 if not self.features_df.is_empty() else 1
|
|
273
275
|
# Chain operations and add to existing DataFrame
|
|
@@ -277,6 +279,10 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
|
|
|
277
279
|
["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
|
|
278
280
|
)
|
|
279
281
|
self.features_df = pl.concat([self.features_df, f_df])
|
|
282
|
+
|
|
283
|
+
# Ensure features_df column order matches schema
|
|
284
|
+
self._ensure_features_df_schema_order()
|
|
285
|
+
|
|
280
286
|
self.logger.debug(
|
|
281
287
|
f"Added sample {sample_name} with {ddaobj.features.size()} features to the study.",
|
|
282
288
|
)
|