masster 0.3.11__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


masster/study/helpers.py CHANGED
@@ -7,6 +7,289 @@ import pandas as pd
 import polars as pl
 
 from tqdm import tqdm
+from masster.chromatogram import Chromatogram
+
+
+def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
+    """
+    Return a Chromatogram object containing the Base Peak Chromatogram (BPC).
+
+    The `owner` argument may be either a Study instance or a Sample-like object that
+    exposes `ms1_df` (Polars DataFrame) and optionally `scans_df`.
+
+    If `owner` is a Study, `sample` must be provided (int sample_uid, str sample_name or Sample instance)
+    and the Sample will be retrieved using `get_sample(owner, sample)`.
+
+    Returns:
+        Chromatogram
+    """
+    # resolve sample when owner is a Study-like object (has get_sample)
+    s = None
+    if hasattr(owner, "ms1_df"):
+        s = owner
+    else:
+        # owner is expected to be a Study
+        s = get_sample(owner, sample)
+
+    if s is None:
+        raise ValueError("Could not resolve sample for BPC computation")
+
+    # ensure ms1_df exists
+    if getattr(s, "ms1_df", None) is None:
+        raise ValueError("Sample has no ms1_df for BPC computation")
+
+    # try Polars aggregation first
+    try:
+        cols = s.ms1_df.columns
+        if not all(c in cols for c in ["rt", "inty"]):
+            raise RuntimeError("ms1_df missing required columns")
+
+        bpc = s.ms1_df.select([pl.col("rt"), pl.col("inty")])
+        bpc = bpc.groupby("rt").agg(pl.col("inty").max().alias("inty"))
+        bpc_pd = bpc.to_pandas().sort_values("rt")
+    except Exception:
+        # fallback to pandas
+        try:
+            bpc_pd = s.ms1_df.to_pandas()[["rt", "inty"]]
+            bpc_pd = bpc_pd.groupby("rt").agg({"inty": "max"}).reset_index().sort_values("rt")
+        except Exception:
+            raise
+
+    if bpc_pd.empty:
+        raise ValueError("Computed BPC is empty")
+
+    # If caller requests original RTs (original=True) and we were called from a Study
+    # we can obtain a per-sample mapping between current rt and rt_original from
+    # the study.features_df and apply it to the computed BPC rt values.
+    # Note: original parameter default is False (return current/aligned RTs).
+    if original is True:
+        try:
+            # Only proceed if owner is a Study-like object with features_df
+            study = None
+            if hasattr(owner, "features_df"):
+                study = owner
+            else:
+                # If owner is a Sample, try to find Study via attribute (not guaranteed)
+                study = getattr(owner, "study", None)
+
+            if study is not None and getattr(study, "features_df", None) is not None:
+                # Attempt to select mapping rows for this sample. Prefer matching by sample_uid,
+                # fall back to sample_name when necessary.
+                import numpy as _np
+
+                feats = study.features_df
+                # try filtering by sample identifier provided to this function
+                mapping_rows = None
+                if sample is not None:
+                    try:
+                        mapping_rows = feats.filter(pl.col("sample_uid") == sample)
+                    except Exception:
+                        mapping_rows = pl.DataFrame()
+
+                if mapping_rows is None or mapping_rows.is_empty():
+                    try:
+                        mapping_rows = feats.filter(pl.col("sample_name") == sample)
+                    except Exception:
+                        mapping_rows = pl.DataFrame()
+
+                # If we still have no sample selector, try to infer sample from the Sample object s
+                if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(s, "sample_path"):
+                    # attempt to match by sample_path or file name
+                    try:
+                        sample_paths = feats.select(["sample_uid", "sample_name", "sample_path"])  # type: ignore[arg-type]
+                        # find row where sample_path matches
+                        mapping_rows = feats.filter(pl.col("sample_path") == getattr(s, "file", None))
+                    except Exception:
+                        mapping_rows = pl.DataFrame()
+
+                # If still empty, give up mapping
+                if mapping_rows is not None and not mapping_rows.is_empty():
+                    # collect rt and rt_original pairs
+                    try:
+                        map_pd = mapping_rows.select(["rt", "rt_original"]).to_pandas()
+                    except Exception:
+                        map_pd = mapping_rows.to_pandas()[["rt", "rt_original"]]
+
+                    # drop NA and duplicates
+                    map_pd = map_pd.dropna()
+                    if not map_pd.empty:
+                        # sort by rt (current/aligned)
+                        map_pd = map_pd.sort_values("rt")
+                        x = map_pd["rt"].to_numpy()
+                        y = map_pd["rt_original"].to_numpy()
+                        # require at least 2 points to interpolate
+                        if x.size >= 2:
+                            # apply linear interpolation from current rt -> original rt
+                            # for values outside the known range, numpy.interp will clip to endpoints
+                            new_rt = _np.interp(bpc_pd["rt"].to_numpy(), x, y)
+                            bpc_pd = bpc_pd.copy()
+                            bpc_pd["rt"] = new_rt
+        except Exception:
+            # If mapping fails, silently continue and return the original computed BPC
+            pass
+
+    # build Chromatogram
+    ycol = "inty"
+    try:
+        chrom = Chromatogram(rt=bpc_pd["rt"].to_numpy(), inty=bpc_pd[ycol].to_numpy(), label=label or "Base Peak Chromatogram", rt_unit=rt_unit)
+    except Exception:
+        chrom = Chromatogram(rt=bpc_pd["rt"].values, inty=bpc_pd[ycol].values, label=label or "Base Peak Chromatogram", rt_unit=rt_unit)
+
+    return chrom
+
+
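A minimal usage sketch for the new helper, assuming a loaded Study bound to `study` and a sample named "QC_01" (both illustrative):

    from masster.study.helpers import get_bpc

    # sample may be a sample_uid (int), a sample_name (str) or a Sample instance
    bpc = get_bpc(study, sample="QC_01", rt_unit="s")

    # original=True maps aligned RTs back to rt_original via study.features_df
    # when such a mapping is available; otherwise aligned RTs are returned
    bpc_raw = get_bpc(study, sample="QC_01", original=True)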
+def get_tic(owner, sample=None, label=None):
+    """
+    Return a Chromatogram object containing the Total Ion Chromatogram (TIC).
+
+    `owner` may be a Sample-like object (has `ms1_df`) or a Study (in which case `sample` selects the sample).
+    The function falls back to `scans_df` when `ms1_df` is not available.
+    """
+    # resolve sample object
+    s = None
+    if hasattr(owner, "ms1_df"):
+        s = owner
+    else:
+        s = get_sample(owner, sample)
+
+    if s is None:
+        raise ValueError("Could not resolve sample for TIC computation")
+
+    # prefer ms1_df
+    try:
+        cols = s.ms1_df.columns
+        if all(c in cols for c in ["rt", "inty"]):
+            tic = s.ms1_df.select([pl.col("rt"), pl.col("inty")])
+            tic = tic.groupby("rt").agg(pl.col("inty").sum().alias("inty_tot"))
+            tic_pd = tic.to_pandas().sort_values("rt")
+        else:
+            raise RuntimeError("ms1_df missing required columns")
+    except Exception:
+        # fallback to scans_df if present
+        if getattr(s, "scans_df", None) is not None:
+            try:
+                scans = s.scans_df.filter(pl.col("ms_level") == 1)
+                data = scans[["rt", "scan_uid", "inty_tot"]].to_pandas()
+                data = data.sort_values("rt")
+                tic_pd = data.rename(columns={"inty_tot": "inty_tot"})
+            except Exception:
+                raise
+        else:
+            raise ValueError("Neither ms1_df nor scans_df available for TIC computation")
+
+    if tic_pd.empty:
+        raise ValueError("Computed TIC is empty")
+
+    # ensure column name
+    if "inty_tot" not in tic_pd.columns:
+        tic_pd = tic_pd.rename(columns={tic_pd.columns[1]: "inty_tot"})
+
+    try:
+        chrom = Chromatogram(rt=tic_pd["rt"].to_numpy(), inty=tic_pd["inty_tot"].to_numpy(), label=label or "Total Ion Chromatogram")
+    except Exception:
+        chrom = Chromatogram(rt=tic_pd["rt"].values, inty=tic_pd["inty_tot"].values, label=label or "Total Ion Chromatogram")
+
+    return chrom
+
+
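get_tic resolves `owner` and `sample` the same way; under the same assumptions:

    from masster.study.helpers import get_tic

    tic = get_tic(study, sample="QC_01")  # falls back to scans_df if ms1_df is missing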
+def get_eic(owner, sample=None, mz=None, mz_tol=0.01, rt_unit="s", label=None):
+    """
+    Return a Chromatogram object containing the Extracted Ion Chromatogram (EIC) for a target m/z.
+
+    The `owner` argument may be either a Study instance or a Sample-like object that
+    exposes `ms1_df` (Polars DataFrame).
+
+    If `owner` is a Study, `sample` must be provided (int sample_uid, str sample_name or Sample instance)
+    and the Sample will be retrieved using `get_sample(owner, sample)`.
+
+    Parameters:
+        owner: Study or Sample instance
+        sample: Sample identifier (required if owner is Study)
+        mz (float): Target m/z value
+        mz_tol (float): m/z tolerance (default 0.01)
+        rt_unit (str): Retention time unit for the chromatogram
+        label (str): Optional label for the chromatogram
+
+    Returns:
+        Chromatogram
+    """
+    if mz is None:
+        raise ValueError("mz must be provided for EIC computation")
+
+    # resolve sample when owner is a Study-like object (has get_sample)
+    s = None
+    if hasattr(owner, "ms1_df"):
+        s = owner
+    else:
+        # owner is expected to be a Study
+        s = get_sample(owner, sample)
+
+    if s is None:
+        raise ValueError("Could not resolve sample for EIC computation")
+
+    # ensure ms1_df exists
+    if getattr(s, "ms1_df", None) is None:
+        raise ValueError("Sample has no ms1_df for EIC computation")
+
+    # Extract EIC from ms1_df using mz window
+    try:
+        cols = s.ms1_df.columns
+        if not all(c in cols for c in ["rt", "mz", "inty"]):
+            raise RuntimeError("ms1_df missing required columns")
+
+        # Filter by mz window
+        mz_min = mz - mz_tol
+        mz_max = mz + mz_tol
+        eic_data = s.ms1_df.filter(
+            (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+        )
+
+        if eic_data.is_empty():
+            # Return empty chromatogram if no data found
+            import numpy as _np
+            return Chromatogram(
+                rt=_np.array([0.0]),
+                inty=_np.array([0.0]),
+                label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
+                rt_unit=rt_unit
+            )
+
+        # Aggregate intensities per retention time (sum in case of multiple points per rt)
+        eic = eic_data.group_by("rt").agg(pl.col("inty").sum().alias("inty"))
+        eic_pd = eic.sort("rt").to_pandas()
+
+    except Exception:
+        raise RuntimeError("Failed to extract EIC from ms1_df")
+
+    if eic_pd.empty:
+        # Return empty chromatogram if no data found
+        import numpy as _np
+        return Chromatogram(
+            rt=_np.array([0.0]),
+            inty=_np.array([0.0]),
+            label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
+            rt_unit=rt_unit
+        )
+
+    # build Chromatogram
+    try:
+        chrom = Chromatogram(
+            rt=eic_pd["rt"].to_numpy(),
+            inty=eic_pd["inty"].to_numpy(),
+            label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
+            rt_unit=rt_unit
+        )
+    except Exception:
+        chrom = Chromatogram(
+            rt=eic_pd["rt"].values,
+            inty=eic_pd["inty"].values,
+            label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
+            rt_unit=rt_unit
+        )
+
+    return chrom
+
+
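Under the same assumptions, an EIC sketch; the m/z target and tolerance are illustrative:

    from masster.study.helpers import get_eic

    # ±0.005 window around m/z 301.1234; an empty window yields a single-point stub chromatogram
    eic = get_eic(study, sample="QC_01", mz=301.1234, mz_tol=0.005)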
 
 
 def get_chrom(self, uids=None, samples=None):
@@ -124,8 +407,6 @@ def set_folder(self, folder):
 
 
 def align_reset(self):
-    if self.alignment_ref_index is None:
-        return
     self.logger.debug("Resetting alignment.")
     # iterate over all feature maps and set RT to original RT
     for feature_map in self.features_maps:
@@ -135,7 +416,13 @@ def align_reset(self):
             feature.setRT(rt)
             feature.removeMetaValue("original_RT")
     self.alignment_ref_index = None
-
+    # in self.features_df, set rt equal to rt_original
+    self.features_df = self.features_df.with_columns(
+        pl.col("rt_original").alias("rt")
+    )
+
+    # Ensure column order is maintained after with_columns operation
+    self._ensure_features_df_schema_order()
 
 # TODO I don't get this param
 def get_consensus(self, quant="chrom_area"):
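The rt reset relies on a Polars idiom: `with_columns` with an alias that matches an existing column overwrites that column in place. A self-contained sketch with toy data:

    import polars as pl

    df = pl.DataFrame({"rt": [10.2, 20.5], "rt_original": [10.0, 20.0]})
    df = df.with_columns(pl.col("rt_original").alias("rt"))
    print(df["rt"].to_list())  # [10.0, 20.0]: aligned RTs replaced by the originals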
@@ -410,6 +697,56 @@ def _get_sample_uids(self, samples=None, seed=42):
     return sample_uids
 
 
+def get_sample(self, sample):
+    """
+    Return a `Sample` object corresponding to the provided sample identifier.
+
+    Accepted `sample` values:
+    - int: interpreted as `sample_uid`
+    - str: interpreted as `sample_name`
+    - Sample instance: returned as-is
+
+    This helper mirrors the original Study.get_sample method but lives in helpers for reuse.
+    """
+    from masster.sample.sample import Sample
+
+    if isinstance(sample, Sample):
+        return sample
+
+    if isinstance(sample, int):
+        rows = self.samples_df.filter(pl.col("sample_uid") == sample)
+    elif isinstance(sample, str):
+        rows = self.samples_df.filter(pl.col("sample_name") == sample)
+    else:
+        raise ValueError("sample must be an int (sample_uid), str (sample_name) or a Sample instance")
+
+    if rows.is_empty():
+        raise KeyError(f"Sample not found: {sample}")
+
+    row = rows.row(0, named=True)
+    sample_uid = int(row["sample_uid"]) if row["sample_uid"] is not None else None
+
+    # Use a cache on the Study instance if available
+    cache = getattr(self, "_samples_cache", None)
+    if cache is not None and sample_uid in cache:
+        return cache[sample_uid]
+
+    sample_path = row.get("sample_path", None)
+    s = Sample(log_level='ERROR')
+    try:
+        if sample_path:
+            try:
+                s.load(sample_path)
+            except Exception:
+                s = Sample(file=sample_path)
+    except Exception:
+        pass
+
+    if cache is not None and sample_uid is not None:
+        cache[sample_uid] = s
+    return s
+
+
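A usage sketch with illustrative identifiers; as the code shows, the cache is only consulted when the Study instance already carries a `_samples_cache` dict:

    from masster.study.helpers import get_sample

    study._samples_cache = {}          # optional: enable per-uid caching
    s1 = get_sample(study, 3)          # by sample_uid
    s2 = get_sample(study, "QC_01")    # by sample_name
    assert get_sample(study, 3) is s1  # second lookup is served from the cache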
 def get_orphans(self):
     """
     Get all features that are not in the consensus mapping.
@@ -914,6 +1251,132 @@ def compress_chrom(self):
     self.logger.info(f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df")
 
 
+def name_replace(self, replace_dict):
+    """
+    Replace sample names in samples_df based on a dictionary mapping.
+
+    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
+    all keys with their corresponding values from replace_dict. Checks that all
+    resulting sample names are unique. If unique, replaces the values in self.samples_df.
+
+    Parameters:
+        replace_dict (dict): Dictionary mapping old names (keys) to new names (values).
+                             All keys found in sample names will be replaced with their
+                             corresponding values.
+                             e.g., {"old_name1": "new_name1", "old_name2": "new_name2"}
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If replace_dict is not a dictionary
+        ValueError: If resulting sample names are not unique
+    """
+    if not isinstance(replace_dict, dict):
+        raise ValueError("replace_dict must be a dictionary")
+
+    if self.samples_df is None or len(self.samples_df) == 0:
+        self.logger.warning("No samples found in study.")
+        return
+
+    if not replace_dict:
+        self.logger.warning("Empty replace_dict provided, no changes made.")
+        return
+
+    # Get current sample names
+    current_names = self.samples_df.get_column("sample_name").to_list()
+
+    # Create a copy and apply replacements
+    new_names = []
+    replaced_count = 0
+
+    for name in current_names:
+        if name in replace_dict:
+            new_names.append(replace_dict[name])
+            replaced_count += 1
+            self.logger.debug(f"Replacing sample name: '{name}' -> '{replace_dict[name]}'")
+        else:
+            new_names.append(name)
+
+    # Check that all new names are unique
+    if len(set(new_names)) != len(new_names):
+        duplicates = []
+        seen = set()
+        for name in new_names:
+            if name in seen:
+                duplicates.append(name)
+            else:
+                seen.add(name)
+        raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
+
+    # If we get here, all names are unique - apply the changes
+    self.samples_df = self.samples_df.with_columns(
+        pl.Series("sample_name", new_names).alias("sample_name"),
+    )
+
+    self.logger.info(f"Successfully replaced {replaced_count} sample names")
+
+
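A usage sketch with illustrative names; unmapped names are left unchanged, and a mapping that would produce duplicates raises ValueError before anything is modified:

    from masster.study.helpers import name_replace

    name_replace(study, {"20240101_run7": "QC_01", "20240101_run8": "blank_01"})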
+def name_reset(self):
+    """
+    Reset sample names to the basename of sample_path without extensions.
+
+    Takes all paths in self.samples_df['sample_path'], extracts the basename,
+    removes file extensions, and checks that all resulting names are unique.
+    If unique, replaces the values in self.samples_df['sample_name'].
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If resulting sample names are not unique
+        RuntimeError: If any sample_path is None or empty
+    """
+    import os
+
+    if self.samples_df is None or len(self.samples_df) == 0:
+        self.logger.warning("No samples found in study.")
+        return
+
+    # Get current sample paths
+    sample_paths = self.samples_df.get_column("sample_path").to_list()
+
+    # Extract basenames without extensions
+    new_names = []
+
+    for i, path in enumerate(sample_paths):
+        if path is None or path == "":
+            raise RuntimeError(f"Sample at index {i} has no sample_path set")
+
+        # Get basename and remove extension(s)
+        basename = os.path.basename(path)
+        # Remove all extensions (handles cases like .tar.gz, .sample5.gz, etc.)
+        name_without_ext = basename
+        while '.' in name_without_ext:
+            name_without_ext = os.path.splitext(name_without_ext)[0]
+
+        new_names.append(name_without_ext)
+        self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")
+
+    # Check that all new names are unique
+    if len(set(new_names)) != len(new_names):
+        duplicates = []
+        seen = set()
+        for name in new_names:
+            if name in seen:
+                duplicates.append(name)
+            else:
+                seen.add(name)
+        raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
+
+    # If we get here, all names are unique - apply the changes
+    self.samples_df = self.samples_df.with_columns(
+        pl.Series("sample_name", new_names).alias("sample_name"),
+    )
+
+    self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
+
+
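The extension-stripping loop removes compound suffixes, not just the last one; the same step in isolation (path illustrative):

    import os

    name = os.path.basename("/data/batch1/QC_01.sample5.gz")  # "QC_01.sample5.gz"
    while "." in name:
        name = os.path.splitext(name)[0]                      # "QC_01.sample5", then "QC_01"
    print(name)  # QC_01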
 def set_source(self, filename):
     """
     Reassign file_source for all samples in samples_df. If filename contains only a path,
masster/study/load.py CHANGED
@@ -170,7 +170,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
         self.logger.error(f"Unsupported file format: {file}")
         return
     if ddaobj.features_df is None and not reset:
-        self.logger.warning(
+        self.logger.debug(
             f"File {file} will be newly processed.",
         )
     ddaobj.features = None
@@ -268,6 +268,8 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
         ).select(
             ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
         )
+        # Ensure column order matches schema from the very beginning
+        self._ensure_features_df_schema_order()
     else:
         offset = self.features_df["feature_uid"].max() + 1 if not self.features_df.is_empty() else 1
         # Chain operations and add to existing DataFrame
@@ -276,7 +278,37 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
         ).select(
             ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
         )
+
+        # Reorganize f_df columns to match self.features_df column order and schema
+        target_columns = self.features_df.columns
+        target_schema = self.features_df.schema
+        f_df_columns = f_df.columns
+
+        # Create select expressions for reordering and type casting
+        select_exprs = []
+        for col in target_columns:
+            if col in f_df_columns:
+                # Cast to the expected type
+                expected_dtype = target_schema[col]
+                select_exprs.append(pl.col(col).cast(expected_dtype, strict=False))
+            else:
+                # Add missing columns with null values of the correct type
+                expected_dtype = target_schema[col]
+                select_exprs.append(pl.lit(None, dtype=expected_dtype).alias(col))
+
+        # Add any extra columns from f_df that aren't in target_columns (keep their original types)
+        for col in f_df_columns:
+            if col not in target_columns:
+                select_exprs.append(pl.col(col))
+
+        # Reorder and type-cast f_df columns
+        f_df = f_df.select(select_exprs)
+
         self.features_df = pl.concat([self.features_df, f_df])
+
+        # Ensure features_df column order matches schema
+        self._ensure_features_df_schema_order()
+
         self.logger.debug(
             f"Added sample {sample_name} with {ddaobj.features.size()} features to the study.",
         )
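The column-alignment block is a general Polars pattern: build one select expression per target column, casting where the column exists and null-filling where it does not, so the frames concatenate cleanly. A self-contained sketch with toy frames (all names illustrative):

    import polars as pl

    target = pl.DataFrame({"feature_uid": [1], "rt": [10.0], "mz": [222.1]})
    incoming = pl.DataFrame({"rt": ["12.5"], "feature_uid": [2]})  # wrong order/dtype, mz missing

    exprs = [
        pl.col(c).cast(t, strict=False) if c in incoming.columns
        else pl.lit(None, dtype=t).alias(c)
        for c, t in target.schema.items()
    ]
    print(pl.concat([target, incoming.select(exprs)]))  # one schema, null where mz was absent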