masster 0.3.11__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


masster/study/helpers.py CHANGED
@@ -7,6 +7,289 @@ import pandas as pd
 import polars as pl
 
 from tqdm import tqdm
+from masster.chromatogram import Chromatogram
+
+
+def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
+    """
+    Return a Chromatogram object containing the Base Peak Chromatogram (BPC).
+
+    The `owner` argument may be either a Study instance or a Sample-like object that
+    exposes `ms1_df` (Polars DataFrame) and optionally `scans_df`.
+
+    If `owner` is a Study, `sample` must be provided (int sample_uid, str sample_name or Sample instance)
+    and the Sample will be retrieved using `get_sample(owner, sample)`.
+
+    Returns:
+        Chromatogram
+    """
+    # resolve sample when owner is a Study-like object (has get_sample)
+    s = None
+    if hasattr(owner, "ms1_df"):
+        s = owner
+    else:
+        # owner is expected to be a Study
+        s = get_sample(owner, sample)
+
+    if s is None:
+        raise ValueError("Could not resolve sample for BPC computation")
+
+    # ensure ms1_df exists
+    if getattr(s, "ms1_df", None) is None:
+        raise ValueError("Sample has no ms1_df for BPC computation")
+
+    # try Polars aggregation first
+    try:
+        cols = s.ms1_df.columns
+        if not all(c in cols for c in ["rt", "inty"]):
+            raise RuntimeError("ms1_df missing required columns")
+
+        bpc = s.ms1_df.select([pl.col("rt"), pl.col("inty")])
+        bpc = bpc.groupby("rt").agg(pl.col("inty").max().alias("inty"))
+        bpc_pd = bpc.to_pandas().sort_values("rt")
+    except Exception:
+        # fallback to pandas
+        try:
+            bpc_pd = s.ms1_df.to_pandas()[["rt", "inty"]]
+            bpc_pd = bpc_pd.groupby("rt").agg({"inty": "max"}).reset_index().sort_values("rt")
+        except Exception:
+            raise
+
+    if bpc_pd.empty:
+        raise ValueError("Computed BPC is empty")
+
+    # If caller requests original RTs (original=True) and we were called from a Study
+    # we can obtain a per-sample mapping between current rt and rt_original from
+    # the study.features_df and apply it to the computed BPC rt values.
+    # Note: original parameter default is False (return current/aligned RTs).
+    if original is True:
+        try:
+            # Only proceed if owner is a Study-like object with features_df
+            study = None
+            if hasattr(owner, "features_df"):
+                study = owner
+            else:
+                # If owner is a Sample, try to find Study via attribute (not guaranteed)
+                study = getattr(owner, "study", None)
+
+            if study is not None and getattr(study, "features_df", None) is not None:
+                # Attempt to select mapping rows for this sample. Prefer matching by sample_uid,
+                # fall back to sample_name when necessary.
+                import numpy as _np
+
+                feats = study.features_df
+                # try filtering by sample identifier provided to this function
+                mapping_rows = None
+                if sample is not None:
+                    try:
+                        mapping_rows = feats.filter(pl.col("sample_uid") == sample)
+                    except Exception:
+                        mapping_rows = pl.DataFrame()
+
+                if mapping_rows is None or mapping_rows.is_empty():
+                    try:
+                        mapping_rows = feats.filter(pl.col("sample_name") == sample)
+                    except Exception:
+                        mapping_rows = pl.DataFrame()
+
+                # If we still have no sample selector, try to infer sample from the Sample object s
+                if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(s, "sample_path"):
+                    # attempt to match by sample_path or file name
+                    try:
+                        sample_paths = feats.select(["sample_uid", "sample_name", "sample_path"])  # type: ignore[arg-type]
+                        # find row where sample_path matches
+                        mapping_rows = feats.filter(pl.col("sample_path") == getattr(s, "file", None))
+                    except Exception:
+                        mapping_rows = pl.DataFrame()
+
+                # If still empty, give up mapping
+                if mapping_rows is not None and not mapping_rows.is_empty():
+                    # collect rt and rt_original pairs
+                    try:
+                        map_pd = mapping_rows.select(["rt", "rt_original"]).to_pandas()
+                    except Exception:
+                        map_pd = mapping_rows.to_pandas()[["rt", "rt_original"]]
+
+                    # drop NA and duplicates
+                    map_pd = map_pd.dropna()
+                    if not map_pd.empty:
+                        # sort by rt (current/aligned)
+                        map_pd = map_pd.sort_values("rt")
+                        x = map_pd["rt"].to_numpy()
+                        y = map_pd["rt_original"].to_numpy()
+                        # require at least 2 points to interpolate
+                        if x.size >= 2:
+                            # apply linear interpolation from current rt -> original rt
+                            # for values outside the known range, numpy.interp will clip to endpoints
+                            new_rt = _np.interp(bpc_pd["rt"].to_numpy(), x, y)
+                            bpc_pd = bpc_pd.copy()
+                            bpc_pd["rt"] = new_rt
+        except Exception:
+            # If mapping fails, silently continue and return the original computed BPC
+            pass
+
+    # build Chromatogram
+    ycol = "inty"
+    try:
+        chrom = Chromatogram(rt=bpc_pd["rt"].to_numpy(), inty=bpc_pd[ycol].to_numpy(), label=label or "Base Peak Chromatogram", rt_unit=rt_unit)
+    except Exception:
+        chrom = Chromatogram(rt=bpc_pd["rt"].values, inty=bpc_pd[ycol].values, label=label or "Base Peak Chromatogram", rt_unit=rt_unit)
+
+    return chrom
+
+
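A minimal usage sketch for the new helper, assuming a loaded Study bound to `study` and a sample named "QC_01" (both illustrative):

    from masster.study.helpers import get_bpc

    # sample may be a sample_uid (int), a sample_name (str) or a Sample instance
    bpc = get_bpc(study, sample="QC_01", rt_unit="s")

    # original=True maps aligned RTs back to rt_original via study.features_df
    # when such a mapping is available; otherwise aligned RTs are returned
    bpc_raw = get_bpc(study, sample="QC_01", original=True)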
+def get_tic(owner, sample=None, label=None):
+    """
+    Return a Chromatogram object containing the Total Ion Chromatogram (TIC).
+
+    `owner` may be a Sample-like object (has `ms1_df`) or a Study (in which case `sample` selects the sample).
+    The function falls back to `scans_df` when `ms1_df` is not available.
+    """
+    # resolve sample object
+    s = None
+    if hasattr(owner, "ms1_df"):
+        s = owner
+    else:
+        s = get_sample(owner, sample)
+
+    if s is None:
+        raise ValueError("Could not resolve sample for TIC computation")
+
+    # prefer ms1_df
+    try:
+        cols = s.ms1_df.columns
+        if all(c in cols for c in ["rt", "inty"]):
+            tic = s.ms1_df.select([pl.col("rt"), pl.col("inty")])
+            tic = tic.groupby("rt").agg(pl.col("inty").sum().alias("inty_tot"))
+            tic_pd = tic.to_pandas().sort_values("rt")
+        else:
+            raise RuntimeError("ms1_df missing required columns")
+    except Exception:
+        # fallback to scans_df if present
+        if getattr(s, "scans_df", None) is not None:
+            try:
+                scans = s.scans_df.filter(pl.col("ms_level") == 1)
+                data = scans[["rt", "scan_uid", "inty_tot"]].to_pandas()
+                data = data.sort_values("rt")
+                tic_pd = data.rename(columns={"inty_tot": "inty_tot"})
+            except Exception:
+                raise
+        else:
+            raise ValueError("Neither ms1_df nor scans_df available for TIC computation")
+
+    if tic_pd.empty:
+        raise ValueError("Computed TIC is empty")
+
+    # ensure column name
+    if "inty_tot" not in tic_pd.columns:
+        tic_pd = tic_pd.rename(columns={tic_pd.columns[1]: "inty_tot"})
+
+    try:
+        chrom = Chromatogram(rt=tic_pd["rt"].to_numpy(), inty=tic_pd["inty_tot"].to_numpy(), label=label or "Total Ion Chromatogram")
+    except Exception:
+        chrom = Chromatogram(rt=tic_pd["rt"].values, inty=tic_pd["inty_tot"].values, label=label or "Total Ion Chromatogram")
+
+    return chrom
+
+
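get_tic resolves `owner` and `sample` the same way; under the same assumptions:

    from masster.study.helpers import get_tic

    tic = get_tic(study, sample="QC_01")  # falls back to scans_df if ms1_df is missing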
+def get_eic(owner, sample=None, mz=None, mz_tol=0.01, rt_unit="s", label=None):
+    """
+    Return a Chromatogram object containing the Extracted Ion Chromatogram (EIC) for a target m/z.
+
+    The `owner` argument may be either a Study instance or a Sample-like object that
+    exposes `ms1_df` (Polars DataFrame).
+
+    If `owner` is a Study, `sample` must be provided (int sample_uid, str sample_name or Sample instance)
+    and the Sample will be retrieved using `get_sample(owner, sample)`.
+
+    Parameters:
+        owner: Study or Sample instance
+        sample: Sample identifier (required if owner is Study)
+        mz (float): Target m/z value
+        mz_tol (float): m/z tolerance (default 0.01)
+        rt_unit (str): Retention time unit for the chromatogram
+        label (str): Optional label for the chromatogram
+
+    Returns:
+        Chromatogram
+    """
+    if mz is None:
+        raise ValueError("mz must be provided for EIC computation")
+
+    # resolve sample when owner is a Study-like object (has get_sample)
+    s = None
+    if hasattr(owner, "ms1_df"):
+        s = owner
+    else:
+        # owner is expected to be a Study
+        s = get_sample(owner, sample)
+
+    if s is None:
+        raise ValueError("Could not resolve sample for EIC computation")
+
+    # ensure ms1_df exists
+    if getattr(s, "ms1_df", None) is None:
+        raise ValueError("Sample has no ms1_df for EIC computation")
+
+    # Extract EIC from ms1_df using mz window
+    try:
+        cols = s.ms1_df.columns
+        if not all(c in cols for c in ["rt", "mz", "inty"]):
+            raise RuntimeError("ms1_df missing required columns")
+
+        # Filter by mz window
+        mz_min = mz - mz_tol
+        mz_max = mz + mz_tol
+        eic_data = s.ms1_df.filter(
+            (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+        )
+
+        if eic_data.is_empty():
+            # Return empty chromatogram if no data found
+            import numpy as _np
+            return Chromatogram(
+                rt=_np.array([0.0]),
+                inty=_np.array([0.0]),
+                label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
+                rt_unit=rt_unit
+            )
+
+        # Aggregate intensities per retention time (sum in case of multiple points per rt)
+        eic = eic_data.group_by("rt").agg(pl.col("inty").sum().alias("inty"))
+        eic_pd = eic.sort("rt").to_pandas()
+
+    except Exception:
+        raise RuntimeError("Failed to extract EIC from ms1_df")
+
+    if eic_pd.empty:
+        # Return empty chromatogram if no data found
+        import numpy as _np
+        return Chromatogram(
+            rt=_np.array([0.0]),
+            inty=_np.array([0.0]),
+            label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
+            rt_unit=rt_unit
+        )
+
+    # build Chromatogram
+    try:
+        chrom = Chromatogram(
+            rt=eic_pd["rt"].to_numpy(),
+            inty=eic_pd["inty"].to_numpy(),
+            label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
+            rt_unit=rt_unit
+        )
+    except Exception:
+        chrom = Chromatogram(
+            rt=eic_pd["rt"].values,
+            inty=eic_pd["inty"].values,
+            label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
+            rt_unit=rt_unit
+        )
+
+    return chrom
+
+
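Under the same assumptions, an EIC sketch; the m/z target and tolerance are illustrative:

    from masster.study.helpers import get_eic

    # ±0.005 window around m/z 301.1234; an empty window yields a single-point stub chromatogram
    eic = get_eic(study, sample="QC_01", mz=301.1234, mz_tol=0.005)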
 
 
 def get_chrom(self, uids=None, samples=None):
@@ -124,8 +407,6 @@ def set_folder(self, folder):
 
 
 def align_reset(self):
-    if self.alignment_ref_index is None:
-        return
     self.logger.debug("Resetting alignment.")
     # iterate over all feature maps and set RT to original RT
     for feature_map in self.features_maps:
@@ -135,7 +416,13 @@ def align_reset(self):
             feature.setRT(rt)
             feature.removeMetaValue("original_RT")
     self.alignment_ref_index = None
-
+    # in self.features_df, set rt equal to rt_original
+    self.features_df = self.features_df.with_columns(
+        pl.col("rt_original").alias("rt")
+    )
+
+    # Ensure column order is maintained after with_columns operation
+    self._ensure_features_df_schema_order()
 
 # TODO I don't get this param
 def get_consensus(self, quant="chrom_area"):
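The rt reset relies on a Polars idiom: `with_columns` with an alias that matches an existing column overwrites that column in place. A self-contained sketch with toy data:

    import polars as pl

    df = pl.DataFrame({"rt": [10.2, 20.5], "rt_original": [10.0, 20.0]})
    df = df.with_columns(pl.col("rt_original").alias("rt"))
    print(df["rt"].to_list())  # [10.0, 20.0]: aligned RTs replaced by the originals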
@@ -410,6 +697,56 @@ def _get_sample_uids(self, samples=None, seed=42):
     return sample_uids
 
 
+def get_sample(self, sample):
+    """
+    Return a `Sample` object corresponding to the provided sample identifier.
+
+    Accepted `sample` values:
+    - int: interpreted as `sample_uid`
+    - str: interpreted as `sample_name`
+    - Sample instance: returned as-is
+
+    This helper mirrors the original Study.get_sample method but lives in helpers for reuse.
+    """
+    from masster.sample.sample import Sample
+
+    if isinstance(sample, Sample):
+        return sample
+
+    if isinstance(sample, int):
+        rows = self.samples_df.filter(pl.col("sample_uid") == sample)
+    elif isinstance(sample, str):
+        rows = self.samples_df.filter(pl.col("sample_name") == sample)
+    else:
+        raise ValueError("sample must be an int (sample_uid), str (sample_name) or a Sample instance")
+
+    if rows.is_empty():
+        raise KeyError(f"Sample not found: {sample}")
+
+    row = rows.row(0, named=True)
+    sample_uid = int(row["sample_uid"]) if row["sample_uid"] is not None else None
+
+    # Use a cache on the Study instance if available
+    cache = getattr(self, "_samples_cache", None)
+    if cache is not None and sample_uid in cache:
+        return cache[sample_uid]
+
+    sample_path = row.get("sample_path", None)
+    s = Sample(log_level='ERROR')
+    try:
+        if sample_path:
+            try:
+                s.load(sample_path)
+            except Exception:
+                s = Sample(file=sample_path)
+    except Exception:
+        pass
+
+    if cache is not None and sample_uid is not None:
+        cache[sample_uid] = s
+    return s
+
+
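A usage sketch with illustrative identifiers; as the code shows, the cache is only consulted when the Study instance already carries a `_samples_cache` dict:

    from masster.study.helpers import get_sample

    study._samples_cache = {}          # optional: enable per-uid caching
    s1 = get_sample(study, 3)          # by sample_uid
    s2 = get_sample(study, "QC_01")    # by sample_name
    assert get_sample(study, 3) is s1  # second lookup is served from the cache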
 def get_orphans(self):
     """
     Get all features that are not in the consensus mapping.
@@ -914,6 +1251,132 @@ def compress_chrom(self):
     self.logger.info(f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df")
 
 
+def name_replace(self, replace_dict):
+    """
+    Replace sample names in samples_df based on a dictionary mapping.
+
+    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
+    all keys with their corresponding values from replace_dict. Checks that all
+    resulting sample names are unique. If unique, replaces the values in self.samples_df.
+
+    Parameters:
+        replace_dict (dict): Dictionary mapping old names (keys) to new names (values).
+                             All keys found in sample names will be replaced with their
+                             corresponding values.
+                             e.g., {"old_name1": "new_name1", "old_name2": "new_name2"}
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If replace_dict is not a dictionary
+        ValueError: If resulting sample names are not unique
+    """
+    if not isinstance(replace_dict, dict):
+        raise ValueError("replace_dict must be a dictionary")
+
+    if self.samples_df is None or len(self.samples_df) == 0:
+        self.logger.warning("No samples found in study.")
+        return
+
+    if not replace_dict:
+        self.logger.warning("Empty replace_dict provided, no changes made.")
+        return
+
+    # Get current sample names
+    current_names = self.samples_df.get_column("sample_name").to_list()
+
+    # Create a copy and apply replacements
+    new_names = []
+    replaced_count = 0
+
+    for name in current_names:
+        if name in replace_dict:
+            new_names.append(replace_dict[name])
+            replaced_count += 1
+            self.logger.debug(f"Replacing sample name: '{name}' -> '{replace_dict[name]}'")
+        else:
+            new_names.append(name)
+
+    # Check that all new names are unique
+    if len(set(new_names)) != len(new_names):
+        duplicates = []
+        seen = set()
+        for name in new_names:
+            if name in seen:
+                duplicates.append(name)
+            else:
+                seen.add(name)
+        raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
+
+    # If we get here, all names are unique - apply the changes
+    self.samples_df = self.samples_df.with_columns(
+        pl.Series("sample_name", new_names).alias("sample_name"),
+    )
+
+    self.logger.info(f"Successfully replaced {replaced_count} sample names")
+
+
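A usage sketch with illustrative names; unmapped names are left unchanged, and a mapping that would produce duplicates raises ValueError before anything is modified:

    from masster.study.helpers import name_replace

    name_replace(study, {"20240101_run7": "QC_01", "20240101_run8": "blank_01"})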
+def name_reset(self):
+    """
+    Reset sample names to the basename of sample_path without extensions.
+
+    Takes all paths in self.samples_df['sample_path'], extracts the basename,
+    removes file extensions, and checks that all resulting names are unique.
+    If unique, replaces the values in self.samples_df['sample_name'].
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If resulting sample names are not unique
+        RuntimeError: If any sample_path is None or empty
+    """
+    import os
+
+    if self.samples_df is None or len(self.samples_df) == 0:
+        self.logger.warning("No samples found in study.")
+        return
+
+    # Get current sample paths
+    sample_paths = self.samples_df.get_column("sample_path").to_list()
+
+    # Extract basenames without extensions
+    new_names = []
+
+    for i, path in enumerate(sample_paths):
+        if path is None or path == "":
+            raise RuntimeError(f"Sample at index {i} has no sample_path set")
+
+        # Get basename and remove extension(s)
+        basename = os.path.basename(path)
+        # Remove all extensions (handles cases like .tar.gz, .sample5.gz, etc.)
+        name_without_ext = basename
+        while '.' in name_without_ext:
+            name_without_ext = os.path.splitext(name_without_ext)[0]
+
+        new_names.append(name_without_ext)
+        self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")
+
+    # Check that all new names are unique
+    if len(set(new_names)) != len(new_names):
+        duplicates = []
+        seen = set()
+        for name in new_names:
+            if name in seen:
+                duplicates.append(name)
+            else:
+                seen.add(name)
+        raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
+
+    # If we get here, all names are unique - apply the changes
+    self.samples_df = self.samples_df.with_columns(
+        pl.Series("sample_name", new_names).alias("sample_name"),
+    )
+
+    self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
+
+
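The extension-stripping loop removes compound suffixes, not just the last one; the same step in isolation (path illustrative):

    import os

    name = os.path.basename("/data/batch1/QC_01.sample5.gz")  # "QC_01.sample5.gz"
    while "." in name:
        name = os.path.splitext(name)[0]                      # "QC_01.sample5", then "QC_01"
    print(name)  # QC_01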
 def set_source(self, filename):
     """
     Reassign file_source for all samples in samples_df. If filename contains only a path,
masster/study/load.py CHANGED
@@ -170,7 +170,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
         self.logger.error(f"Unsupported file format: {file}")
         return
     if ddaobj.features_df is None and not reset:
-        self.logger.warning(
+        self.logger.debug(
             f"File {file} will be newly processed.",
         )
     ddaobj.features = None
@@ -268,6 +268,8 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
         ).select(
             ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
         )
+        # Ensure column order matches schema from the very beginning
+        self._ensure_features_df_schema_order()
     else:
         offset = self.features_df["feature_uid"].max() + 1 if not self.features_df.is_empty() else 1
         # Chain operations and add to existing DataFrame
@@ -276,7 +278,37 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
         ).select(
             ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
         )
+
+        # Reorganize f_df columns to match self.features_df column order and schema
+        target_columns = self.features_df.columns
+        target_schema = self.features_df.schema
+        f_df_columns = f_df.columns
+
+        # Create select expressions for reordering and type casting
+        select_exprs = []
+        for col in target_columns:
+            if col in f_df_columns:
+                # Cast to the expected type
+                expected_dtype = target_schema[col]
+                select_exprs.append(pl.col(col).cast(expected_dtype, strict=False))
+            else:
+                # Add missing columns with null values of the correct type
+                expected_dtype = target_schema[col]
+                select_exprs.append(pl.lit(None, dtype=expected_dtype).alias(col))
+
+        # Add any extra columns from f_df that aren't in target_columns (keep their original types)
+        for col in f_df_columns:
+            if col not in target_columns:
+                select_exprs.append(pl.col(col))
+
+        # Reorder and type-cast f_df columns
+        f_df = f_df.select(select_exprs)
+
         self.features_df = pl.concat([self.features_df, f_df])
+
+        # Ensure features_df column order matches schema
+        self._ensure_features_df_schema_order()
+
         self.logger.debug(
             f"Added sample {sample_name} with {ddaobj.features.size()} features to the study.",
         )
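The column-alignment block is a general Polars pattern: build one select expression per target column, casting where the column exists and null-filling where it does not, so the frames concatenate cleanly. A self-contained sketch with toy frames (all names illustrative):

    import polars as pl

    target = pl.DataFrame({"feature_uid": [1], "rt": [10.0], "mz": [222.1]})
    incoming = pl.DataFrame({"rt": ["12.5"], "feature_uid": [2]})  # wrong order/dtype, mz missing

    exprs = [
        pl.col(c).cast(t, strict=False) if c in incoming.columns
        else pl.lit(None, dtype=t).alias(c)
        for c, t in target.schema.items()
    ]
    print(pl.concat([target, incoming.select(exprs)]))  # one schema, null where mz was absent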