msreport 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msreport/normalize.py ADDED
@@ -0,0 +1,496 @@
1
+ from __future__ import annotations
2
+ import abc
3
+ import itertools
4
+ from typing import Callable, Iterable, Optional
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ import statsmodels.nonparametric.smoothers_lowess
9
+
10
+ import msreport.helper
11
+ import msreport.helper.maxlfq as MAXLFQ
12
+ from msreport.errors import NotFittedError
13
+
14
+
15
class BaseSampleNormalizer(abc.ABC):
    """Base class for all sample normalizers."""

    @abc.abstractmethod
    def fit(self, table: pd.DataFrame) -> BaseSampleNormalizer:
        """Fits the normalizer on `table` and returns the instance itself."""
        ...

    @abc.abstractmethod
    def is_fitted(self) -> bool:
        """Returns True if the normalizer has been fitted."""
        ...

    @abc.abstractmethod
    def get_fits(self) -> dict:
        """Returns the fitted parameters; the mapping content depends on the subclass."""
        ...

    @abc.abstractmethod
    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Applies the fitted normalization to `table` and returns a new dataframe."""
        ...
33
+
34
+
35
class FixedValueNormalizer(BaseSampleNormalizer):
    """Normalization by a constant normalization factor for each sample.

    Expects log transformed intensity values.
    """

    def __init__(self, center_function: Callable, comparison: str):
        """Initializes the FixedValueNormalizer.

        Args:
            center_function: A function that accepts a sequence of values and
                returns a center value such as the median.
            comparison: Must be "paired" or "reference". When "paired" is specified
                the normalization values are first calculated for each column pair. Then
                an optimal normalization value for each column is calculated by solving
                a matrix of linear equations of the column pair values with least
                squares. When "reference" is selected, a pseudo-reference sample is
                generated by calculating the mean value for each row. Only rows with
                valid values in all columns are used. Normalization values are then
                calculated by comparing each column to the pseudo-reference sample.

        Raises:
            ValueError: If `comparison` is neither "paired" nor "reference".
        """
        if comparison not in ["paired", "reference"]:
            raise ValueError(
                f'"comparison" = {comparison} not allowed. '
                'Must be either "paired" or "reference".'
            )
        self._comparison_mode = comparison
        self._fit_function = center_function
        # Mapping of column name -> fitted center value; None until `fit` is called.
        self._sample_fits = None

    def fit(self, table: pd.DataFrame) -> BaseSampleNormalizer:
        """Fits the FixedValueNormalizer.

        Args:
            table: Dataframe used to calculate normalization values for each column.
                The normalization values are stored with the column names.

        Returns:
            Returns the instance itself.
        """
        if self._comparison_mode == "paired":
            self._fit_with_paired_samples(table)
        elif self._comparison_mode == "reference":
            self._fit_with_pseudo_reference(table)
        return self

    def is_fitted(self) -> bool:
        """Returns True if the FixedValueNormalizer has been fitted."""
        return self._sample_fits is not None

    def get_fits(self) -> dict[str, float]:
        """Returns a dictionary containing the fitted center values per sample.

        Raises:
            NotFittedError: If the FixedValueNormalizer has not been fitted yet.
        """
        confirm_is_fitted(self)
        return self._sample_fits.copy()

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Applies a fixed value normalization to each column of the table.

        Args:
            table: The data to normalize. Each column name must correspond to a column
                name from the table that was used for the fitting.

        Returns:
            Transformed dataframe.

        Raises:
            NotFittedError: If the FixedValueNormalizer has not been fitted yet.
        """
        confirm_is_fitted(self)

        normalized = table.copy()
        for sample in normalized.columns:
            values = np.array(normalized[sample], dtype=float)
            # Subtract the fitted center only from finite entries; NaN/inf are kept.
            finite = np.isfinite(values)
            values[finite] -= self._sample_fits[sample]
            normalized[sample] = values
        return normalized

    def _fit_with_paired_samples(self, table: pd.DataFrame) -> None:
        """Fits the FixedValueNormalizer by doing pair-wise column comparisons.

        Normalization values are first calculated for each column pair. Then an optimal
        normalization value for each column is calculated by solving a matrix of linear
        equations of the column pair values with least squares. The individual
        normalization values are stored in a dictionary with the column names as keys.

        Args:
            table: Dataframe used to calculate normalization values for each column.
        """
        column_names = table.columns.tolist()
        values = table.to_numpy()

        log_ratio_matrix = MAXLFQ._calculate_pairwise_centered_log_ratio_matrix(
            values, self._fit_function, log_transformed=True
        )
        coefficients, ratios, _ = MAXLFQ.prepare_coefficient_matrix(log_ratio_matrix)
        optimal_shifts = MAXLFQ.log_profiles_by_lstsq(coefficients, ratios)
        self._sample_fits = dict(zip(column_names, optimal_shifts))

    def _fit_with_pseudo_reference(self, table: pd.DataFrame) -> None:
        """Fits the FixedValueNormalizer by comparing columns to a pseudo-reference.

        First, a pseudo-reference sample is generated by calculating the mean value for
        each row. Only rows with valid values in all columns are used. Normalization
        values are then calculated by comparing each column to the pseudo-reference
        sample. The individual normalization values are stored in a dictionary with the
        column names as keys.

        Args:
            table: Dataframe used to calculate normalization values for each column.
        """
        # Only rows without any missing value contribute to the pseudo-reference.
        complete_rows = table.isna().sum(axis=1) == 0
        pseudo_reference = table[complete_rows].mean(axis=1)

        self._sample_fits = {
            sample: self._fit_function(
                table.loc[complete_rows, sample] - pseudo_reference
            )
            for sample in table.columns.tolist()
        }
160
+
161
+
162
class ValueDependentNormalizer(BaseSampleNormalizer):
    """Normalization with a value dependent fit for each sample.

    Expects log transformed intensity values.
    """

    def __init__(self, fit_function: Callable):
        """Initializes the ValueDependentNormalizer.

        Args:
            fit_function: A function that accepts two sequences of values with equal
                length, with the first sequence being the observed samples values and
                the second the reference values. The function must return a numpy array
                with two columns. The first column contains the values and the second
                column the fitted deviations.
        """
        # Mapping of column name -> fit array; None until `fit` is called.
        self._sample_fits = None
        self._fit_function = fit_function

    def fit(self, table: pd.DataFrame) -> BaseSampleNormalizer:
        """Fits the ValueDependentNormalizer.

        Args:
            table: Dataframe used to calculate normalization arrays for each column.

        Returns:
            Returns the instance itself.
        """
        self._fit_with_pseudo_reference(table)
        return self

    def is_fitted(self) -> bool:
        """Returns True if the ValueDependentNormalizer has been fitted."""
        return self._sample_fits is not None

    def get_fits(self) -> dict[str, np.ndarray]:
        """Returns a dictionary containing fitting data per sample.

        Returns:
            A dictionary mapping sample names to fitting data. Fitting data is a
            sequence of [intensity, deviation at this intensity] pairs, as returned
            by the fit function.

        Raises:
            NotFittedError: If the ValueDependentNormalizer has not been fitted yet.
        """
        confirm_is_fitted(self)
        return self._sample_fits.copy()

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Applies a value dependent normalization to each column of the table.

        Args:
            table: The data to normalize. Each column name must correspond to a column
                name from the table that was used for the fitting.

        Returns:
            Transformed dataframe.

        Raises:
            NotFittedError: If the ValueDependentNormalizer has not been fitted yet.
        """
        confirm_is_fitted(self)

        _table = table.copy()
        for column in _table.columns:
            column_data = np.array(_table[column], dtype=float)
            mask = np.isfinite(column_data)

            sample_fit = self._sample_fits[column]
            fit_values, fit_deviations = [np.array(i) for i in zip(*sample_fit)]
            # Interpolate the fitted deviation at each observed intensity and
            # subtract it; only finite entries are normalized.
            column_data[mask] = column_data[mask] - np.interp(
                column_data[mask], fit_values, fit_deviations
            )

            _table[column] = column_data
        return _table

    def _fit_with_pseudo_reference(self, table: pd.DataFrame) -> None:
        """Fits the ValueDependentNormalizer by comparing columns to a pseudo-reference.

        First, a pseudo-reference sample is generated by calculating the mean value for
        each row. Only rows with valid values in all columns are used. Normalization
        arrays are then calculated by comparing each column to the pseudo-reference
        sample. The individual normalization arrays are stored in a dictionary with the
        column names as keys.

        Args:
            table: Dataframe used to calculate normalization values for each column.
        """
        ref_mask = table.isna().sum(axis=1) == 0
        ref_values = table[ref_mask].mean(axis=1)
        samples = table.columns.tolist()

        self._sample_fits = {}
        for sample in samples:
            sample_values = table.loc[ref_mask, sample]
            sample_fit = self._fit_function(sample_values, ref_values)
            self._sample_fits[sample] = sample_fit
260
+
261
+
262
class MedianNormalizer(FixedValueNormalizer):
    """A FixedValueNormalizer that uses the median as the fitting function.

    Use MedianNormalizer.fit(table: pd.DataFrame) to fit the normalizer, and then
    MedianNormalizer.transform(table: pd.DataFrame) with the fitted normalizer to apply
    the normalization.
    """

    def __init__(self):
        """Initializes the MedianNormalizer."""
        # Zero-argument super() replaces the legacy two-argument form.
        super().__init__(center_function=np.median, comparison="paired")
275
+
276
+
277
class ModeNormalizer(FixedValueNormalizer):
    """A FixedValueNormalizer that uses the mode as the fitting function.

    Use ModeNormalizer.fit(table: pd.DataFrame) to fit the normalizer, and then
    ModeNormalizer.transform(table: pd.DataFrame) with the fitted normalizer to apply
    the normalization.
    """

    def __init__(self):
        """Initializes the ModeNormalizer."""
        # Zero-argument super() replaces the legacy two-argument form.
        super().__init__(center_function=msreport.helper.mode, comparison="paired")
290
+
291
+
292
class LowessNormalizer(ValueDependentNormalizer):
    """A ValueDependentNormalizer that uses lowess as the fitting function.

    Use LowessNormalizer.fit(table: pd.DataFrame) to fit the normalizer, and then
    LowessNormalizer.transform(table: pd.DataFrame) with the fitted normalizer to apply
    the normalization.
    """

    def __init__(self):
        """Initializes the LowessNormalizer."""
        # Zero-argument super() replaces the legacy two-argument form.
        super().__init__(fit_function=_value_dependent_fit_lowess)
303
+
304
+
305
class CategoricalNormalizer:
    """Normalize samples based on category-dependent reference values.

    Values from the reference table are used for normalization of the corresponding
    categories in the table that will be transformed. The normalization is applied to
    each column of the input table based on the category of each row.

    The reference table must not contain NaN values and values in the sample columns
    must be log-transformed. The table to be transformed must contain the same
    `category_column` as the reference table and only include sample columns that were
    used for fitting. Values from categories not present in the reference table will be
    set to NaN. The table sample columns must also be log-transformed.
    """

    def __init__(self, category_column: str):
        """Initializes a new instance of the CategoricalNormalizer class.

        Args:
            category_column: The name of the column containing the categories. This
                column must be present in the reference table and the table to be
                transformed.
        """
        # Reference table indexed by category; None until `fit` is called.
        self._fitted_table = None
        self._category_column = category_column

    def is_fitted(self) -> bool:
        """Returns True if the CategoricalNormalizer has been fitted."""
        return self._fitted_table is not None

    def fit(self, reference_table: pd.DataFrame) -> BaseSampleNormalizer:
        """Fits the CategoricalNormalizer to a reference table.

        Args:
            reference_table: The reference table used for fitting.

        Returns:
            Returns the instance itself.

        Raises:
            ValueError: If the reference table contains NaN values.
        """
        if reference_table.isna().values.any():
            raise ValueError("Input table contains NaN values")
        reference_table = reference_table.set_index(self.get_category_column())
        self._fitted_table = reference_table
        return self

    def get_fits(self) -> pd.DataFrame:
        """Returns a copy of the reference table used for fitting.

        Raises:
            NotFittedError: If the CategoricalNormalizer has not been fitted yet.
        """
        confirm_is_fitted(self)
        return self._fitted_table.copy()

    def get_category_column(self) -> str:
        """Returns the name of the category column."""
        return self._category_column

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Applies a category dependent normalization to the table.

        Args:
            table: The table to normalize.

        Returns:
            The normalized table.

        Raises:
            KeyError: If the input table contains columns not present in the reference
                table.
            NotFittedError: If the CategoricalNormalizer has not been fitted yet.
        """
        confirm_is_fitted(self)

        original_index = table.index
        table = table.set_index(self.get_category_column(), drop=True, inplace=False)

        # Compare explicitly against the reference table's columns. (Passing the
        # DataFrame itself to `isin` only worked because iterating a DataFrame
        # yields its column labels.)
        if not table.columns.isin(self._fitted_table.columns).all():
            raise KeyError("The `table` contains columns not present in the fits")

        valid_categories = table.index.isin(self._fitted_table.index)
        sub_table = table[valid_categories]
        values_for_fitting = self._fitted_table.loc[sub_table.index, sub_table.columns]

        # Rows with unknown categories are set to NaN; known categories are
        # normalized by subtracting the category-specific reference values.
        transformed_table = table.copy()
        transformed_table[~valid_categories] = np.nan
        transformed_table[valid_categories] = sub_table.sub(values_for_fitting, axis=1)

        # Restore the category column and the caller's original row index.
        transformed_table.reset_index(inplace=True)
        transformed_table.index = original_index
        return transformed_table
398
+
399
+
400
class ZscoreScaler(BaseSampleNormalizer):
    """Normalize samples by z-score scaling."""

    def __init__(self, with_mean: bool = True, with_std: bool = True):
        """Initializes a new instance of the ZscoreScaler class.

        Args:
            with_mean: If True, center row values by subtracting the row mean.
            with_std: If True, scale row values by dividing by the row std.
        """
        self._with_mean = with_mean
        self._with_std = with_std

    def fit(self, table: pd.DataFrame) -> BaseSampleNormalizer:
        """Returns the instance itself."""
        return self

    def is_fitted(self) -> bool:
        """Always returns True because the ZscoreScaler does not need to be fitted."""
        return True

    def get_fits(self) -> dict:
        """Returns a dictionary containing the parameters 'with_mean' and 'with_std'."""
        return {"with_mean": self._with_mean, "with_std": self._with_std}

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Applies a z-score normalization to each column of the table.

        Args:
            table: The table used to scale row values.

        Returns:
            A copy of the table containing the scaled values.
        """
        result = table.copy()
        if self._with_mean:
            row_means = result.mean(axis=1)
            result = result.sub(row_means, axis=0)
        if self._with_std:
            # ddof=0 uses the population standard deviation.
            row_stds = result.std(axis=1, ddof=0)
            result = result.div(row_stds, axis=0)
        return result
440
+
441
+
442
def confirm_is_fitted(
    normalizer: BaseSampleNormalizer, msg: Optional[str] = None
) -> None:
    """Perform is_fitted validation for normalizer instances.

    Checks if the normalizer is fitted by verifying the presence of fitted attributes
    and otherwise raises a NotFittedError with the given message.

    Args:
        normalizer: The normalizer instance to validate. Must provide an
            ``is_fitted`` method.
        msg : str, default=None
            The default error message is, "This %(name) instance is not fitted
            yet. Call 'fit' with appropriate arguments before using this
            normalizer."

    Raises:
        TypeError: If `normalizer` does not provide an ``is_fitted`` method.
        NotFittedError: If the normalizer reports that it is not fitted.
    """
    if msg is None:
        msg = (
            "This %(name)s instance is not fitted yet. Call 'fit' with "
            "appropriate arguments before using this normalizer."
        )

    if not hasattr(normalizer, "is_fitted"):
        # Fixed grammar in the error message ("an normalizer" -> "a normalizer").
        raise TypeError(f"{normalizer} is not a normalizer instance.")

    if not normalizer.is_fitted():
        raise NotFittedError(msg % {"name": type(normalizer).__name__})
469
+
470
+
471
def _value_dependent_fit_lowess(
    values: np.ndarray,
    reference_values: np.ndarray,
    delta_span_percentage: float = 0.05,
    iterations: int = 5,
) -> np.ndarray:
    """Calculates estimated deviations between values and reference_values using lowess.

    The deviations ('values' - 'reference_values') are smoothed as a function of
    'values', i.e. the deviations are the y-values and 'values' the x-values of the
    lowess regression.

    Args:
        values: The x-values of the observed points.
        reference_values: Used to calculate the y-values of the observed points, as
            'values' - 'reference_values'.
        delta_span_percentage: Distance within which to use linear-interpolation
            instead of weighted regression, as a percentage of the data span.
        iterations: The number of residual-based reweightings to perform

    Returns:
        A numpy array with two columns. The first column contains the sorted 'values'
        and the second column the associated estimated deviation values from the
        reference.
    """
    # NOTE(review): 'delta' is derived from the span of 'reference_values', while the
    # x-values passed to lowess are 'values' — presumably both spans are comparable
    # for normalization data; confirm this is intentional.
    delta = (reference_values.max() - reference_values.min()) * delta_span_percentage
    deviations = values - reference_values
    # statsmodels lowess signature is lowess(endog, exog, ...): deviations are the
    # dependent (y) variable, values the independent (x) variable.
    return statsmodels.nonparametric.smoothers_lowess.lowess(
        deviations, values, delta=delta, it=iterations
    )