PyPI - msreport - Versions diffs - 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl - Mend

msreport 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

msreport/__init__.py +1 -1
msreport/aggregate/__init__.py +10 -0
msreport/aggregate/condense.py +9 -0
msreport/aggregate/pivot.py +14 -5
msreport/aggregate/summarize.py +14 -4
msreport/analyze.py +67 -5
msreport/export.py +9 -15
msreport/fasta.py +9 -2
msreport/helper/__init__.py +18 -0
msreport/impute.py +18 -10
msreport/isobar.py +11 -14
msreport/normalize.py +95 -10
msreport/peptidoform.py +21 -11
msreport/plot/__init__.py +3 -3
msreport/plot/distribution.py +2 -1
msreport/plot/quality.py +1 -1
msreport/qtable.py +44 -20
msreport/reader.py +321 -40
msreport/rinterface/limma.py +1 -1
{msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/METADATA +20 -2
msreport-0.0.31.dist-info/RECORD +38 -0
{msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/WHEEL +1 -1
msreport-0.0.29.dist-info/RECORD +0 -38
{msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/licenses/LICENSE.txt +0 -0
{msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/top_level.txt +0 -0

msreport/__init__.py CHANGED Viewed

@@ -8,4 +8,4 @@ from msreport.fasta import import_protein_database
 from msreport.qtable import Qtable
 from msreport.reader import FragPipeReader, MaxQuantReader, SpectronautReader
-__version__ = "0.0.29"
+__version__ = "0.0.31"

msreport/aggregate/__init__.py CHANGED Viewed

@@ -0,0 +1,10 @@
+"""A comprehensive set of tools for aggregating and reshaping tabular proteomics data.
+The `aggregate` module contains submodules that offer functionalities to transform
+data from lower levels of abstraction (e.g. ions, peptides) to higher levels (e.g.
+peptides, proteins, PTMs) through various summarization and condensation techniques.
+It also includes methods for reshaping tables from "long" to "wide" format, a common
+prerequisite for aggregation. The MaxLFQ algorithm is integrated for specific
+quantitative summarizations, enabling users to build customized, higher-level data
+tables.
+"""

msreport/aggregate/condense.py CHANGED Viewed

@@ -1,3 +1,12 @@
+"""Low-level functions for aggregating numerical and string data.
+This module defines fundamental "condenser" functions that operate directly on NumPy
+arrays. These functions are designed to be applied to groups of data, performing
+operations such as summing values, finding maximum/minimum, counting or joining unique
+elements, and calculating abundance profiles. It includes the core implementations for
+MaxLFQ summation.
+"""
 import numpy as np
 import msreport.helper.maxlfq as MAXLFQ

msreport/aggregate/pivot.py CHANGED Viewed

@@ -1,4 +1,12 @@
-from typing import Iterable, Union
+"""Functionalities for reshaping tabular quantitative proteomics data.
+This module offers methods to transform data from a "long" format into a "wide" format,
+which is a common and often necessary step before aggregation or analysis. It supports
+pivoting data based on specified index and grouping columns, and can handle both
+quantitative values and annotation columns.
+"""
+from typing import Iterable
 import pandas as pd
@@ -12,11 +20,12 @@ def pivot_table(
     group_by: str,
     annotation_columns: Iterable[str],
     pivoting_columns: Iterable[str],
-):
+) -> pd.DataFrame:
     """Generates a pivoted table in wide format.
     Args:
-        table: Dataframe in long format that is used to generate a table in wide format.
+        long_table: Dataframe in long format that is used to generate a table in wide
+            format.
         index: One or multiple column names that are used to group the table for
             pivoting.
         group_by: Column that is used to split the table on its unique entries.
@@ -58,7 +67,7 @@ def pivot_table(
 def pivot_column(
-    table: pd.DataFrame, index: Union[str, Iterable], group_by: str, values: str
+    table: pd.DataFrame, index: str | Iterable[str], group_by: str, values: str
 ) -> pd.DataFrame:
     """Returns a reshaped dataframe, generated by pivoting the table on one column.
@@ -98,7 +107,7 @@ def pivot_column(
 def join_unique(
-    table: pd.DataFrame, index: Union[str, Iterable], values: str
+    table: pd.DataFrame, index: str | Iterable[str], values: str
 ) -> pd.DataFrame:
     """Returns a new dataframe with unique values from a column and grouped by 'index'.

msreport/aggregate/summarize.py CHANGED Viewed

@@ -1,4 +1,14 @@
-from typing import Callable, Iterable, Optional, Union
+"""High-level functions for aggregating quantitative proteomics data.
+This module offers functions to summarize data from a lower level of abstraction (e.g.
+ions, peptides) to a higher level (e.g., peptides, proteins, PTMs). It operates directly
+on pandas DataFrames, allowing users to specify a grouping column and the columns to be
+summarized. These functions often leverage low-level condenser operations defined in
+`msreport.aggregate.condense`. It includes specific functions for MaxLFQ summation, as
+well as general counting, joining, and summing of columns.
+"""
+from typing import Callable, Iterable, Optional
 import numpy as np
 import pandas as pd
@@ -10,7 +20,7 @@ from msreport.helper import find_sample_columns
 def count_unique(
     table: pd.DataFrame,
     group_by: str,
-    input_column: Union[str, Iterable],
+    input_column: str | Iterable[str],
     output_column: str = "Unique counts",
     is_sorted: bool = False,
 ) -> pd.DataFrame:
@@ -55,7 +65,7 @@ def count_unique(
 def join_unique(
     table: pd.DataFrame,
     group_by: str,
-    input_column: Union[str, Iterable],
+    input_column: str | Iterable[str],
     output_column: str = "Unique values",
     sep: str = ";",
     is_sorted: bool = False,
@@ -215,7 +225,7 @@ def sum_columns_maxlfq(
 def aggregate_unique_groups(
     table: pd.DataFrame,
     group_by: str,
-    columns_to_aggregate: Union[str, Iterable],
+    columns_to_aggregate: str | Iterable[str],
     condenser: Callable,
     is_sorted: bool,
 ) -> tuple[np.ndarray, np.ndarray]:

msreport/analyze.py CHANGED Viewed

@@ -1,12 +1,16 @@
-"""The analyze module contains methods for analysing quantification results."""
+"""Tools for post-processing and statistical analysis of `Qtable` data.
-from __future__ import annotations
+All functions in this module take a `Qtable` object and modify its data in place. The
+module provides functionality for data evaluation, normalization, imputation of missing
+values, and statistical testing, including integration with R's LIMMA package.
+"""
 import warnings
 from typing import Iterable, Optional, Protocol, Sequence
 import numpy as np
 import pandas as pd
+from typing_extensions import Self
 import msreport.normalize
 from msreport.errors import OptionalDependencyError
@@ -24,7 +28,7 @@ except OptionalDependencyError as err:
 class Transformer(Protocol):
-    def fit(self, table: pd.DataFrame) -> Transformer:
+    def fit(self, table: pd.DataFrame) -> Self:
         """Fits the Transformer and returns a fitted Transformer instance."""
     def is_fitted(self) -> bool:
@@ -35,7 +39,7 @@ class Transformer(Protocol):
 class CategoryTransformer(Protocol):
-    def fit(self, table: pd.DataFrame) -> Transformer:
+    def fit(self, table: pd.DataFrame) -> Self:
         """Fits the Transformer and returns a fitted Transformer instance."""
     def is_fitted(self) -> bool:
@@ -162,7 +166,7 @@ def validate_proteins(
 def apply_transformer(
-    qtable: msreport.Qtable,
+    qtable: Qtable,
     transformer: Transformer,
     tag: str,
     exclude_invalid: bool,
@@ -205,6 +209,64 @@ def apply_transformer(
     qtable.data[data_table.columns] = data_table
+def apply_category_transformer(
+    qtable: Qtable,
+    transformer: CategoryTransformer,
+    tag: str,
+    exclude_invalid: bool,
+    remove_invalid: bool,
+    new_tag: Optional[str] = None,
+) -> None:
+    """Apply a category transformer to Qtable columns selected by tag.
+    Args:
+        qtable: A Qtable instance, to which the transformer is applied.
+        transformer: The CategoryTransformer to apply.
+        tag: The tag used to identify the columns for applying the transformer.
+        exclude_invalid: Exclude invalid values from the transformation.
+        remove_invalid: Remove invalid values from the table after the transformation.
+        new_tag: Optional, if specified then the tag is replaced with this value in the
+            column names and the transformed data is stored to these new columns.
+    Raises:
+        KeyError: If the category column of the `transformer` is not found in the
+            `qtable.data`.
+        ValueError: If no sample columns are found for the specified tag.
+    """
+    category_column = transformer.get_category_column()
+    if category_column not in qtable.data.columns:
+        raise KeyError(
+            f'The category column "{category_column}" in the transformer '
+            f"is not found in `qtable.data`."
+        )
+    valid = qtable.data["Valid"]
+    samples = qtable.get_samples()
+    sample_columns = find_sample_columns(qtable.data, tag, samples)
+    if not sample_columns:
+        raise ValueError(f"No sample columns found for tag '{tag}'.")
+    if new_tag is not None:
+        sample_columns = [c.replace(tag, new_tag) for c in sample_columns]
+    column_mapping = dict(zip(samples, sample_columns))
+    data_table = qtable.make_sample_table(tag, samples_as_columns=True)
+    data_table[category_column] = qtable.data[category_column]
+    if exclude_invalid:
+        data_table.loc[valid, :] = transformer.transform(data_table.loc[valid, :])
+    else:
+        data_table = transformer.transform(data_table)
+    data_table = data_table.drop(columns=[category_column])
+    if remove_invalid:
+        data_table[~valid] = np.nan
+    data_table.columns = [column_mapping[s] for s in data_table.columns]
+    qtable.data[data_table.columns] = data_table
 def normalize_expression(
     qtable: Qtable,
     normalizer: Transformer,

msreport/export.py CHANGED Viewed

@@ -1,19 +1,13 @@
-"""
-Columns that are not yet present in the amica output at the moment:
-Index([
-    'Protein Probability',
-    'Top Peptide Probability',
-    'Total peptides',
-    'Leading proteins',
-    'Protein entry name',
-    'Fasta header',
-    'Protein length',
-    'iBAQ peptides',
-    'Sequence coverage',
-], dtype='object')
+"""Exporting of proteomics data from `Qtable` into external formats.
+This module offers functionalities to convert and save `Qtable` data into files
+compatible with external tools (Amica and Perseus), and creating sequence coverage maps
+in HTML format. While most functions operate on `Qtable` instances, some may accept
+other data structures.
 """
 import os
+import pathlib
 import warnings
 from collections import defaultdict as ddict
 from typing import Iterable, Optional, Protocol, Sequence
@@ -99,7 +93,7 @@ def contaminants_to_clipboard(qtable: Qtable) -> None:
 def to_perseus_matrix(
     qtable: Qtable,
-    directory,
+    directory: str | pathlib.Path,
     table_name: str = "perseus_matrix.tsv",
 ) -> None:
     """Exports a qtable to a perseus matrix file in tsv format.
@@ -151,7 +145,7 @@ def to_perseus_matrix(
 def to_amica(
     qtable: Qtable,
-    directory,
+    directory: str | pathlib.Path,
     table_name: str = "amica_table.tsv",
     design_name: str = "amica_design.tsv",
 ) -> None:

msreport/fasta.py CHANGED Viewed

@@ -1,11 +1,18 @@
+"""Functionalities for import and access to protein sequence databases from FASTA files.
+This module serves as an interface to the `profasta` library, offering a convenient way
+to generate a `profasta.db.ProteinDatabase` from one or multiple FASTA files. It
+supports custom FASTA header parsing through a configurable header parser.
+"""
 import pathlib
-from typing import Iterable, Union
+from typing import Iterable
 from profasta.db import ProteinDatabase
 def import_protein_database(
-    fasta_path: Union[str, pathlib.Path, Iterable[Union[str, pathlib.Path]]],
+    fasta_path: str | pathlib.Path | Iterable[str | pathlib.Path],
     header_parser: str = "uniprot",
 ) -> ProteinDatabase:
     """Generates a protein database from one or a list of fasta files.

msreport/helper/__init__.py CHANGED Viewed

@@ -1,3 +1,9 @@
+"""A collection of widely used helper and utility functions.
+This module re-exports commonly used functions from various `msreport.helper`
+submodules for convenience.
+"""
 from .calc import (
     calculate_monoisotopic_mass,
     calculate_sequence_coverage,
@@ -21,3 +27,15 @@ from .temp import (
     extract_modifications,
     modify_peptide,
 )
+__all__ = [
+    "apply_intensity_cutoff",
+    "find_columns",
+    "find_sample_columns",
+    "guess_design",
+    "intensities_in_logspace",
+    "keep_rows_by_partial_match",
+    "remove_rows_by_partial_match",
+    "rename_mq_reporter_channels",
+    "rename_sample_columns",
+]

msreport/impute.py CHANGED Viewed

@@ -1,9 +1,17 @@
-from __future__ import annotations
+"""Transformer classes for imputing missing values in quantitative proteomics data.
+This module defines transformer classes that can be fitted to a table containing
+quantitative values to learn imputation parameters. Once fitted, these transformers can
+then be applied to another table to transform it by filling in missing values. The
+transformation returns a new copy of the table with the imputed values, leaving the
+original table unchanged.
+"""
 from typing import Any, Optional
 import numpy as np
 import pandas as pd
+from typing_extensions import Self
 from msreport.errors import NotFittedError
@@ -42,7 +50,7 @@ class FixedValueImputer:
         self.column_wise = column_wise
         self._sample_fill_values: dict[str, float] = {}
-    def fit(self, table: pd.DataFrame) -> FixedValueImputer:
+    def fit(self, table: pd.DataFrame) -> Self:
         """Fits the FixedValueImputer.
         Args:
@@ -79,7 +87,7 @@ class FixedValueImputer:
         Returns:
             'table' with imputed missing values.
         """
-        confirm_is_fitted(self)
+        _confirm_is_fitted(self)
         _table = table.copy()
         for column in _table.columns:
@@ -108,7 +116,7 @@ class GaussianImputer:
         self.sigma = sigma
         self.seed = seed
-    def fit(self, table: pd.DataFrame) -> GaussianImputer:
+    def fit(self, table: pd.DataFrame) -> Self:
         """Fits the GaussianImputer, although this is not necessary.
         Args:
@@ -134,7 +142,7 @@ class GaussianImputer:
         Returns:
             'table' with imputed missing values.
         """
-        confirm_is_fitted(self)
+        _confirm_is_fitted(self)
         np.random.seed(self.seed)
         _table = table.copy()
@@ -182,9 +190,9 @@ class PerseusImputer:
         self.std_width = std_width
         self.column_wise = column_wise
         self.seed = seed
-        self._column_params: dict[str, dict] = {}
+        self._column_params: dict[str, dict[str, float]] = {}
-    def fit(self, table: pd.DataFrame) -> PerseusImputer:
+    def fit(self, table: pd.DataFrame) -> Self:
         """Fits the PerseusImputer.
         Args:
@@ -223,7 +231,7 @@ class PerseusImputer:
         Returns:
             'table' with imputed missing values.
         """
-        confirm_is_fitted(self)
+        _confirm_is_fitted(self)
         np.random.seed(self.seed)
         _table = table.copy()
@@ -239,7 +247,7 @@ class PerseusImputer:
         return _table
-def confirm_is_fitted(imputer: Any, msg: Optional[str] = None) -> None:
+def _confirm_is_fitted(imputer: Any, msg: Optional[str] = None) -> None:
     """Perform is_fitted validation for imputer instances.
     Checks if the imputer is fitted by verifying the presence of fitted attributes
@@ -266,7 +274,7 @@ def confirm_is_fitted(imputer: Any, msg: Optional[str] = None) -> None:
         raise NotFittedError(msg % {"name": type(imputer).__name__})
-def _calculate_integer_below_min(table) -> int:
+def _calculate_integer_below_min(table: pd.DataFrame) -> int:
     minimal_value = np.nanmin(table.to_numpy().flatten())
     below_minimal = np.floor(minimal_value)
     if minimal_value <= below_minimal:

msreport/isobar.py CHANGED Viewed

@@ -1,34 +1,31 @@
-from __future__ import annotations
+"""Provides a transformer class for processing isobarically labeled proteomics data.
+This module defines the `IsotopeImpurityCorrecter` class for processing of isobaric
+(e.g., TMT, iTRAQ) reporter intensities. This transformer must be fitted with an isotope
+impurity matrix to correct interference in reporter intensities. Once fitted, the
+transformer can then be applied to a table containing reporter ion intensities to adjust
+its intensity values. The transformation returns a new copy of the table with the
+processed values, leaving the original table unchanged.
+"""
 import functools
-from typing import Protocol
 import numpy as np
 import pandas as pd
 import scipy
+from typing_extensions import Self
 import msreport.helper
 from msreport.errors import NotFittedError
-class Transformer(Protocol):
-    def fit(self, table: pd.DataFrame) -> Transformer:
-        """Fits the Transformer and returns a fitted Transformer instance."""
-    def is_fitted(self) -> bool:
-        """Returns True if the Transformer has been fitted."""
-    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
-        """Transform values in 'table'."""
 class IsotopeImpurityCorrecter:
     """Corrects isotope impurity interference in isobaric reporter expression values."""
     def __init__(self):
         self._impurity_matrix = None
-    def fit(self, impurity_matrix: np.ndarray) -> IsotopeImpurityCorrecter:
+    def fit(self, impurity_matrix: np.ndarray) -> Self:
         """Fits the isotope impurity correcter to a given impurity matrix.
         Args:

msreport/normalize.py CHANGED Viewed

@@ -1,4 +1,16 @@
-from __future__ import annotations
+"""Transformer classes for normalizing and transforming quantitative proteomics data.
+This module defines various transformer classes for normalizing and scaling quantitative
+values in tabular data. Examples include normalizers like median, mode, and LOWESS, as
+well as scalers such as PercentageScaler and ZScoreScaler. A specialized
+`CategoricalNormalizer` is also provided, which, when appropriately fitted and applied,
+can be used for complex transformations such as iBAQ or site-to-protein normalization.
+These transformers can be fitted to a table containing quantitative values to learn
+parameters. Once fitted, they can then be applied to another table to adjust its values.
+The transformation returns a new copy of the table with the normalized/scaled values,
+leaving the original table unchanged.
+"""
 from typing import Callable, Iterable, Optional, Protocol
@@ -79,7 +91,7 @@ class FixedValueNormalizer:
         Raises:
             NotFittedError: If the FixedValueNormalizer has not been fitted yet.
         """
-        confirm_is_fitted(self)
+        _confirm_is_fitted(self)
         return self._sample_fits.copy()
     def transform(self, table: pd.DataFrame) -> pd.DataFrame:
@@ -95,7 +107,7 @@ class FixedValueNormalizer:
         Raises:
             NotFittedError: If the FixedValueNormalizer has not been fitted yet.
         """
-        confirm_is_fitted(self)
+        _confirm_is_fitted(self)
         _table = table.copy()
         for column in _table.columns:
@@ -195,7 +207,7 @@ class ValueDependentNormalizer:
         Raises:
             NotFittedError: If the ValueDependentNormalizer has not been fitted yet.
         """
-        confirm_is_fitted(self)
+        _confirm_is_fitted(self)
         return self._sample_fits.copy()
     def transform(self, table: pd.DataFrame) -> pd.DataFrame:
@@ -211,7 +223,7 @@ class ValueDependentNormalizer:
         Raises:
             NotFittedError: If the ValueDependentNormalizer has not been fitted yet.
         """
-        confirm_is_fitted(self)
+        _confirm_is_fitted(self)
         _table = table.copy()
         for column in _table.columns:
@@ -250,6 +262,59 @@ class ValueDependentNormalizer:
             self._sample_fits[sample] = sample_fit
+class SumNormalizer:
+    """Normalizer that uses the sum of all values in each sample for normalization.
+    Expects log2-transformed intensity values. To obtain normalization factors, the sum
+    of non-log2-transformed values is calculated for each sample, then divided by the
+    average of all sample sums and log2-transformed.
+    """
+    def __init__(self):
+        """Initializes the SumNormalizer."""
+        self._sample_fits: dict[str, float] = {}
+    def fit(self, table: pd.DataFrame) -> Self:
+        """Fits the SumNormalizer and returns a fitted instance.
+        Args:
+            table: Dataframe used to calculate normalization values for each column.
+        Returns:
+            Returns the instance itself.
+        """
+        _sums = np.power(2, table).sum()
+        _log2_fits = np.log2(_sums.divide(_sums.mean()))
+        self._sample_fits = _log2_fits.to_dict()
+        return self
+    def is_fitted(self) -> bool:
+        """Returns True if the Transformer has been fitted."""
+        return True if self._sample_fits else False
+    def get_fits(self) -> dict[str, float]:
+        """Returns a dictionary containing the fitted center values per sample.
+        Raises:
+            NotFittedError: If the SumNormalizer has not been fitted yet.
+        """
+        _confirm_is_fitted(self)
+        return self._sample_fits.copy()
+    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
+        """Transform values in table."""
+        _confirm_is_fitted(self)
+        _table = table.copy()
+        for column in _table.columns:
+            column_data = np.array(_table[column], dtype=float)
+            mask = np.isfinite(column_data)
+            column_data[mask] = column_data[mask] - self._sample_fits[column]
+            _table[column] = column_data
+        return _table
 class MedianNormalizer(FixedValueNormalizer):
     """A FixedValueNormalizer that uses the median as the fitting function.
@@ -346,7 +411,7 @@ class CategoricalNormalizer:
         Raises:
             NotFittedError: If the CategoricalNormalizer has not been fitted yet.
         """
-        confirm_is_fitted(self)
+        _confirm_is_fitted(self)
         return self._fitted_table.copy()
     def get_category_column(self) -> str:
@@ -367,7 +432,7 @@ class CategoricalNormalizer:
                 table.
             NotFittedError: If the CategoricalNormalizer has not been fitted yet.
         """
-        confirm_is_fitted(self)
+        _confirm_is_fitted(self)
         original_index = table.index
         table = table.set_index(self.get_category_column(), drop=True, inplace=False)
@@ -396,11 +461,11 @@ class PercentageScaler:
         return self
     def is_fitted(self) -> bool:
-        """Always returns True because the ZscoreScaler does not need to be fitted."""
+        """Always returns True because the Scaler does not need to be fitted."""
         return True
     def get_fits(self) -> dict:
-        """Returns a dictionary containing the parameters 'with_mean' and 'with_std'."""
+        """Returns an empty dictionary."""
         return {}
     def transform(self, table: pd.DataFrame) -> pd.DataFrame:
@@ -457,7 +522,27 @@ class ZscoreScaler:
         return scaled_table
-def confirm_is_fitted(
+class Log2Transformer:
+    """Apply log2 transformation to column values."""
+    def fit(self, table: pd.DataFrame) -> Self:
+        """Returns the instance itself."""
+        return self
+    def is_fitted(self) -> bool:
+        """Returns True if the transformer is fitted."""
+        return True
+    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
+        """Applies a log2 transformation to each column of the table.
+        Zero values are replaced with NaN before the transformation to avoid an error
+        during the log2 calculation.
+        """
+        return pd.DataFrame(np.log2(table.replace({0: np.nan})))
+def _confirm_is_fitted(
     normalizer: AbstractTransformer, msg: Optional[str] = None
 ) -> None:
     """Perform is_fitted validation for normalizer instances.

msreport 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl

msreport 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl