PyPI - msreport - Versions diffs - 0.0.31__tar.gz → 0.0.32__tar.gz - Mend

msreport 0.0.31tar.gz → 0.0.32tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

{msreport-0.0.31 → msreport-0.0.32}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: msreport
-Version: 0.0.31
+Version: 0.0.32
 Summary: Post processing and analysis of quantitative proteomics data
 Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
 License-Expression: Apache-2.0

{msreport-0.0.31 → msreport-0.0.32}/msreport/__init__.py RENAMED Viewed

@@ -8,4 +8,4 @@ from msreport.fasta import import_protein_database
 from msreport.qtable import Qtable
 from msreport.reader import FragPipeReader, MaxQuantReader, SpectronautReader
-__version__ = "0.0.31"
+__version__ = "0.0.32"

{msreport-0.0.31 → msreport-0.0.32}/msreport/export.py RENAMED Viewed

@@ -502,7 +502,7 @@ def _find_covered_region_boundaries(
     Examples:
         >>> coverage_mask = [True, True, False, False, True]
         >>> _find_covered_region_boundaries(coverage_mask)
-        ... [(0, 1), (4, 4)]
+        [(0, 1), (4, 4)]
     """
     start = []
     stop = []

{msreport-0.0.31 → msreport-0.0.32}/msreport/helper/maxlfq.py RENAMED Viewed

@@ -113,9 +113,9 @@ def calculate_pairwise_mode_log_ratio_matrix(
         ...     ]
         ... )
         >>> calculate_pairwise_mode_log_ratio_matrix(array)
-        array([[ 0.       , -0.0849625, -1.       ],
-               [ 0.0849625,  0.       , -1.       ],
-               [ 1.       ,  1.       ,  0.       ]])
+        array([[ 0.        , -0.08496251, -1.       ],
+               [ 0.08496251,  0.       , -1.       ],
+               [ 1.        ,  1.       ,  0.       ]])
     """
     ratio_marix = _calculate_pairwise_centered_log_ratio_matrix(
         array, msreport.helper.mode, log_transformed=log_transformed

{msreport-0.0.31 → msreport-0.0.32}/msreport/plot/comparison.py RENAMED Viewed

@@ -77,10 +77,15 @@ def volcano_ma(
         )
         special_entries = list(special_entries) + list(special_proteins)
-    data = qtable.get_data(exclude_invalid=exclude_invalid)
-    if annotation_column not in data.columns:
+    if annotation_column not in qtable.data.columns:
         annotation_column = qtable.id_column
+    data = qtable.get_data(exclude_invalid=exclude_invalid)
+    mask = np.ones(data.shape[0], dtype=bool)
+    for tag in [ratio_tag, expression_tag, pvalue_tag]:
+        mask = mask & np.isfinite(data[f"{tag} {comparison_group}"])
+    data = data[mask]
     scatter_size = 2 / (max(min(data.shape[0], 10000), 1000) / 1000)
     masks = {

{msreport-0.0.31 → msreport-0.0.32}/msreport/plot/multivariate.py RENAMED Viewed

@@ -21,6 +21,7 @@ def sample_pca(
     pc_x: str = "PC1",
     pc_y: str = "PC2",
     exclude_invalid: bool = True,
+    exclude_missing: bool = False,
 ) -> tuple[plt.Figure, list[plt.Axes]]:
     """Figure to compare sample similarities with a principal component analysis.
@@ -44,11 +45,14 @@ def sample_pca(
             samples.
         exclude_invalid: If True, rows are filtered according to the Boolean entries of
             the "Valid" column.
+        exclude_missing: If True, only rows without any missing values are used.
     Returns:
         A matplotlib Figure and a list of Axes objects, containing the PCA plots.
     """
     design = qtable.get_design()
+    samples = qtable.get_samples()
     if design.shape[0] < 3:
         fig, ax = plt.subplots(1, 1, figsize=(2, 1.3))
         fig.suptitle(f'PCA of "{tag}" values', y=1.1)
@@ -65,13 +69,22 @@ def sample_pca(
         return fig, np.array([ax])
     table = qtable.make_sample_table(
-        tag, samples_as_columns=True, exclude_invalid=exclude_invalid
+        tag, samples_as_columns=True, exclude_invalid=False
     )
+    inclusion_mask = np.ones(qtable.data.shape[0], dtype=bool)
+    if exclude_invalid:
+        inclusion_mask = inclusion_mask & qtable["Valid"]
+    if exclude_missing:
+        _non_missing_masks = [(qtable[f"Missing {s}"] == 0) for s in samples]
+        inclusion_mask = inclusion_mask & (np.all(_non_missing_masks, axis=0))
+    table = table[inclusion_mask]
     table = table.replace({0: np.nan})
     table = table[np.isfinite(table).sum(axis=1) > 0]
     if not msreport.helper.intensities_in_logspace(table):
         table = np.log2(table)
-    table[table.isna()] = 0
+    table = table.fillna(0)
     table = table.transpose()
     sample_index = table.index.tolist()
@@ -203,6 +216,7 @@ def sample_pca(
 def expression_clustermap(
     qtable: Qtable,
     exclude_invalid: bool = True,
+    exclude_missing: bool = False,
     remove_imputation: bool = True,
     mean_center: bool = False,
     cluster_samples: bool = True,
@@ -218,6 +232,7 @@ def expression_clustermap(
         qtable: A `Qtable` instance, whose data is used for plotting.
         exclude_invalid: If True, rows are filtered according to the Boolean entries of
             the "Valid" column.
+        exclude_missing: If True, only rows without any missing values are used.
         remove_imputation: If True, imputed values are set to 0 before clustering.
             Defaults to True.
         mean_center: If True, the data is mean-centered before clustering. Defaults to
@@ -242,25 +257,29 @@ def expression_clustermap(
     if len(samples) < 2:
         raise ValueError("At least two samples are required to generate a clustermap.")
-    data = qtable.make_expression_table(samples_as_columns=True)
+    data = qtable.make_expression_table(samples_as_columns=True, exclude_invalid=False)
     data = data[samples]
+    data = data.fillna(0)
-    for sample in samples:
-        if remove_imputation:
-            data.loc[qtable.data[f"Missing {sample}"], sample] = 0
-        data[sample] = data[sample].fillna(0)
-    if not mean_center:
-        # Hide missing values in the heatmap, making them appear white
-        mask_values = qtable.data[
+    if not mean_center:  # Hide missing values in the heatmap, making them appear white
+        hide_values_mask = qtable.data[
             [f"Missing {sample}" for sample in samples]
         ].to_numpy()
     else:
-        mask_values = np.zeros(data.shape, dtype=bool)
+        hide_values_mask = np.zeros(data.shape, dtype=bool)
+    if remove_imputation:
+        for sample in samples:
+            data.loc[qtable.data[f"Missing {sample}"], sample] = 0
+    inclusion_mask = np.ones(data.shape[0], dtype=bool)
     if exclude_invalid:
-        data = data[qtable.data["Valid"]]
-        mask_values = mask_values[qtable.data["Valid"]]
+        inclusion_mask = inclusion_mask & qtable["Valid"]
+    if exclude_missing:
+        _non_missing_masks = [(qtable[f"Missing {s}"] == 0) for s in samples]
+        inclusion_mask = inclusion_mask & (np.all(_non_missing_masks, axis=0))
+    hide_values_mask = hide_values_mask[inclusion_mask]
+    data = data[inclusion_mask]
     color_wheel = ColorWheelDict()
     for exp in experiments:
@@ -314,7 +333,7 @@ def expression_clustermap(
         col_cluster=cluster_samples,
         col_colors=sample_colors,
         row_colors=["#000000" for _ in range(len(data))],
-        mask=mask_values,
+        mask=hide_values_mask,
         method=cluster_method,
         metric="euclidean",
         **heatmap_args,

{msreport-0.0.31 → msreport-0.0.32}/msreport/reader.py RENAMED Viewed

@@ -545,7 +545,12 @@ class FragPipeReader(ResultReader):
     """FragPipe result reader.
     Methods:
-        import_design: Reads a "fragpipe-files.fp-manifest" file and returns a
+        import_design: Depending on the quantification strategy, imports either the
+            manifest file or the experiment annotation file and returns a processed
+            design dataframe.
+        import_manifest: Reads a "fragpipe-files.fp-manifest" file and returns a
+            processed design dataframe.
+        import_experiment_annotation: Reads an "experiment_annotation" file and returns a
             processed design dataframe.
         import_proteins: Reads a "combined_protein.tsv" or "protein.tsv" file and
             returns a processed dataframe, conforming to the MsReport naming
@@ -589,12 +594,8 @@ class FragPipeReader(ResultReader):
         "ions": "combined_ion.tsv",
         "ion_evidence": "ion.tsv",
         "psm_evidence": "psm.tsv",
-        "design": "fragpipe-files.fp-manifest",
-    }
-    isobar_filenames: dict[str, str] = {
-        "proteins": "protein.tsv",
-        "peptides": "peptide.tsv",
-        "ions": "ion.tsv",
+        "manifest": "fragpipe-files.fp-manifest",
+        "experiment_annotation": "experiment_annotation.tsv",
     }
     sil_filenames: dict[str, str] = {
         "proteins": "combined_protein_label_quant.tsv",
@@ -675,14 +676,27 @@ class FragPipeReader(ResultReader):
         self._isobar: bool = isobar
         self._sil: bool = sil
         self._contaminant_tag: str = contaminant_tag
-        if isobar:
-            self.filenames = self.isobar_filenames
-        elif sil:
-            self.filenames = self.sil_filenames
+        self.filenames = self.default_filenames.copy()
+        if sil:
+            self.filenames.update(self.sil_filenames)
+    def import_design(self, sort: bool = False) -> pd.DataFrame:
+        """Reads the experimental design file and returns a processed design dataframe.
+        Depending on the quantification strategy (isobaric or label-free/sil), either
+        the experiment annotation file or the manifest file is imported.
+        Args:
+            sort: If True, the design dataframe is sorted by "Experiment" and
+                "Replicate"; default False.
+        """
+        if self._isobar:
+            return self.import_experiment_annotation(sort=sort)
         else:
-            self.filenames = self.default_filenames
+            return self.import_manifest(sort=sort)
-    def import_design(
+    def import_manifest(
         self, filename: Optional[str] = None, sort: bool = False
     ) -> pd.DataFrame:
         """Read a 'fp-manifest' file and returns a processed design dataframe.
@@ -709,7 +723,7 @@ class FragPipeReader(ResultReader):
             FileNotFoundError: If the specified manifest file does not exist.
         """
         if filename is None:
-            filepath = os.path.join(self.data_directory, self.filenames["design"])
+            filepath = os.path.join(self.data_directory, self.filenames["manifest"])
         else:
             filepath = os.path.join(self.data_directory, filename)
         if not os.path.exists(filepath):
@@ -748,6 +762,63 @@ class FragPipeReader(ResultReader):
             design.reset_index(drop=True, inplace=True)
         return design
+    def import_experiment_annotation(
+        self, filename: Optional[str] = None, sort: bool = False
+    ) -> pd.DataFrame:
+        """Reads an 'experiment_annotation' file and returns a processed design table.
+        The file is read as a tab-separated table from the data directory. Its
+        "sample", "channel", and "plex" columns become the design columns "Sample",
+        "Channel", and "Plex". "Experiment" and "Replicate" are derived from "Sample"
+        by splitting at the last underscore; a sample name without an underscore is
+        used as "Experiment" and gets an empty string as "Replicate".
+        Note that splitting the sample name this way conforms to the FragPipe naming
+        convention, but FragPipe does not enforce that convention for the experiment
+        annotation file.
+        Args:
+            filename: Name of the annotation file inside the data directory. If None,
+                the default "experiment_annotation" entry of `self.filenames` is used.
+            sort: If True, the design dataframe is sorted by "Experiment" and
+                "Replicate" and re-indexed; default False.
+        Returns:
+            A dataframe containing the processed design table with columns:
+            "Sample", "Experiment", "Replicate", "Channel", and "Plex".
+        Raises:
+            FileNotFoundError: If the experiment annotation file does not exist.
+        """
+        annotation_name = (
+            self.filenames["experiment_annotation"] if filename is None else filename
+        )
+        filepath = os.path.join(self.data_directory, annotation_name)
+        if not os.path.exists(filepath):
+            raise FileNotFoundError(
+                f"File '{filepath}' does not exist. Please check the file path."
+            )
+        annotation = pd.read_csv(filepath, sep="\t")
+        # Split once at the last underscore: element 0 is the experiment, element 1
+        # the replicate. Samples without an underscore have no element 1 (NaN).
+        sample_parts = annotation["sample"].str.rsplit("_", n=1)
+        design = pd.DataFrame(
+            {
+                "Sample": annotation["sample"],
+                "Experiment": sample_parts.str[0],
+                "Replicate": sample_parts.str[1].fillna(""),
+                "Channel": annotation["channel"],
+                "Plex": annotation["plex"],
+            }
+        )
+        if sort:
+            design = design.sort_values(by=["Experiment", "Replicate"])
+            design = design.reset_index(drop=True)
+        return design
     def import_proteins(
         self,
         filename: Optional[str] = None,
@@ -1034,6 +1105,7 @@ class FragPipeReader(ResultReader):
             )
             df["Modified sequence"] = mod_entries["Modified sequence"]
             df["Modifications"] = mod_entries["Modifications"]
+            df = self._add_modification_localization_string_to_psm_evidence(df)
         return df
     def _add_protein_entries(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -1207,6 +1279,66 @@ class FragPipeReader(ResultReader):
             new_df[new_column] = localization_strings
         return new_df
+    def _add_modification_localization_string_to_psm_evidence(
+        self, df: pd.DataFrame
+    ) -> pd.DataFrame:
+        """Adds a modification localization string column to a PSM evidence table.
+        For every column named "<name> Best Localization" the companion column
+        "<name>" is read, where "<name>" has the form "<amino acids>:<modification>".
+        The entries of the companion column are parsed as sequences with
+        probabilities written in round brackets, converted into the standardized
+        modification localization string format used by msreport, and stored in the
+        new column "Modification localization string".
+        Probabilities are written in the format
+        "Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
+        e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
+        `msreport.peptidoform.make_localization_string` for details.
+        Args:
+            df: A dataframe containing PSM tables from FragPipe. The dataframe is not
+                modified.
+        Returns:
+            A copy of the input dataframe with the added
+            "Modification localization string" column. If the table contains no
+            " Best Localization" columns, the new column is filled with empty strings.
+        """
+        new_df = df.copy()
+        _search_tag = " Best Localization"
+        # Remove the tag as an exact suffix. `str.strip` interprets its argument as
+        # a set of characters and would also eat leading characters of the column
+        # name, e.g. turning "n:42.0106 Best Localization" into ":42.0106".
+        mod_localization_columns = [
+            c[: -len(_search_tag)] for c in new_df.columns if c.endswith(_search_tag)
+        ]
+        if not mod_localization_columns:
+            new_df["Modification localization string"] = ""
+            return new_df
+        # Convert to strings in a separate table so that neither the caller's
+        # dataframe nor the returned copy has its localization columns altered.
+        probability_sequences = (
+            new_df[mod_localization_columns].astype(str).replace("nan", "")
+        )
+        # One dictionary per row, mapping modification -> {site: probability}
+        row_mod_probabilities: list[dict[str, dict[int, float]]] = [
+            {} for _ in range(new_df.shape[0])
+        ]
+        for mod_localization_column in mod_localization_columns:
+            # Column names look like "<amino acids>:<modification>"
+            modification = mod_localization_column.split(":")[1]
+            for modification_probabilities, probability_sequence in zip(
+                row_mod_probabilities, probability_sequences[mod_localization_column]
+            ):
+                if not probability_sequence:
+                    continue
+                # NOTE(review): the second return value is iterated as
+                # (site, probability) pairs; confirm against
+                # `msreport.peptidoform.parse_modified_sequence`.
+                _, probabilities = msreport.peptidoform.parse_modified_sequence(
+                    probability_sequence, "(", ")"
+                )
+                modification_probabilities[modification] = {
+                    site: float(probability) for site, probability in probabilities
+                }
+        new_df["Modification localization string"] = [
+            msreport.peptidoform.make_localization_string(localization_probabilities)
+            for localization_probabilities in row_mod_probabilities
+        ]
+        return new_df
 class SpectronautReader(ResultReader):
     """Spectronaut result reader.

{msreport-0.0.31 → msreport-0.0.32}/msreport.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: msreport
-Version: 0.0.31
+Version: 0.0.32
 Summary: Post processing and analysis of quantitative proteomics data
 Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
 License-Expression: Apache-2.0

{msreport-0.0.31 → msreport-0.0.32}/pyproject.toml RENAMED Viewed

@@ -112,3 +112,11 @@ module = [
   "yaml.*",
 ]
 follow_untyped_imports = true
+[tool.pytest.ini_options]
+addopts = "--doctest-modules"
+doctest_optionflags = "NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL"
+testpaths = [
+    "tests",
+    "msreport",
+]

{msreport-0.0.31 → msreport-0.0.32}/tests/test_plot.py RENAMED Viewed

@@ -1,3 +1,4 @@
+import matplotlib
 import numpy as np
 import pandas as pd
 import pytest
@@ -5,6 +6,9 @@ import pytest
 import msreport.plot
 import msreport.qtable
+# Use the 'Agg' backend for plotting tests to prevent TclError in headless environments.
+matplotlib.use("Agg")
 @pytest.fixture
 def example_data():