PyPI - msreport - Versions diffs - 0.0.27__py3-none-any.whl → 0.0.29__py3-none-any.whl - Mend

msreport 0.0.27__py3-none-any.whl → 0.0.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

msreport/__init__.py +4 -6
msreport/aggregate/condense.py +1 -1
msreport/aggregate/pivot.py +1 -0
msreport/aggregate/summarize.py +2 -2
msreport/analyze.py +117 -36
msreport/errors.py +5 -2
msreport/export.py +16 -13
msreport/fasta.py +2 -1
msreport/helper/__init__.py +7 -7
msreport/helper/calc.py +14 -15
msreport/helper/maxlfq.py +2 -2
msreport/helper/table.py +5 -6
msreport/impute.py +4 -3
msreport/isobar.py +10 -9
msreport/normalize.py +2 -1
msreport/peptidoform.py +6 -4
msreport/plot/__init__.py +41 -0
msreport/plot/_partial_plots.py +159 -0
msreport/plot/comparison.py +490 -0
msreport/plot/distribution.py +253 -0
msreport/plot/multivariate.py +355 -0
msreport/plot/quality.py +431 -0
msreport/plot/style.py +286 -0
msreport/plot/style_sheets/msreport-notebook.mplstyle +57 -0
msreport/plot/style_sheets/seaborn-whitegrid.mplstyle +45 -0
msreport/qtable.py +109 -17
msreport/reader.py +235 -86
msreport/rinterface/__init__.py +16 -3
msreport/rinterface/limma.py +2 -1
msreport/rinterface/rinstaller.py +3 -3
msreport-0.0.29.dist-info/METADATA +136 -0
msreport-0.0.29.dist-info/RECORD +38 -0
{msreport-0.0.27.dist-info → msreport-0.0.29.dist-info}/WHEEL +1 -1
msreport/plot.py +0 -1134
msreport-0.0.27.dist-info/METADATA +0 -129
msreport-0.0.27.dist-info/RECORD +0 -30
{msreport-0.0.27.dist-info → msreport-0.0.29.dist-info}/licenses/LICENSE.txt +0 -0
{msreport-0.0.27.dist-info → msreport-0.0.29.dist-info}/top_level.txt +0 -0

msreport/reader.py CHANGED Viewed

@@ -1,4 +1,4 @@
-""" Module for reading result tables from various MS analysis tools and converting them
+"""Module for reading result tables from various MS analysis tools and converting them
 to a standardized format following the MsReport convention.
 Currently for MaxQuant and FragPipe protein, peptide, and ion tables are supported, and
@@ -20,19 +20,19 @@ Unified column names:
 - iBAQ intensity "sample name"
 """
-from collections import OrderedDict, defaultdict
 import os
-from typing import Any, Callable, Iterable, Optional, Protocol
 import pathlib
 import warnings
+from collections import OrderedDict, defaultdict
+from typing import Any, Callable, Iterable, Optional, Protocol
 import numpy as np
 import pandas as pd
 import msreport.helper as helper
-from msreport.helper.temp import extract_window_around_position
-from msreport.errors import ProteinsNotInFastaWarning
 import msreport.peptidoform
+from msreport.errors import ProteinsNotInFastaWarning
+from msreport.helper.temp import extract_window_around_position
 class Protein(Protocol):
@@ -54,6 +54,8 @@ class ProteinDatabase(Protocol):
 class ResultReader:
     """Base Reader class, is by itself not functional."""
+    data_directory: str
+    filenames: dict[str, str]
     default_filenames: dict[str, str]
     protected_columns: list[str]
     column_mapping: dict[str, str]
@@ -61,8 +63,8 @@ class ResultReader:
     sample_column_tags: list[str]
     def __init__(self):
-        self.data_directory: str = ""
-        self.filenames: dict[str, str] = {}
+        self.data_directory = ""
+        self.filenames = {}
     def _read_file(self, which: str, sep: str = "\t") -> pd.DataFrame:
         """Read a result table.
@@ -183,18 +185,16 @@ class MaxQuantReader(ResultReader):
         "MS/MS count",
         "Sequence coverage",
     ]
-    column_mapping: dict[str, str] = dict(
-        [
-            ("Peptides", "Total peptides"),
-            ("Sequence coverage [%]", "Sequence coverage"),
-            ("MS/MS count", "Spectral count Combined"),  # proteinGroups, evidence
-            ("MS/MS Count", "Spectral count Combined"),  # peptides
-            ("Sequence", "Peptide sequence"),  # peptides, evidence
-            ("Sequence length", "Protein length"),
-            ("Mol. weight [kDa]", "Molecular weight [kDa]"),
-            ("Experiment", "Sample"),
-        ]
-    )
+    column_mapping: dict[str, str] = {
+        "Peptides": "Total peptides",
+        "Sequence coverage [%]": "Sequence coverage",
+        "MS/MS count": "Spectral count Combined",  # proteinGroups, evidence
+        "MS/MS Count": "Spectral count Combined",  # peptides
+        "Sequence": "Peptide sequence",  # peptides, evidence
+        "Sequence length": "Protein length",
+        "Mol. weight [kDa]": "Molecular weight [kDa]",
+        "Experiment": "Sample",
+    }
     column_tag_mapping: OrderedDict[str, str] = OrderedDict(
         [("MS/MS count", "Spectral count"), ("iBAQ", "iBAQ intensity")]
     )
@@ -343,7 +343,9 @@ class MaxQuantReader(ResultReader):
         Adds new columns to comply with the MsReport convention. "Modified sequence",
         "Modifications columns", "Modification localization string". "Protein reported
         by software" and "Representative protein", both contain the first entry from
-        "Leading razor protein".
+        "Leading razor protein". "Ion ID" contains unique entries for each ion, which
+        are generated by concatenating the "Modified sequence" and "Charge" columns, and
+        if present, the "Compensation voltage" column.
         "Modified sequence" entries contain modifications within square brackets.
         "Modification" entries are strings in the form of "position:modification_tag",
@@ -376,15 +378,19 @@ class MaxQuantReader(ResultReader):
             df["Leading razor protein"]
         )
         df["Representative protein"] = df["Protein reported by software"]
         if drop_decoy:
             df = self._drop_decoy(df)
         if rename_columns:
-            df = self._rename_columns(
-                df, True
-            )  # Actually there are no column tags as the table is in long format
+            # Actually there are no column tags as the table is in long format
+            df = self._rename_columns(df, prefix_tag=True)
         if rewrite_modifications and rename_columns:
             df = self._add_peptide_modification_entries(df)
             df = self._add_modification_localization_string(df)
+            df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
+            if "Compensation voltage" in df.columns:
+                _cv = df["Compensation voltage"].astype(str)
+                df["Ion ID"] = df["Ion ID"] + "_cv" + _cv
         return df
     def _add_protein_entries(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -576,6 +582,7 @@ class FragPipeReader(ResultReader):
         "peptides": "combined_peptide.tsv",
         "ions": "combined_ion.tsv",
         "ion_evidence": "ion.tsv",
+        "psm_evidence": "psm.tsv",
     }
     isobar_filenames: dict[str, str] = {
         "proteins": "protein.tsv",
@@ -590,20 +597,25 @@ class FragPipeReader(ResultReader):
         "Intensity",
         "MaxLFQ Intensity",
     ]
-    column_mapping: dict[str, str] = dict(
-        [
-            ("Peptide Sequence", "Peptide sequence"),  # Peptide and ion
-            ("Modified Sequence", "Modified sequence"),  # Modified peptide and ion
-            ("Start", "Start position"),  # Peptide and ion
-            ("End", "End position"),  # Peptide and ion
-            ("Combined Total Peptides", "Total peptides"),  # From LFQ
-            ("Total Peptides", "Total peptides"),  # From TMT
-            ("Description", "Protein name"),
-            ("Protein Length", "Protein length"),
-            ("Entry Name", "Protein entry name"),
-            ("Gene", "Gene name"),
-        ]
-    )
+    column_mapping: dict[str, str] = {
+        "Peptide": "Peptide sequence",  # PSM
+        "Modified Peptide": "Modified sequence",  # PSM
+        "Protein Start": "Start position",  # PSM
+        "Protein End": "End position",  # PSM
+        "Number of Missed Cleavages": "Missed cleavage",  # PSM
+        "PeptideProphet Probability": "Probability",  # PSM
+        "Compensation Voltage": "Compensation voltage",  # PSM and ion
+        "Peptide Sequence": "Peptide sequence",  # Peptide and ion
+        "Modified Sequence": "Modified sequence",  # Modified peptide and ion
+        "Start": "Start position",  # Peptide and ion
+        "End": "End position",  # Peptide and ion
+        "Combined Total Peptides": "Total peptides",  # From LFQ
+        "Total Peptides": "Total peptides",  # From TMT
+        "Description": "Protein name",
+        "Protein Length": "Protein length",
+        "Entry Name": "Protein entry name",
+        "Gene": "Gene name",
+    }
     column_tag_mapping: OrderedDict[str, str] = OrderedDict(
         [
             ("MaxLFQ Intensity", "LFQ intensity"),
@@ -743,7 +755,10 @@ class FragPipeReader(ResultReader):
         Adds new columns to comply with the MsReport convention. "Modified sequence"
         and "Modifications columns". "Protein reported by software" and "Representative
-        protein", both contain the first entry from "Leading razor protein".
+        protein", both contain the first entry from "Leading razor protein". "Ion ID"
+        contains unique entries for each ion, which are generated by concatenating the
+        "Modified sequence" and "Charge" columns, and if present, the
+        "Compensation voltage" column.
         "Modified sequence" entries contain modifications within square brackets.
         "Modification" entries are strings in the form of "position:modification_text",
@@ -783,6 +798,11 @@ class FragPipeReader(ResultReader):
         if rewrite_modifications and rename_columns:
             df = self._add_peptide_modification_entries(df)
             df = self._add_modification_localization_string(df, prefix_column_tags)
+            df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
+            if "Compensation voltage" in df.columns:
+                _cv = df["Compensation voltage"].astype(str)
+                df["Ion ID"] = df["Ion ID"] + "_cv" + _cv
         return df
     def import_ion_evidence(
@@ -797,7 +817,9 @@ class FragPipeReader(ResultReader):
         Adds new columns to comply with the MsReport convention. "Modified sequence",
         "Modifications", and "Modification localization string" columns. "Protein
         reported by software" and "Representative protein", both contain the first entry
-        from "Leading razor protein".
+        from "Leading razor protein". "Ion ID" contains unique entries for each ion,
+        which are generated by concatenating the "Modified sequence" and "Charge"
+        columns, and if present, the "Compensation voltage" column.
         "Modified sequence" entries contain modifications within square brackets.
         "Modification" entries are strings in the form of "position:modification_text",
@@ -850,6 +872,9 @@ class FragPipeReader(ResultReader):
         df = pd.concat(ion_tables, ignore_index=True)
         # --- Process dataframe --- #
+        df["Ion ID"] = df["Modified Sequence"] + "_c" + df["Charge"].astype(str)
+        if "Compensation Voltage" in df.columns:
+            df["Ion ID"] = df["Ion ID"] + "_cv" + df["Compensation Voltage"].astype(str)
         # FUTURE: replace this by _add_protein_entries(df, False) if FragPipe adds
         #         'Indistinguishable Proteins' to the ion table.
         df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
@@ -861,6 +886,76 @@ class FragPipeReader(ResultReader):
             df = self._add_modification_localization_string(df, prefix_column_tags)
         return df
+    def import_psm_evidence(
+        self,
+        filename: Optional[str] = None,
+        rename_columns: bool = True,
+        rewrite_modifications: bool = True,
+    ) -> pd.DataFrame:
+        """Concatenate all "psm.tsv" files and return a processed dataframe.
+        The PSM tables are searched in the direct subdirectories of the data
+        directory. The name of the subdirectory a table was read from is written to
+        the "Sample" column of its rows.
+        Adds new columns to comply with the MsReport convention. "Protein reported by
+        software" and "Representative protein" both contain the entries generated
+        from the "Protein" column. "Mapped proteins" contains the representative
+        protein followed by the proteins listed in the FragPipe "Mapped Proteins"
+        column, joined with ";".
+        Args:
+            filename: Allows specifying an alternative filename, otherwise the default
+                filename is used.
+            rename_columns: If True, columns are renamed according to the MsReport
+                convention; default True.
+            rewrite_modifications: If True, the peptide format in "Modified sequence" is
+                changed according to the MsReport convention, and a "Modifications"
+                column is added that contains the amino acid positions of all
+                modifications. Requires 'rename_columns' to be true. Default True.
+        Returns:
+            A DataFrame containing the processed psm evidence tables.
+        Raises:
+            ValueError: If no PSM table was found, because pandas cannot concatenate
+                an empty list of tables.
+        """
+        if filename is None:
+            filename = self.default_filenames["psm_evidence"]
+        # Collect one PSM table path per subdirectory that contains the file.
+        psm_table_paths = []
+        for path in pathlib.Path(self.data_directory).iterdir():
+            psm_table_path = path / filename
+            if path.is_dir() and psm_table_path.exists():
+                psm_table_paths.append(psm_table_path)
+        psm_tables = []
+        for filepath in psm_table_paths:
+            table = pd.read_csv(filepath, sep="\t", low_memory=False)
+            # Missing values in text columns are replaced with empty strings.
+            str_cols = table.select_dtypes(include=["object"]).columns
+            table.loc[:, str_cols] = table.loc[:, str_cols].fillna("")
+            table["Sample"] = filepath.parent.name
+            psm_tables.append(table)
+        df = pd.concat(psm_tables, ignore_index=True)
+        df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
+        df["Representative protein"] = df["Protein reported by software"]
+        # A column without any text entry is not covered by the fillna above, the
+        # string conversion makes sure that missing values end up as empty strings.
+        df["Mapped Proteins"] = df["Mapped Proteins"].astype(str).replace("nan", "")
+        # FP only lists additional mapped proteins in the "Mapped Proteins" column
+        # MsReport reports all matching proteins in the "Mapped proteins" column
+        mapped_proteins_entries = []
+        for protein, mapped_protein_fp in zip(
+            df["Representative protein"], df["Mapped Proteins"], strict=True
+        ):
+            if mapped_protein_fp == "":
+                mapped_proteins = [protein]
+            else:
+                # Use the module level helper directly, as done for the "Protein"
+                # column, instead of looking it up via the "msreport.reader" name.
+                additional_mapped_proteins = _extract_protein_ids(
+                    mapped_protein_fp.split(", ")
+                )
+                mapped_proteins = [protein] + additional_mapped_proteins
+            mapped_proteins_entries.append(";".join(mapped_proteins))
+        df["Mapped proteins"] = mapped_proteins_entries
+        if rename_columns:
+            df = self._rename_columns(df, prefix_tag=True)
+        if rewrite_modifications and rename_columns:
+            # "Peptide sequence" only exists after renaming the "Peptide" column.
+            mod_entries = _generate_modification_entries_from_assigned_modifications(
+                df["Peptide sequence"], df["Assigned Modifications"]
+            )
+            df["Modified sequence"] = mod_entries["Modified sequence"]
+            df["Modifications"] = mod_entries["Modifications"]
+        return df
     def _add_protein_entries(self, df: pd.DataFrame) -> pd.DataFrame:
         """Adds standardized protein entry columns to the data frame.
@@ -1038,40 +1133,32 @@ class SpectronautReader(ResultReader):
         "design": "conditionsetup",
     }
     protected_columns: list[str] = []
-    column_mapping: dict[str, str] = dict(
-        [
-            ("R.FileName", "Filename"),
-            ("R.Label", "Sample"),
-            ("PG.Qvalue", "Protein qvalue"),
-            ("PG.Cscore", "Protein cscore"),
-            ("PG.NrOfStrippedSequencesIdentified (Experiment-wide)", "Total peptides"),
-            ("PG.NrOfPrecursorsIdentified (Experiment-wide)", "Total ions"),
-            ("PG.Cscore", "Cscore"),
-            ("PEP.StrippedSequence", "Peptide sequence"),
-            ("PEP.AllOccurringProteinAccessions", "Mapped proteins"),
-            ("EG.ModifiedSequence", "Modified sequence"),
-            ("EG.CompensationVoltage", "Compensation voltage"),
-            ("EG.Qvalue", "Qvalue"),
-            ("EG.ApexRT", "Apex retention time"),
-            ("EG.DatapointsPerPeak", "Datapoints per peak"),
-            ("EG.FWHM", "FWHM"),
-            ("EG.SignalToNoise", "Signal to noise"),
-            ("FG.FragmentCount", "Fragment count"),
-            ("FG.Charge", "Charge"),
-            ("FG.MS1Quantity", "MS1 intensity"),
-            ("FG.MS1RawQuantity", "MS1 raw intensity"),
-            ("FG.MS2Quantity", "MS2 intensity"),
-            ("FG.MS2RawQuantity", "MS2 raw intensity"),
-            ("FG.MeasuredMz", "Observed m/z"),
-            ("FG.TheoreticalMz", "Theoretical m/z"),
-            ("FG.CalibratedMz", "Calibrated m/z"),
-            # ("PG.ProteinAccessions", ""),
-            # ("EG.HasLocalizationInformation", ""),
-            # ("EG.PTMLocalizationProbabilities", ""),
-            # ("EG.UsedForProteinGroupQuantity", ""),
-            # Modified peptides need to be parsed and rewritten
-        ]
-    )
+    column_mapping: dict[str, str] = {
+        "R.FileName": "Filename",
+        "R.Label": "Sample",
+        "PG.Qvalue": "Protein qvalue",
+        "PG.Cscore": "Protein cscore",
+        "PG.NrOfStrippedSequencesIdentified (Experiment-wide)": "Total peptides",
+        "PG.NrOfPrecursorsIdentified (Experiment-wide)": "Total ions",
+        "PEP.StrippedSequence": "Peptide sequence",
+        "PEP.AllOccurringProteinAccessions": "Mapped proteins",
+        "EG.ModifiedSequence": "Modified sequence",
+        "EG.CompensationVoltage": "Compensation voltage",
+        "EG.Qvalue": "Qvalue",
+        "EG.ApexRT": "Apex retention time",
+        "EG.DatapointsPerPeak": "Datapoints per peak",
+        "EG.FWHM": "FWHM",
+        "EG.SignalToNoise": "Signal to noise",
+        "FG.FragmentCount": "Fragment count",
+        "FG.Charge": "Charge",
+        "FG.MS1Quantity": "MS1 intensity",
+        "FG.MS1RawQuantity": "MS1 raw intensity",
+        "FG.MS2Quantity": "MS2 intensity",
+        "FG.MS2RawQuantity": "MS2 raw intensity",
+        "FG.MeasuredMz": "Observed m/z",
+        "FG.TheoreticalMz": "Theoretical m/z",
+        "FG.CalibratedMz": "Calibrated m/z",
+    }
     sample_column_tags: list[str] = [
         ".PG.NrOfPrecursorsIdentified",
         ".PG.IBAQ",
@@ -1324,12 +1411,14 @@ class SpectronautReader(ResultReader):
         filename: Optional[str] = None,
         filetag: Optional[str] = None,
         rename_columns: bool = True,
-    ) -> None:
+    ) -> pd.DataFrame:
         """Reads an ion evidence file (long format) and returns a processed dataframe.
         Adds new columns to comply with the MsReport convention. "Protein reported
         by software" and "Representative protein", both contain the first entry from
-        "PG.ProteinAccessions".
+        "PG.ProteinAccessions". "Ion ID" contains unique entries for each ion, which are
+        generated by concatenating the "Modified sequence" and "Charge" columns, and if
+        present, the "Compensation voltage" column.
         (!) Note that the modified sequence and modification localization probabilities
         are currently not processed.
@@ -1367,6 +1456,11 @@ class SpectronautReader(ResultReader):
         df = self._add_protein_entries(df)
         if rename_columns:
             df = self._rename_columns(df, True)
+            df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
+            if "Compensation voltage" in df.columns:
+                _cv = df["Compensation voltage"].astype(str)
+                df["Ion ID"] = df["Ion ID"] + "_cv" + _cv
         return df
     def _tidy_up_sample_columns(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -1462,7 +1556,7 @@ def sort_leading_proteins(
     if penalize_contaminants is not None:
         contaminant_encoding = {"False": 0, "True": 1, False: 0, True: 1}
-    for idx, row in table.iterrows():
+    for _, row in table.iterrows():
         protein_ids = row["Leading proteins"].split(";")
         sorting_info = [[] for _ in protein_ids]
@@ -1559,6 +1653,7 @@ def add_protein_annotation(
         warnings.warn(
             f"Some proteins could not be annotated: {repr(proteins_not_in_db)}",
             ProteinsNotInFastaWarning,
+            stacklevel=2,
         )
     annotations = {}
@@ -1636,9 +1731,10 @@ def add_protein_site_annotation(
         warnings.warn(
             f"Some proteins could not be annotated: {repr(proteins_not_in_db)}",
             ProteinsNotInFastaWarning,
+            stacklevel=2,
         )
-    annotations = {
+    annotations: dict[str, list[str]] = {
         "Modified residue": [],
         "Sequence window": [],
     }
@@ -1702,6 +1798,7 @@ def add_leading_proteins_annotation(
         warnings.warn(
             f"Some proteins could not be annotated: {repr(proteins_not_in_db)}",
             ProteinsNotInFastaWarning,
+            stacklevel=2,
         )
     annotations = {}
@@ -1853,7 +1950,7 @@ def add_peptide_positions(
             find matching entries in the FASTA files.
     """
     # not tested #
-    peptide_positions = {"Start position": [], "End position": []}
+    peptide_positions: dict[str, list[int]] = {"Start position": [], "End position": []}
     proteins_not_in_db = []
     for peptide, protein_id in zip(table[peptide_column], table[protein_column]):
         if protein_id in protein_db:
@@ -1875,6 +1972,7 @@ def add_peptide_positions(
         warnings.warn(
             f"Some peptides could not be annotated: {repr(proteins_not_in_db)}",
             ProteinsNotInFastaWarning,
+            stacklevel=2,
         )
@@ -1894,10 +1992,10 @@ def add_protein_modifications(table: pd.DataFrame):
             for peptide_site, mod in [m.split(":") for m in mod_entry.split(";")]:
                 protein_site = int(peptide_site) + start_pos - 1
                 protein_mods.append([str(protein_site), mod])
-            protein_mods = ";".join([f"{pos}:{mod}" for pos, mod in protein_mods])
+            protein_mod_string = ";".join([f"{pos}:{mod}" for pos, mod in protein_mods])
         else:
-            protein_mods = ""
-        protein_modification_entries.append(protein_mods)
+            protein_mod_string = ""
+        protein_modification_entries.append(protein_mod_string)
     table["Protein modifications"] = protein_modification_entries
@@ -2074,7 +2172,7 @@ def _process_protein_entries(
         A dataframe containing the columns "Protein reported by software",
         "Leading proteins", "Representative protein", and "Potential contaminant".
     """
-    new_entries = {
+    new_entries: dict[str, list[str | bool]] = {
         "Protein reported by software": [],
         "Representative protein": [],
         "Potential contaminant": [],
@@ -2147,6 +2245,57 @@ def _generate_modification_entries(
     return entries
+def _generate_modification_entries_from_assigned_modifications(
+    sequences: Iterable[str],
+    assigned_modifications: Iterable[str],
+) -> dict[str, list[str]]:
+    """Generates "Modified sequence" and "Modifications" entries for peptides.
+    Args:
+        sequences: Unmodified peptide sequences.
+        assigned_modifications: FragPipe assigned modification entries, one for each
+            sequence, in the format that is parsed by
+            _extract_fragpipe_assigned_modifications.
+    Returns:
+        A dictionary with the keys "Modified sequence" and "Modifications". Both
+        values are lists that contain one entry per processed sequence. "Modified
+        sequence" entries are the results of helper.modify_peptide. "Modifications"
+        entries are strings in the form of "position:modification_text", multiple
+        modifications are joined with ";".
+    """
+    result: dict[str, list[str]] = {"Modified sequence": [], "Modifications": []}
+    for peptide, assigned in zip(sequences, assigned_modifications):
+        parsed_mods = _extract_fragpipe_assigned_modifications(assigned, peptide)
+        modified_peptide = helper.modify_peptide(peptide, parsed_mods)
+        mod_string = ";".join(f"{site}:{text}" for site, text in parsed_mods)
+        result["Modified sequence"].append(modified_peptide)
+        result["Modifications"].append(mod_string)
+    return result
+def _extract_fragpipe_assigned_modifications(
+    modifications_entry: str,
+    sequence: str,
+) -> list[tuple[int, str]]:
+    """Extracts modifications from a FragPipe "Assigned Modifications" entry.
+    Example for a modification entry: "N-term(42.0106),8C(57.0215)"
+    Whitespace around the individual modifications is ignored, which means that
+    entries separated by ", " are parsed the same way, independent of the position
+    of a terminal modification, for example "8C(57.0215), N-term(42.0106)".
+    Args:
+        modifications_entry: Comma separated modifications. Each modification is
+            written as a position followed by the modification text in parentheses.
+            The position is "N-term", "C-term", or a one-indexed residue number
+            followed by the amino acid letter. An empty string means that there are
+            no modifications.
+        sequence: The unmodified peptide sequence. Only its length is used, for
+            positioning "C-term" modifications.
+    Returns:
+        A list of tuples, where each tuple contains the position of the modification and
+        the modification text. The position is one-indexed, meaning that the first amino
+        acid position is 1. N-term and C-term are represented as 0 and len(sequence)
+        respectively. The modifications are listed in the order of the entry.
+    Raises:
+        ValueError: If a modification is not written as "position(modification)" or
+            if the residue number of a position cannot be converted to an integer.
+    """
+    modifications = []
+    for mod_entry in modifications_entry.split(","):
+        # Without stripping, a terminal modification that follows a ", " separator
+        # (" N-term") is not recognized and fails the integer conversion below.
+        mod_entry = mod_entry.strip()
+        if not mod_entry:
+            continue
+        position_entry, modification = mod_entry.split(")")[0].split("(")
+        position_entry = position_entry.strip()
+        if position_entry == "N-term":
+            position = 0
+        elif position_entry == "C-term":
+            position = len(sequence)
+        else:
+            # Residue positions are written as "<number><amino acid>", e.g. "8C".
+            position = int(position_entry[:-1])
+        modifications.append((position, modification))
+    return modifications
 def extract_maxquant_localization_probabilities(localization_entry: str) -> dict:
     """Extract localization probabilites from a MaxQuant "Probabilities" entry.
@@ -2189,7 +2338,7 @@ def extract_fragpipe_localization_probabilities(localization_entry: str) -> dict
     ... )
     {'15.9949': {3: 1.0}, '79.9663': {4: 0.334, 6: 0.666}}
     """
-    modification_probabilities = {}
+    modification_probabilities: dict[str, dict[int, float]] = {}
     for modification_entry in filter(None, localization_entry.split(";")):
         specified_modification, probability_sequence = modification_entry.split("@")
         _, modification = specified_modification.split(":")
@@ -2247,7 +2396,7 @@ def _create_protein_annotations_from_db(
     protein_db: ProteinDatabase,
     query_function: Callable,
     default_value: Any,
-) -> list[str]:
+) -> list[Any]:
     """Returns a list of multi protein entry annotations.
     Used to generate protein annotations for protein entries. For each protein id an
@@ -2274,9 +2423,9 @@ def _create_protein_annotations_from_db(
         if protein_id in protein_db:
             db_entry = protein_db[protein_id]
             query_result = query_function(db_entry, default_value)
+            annotation_values.append(query_result)
         else:
-            query_result = default_value
-        annotation_values.append(query_result)
+            annotation_values.append(default_value)
     return annotation_values

msreport/rinterface/__init__.py CHANGED Viewed

@@ -1,3 +1,16 @@
-""" Python interface to custome R scripts. """
-from .limma import multi_group_limma, two_group_limma
-from .rinstaller import r_package_version
+"""Python interface to custom R scripts."""
+from msreport.errors import OptionalDependencyError
+# A failing import of the R interface modules is re-raised as an
+# OptionalDependencyError that explains how to set up the optional R dependencies.
+# The original ImportError is kept as the cause of the new exception.
+try:
+    from .limma import multi_group_limma, two_group_limma
+    from .rinstaller import r_package_version
+except ImportError as err:
+    raise OptionalDependencyError(
+        "R integration is not available. R must be installed and configured before "
+        "installing optional R dependencies using 'pip install msreport[R]'. For "
+        "more information, see: https://github.com/hollenstein/msreport"
+    ) from err
+# Names that are exported by a star import of this package.
+__all__ = ["multi_group_limma", "two_group_limma", "r_package_version"]

msreport/rinterface/limma.py CHANGED Viewed

@@ -1,4 +1,5 @@
-""" Python interface to custome R scripts. """
+"""Python interface to custom R scripts."""
 import os
 import pandas as pd

msreport/rinterface/rinstaller.py CHANGED Viewed

@@ -1,9 +1,9 @@
-from rpy2.robjects.packages import importr
-import rpy2.robjects.packages as rpackages
 import rpy2.robjects as robjects
+import rpy2.robjects.packages as rpackages
+from rpy2.robjects.packages import importr
-def r_package_version(package_name: str) -> (str, str):
+def r_package_version(package_name: str) -> str:
     """Returns the version number of an installed R package."""
     with robjects.conversion.localconverter(robjects.default_converter):
         utils = importr("utils")

msreport 0.0.27__py3-none-any.whl → 0.0.29__py3-none-any.whl

msreport 0.0.27__py3-none-any.whl → 0.0.29__py3-none-any.whl