PyPI - msreport - Versions diffs - 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl - Mend

msreport 0.0.29py3-none-any.whl → 0.0.31py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

msreport/__init__.py +1 -1
msreport/aggregate/__init__.py +10 -0
msreport/aggregate/condense.py +9 -0
msreport/aggregate/pivot.py +14 -5
msreport/aggregate/summarize.py +14 -4
msreport/analyze.py +67 -5
msreport/export.py +9 -15
msreport/fasta.py +9 -2
msreport/helper/__init__.py +18 -0
msreport/impute.py +18 -10
msreport/isobar.py +11 -14
msreport/normalize.py +95 -10
msreport/peptidoform.py +21 -11
msreport/plot/__init__.py +3 -3
msreport/plot/distribution.py +2 -1
msreport/plot/quality.py +1 -1
msreport/qtable.py +44 -20
msreport/reader.py +321 -40
msreport/rinterface/limma.py +1 -1
{msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/METADATA +20 -2
msreport-0.0.31.dist-info/RECORD +38 -0
{msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/WHEEL +1 -1
msreport-0.0.29.dist-info/RECORD +0 -38
{msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/licenses/LICENSE.txt +0 -0
{msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/top_level.txt +0 -0

msreport/reader.py CHANGED Viewed

@@ -1,17 +1,18 @@
-"""Module for reading result tables from various MS analysis tools and converting them
-to a standardized format following the MsReport convention.
+"""Provides tools for importing and standardizing quantitative proteomics data.
-Currently for MaxQuant and FragPipe protein, peptide, and ion tables are supported, and
-for Spectronaut protein tables are supported when exported with the correct report
-scheme.
+This module offers software-specific reader classes to import raw result tables (e.g.,
+proteins, peptides, ions) from various proteomics software (MaxQuant, FragPipe,
+Spectronaut) and convert them into a standardized `msreport` format. Additionally, it
+provides functions for annotating imported data with biological metadata, such as
+protein information (e.g., sequence length, molecular weight) and peptide positions,
+extracted from a ProteinDatabase (FASTA file).
-New column names:
+New columns added to imported protein tables:
 - Representative protein
 - Leading proteins
 - Protein reported by software
-Unified column names:
-- Total peptides
+Standardized column names for quantitative values (if available in the software output):
 - Spectral count "sample name"
 - Unique spectral count "sample name"
 - Total spectral count "sample name"
@@ -38,6 +39,7 @@ from msreport.helper.temp import extract_window_around_position
 class Protein(Protocol):
     """Abstract protein entry"""
+    # identifier: str
     header: str
     sequence: str
     header_fields: dict[str, str]
@@ -46,9 +48,9 @@ class Protein(Protocol):
 class ProteinDatabase(Protocol):
     """Abstract protein database"""
-    def __getitem__(self, protein_id: str) -> Protein: ...
+    def __getitem__(self, identifier: str) -> Protein: ...
-    def __contains__(self, protein_id: str) -> bool: ...
+    def __contains__(self, identifier: str) -> bool: ...
 class ResultReader:
@@ -497,7 +499,9 @@ class MaxQuantReader(ResultReader):
         mod_probability_columns = msreport.helper.find_columns(new_df, "Probabilities")
         localization_string_column = "Modification localization string"
-        mod_localization_probabilities = [{} for _ in range(new_df.shape[0])]
+        mod_localization_probabilities: list[dict[str, dict[int, float]]] = [
+            {} for _ in range(new_df.shape[0])
+        ]
         for probability_column in mod_probability_columns:
             # FUTURE: Type should be checked and enforced during the import
             if not pd.api.types.is_string_dtype(new_df[probability_column].dtype):
@@ -541,6 +545,8 @@ class FragPipeReader(ResultReader):
     """FragPipe result reader.
     Methods:
+        import_design: Reads a "fragpipe-files.fp-manifest" file and returns a
+            processed design dataframe.
         import_proteins: Reads a "combined_protein.tsv" or "protein.tsv" file and
             returns a processed dataframe, conforming to the MsReport naming
             convention.
@@ -583,12 +589,19 @@ class FragPipeReader(ResultReader):
         "ions": "combined_ion.tsv",
         "ion_evidence": "ion.tsv",
         "psm_evidence": "psm.tsv",
+        "design": "fragpipe-files.fp-manifest",
     }
     isobar_filenames: dict[str, str] = {
         "proteins": "protein.tsv",
         "peptides": "peptide.tsv",
         "ions": "ion.tsv",
     }
+    sil_filenames: dict[str, str] = {
+        "proteins": "combined_protein_label_quant.tsv",
+        "peptides": "combined_modified_peptide_label_quant.tsv",
+        "ions": "combined_ion_label_quant.tsv",
+    }
     protected_columns: list[str] = []
     sample_column_tags: list[str] = [
         "Spectral Count",
@@ -609,6 +622,7 @@ class FragPipeReader(ResultReader):
         "Modified Sequence": "Modified sequence",  # Modified peptide and ion
         "Start": "Start position",  # Peptide and ion
         "End": "End position",  # Peptide and ion
+        "Mapped Proteins": "Mapped proteins",  # All PSM, ion, and peptide tables
         "Combined Total Peptides": "Total peptides",  # From LFQ
         "Total Peptides": "Total peptides",  # From TMT
         "Description": "Protein name",
@@ -638,7 +652,11 @@ class FragPipeReader(ResultReader):
     protein_info_tags: list[str] = []
     def __init__(
-        self, directory: str, isobar: bool = False, contaminant_tag: str = "contam_"
+        self,
+        directory: str,
+        isobar: bool = False,
+        sil: bool = False,
+        contaminant_tag: str = "contam_",
     ) -> None:
         """Initializes the FragPipeReader.
@@ -646,16 +664,89 @@ class FragPipeReader(ResultReader):
             directory: Location of the FragPipe result folder
             isobar: Set to True if quantification strategy was TMT, iTRAQ or similar;
                 default False.
+            sil: Set to True if the FragPipe result files are from a stable isotope
+                labeling experiment, such as SILAC; default False.
             contaminant_tag: Prefix of Protein ID entries to identify contaminants;
                 default "contam_".
         """
+        if sil and isobar:
+            raise ValueError("Cannot set both 'isobar' and 'sil' to True.")
         self._add_data_directory(directory)
         self._isobar: bool = isobar
+        self._sil: bool = sil
         self._contaminant_tag: str = contaminant_tag
-        if not isobar:
+        if isobar:
+            self.filenames = self.isobar_filenames
+        elif sil:
+            self.filenames = self.sil_filenames
+        else:
             self.filenames = self.default_filenames
+    def import_design(
+        self, filename: Optional[str] = None, sort: bool = False
+    ) -> pd.DataFrame:
+        """Reads a 'fp-manifest' file and returns a processed design dataframe.
+        The manifest columns "Path", "Experiment", and "Bioreplicate" are mapped to the
+        design table columns "Rawfile", "Experiment", and "Replicate". The "Rawfile"
+        column is extracted as the filename from the full path. The "Sample" column is
+        generated by combining "Experiment" and "Replicate" with an underscore
+        (e.g., "Experiment_Replicate"), except when "Replicate" is empty, in which case
+        "Sample" is set to "Experiment". If "Experiment" is missing, it is set to "exp"
+        by default.
+        Args:
+            filename: Allows specifying an alternative filename, otherwise the default
+                filename is used.
+            sort: If True, the design dataframe is sorted by "Experiment" and
+                "Replicate"; default False.
+        Returns:
+            A dataframe containing the processed design table with columns:
+            "Sample", "Experiment", "Replicate", "Rawfile".
+        Raises:
+            FileNotFoundError: If the specified manifest file does not exist.
+        """
+        if filename is None:
+            filepath = os.path.join(self.data_directory, self.filenames["design"])
         else:
-            self.filenames = self.isobar_filenames
+            filepath = os.path.join(self.data_directory, filename)
+        if not os.path.exists(filepath):
+            raise FileNotFoundError(
+                f"File '{filepath}' does not exist. Please check the file path."
+            )
+        fp_manifest = (
+            pd.read_csv(
+                filepath, sep="\t", header=None, na_values=[""], keep_default_na=False
+            )
+            .fillna("")
+            .astype(str)
+        )
+        fp_manifest.columns = ["Path", "Experiment", "Bioreplicate", "Data type"]
+        design = pd.DataFrame(
+            {
+                "Sample": "",
+                "Experiment": fp_manifest["Experiment"],
+                "Replicate": fp_manifest["Bioreplicate"],
+                "Rawfile": fp_manifest["Path"].apply(
+                    # Required to handle Windows and Unix style paths on either system
+                    lambda x: x.replace("\\", "/").split("/")[-1]
+                ),
+            }
+        )
+        # FragPipe uses "exp" for missing 'Experiment' values
+        design.loc[design["Experiment"] == "", "Experiment"] = "exp"
+        # FragPipe combines 'Experiment' + "_" + 'Replicate' into 'Sample', except when
+        # 'Replicate' is empty, in which case 'Sample' is set to 'Experiment'.
+        design["Sample"] = design["Experiment"] + "_" + design["Replicate"]
+        design.loc[design["Replicate"] == "", "Sample"] = design["Experiment"]
+        if sort:
+            design.sort_values(by=["Experiment", "Replicate"], inplace=True)
+            design.reset_index(drop=True, inplace=True)
+        return design
     def import_proteins(
         self,
@@ -737,6 +828,7 @@ class FragPipeReader(ResultReader):
         df = self._read_file("peptides" if filename is None else filename)
         df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
         df["Representative protein"] = df["Protein reported by software"]
+        df["Mapped Proteins"] = self._collect_mapped_proteins(df)
         # Note that _add_protein_entries would need to be adapted for the peptide table.
         # df = self._add_protein_entries(df)
         if rename_columns:
@@ -793,6 +885,8 @@ class FragPipeReader(ResultReader):
         #         'Indistinguishable Proteins' to the ion table.
         df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
         df["Representative protein"] = df["Protein reported by software"]
+        df["Mapped Proteins"] = self._collect_mapped_proteins(df)
         if rename_columns:
             df = self._rename_columns(df, prefix_column_tags)
         if rewrite_modifications and rename_columns:
@@ -879,6 +973,8 @@ class FragPipeReader(ResultReader):
         #         'Indistinguishable Proteins' to the ion table.
         df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
         df["Representative protein"] = df["Protein reported by software"]
+        df["Mapped Proteins"] = self._collect_mapped_proteins(df)
         if rename_columns:
             df = self._rename_columns(df, prefix_column_tags)
         if rewrite_modifications and rename_columns:
@@ -891,7 +987,7 @@ class FragPipeReader(ResultReader):
         filename: Optional[str] = None,
         rename_columns: bool = True,
         rewrite_modifications: bool = True,
-    ):
+    ) -> pd.DataFrame:
         """Concatenate all "psm.tsv" files and return a processed dataframe.
         Args:
@@ -928,23 +1024,7 @@ class FragPipeReader(ResultReader):
         df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
         df["Representative protein"] = df["Protein reported by software"]
-        df["Mapped Proteins"] = df["Mapped Proteins"].astype(str).replace("nan", "")
-        # FP only lists additional mapped proteins in the "Mapped Proteins" column
-        # MsReport reports all matching proteins in the "Mapped proteins" column
-        mapped_proteins_entries = []
-        for protein, mapped_protein_fp in zip(
-            df["Representative protein"], df["Mapped Proteins"], strict=True
-        ):
-            if mapped_protein_fp == "":
-                mapped_proteins = [protein]
-            else:
-                additional_mapped_proteins = msreport.reader._extract_protein_ids(
-                    mapped_protein_fp.split(", ")
-                )
-                mapped_proteins = [protein] + additional_mapped_proteins
-            mapped_proteins_entries.append(";".join(mapped_proteins))
-        df["Mapped proteins"] = mapped_proteins_entries
+        df["Mapped Proteins"] = self._collect_mapped_proteins(df)
         if rename_columns:
             df = self._rename_columns(df, prefix_tag=True)
@@ -980,6 +1060,35 @@ class FragPipeReader(ResultReader):
             df[key] = protein_entry_table[key]
         return df
+    def _collect_mapped_proteins(self, df: pd.DataFrame) -> list[str]:
+        """Generates a list of mapped proteins entries.
+        This method extracts protein IDs from the 'Representative protein' and the
+        'Mapped Proteins' column and combines them into a single string for each row,
+        where multiple protein IDs are separated by semicolons.
+        Args:
+            df: DataFrame containing the 'Mapped Proteins' column.
+        Returns:
+            A list of mapped proteins entries.
+        """
+        mapped_proteins_entries = []
+        for protein, mapped_protein_fp in zip(
+            df["Representative protein"],
+            df["Mapped Proteins"].astype(str).replace("nan", ""),
+            strict=True,
+        ):
+            if mapped_protein_fp == "":
+                mapped_proteins = [protein]
+            else:
+                additional_mapped_proteins = msreport.reader._extract_protein_ids(
+                    mapped_protein_fp.split(", ")
+                )
+                mapped_proteins = [protein] + additional_mapped_proteins
+            mapped_proteins_entries.append(";".join(mapped_proteins))
+        return mapped_proteins_entries
     def _collect_leading_protein_entries(self, df: pd.DataFrame) -> list[list[str]]:
         """Generates a list of leading protein entries.
@@ -995,6 +1104,9 @@ class FragPipeReader(ResultReader):
             A list of the same length as the input dataframe. Each position contains a
             list of leading protein entries, with a minimum of one entry.
         """
+        if self._sil:  # No "Indistinguishable Proteins" columns in 'SIL' data
+            return [[p] for p in df["Protein"]]
         leading_protein_entries = []
         for protein_entry, indist_protein_entry in zip(
             df["Protein"], df["Indistinguishable Proteins"].fillna("").astype(str)
@@ -1411,6 +1523,7 @@ class SpectronautReader(ResultReader):
         filename: Optional[str] = None,
         filetag: Optional[str] = None,
         rename_columns: bool = True,
+        rewrite_modifications: bool = True,
     ) -> pd.DataFrame:
         """Reads an ion evidence file (long format) and returns a processed dataframe.
@@ -1420,8 +1533,15 @@ class SpectronautReader(ResultReader):
         generated by concatenating the "Modified sequence" and "Charge" columns, and if
         present, the "Compensation voltage" column.
-        (!) Note that the modified sequence and modification localization probabilities
-        are currently not processed.
+        "Modified sequence" entries contain modifications within square brackets.
+        "Modification" entries are strings in the form of "position:modification_tag",
+        multiple modifications are joined by ";". An example for a modified sequence and
+        a modification entry: "PEPT[Phospho]IDO[Oxidation]", "4:Phospho;7:Oxidation".
+        "Modification localization string" contains localization probabilities in the
+        format "Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
+        e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
+        `msreport.peptidoform.make_localization_string` for details.
         Args:
             filename: Optional, allows specifying a specific file that will be imported.
@@ -1429,6 +1549,10 @@ class SpectronautReader(ResultReader):
                 a substring, instead of specifying a filename.
             rename_columns: If True, columns are renamed according to the MsReport
                 convention; default True.
+            rewrite_modifications: If True, the peptide format in "Modified sequence" is
+                changed according to the MsReport convention, and a "Modifications"
+                column is added that lists the amino acid position of each modification.
+                Requires 'rename_columns' to be true. Default True.
         Returns:
             A dataframe containing the processed ion table.
@@ -1456,6 +1580,9 @@ class SpectronautReader(ResultReader):
         df = self._add_protein_entries(df)
         if rename_columns:
             df = self._rename_columns(df, True)
+        if rewrite_modifications and rename_columns:
+            df = self._add_peptide_modification_entries(df)
+            df = self._add_modification_localization_string(df)
             df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
             if "Compensation voltage" in df.columns:
                 _cv = df["Compensation voltage"].astype(str)
@@ -1509,6 +1636,70 @@ class SpectronautReader(ResultReader):
         leading_protein_entries = df["PG.ProteinAccessions"].str.split(";").tolist()
         return leading_protein_entries
+    def _add_peptide_modification_entries(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Adds standardized "Modified sequence" and "Modifications" columns.
+        "Modified sequence" entries contain modifications within square brackets.
+        "Modifications" entries are strings in the form of "position:modification_text",
+        multiple modifications are joined by ";". An example for a modified sequence and
+        a modification entry: "PEPT[Phospho]IDO[Oxidation]", "4:Phospho;7:Oxidation".
+        Requires the columns "Peptide sequence" and "Modified sequence" from the
+        software output.
+        Args:
+            df: Dataframe containing "Peptide sequence" and "Modified sequence" columns.
+        Returns:
+            A copy of the input dataframe with updated columns.
+        """
+        # TODO: not tested
+        mod_sequences = df["Modified sequence"].str[1:-1]  # Remove surrounding "_"
+        mod_entries = _generate_modification_entries(
+            df["Peptide sequence"], mod_sequences, "[", "]"
+        )
+        new_df = df.copy()
+        new_df["Modified sequence"] = mod_entries["Modified sequence"]
+        new_df["Modifications"] = mod_entries["Modifications"]
+        return new_df
+    def _add_modification_localization_string(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Adds modification localization string columns.
+        Extracts localization probabilities from the "EG.PTMLocalizationProbabilities"
+        column, converts them into the standardized modification localization string
+        format used by msreport, and adds new column "Modification localization string".
+        Probabilities are written in the format
+        "Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
+        e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
+        `msreport.peptidoform.make_localization_string` for details.
+        Args:
+            df: Dataframe containing a "EG.PTMLocalizationProbabilities" column.
+        Returns:
+            A copy of the input dataframe with the added column
+            "Modification localization string".
+        """
+        # TODO: not tested
+        new_df = df.copy()
+        localization_strings = []
+        for localization_entry in new_df["EG.PTMLocalizationProbabilities"]:
+            if localization_entry == "":
+                localization_strings.append("")
+                continue
+            localization_probabilities = extract_spectronaut_localization_probabilities(
+                localization_entry
+            )
+            localization_string = msreport.peptidoform.make_localization_string(
+                localization_probabilities
+            )
+            localization_strings.append(localization_string)
+        new_df["Modification localization string"] = localization_strings
+        return new_df
 def sort_leading_proteins(
     table: pd.DataFrame,
@@ -1551,7 +1742,7 @@ def sort_leading_proteins(
     db_origins_present = "Leading proteins database origin" in table
     if database_order is not None:
-        database_encoding = defaultdict(lambda: 999)
+        database_encoding: dict[str, int] = defaultdict(lambda: 999)
         database_encoding.update({db: i for i, db in enumerate(database_order)})
     if penalize_contaminants is not None:
         contaminant_encoding = {"False": 0, "True": 1, False: 0, True: 1}
@@ -1559,7 +1750,7 @@ def sort_leading_proteins(
     for _, row in table.iterrows():
         protein_ids = row["Leading proteins"].split(";")
-        sorting_info = [[] for _ in protein_ids]
+        sorting_info: list[list] = [[] for _ in protein_ids]
         if special_proteins is not None:
             for i, _id in enumerate(protein_ids):
                 sorting_info[i].append(_id not in special_proteins)
@@ -1699,7 +1890,7 @@ def add_protein_site_annotation(
     protein_db: ProteinDatabase,
     protein_column: str = "Representative protein",
     site_column: str = "Protein site",
-):
+) -> pd.DataFrame:
     """Uses a FASTA protein database to add protein site annotation columns.
     Adds the columns "Modified residue", which corresponds to the amino acid at the
@@ -1837,6 +2028,61 @@ def add_leading_proteins_annotation(
     return table
+def add_protein_site_identifiers(
+    table: pd.DataFrame,
+    protein_db: ProteinDatabase,
+    site_column: str,
+    protein_name_column: str,
+):
+    """Adds a "Protein site identifier" column to the 'table'.
+    The "Protein site identifier" is generated by concatenating the protein name
+    with the amino acid and position of the protein site or sites, e.g. "P12345 - S123"
+    or "P12345 - S123 / T125". The amino acid is extracted from the protein sequence at
+    the position of the site. If the protein name is not available, the
+    "Representative protein" entry is used instead.
+    Args:
+        table: Dataframe to which the protein site identifiers are added.
+        protein_db: A protein database containing entries from one or multiple FASTA
+            files. Protein identifiers in the 'table' column "Representative protein"
+            are used to look up entries in the 'protein_db'.
+        site_column: Column in 'table' that contains protein site positions. Positions
+            are one-indexed, meaning the first amino acid of the protein is position 1.
+            Multiple sites in a single entry should be separated by ";".
+        protein_name_column: Column in 'table' that contains protein names, which will
+            be used to generate the identifier. If no name is available, the accession
+            is used instead.
+    Raises:
+        ValueError: If the "Representative protein", 'protein_name_column' or
+            'site_column' is not found in the 'table'.
+    """
+    if site_column not in table.columns:
+        raise ValueError(f"Column '{site_column}' not found in the table.")
+    if protein_name_column not in table.columns:
+        raise ValueError(f"Column '{protein_name_column}' not found in the table.")
+    if "Representative protein" not in table.columns:
+        raise ValueError("Column 'Representative protein' not found in the table.")
+    site_identifiers = []
+    for accession, sites, name in zip(
+        table["Representative protein"],
+        table[site_column].astype(str),
+        table[protein_name_column],
+    ):
+        protein_sequence = protein_db[accession].sequence
+        protein_identifier = name if name else accession
+        aa_sites = []
+        for site in sites.split(";"):
+            aa = protein_sequence[int(site) - 1]
+            aa_sites.append(f"{aa}{site}")
+        aa_site_tag = " / ".join(aa_sites)
+        site_identifier = f"{protein_identifier} - {aa_site_tag}"
+        site_identifiers.append(site_identifier)
+    table["Protein site identifier"] = site_identifiers
 def add_sequence_coverage(
     protein_table: pd.DataFrame,
     peptide_table: pd.DataFrame,
@@ -2296,7 +2542,9 @@ def _extract_fragpipe_assigned_modifications(
     return modifications
-def extract_maxquant_localization_probabilities(localization_entry: str) -> dict:
+def extract_maxquant_localization_probabilities(
+    localization_entry: str,
+) -> dict[int, float]:
     """Extract localization probabilities from a MaxQuant "Probabilities" entry.
     Args:
@@ -2353,6 +2601,39 @@ def extract_fragpipe_localization_probabilities(localization_entry: str) -> dict
     return modification_probabilities
+def extract_spectronaut_localization_probabilities(localization_entry: str) -> dict:
+    """Extract localization probabilities from a Spectronaut localization entry.
+    Args:
+        localization_entry: Entry from the "EG.PTMLocalizationProbabilities" column of a
+            Spectronaut elution group (EG) output table.
+    Returns:
+        A dictionary of modifications containing a dictionary of {position: probability}
+        mappings. Positions are one-indexed, which means that the first amino acid
+        position is 1.
+    Example:
+    >>> extract_spectronaut_localization_probabilities(
+    ...     "_HM[Oxidation (M): 100%]S[Phospho (STY): 45.5%]GS[Phospho (STY): 54.5%]PG_"
+    ... )
+    {'Oxidation (M)': {2: 1.0}, 'Phospho (STY)': {3: 0.455, 5: 0.545}}
+    """
+    modification_probabilities: dict[str, dict[int, float]] = {}
+    localization_entry = localization_entry.strip("_")
+    _, raw_probability_entries = msreport.peptidoform.parse_modified_sequence(
+        localization_entry, "[", "]"
+    )
+    for site, mod_probability_entry in raw_probability_entries:
+        modification, probability_entry = mod_probability_entry.split(": ")
+        if modification not in modification_probabilities:
+            modification_probabilities[modification] = {}
+        probability = float(probability_entry.replace("%", "")) / 100.0
+        modification_probabilities[modification][site] = probability
+    return modification_probabilities
 def _extract_protein_ids(entries: list[str]) -> list[str]:
     """Returns a list of protein IDs, extracted from protein entries.
@@ -2466,8 +2747,8 @@ def _create_multi_protein_annotations_from_db(
                 query_result.append(query_function(db_entry, default_value))
             else:
                 query_result.append(default_value)
-        query_result = ";".join(map(str, query_result))
-        annotation_values.append(query_result)
+        annotation_value = ";".join(map(str, query_result))
+        annotation_values.append(annotation_value)
     return annotation_values

msreport/rinterface/limma.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""Python interface to custome R scripts."""
+"""Python interface to the 'limma.R' script."""
 import os

{msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/METADATA RENAMED Viewed

@@ -1,10 +1,11 @@
 Metadata-Version: 2.4
 Name: msreport
-Version: 0.0.29
+Version: 0.0.31
 Summary: Post processing and analysis of quantitative proteomics data
 Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
 License-Expression: Apache-2.0
 Project-URL: homepage, https://github.com/hollenstein/msreport
+Project-URL: documentation, https://hollenstein.github.io/msreport/
 Project-URL: changelog, https://github.com/hollenstein/msreport/blob/main/CHANGELOG.md
 Keywords: mass spectrometry,proteomics,post processing,data analysis
 Classifier: Development Status :: 4 - Beta
@@ -29,10 +30,17 @@ Requires-Dist: seaborn>=0.12.0
 Requires-Dist: statsmodels>=0.13.2
 Requires-Dist: typing_extensions>=4
 Provides-Extra: r
-Requires-Dist: rpy2!=3.5.13,>=3.5.3; extra == "r"
+Requires-Dist: rpy2<3.5.13,>=3.5.3; extra == "r"
 Provides-Extra: dev
 Requires-Dist: mypy>=1.15.0; extra == "dev"
 Requires-Dist: pytest>=8.3.5; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: mkdocs-awesome-nav>=3.1.2; extra == "docs"
+Requires-Dist: mkdocs-macros-plugin>=1.3.7; extra == "docs"
+Requires-Dist: mkdocs-material>=9.6.15; extra == "docs"
+Requires-Dist: mkdocs-roamlinks-plugin>=0.3.2; extra == "docs"
+Requires-Dist: mkdocstrings-python>=1.16.12; extra == "docs"
+Requires-Dist: ruff>=0.12.2; extra == "docs"
 Provides-Extra: test
 Requires-Dist: pytest>=8.3.5; extra == "test"
 Dynamic: license-file
@@ -40,6 +48,7 @@ Dynamic: license-file
 # MsReport
 [![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip)
+[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.15309090.svg)](https://doi.org/10.5281/zenodo.15309090)
 ![Python Version from PEP 621 TOML](https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fhollenstein%2Fmsreport%2Fmain%2Fpyproject.toml)
 [![Run tests](https://github.com/hollenstein/msreport/actions/workflows/run-tests.yml/badge.svg)](https://github.com/hollenstein/msreport/actions/workflows/run-tests.yml)
@@ -55,6 +64,7 @@ bottom-up mass spectrometry experiments.
     - [Additional requirements](#additional-requirements)
     - [Optional Dependencies](#optional-dependencies)
 - [Development status](#development-status)
+- [How to cite](#how-to-cite)
 ## What is MsReport?
@@ -62,6 +72,8 @@ MsReport is a Python library designed to simplify the post-processing and analys
 The library supports importing protein and peptide-level quantification results from MaxQuant, FragPipe, and Spectronaut, as well as post-translational modification (PTM) data from MaxQuant and FragPipe. MsReport provides tools for data annotation, normalization and transformation, statistical testing, and data visualization.
+The [documentation](https://hollenstein.github.io/msreport/) provides an overview of the library's public API.
 ### Key features of MsReport
 #### Data Import and Standardization
@@ -134,3 +146,9 @@ For example, the R home directory might look like this on Windows: `C:\Program F
 ## Development status
 MsReport is a stable and reliable library that has been used on a daily basis for over two years in the Mass Spectrometry Facility at the Max Perutz Labs and the Mass Spectrometry Facility of IMP/IMBA/GMI. While the current interface of MsReport is stable, the library is still under active development, with new features being added regularly. Please note that a major rewrite is planned, which may introduce changes to the API in the future.
+## How to cite
+If you use MsReport for your research or publications, please include the following citation and consider giving the project a star on GitHub.
+> Hollenstein, D. M., & Hartl, M. (2025). hollenstein/msreport: v0.0.29 (0.0.29). Zenodo. https://doi.org/10.5281/zenodo.15309090

msreport 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl

msreport 0.0.29py3-none-any.whl → 0.0.31py3-none-any.whl