PyPI - ms2rescore - Versions diffs - 3.1.0.dev6__tar.gz → 3.1.0.dev7__tar.gz - Mend

ms2rescore 3.1.0.dev6.tar.gz → 3.1.0.dev7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

{ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ms2rescore
-Version: 3.1.0.dev6
+Version: 3.1.0.dev7
 Summary: MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and retention times.
 Keywords: MS2Rescore,MS2PIP,DeepLC,Percolator,proteomics,mass spectrometry,peptide identification,rescoring,machine learning
 Author: Ana Sílvia C. Silva, Robbin Bouwmeester, Louise Buur
@@ -24,13 +24,12 @@ Requires-Dist: lxml>=4.5
 Requires-Dist: mokapot>=0.9
 Requires-Dist: ms2pip>=4.0.0-dev10
 Requires-Dist: ms2rescore_rs
-Requires-Dist: numpy==1.24.3; python_version == '3.11'
-Requires-Dist: numpy>=1.16.0; python_version != '3.11'
+Requires-Dist: numpy>=1.16.0
+Requires-Dist: scikit-learn==1.5.1; python_version == '3.11'
 Requires-Dist: pandas>=1.0
 Requires-Dist: plotly>=5
-Requires-Dist: psm_utils>=0.8
-Requires-Dist: pydantic>=1.8.2,<2
-Requires-Dist: pyteomics>=4.1.0, <4.7
+Requires-Dist: psm_utils>=0.9
+Requires-Dist: pyteomics>=4.7.2
 Requires-Dist: rich>=12
 Requires-Dist: tomli>=2; python_version < '3.11'
 Requires-Dist: ruff ; extra == "dev"

{ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/__init__.py RENAMED Viewed

@@ -1,6 +1,6 @@
 """MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and RTs."""
-__version__ = "3.1.0-dev6"
+__version__ = "3.1.0-dev7"
 from warnings import filterwarnings

{ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/core.py RENAMED Viewed

@@ -5,6 +5,7 @@ from typing import Dict, Optional
 import numpy as np
 import psm_utils.io
+from mokapot.dataset import LinearPsmDataset
 from psm_utils import PSMList
 from ms2rescore import exceptions
@@ -13,6 +14,7 @@ from ms2rescore.parse_psms import parse_psms
 from ms2rescore.parse_spectra import get_missing_values
 from ms2rescore.report import generate
 from ms2rescore.rescoring_engines import mokapot, percolator
+from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence
 logger = logging.getLogger(__name__)
@@ -104,8 +106,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         logging.debug(f"Creating USIs for {len(psm_list)} PSMs")
         psm_list["spectrum_id"] = [psm.get_usi(as_url=False) for psm in psm_list]
-    # If no rescoring engine is specified, write PSMs and features to PIN file
-    if not config["rescoring_engine"]:
+    # If no rescoring engine is specified or DEBUG, write PSMs and features to PIN file
+    if not config["rescoring_engine"] or config["log_level"] == "debug":
         logger.info(f"Writing added features to PIN file: {output_file_root}.psms.pin")
         psm_utils.io.write_file(
             psm_list,
@@ -113,42 +115,52 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
             filetype="percolator",
             feature_names=all_feature_names,
         )
+    if not config["rescoring_engine"]:
+        logger.info("No rescoring engine specified. Skipping rescoring.")
         return None
     # Rescore PSMs
-    if "percolator" in config["rescoring_engine"]:
-        percolator.rescore(
-            psm_list,
-            output_file_root=output_file_root,
-            log_level=config["log_level"],
-            processes=config["processes"],
-            percolator_kwargs=config["rescoring_engine"]["percolator"],
-        )
-    elif "mokapot" in config["rescoring_engine"]:
-        if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
-            config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
-        if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
-            protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
-        else:
-            protein_kwargs = dict()
-        mokapot.rescore(
-            psm_list,
-            output_file_root=output_file_root,
-            protein_kwargs=protein_kwargs,
-            **config["rescoring_engine"]["mokapot"],
-        )
+    try:
+        if "percolator" in config["rescoring_engine"]:
+            percolator.rescore(
+                psm_list,
+                output_file_root=output_file_root,
+                log_level=config["log_level"],
+                processes=config["processes"],
+                percolator_kwargs=config["rescoring_engine"]["percolator"],
+            )
+        elif "mokapot" in config["rescoring_engine"]:
+            if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
+                config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
+            if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
+                protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
+            else:
+                protein_kwargs = dict()
+            mokapot.rescore(
+                psm_list,
+                output_file_root=output_file_root,
+                protein_kwargs=protein_kwargs,
+                **config["rescoring_engine"]["mokapot"],
+            )
+    except exceptions.RescoringError as e:
+        logger.exception(e)
+        rescoring_succeeded = False
     else:
-        logger.info("No known rescoring engine specified. Skipping rescoring.")
+        rescoring_succeeded = True
+        _log_id_psms_after(psm_list, id_psms_before)
-    _log_id_psms_after(psm_list, id_psms_before)
+    # Workaround for broken PEP calculation if best PSM is decoy
+    if all(psm_list["pep"] == 1.0):
+        psm_list = _fix_constant_pep(psm_list)
     # Write output
     logger.info(f"Writing output to {output_file_root}.psms.tsv...")
     psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv")
     # Write report
-    if config["write_report"]:
+    if config["write_report"] and rescoring_succeeded:
         try:
             generate.generate_report(
                 output_file_root, psm_list=psm_list, feature_names=feature_names, use_txt_log=True
@@ -231,3 +243,44 @@ def _log_id_psms_after(psm_list, id_psms_before):
     logger.info(f"Identified {diff_numbers} {diff_word} PSMs at 1% FDR after rescoring.")
     return id_psms_after
+def _fix_constant_pep(psm_list):
+    """
+    Workaround for broken PEP calculation if best PSM is decoy.
+
+    Decoy PSMs that score higher than the best-scoring target PSM are removed, after which
+    PSM- and peptide-level confidence estimates are recalculated with Mokapot from the
+    existing scores.
+
+    Parameters
+    ----------
+    psm_list
+        PSM list with ``score`` and ``is_decoy`` values.
+
+    Returns
+    -------
+    psm_list
+        PSM list without the removed decoys and with recalculated confidence estimates. If
+        no decoy scores higher than the best target, the input is returned unchanged.
+
+    """
+    logger.warning(
+        "Attempting to fix constant PEP values by removing decoy PSMs that score higher than the "
+        "best target PSM."
+    )
+    max_target_score = psm_list["score"][~psm_list["is_decoy"]].max()
+    higher_scoring_decoys = psm_list["is_decoy"] & (psm_list["score"] > max_target_score)
+
+    # Nothing to remove: always hand back a PSM list, never an implicit None, because the
+    # caller replaces its own PSM list with this return value.
+    if not higher_scoring_decoys.any():
+        logger.warning("No decoys scoring higher than the best target found. Skipping fix.")
+        return psm_list
+
+    logger.warning(f"Removing {higher_scoring_decoys.sum()} decoy PSMs.")
+    psm_list = psm_list[~higher_scoring_decoys]
+
+    # Minimal conversion to LinearPsmDataset
+    psm_df = psm_list.to_dataframe()
+    psm_df = psm_df.reset_index(drop=True).reset_index()
+    # Peptide identifier: peptidoform string without the trailing ``/<digits>`` suffix
+    psm_df["peptide"] = (
+        psm_df["peptidoform"].astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True)
+    )
+    psm_df["is_target"] = ~psm_df["is_decoy"]
+    lin_psm_data = LinearPsmDataset(
+        psms=psm_df[["index", "peptide", "score", "is_target"]],
+        target_column="is_target",
+        spectrum_columns="index",  # Use artificial index to allow multi-rank rescoring
+        peptide_column="peptide",
+        feature_columns=["score"],
+    )
+
+    # Recalculate confidence
+    new_confidence = lin_psm_data.assign_confidence()
+
+    # Add new confidence estimations to PSMList
+    add_psm_confidence(psm_list, new_confidence)
+    add_peptide_confidence(psm_list, new_confidence)
+
+    return psm_list

{ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/exceptions.py RENAMED Viewed

@@ -35,3 +35,9 @@ class ReportGenerationError(MS2RescoreError):
     """Error while generating report."""
     pass
+class RescoringError(MS2RescoreError):
+    """Raised when rescoring the PSMs fails."""

{ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/package_data/config_default.json RENAMED Viewed

@@ -14,6 +14,7 @@
         },
         "rescoring_engine": {
             "mokapot": {
+                "train_fdr": 0.01,
                 "write_weights": true,
                 "write_txt": true,
                 "write_flashlfq": true

{ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/package_data/config_schema.json RENAMED Viewed

@@ -68,7 +68,11 @@
                 },
                 "psm_file": {
                     "description": "Path to file with peptide-spectrum matches.",
-                    "oneOf": [{ "type": "string" }, { "type": "null" }, { "type": "array", "items": { "type": "string" } }]
+                    "oneOf": [
+                        { "type": "string" },
+                        { "type": "null" },
+                        { "type": "array", "items": { "type": "string" } }
+                    ]
                 },
                 "psm_file_type": {
                     "description": "PSM file type. By default inferred from file extension.",
@@ -159,7 +163,7 @@
                     "default": false
                 },
                 "profile": {
-                    "description": "Write an txt report using cProfile for profiling",
+                    "description": "Write a txt report using cProfile for profiling",
                     "type": "boolean",
                     "default": false
                 }
@@ -263,6 +267,13 @@
             "type": "object",
             "additionalProperties": true,
             "properties": {
+                "train_fdr": {
+                    "description": "FDR threshold for training Mokapot",
+                    "type": "number",
+                    "minimum": 0,
+                    "maximum": 1,
+                    "default": 0.01
+                },
                 "write_weights": {
                     "description": "Write Mokapot weights to a text file",
                     "type": "boolean",

{ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/parse_psms.py RENAMED Viewed

@@ -2,6 +2,7 @@ import logging
 import re
 from typing import Dict, Union
+import numpy as np
 import psm_utils.io
 from psm_utils import PSMList
@@ -25,6 +26,7 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
     """
     # Read PSMs, find decoys, calculate q-values
     psm_list = _read_psms(config, psm_list)
+    psm_list = _remove_invalid_aa(psm_list)
     _find_decoys(config, psm_list)
     _calculate_qvalues(config, psm_list)
     if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]:
@@ -70,10 +72,6 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
         new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
         psm_list["spectrum_id"] = new_ids
-    # TODO: Temporary fix until implemented in psm_utils
-    # Ensure that spectrum IDs are strings (Pydantic 2.0 does not coerce int to str)
-    psm_list["spectrum_id"] = [str(spec_id) for spec_id in psm_list["spectrum_id"]]
     return psm_list
@@ -82,21 +80,20 @@ def _read_psms(config, psm_list):
         return psm_list
     else:
         logger.info("Reading PSMs from file...")
-        current_file = 1
         total_files = len(config["psm_file"])
-        valid_psms_list = []
-        total_psms = 0
-        valid_psms = 0
-        for psm_file in config["psm_file"]:
+        psm_list = []
+        for current_file, psm_file in enumerate(config["psm_file"]):
             logger.info(
-                f"Reading PSMs from PSM file ({current_file}/{total_files}): '{psm_file}'..."
+                f"Reading PSMs from PSM file ({current_file+1}/{total_files}): '{psm_file}'..."
             )
             try:
-                id_file_psm_list = psm_utils.io.read_file(
-                    psm_file,
-                    filetype=config["psm_file_type"],
-                    show_progressbar=True,
-                    **config["psm_reader_kwargs"],
+                psm_list.extend(
+                    psm_utils.io.read_file(
+                        psm_file,
+                        filetype=config["psm_file_type"],
+                        show_progressbar=True,
+                        **config["psm_reader_kwargs"],
+                    )
                 )
             except psm_utils.io.PSMUtilsIOException:
                 raise MS2RescoreConfigurationError(
@@ -105,18 +102,9 @@ def _read_psms(config, psm_list):
                     "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
                     " for more information."
                 )
+            logger.debug(f"Read {len(psm_list)} PSMs from '{psm_file}'.")
-            total_psms += len(id_file_psm_list.psm_list)
-            for psm in id_file_psm_list.psm_list:
-                if not _has_invalid_aminoacids(psm):
-                    valid_psms_list.append(psm)
-                    valid_psms += 1
-            current_file += 1
-        if total_psms - valid_psms > 0:
-            logger.warning(
-                f"{total_psms - valid_psms} PSMs with invalid amino acids were removed."
-            )
-        return PSMList(psm_list=valid_psms_list)
+        return PSMList(psm_list=psm_list)
 def _find_decoys(config, psm_list):
@@ -175,6 +163,7 @@ def _parse_values_spectrum_id(config, psm_list):
             raise MS2RescoreConfigurationError(
                 f"Could not parse retention time from spectrum_id with the "
                 f"{config['psm_id_rt_pattern']} regex pattern. "
+                f"Example spectrum_id: '{psm_list[0].spectrum_id}'\n."
                 "Please make sure the retention time key is present in the spectrum_id "
                 "and the value is in a capturing group or disable the relevant feature generator."
             )
@@ -198,7 +187,16 @@ def _parse_values_spectrum_id(config, psm_list):
             )
-def _has_invalid_aminoacids(psm):
-    """Check if a PSM contains invalid amino acids."""
+def _remove_invalid_aa(psm_list: PSMList) -> PSMList:
+    """
+    Remove PSMs with invalid amino acids.
+
+    A PSM is dropped when its peptidoform sequence contains any of the characters ``B``,
+    ``J``, ``O``, ``U``, ``X`` or ``Z``. If no such PSM is found, the input list is returned
+    as is.
+    """
+    logger.debug("Removing PSMs with invalid amino acids...")
+    # Boolean mask with one entry per PSM; True marks a PSM that has to be removed
+    invalid_psms = np.array(
+        [any(aa in "BJOUXZ" for aa in psm.peptidoform.sequence) for psm in psm_list]
+    )
-    return any(aa not in "ACDEFGHIKLMNPQRSTVWY" for aa in psm.peptidoform.sequence)
+    if any(invalid_psms):
+        logger.warning(f"Removed {sum(invalid_psms)} PSMs with invalid amino acids.")
+        # Keep only the PSMs that are not flagged in the mask
+        return psm_list[~invalid_psms]
+    else:
+        logger.debug("No PSMs with invalid amino acids found.")
+        return psm_list

{ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/charts.py RENAMED Viewed

@@ -198,6 +198,7 @@ def score_scatter_plot(
     after: mokapot.LinearConfidence,
     level: str = "psms",
     indexer: str = "index",
+    fdr_threshold: float = 0.01,
 ) -> go.Figure:
     """
     Plot PSM scores before and after rescoring.
@@ -242,12 +243,12 @@ def score_scatter_plot(
     # Get score thresholds
     score_threshold_before = (
-        ce_psms[ce_psms["mokapot q-value before"] <= 0.01]
+        ce_psms[ce_psms["mokapot q-value before"] <= fdr_threshold]
         .sort_values("mokapot q-value before", ascending=False)["mokapot score before"]
         .iloc[0]
     )
     score_threshold_after = (
-        ce_psms[ce_psms["mokapot q-value after"] <= 0.01]
+        ce_psms[ce_psms["mokapot q-value after"] <= fdr_threshold]
         .sort_values("mokapot q-value after", ascending=False)["mokapot score after"]
         .iloc[0]
     )

{ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/rescoring_engines/mokapot.py RENAMED Viewed

@@ -29,8 +29,11 @@ import pandas as pd
 import psm_utils
 from mokapot.brew import brew
 from mokapot.dataset import LinearPsmDataset
+from mokapot.model import PercolatorModel
 from pyteomics.mass import nist_mass
+from ms2rescore.exceptions import RescoringError
 logger = logging.getLogger(__name__)
 logging.getLogger("numba").setLevel(logging.WARNING)
@@ -39,6 +42,7 @@ def rescore(
     psm_list: psm_utils.PSMList,
     output_file_root: str = "ms2rescore",
     fasta_file: Optional[str] = None,
+    train_fdr: float = 0.01,
     write_weights: bool = False,
     write_txt: bool = False,
     write_flashlfq: bool = False,
@@ -65,6 +69,8 @@ def rescore(
     fasta_file
         Path to FASTA file with protein sequences to use for protein inference. Defaults to
         ``None``.
+    train_fdr
+        FDR to use for training the Mokapot model. Defaults to ``0.01``.
     write_weights
         Write model weights to a text file. Defaults to ``False``.
     write_txt
@@ -91,46 +97,15 @@ def rescore(
     # Rescore
     logger.debug(f"Mokapot brew options: `{kwargs}`")
-    confidence_results, models = brew(lin_psm_data, rng=8, **kwargs)
-    # Reshape confidence estimates to match PSMList
-    keys = ["mokapot score", "mokapot q-value", "mokapot PEP"]
-    mokapot_values_targets = (
-        confidence_results.confidence_estimates["psms"].set_index("index").sort_index()[keys]
-    )
-    mokapot_values_decoys = (
-        confidence_results.decoy_confidence_estimates["psms"].set_index("index").sort_index()[keys]
-    )
-    q = np.full((len(psm_list), 3), np.nan)
-    q[mokapot_values_targets.index] = mokapot_values_targets.values
-    q[mokapot_values_decoys.index] = mokapot_values_decoys.values
-    # Add Mokapot results to PSMList
-    psm_list["score"] = q[:, 0]
-    psm_list["qvalue"] = q[:, 1]
-    psm_list["pep"] = q[:, 2]
-    # Repeat for peptide-level scores
-    peptides_targets = confidence_results.confidence_estimates["peptides"].set_index(["peptide"])[
-        keys
-    ]
-    peptides_decoys = confidence_results.decoy_confidence_estimates["peptides"].set_index(
-        ["peptide"]
-    )[keys]
-    peptide_info = pd.concat([peptides_targets, peptides_decoys], axis=0).to_dict(orient="index")
-    # Add peptide-level scores to PSM metadata
-    # run_key = "na" if not all(psm.run for psm in psm_list) else None
-    no_charge_pattern = re.compile(r"(/\d+$)")
-    for psm in psm_list:
-        peptide_scores = peptide_info[(no_charge_pattern.sub("", str(psm.peptidoform), 1))]
-        psm.metadata.update(
-            {
-                "peptide_score": peptide_scores["mokapot score"],
-                "peptide_qvalue": peptide_scores["mokapot q-value"],
-                "peptide_pep": peptide_scores["mokapot PEP"],
-            }
+    try:
+        confidence_results, models = brew(
+            lin_psm_data, model=PercolatorModel(train_fdr=train_fdr), rng=8, **kwargs
         )
+    except RuntimeError as e:
+        raise RescoringError("Mokapot could not be run. Please check the input data.") from e
+    add_psm_confidence(psm_list, confidence_results)
+    add_peptide_confidence(psm_list, confidence_results)
     # Write results
     if write_weights:
@@ -245,6 +220,55 @@ def save_model_weights(
     )
+def add_psm_confidence(
+    psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence
+) -> None:
+    """
+    Copy Mokapot PSM-level score, q-value, and PEP onto the PSM list.
+
+    Target and decoy estimates are placed at the row given by their ``index`` column. Rows
+    that occur in neither table are set to NaN.
+    """
+    columns = ["mokapot score", "mokapot q-value", "mokapot PEP"]
+
+    def _ordered_estimates(table):
+        return table.set_index("index").sort_index()[columns]
+
+    target_estimates = _ordered_estimates(confidence_results.confidence_estimates["psms"])
+    decoy_estimates = _ordered_estimates(confidence_results.decoy_confidence_estimates["psms"])
+
+    # One row per PSM, one column per estimate; targets are written first, then decoys
+    estimates = np.full((len(psm_list), 3), np.nan)
+    for group in (target_estimates, decoy_estimates):
+        estimates[group.index] = group.values
+
+    for position, field in enumerate(("score", "qvalue", "pep")):
+        psm_list[field] = estimates[:, position]
+def add_peptide_confidence(
+    psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence
+) -> None:
+    """
+    Store Mokapot peptide-level score, q-value, and PEP in the metadata of each PSM.
+
+    The peptide of a PSM is looked up by its peptidoform string without the trailing
+    ``/<digits>`` suffix.
+    """
+    columns = ["mokapot score", "mokapot q-value", "mokapot PEP"]
+    target_peptides = confidence_results.confidence_estimates["peptides"].set_index("peptide")[
+        columns
+    ]
+    decoy_peptides = confidence_results.decoy_confidence_estimates["peptides"].set_index(
+        "peptide"
+    )[columns]
+    scores_by_peptide = pd.concat([target_peptides, decoy_peptides], axis=0).to_dict(
+        orient="index"
+    )
+
+    # Metadata key -> Mokapot column it is filled from
+    metadata_columns = {
+        "peptide_score": "mokapot score",
+        "peptide_qvalue": "mokapot q-value",
+        "peptide_pep": "mokapot PEP",
+    }
+    charge_suffix = re.compile(r"(/\d+$)")
+    for psm in psm_list:
+        peptide = charge_suffix.sub("", str(psm.peptidoform), 1)
+        row = scores_by_peptide[peptide]
+        psm.metadata.update({key: row[column] for key, column in metadata_columns.items()})
 def _mz_to_mass(mz: float, charge: int) -> float:
     """Convert m/z to mass."""
     return mz * charge - charge * nist_mass["H"][1][0]

{ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/pyproject.toml RENAMED Viewed

@@ -43,13 +43,14 @@ dependencies = [
     "mokapot>=0.9",
     "ms2pip>=4.0.0-dev10",
     "ms2rescore_rs",
-    "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
-    "numpy>=1.16.0; python_version != '3.11'",
+    # "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
+    # "numpy>=1.16.0; python_version != '3.11'",
+    "numpy>=1.16.0",
+    "scikit-learn==1.5.1; python_version == '3.11'",
     "pandas>=1.0",
     "plotly>=5",
-    "psm_utils>=0.8",
-    "pydantic>=1.8.2,<2",                      # Fix compatibility with v2 in psm_utils
-    "pyteomics>=4.1.0, <4.7",
+    "psm_utils>=0.9",
+    "pyteomics>=4.7.2",
     "rich>=12",
     "tomli>=2; python_version < '3.11'",
 ]