PyPI - ms2rescore - Versions diffs - 3.0.3__tar.gz → 3.1.0.dev1__tar.gz - Mend

ms2rescore 3.0.3 (tar.gz) → 3.1.0.dev1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ms2rescore
-Version: 3.0.3
+Version: 3.1.0.dev1
 Summary: MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and retention times.
 Keywords: MS2Rescore,MS2PIP,DeepLC,Percolator,proteomics,mass spectrometry,peptide identification,rescoring,machine learning
 Author: Ana Sílvia C. Silva, Robbin Bouwmeester, Louise Buur
@@ -13,25 +13,26 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
 Classifier: Development Status :: 5 - Production/Stable
-Requires-Dist: ms2rescore_rs
-Requires-Dist: numpy>=1.16.0; python_version != '3.11'
-Requires-Dist: numpy==1.24.3; python_version == '3.11'
-Requires-Dist: pandas>=1.0
-Requires-Dist: rich>=12
-Requires-Dist: pyteomics>=4.1.0
-Requires-Dist: lxml>=4.5
-Requires-Dist: ms2pip>=4.0.0-dev4
-Requires-Dist: click>=7
 Requires-Dist: cascade-config>=0.4.0
+Requires-Dist: click>=7
+Requires-Dist: customtkinter>=5,<6
 Requires-Dist: deeplc>=2.2
 Requires-Dist: deeplcretrainer>=0.2
-Requires-Dist: tomli>=2; python_version < '3.11'
-Requires-Dist: psm_utils>=0.4
-Requires-Dist: customtkinter>=5,<6
-Requires-Dist: mokapot>=0.9
-Requires-Dist: pydantic>=1.8.2,<2
+Requires-Dist: im2deep>=0.1.3
 Requires-Dist: jinja2>=3
+Requires-Dist: lxml>=4.5
+Requires-Dist: mokapot>=0.9
+Requires-Dist: ms2pip>=4.0.0-dev10
+Requires-Dist: ms2rescore_rs
+Requires-Dist: numpy==1.24.3; python_version == '3.11'
+Requires-Dist: numpy>=1.16.0; python_version != '3.11'
+Requires-Dist: pandas>=1.0
 Requires-Dist: plotly>=5
+Requires-Dist: psm_utils>=0.8
+Requires-Dist: pydantic>=1.8.2,<2
+Requires-Dist: pyteomics>=4.1.0, <4.7
+Requires-Dist: rich>=12
+Requires-Dist: tomli>=2; python_version < '3.11'
 Requires-Dist: ruff ; extra == "dev"
 Requires-Dist: black ; extra == "dev"
 Requires-Dist: pytest ; extra == "dev"

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/__init__.py RENAMED Viewed

@@ -1,6 +1,6 @@
 """MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and RTs."""
-__version__ = "3.0.3"
+__version__ = "3.1.0-dev1"
 from warnings import filterwarnings

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/__main__.py RENAMED Viewed

@@ -1,6 +1,8 @@
 """MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and RTs."""
 import argparse
+import importlib.resources
+import json
 import logging
 import sys
 from pathlib import Path
@@ -10,7 +12,7 @@ from rich.console import Console
 from rich.logging import RichHandler
 from rich.text import Text
-from ms2rescore import __version__
+from ms2rescore import __version__, package_data
 from ms2rescore.config_parser import parse_configurations
 from ms2rescore.core import rescore
 from ms2rescore.exceptions import MS2RescoreConfigurationError
@@ -33,19 +35,26 @@ LOGGER = logging.getLogger(__name__)
 CONSOLE = Console(record=True)
-def _print_credits():
+def _print_credits(tims=False):
     """Print software credits to terminal."""
     text = Text()
     text.append("\n")
-    text.append("MS²Rescore", style="bold link https://github.com/compomics/ms2rescore")
+    if tims:
+        text.append("TIMS²Rescore", style="bold link https://github.com/compomics/ms2rescore")
+    else:
+        text.append("MS²Rescore", style="bold link https://github.com/compomics/ms2rescore")
     text.append(f" (v{__version__})\n", style="bold")
+    if tims:
+        text.append("MS²Rescore tuned for Bruker timsTOF instruments.\n", style="italic")
     text.append("Developed at CompOmics, VIB / Ghent University, Belgium.\n")
     text.append("Please cite: ")
     text.append(
-        "Declercq et al. MCP (2022)", style="link https://doi.org/10.1016/j.mcpro.2022.100266"
+        "Buur & Declercq et al. JPR (2024)",
+        style="link https://doi.org/10.1021/acs.jproteome.3c00785",
     )
     text.append("\n")
-    text.stylize("cyan")
+    if tims:
+        text.stylize("#006cb5")
     CONSOLE.print(text)
@@ -152,18 +161,30 @@ def _setup_logging(passed_level: str, log_file: Union[str, Path]):
     )
-def main():
+def main_tims():
+    """Run MS²Rescore command-line interface in TIMS²Rescore mode."""
+    main(tims=True)
+def main(tims=False):
     """Run MS²Rescore command-line interface."""
-    _print_credits()
+    _print_credits(tims)
     # Parse CLI arguments and configuration file
     parser = _argument_parser()
     cli_args = parser.parse_args()
+    configurations = []
+    if cli_args.config_file:
+        configurations.append(cli_args.config_file)
+    if tims:
+        configurations.append(
+            json.load(importlib.resources.open_text(package_data, "config_default_tims.json"))
+        )
+    configurations.append(cli_args)
     try:
-        if cli_args.config_file:
-            config = parse_configurations([cli_args.config_file, cli_args])
-        else:
-            config = parse_configurations(cli_args)
+        config = parse_configurations(configurations)
     except MS2RescoreConfigurationError as e:
         LOGGER.critical(e)
         sys.exit(1)

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/core.py RENAMED Viewed

@@ -3,15 +3,16 @@ import logging
 from multiprocessing import cpu_count
 from typing import Dict, Optional
+import numpy as np
 import psm_utils.io
 from psm_utils import PSMList
+from ms2rescore import exceptions
 from ms2rescore.feature_generators import FEATURE_GENERATORS
 from ms2rescore.parse_psms import parse_psms
 from ms2rescore.parse_spectra import get_missing_values
 from ms2rescore.report import generate
 from ms2rescore.rescoring_engines import mokapot, percolator
-from ms2rescore import exceptions
 logger = logging.getLogger(__name__)
@@ -58,12 +59,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
     )
-    # TODO: avoid hard coding feature generators in some way
-    rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
-    im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
-    if rt_required or im_required:
-        logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
-        get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)
+    # Add missing precursor info from spectrum file if needed
+    _fill_missing_precursor_info(psm_list, config)
     # Add rescoring features
     for fgen_name, fgen_config in config["feature_generators"].items():
@@ -160,6 +157,49 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
             logger.exception(e)
+def _fill_missing_precursor_info(psm_list, config):
+    """Fill missing precursor info from spectrum file if needed."""
+    # Check if required
+    # TODO: avoid hard coding feature generators in some way
+    rt_required = ("deeplc" in config["feature_generators"]) and any(
+        v is None or v == 0 or np.isnan(v) for v in psm_list["retention_time"]
+    )
+    im_required = (
+        "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"]
+    ) and any(v is None or v == 0 or np.isnan(v) for v in psm_list["ion_mobility"])
+    logger.debug(f"RT required: {rt_required}, IM required: {im_required}")
+    # Add missing values
+    if rt_required or im_required:
+        logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
+        get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)
+    # Check if values are now present
+    for value_name in ["retention_time", "ion_mobility"]:
+        if (
+            0.0 in psm_list[value_name]
+            or None in psm_list[value_name]
+            or np.isnan(psm_list[value_name]).any()
+        ):
+            if all(v is None or v == 0.0 or np.isnan(v) for v in psm_list[value_name]):
+                raise exceptions.MissingValuesError(
+                    f"Could not find any '{value_name}' values in PSM or spectrum files. Disable "
+                    f"feature generators that require '{value_name}' or ensure that the values are "
+                    "present in the input files."
+                )
+            else:
+                missing_value_psms = psm_list[
+                    [v is None or np.isnan(v) for v in psm_list[value_name]]
+                ]
+                logger.warning(
+                    f"Found {len(missing_value_psms)} PSMs with missing '{value_name}' values. "
+                    "These PSMs will be removed."
+                )
+                psm_list = psm_list[
+                    [v is not None and not np.isnan(v) for v in psm_list[value_name]]
+                ]
 def _write_feature_names(feature_names, output_file_root):
     """Write feature names to file."""
     with open(output_file_root + ".feature_names.tsv", "w") as f:

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/exceptions.py RENAMED Viewed

@@ -25,6 +25,12 @@ class ModificationParsingError(IDFileParsingError):
     pass
+class MissingValuesError(MS2RescoreError):
+    """Missing values in PSMs and/or spectra."""
+    pass
 class ReportGenerationError(MS2RescoreError):
     """Error while generating report."""

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/feature_generators/__init__.py RENAMED Viewed

@@ -7,6 +7,7 @@ from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator
 from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator
 from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator
 from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator
+from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator
 FEATURE_GENERATORS = {
     "basic": BasicFeatureGenerator,
@@ -14,4 +15,5 @@ FEATURE_GENERATORS = {
     "deeplc": DeepLCFeatureGenerator,
     "maxquant": MaxQuantFeatureGenerator,
     "ionmob": IonMobFeatureGenerator,
+    "im2deep": IM2DeepFeatureGenerator,
 }

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/feature_generators/deeplc.py RENAMED Viewed

@@ -21,12 +21,10 @@ import os
 from collections import defaultdict
 from inspect import getfullargspec
 from itertools import chain
-from typing import List, Optional, Union
+from typing import List, Union
 import numpy as np
-import pandas as pd
 from psm_utils import PSMList
-from psm_utils.io import peptide_record
 from ms2rescore.feature_generators.base import FeatureGeneratorBase
@@ -41,8 +39,7 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase):
         self,
         *args,
         lower_score_is_better: bool = False,
-        calibration_set_size: Union[int, float] = 0.15,
-        spectrum_path: Optional[str] = None,
+        calibration_set_size: Union[int, float, None] = None,
         processes: int = 1,
         **kwargs,
     ) -> None:
@@ -59,9 +56,6 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase):
         calibration_set_size: int or float
             Amount of best PSMs to use for DeepLC calibration. If this value is lower
             than the number of available PSMs, all PSMs will be used. (default: 0.15)
-        spectrum_path
-            Path to spectrum file or directory with spectrum files. If None, inferred from `run`
-            field in PSMs. Defaults to None.
         processes: {int, None}
             Number of processes to use in DeepLC. Defaults to 1.
         kwargs: dict
@@ -77,7 +71,6 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase):
         self.lower_psm_score_better = lower_score_is_better
         self.calibration_set_size = calibration_set_size
-        self.spectrum_path = spectrum_path
         self.processes = processes
         self.deeplc_kwargs = kwargs or {}
@@ -151,17 +144,15 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase):
                     # Make new PSM list for this run (chain PSMs per spectrum to flat list)
                     psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
-                    logger.debug("Calibrating DeepLC...")
                     psm_list_calibration = self._get_calibration_psms(psm_list_run)
+                    logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...")
                     self.deeplc_predictor = self.DeepLC(
                         n_jobs=self.processes,
                         verbose=self._verbose,
                         path_model=self.selected_model or self.user_model,
                         **self.deeplc_kwargs,
                     )
-                    self.deeplc_predictor.calibrate_preds(
-                        seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
-                    )
+                    self.deeplc_predictor.calibrate_preds(psm_list_calibration)
                     # Still calibrate for each run, but do not try out all model options.
                     # Just use model that was selected based on first run
                     if not self.selected_model:
@@ -174,11 +165,7 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase):
                         )
                     logger.debug("Predicting retention times...")
-                    predictions = np.array(
-                        self.deeplc_predictor.make_preds(
-                            seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
-                        )
-                    )
+                    predictions = np.array(self.deeplc_predictor.make_preds(psm_list_run))
                     observations = psm_list_run["retention_time"]
                     rt_diffs_run = np.abs(predictions - observations)
@@ -204,25 +191,25 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase):
                         )
                 current_run += 1
-    # TODO: Remove when DeepLC supports PSMList directly
-    @staticmethod
-    def _psm_list_to_deeplc_peprec(psm_list: PSMList) -> pd.DataFrame:
-        peprec = peptide_record.to_dataframe(psm_list)
-        peprec = peprec.rename(
-            columns={
-                "observed_retention_time": "tr",
-                "peptide": "seq",
-            }
-        )[["tr", "seq", "modifications"]]
-        return peprec
     def _get_calibration_psms(self, psm_list: PSMList):
         """Get N best scoring target PSMs for calibration."""
         psm_list_targets = psm_list[~psm_list["is_decoy"]]
-        n_psms = self._get_number_of_calibration_psms(psm_list_targets)
-        indices = np.argsort(psm_list_targets["score"])
-        indices = indices[:n_psms] if self.lower_psm_score_better else indices[-n_psms:]
-        return psm_list_targets[indices]
+        if self.calibration_set_size:
+            n_psms = self._get_number_of_calibration_psms(psm_list_targets)
+            indices = np.argsort(psm_list_targets["score"])
+            indices = indices[:n_psms] if self.lower_psm_score_better else indices[-n_psms:]
+            return psm_list_targets[indices]
+        else:
+            identified_psms = psm_list_targets[psm_list_targets["qvalue"] <= 0.01]
+            if len(identified_psms) == 0:
+                raise ValueError(
+                    "No target PSMs with q-value <= 0.01 found. Please set calibration set size for calibrating deeplc."
+                )
+            elif (len(identified_psms) < 500) & (self.deeplc_kwargs["deeplc_retrain"]):
+                logger.warning(
+                    " Less than 500 target PSMs with q-value <= 0.01 found for retraining. Consider turning of deeplc_retrain, as this is likely not enough data for retraining."
+                )
+            return identified_psms
     def _get_number_of_calibration_psms(self, psm_list):
         """Get number of calibration PSMs given `calibration_set_size` and total number of PSMs."""

ms2rescore-3.1.0.dev1/ms2rescore/feature_generators/im2deep.py ADDED Viewed

@@ -0,0 +1,169 @@
+"""
+IM2Deep ion mobility-based feature generator.
+IM2Deep is a fully modification-aware peptide ion mobility predictor. It uses a deep convolutional
+neural network to predict retention times based on the atomic composition of the (modified) amino
+acid residues in the peptide. See
+`github.com/compomics/IM2Deep <https://github.com/compomics/IM2Deep>`_ for more information.
+"""
+import contextlib
+import logging
+import os
+from inspect import getfullargspec
+from itertools import chain
+from typing import List
+import numpy as np
+import pandas as pd
+from im2deep.calibrate import im2ccs
+from im2deep.im2deep import predict_ccs
+from psm_utils import PSMList
+from ms2rescore.feature_generators.base import FeatureGeneratorBase
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+logger = logging.getLogger(__name__)
+class IM2DeepFeatureGenerator(FeatureGeneratorBase):
+    """IM2Deep collision cross section feature generator."""
+    def __init__(
+        self,
+        *args,
+        processes: int = 1,
+        **kwargs,
+    ):
+        """
+        Initialize the IM2DeepFeatureGenerator.
+        Parameters
+        ----------
+        processes : int, optional
+            Number of parallel processes to use for IM2Deep predictions. Default is 1.
+        **kwargs : dict, optional
+            Additional keyword arguments to `im2deep.predict_ccs`.
+        """
+        super().__init__(*args, **kwargs)
+        self._verbose = logger.getEffectiveLevel() <= logging.DEBUG
+        # Remove any kwargs that are not IM2Deep arguments
+        self.im2deep_kwargs = kwargs or {}
+        self.im2deep_kwargs = {
+            k: v for k, v in self.im2deep_kwargs.items() if k in getfullargspec(predict_ccs).args
+        }
+        self.im2deep_kwargs["n_jobs"] = processes
+    @property
+    def feature_names(self) -> List[str]:
+        return [
+            "ccs_observed_im2deep",
+            "ccs_predicted_im2deep",
+            "ccs_error_im2deep",
+            "abs_ccs_error_im2deep",
+            "perc_ccs_error_im2deep",
+        ]
+    def add_features(self, psm_list: PSMList) -> None:
+        """Add IM2Deep-derived features to PSMs"""
+        logger.info("Adding IM2Deep-derived features to PSMs")
+        # Get easy-access nested version of PSMlist
+        psm_dict = psm_list.get_psm_dict()
+        # Run IM2Deep for each spectrum file
+        current_run = 1
+        total_runs = sum(len(runs) for runs in psm_dict.values())
+        for runs in psm_dict.values():
+            # Reset IM2Deep predictor for each collection of runs
+            for run, psms in runs.items():
+                logger.info(
+                    f"Running IM2Deep for PSMs from run ({current_run}/{total_runs}): `{run}`..."
+                )
+                # Disable wild logging to stdout by TensorFlow, unless in debug mode
+                with (
+                    contextlib.redirect_stdout(open(os.devnull, "w"))
+                    if not self._verbose
+                    else contextlib.nullcontext()
+                ):
+                    # Make new PSM list for this run (chain PSMs per spectrum to flat list)
+                    psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
+                    logger.debug("Calibrating IM2Deep...")
+                    # Convert ion mobility to CCS and calibrate CCS values
+                    psm_list_run_df = psm_list_run.to_dataframe()
+                    psm_list_run_df["charge"] = [
+                        pep.precursor_charge for pep in psm_list_run_df["peptidoform"]
+                    ]
+                    psm_list_run_df["ccs_observed"] = im2ccs(
+                        psm_list_run_df["ion_mobility"],
+                        psm_list_run_df["precursor_mz"],
+                        psm_list_run_df["charge"],
+                    )
+                    # Create dataframe with high confidence hits for calibration
+                    cal_psm_df = self.make_calibration_df(psm_list_run_df)
+                    # Make predictions with IM2Deep
+                    logger.debug("Predicting CCS values...")
+                    predictions = predict_ccs(
+                        psm_list_run, cal_psm_df, write_output=False, **self.im2deep_kwargs
+                    )
+                    # Add features to PSMs
+                    logger.debug("Adding features to PSMs...")
+                    observations = psm_list_run_df["ccs_observed"]
+                    ccs_diffs_run = np.abs(predictions - observations)
+                    for i, psm in enumerate(psm_list_run):
+                        psm["rescoring_features"].update(
+                            {
+                                "ccs_observed_im2deep": observations[i],
+                                "ccs_predicted_im2deep": predictions[i],
+                                "ccs_error_im2deep": ccs_diffs_run[i],
+                                "abs_ccs_error_im2deep": np.abs(ccs_diffs_run[i]),
+                                "perc_ccs_error_im2deep": np.abs(ccs_diffs_run[i])
+                                / observations[i]
+                                * 100,
+                            }
+                        )
+                current_run += 1
+    @staticmethod
+    def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> pd.DataFrame:
+        """
+        Make dataframe for calibration of IM2Deep predictions.
+        Parameters
+        ----------
+        psm_list_df
+            DataFrame with PSMs.
+        threshold
+            Percentage of highest scoring identified target PSMs to use for calibration,
+            default 0.95.
+        Returns
+        -------
+        pd.DataFrame
+            DataFrame with high confidence hits for calibration.
+        """
+        identified_psms = psm_list_df[
+            (psm_list_df["qvalue"] < 0.01)
+            & (~psm_list_df["is_decoy"])
+            & (psm_list_df["charge"] < 5)  # predictions do not go higher for IM2Deep
+        ]
+        calibration_psms = identified_psms[
+            identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold)
+        ]
+        logger.debug(
+            f"Number of high confidence hits for calculating shift: {len(calibration_psms)}"
+        )
+        return calibration_psms

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/feature_generators/ionmob.py RENAMED Viewed

@@ -165,6 +165,7 @@ class IonMobFeatureGenerator(FeatureGeneratorBase):
                     )
                 ]
+                # TODO: Use observed m/z?
                 psm_list_run_df["mz"] = psm_list_run_df.apply(
                     lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1
                 )  # use precursor m/z from PSMs?
@@ -175,9 +176,8 @@ class IonMobFeatureGenerator(FeatureGeneratorBase):
                 )
                 # calibrate CCS values
                 shift_factor = self.calculate_ccs_shift(psm_list_run_df)
-                psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
-                    lambda x: x["ccs_observed"] + shift_factor, axis=1
-                )
+                psm_list_run_df["ccs_observed"] + shift_factor
                 # predict CCS values
                 tf_ds = to_tf_dataset_inference(
                     psm_list_run_df["mz"],

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/feature_generators/ms2pip.py RENAMED Viewed

@@ -193,7 +193,7 @@ class MS2PIPFeatureGenerator(FeatureGeneratorBase):
                 try:
                     ms2pip_results = correlate(
                         psms=psm_list_run,
-                        spectrum_file=spectrum_filename,
+                        spectrum_file=str(spectrum_filename),
                         spectrum_id_pattern=self.spectrum_id_pattern,
                         model=self.model,
                         ms2_tolerance=self.ms2_tolerance,

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/gui/app.py RENAMED Viewed

@@ -360,15 +360,20 @@ class FeatureGeneratorConfig(ctk.CTkFrame):
         self.deeplc_config = DeepLCConfiguration(self)
         self.deeplc_config.grid(row=2, column=0, pady=(0, 20), sticky="nsew")
+        self.im2deep_config = Im2DeepConfiguration(self)
+        self.im2deep_config.grid(row=3, column=0, pady=(0, 20), sticky="nsew")
         self.ionmob_config = IonmobConfiguration(self)
-        self.ionmob_config.grid(row=3, column=0, pady=(0, 20), sticky="nsew")
+        self.ionmob_config.grid(row=4, column=0, pady=(0, 20), sticky="nsew")
     def get(self) -> Dict:
         """Return the configuration as a dictionary."""
         basic_enabled, basic_config = self.basic_config.get()
         ms2pip_enabled, ms2pip_config = self.ms2pip_config.get()
         deeplc_enabled, deeplc_config = self.deeplc_config.get()
+        im2deep_enabled, im2deep_config = self.im2deep_config.get()
         ionmob_enabled, ionmob_config = self.ionmob_config.get()
         config = {}
         if basic_enabled:
             config["basic"] = basic_config
@@ -523,6 +528,27 @@ class IonmobConfiguration(ctk.CTkFrame):
         return enabled, config
+class Im2DeepConfiguration(ctk.CTkFrame):
+    def __init__(self, *args, **kwargs):
+        """IM2Deep configuration frame."""
+        super().__init__(*args, **kwargs)
+        self.configure(fg_color="transparent")
+        self.grid_columnconfigure(0, weight=1)
+        self.title = widgets.Heading(self, text="im2deep")
+        self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")
+        self.enabled = widgets.LabeledSwitch(self, label="Enable im2deep", default=False)
+        self.enabled.grid(row=1, column=0, pady=(0, 10), sticky="nsew")
+    def get(self) -> Dict:
+        """Return the configuration as a dictionary."""
+        enabled = self.enabled.get()
+        config = {}
+        return enabled, config
 class RescoringEngineConfig(ctk.CTkFrame):
     def __init__(self, *args, **kwargs):
         """Rescoring engine configuration frame."""

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/package_data/config_default.json RENAMED Viewed

@@ -29,6 +29,8 @@
         "id_decoy_pattern": null,
         "psm_id_pattern": null,
         "spectrum_id_pattern": null,
+        "psm_id_rt_pattern": null,
+        "psm_id_im_pattern": null,
         "lower_score_is_better": false,
         "modification_mapping": {},
         "fixed_modifications": {},

ms2rescore-3.1.0.dev1/ms2rescore/package_data/config_default_tims.json ADDED Viewed

@@ -0,0 +1,25 @@
+{
+    "$schema": "./config_schema.json",
+    "ms2rescore": {
+        "feature_generators": {
+            "basic": {},
+            "ms2pip": {
+                "model": "timsTOF",
+                "ms2_tolerance": 0.02
+            },
+            "deeplc": {
+                "deeplc_retrain": false
+            },
+            "im2deep": {},
+            "maxquant": {}
+        },
+        "rescoring_engine": {
+            "mokapot": {
+                "write_weights": true,
+                "write_txt": true,
+                "write_flashlfq": true
+            }
+        },
+        "psm_file": null
+    }
+}

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/package_data/config_schema.json RENAMED Viewed

@@ -29,6 +29,9 @@
                         },
                         "ionmob": {
                             "$ref": "#/definitions/ionmob"
+                        },
+                        "im2deep": {
+                            "$ref": "#/definitions/im2deep"
                         }
                     },
                     "default": {
@@ -107,6 +110,18 @@
                     "default": "(.*)",
                     "format": "regex"
                 },
+                "psm_id_rt_pattern": {
+                    "description": "Regex pattern to extract retention time from PSM identifier. Requires at least one capturing group.",
+                    "oneOf": [{ "type": "string" }, { "type": "null" }],
+                    "default": null,
+                    "format": "regex"
+                },
+                "psm_id_im_pattern": {
+                    "description": "Regex pattern to extract ion mobility from PSM identifier. Requires at least one capturing group.",
+                    "oneOf": [{ "type": "string" }, { "type": "null" }],
+                    "default": null,
+                    "format": "regex"
+                },
                 "lower_score_is_better": {
                     "description": "Bool indicating if lower score is better",
                     "type": "boolean",
@@ -224,6 +239,19 @@
                 }
             }
         },
+        "im2deep": {
+            "$ref": "#/definitions/feature_generator",
+            "description": "Ion mobility feature generator configuration using IM2Deep",
+            "type": "object",
+            "additionalProperties": true,
+            "properties": {
+                "reference_dataset": {
+                    "description": "Path to IM2Deep reference dataset file",
+                    "type": "string",
+                    "default": "Meier_unimod.parquet"
+                }
+            }
+        },
         "mokapot": {
             "$ref": "#/definitions/rescoring_engine",
             "description": "Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function.",

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/parse_psms.py RENAMED Viewed

@@ -27,6 +27,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
     psm_list = _read_psms(config, psm_list)
     _find_decoys(config, psm_list)
     _calculate_qvalues(config, psm_list)
+    if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]:
+        logger.debug("Parsing retention time and/or ion mobility from PSM identifier...")
+        _parse_values_spectrum_id(config, psm_list)
     # Store scoring values for comparison later
     for psm in psm_list:
@@ -51,7 +54,8 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
     non_mapped_modifications = modifications_found - set(config["modification_mapping"].keys())
     if non_mapped_modifications:
         logger.warning(
-            f"Non-mapped modifications found: {non_mapped_modifications}\nThis can be ignored if Unimod modification label"
+            f"Non-mapped modifications found: {non_mapped_modifications}\n"
+            "This can be ignored if they are Unimod modification labels."
         )
     psm_list.rename_modifications(config["modification_mapping"])
     psm_list.add_fixed_modifications(config["fixed_modifications"])
@@ -154,6 +158,46 @@ def _match_psm_ids(old_id, regex_pattern):
         )
+def _extract_floats_from_spectrum_ids(psm_list, pattern, value_name):
+    """
+    Extract one float per PSM from its ``spectrum_id`` using a regex pattern.
+    Parameters
+    ----------
+    psm_list
+        Iterable of PSMs; each item must expose a ``spectrum_id`` attribute.
+    pattern : str
+        Regular expression whose first capturing group holds the numeric value.
+    value_name : str
+        Human-readable name of the parsed value; only used in the error message.
+    Returns
+    -------
+    list of float
+        Parsed values, in the iteration order of ``psm_list``.
+    Raises
+    ------
+    MS2RescoreConfigurationError
+        If the pattern does not match a ``spectrum_id``, has no capturing group, or
+        captures text that cannot be converted to ``float``.
+    """
+    compiled_pattern = re.compile(pattern)
+    try:
+        return [float(compiled_pattern.search(psm.spectrum_id).group(1)) for psm in psm_list]
+    # AttributeError: no match (``search`` returned None); IndexError: the pattern has no
+    # capturing group; ValueError: the captured text is not a valid float.
+    except (AttributeError, IndexError, ValueError) as e:
+        raise MS2RescoreConfigurationError(
+            f"Could not parse {value_name} from spectrum_id with the "
+            f"{pattern} regex pattern. "
+            f"Please make sure the {value_name} key is present in the spectrum_id "
+            "and the value is in a capturing group or disable the relevant feature generator."
+        ) from e
+def _parse_values_spectrum_id(config, psm_list):
+    """
+    Parse retention time and/or ion mobility values from the spectrum_id.
+    For each of the ``psm_id_rt_pattern`` and ``psm_id_im_pattern`` configuration entries
+    that is set, the first capturing group of the pattern is read from every PSM's
+    ``spectrum_id``, converted to ``float`` and assigned to ``psm_list["retention_time"]``
+    or ``psm_list["ion_mobility"]``, respectively.
+    Raises
+    ------
+    MS2RescoreConfigurationError
+        If a configured pattern cannot be used to parse a float from every ``spectrum_id``.
+    """
+    targets = (
+        ("psm_id_rt_pattern", "retention_time", "retention time"),
+        ("psm_id_im_pattern", "ion_mobility", "ion mobility"),
+    )
+    for config_key, column, value_name in targets:
+        pattern = config[config_key]
+        if not pattern:
+            continue
+        logger.debug("Parsing %s from spectrum_id with regex pattern %s", value_name, pattern)
+        psm_list[column] = _extract_floats_from_spectrum_ids(psm_list, pattern, value_name)
 def _has_invalid_aminoacids(psm):
     """Check if a PSM contains invalid amino acids."""

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/parse_spectra.py RENAMED Viewed

@@ -6,7 +6,6 @@ from itertools import chain
 from ms2rescore_rs import get_precursor_info
 from psm_utils import PSMList
-from rich.progress import track
 from ms2rescore.exceptions import MS2RescoreError
 from ms2rescore.utils import infer_spectrum_path

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/generate.py RENAMED Viewed

@@ -145,9 +145,11 @@ def _collect_files(output_path_prefix, use_txt_log=False):
         "configuration": Path(output_path_prefix + ".full-config.json").resolve(),
         "feature names": Path(output_path_prefix + ".feature_names.tsv").resolve(),
         "feature weights": Path(output_path_prefix + ".mokapot.weights.tsv").resolve(),
-        "log": Path(output_path_prefix + ".log.txt").resolve()
-        if use_txt_log
-        else Path(output_path_prefix + ".log.html").resolve(),
+        "log": (
+            Path(output_path_prefix + ".log.txt").resolve()
+            if use_txt_log
+            else Path(output_path_prefix + ".log.html").resolve()
+        ),
     }
     for file, path in files.items():
         if Path(path).is_file():
@@ -321,16 +323,12 @@ def _get_features_context(
         import deeplc.plot
         scatter_chart = deeplc.plot.scatter(
-            df=features[
-                (psm_list["is_decoy"] == False) & (psm_list["qvalue"] <= 0.01)
-            ],  # noqa: E712
+            df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)],
             predicted_column="predicted_retention_time_best",
             observed_column="observed_retention_time_best",
         )
         baseline_chart = deeplc.plot.distribution_baseline(
-            df=features[
-                (psm_list["is_decoy"] == False) & (psm_list["qvalue"] <= 0.01)
-            ],  # noqa: E712
+            df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)],
             predicted_column="predicted_retention_time_best",
             observed_column="observed_retention_time_best",
         )
@@ -343,6 +341,26 @@ def _get_features_context(
             }
         )
+    # IM2Deep specific charts
+    if "im2deep" in feature_names:
+        import deeplc.plot
+        scatter_chart = deeplc.plot.scatter(
+            df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)],
+            predicted_column="ccs_predicted_im2deep",
+            observed_column="ccs_observed_im2deep",
+            xaxis_label="Observed CCS",
+            yaxis_label="Predicted CCS",
+            plot_title="Predicted vs. observed CCS",
+        )
+        context["charts"].append(
+            {
+                "title": TEXTS["charts"]["im2deep_performance"]["title"],
+                "description": TEXTS["charts"]["im2deep_performance"]["description"],
+                "chart": scatter_chart.to_html(**PLOTLY_HTML_KWARGS),
+            }
+        )
     return context

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/templates/texts.toml RENAMED Viewed

@@ -105,3 +105,9 @@ bottom chart shows the distribution of RMAE values of DeepLC predictions on 460
 datasets. The red line indicates the RMAE value for all target PSMs that passed the 1% FDR threshold
 of the current dataset. A lower RMAE value indicates better performance.
 """
+[charts.im2deep_performance]
+title = "IM2Deep model performance"
+description = """
+IM2Deep model performance can be visualized by plotting the predicted CCS against the observed CCS.
+"""

ms2rescore-3.1.0.dev1/ms2rescore/utils.py ADDED Viewed

@@ -0,0 +1,95 @@
+import logging
+import os
+import re
+from glob import glob
+from pathlib import Path
+from typing import Optional, Union
+from ms2rescore.exceptions import MS2RescoreConfigurationError
+logger = logging.getLogger(__name__)
+def infer_spectrum_path(
+    configured_path: Union[str, Path, None],
+    run_name: Optional[str] = None,
+) -> Union[str, Path]:
+    """
+    Infer spectrum path from passed path and expected filename (e.g. from PSM file).
+    Parameters
+    ----------
+    configured_path: str, Path, None
+        User-defined path to spectrum file or directory containing spectrum file
+    run_name : str, optional
+        MS run name (stem of spectrum filename), e.g., as expected from PSM file.
+    Returns
+    -------
+    Path
+        Resolved path to the spectrum file, Bruker ``.d`` directory, or miniTDF folder.
+    Raises
+    ------
+    MS2RescoreConfigurationError
+        If the path cannot be resolved from the passed arguments, or if the resolved path
+        has no supported extension (mzML, MGF, or .d) and no matching file is found.
+    """
+    # Normalize to ``str``: the checks below rely on ``str.endswith``, string concatenation
+    # and regex matching, none of which accept ``Path`` objects.
+    if configured_path:
+        configured_path = os.fspath(configured_path)
+    # If no spectrum path configured, use expected run_name in default dir
+    if not configured_path:
+        if run_name:
+            resolved_path = os.path.join(".", run_name)
+        else:
+            raise MS2RescoreConfigurationError(
+                "Could not resolve spectrum file name: No spectrum path configured "
+                "and no run name in PSM file found."
+            )
+    else:
+        is_bruker_dir = configured_path.endswith(".d") or _is_minitdf(configured_path)
+        # If passed path is directory (that is not Bruker raw), join with run name
+        if os.path.isdir(configured_path) and not is_bruker_dir:
+            if run_name:
+                resolved_path = os.path.join(configured_path, run_name)
+            else:
+                raise MS2RescoreConfigurationError(
+                    "Could not resolve spectrum file name: Spectrum path is directory "
+                    "but no run name in PSM file found."
+                )
+        # If passed path is file, use that, but warn if basename doesn't match expected
+        elif os.path.isfile(configured_path) or (os.path.isdir(configured_path) and is_bruker_dir):
+            if run_name and Path(configured_path).stem != Path(run_name).stem:
+                logger.warning(
+                    "Passed spectrum path (`%s`) does not match run name found in PSM "
+                    "file (`%s`). Continuing with passed spectrum path.",
+                    configured_path,
+                    run_name,
+                )
+            resolved_path = configured_path
+        else:
+            raise MS2RescoreConfigurationError(
+                "Configured `spectrum_path` must be `None` or a path to an existing file "
+                "or directory. If `None` or path to directory, spectrum run information "
+                "should be present in the PSM file."
+            )
+    # Match with file extension if not in resolved_path yet. ``search`` is required here:
+    # ``match`` only matches at the start of the string, so it can never see a suffix.
+    # The pattern is anchored with ``$`` so that names merely containing ".d" (e.g.
+    # "run.data.csv") are not mistaken for Bruker ``.d`` directories.
+    extension_pattern = re.compile(r"\.(?:mgf|mzml|d)$", flags=re.IGNORECASE)
+    if not _is_minitdf(resolved_path) and not extension_pattern.search(resolved_path):
+        for filename in glob(resolved_path + "*"):
+            if extension_pattern.search(filename):
+                resolved_path = filename
+                break
+        else:
+            raise MS2RescoreConfigurationError(
+                f"Resolved spectrum filename ('{resolved_path}') does not contain a supported "
+                "file extension (mzML, MGF, or .d) and could not find any matching existing "
+                "files."
+            )
+    return Path(resolved_path)
+def _is_minitdf(spectrum_file: str) -> bool:
+    """
+    Tell whether ``spectrum_file`` looks like a Bruker miniTDF folder.
+    Such a folder has no fixed name; it is recognized by its content, i.e. files matching
+    the patterns ``*ms2spectrum.bin`` and ``*ms2spectrum.parquet``. The check passes as
+    soon as at least two distinct entries match either pattern.
+    """
+    folder = Path(spectrum_file)
+    matches = {
+        entry
+        for pattern in ("*ms2spectrum.bin", "*ms2spectrum.parquet")
+        for entry in folder.glob(pattern)
+    }
+    return len(matches) >= 2

{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/pyproject.toml RENAMED Viewed

@@ -32,25 +32,26 @@ classifiers = [
 dynamic = ["version"]
 requires-python = ">=3.8"
 dependencies = [
-    "ms2rescore_rs",
-    "numpy>=1.16.0; python_version != '3.11'",
-    "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
-    "pandas>=1.0",
-    "rich>=12",
-    "pyteomics>=4.1.0",
-    "lxml>=4.5",
-    "ms2pip>=4.0.0-dev4",
-    "click>=7",
     "cascade-config>=0.4.0",
+    "click>=7",
+    "customtkinter>=5,<6",
     "deeplc>=2.2",
     "deeplcretrainer>=0.2",
-    "tomli>=2; python_version < '3.11'",
-    "psm_utils>=0.4",
-    "customtkinter>=5,<6",
-    "mokapot>=0.9",
-    "pydantic>=1.8.2,<2",                      # Fix compatibility with v2 in psm_utils
+    "im2deep>=0.1.3",
     "jinja2>=3",
+    "lxml>=4.5",
+    "mokapot>=0.9",
+    "ms2pip>=4.0.0-dev10",
+    "ms2rescore_rs",
+    "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
+    "numpy>=1.16.0; python_version != '3.11'",
+    "pandas>=1.0",
     "plotly>=5",
+    "psm_utils>=0.8",
+    "pydantic>=1.8.2,<2",                      # Fix compatibility with v2 in psm_utils
+    "pyteomics>=4.1.0, <4.7",
+    "rich>=12",
+    "tomli>=2; python_version < '3.11'",
 ]
 [project.optional-dependencies]
@@ -79,6 +80,7 @@ CompOmics = "https://www.compomics.com"
 ms2rescore = "ms2rescore.__main__:main"
 ms2rescore-gui = "ms2rescore.gui.__main__:main"
 ms2rescore-report = "ms2rescore.report.__main__:main"
+tims2rescore = "ms2rescore.__main__:main_tims"
 [build-system]
 requires = ["flit_core >=3.2,<4"]
@@ -94,3 +96,6 @@ target-version = ['py38']
 [tool.ruff]
 line-length = 99
 target-version = 'py38'
+[tool.ruff.lint]
+extend-select = ["T201", "T203"]

ms2rescore-3.0.3/ms2rescore/utils.py DELETED Viewed

@@ -1,78 +0,0 @@
-import logging
-import os
-import re
-from glob import glob
-from pathlib import Path
-from typing import Optional, Union
-from ms2rescore.exceptions import MS2RescoreConfigurationError
-logger = logging.getLogger(__name__)
-def infer_spectrum_path(
-    configured_path: Union[str, Path, None],
-    run_name: Optional[str] = None,
-) -> Union[str, Path]:
-    """
-    Infer spectrum path from passed path and expected filename (e.g. from PSM file).
-    Parameters
-    ----------
-    configured_path: str, Path, None
-        User-defined path to spectrum file or directory containing spectrum file
-    run_name : str, optional
-        MS run name (stem of spectrum filename), e.g., as expected from PSM file.
-    """
-    # If no spectrum path configured, use expected run_name in default dir
-    if not configured_path:
-        if run_name:
-            resolved_path = os.path.join(".", run_name)
-        else:
-            raise MS2RescoreConfigurationError(
-                "Could not resolve spectrum file name: No spectrum path configured "
-                "and no run name in PSM file found."
-            )
-    # If passed path is directory, join with run name
-    elif os.path.isdir(configured_path):
-        if run_name:
-            resolved_path = os.path.join(configured_path, run_name)
-        else:
-            raise MS2RescoreConfigurationError(
-                "Could not resolve spectrum file name: Spectrum path is directory "
-                "but no run name in PSM file found."
-            )
-    # If passed path is file, use that, but warn if basename doesn't match expected
-    elif os.path.isfile(configured_path):
-        if run_name and Path(configured_path).stem != Path(run_name).stem:
-            logger.warning(
-                "Passed spectrum path (`%s`) does not match run name found in PSM "
-                "file (`%s`). Continuing with passed spectrum path.",
-                configured_path,
-                run_name,
-            )
-        resolved_path = configured_path
-    else:
-        raise MS2RescoreConfigurationError(
-            "Configured `spectrum_path` must be `None` or a path to an existing file "
-            "or directory. If `None` or path to directory, spectrum run information "
-            "should be present in the PSM file."
-        )
-    # Match with file extension if not in resolved_path yet
-    if not re.match(".mgf$|.mzml$", resolved_path, flags=re.IGNORECASE):
-        for filename in glob(resolved_path + "*"):
-            if re.match(r".*(\.mgf$|\.mzml$)", filename, flags=re.IGNORECASE):
-                resolved_path = filename
-                break
-        else:
-            raise MS2RescoreConfigurationError(
-                "Resolved spectrum filename does not contain a supported file "
-                "extension (mgf or mzml) and could not find any matching existing "
-                "files."
-            )
-    return Path(resolved_path)