ms2rescore 3.0.3__tar.gz → 3.1.0.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/PKG-INFO +16 -15
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/__init__.py +1 -1
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/__main__.py +32 -11
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/core.py +47 -7
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/exceptions.py +6 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/feature_generators/__init__.py +2 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/feature_generators/deeplc.py +21 -34
- ms2rescore-3.1.0.dev1/ms2rescore/feature_generators/im2deep.py +169 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/feature_generators/ionmob.py +3 -3
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/feature_generators/ms2pip.py +1 -1
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/gui/app.py +27 -1
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/package_data/config_default.json +2 -0
- ms2rescore-3.1.0.dev1/ms2rescore/package_data/config_default_tims.json +25 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/package_data/config_schema.json +28 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/parse_psms.py +45 -1
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/parse_spectra.py +0 -1
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/generate.py +27 -9
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/templates/texts.toml +6 -0
- ms2rescore-3.1.0.dev1/ms2rescore/utils.py +95 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/pyproject.toml +19 -14
- ms2rescore-3.0.3/ms2rescore/utils.py +0 -78
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/LICENSE +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/README.md +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/config_parser.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/feature_generators/base.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/feature_generators/basic.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/feature_generators/maxquant.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/gui/__init__.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/gui/__main__.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/gui/function2ctk.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/gui/widgets.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/package_data/__init__.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/package_data/img/__init__.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/package_data/img/config_icon.png +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/package_data/img/github-mark-white.png +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/package_data/img/github-mark.png +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/package_data/img/ms2rescore_logo.png +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/package_data/img/program_icon.ico +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/package_data/ms2rescore-gui-theme.json +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/__init__.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/__main__.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/charts.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/templates/__init__.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/templates/about.html +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/templates/base.html +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/templates/config.html +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/templates/features.html +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/templates/log.html +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/templates/metadata.html +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/templates/overview.html +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/templates/stats-card.html +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/templates/style.html +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/templates/target-decoy.html +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/report/utils.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/rescoring_engines/__init__.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/rescoring_engines/mokapot.py +0 -0
- {ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/rescoring_engines/percolator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ms2rescore
|
|
3
|
-
Version: 3.0.3
|
|
3
|
+
Version: 3.1.0.dev1
|
|
4
4
|
Summary: MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and retention times.
|
|
5
5
|
Keywords: MS2Rescore,MS2PIP,DeepLC,Percolator,proteomics,mass spectrometry,peptide identification,rescoring,machine learning
|
|
6
6
|
Author: Ana Sílvia C. Silva, Robbin Bouwmeester, Louise Buur
|
|
@@ -13,25 +13,26 @@ Classifier: Operating System :: OS Independent
|
|
|
13
13
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
14
14
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
15
15
|
Classifier: Development Status :: 5 - Production/Stable
|
|
16
|
-
Requires-Dist: ms2rescore_rs
|
|
17
|
-
Requires-Dist: numpy>=1.16.0; python_version != '3.11'
|
|
18
|
-
Requires-Dist: numpy==1.24.3; python_version == '3.11'
|
|
19
|
-
Requires-Dist: pandas>=1.0
|
|
20
|
-
Requires-Dist: rich>=12
|
|
21
|
-
Requires-Dist: pyteomics>=4.1.0
|
|
22
|
-
Requires-Dist: lxml>=4.5
|
|
23
|
-
Requires-Dist: ms2pip>=4.0.0-dev4
|
|
24
|
-
Requires-Dist: click>=7
|
|
25
16
|
Requires-Dist: cascade-config>=0.4.0
|
|
17
|
+
Requires-Dist: click>=7
|
|
18
|
+
Requires-Dist: customtkinter>=5,<6
|
|
26
19
|
Requires-Dist: deeplc>=2.2
|
|
27
20
|
Requires-Dist: deeplcretrainer>=0.2
|
|
28
|
-
Requires-Dist:
|
|
29
|
-
Requires-Dist: psm_utils>=0.4
|
|
30
|
-
Requires-Dist: customtkinter>=5,<6
|
|
31
|
-
Requires-Dist: mokapot>=0.9
|
|
32
|
-
Requires-Dist: pydantic>=1.8.2,<2
|
|
21
|
+
Requires-Dist: im2deep>=0.1.3
|
|
33
22
|
Requires-Dist: jinja2>=3
|
|
23
|
+
Requires-Dist: lxml>=4.5
|
|
24
|
+
Requires-Dist: mokapot>=0.9
|
|
25
|
+
Requires-Dist: ms2pip>=4.0.0-dev10
|
|
26
|
+
Requires-Dist: ms2rescore_rs
|
|
27
|
+
Requires-Dist: numpy==1.24.3; python_version == '3.11'
|
|
28
|
+
Requires-Dist: numpy>=1.16.0; python_version != '3.11'
|
|
29
|
+
Requires-Dist: pandas>=1.0
|
|
34
30
|
Requires-Dist: plotly>=5
|
|
31
|
+
Requires-Dist: psm_utils>=0.8
|
|
32
|
+
Requires-Dist: pydantic>=1.8.2,<2
|
|
33
|
+
Requires-Dist: pyteomics>=4.1.0, <4.7
|
|
34
|
+
Requires-Dist: rich>=12
|
|
35
|
+
Requires-Dist: tomli>=2; python_version < '3.11'
|
|
35
36
|
Requires-Dist: ruff ; extra == "dev"
|
|
36
37
|
Requires-Dist: black ; extra == "dev"
|
|
37
38
|
Requires-Dist: pytest ; extra == "dev"
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
"""MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and RTs."""
|
|
2
2
|
|
|
3
3
|
import argparse
|
|
4
|
+
import importlib.resources
|
|
5
|
+
import json
|
|
4
6
|
import logging
|
|
5
7
|
import sys
|
|
6
8
|
from pathlib import Path
|
|
@@ -10,7 +12,7 @@ from rich.console import Console
|
|
|
10
12
|
from rich.logging import RichHandler
|
|
11
13
|
from rich.text import Text
|
|
12
14
|
|
|
13
|
-
from ms2rescore import __version__
|
|
15
|
+
from ms2rescore import __version__, package_data
|
|
14
16
|
from ms2rescore.config_parser import parse_configurations
|
|
15
17
|
from ms2rescore.core import rescore
|
|
16
18
|
from ms2rescore.exceptions import MS2RescoreConfigurationError
|
|
@@ -33,19 +35,26 @@ LOGGER = logging.getLogger(__name__)
|
|
|
33
35
|
CONSOLE = Console(record=True)
|
|
34
36
|
|
|
35
37
|
|
|
36
|
-
def _print_credits():
|
|
38
|
+
def _print_credits(tims=False):
|
|
37
39
|
"""Print software credits to terminal."""
|
|
38
40
|
text = Text()
|
|
39
41
|
text.append("\n")
|
|
40
|
-
|
|
42
|
+
if tims:
|
|
43
|
+
text.append("TIMS²Rescore", style="bold link https://github.com/compomics/ms2rescore")
|
|
44
|
+
else:
|
|
45
|
+
text.append("MS²Rescore", style="bold link https://github.com/compomics/ms2rescore")
|
|
41
46
|
text.append(f" (v{__version__})\n", style="bold")
|
|
47
|
+
if tims:
|
|
48
|
+
text.append("MS²Rescore tuned for Bruker timsTOF instruments.\n", style="italic")
|
|
42
49
|
text.append("Developed at CompOmics, VIB / Ghent University, Belgium.\n")
|
|
43
50
|
text.append("Please cite: ")
|
|
44
51
|
text.append(
|
|
45
|
-
"Declercq et al.
|
|
52
|
+
"Buur & Declercq et al. JPR (2024)",
|
|
53
|
+
style="link https://doi.org/10.1021/acs.jproteome.3c00785",
|
|
46
54
|
)
|
|
47
55
|
text.append("\n")
|
|
48
|
-
|
|
56
|
+
if tims:
|
|
57
|
+
text.stylize("#006cb5")
|
|
49
58
|
CONSOLE.print(text)
|
|
50
59
|
|
|
51
60
|
|
|
@@ -152,18 +161,30 @@ def _setup_logging(passed_level: str, log_file: Union[str, Path]):
|
|
|
152
161
|
)
|
|
153
162
|
|
|
154
163
|
|
|
155
|
-
def main():
|
|
164
|
+
def main_tims():
|
|
165
|
+
"""Run MS²Rescore command-line interface in TIMS²Rescore mode."""
|
|
166
|
+
main(tims=True)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def main(tims=False):
|
|
156
170
|
"""Run MS²Rescore command-line interface."""
|
|
157
|
-
_print_credits()
|
|
171
|
+
_print_credits(tims)
|
|
158
172
|
|
|
159
173
|
# Parse CLI arguments and configuration file
|
|
160
174
|
parser = _argument_parser()
|
|
161
175
|
cli_args = parser.parse_args()
|
|
176
|
+
|
|
177
|
+
configurations = []
|
|
178
|
+
if cli_args.config_file:
|
|
179
|
+
configurations.append(cli_args.config_file)
|
|
180
|
+
if tims:
|
|
181
|
+
configurations.append(
|
|
182
|
+
json.load(importlib.resources.open_text(package_data, "config_default_tims.json"))
|
|
183
|
+
)
|
|
184
|
+
configurations.append(cli_args)
|
|
185
|
+
|
|
162
186
|
try:
|
|
163
|
-
|
|
164
|
-
config = parse_configurations([cli_args.config_file, cli_args])
|
|
165
|
-
else:
|
|
166
|
-
config = parse_configurations(cli_args)
|
|
187
|
+
config = parse_configurations(configurations)
|
|
167
188
|
except MS2RescoreConfigurationError as e:
|
|
168
189
|
LOGGER.critical(e)
|
|
169
190
|
sys.exit(1)
|
|
@@ -3,15 +3,16 @@ import logging
|
|
|
3
3
|
from multiprocessing import cpu_count
|
|
4
4
|
from typing import Dict, Optional
|
|
5
5
|
|
|
6
|
+
import numpy as np
|
|
6
7
|
import psm_utils.io
|
|
7
8
|
from psm_utils import PSMList
|
|
8
9
|
|
|
10
|
+
from ms2rescore import exceptions
|
|
9
11
|
from ms2rescore.feature_generators import FEATURE_GENERATORS
|
|
10
12
|
from ms2rescore.parse_psms import parse_psms
|
|
11
13
|
from ms2rescore.parse_spectra import get_missing_values
|
|
12
14
|
from ms2rescore.report import generate
|
|
13
15
|
from ms2rescore.rescoring_engines import mokapot, percolator
|
|
14
|
-
from ms2rescore import exceptions
|
|
15
16
|
|
|
16
17
|
logger = logging.getLogger(__name__)
|
|
17
18
|
|
|
@@ -58,12 +59,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
|
|
|
58
59
|
f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
|
|
59
60
|
)
|
|
60
61
|
|
|
61
|
-
#
|
|
62
|
-
|
|
63
|
-
im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
|
|
64
|
-
if rt_required or im_required:
|
|
65
|
-
logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
|
|
66
|
-
get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)
|
|
62
|
+
# Add missing precursor info from spectrum file if needed
|
|
63
|
+
_fill_missing_precursor_info(psm_list, config)
|
|
67
64
|
|
|
68
65
|
# Add rescoring features
|
|
69
66
|
for fgen_name, fgen_config in config["feature_generators"].items():
|
|
@@ -160,6 +157,49 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
|
|
|
160
157
|
logger.exception(e)
|
|
161
158
|
|
|
162
159
|
|
|
160
|
+
def _fill_missing_precursor_info(psm_list, config):
|
|
161
|
+
"""Fill missing precursor info from spectrum file if needed."""
|
|
162
|
+
# Check if required
|
|
163
|
+
# TODO: avoid hard coding feature generators in some way
|
|
164
|
+
rt_required = ("deeplc" in config["feature_generators"]) and any(
|
|
165
|
+
v is None or v == 0 or np.isnan(v) for v in psm_list["retention_time"]
|
|
166
|
+
)
|
|
167
|
+
im_required = (
|
|
168
|
+
"ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"]
|
|
169
|
+
) and any(v is None or v == 0 or np.isnan(v) for v in psm_list["ion_mobility"])
|
|
170
|
+
logger.debug(f"RT required: {rt_required}, IM required: {im_required}")
|
|
171
|
+
|
|
172
|
+
# Add missing values
|
|
173
|
+
if rt_required or im_required:
|
|
174
|
+
logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
|
|
175
|
+
get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)
|
|
176
|
+
|
|
177
|
+
# Check if values are now present
|
|
178
|
+
for value_name in ["retention_time", "ion_mobility"]:
|
|
179
|
+
if (
|
|
180
|
+
0.0 in psm_list[value_name]
|
|
181
|
+
or None in psm_list[value_name]
|
|
182
|
+
or np.isnan(psm_list[value_name]).any()
|
|
183
|
+
):
|
|
184
|
+
if all(v is None or v == 0.0 or np.isnan(v) for v in psm_list[value_name]):
|
|
185
|
+
raise exceptions.MissingValuesError(
|
|
186
|
+
f"Could not find any '{value_name}' values in PSM or spectrum files. Disable "
|
|
187
|
+
f"feature generators that require '{value_name}' or ensure that the values are "
|
|
188
|
+
"present in the input files."
|
|
189
|
+
)
|
|
190
|
+
else:
|
|
191
|
+
missing_value_psms = psm_list[
|
|
192
|
+
[v is None or np.isnan(v) for v in psm_list[value_name]]
|
|
193
|
+
]
|
|
194
|
+
logger.warning(
|
|
195
|
+
f"Found {len(missing_value_psms)} PSMs with missing '{value_name}' values. "
|
|
196
|
+
"These PSMs will be removed."
|
|
197
|
+
)
|
|
198
|
+
psm_list = psm_list[
|
|
199
|
+
[v is not None and not np.isnan(v) for v in psm_list[value_name]]
|
|
200
|
+
]
|
|
201
|
+
|
|
202
|
+
|
|
163
203
|
def _write_feature_names(feature_names, output_file_root):
|
|
164
204
|
"""Write feature names to file."""
|
|
165
205
|
with open(output_file_root + ".feature_names.tsv", "w") as f:
|
|
@@ -25,6 +25,12 @@ class ModificationParsingError(IDFileParsingError):
|
|
|
25
25
|
pass
|
|
26
26
|
|
|
27
27
|
|
|
28
|
+
class MissingValuesError(MS2RescoreError):
|
|
29
|
+
"""Missing values in PSMs and/or spectra."""
|
|
30
|
+
|
|
31
|
+
pass
|
|
32
|
+
|
|
33
|
+
|
|
28
34
|
class ReportGenerationError(MS2RescoreError):
|
|
29
35
|
"""Error while generating report."""
|
|
30
36
|
|
|
@@ -7,6 +7,7 @@ from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator
|
|
|
7
7
|
from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator
|
|
8
8
|
from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator
|
|
9
9
|
from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator
|
|
10
|
+
from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator
|
|
10
11
|
|
|
11
12
|
FEATURE_GENERATORS = {
|
|
12
13
|
"basic": BasicFeatureGenerator,
|
|
@@ -14,4 +15,5 @@ FEATURE_GENERATORS = {
|
|
|
14
15
|
"deeplc": DeepLCFeatureGenerator,
|
|
15
16
|
"maxquant": MaxQuantFeatureGenerator,
|
|
16
17
|
"ionmob": IonMobFeatureGenerator,
|
|
18
|
+
"im2deep": IM2DeepFeatureGenerator,
|
|
17
19
|
}
|
|
@@ -21,12 +21,10 @@ import os
|
|
|
21
21
|
from collections import defaultdict
|
|
22
22
|
from inspect import getfullargspec
|
|
23
23
|
from itertools import chain
|
|
24
|
-
from typing import List, Optional, Union
|
|
24
|
+
from typing import List, Union
|
|
25
25
|
|
|
26
26
|
import numpy as np
|
|
27
|
-
import pandas as pd
|
|
28
27
|
from psm_utils import PSMList
|
|
29
|
-
from psm_utils.io import peptide_record
|
|
30
28
|
|
|
31
29
|
from ms2rescore.feature_generators.base import FeatureGeneratorBase
|
|
32
30
|
|
|
@@ -41,8 +39,7 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase):
|
|
|
41
39
|
self,
|
|
42
40
|
*args,
|
|
43
41
|
lower_score_is_better: bool = False,
|
|
44
|
-
calibration_set_size: Union[int, float] = 0.15,
|
|
45
|
-
spectrum_path: Optional[str] = None,
|
|
42
|
+
calibration_set_size: Union[int, float, None] = None,
|
|
46
43
|
processes: int = 1,
|
|
47
44
|
**kwargs,
|
|
48
45
|
) -> None:
|
|
@@ -59,9 +56,6 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase):
|
|
|
59
56
|
calibration_set_size: int or float
|
|
60
57
|
Amount of best PSMs to use for DeepLC calibration. If this value is lower
|
|
61
58
|
than the number of available PSMs, all PSMs will be used. (default: 0.15)
|
|
62
|
-
spectrum_path
|
|
63
|
-
Path to spectrum file or directory with spectrum files. If None, inferred from `run`
|
|
64
|
-
field in PSMs. Defaults to None.
|
|
65
59
|
processes: {int, None}
|
|
66
60
|
Number of processes to use in DeepLC. Defaults to 1.
|
|
67
61
|
kwargs: dict
|
|
@@ -77,7 +71,6 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase):
|
|
|
77
71
|
|
|
78
72
|
self.lower_psm_score_better = lower_score_is_better
|
|
79
73
|
self.calibration_set_size = calibration_set_size
|
|
80
|
-
self.spectrum_path = spectrum_path
|
|
81
74
|
self.processes = processes
|
|
82
75
|
self.deeplc_kwargs = kwargs or {}
|
|
83
76
|
|
|
@@ -151,17 +144,15 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase):
|
|
|
151
144
|
# Make new PSM list for this run (chain PSMs per spectrum to flat list)
|
|
152
145
|
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
|
|
153
146
|
|
|
154
|
-
logger.debug("Calibrating DeepLC...")
|
|
155
147
|
psm_list_calibration = self._get_calibration_psms(psm_list_run)
|
|
148
|
+
logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...")
|
|
156
149
|
self.deeplc_predictor = self.DeepLC(
|
|
157
150
|
n_jobs=self.processes,
|
|
158
151
|
verbose=self._verbose,
|
|
159
152
|
path_model=self.selected_model or self.user_model,
|
|
160
153
|
**self.deeplc_kwargs,
|
|
161
154
|
)
|
|
162
|
-
self.deeplc_predictor.calibrate_preds(
|
|
163
|
-
seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
|
|
164
|
-
)
|
|
155
|
+
self.deeplc_predictor.calibrate_preds(psm_list_calibration)
|
|
165
156
|
# Still calibrate for each run, but do not try out all model options.
|
|
166
157
|
# Just use model that was selected based on first run
|
|
167
158
|
if not self.selected_model:
|
|
@@ -174,11 +165,7 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase):
|
|
|
174
165
|
)
|
|
175
166
|
|
|
176
167
|
logger.debug("Predicting retention times...")
|
|
177
|
-
predictions = np.array(
|
|
178
|
-
self.deeplc_predictor.make_preds(
|
|
179
|
-
seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
|
|
180
|
-
)
|
|
181
|
-
)
|
|
168
|
+
predictions = np.array(self.deeplc_predictor.make_preds(psm_list_run))
|
|
182
169
|
observations = psm_list_run["retention_time"]
|
|
183
170
|
rt_diffs_run = np.abs(predictions - observations)
|
|
184
171
|
|
|
@@ -204,25 +191,25 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase):
|
|
|
204
191
|
)
|
|
205
192
|
current_run += 1
|
|
206
193
|
|
|
207
|
-
# TODO: Remove when DeepLC supports PSMList directly
|
|
208
|
-
@staticmethod
|
|
209
|
-
def _psm_list_to_deeplc_peprec(psm_list: PSMList) -> pd.DataFrame:
|
|
210
|
-
peprec = peptide_record.to_dataframe(psm_list)
|
|
211
|
-
peprec = peprec.rename(
|
|
212
|
-
columns={
|
|
213
|
-
"observed_retention_time": "tr",
|
|
214
|
-
"peptide": "seq",
|
|
215
|
-
}
|
|
216
|
-
)[["tr", "seq", "modifications"]]
|
|
217
|
-
return peprec
|
|
218
|
-
|
|
219
194
|
def _get_calibration_psms(self, psm_list: PSMList):
|
|
220
195
|
"""Get N best scoring target PSMs for calibration."""
|
|
221
196
|
psm_list_targets = psm_list[~psm_list["is_decoy"]]
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
197
|
+
if self.calibration_set_size:
|
|
198
|
+
n_psms = self._get_number_of_calibration_psms(psm_list_targets)
|
|
199
|
+
indices = np.argsort(psm_list_targets["score"])
|
|
200
|
+
indices = indices[:n_psms] if self.lower_psm_score_better else indices[-n_psms:]
|
|
201
|
+
return psm_list_targets[indices]
|
|
202
|
+
else:
|
|
203
|
+
identified_psms = psm_list_targets[psm_list_targets["qvalue"] <= 0.01]
|
|
204
|
+
if len(identified_psms) == 0:
|
|
205
|
+
raise ValueError(
|
|
206
|
+
"No target PSMs with q-value <= 0.01 found. Please set calibration set size for calibrating deeplc."
|
|
207
|
+
)
|
|
208
|
+
elif (len(identified_psms) < 500) & (self.deeplc_kwargs["deeplc_retrain"]):
|
|
209
|
+
logger.warning(
|
|
210
|
+
" Less than 500 target PSMs with q-value <= 0.01 found for retraining. Consider turning of deeplc_retrain, as this is likely not enough data for retraining."
|
|
211
|
+
)
|
|
212
|
+
return identified_psms
|
|
226
213
|
|
|
227
214
|
def _get_number_of_calibration_psms(self, psm_list):
|
|
228
215
|
"""Get number of calibration PSMs given `calibration_set_size` and total number of PSMs."""
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""
|
|
2
|
+
IM2Deep ion mobility-based feature generator.
|
|
3
|
+
|
|
4
|
+
IM2Deep is a fully modification-aware peptide ion mobility predictor. It uses a deep convolutional
|
|
5
|
+
neural network to predict collision cross section (CCS) values based on the atomic composition of the (modified) amino
|
|
6
|
+
acid residues in the peptide. See
|
|
7
|
+
`github.com/compomics/IM2Deep <https://github.com/compomics/IM2Deep>`_ for more information.
|
|
8
|
+
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import contextlib
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
from inspect import getfullargspec
|
|
15
|
+
from itertools import chain
|
|
16
|
+
from typing import List
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
import pandas as pd
|
|
20
|
+
from im2deep.calibrate import im2ccs
|
|
21
|
+
from im2deep.im2deep import predict_ccs
|
|
22
|
+
from psm_utils import PSMList
|
|
23
|
+
|
|
24
|
+
from ms2rescore.feature_generators.base import FeatureGeneratorBase
|
|
25
|
+
|
|
26
|
+
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class IM2DeepFeatureGenerator(FeatureGeneratorBase):
|
|
31
|
+
"""IM2Deep collision cross section feature generator."""
|
|
32
|
+
|
|
33
|
+
def __init__(
|
|
34
|
+
self,
|
|
35
|
+
*args,
|
|
36
|
+
processes: int = 1,
|
|
37
|
+
**kwargs,
|
|
38
|
+
):
|
|
39
|
+
"""
|
|
40
|
+
Initialize the IM2DeepFeatureGenerator.
|
|
41
|
+
|
|
42
|
+
Parameters
|
|
43
|
+
----------
|
|
44
|
+
processes : int, optional
|
|
45
|
+
Number of parallel processes to use for IM2Deep predictions. Default is 1.
|
|
46
|
+
**kwargs : dict, optional
|
|
47
|
+
Additional keyword arguments to `im2deep.predict_ccs`.
|
|
48
|
+
|
|
49
|
+
"""
|
|
50
|
+
super().__init__(*args, **kwargs)
|
|
51
|
+
|
|
52
|
+
self._verbose = logger.getEffectiveLevel() <= logging.DEBUG
|
|
53
|
+
|
|
54
|
+
# Remove any kwargs that are not IM2Deep arguments
|
|
55
|
+
self.im2deep_kwargs = kwargs or {}
|
|
56
|
+
self.im2deep_kwargs = {
|
|
57
|
+
k: v for k, v in self.im2deep_kwargs.items() if k in getfullargspec(predict_ccs).args
|
|
58
|
+
}
|
|
59
|
+
self.im2deep_kwargs["n_jobs"] = processes
|
|
60
|
+
|
|
61
|
+
@property
|
|
62
|
+
def feature_names(self) -> List[str]:
|
|
63
|
+
return [
|
|
64
|
+
"ccs_observed_im2deep",
|
|
65
|
+
"ccs_predicted_im2deep",
|
|
66
|
+
"ccs_error_im2deep",
|
|
67
|
+
"abs_ccs_error_im2deep",
|
|
68
|
+
"perc_ccs_error_im2deep",
|
|
69
|
+
]
|
|
70
|
+
|
|
71
|
+
def add_features(self, psm_list: PSMList) -> None:
|
|
72
|
+
"""Add IM2Deep-derived features to PSMs"""
|
|
73
|
+
logger.info("Adding IM2Deep-derived features to PSMs")
|
|
74
|
+
|
|
75
|
+
# Get easy-access nested version of PSMlist
|
|
76
|
+
psm_dict = psm_list.get_psm_dict()
|
|
77
|
+
|
|
78
|
+
# Run IM2Deep for each spectrum file
|
|
79
|
+
current_run = 1
|
|
80
|
+
total_runs = sum(len(runs) for runs in psm_dict.values())
|
|
81
|
+
|
|
82
|
+
for runs in psm_dict.values():
|
|
83
|
+
# Reset IM2Deep predictor for each collection of runs
|
|
84
|
+
for run, psms in runs.items():
|
|
85
|
+
logger.info(
|
|
86
|
+
f"Running IM2Deep for PSMs from run ({current_run}/{total_runs}): `{run}`..."
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
# Disable wild logging to stdout by TensorFlow, unless in debug mode
|
|
90
|
+
with (
|
|
91
|
+
contextlib.redirect_stdout(open(os.devnull, "w"))
|
|
92
|
+
if not self._verbose
|
|
93
|
+
else contextlib.nullcontext()
|
|
94
|
+
):
|
|
95
|
+
# Make new PSM list for this run (chain PSMs per spectrum to flat list)
|
|
96
|
+
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
|
|
97
|
+
|
|
98
|
+
logger.debug("Calibrating IM2Deep...")
|
|
99
|
+
|
|
100
|
+
# Convert ion mobility to CCS and calibrate CCS values
|
|
101
|
+
psm_list_run_df = psm_list_run.to_dataframe()
|
|
102
|
+
psm_list_run_df["charge"] = [
|
|
103
|
+
pep.precursor_charge for pep in psm_list_run_df["peptidoform"]
|
|
104
|
+
]
|
|
105
|
+
psm_list_run_df["ccs_observed"] = im2ccs(
|
|
106
|
+
psm_list_run_df["ion_mobility"],
|
|
107
|
+
psm_list_run_df["precursor_mz"],
|
|
108
|
+
psm_list_run_df["charge"],
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
# Create dataframe with high confidence hits for calibration
|
|
112
|
+
cal_psm_df = self.make_calibration_df(psm_list_run_df)
|
|
113
|
+
|
|
114
|
+
# Make predictions with IM2Deep
|
|
115
|
+
logger.debug("Predicting CCS values...")
|
|
116
|
+
predictions = predict_ccs(
|
|
117
|
+
psm_list_run, cal_psm_df, write_output=False, **self.im2deep_kwargs
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
# Add features to PSMs
|
|
121
|
+
logger.debug("Adding features to PSMs...")
|
|
122
|
+
observations = psm_list_run_df["ccs_observed"]
|
|
123
|
+
ccs_diffs_run = np.abs(predictions - observations)
|
|
124
|
+
for i, psm in enumerate(psm_list_run):
|
|
125
|
+
psm["rescoring_features"].update(
|
|
126
|
+
{
|
|
127
|
+
"ccs_observed_im2deep": observations[i],
|
|
128
|
+
"ccs_predicted_im2deep": predictions[i],
|
|
129
|
+
"ccs_error_im2deep": ccs_diffs_run[i],
|
|
130
|
+
"abs_ccs_error_im2deep": np.abs(ccs_diffs_run[i]),
|
|
131
|
+
"perc_ccs_error_im2deep": np.abs(ccs_diffs_run[i])
|
|
132
|
+
/ observations[i]
|
|
133
|
+
* 100,
|
|
134
|
+
}
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
current_run += 1
|
|
138
|
+
|
|
139
|
+
@staticmethod
|
|
140
|
+
def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> pd.DataFrame:
|
|
141
|
+
"""
|
|
142
|
+
Make dataframe for calibration of IM2Deep predictions.
|
|
143
|
+
|
|
144
|
+
Parameters
|
|
145
|
+
----------
|
|
146
|
+
psm_list_df
|
|
147
|
+
DataFrame with PSMs.
|
|
148
|
+
threshold
|
|
149
|
+
Percentage of highest scoring identified target PSMs to use for calibration,
|
|
150
|
+
default 0.25.
|
|
151
|
+
|
|
152
|
+
Returns
|
|
153
|
+
-------
|
|
154
|
+
pd.DataFrame
|
|
155
|
+
DataFrame with high confidence hits for calibration.
|
|
156
|
+
|
|
157
|
+
"""
|
|
158
|
+
identified_psms = psm_list_df[
|
|
159
|
+
(psm_list_df["qvalue"] < 0.01)
|
|
160
|
+
& (~psm_list_df["is_decoy"])
|
|
161
|
+
& (psm_list_df["charge"] < 5) # predictions do not go higher for IM2Deep
|
|
162
|
+
]
|
|
163
|
+
calibration_psms = identified_psms[
|
|
164
|
+
identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold)
|
|
165
|
+
]
|
|
166
|
+
logger.debug(
|
|
167
|
+
f"Number of high confidence hits for calculating shift: {len(calibration_psms)}"
|
|
168
|
+
)
|
|
169
|
+
return calibration_psms
|
|
@@ -165,6 +165,7 @@ class IonMobFeatureGenerator(FeatureGeneratorBase):
|
|
|
165
165
|
)
|
|
166
166
|
]
|
|
167
167
|
|
|
168
|
+
# TODO: Use observed m/z?
|
|
168
169
|
psm_list_run_df["mz"] = psm_list_run_df.apply(
|
|
169
170
|
lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1
|
|
170
171
|
) # use precursor m/z from PSMs?
|
|
@@ -175,9 +176,8 @@ class IonMobFeatureGenerator(FeatureGeneratorBase):
|
|
|
175
176
|
)
|
|
176
177
|
# calibrate CCS values
|
|
177
178
|
shift_factor = self.calculate_ccs_shift(psm_list_run_df)
|
|
178
|
-
psm_list_run_df["ccs_observed"]
|
|
179
|
-
|
|
180
|
-
)
|
|
179
|
+
psm_list_run_df["ccs_observed"] + shift_factor
|
|
180
|
+
|
|
181
181
|
# predict CCS values
|
|
182
182
|
tf_ds = to_tf_dataset_inference(
|
|
183
183
|
psm_list_run_df["mz"],
|
|
@@ -193,7 +193,7 @@ class MS2PIPFeatureGenerator(FeatureGeneratorBase):
|
|
|
193
193
|
try:
|
|
194
194
|
ms2pip_results = correlate(
|
|
195
195
|
psms=psm_list_run,
|
|
196
|
-
spectrum_file=spectrum_filename,
|
|
196
|
+
spectrum_file=str(spectrum_filename),
|
|
197
197
|
spectrum_id_pattern=self.spectrum_id_pattern,
|
|
198
198
|
model=self.model,
|
|
199
199
|
ms2_tolerance=self.ms2_tolerance,
|
|
@@ -360,15 +360,20 @@ class FeatureGeneratorConfig(ctk.CTkFrame):
|
|
|
360
360
|
self.deeplc_config = DeepLCConfiguration(self)
|
|
361
361
|
self.deeplc_config.grid(row=2, column=0, pady=(0, 20), sticky="nsew")
|
|
362
362
|
|
|
363
|
+
self.im2deep_config = Im2DeepConfiguration(self)
|
|
364
|
+
self.im2deep_config.grid(row=3, column=0, pady=(0, 20), sticky="nsew")
|
|
365
|
+
|
|
363
366
|
self.ionmob_config = IonmobConfiguration(self)
|
|
364
|
-
self.ionmob_config.grid(row=3, column=0, pady=(0, 20), sticky="nsew")
|
|
367
|
+
self.ionmob_config.grid(row=4, column=0, pady=(0, 20), sticky="nsew")
|
|
365
368
|
|
|
366
369
|
def get(self) -> Dict:
|
|
367
370
|
"""Return the configuration as a dictionary."""
|
|
368
371
|
basic_enabled, basic_config = self.basic_config.get()
|
|
369
372
|
ms2pip_enabled, ms2pip_config = self.ms2pip_config.get()
|
|
370
373
|
deeplc_enabled, deeplc_config = self.deeplc_config.get()
|
|
374
|
+
im2deep_enabled, im2deep_config = self.im2deep_config.get()
|
|
371
375
|
ionmob_enabled, ionmob_config = self.ionmob_config.get()
|
|
376
|
+
|
|
372
377
|
config = {}
|
|
373
378
|
if basic_enabled:
|
|
374
379
|
config["basic"] = basic_config
|
|
@@ -523,6 +528,27 @@ class IonmobConfiguration(ctk.CTkFrame):
|
|
|
523
528
|
return enabled, config
|
|
524
529
|
|
|
525
530
|
|
|
531
|
+
class Im2DeepConfiguration(ctk.CTkFrame):
|
|
532
|
+
def __init__(self, *args, **kwargs):
|
|
533
|
+
"""IM2Deep configuration frame."""
|
|
534
|
+
super().__init__(*args, **kwargs)
|
|
535
|
+
|
|
536
|
+
self.configure(fg_color="transparent")
|
|
537
|
+
self.grid_columnconfigure(0, weight=1)
|
|
538
|
+
|
|
539
|
+
self.title = widgets.Heading(self, text="im2deep")
|
|
540
|
+
self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")
|
|
541
|
+
|
|
542
|
+
self.enabled = widgets.LabeledSwitch(self, label="Enable im2deep", default=False)
|
|
543
|
+
self.enabled.grid(row=1, column=0, pady=(0, 10), sticky="nsew")
|
|
544
|
+
|
|
545
|
+
def get(self) -> Dict:
|
|
546
|
+
"""Return the configuration as a dictionary."""
|
|
547
|
+
enabled = self.enabled.get()
|
|
548
|
+
config = {}
|
|
549
|
+
return enabled, config
|
|
550
|
+
|
|
551
|
+
|
|
526
552
|
class RescoringEngineConfig(ctk.CTkFrame):
|
|
527
553
|
def __init__(self, *args, **kwargs):
|
|
528
554
|
"""Rescoring engine configuration frame."""
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "./config_schema.json",
|
|
3
|
+
"ms2rescore": {
|
|
4
|
+
"feature_generators": {
|
|
5
|
+
"basic": {},
|
|
6
|
+
"ms2pip": {
|
|
7
|
+
"model": "timsTOF",
|
|
8
|
+
"ms2_tolerance": 0.02
|
|
9
|
+
},
|
|
10
|
+
"deeplc": {
|
|
11
|
+
"deeplc_retrain": false
|
|
12
|
+
},
|
|
13
|
+
"im2deep": {},
|
|
14
|
+
"maxquant": {}
|
|
15
|
+
},
|
|
16
|
+
"rescoring_engine": {
|
|
17
|
+
"mokapot": {
|
|
18
|
+
"write_weights": true,
|
|
19
|
+
"write_txt": true,
|
|
20
|
+
"write_flashlfq": true
|
|
21
|
+
}
|
|
22
|
+
},
|
|
23
|
+
"psm_file": null
|
|
24
|
+
}
|
|
25
|
+
}
|
|
@@ -29,6 +29,9 @@
|
|
|
29
29
|
},
|
|
30
30
|
"ionmob": {
|
|
31
31
|
"$ref": "#/definitions/ionmob"
|
|
32
|
+
},
|
|
33
|
+
"im2deep": {
|
|
34
|
+
"$ref": "#/definitions/im2deep"
|
|
32
35
|
}
|
|
33
36
|
},
|
|
34
37
|
"default": {
|
|
@@ -107,6 +110,18 @@
|
|
|
107
110
|
"default": "(.*)",
|
|
108
111
|
"format": "regex"
|
|
109
112
|
},
|
|
113
|
+
"psm_id_rt_pattern": {
|
|
114
|
+
"description": "Regex pattern to extract retention time from PSM identifier. Requires at least one capturing group.",
|
|
115
|
+
"oneOf": [{ "type": "string" }, { "type": "null" }],
|
|
116
|
+
"default": null,
|
|
117
|
+
"format": "regex"
|
|
118
|
+
},
|
|
119
|
+
"psm_id_im_pattern": {
|
|
120
|
+
"description": "Regex pattern to extract ion mobility from PSM identifier. Requires at least one capturing group.",
|
|
121
|
+
"oneOf": [{ "type": "string" }, { "type": "null" }],
|
|
122
|
+
"default": null,
|
|
123
|
+
"format": "regex"
|
|
124
|
+
},
|
|
110
125
|
"lower_score_is_better": {
|
|
111
126
|
"description": "Bool indicating if lower score is better",
|
|
112
127
|
"type": "boolean",
|
|
@@ -224,6 +239,19 @@
|
|
|
224
239
|
}
|
|
225
240
|
}
|
|
226
241
|
},
|
|
242
|
+
"im2deep": {
|
|
243
|
+
"$ref": "#/definitions/feature_generator",
|
|
244
|
+
"description": "Ion mobility feature generator configuration using IM2Deep",
|
|
245
|
+
"type": "object",
|
|
246
|
+
"additionalProperties": true,
|
|
247
|
+
"properties": {
|
|
248
|
+
"reference_dataset": {
|
|
249
|
+
"description": "Path to IM2Deep reference dataset file",
|
|
250
|
+
"type": "string",
|
|
251
|
+
"default": "Meier_unimod.parquet"
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
},
|
|
227
255
|
"mokapot": {
|
|
228
256
|
"$ref": "#/definitions/rescoring_engine",
|
|
229
257
|
"description": "Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function.",
|
|
@@ -27,6 +27,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
|
|
|
27
27
|
psm_list = _read_psms(config, psm_list)
|
|
28
28
|
_find_decoys(config, psm_list)
|
|
29
29
|
_calculate_qvalues(config, psm_list)
|
|
30
|
+
if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]:
|
|
31
|
+
logger.debug("Parsing retention time and/or ion mobility from PSM identifier...")
|
|
32
|
+
_parse_values_spectrum_id(config, psm_list)
|
|
30
33
|
|
|
31
34
|
# Store scoring values for comparison later
|
|
32
35
|
for psm in psm_list:
|
|
@@ -51,7 +54,8 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
|
|
|
51
54
|
non_mapped_modifications = modifications_found - set(config["modification_mapping"].keys())
|
|
52
55
|
if non_mapped_modifications:
|
|
53
56
|
logger.warning(
|
|
54
|
-
f"Non-mapped modifications found: {non_mapped_modifications}\
|
|
57
|
+
f"Non-mapped modifications found: {non_mapped_modifications}\n"
|
|
58
|
+
"This can be ignored if they are Unimod modification labels."
|
|
55
59
|
)
|
|
56
60
|
psm_list.rename_modifications(config["modification_mapping"])
|
|
57
61
|
psm_list.add_fixed_modifications(config["fixed_modifications"])
|
|
@@ -154,6 +158,46 @@ def _match_psm_ids(old_id, regex_pattern):
|
|
|
154
158
|
)
|
|
155
159
|
|
|
156
160
|
|
|
161
|
+
def _parse_values_spectrum_id(config, psm_list):
    """
    Parse retention time and/or ion mobility values from the spectrum_id.

    For each of the ``psm_id_rt_pattern`` and ``psm_id_im_pattern`` configuration keys that
    holds a non-empty regex pattern, the first capturing group of that pattern is extracted
    from every ``psm.spectrum_id``, converted to ``float``, and assigned to
    ``psm_list["retention_time"]`` or ``psm_list["ion_mobility"]``, respectively.

    Parameters
    ----------
    config
        Mapping that provides the ``psm_id_rt_pattern`` and ``psm_id_im_pattern`` keys.
    psm_list
        Iterable of PSMs exposing ``spectrum_id``; must support column assignment by key.

    Raises
    ------
    MS2RescoreConfigurationError
        If a configured pattern does not match a spectrum_id, has no capturing group, or
        captures a value that cannot be converted to ``float``.

    """
    # (configuration key, target column in psm_list, label used in log and error messages)
    targets = (
        ("psm_id_rt_pattern", "retention_time", "retention time"),
        ("psm_id_im_pattern", "ion_mobility", "ion mobility"),
    )
    for pattern_key, column, label in targets:
        pattern = config[pattern_key]
        if not pattern:
            continue
        logger.debug(f"Parsing {label} from spectrum_id with regex pattern {pattern}")
        psm_list[column] = _extract_floats_from_spectrum_ids(psm_list, pattern, label)


def _extract_floats_from_spectrum_ids(psm_list, pattern, label):
    """Return the first capturing group of ``pattern`` in each spectrum_id as a float."""
    try:
        compiled = re.compile(pattern)
        return [float(compiled.search(psm.spectrum_id).group(1)) for psm in psm_list]
    # AttributeError: no match (``search`` returned None); IndexError: the pattern has no
    # capturing group; ValueError: the captured text is not a number.
    except (AttributeError, IndexError, ValueError) as e:
        raise MS2RescoreConfigurationError(
            f"Could not parse {label} from spectrum_id with the "
            f"{pattern} regex pattern. "
            f"Please make sure the {label} key is present in the spectrum_id "
            "and the value is in a capturing group or disable the relevant feature generator."
        ) from e
|
|
199
|
+
|
|
200
|
+
|
|
157
201
|
def _has_invalid_aminoacids(psm):
|
|
158
202
|
"""Check if a PSM contains invalid amino acids."""
|
|
159
203
|
|
|
@@ -145,9 +145,11 @@ def _collect_files(output_path_prefix, use_txt_log=False):
|
|
|
145
145
|
"configuration": Path(output_path_prefix + ".full-config.json").resolve(),
|
|
146
146
|
"feature names": Path(output_path_prefix + ".feature_names.tsv").resolve(),
|
|
147
147
|
"feature weights": Path(output_path_prefix + ".mokapot.weights.tsv").resolve(),
|
|
148
|
-
"log":
|
|
149
|
-
|
|
150
|
-
|
|
148
|
+
"log": (
|
|
149
|
+
Path(output_path_prefix + ".log.txt").resolve()
|
|
150
|
+
if use_txt_log
|
|
151
|
+
else Path(output_path_prefix + ".log.html").resolve()
|
|
152
|
+
),
|
|
151
153
|
}
|
|
152
154
|
for file, path in files.items():
|
|
153
155
|
if Path(path).is_file():
|
|
@@ -321,16 +323,12 @@ def _get_features_context(
|
|
|
321
323
|
import deeplc.plot
|
|
322
324
|
|
|
323
325
|
scatter_chart = deeplc.plot.scatter(
|
|
324
|
-
df=features[
|
|
325
|
-
(psm_list["is_decoy"] == False) & (psm_list["qvalue"] <= 0.01)
|
|
326
|
-
], # noqa: E712
|
|
326
|
+
df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)],
|
|
327
327
|
predicted_column="predicted_retention_time_best",
|
|
328
328
|
observed_column="observed_retention_time_best",
|
|
329
329
|
)
|
|
330
330
|
baseline_chart = deeplc.plot.distribution_baseline(
|
|
331
|
-
df=features[
|
|
332
|
-
(psm_list["is_decoy"] == False) & (psm_list["qvalue"] <= 0.01)
|
|
333
|
-
], # noqa: E712
|
|
331
|
+
df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)],
|
|
334
332
|
predicted_column="predicted_retention_time_best",
|
|
335
333
|
observed_column="observed_retention_time_best",
|
|
336
334
|
)
|
|
@@ -343,6 +341,26 @@ def _get_features_context(
|
|
|
343
341
|
}
|
|
344
342
|
)
|
|
345
343
|
|
|
344
|
+
# IM2Deep specific charts
|
|
345
|
+
if "im2deep" in feature_names:
|
|
346
|
+
import deeplc.plot
|
|
347
|
+
|
|
348
|
+
scatter_chart = deeplc.plot.scatter(
|
|
349
|
+
df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)],
|
|
350
|
+
predicted_column="ccs_predicted_im2deep",
|
|
351
|
+
observed_column="ccs_observed_im2deep",
|
|
352
|
+
xaxis_label="Observed CCS",
|
|
353
|
+
yaxis_label="Predicted CCS",
|
|
354
|
+
plot_title="Predicted vs. observed CCS",
|
|
355
|
+
)
|
|
356
|
+
|
|
357
|
+
context["charts"].append(
|
|
358
|
+
{
|
|
359
|
+
"title": TEXTS["charts"]["im2deep_performance"]["title"],
|
|
360
|
+
"description": TEXTS["charts"]["im2deep_performance"]["description"],
|
|
361
|
+
"chart": scatter_chart.to_html(**PLOTLY_HTML_KWARGS),
|
|
362
|
+
}
|
|
363
|
+
)
|
|
346
364
|
return context
|
|
347
365
|
|
|
348
366
|
|
|
@@ -105,3 +105,9 @@ bottom chart shows the distribution of RMAE values of DeepLC predictions on 460
|
|
|
105
105
|
datasets. The red line indicates the RMAE value for all target PSMs that passed the 1% FDR threshold
|
|
106
106
|
of the current dataset. A lower RMAE value indicates better performance.
|
|
107
107
|
"""
|
|
108
|
+
|
|
109
|
+
[charts.im2deep_performance]
|
|
110
|
+
title = "IM2Deep model performance"
|
|
111
|
+
description = """
|
|
112
|
+
IM2Deep model performance can be visualized by plotting the predicted CCS against the observed CCS.
|
|
113
|
+
"""
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
from glob import glob
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional, Union
|
|
7
|
+
|
|
8
|
+
from ms2rescore.exceptions import MS2RescoreConfigurationError
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def infer_spectrum_path(
    configured_path: Union[str, Path, None],
    run_name: Optional[str] = None,
) -> Union[str, Path]:
    """
    Infer spectrum path from passed path and expected filename (e.g. from PSM file).

    Parameters
    ----------
    configured_path: str, Path, None
        User-defined path to spectrum file or directory containing spectrum file
    run_name : str, optional
        MS run name (stem of spectrum filename), e.g., as expected from PSM file.

    Returns
    -------
    pathlib.Path
        Resolved spectrum path. If the resolved path already ends in a supported extension
        (``.mgf``, ``.mzml``, ``.d``; case-insensitive) or is a miniTDF folder, it is returned
        as is; otherwise the first existing file (in sorted order) that starts with the
        resolved path and ends in a supported extension is returned.

    Raises
    ------
    MS2RescoreConfigurationError
        If the path cannot be resolved: nothing configured and no run name, a directory
        configured without run name, a configured path that does not exist, or no existing
        file with a supported extension found.

    """
    # Supported extensions must sit at the END of the path, hence ``search`` with ``$``.
    supported_extension = re.compile(r"\.(mgf|mzml|d)$", flags=re.IGNORECASE)

    # If no spectrum path configured, use expected run_name in default dir
    if not configured_path:
        if run_name:
            resolved_path = os.path.join(".", run_name)
        else:
            raise MS2RescoreConfigurationError(
                "Could not resolve spectrum file name: No spectrum path configured "
                "and no run name in PSM file found."
            )

    else:
        # Continue with a plain string: ``Path`` has no ``endswith`` and cannot be
        # concatenated with the glob wildcard further down.
        configured_path = str(configured_path)
        is_bruker_dir = configured_path.endswith(".d") or _is_minitdf(configured_path)

        # If passed path is directory (that is not Bruker raw), join with run name
        if os.path.isdir(configured_path) and not is_bruker_dir:
            if run_name:
                resolved_path = os.path.join(configured_path, run_name)
            else:
                raise MS2RescoreConfigurationError(
                    "Could not resolve spectrum file name: Spectrum path is directory "
                    "but no run name in PSM file found."
                )

        # If passed path is file, use that, but warn if basename doesn't match expected
        elif os.path.isfile(configured_path) or (os.path.isdir(configured_path) and is_bruker_dir):
            if run_name and Path(configured_path).stem != Path(run_name).stem:
                logger.warning(
                    "Passed spectrum path (`%s`) does not match run name found in PSM "
                    "file (`%s`). Continuing with passed spectrum path.",
                    configured_path,
                    run_name,
                )
            resolved_path = configured_path
        else:
            raise MS2RescoreConfigurationError(
                "Configured `spectrum_path` must be `None` or a path to an existing file "
                "or directory. If `None` or path to directory, spectrum run information "
                "should be present in the PSM file."
            )

    # Match with file extension if not in resolved_path yet
    if not _is_minitdf(resolved_path) and not supported_extension.search(resolved_path):
        # Sorted so that the selected file does not depend on file system listing order.
        for filename in sorted(glob(resolved_path + "*")):
            if supported_extension.search(filename):
                resolved_path = filename
                break
        else:
            raise MS2RescoreConfigurationError(
                f"Resolved spectrum filename ('{resolved_path}') does not contain a supported "
                "file extension (mzML, MGF, or .d) and could not find any matching existing "
                "files."
            )

    return Path(resolved_path)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _is_minitdf(spectrum_file: str) -> bool:
|
|
87
|
+
"""
|
|
88
|
+
Check if the spectrum file is a Bruker miniTDF folder.
|
|
89
|
+
|
|
90
|
+
A Bruker miniTDF folder has no fixed name, but contains files matching the patterns
|
|
91
|
+
``*ms2spectrum.bin`` and ``*ms2spectrum.parquet``.
|
|
92
|
+
"""
|
|
93
|
+
files = set(Path(spectrum_file).glob("*ms2spectrum.bin"))
|
|
94
|
+
files.update(Path(spectrum_file).glob("*ms2spectrum.parquet"))
|
|
95
|
+
return len(files) >= 2
|
|
@@ -32,25 +32,26 @@ classifiers = [
|
|
|
32
32
|
dynamic = ["version"]
|
|
33
33
|
requires-python = ">=3.8"
|
|
34
34
|
dependencies = [
|
|
35
|
-
"ms2rescore_rs",
|
|
36
|
-
"numpy>=1.16.0; python_version != '3.11'",
|
|
37
|
-
"numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
|
|
38
|
-
"pandas>=1.0",
|
|
39
|
-
"rich>=12",
|
|
40
|
-
"pyteomics>=4.1.0",
|
|
41
|
-
"lxml>=4.5",
|
|
42
|
-
"ms2pip>=4.0.0-dev4",
|
|
43
|
-
"click>=7",
|
|
44
35
|
"cascade-config>=0.4.0",
|
|
36
|
+
"click>=7",
|
|
37
|
+
"customtkinter>=5,<6",
|
|
45
38
|
"deeplc>=2.2",
|
|
46
39
|
"deeplcretrainer>=0.2",
|
|
47
|
-
"
|
|
48
|
-
"psm_utils>=0.4",
|
|
49
|
-
"customtkinter>=5,<6",
|
|
50
|
-
"mokapot>=0.9",
|
|
51
|
-
"pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils
|
|
40
|
+
"im2deep>=0.1.3",
|
|
52
41
|
"jinja2>=3",
|
|
42
|
+
"lxml>=4.5",
|
|
43
|
+
"mokapot>=0.9",
|
|
44
|
+
"ms2pip>=4.0.0-dev10",
|
|
45
|
+
"ms2rescore_rs",
|
|
46
|
+
"numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
|
|
47
|
+
"numpy>=1.16.0; python_version != '3.11'",
|
|
48
|
+
"pandas>=1.0",
|
|
53
49
|
"plotly>=5",
|
|
50
|
+
"psm_utils>=0.8",
|
|
51
|
+
"pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils
|
|
52
|
+
"pyteomics>=4.1.0, <4.7",
|
|
53
|
+
"rich>=12",
|
|
54
|
+
"tomli>=2; python_version < '3.11'",
|
|
54
55
|
]
|
|
55
56
|
|
|
56
57
|
[project.optional-dependencies]
|
|
@@ -79,6 +80,7 @@ CompOmics = "https://www.compomics.com"
|
|
|
79
80
|
ms2rescore = "ms2rescore.__main__:main"
|
|
80
81
|
ms2rescore-gui = "ms2rescore.gui.__main__:main"
|
|
81
82
|
ms2rescore-report = "ms2rescore.report.__main__:main"
|
|
83
|
+
tims2rescore = "ms2rescore.__main__:main_tims"
|
|
82
84
|
|
|
83
85
|
[build-system]
|
|
84
86
|
requires = ["flit_core >=3.2,<4"]
|
|
@@ -94,3 +96,6 @@ target-version = ['py38']
|
|
|
94
96
|
[tool.ruff]
|
|
95
97
|
line-length = 99
|
|
96
98
|
target-version = 'py38'
|
|
99
|
+
|
|
100
|
+
[tool.ruff.lint]
|
|
101
|
+
extend-select = ["T201", "T203"]
|
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
import os
|
|
3
|
-
import re
|
|
4
|
-
from glob import glob
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Optional, Union
|
|
7
|
-
|
|
8
|
-
from ms2rescore.exceptions import MS2RescoreConfigurationError
|
|
9
|
-
|
|
10
|
-
logger = logging.getLogger(__name__)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def infer_spectrum_path(
    configured_path: Union[str, Path, None],
    run_name: Optional[str] = None,
) -> Union[str, Path]:
    """
    Infer spectrum path from passed path and expected filename (e.g. from PSM file).

    Parameters
    ----------
    configured_path: str, Path, None
        User-defined path to spectrum file or directory containing spectrum file
    run_name : str, optional
        MS run name (stem of spectrum filename), e.g., as expected from PSM file.

    Returns
    -------
    pathlib.Path
        Resolved spectrum path. If the resolved path already ends in ``.mgf`` or ``.mzml``
        (case-insensitive) it is returned as is; otherwise the first existing file (in
        sorted order) that starts with the resolved path and ends in one of these
        extensions is returned.

    Raises
    ------
    MS2RescoreConfigurationError
        If the path cannot be resolved: nothing configured and no run name, a directory
        configured without run name, a configured path that does not exist, or no existing
        file with a supported extension found.

    """
    # Supported extensions must sit at the END of the path, hence ``search`` with ``$``.
    supported_extension = re.compile(r"\.(mgf|mzml)$", flags=re.IGNORECASE)

    # If no spectrum path configured, use expected run_name in default dir
    if not configured_path:
        if run_name:
            resolved_path = os.path.join(".", run_name)
        else:
            raise MS2RescoreConfigurationError(
                "Could not resolve spectrum file name: No spectrum path configured "
                "and no run name in PSM file found."
            )

    # If passed path is directory, join with run name
    elif os.path.isdir(configured_path):
        if run_name:
            resolved_path = os.path.join(configured_path, run_name)
        else:
            raise MS2RescoreConfigurationError(
                "Could not resolve spectrum file name: Spectrum path is directory "
                "but no run name in PSM file found."
            )

    # If passed path is file, use that, but warn if basename doesn't match expected
    elif os.path.isfile(configured_path):
        if run_name and Path(configured_path).stem != Path(run_name).stem:
            logger.warning(
                "Passed spectrum path (`%s`) does not match run name found in PSM "
                "file (`%s`). Continuing with passed spectrum path.",
                configured_path,
                run_name,
            )
        # Plain string, so that regex matching and glob concatenation also work for ``Path``.
        resolved_path = str(configured_path)
    else:
        raise MS2RescoreConfigurationError(
            "Configured `spectrum_path` must be `None` or a path to an existing file "
            "or directory. If `None` or path to directory, spectrum run information "
            "should be present in the PSM file."
        )

    # Match with file extension if not in resolved_path yet
    if not supported_extension.search(resolved_path):
        # Sorted so that the selected file does not depend on file system listing order.
        for filename in sorted(glob(resolved_path + "*")):
            if supported_extension.search(filename):
                resolved_path = filename
                break
        else:
            raise MS2RescoreConfigurationError(
                "Resolved spectrum filename does not contain a supported file "
                "extension (mgf or mzml) and could not find any matching existing "
                "files."
            )

    return Path(resolved_path)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/package_data/img/github-mark-white.png
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ms2rescore-3.0.3 → ms2rescore-3.1.0.dev1}/ms2rescore/package_data/ms2rescore-gui-theme.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|