ms2rescore 3.1.0.dev6__tar.gz → 3.1.0.dev7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/PKG-INFO +5 -6
  2. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/__init__.py +1 -1
  3. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/core.py +80 -27
  4. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/exceptions.py +6 -0
  5. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/package_data/config_default.json +1 -0
  6. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/package_data/config_schema.json +13 -2
  7. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/parse_psms.py +27 -29
  8. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/charts.py +3 -2
  9. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/rescoring_engines/mokapot.py +63 -39
  10. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/pyproject.toml +6 -5
  11. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/LICENSE +0 -0
  12. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/README.md +0 -0
  13. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/__main__.py +0 -0
  14. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/config_parser.py +0 -0
  15. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/feature_generators/__init__.py +0 -0
  16. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/feature_generators/base.py +0 -0
  17. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/feature_generators/basic.py +0 -0
  18. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/feature_generators/deeplc.py +0 -0
  19. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/feature_generators/im2deep.py +0 -0
  20. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/feature_generators/ionmob.py +0 -0
  21. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/feature_generators/maxquant.py +0 -0
  22. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/feature_generators/ms2pip.py +0 -0
  23. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/gui/__init__.py +0 -0
  24. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/gui/__main__.py +0 -0
  25. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/gui/app.py +0 -0
  26. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/gui/function2ctk.py +0 -0
  27. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/gui/widgets.py +0 -0
  28. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/package_data/__init__.py +0 -0
  29. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/package_data/config_default_tims.json +0 -0
  30. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/package_data/img/__init__.py +0 -0
  31. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/package_data/img/config_icon.png +0 -0
  32. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/package_data/img/github-mark-white.png +0 -0
  33. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/package_data/img/github-mark.png +0 -0
  34. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/package_data/img/ms2rescore_logo.png +0 -0
  35. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/package_data/img/program_icon.ico +0 -0
  36. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/package_data/ms2rescore-gui-theme.json +0 -0
  37. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/parse_spectra.py +0 -0
  38. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/__init__.py +0 -0
  39. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/__main__.py +0 -0
  40. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/generate.py +0 -0
  41. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/templates/__init__.py +0 -0
  42. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/templates/about.html +0 -0
  43. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/templates/base.html +0 -0
  44. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/templates/config.html +0 -0
  45. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/templates/features.html +0 -0
  46. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/templates/log.html +0 -0
  47. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/templates/metadata.html +0 -0
  48. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/templates/overview.html +0 -0
  49. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/templates/stats-card.html +0 -0
  50. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/templates/style.html +0 -0
  51. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/templates/target-decoy.html +0 -0
  52. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/templates/texts.toml +0 -0
  53. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/report/utils.py +0 -0
  54. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/rescoring_engines/__init__.py +0 -0
  55. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/rescoring_engines/percolator.py +0 -0
  56. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev7}/ms2rescore/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ms2rescore
3
- Version: 3.1.0.dev6
3
+ Version: 3.1.0.dev7
4
4
  Summary: MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and retention times.
5
5
  Keywords: MS2Rescore,MS2PIP,DeepLC,Percolator,proteomics,mass spectrometry,peptide identification,rescoring,machine learning
6
6
  Author: Ana Sílvia C. Silva, Robbin Bouwmeester, Louise Buur
@@ -24,13 +24,12 @@ Requires-Dist: lxml>=4.5
24
24
  Requires-Dist: mokapot>=0.9
25
25
  Requires-Dist: ms2pip>=4.0.0-dev10
26
26
  Requires-Dist: ms2rescore_rs
27
- Requires-Dist: numpy==1.24.3; python_version == '3.11'
28
- Requires-Dist: numpy>=1.16.0; python_version != '3.11'
27
+ Requires-Dist: numpy>=1.16.0
28
+ Requires-Dist: scikit-learn==1.5.1; python_version == '3.11'
29
29
  Requires-Dist: pandas>=1.0
30
30
  Requires-Dist: plotly>=5
31
- Requires-Dist: psm_utils>=0.8
32
- Requires-Dist: pydantic>=1.8.2,<2
33
- Requires-Dist: pyteomics>=4.1.0, <4.7
31
+ Requires-Dist: psm_utils>=0.9
32
+ Requires-Dist: pyteomics>=4.7.2
34
33
  Requires-Dist: rich>=12
35
34
  Requires-Dist: tomli>=2; python_version < '3.11'
36
35
  Requires-Dist: ruff ; extra == "dev"
@@ -1,6 +1,6 @@
1
1
  """MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and RTs."""
2
2
 
3
- __version__ = "3.1.0-dev6"
3
+ __version__ = "3.1.0-dev7"
4
4
 
5
5
  from warnings import filterwarnings
6
6
 
@@ -5,6 +5,7 @@ from typing import Dict, Optional
5
5
 
6
6
  import numpy as np
7
7
  import psm_utils.io
8
+ from mokapot.dataset import LinearPsmDataset
8
9
  from psm_utils import PSMList
9
10
 
10
11
  from ms2rescore import exceptions
@@ -13,6 +14,7 @@ from ms2rescore.parse_psms import parse_psms
13
14
  from ms2rescore.parse_spectra import get_missing_values
14
15
  from ms2rescore.report import generate
15
16
  from ms2rescore.rescoring_engines import mokapot, percolator
17
+ from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence
16
18
 
17
19
  logger = logging.getLogger(__name__)
18
20
 
@@ -104,8 +106,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
104
106
  logging.debug(f"Creating USIs for {len(psm_list)} PSMs")
105
107
  psm_list["spectrum_id"] = [psm.get_usi(as_url=False) for psm in psm_list]
106
108
 
107
- # If no rescoring engine is specified, write PSMs and features to PIN file
108
- if not config["rescoring_engine"]:
109
+ # If no rescoring engine is specified or DEBUG, write PSMs and features to PIN file
110
+ if not config["rescoring_engine"] or config["log_level"] == "debug":
109
111
  logger.info(f"Writing added features to PIN file: {output_file_root}.psms.pin")
110
112
  psm_utils.io.write_file(
111
113
  psm_list,
@@ -113,42 +115,52 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
113
115
  filetype="percolator",
114
116
  feature_names=all_feature_names,
115
117
  )
118
+
119
+ if not config["rescoring_engine"]:
120
+ logger.info("No rescoring engine specified. Skipping rescoring.")
116
121
  return None
117
122
 
118
123
  # Rescore PSMs
119
- if "percolator" in config["rescoring_engine"]:
120
- percolator.rescore(
121
- psm_list,
122
- output_file_root=output_file_root,
123
- log_level=config["log_level"],
124
- processes=config["processes"],
125
- percolator_kwargs=config["rescoring_engine"]["percolator"],
126
- )
127
- elif "mokapot" in config["rescoring_engine"]:
128
- if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
129
- config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
130
- if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
131
- protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
132
- else:
133
- protein_kwargs = dict()
134
-
135
- mokapot.rescore(
136
- psm_list,
137
- output_file_root=output_file_root,
138
- protein_kwargs=protein_kwargs,
139
- **config["rescoring_engine"]["mokapot"],
140
- )
124
+ try:
125
+ if "percolator" in config["rescoring_engine"]:
126
+ percolator.rescore(
127
+ psm_list,
128
+ output_file_root=output_file_root,
129
+ log_level=config["log_level"],
130
+ processes=config["processes"],
131
+ percolator_kwargs=config["rescoring_engine"]["percolator"],
132
+ )
133
+ elif "mokapot" in config["rescoring_engine"]:
134
+ if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
135
+ config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
136
+ if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
137
+ protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
138
+ else:
139
+ protein_kwargs = dict()
140
+
141
+ mokapot.rescore(
142
+ psm_list,
143
+ output_file_root=output_file_root,
144
+ protein_kwargs=protein_kwargs,
145
+ **config["rescoring_engine"]["mokapot"],
146
+ )
147
+ except exceptions.RescoringError as e:
148
+ logger.exception(e)
149
+ rescoring_succeeded = False
141
150
  else:
142
- logger.info("No known rescoring engine specified. Skipping rescoring.")
151
+ rescoring_succeeded = True
152
+ _log_id_psms_after(psm_list, id_psms_before)
143
153
 
144
- _log_id_psms_after(psm_list, id_psms_before)
154
+ # Workaround for broken PEP calculation if best PSM is decoy
155
+ if all(psm_list["pep"] == 1.0):
156
+ psm_list = _fix_constant_pep(psm_list)
145
157
 
146
158
  # Write output
147
159
  logger.info(f"Writing output to {output_file_root}.psms.tsv...")
148
160
  psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv")
149
161
 
150
162
  # Write report
151
- if config["write_report"]:
163
+ if config["write_report"] and rescoring_succeeded:
152
164
  try:
153
165
  generate.generate_report(
154
166
  output_file_root, psm_list=psm_list, feature_names=feature_names, use_txt_log=True
@@ -231,3 +243,44 @@ def _log_id_psms_after(psm_list, id_psms_before):
231
243
  logger.info(f"Identified {diff_numbers} {diff_word} PSMs at 1% FDR after rescoring.")
232
244
 
233
245
  return id_psms_after
246
+
247
+
248
+ def _fix_constant_pep(psm_list):
249
+ """Workaround for broken PEP calculation if best PSM is decoy."""
250
+ logger.warning(
251
+ "Attempting to fix constant PEP values by removing decoy PSMs that score higher than the "
252
+ "best target PSM."
253
+ )
254
+ max_target_score = psm_list["score"][~psm_list["is_decoy"]].max()
255
+ higher_scoring_decoys = psm_list["is_decoy"] & (psm_list["score"] > max_target_score)
256
+
257
+ if not higher_scoring_decoys.any():
258
+ logger.warning("No decoys scoring higher than the best target found. Skipping fix.")
259
+ else:
260
+ logger.warning(f"Removing {higher_scoring_decoys.sum()} decoy PSMs.")
261
+
262
+ psm_list = psm_list[~higher_scoring_decoys]
263
+
264
+ # Minimal conversion to LinearPsmDataset
265
+ psm_df = psm_list.to_dataframe()
266
+ psm_df = psm_df.reset_index(drop=True).reset_index()
267
+ psm_df["peptide"] = (
268
+ psm_df["peptidoform"].astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True)
269
+ )
270
+ psm_df["is_target"] = ~psm_df["is_decoy"]
271
+ lin_psm_data = LinearPsmDataset(
272
+ psms=psm_df[["index", "peptide", "score", "is_target"]],
273
+ target_column="is_target",
274
+ spectrum_columns="index", # Use artificial index to allow multi-rank rescoring
275
+ peptide_column="peptide",
276
+ feature_columns=["score"],
277
+ )
278
+
279
+ # Recalculate confidence
280
+ new_confidence = lin_psm_data.assign_confidence()
281
+
282
+ # Add new confidence estimations to PSMList
283
+ add_psm_confidence(psm_list, new_confidence)
284
+ add_peptide_confidence(psm_list, new_confidence)
285
+
286
+ return psm_list
@@ -35,3 +35,9 @@ class ReportGenerationError(MS2RescoreError):
35
35
  """Error while generating report."""
36
36
 
37
37
  pass
38
+
39
+
40
+ class RescoringError(MS2RescoreError):
41
+ """Error while rescoring PSMs."""
42
+
43
+ pass
@@ -14,6 +14,7 @@
14
14
  },
15
15
  "rescoring_engine": {
16
16
  "mokapot": {
17
+ "train_fdr": 0.01,
17
18
  "write_weights": true,
18
19
  "write_txt": true,
19
20
  "write_flashlfq": true
@@ -68,7 +68,11 @@
68
68
  },
69
69
  "psm_file": {
70
70
  "description": "Path to file with peptide-spectrum matches.",
71
- "oneOf": [{ "type": "string" }, { "type": "null" }, { "type": "array", "items": { "type": "string" } }]
71
+ "oneOf": [
72
+ { "type": "string" },
73
+ { "type": "null" },
74
+ { "type": "array", "items": { "type": "string" } }
75
+ ]
72
76
  },
73
77
  "psm_file_type": {
74
78
  "description": "PSM file type. By default inferred from file extension.",
@@ -159,7 +163,7 @@
159
163
  "default": false
160
164
  },
161
165
  "profile": {
162
- "description": "Write an txt report using cProfile for profiling",
166
+ "description": "Write a txt report using cProfile for profiling",
163
167
  "type": "boolean",
164
168
  "default": false
165
169
  }
@@ -263,6 +267,13 @@
263
267
  "type": "object",
264
268
  "additionalProperties": true,
265
269
  "properties": {
270
+ "train_fdr": {
271
+ "description": "FDR threshold for training Mokapot",
272
+ "type": "number",
273
+ "minimum": 0,
274
+ "maximum": 1,
275
+ "default": 0.01
276
+ },
266
277
  "write_weights": {
267
278
  "description": "Write Mokapot weights to a text file",
268
279
  "type": "boolean",
@@ -2,6 +2,7 @@ import logging
2
2
  import re
3
3
  from typing import Dict, Union
4
4
 
5
+ import numpy as np
5
6
  import psm_utils.io
6
7
  from psm_utils import PSMList
7
8
 
@@ -25,6 +26,7 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
25
26
  """
26
27
  # Read PSMs, find decoys, calculate q-values
27
28
  psm_list = _read_psms(config, psm_list)
29
+ psm_list = _remove_invalid_aa(psm_list)
28
30
  _find_decoys(config, psm_list)
29
31
  _calculate_qvalues(config, psm_list)
30
32
  if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]:
@@ -70,10 +72,6 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
70
72
  new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
71
73
  psm_list["spectrum_id"] = new_ids
72
74
 
73
- # TODO: Temporary fix until implemented in psm_utils
74
- # Ensure that spectrum IDs are strings (Pydantic 2.0 does not coerce int to str)
75
- psm_list["spectrum_id"] = [str(spec_id) for spec_id in psm_list["spectrum_id"]]
76
-
77
75
  return psm_list
78
76
 
79
77
 
@@ -82,21 +80,20 @@ def _read_psms(config, psm_list):
82
80
  return psm_list
83
81
  else:
84
82
  logger.info("Reading PSMs from file...")
85
- current_file = 1
86
83
  total_files = len(config["psm_file"])
87
- valid_psms_list = []
88
- total_psms = 0
89
- valid_psms = 0
90
- for psm_file in config["psm_file"]:
84
+ psm_list = []
85
+ for current_file, psm_file in enumerate(config["psm_file"]):
91
86
  logger.info(
92
- f"Reading PSMs from PSM file ({current_file}/{total_files}): '{psm_file}'..."
87
+ f"Reading PSMs from PSM file ({current_file+1}/{total_files}): '{psm_file}'..."
93
88
  )
94
89
  try:
95
- id_file_psm_list = psm_utils.io.read_file(
96
- psm_file,
97
- filetype=config["psm_file_type"],
98
- show_progressbar=True,
99
- **config["psm_reader_kwargs"],
90
+ psm_list.extend(
91
+ psm_utils.io.read_file(
92
+ psm_file,
93
+ filetype=config["psm_file_type"],
94
+ show_progressbar=True,
95
+ **config["psm_reader_kwargs"],
96
+ )
100
97
  )
101
98
  except psm_utils.io.PSMUtilsIOException:
102
99
  raise MS2RescoreConfigurationError(
@@ -105,18 +102,9 @@ def _read_psms(config, psm_list):
105
102
  "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
106
103
  " for more information."
107
104
  )
105
+ logger.debug(f"Read {len(psm_list)} PSMs from '{psm_file}'.")
108
106
 
109
- total_psms += len(id_file_psm_list.psm_list)
110
- for psm in id_file_psm_list.psm_list:
111
- if not _has_invalid_aminoacids(psm):
112
- valid_psms_list.append(psm)
113
- valid_psms += 1
114
- current_file += 1
115
- if total_psms - valid_psms > 0:
116
- logger.warning(
117
- f"{total_psms - valid_psms} PSMs with invalid amino acids were removed."
118
- )
119
- return PSMList(psm_list=valid_psms_list)
107
+ return PSMList(psm_list=psm_list)
120
108
 
121
109
 
122
110
  def _find_decoys(config, psm_list):
@@ -175,6 +163,7 @@ def _parse_values_spectrum_id(config, psm_list):
175
163
  raise MS2RescoreConfigurationError(
176
164
  f"Could not parse retention time from spectrum_id with the "
177
165
  f"{config['psm_id_rt_pattern']} regex pattern. "
166
+ f"Example spectrum_id: '{psm_list[0].spectrum_id}'\n."
178
167
  "Please make sure the retention time key is present in the spectrum_id "
179
168
  "and the value is in a capturing group or disable the relevant feature generator."
180
169
  )
@@ -198,7 +187,16 @@ def _parse_values_spectrum_id(config, psm_list):
198
187
  )
199
188
 
200
189
 
201
- def _has_invalid_aminoacids(psm):
202
- """Check if a PSM contains invalid amino acids."""
190
+ def _remove_invalid_aa(psm_list: PSMList) -> PSMList:
191
+ """Remove PSMs with invalid amino acids."""
192
+ logger.debug("Removing PSMs with invalid amino acids...")
193
+ invalid_psms = np.array(
194
+ [any(aa in "BJOUXZ" for aa in psm.peptidoform.sequence) for psm in psm_list]
195
+ )
203
196
 
204
- return any(aa not in "ACDEFGHIKLMNPQRSTVWY" for aa in psm.peptidoform.sequence)
197
+ if any(invalid_psms):
198
+ logger.warning(f"Removed {sum(invalid_psms)} PSMs with invalid amino acids.")
199
+ return psm_list[~invalid_psms]
200
+ else:
201
+ logger.debug("No PSMs with invalid amino acids found.")
202
+ return psm_list
@@ -198,6 +198,7 @@ def score_scatter_plot(
198
198
  after: mokapot.LinearConfidence,
199
199
  level: str = "psms",
200
200
  indexer: str = "index",
201
+ fdr_threshold: float = 0.01,
201
202
  ) -> go.Figure:
202
203
  """
203
204
  Plot PSM scores before and after rescoring.
@@ -242,12 +243,12 @@ def score_scatter_plot(
242
243
 
243
244
  # Get score thresholds
244
245
  score_threshold_before = (
245
- ce_psms[ce_psms["mokapot q-value before"] <= 0.01]
246
+ ce_psms[ce_psms["mokapot q-value before"] <= fdr_threshold]
246
247
  .sort_values("mokapot q-value before", ascending=False)["mokapot score before"]
247
248
  .iloc[0]
248
249
  )
249
250
  score_threshold_after = (
250
- ce_psms[ce_psms["mokapot q-value after"] <= 0.01]
251
+ ce_psms[ce_psms["mokapot q-value after"] <= fdr_threshold]
251
252
  .sort_values("mokapot q-value after", ascending=False)["mokapot score after"]
252
253
  .iloc[0]
253
254
  )
@@ -29,8 +29,11 @@ import pandas as pd
29
29
  import psm_utils
30
30
  from mokapot.brew import brew
31
31
  from mokapot.dataset import LinearPsmDataset
32
+ from mokapot.model import PercolatorModel
32
33
  from pyteomics.mass import nist_mass
33
34
 
35
+ from ms2rescore.exceptions import RescoringError
36
+
34
37
  logger = logging.getLogger(__name__)
35
38
  logging.getLogger("numba").setLevel(logging.WARNING)
36
39
 
@@ -39,6 +42,7 @@ def rescore(
39
42
  psm_list: psm_utils.PSMList,
40
43
  output_file_root: str = "ms2rescore",
41
44
  fasta_file: Optional[str] = None,
45
+ train_fdr: float = 0.01,
42
46
  write_weights: bool = False,
43
47
  write_txt: bool = False,
44
48
  write_flashlfq: bool = False,
@@ -65,6 +69,8 @@ def rescore(
65
69
  fasta_file
66
70
  Path to FASTA file with protein sequences to use for protein inference. Defaults to
67
71
  ``None``.
72
+ train_fdr
73
+ FDR to use for training the Mokapot model. Defaults to ``0.01``.
68
74
  write_weights
69
75
  Write model weights to a text file. Defaults to ``False``.
70
76
  write_txt
@@ -91,46 +97,15 @@ def rescore(
91
97
 
92
98
  # Rescore
93
99
  logger.debug(f"Mokapot brew options: `{kwargs}`")
94
- confidence_results, models = brew(lin_psm_data, rng=8, **kwargs)
95
-
96
- # Reshape confidence estimates to match PSMList
97
- keys = ["mokapot score", "mokapot q-value", "mokapot PEP"]
98
- mokapot_values_targets = (
99
- confidence_results.confidence_estimates["psms"].set_index("index").sort_index()[keys]
100
- )
101
- mokapot_values_decoys = (
102
- confidence_results.decoy_confidence_estimates["psms"].set_index("index").sort_index()[keys]
103
- )
104
- q = np.full((len(psm_list), 3), np.nan)
105
- q[mokapot_values_targets.index] = mokapot_values_targets.values
106
- q[mokapot_values_decoys.index] = mokapot_values_decoys.values
107
-
108
- # Add Mokapot results to PSMList
109
- psm_list["score"] = q[:, 0]
110
- psm_list["qvalue"] = q[:, 1]
111
- psm_list["pep"] = q[:, 2]
112
-
113
- # Repeat for peptide-level scores
114
- peptides_targets = confidence_results.confidence_estimates["peptides"].set_index(["peptide"])[
115
- keys
116
- ]
117
- peptides_decoys = confidence_results.decoy_confidence_estimates["peptides"].set_index(
118
- ["peptide"]
119
- )[keys]
120
- peptide_info = pd.concat([peptides_targets, peptides_decoys], axis=0).to_dict(orient="index")
121
-
122
- # Add peptide-level scores to PSM metadata
123
- # run_key = "na" if not all(psm.run for psm in psm_list) else None
124
- no_charge_pattern = re.compile(r"(/\d+$)")
125
- for psm in psm_list:
126
- peptide_scores = peptide_info[(no_charge_pattern.sub("", str(psm.peptidoform), 1))]
127
- psm.metadata.update(
128
- {
129
- "peptide_score": peptide_scores["mokapot score"],
130
- "peptide_qvalue": peptide_scores["mokapot q-value"],
131
- "peptide_pep": peptide_scores["mokapot PEP"],
132
- }
100
+ try:
101
+ confidence_results, models = brew(
102
+ lin_psm_data, model=PercolatorModel(train_fdr=train_fdr), rng=8, **kwargs
133
103
  )
104
+ except RuntimeError as e:
105
+ raise RescoringError("Mokapot could not be run. Please check the input data.") from e
106
+
107
+ add_psm_confidence(psm_list, confidence_results)
108
+ add_peptide_confidence(psm_list, confidence_results)
134
109
 
135
110
  # Write results
136
111
  if write_weights:
@@ -245,6 +220,55 @@ def save_model_weights(
245
220
  )
246
221
 
247
222
 
223
+ def add_psm_confidence(
224
+ psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence
225
+ ) -> None:
226
+ """Add Mokapot PSM-level confidence estimates to PSM list."""
227
+ # Reshape confidence estimates to match PSMList
228
+ keys = ["mokapot score", "mokapot q-value", "mokapot PEP"]
229
+ mokapot_values_targets = (
230
+ confidence_results.confidence_estimates["psms"].set_index("index").sort_index()[keys]
231
+ )
232
+ mokapot_values_decoys = (
233
+ confidence_results.decoy_confidence_estimates["psms"].set_index("index").sort_index()[keys]
234
+ )
235
+ q = np.full((len(psm_list), 3), np.nan)
236
+ q[mokapot_values_targets.index] = mokapot_values_targets.values
237
+ q[mokapot_values_decoys.index] = mokapot_values_decoys.values
238
+
239
+ # Add Mokapot results to PSMList
240
+ psm_list["score"] = q[:, 0]
241
+ psm_list["qvalue"] = q[:, 1]
242
+ psm_list["pep"] = q[:, 2]
243
+
244
+
245
+ def add_peptide_confidence(
246
+ psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence
247
+ ) -> None:
248
+ """Add Mokapot peptide-level confidence estimates to PSM list."""
249
+ keys = ["mokapot score", "mokapot q-value", "mokapot PEP"]
250
+ peptide_info = pd.concat(
251
+ [
252
+ confidence_results.confidence_estimates["peptides"].set_index("peptide")[keys],
253
+ confidence_results.decoy_confidence_estimates["peptides"].set_index("peptide")[keys],
254
+ ],
255
+ axis=0,
256
+ ).to_dict(orient="index")
257
+
258
+ # Add peptide-level scores to PSM metadata
259
+ # run_key = "na" if not all(psm.run for psm in psm_list) else None
260
+ no_charge_pattern = re.compile(r"(/\d+$)")
261
+ for psm in psm_list:
262
+ peptide_scores = peptide_info[(no_charge_pattern.sub("", str(psm.peptidoform), 1))]
263
+ psm.metadata.update(
264
+ {
265
+ "peptide_score": peptide_scores["mokapot score"],
266
+ "peptide_qvalue": peptide_scores["mokapot q-value"],
267
+ "peptide_pep": peptide_scores["mokapot PEP"],
268
+ }
269
+ )
270
+
271
+
248
272
  def _mz_to_mass(mz: float, charge: int) -> float:
249
273
  """Convert m/z to mass."""
250
274
  return mz * charge - charge * nist_mass["H"][1][0]
@@ -43,13 +43,14 @@ dependencies = [
43
43
  "mokapot>=0.9",
44
44
  "ms2pip>=4.0.0-dev10",
45
45
  "ms2rescore_rs",
46
- "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
47
- "numpy>=1.16.0; python_version != '3.11'",
46
+ # "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
47
+ # "numpy>=1.16.0; python_version != '3.11'",
48
+ "numpy>=1.16.0",
49
+ "scikit-learn==1.5.1; python_version == '3.11'",
48
50
  "pandas>=1.0",
49
51
  "plotly>=5",
50
- "psm_utils>=0.8",
51
- "pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils
52
- "pyteomics>=4.1.0, <4.7",
52
+ "psm_utils>=0.9",
53
+ "pyteomics>=4.7.2",
53
54
  "rich>=12",
54
55
  "tomli>=2; python_version < '3.11'",
55
56
  ]
File without changes