ms2rescore 3.1.0.dev6__tar.gz → 3.1.0.dev8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/PKG-INFO +5 -6
  2. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/__init__.py +1 -1
  3. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/core.py +117 -36
  4. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/exceptions.py +6 -0
  5. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/package_data/config_default.json +3 -0
  6. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/package_data/config_schema.json +25 -2
  7. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/parse_psms.py +74 -81
  8. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/charts.py +23 -14
  9. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/rescoring_engines/mokapot.py +66 -39
  10. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/rescoring_engines/percolator.py +2 -0
  11. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/pyproject.toml +6 -5
  12. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/LICENSE +0 -0
  13. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/README.md +0 -0
  14. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/__main__.py +0 -0
  15. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/config_parser.py +0 -0
  16. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/feature_generators/__init__.py +0 -0
  17. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/feature_generators/base.py +0 -0
  18. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/feature_generators/basic.py +0 -0
  19. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/feature_generators/deeplc.py +0 -0
  20. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/feature_generators/im2deep.py +0 -0
  21. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/feature_generators/ionmob.py +0 -0
  22. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/feature_generators/maxquant.py +0 -0
  23. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/feature_generators/ms2pip.py +0 -0
  24. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/gui/__init__.py +0 -0
  25. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/gui/__main__.py +0 -0
  26. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/gui/app.py +0 -0
  27. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/gui/function2ctk.py +0 -0
  28. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/gui/widgets.py +0 -0
  29. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/package_data/__init__.py +0 -0
  30. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/package_data/config_default_tims.json +0 -0
  31. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/package_data/img/__init__.py +0 -0
  32. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/package_data/img/config_icon.png +0 -0
  33. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/package_data/img/github-mark-white.png +0 -0
  34. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/package_data/img/github-mark.png +0 -0
  35. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/package_data/img/ms2rescore_logo.png +0 -0
  36. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/package_data/img/program_icon.ico +0 -0
  37. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/package_data/ms2rescore-gui-theme.json +0 -0
  38. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/parse_spectra.py +0 -0
  39. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/__init__.py +0 -0
  40. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/__main__.py +0 -0
  41. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/generate.py +0 -0
  42. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/templates/__init__.py +0 -0
  43. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/templates/about.html +0 -0
  44. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/templates/base.html +0 -0
  45. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/templates/config.html +0 -0
  46. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/templates/features.html +0 -0
  47. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/templates/log.html +0 -0
  48. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/templates/metadata.html +0 -0
  49. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/templates/overview.html +0 -0
  50. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/templates/stats-card.html +0 -0
  51. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/templates/style.html +0 -0
  52. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/templates/target-decoy.html +0 -0
  53. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/templates/texts.toml +0 -0
  54. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/report/utils.py +0 -0
  55. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/rescoring_engines/__init__.py +0 -0
  56. {ms2rescore-3.1.0.dev6 → ms2rescore-3.1.0.dev8}/ms2rescore/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ms2rescore
3
- Version: 3.1.0.dev6
3
+ Version: 3.1.0.dev8
4
4
  Summary: MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and retention times.
5
5
  Keywords: MS2Rescore,MS2PIP,DeepLC,Percolator,proteomics,mass spectrometry,peptide identification,rescoring,machine learning
6
6
  Author: Ana Sílvia C. Silva, Robbin Bouwmeester, Louise Buur
@@ -24,13 +24,12 @@ Requires-Dist: lxml>=4.5
24
24
  Requires-Dist: mokapot>=0.9
25
25
  Requires-Dist: ms2pip>=4.0.0-dev10
26
26
  Requires-Dist: ms2rescore_rs
27
- Requires-Dist: numpy==1.24.3; python_version == '3.11'
28
- Requires-Dist: numpy>=1.16.0; python_version != '3.11'
27
+ Requires-Dist: numpy>=1.16.0
28
+ Requires-Dist: scikit-learn==1.5.1; python_version == '3.11'
29
29
  Requires-Dist: pandas>=1.0
30
30
  Requires-Dist: plotly>=5
31
- Requires-Dist: psm_utils>=0.8
32
- Requires-Dist: pydantic>=1.8.2,<2
33
- Requires-Dist: pyteomics>=4.1.0, <4.7
31
+ Requires-Dist: psm_utils>=0.9
32
+ Requires-Dist: pyteomics>=4.7.2
34
33
  Requires-Dist: rich>=12
35
34
  Requires-Dist: tomli>=2; python_version < '3.11'
36
35
  Requires-Dist: ruff ; extra == "dev"
@@ -1,6 +1,6 @@
1
1
  """MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and RTs."""
2
2
 
3
- __version__ = "3.1.0-dev6"
3
+ __version__ = "3.1.0-dev8"
4
4
 
5
5
  from warnings import filterwarnings
6
6
 
@@ -5,6 +5,7 @@ from typing import Dict, Optional
5
5
 
6
6
  import numpy as np
7
7
  import psm_utils.io
8
+ from mokapot.dataset import LinearPsmDataset
8
9
  from psm_utils import PSMList
9
10
 
10
11
  from ms2rescore import exceptions
@@ -13,6 +14,7 @@ from ms2rescore.parse_psms import parse_psms
13
14
  from ms2rescore.parse_spectra import get_missing_values
14
15
  from ms2rescore.report import generate
15
16
  from ms2rescore.rescoring_engines import mokapot, percolator
17
+ from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence
16
18
 
17
19
  logger = logging.getLogger(__name__)
18
20
 
@@ -45,7 +47,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
45
47
  psm_list = parse_psms(config, psm_list)
46
48
 
47
49
  # Log #PSMs identified before rescoring
48
- id_psms_before = _log_id_psms_before(psm_list)
50
+ id_psms_before = _log_id_psms_before(psm_list, max_rank=config["max_psm_rank_output"])
49
51
 
50
52
  # Define feature names; get existing feature names from PSM file
51
53
  feature_names = dict()
@@ -60,7 +62,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
60
62
  )
61
63
 
62
64
  # Add missing precursor info from spectrum file if needed
63
- _fill_missing_precursor_info(psm_list, config)
65
+ psm_list = _fill_missing_precursor_info(psm_list, config)
64
66
 
65
67
  # Add rescoring features
66
68
  for fgen_name, fgen_config in config["feature_generators"].items():
@@ -104,8 +106,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
104
106
  logging.debug(f"Creating USIs for {len(psm_list)} PSMs")
105
107
  psm_list["spectrum_id"] = [psm.get_usi(as_url=False) for psm in psm_list]
106
108
 
107
- # If no rescoring engine is specified, write PSMs and features to PIN file
108
- if not config["rescoring_engine"]:
109
+ # If no rescoring engine is specified or DEBUG, write PSMs and features to PIN file
110
+ if not config["rescoring_engine"] or config["log_level"] == "debug":
109
111
  logger.info(f"Writing added features to PIN file: {output_file_root}.psms.pin")
110
112
  psm_utils.io.write_file(
111
113
  psm_list,
@@ -113,35 +115,49 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
113
115
  filetype="percolator",
114
116
  feature_names=all_feature_names,
115
117
  )
118
+
119
+ if not config["rescoring_engine"]:
120
+ logger.info("No rescoring engine specified. Skipping rescoring.")
116
121
  return None
117
122
 
118
123
  # Rescore PSMs
119
- if "percolator" in config["rescoring_engine"]:
120
- percolator.rescore(
121
- psm_list,
122
- output_file_root=output_file_root,
123
- log_level=config["log_level"],
124
- processes=config["processes"],
125
- percolator_kwargs=config["rescoring_engine"]["percolator"],
126
- )
127
- elif "mokapot" in config["rescoring_engine"]:
128
- if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
129
- config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
130
- if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
131
- protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
132
- else:
133
- protein_kwargs = dict()
134
-
135
- mokapot.rescore(
136
- psm_list,
137
- output_file_root=output_file_root,
138
- protein_kwargs=protein_kwargs,
139
- **config["rescoring_engine"]["mokapot"],
140
- )
141
- else:
142
- logger.info("No known rescoring engine specified. Skipping rescoring.")
124
+ try:
125
+ if "percolator" in config["rescoring_engine"]:
126
+ percolator.rescore(
127
+ psm_list,
128
+ output_file_root=output_file_root,
129
+ log_level=config["log_level"],
130
+ processes=config["processes"],
131
+ percolator_kwargs=config["rescoring_engine"]["percolator"],
132
+ )
133
+ elif "mokapot" in config["rescoring_engine"]:
134
+ if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
135
+ config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
136
+ if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
137
+ protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
138
+ else:
139
+ protein_kwargs = dict()
140
+
141
+ mokapot.rescore(
142
+ psm_list,
143
+ output_file_root=output_file_root,
144
+ protein_kwargs=protein_kwargs,
145
+ **config["rescoring_engine"]["mokapot"],
146
+ )
147
+ except exceptions.RescoringError as e:
148
+ # Write output
149
+ logger.info(f"Writing intermediary output to {output_file_root}.psms.tsv...")
150
+ psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv")
151
+
152
+ # Reraise exception
153
+ raise e
143
154
 
144
- _log_id_psms_after(psm_list, id_psms_before)
155
+ # Post-rescoring processing
156
+ if all(psm_list["pep"] == 1.0):
157
+ psm_list = _fix_constant_pep(psm_list)
158
+ psm_list = _filter_by_rank(psm_list, config["max_psm_rank_output"], False)
159
+ psm_list = _calculate_confidence(psm_list)
160
+ _ = _log_id_psms_after(psm_list, id_psms_before, max_rank=config["max_psm_rank_output"])
145
161
 
146
162
  # Write output
147
163
  logger.info(f"Writing output to {output_file_root}.psms.tsv...")
@@ -157,7 +173,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
157
173
  logger.exception(e)
158
174
 
159
175
 
160
- def _fill_missing_precursor_info(psm_list, config):
176
+ def _fill_missing_precursor_info(psm_list: PSMList, config: Dict) -> PSMList:
161
177
  """Fill missing precursor info from spectrum file if needed."""
162
178
  # Check if required
163
179
  # TODO: avoid hard coding feature generators in some way
@@ -199,6 +215,16 @@ def _fill_missing_precursor_info(psm_list, config):
199
215
  [v is not None and not np.isnan(v) for v in psm_list[value_name]]
200
216
  ]
201
217
 
218
+ return psm_list
219
+
220
+
221
+ def _filter_by_rank(psm_list: PSMList, max_rank: int, lower_score_better: bool) -> PSMList:
222
+ """Filter PSMs by rank."""
223
+ psm_list.set_ranks(lower_score_better=lower_score_better)
224
+ rank_filter = psm_list["rank"] <= max_rank
225
+ logger.info(f"Removed {sum(~rank_filter)} PSMs with rank >= {max_rank}.")
226
+ return psm_list[rank_filter]
227
+
202
228
 
203
229
  def _write_feature_names(feature_names, output_file_root):
204
230
  """Write feature names to file."""
@@ -209,25 +235,80 @@ def _write_feature_names(feature_names, output_file_root):
209
235
  f.write(f"{fgen}\t{feature}\n")
210
236
 
211
237
 
212
- def _log_id_psms_before(psm_list):
238
+ def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int:
213
239
  """Log #PSMs identified before rescoring."""
214
240
  id_psms_before = (
215
- (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712
241
+ (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"])
216
242
  ).sum()
217
- logger.info("Found %i identified PSMs at 1%% FDR before rescoring.", id_psms_before)
243
+ logger.info(
244
+ f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before "
245
+ "rescoring."
246
+ )
218
247
  return id_psms_before
219
248
 
220
249
 
221
- def _log_id_psms_after(psm_list, id_psms_before):
250
+ def _log_id_psms_after(
251
+ psm_list: PSMList, id_psms_before: int, fdr: float = 0.01, max_rank: int = 1
252
+ ) -> int:
222
253
  """Log #PSMs identified after rescoring."""
223
254
  id_psms_after = (
224
- (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712
255
+ (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"])
225
256
  ).sum()
226
257
  diff = id_psms_after - id_psms_before
227
258
  diff_perc = diff / id_psms_before if id_psms_before > 0 else None
228
259
 
229
260
  diff_numbers = f"{diff} ({diff_perc:.2%})" if diff_perc is not None else str(diff)
230
261
  diff_word = "more" if diff > 0 else "less"
231
- logger.info(f"Identified {diff_numbers} {diff_word} PSMs at 1% FDR after rescoring.")
262
+ logger.info(
263
+ f"Identified {diff_numbers} {diff_word} PSMs with rank <= {max_rank} at {fdr} FDR after "
264
+ "rescoring."
265
+ )
232
266
 
233
267
  return id_psms_after
268
+
269
+
270
+ def _fix_constant_pep(psm_list: PSMList) -> PSMList:
271
+ """Workaround for broken PEP calculation if best PSM is decoy."""
272
+ logger.warning(
273
+ "Attempting to fix constant PEP values by removing decoy PSMs that score higher than the "
274
+ "best target PSM."
275
+ )
276
+ max_target_score = psm_list["score"][~psm_list["is_decoy"]].max()
277
+ higher_scoring_decoys = psm_list["is_decoy"] & (psm_list["score"] > max_target_score)
278
+
279
+ if not higher_scoring_decoys.any():
280
+ logger.warning("No decoys scoring higher than the best target found. Skipping fix.")
281
+ else:
282
+ psm_list = psm_list[~higher_scoring_decoys]
283
+ logger.warning(f"Removed {higher_scoring_decoys.sum()} decoy PSMs.")
284
+
285
+ return psm_list
286
+
287
+
288
+ def _calculate_confidence(psm_list: PSMList) -> PSMList:
289
+ """
290
+ Calculate scores, q-values, and PEPs for PSMs and peptides and add them to PSMList.
291
+ """
292
+ # Minimal conversion to LinearPsmDataset
293
+ psm_df = psm_list.to_dataframe()
294
+ psm_df = psm_df.reset_index(drop=True).reset_index()
295
+ psm_df["peptide"] = (
296
+ psm_df["peptidoform"].astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True)
297
+ )
298
+ psm_df["is_target"] = ~psm_df["is_decoy"]
299
+ lin_psm_data = LinearPsmDataset(
300
+ psms=psm_df[["index", "peptide", "score", "is_target"]],
301
+ target_column="is_target",
302
+ spectrum_columns="index", # Use artificial index to allow multi-rank rescoring
303
+ peptide_column="peptide",
304
+ feature_columns=["score"],
305
+ )
306
+
307
+ # Recalculate confidence
308
+ new_confidence = lin_psm_data.assign_confidence()
309
+
310
+ # Add new confidence estimations to PSMList
311
+ add_psm_confidence(psm_list, new_confidence)
312
+ add_peptide_confidence(psm_list, new_confidence)
313
+
314
+ return psm_list
@@ -35,3 +35,9 @@ class ReportGenerationError(MS2RescoreError):
35
35
  """Error while generating report."""
36
36
 
37
37
  pass
38
+
39
+
40
+ class RescoringError(MS2RescoreError):
41
+ """Error while rescoring PSMs."""
42
+
43
+ pass
@@ -14,6 +14,7 @@
14
14
  },
15
15
  "rescoring_engine": {
16
16
  "mokapot": {
17
+ "train_fdr": 0.01,
17
18
  "write_weights": true,
18
19
  "write_txt": true,
19
20
  "write_flashlfq": true
@@ -32,6 +33,8 @@
32
33
  "psm_id_rt_pattern": null,
33
34
  "psm_id_im_pattern": null,
34
35
  "lower_score_is_better": false,
36
+ "max_psm_rank_input": 10,
37
+ "max_psm_rank_output": 1,
35
38
  "modification_mapping": {},
36
39
  "fixed_modifications": {},
37
40
  "processes": -1,
@@ -68,7 +68,11 @@
68
68
  },
69
69
  "psm_file": {
70
70
  "description": "Path to file with peptide-spectrum matches.",
71
- "oneOf": [{ "type": "string" }, { "type": "null" }, { "type": "array", "items": { "type": "string" } }]
71
+ "oneOf": [
72
+ { "type": "string" },
73
+ { "type": "null" },
74
+ { "type": "array", "items": { "type": "string" } }
75
+ ]
72
76
  },
73
77
  "psm_file_type": {
74
78
  "description": "PSM file type. By default inferred from file extension.",
@@ -127,6 +131,18 @@
127
131
  "type": "boolean",
128
132
  "default": false
129
133
  },
134
+ "max_psm_rank_input": {
135
+ "description": "Maximum rank of PSMs to use as input for rescoring",
136
+ "type": "number",
137
+ "default": 10,
138
+ "minimum": 1
139
+ },
140
+ "max_psm_rank_output": {
141
+ "description": "Maximum rank of PSMs to return after rescoring, before final FDR calculation",
142
+ "type": "number",
143
+ "default": 1,
144
+ "minimum": 1
145
+ },
130
146
  "modification_mapping": {
131
147
  "description": "Mapping of modification labels to each replacement label.",
132
148
  "type": "object",
@@ -159,7 +175,7 @@
159
175
  "default": false
160
176
  },
161
177
  "profile": {
162
- "description": "Write an txt report using cProfile for profiling",
178
+ "description": "Write a txt report using cProfile for profiling",
163
179
  "type": "boolean",
164
180
  "default": false
165
181
  }
@@ -263,6 +279,13 @@
263
279
  "type": "object",
264
280
  "additionalProperties": true,
265
281
  "properties": {
282
+ "train_fdr": {
283
+ "description": "FDR threshold for training Mokapot",
284
+ "type": "number",
285
+ "minimum": 0,
286
+ "maximum": 1,
287
+ "default": 0.01
288
+ },
266
289
  "write_weights": {
267
290
  "description": "Write Mokapot weights to a text file",
268
291
  "type": "boolean",
@@ -1,7 +1,8 @@
1
1
  import logging
2
2
  import re
3
- from typing import Dict, Union
3
+ from typing import Dict, Optional, Union
4
4
 
5
+ import numpy as np
5
6
  import psm_utils.io
6
7
  from psm_utils import PSMList
7
8
 
@@ -23,13 +24,30 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
23
24
  PSMList object containing PSMs. If None, PSMs will be read from ``psm_file``.
24
25
 
25
26
  """
26
- # Read PSMs, find decoys, calculate q-values
27
- psm_list = _read_psms(config, psm_list)
28
- _find_decoys(config, psm_list)
29
- _calculate_qvalues(config, psm_list)
27
+ # Read PSMs
28
+ try:
29
+ psm_list = _read_psms(config, psm_list)
30
+ except psm_utils.io.PSMUtilsIOException:
31
+ raise MS2RescoreConfigurationError(
32
+ "Error occurred while reading PSMs. Please check the 'psm_file' and "
33
+ "'psm_file_type' settings. See "
34
+ "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
35
+ " for more information."
36
+ )
37
+
38
+ # Filter by PSM rank
39
+ psm_list.set_ranks(config["lower_score_is_better"])
40
+ rank_filter = psm_list["rank"] <= config["max_psm_rank_input"]
41
+ psm_list = psm_list[rank_filter]
42
+ logger.info(f"Removed {sum(~rank_filter)} PSMs with rank >= {config['max_psm_rank_input']}.")
43
+
44
+ # Remove invalid AAs, find decoys, calculate q-values
45
+ psm_list = _remove_invalid_aa(psm_list)
46
+ _find_decoys(psm_list, config["id_decoy_pattern"])
47
+ _calculate_qvalues(psm_list, config["lower_score_is_better"])
30
48
  if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]:
31
49
  logger.debug("Parsing retention time and/or ion mobility from PSM identifier...")
32
- _parse_values_spectrum_id(config, psm_list)
50
+ _parse_values_from_spectrum_id(config, psm_list)
33
51
 
34
52
  # Store scoring values for comparison later
35
53
  for psm in psm_list:
@@ -70,10 +88,6 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
70
88
  new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
71
89
  psm_list["spectrum_id"] = new_ids
72
90
 
73
- # TODO: Temporary fix until implemented in psm_utils
74
- # Ensure that spectrum IDs are strings (Pydantic 2.0 does not coerce int to str)
75
- psm_list["spectrum_id"] = [str(spec_id) for spec_id in psm_list["spectrum_id"]]
76
-
77
91
  return psm_list
78
92
 
79
93
 
@@ -81,49 +95,30 @@ def _read_psms(config, psm_list):
81
95
  if isinstance(psm_list, PSMList):
82
96
  return psm_list
83
97
  else:
84
- logger.info("Reading PSMs from file...")
85
- current_file = 1
86
98
  total_files = len(config["psm_file"])
87
- valid_psms_list = []
88
- total_psms = 0
89
- valid_psms = 0
90
- for psm_file in config["psm_file"]:
99
+ psm_list = []
100
+ for current_file, psm_file in enumerate(config["psm_file"]):
91
101
  logger.info(
92
- f"Reading PSMs from PSM file ({current_file}/{total_files}): '{psm_file}'..."
102
+ f"Reading PSMs from PSM file ({current_file+1}/{total_files}): '{psm_file}'..."
93
103
  )
94
- try:
95
- id_file_psm_list = psm_utils.io.read_file(
104
+ psm_list.extend(
105
+ psm_utils.io.read_file(
96
106
  psm_file,
97
107
  filetype=config["psm_file_type"],
98
108
  show_progressbar=True,
99
109
  **config["psm_reader_kwargs"],
100
110
  )
101
- except psm_utils.io.PSMUtilsIOException:
102
- raise MS2RescoreConfigurationError(
103
- "Error occurred while reading PSMs. Please check the 'psm_file' and "
104
- "'psm_file_type' settings. See "
105
- "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
106
- " for more information."
107
- )
108
-
109
- total_psms += len(id_file_psm_list.psm_list)
110
- for psm in id_file_psm_list.psm_list:
111
- if not _has_invalid_aminoacids(psm):
112
- valid_psms_list.append(psm)
113
- valid_psms += 1
114
- current_file += 1
115
- if total_psms - valid_psms > 0:
116
- logger.warning(
117
- f"{total_psms - valid_psms} PSMs with invalid amino acids were removed."
118
111
  )
119
- return PSMList(psm_list=valid_psms_list)
112
+ logger.debug(f"Read {len(psm_list)} PSMs from '{psm_file}'.")
113
+
114
+ return PSMList(psm_list=psm_list)
120
115
 
121
116
 
122
- def _find_decoys(config, psm_list):
117
+ def _find_decoys(psm_list: PSMList, id_decoy_pattern: Optional[str] = None):
123
118
  """Find decoys in PSMs, log amount, and raise error if none found."""
124
119
  logger.debug("Finding decoys...")
125
- if config["id_decoy_pattern"]:
126
- psm_list.find_decoys(config["id_decoy_pattern"])
120
+ if id_decoy_pattern:
121
+ psm_list.find_decoys(id_decoy_pattern)
127
122
 
128
123
  n_psms = len(psm_list)
129
124
  percent_decoys = sum(psm_list["is_decoy"]) / n_psms * 100
@@ -138,12 +133,12 @@ def _find_decoys(config, psm_list):
138
133
  )
139
134
 
140
135
 
141
- def _calculate_qvalues(config, psm_list):
136
+ def _calculate_qvalues(psm_list: PSMList, lower_score_is_better: bool):
142
137
  """Calculate q-values for PSMs if not present."""
143
138
  # Calculate q-values if not present
144
139
  if None in psm_list["qvalue"]:
145
140
  logger.debug("Recalculating q-values...")
146
- psm_list.calculate_qvalues(reverse=not config["lower_score_is_better"])
141
+ psm_list.calculate_qvalues(reverse=not lower_score_is_better)
147
142
 
148
143
 
149
144
  def _match_psm_ids(old_id, regex_pattern):
@@ -158,47 +153,45 @@ def _match_psm_ids(old_id, regex_pattern):
158
153
  )
159
154
 
160
155
 
161
- def _parse_values_spectrum_id(config, psm_list):
156
+ def _parse_values_from_spectrum_id(
157
+ psm_list: PSMList,
158
+ psm_id_rt_pattern: Optional[str] = None,
159
+ psm_id_im_pattern: Optional[str] = None,
160
+ ):
162
161
  """Parse retention time and or ion mobility values from the spectrum_id."""
163
-
164
- if config["psm_id_rt_pattern"]:
165
- logger.debug(
166
- "Parsing retention time from spectrum_id with regex pattern "
167
- f"{config['psm_id_rt_pattern']}"
168
- )
169
- try:
170
- rt_pattern = re.compile(config["psm_id_rt_pattern"])
171
- psm_list["retention_time"] = [
172
- float(rt_pattern.search(psm.spectrum_id).group(1)) for psm in psm_list
173
- ]
174
- except AttributeError:
175
- raise MS2RescoreConfigurationError(
176
- f"Could not parse retention time from spectrum_id with the "
177
- f"{config['psm_id_rt_pattern']} regex pattern. "
178
- "Please make sure the retention time key is present in the spectrum_id "
179
- "and the value is in a capturing group or disable the relevant feature generator."
180
- )
181
-
182
- if config["psm_id_im_pattern"]:
183
- logger.debug(
184
- "Parsing ion mobility from spectrum_id with regex pattern "
185
- f"{config['psm_id_im_pattern']}"
186
- )
187
- try:
188
- im_pattern = re.compile(config["psm_id_im_pattern"])
189
- psm_list["ion_mobility"] = [
190
- float(im_pattern.search(psm.spectrum_id).group(1)) for psm in psm_list
191
- ]
192
- except AttributeError:
193
- raise MS2RescoreConfigurationError(
194
- f"Could not parse ion mobility from spectrum_id with the "
195
- f"{config['psm_id_im_pattern']} regex pattern. "
196
- "Please make sure the ion mobility key is present in the spectrum_id "
197
- "and the value is in a capturing group or disable the relevant feature generator."
162
+ for pattern, label, key in zip(
163
+ [psm_id_rt_pattern, psm_id_im_pattern],
164
+ ["retention time", "ion mobility"],
165
+ ["retention_time", "ion_mobility"],
166
+ ):
167
+ if pattern:
168
+ logger.debug(
169
+ f"Parsing {label} from spectrum_id with regex pattern " f"{psm_id_rt_pattern}"
198
170
  )
171
+ try:
172
+ pattern = re.compile(pattern)
173
+ psm_list[key] = [
174
+ float(pattern.search(psm.spectrum_id).group(1)) for psm in psm_list
175
+ ]
176
+ except AttributeError:
177
+ raise MS2RescoreConfigurationError(
178
+ f"Could not parse {label} from spectrum_id with the "
179
+ f"{pattern} regex pattern. "
180
+ f"Example spectrum_id: '{psm_list[0].spectrum_id}'\n. "
181
+ f"Please make sure the {label} key is present in the spectrum_id "
182
+ "and the value is in a capturing group or disable the relevant feature generator."
183
+ )
199
184
 
200
185
 
201
- def _has_invalid_aminoacids(psm):
202
- """Check if a PSM contains invalid amino acids."""
186
+ def _remove_invalid_aa(psm_list: PSMList) -> PSMList:
187
+ """Remove PSMs with invalid amino acids."""
188
+ invalid_psms = np.array(
189
+ [any(aa in "BJOUXZ" for aa in psm.peptidoform.sequence) for psm in psm_list]
190
+ )
203
191
 
204
- return any(aa not in "ACDEFGHIKLMNPQRSTVWY" for aa in psm.peptidoform.sequence)
192
+ if any(invalid_psms):
193
+ logger.warning(f"Removed {sum(invalid_psms)} PSMs with invalid amino acids.")
194
+ return psm_list[~invalid_psms]
195
+ else:
196
+ logger.debug("No PSMs with invalid amino acids found.")
197
+ return psm_list
@@ -198,6 +198,7 @@ def score_scatter_plot(
198
198
  after: mokapot.LinearConfidence,
199
199
  level: str = "psms",
200
200
  indexer: str = "index",
201
+ fdr_threshold: float = 0.01,
201
202
  ) -> go.Figure:
202
203
  """
203
204
  Plot PSM scores before and after rescoring.
@@ -241,16 +242,22 @@ def score_scatter_plot(
241
242
  ce_psms = pd.concat([ce_psms_targets, ce_psms_decoys], axis=0)
242
243
 
243
244
  # Get score thresholds
244
- score_threshold_before = (
245
- ce_psms[ce_psms["mokapot q-value before"] <= 0.01]
246
- .sort_values("mokapot q-value before", ascending=False)["mokapot score before"]
247
- .iloc[0]
248
- )
249
- score_threshold_after = (
250
- ce_psms[ce_psms["mokapot q-value after"] <= 0.01]
251
- .sort_values("mokapot q-value after", ascending=False)["mokapot score after"]
252
- .iloc[0]
253
- )
245
+ try:
246
+ score_threshold_before = (
247
+ ce_psms[ce_psms["mokapot q-value before"] <= fdr_threshold]
248
+ .sort_values("mokapot q-value before", ascending=False)["mokapot score before"]
249
+ .iloc[0]
250
+ )
251
+ except IndexError: # No PSMs below threshold
252
+ score_threshold_before = None
253
+ try:
254
+ score_threshold_after = (
255
+ ce_psms[ce_psms["mokapot q-value after"] <= fdr_threshold]
256
+ .sort_values("mokapot q-value after", ascending=False)["mokapot score after"]
257
+ .iloc[0]
258
+ )
259
+ except IndexError: # No PSMs below threshold
260
+ score_threshold_after = None
254
261
 
255
262
  # Plot
256
263
  fig = px.scatter(
@@ -267,10 +274,12 @@ def score_scatter_plot(
267
274
  },
268
275
  )
269
276
  # draw FDR thresholds
270
- fig.add_vline(x=score_threshold_before, line_dash="dash", row=1, col=1)
271
- fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=1)
272
- fig.add_vline(x=score_threshold_before, line_dash="dash", row=2, col=1)
273
- fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=2)
277
+ if score_threshold_before:
278
+ fig.add_vline(x=score_threshold_before, line_dash="dash", row=1, col=1)
279
+ fig.add_vline(x=score_threshold_before, line_dash="dash", row=2, col=1)
280
+ if score_threshold_after:
281
+ fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=1)
282
+ fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=2)
274
283
 
275
284
  return fig
276
285
 
@@ -29,8 +29,11 @@ import pandas as pd
29
29
  import psm_utils
30
30
  from mokapot.brew import brew
31
31
  from mokapot.dataset import LinearPsmDataset
32
+ from mokapot.model import PercolatorModel
32
33
  from pyteomics.mass import nist_mass
33
34
 
35
+ from ms2rescore.exceptions import RescoringError
36
+
34
37
  logger = logging.getLogger(__name__)
35
38
  logging.getLogger("numba").setLevel(logging.WARNING)
36
39
 
@@ -39,6 +42,7 @@ def rescore(
39
42
  psm_list: psm_utils.PSMList,
40
43
  output_file_root: str = "ms2rescore",
41
44
  fasta_file: Optional[str] = None,
45
+ train_fdr: float = 0.01,
42
46
  write_weights: bool = False,
43
47
  write_txt: bool = False,
44
48
  write_flashlfq: bool = False,
@@ -65,6 +69,8 @@ def rescore(
65
69
  fasta_file
66
70
  Path to FASTA file with protein sequences to use for protein inference. Defaults to
67
71
  ``None``.
72
+ train_fdr
73
+ FDR to use for training the Mokapot model. Defaults to ``0.01``.
68
74
  write_weights
69
75
  Write model weights to a text file. Defaults to ``False``.
70
76
  write_txt
@@ -91,46 +97,15 @@ def rescore(
91
97
 
92
98
  # Rescore
93
99
  logger.debug(f"Mokapot brew options: `{kwargs}`")
94
- confidence_results, models = brew(lin_psm_data, rng=8, **kwargs)
95
-
96
- # Reshape confidence estimates to match PSMList
97
- keys = ["mokapot score", "mokapot q-value", "mokapot PEP"]
98
- mokapot_values_targets = (
99
- confidence_results.confidence_estimates["psms"].set_index("index").sort_index()[keys]
100
- )
101
- mokapot_values_decoys = (
102
- confidence_results.decoy_confidence_estimates["psms"].set_index("index").sort_index()[keys]
103
- )
104
- q = np.full((len(psm_list), 3), np.nan)
105
- q[mokapot_values_targets.index] = mokapot_values_targets.values
106
- q[mokapot_values_decoys.index] = mokapot_values_decoys.values
107
-
108
- # Add Mokapot results to PSMList
109
- psm_list["score"] = q[:, 0]
110
- psm_list["qvalue"] = q[:, 1]
111
- psm_list["pep"] = q[:, 2]
112
-
113
- # Repeat for peptide-level scores
114
- peptides_targets = confidence_results.confidence_estimates["peptides"].set_index(["peptide"])[
115
- keys
116
- ]
117
- peptides_decoys = confidence_results.decoy_confidence_estimates["peptides"].set_index(
118
- ["peptide"]
119
- )[keys]
120
- peptide_info = pd.concat([peptides_targets, peptides_decoys], axis=0).to_dict(orient="index")
121
-
122
- # Add peptide-level scores to PSM metadata
123
- # run_key = "na" if not all(psm.run for psm in psm_list) else None
124
- no_charge_pattern = re.compile(r"(/\d+$)")
125
- for psm in psm_list:
126
- peptide_scores = peptide_info[(no_charge_pattern.sub("", str(psm.peptidoform), 1))]
127
- psm.metadata.update(
128
- {
129
- "peptide_score": peptide_scores["mokapot score"],
130
- "peptide_qvalue": peptide_scores["mokapot q-value"],
131
- "peptide_pep": peptide_scores["mokapot PEP"],
132
- }
100
+ try:
101
+ confidence_results, models = brew(
102
+ lin_psm_data, model=PercolatorModel(train_fdr=train_fdr), rng=8, **kwargs
133
103
  )
104
+ except RuntimeError as e:
105
+ raise RescoringError("Mokapot could not be run. Please check the input data.") from e
106
+
107
+ add_psm_confidence(psm_list, confidence_results)
108
+ add_peptide_confidence(psm_list, confidence_results)
134
109
 
135
110
  # Write results
136
111
  if write_weights:
@@ -245,6 +220,58 @@ def save_model_weights(
245
220
  )
246
221
 
247
222
 
223
def add_psm_confidence(
    psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence
) -> None:
    """
    Copy Mokapot PSM-level confidence estimates onto the PSM list.

    The ``score``, ``qvalue``, and ``pep`` fields of ``psm_list`` are overwritten in place and
    the PSM ranks are recomputed afterwards so that they reflect the new scores.

    Parameters
    ----------
    psm_list
        PSM list to update in place.
    confidence_results
        Mokapot confidence results providing target and decoy PSM-level estimates.

    """
    columns = ["mokapot score", "mokapot q-value", "mokapot PEP"]

    # One row per PSM; rows that Mokapot does not report on stay NaN.
    estimates = np.full((len(psm_list), 3), np.nan)

    # Targets are written first, then decoys; the values of the "index" column select the row
    # of ``estimates`` that each Mokapot result is written to.
    for psm_table in (
        confidence_results.confidence_estimates["psms"],
        confidence_results.decoy_confidence_estimates["psms"],
    ):
        ordered = psm_table.set_index("index").sort_index()[columns]
        estimates[ordered.index] = ordered.values

    # Column order of ``estimates`` follows ``columns``: score, q-value, PEP.
    for position, field in enumerate(("score", "qvalue", "pep")):
        psm_list[field] = estimates[:, position]

    # Higher Mokapot scores are better, so rank accordingly.
    psm_list.set_ranks(lower_score_better=False)
246
+
247
+
248
def add_peptide_confidence(
    psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence
) -> None:
    """
    Attach Mokapot peptide-level confidence estimates to each PSM's metadata.

    For every PSM, the peptide-level estimates are looked up by the string form of its
    peptidoform with a trailing ``/<charge>`` suffix removed, and stored in ``psm.metadata``
    under ``peptide_score``, ``peptide_qvalue``, and ``peptide_pep``.

    Parameters
    ----------
    psm_list
        PSM list whose PSM metadata is updated in place.
    confidence_results
        Mokapot confidence results providing target and decoy peptide-level estimates.

    """
    columns = ["mokapot score", "mokapot q-value", "mokapot PEP"]
    metadata_fields = (
        ("peptide_score", "mokapot score"),
        ("peptide_qvalue", "mokapot q-value"),
        ("peptide_pep", "mokapot PEP"),
    )

    # Stack target and decoy peptide tables and turn them into a
    # ``{peptide: {column: value}}`` lookup.
    target_peptides = confidence_results.confidence_estimates["peptides"]
    decoy_peptides = confidence_results.decoy_confidence_estimates["peptides"]
    estimates_by_peptide = pd.concat(
        [table.set_index("peptide")[columns] for table in (target_peptides, decoy_peptides)],
        axis=0,
    ).to_dict(orient="index")

    # Strip the trailing "/<charge>" from the peptidoform string before using it as lookup key.
    trailing_charge = re.compile(r"(/\d+$)")
    for psm in psm_list:
        peptide_key = trailing_charge.sub("", str(psm.peptidoform), 1)
        estimates = estimates_by_peptide[peptide_key]
        psm.metadata.update({field: estimates[column] for field, column in metadata_fields})
273
+
274
+
248
275
def _mz_to_mass(mz: float, charge: int) -> float:
    """Return the mass for an m/z value, subtracting one hydrogen mass per charge."""
    # First entry of the ``nist_mass["H"][1]`` record is used as the per-charge mass.
    hydrogen_mass = nist_mass["H"][1][0]
    return mz * charge - charge * hydrogen_mass
@@ -175,6 +175,8 @@ def _update_psm_scores(
175
175
  original_psm["qvalue"] = new_psm["qvalue"]
176
176
  original_psm["pep"] = new_psm["pep"]
177
177
 
178
+ psm_list.set_ranks(lower_score_better=False)
179
+
178
180
 
179
181
  def _write_pin_file(psm_list: psm_utils.PSMList, filepath: str):
180
182
  """Write PIN file for rescoring."""
@@ -43,13 +43,14 @@ dependencies = [
43
43
  "mokapot>=0.9",
44
44
  "ms2pip>=4.0.0-dev10",
45
45
  "ms2rescore_rs",
46
- "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
47
- "numpy>=1.16.0; python_version != '3.11'",
46
+ # "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
47
+ # "numpy>=1.16.0; python_version != '3.11'",
48
+ "numpy>=1.16.0",
49
+ "scikit-learn==1.5.1; python_version == '3.11'",
48
50
  "pandas>=1.0",
49
51
  "plotly>=5",
50
- "psm_utils>=0.8",
51
- "pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils
52
- "pyteomics>=4.1.0, <4.7",
52
+ "psm_utils>=0.9",
53
+ "pyteomics>=4.7.2",
53
54
  "rich>=12",
54
55
  "tomli>=2; python_version < '3.11'",
55
56
  ]
File without changes