imspy-search 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,836 @@
1
+ """IMSPY DDA CLI - timsTOF DDA proteomics analysis using imspy and sagepy."""
2
+
3
+ import argparse
4
+ import logging
5
+ import os
6
+ import sys
7
+ import time
8
+ import toml
9
+
10
+ import mokapot
11
+ import pandas as pd
12
+ import numpy as np
13
+
14
+ from pathlib import Path
15
+
16
+ from imspy_search.mgf import mgf_to_sagepy_query
17
+ from sagepy.core import Precursor, Tolerance, SpectrumProcessor, Scorer, EnzymeBuilder, SageSearchConfiguration
18
+ from sagepy.core.scoring import associate_fragment_ions_with_prosit_predicted_intensities, ScoreType
19
+ from sagepy.qfdr.tdc import target_decoy_competition_pandas, assign_sage_spectrum_q, assign_sage_peptide_q, \
20
+ assign_sage_protein_q
21
+
22
+ from imspy_predictors import (
23
+ DeepPeptideIonMobilityApex, load_deep_ccs_predictor,
24
+ load_tokenizer_from_resources,
25
+ DeepChromatographyApex, load_deep_retention_time_predictor,
26
+ Prosit2023TimsTofWrapper,
27
+ get_collision_energy_calibration_factor,
28
+ )
29
+
30
+ from imspy_core.timstof import TimsDatasetDDA
31
+
32
+ from sklearn.svm import SVC
33
+ from sagepy.rescore.rescore import rescore_psms
34
+ from sagepy.core.fdr import sage_fdr_psm
35
+
36
+ from imspy_search.utility import (
37
+ sanitize_mz, sanitize_charge, get_searchable_spec, split_fasta,
38
+ write_psms_binary, merge_dicts_with_merge_dict, generate_balanced_rt_dataset,
39
+ generate_balanced_im_dataset, linear_map, check_memory, parse_to_tims2rescore
40
+ )
41
+
42
+ from sagepy.rescore.utility import transform_psm_to_mokapot_pin
43
+ from sagepy.utility import psm_collection_to_pandas, apply_mz_calibration
44
+ from sagepy.utility import decompress_psms, compress_psms
45
+
46
+ # Suppress tensorflow warnings
47
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
48
+
49
+
50
def configure_gpu_memory(memory_limit_gb: int = 4) -> None:
    """Cap the GPU memory TensorFlow may grab on every visible device.

    Registers one virtual device per physical GPU, each limited to
    ``memory_limit_gb`` gigabytes. Must run before TensorFlow touches the
    devices; if they are already initialized, TensorFlow raises
    ``RuntimeError``, which is printed and otherwise ignored.
    """
    import tensorflow as tf

    physical_gpus = tf.config.experimental.list_physical_devices('GPU')
    if not physical_gpus:
        return
    limit_mb = 1024 * memory_limit_gb
    try:
        for idx in range(len(physical_gpus)):
            virtual_config = [
                tf.config.experimental.VirtualDeviceConfiguration(memory_limit=limit_mb)
            ]
            tf.config.experimental.set_virtual_device_configuration(
                physical_gpus[idx], virtual_config
            )
            print(f"GPU: {idx} memory restricted to {memory_limit_gb}GB.")
    except RuntimeError as e:
        print(e)
65
+
66
+
67
def create_database(fasta, static, variab, enzyme_builder, generate_decoys, bucket_size,
                    shuffle_decoys=True, keep_ends=True):
    """Build and return a sagepy indexed peptide database for searching.

    Wraps ``SageSearchConfiguration``: digests *fasta* with *enzyme_builder*,
    applies the static/variable modification maps, and optionally generates
    (shuffled) decoys before indexing into *bucket_size*-sized buckets.
    """
    search_settings = dict(
        fasta=fasta,
        static_mods=static,
        variable_mods=variab,
        enzyme_builder=enzyme_builder,
        generate_decoys=generate_decoys,
        bucket_size=bucket_size,
        shuffle_decoys=shuffle_decoys,
        keep_ends=keep_ends,
    )
    return SageSearchConfiguration(**search_settings).generate_indexed_database()
81
+
82
+
83
def load_config(config_path):
    """Parse the TOML file at *config_path* and return its contents as a dict."""
    with open(config_path, 'r') as handle:
        return toml.load(handle)
88
+
89
+
90
def main():
    """Command-line entry point for the imspy-dda timsTOF DDA search pipeline.

    Pipeline (configuration comes from a TOML file; any CLI flag the user
    actually supplies overrides the corresponding TOML value):
      1. Parse CLI arguments and overlay them on the TOML configuration.
      2. Discover Bruker ``.d`` raw folders below ``path`` (plus exactly one
         ``.mgf`` per folder when ``--use_mgf`` is set).
      3. Digest the FASTA (optionally split into batches) into sagepy
         indexed database(s).
      4. Per raw folder: load and aggregate PASEF fragments, score PSMs
         against the database(s), optionally calibrate m/z, predict fragment
         intensities / ion mobility / retention time with deep models, and
         persist the compressed PSMs to ``<path>/imspy/psm``.
      5. Globally: assign q-values, re-score with an SVC (and optionally
         mokapot), run target-decoy competition and write the PSM and
         peptide result tables under ``<path>/imspy``.
    """
    # Configure GPU memory before TensorFlow is used
    configure_gpu_memory(memory_limit_gb=4)

    # Check memory (see imspy_search.utility.check_memory for semantics).
    check_memory(limit_in_gb=16)

    # Use argparse to parse command line arguments
    parser = argparse.ArgumentParser(
        description='IMSPY - timsTOF DDA - PROTEOMICS IMS DDA data analysis using imspy and sagepy.'
    )

    # Required arguments
    parser.add_argument(
        "path",
        type=str,
        help="Path to bruker raw folders (.d) containing RAW files"
    )
    parser.add_argument(
        "fasta",
        type=str,
        help="Path to the fasta file of proteins to be digested"
    )

    # Path to the script directory for default config
    script_dir = Path(__file__).parent.parent
    default_config_path = script_dir / "configs" / "config_tryptic.toml"

    parser.add_argument(
        "--config",
        type=str,
        default=default_config_path,
        help="Path to the configuration file (TOML format). Default: configs/config_tryptic.toml"
    )

    # Optional verbosity flag
    parser.add_argument("-nv", "--no_verbose", dest="verbose", action="store_false", help="Decrease output verbosity")
    parser.set_defaults(verbose=True)

    # NOTE: almost every optional flag below defaults to None (via
    # set_defaults) so the CLI-over-config merge further down can tell
    # "flag not given" apart from an explicit False/0 value.

    # FASTA batch size
    parser.add_argument("-fbs", "--fasta_batch_size", type=int, default=None, help="Batch size for fasta file")
    parser.add_argument("--no_re_score_mokapot", dest="re_score_mokapot", action="store_false",
                        help="Do not re-score PSMs using mokapot")
    parser.set_defaults(re_score_mokapot=None)

    # SAGE isolation window settings
    parser.add_argument("--isolation_window_lower", type=float, default=None)
    parser.add_argument("--isolation_window_upper", type=float, default=None)

    # Tolerance settings (the *_da flags switch ppm -> Dalton units)
    parser.add_argument("--precursor_tolerance_da", dest="precursor_tolerance_da", action="store_true")
    parser.set_defaults(precursor_tolerance_da=None)
    parser.add_argument("--fragment_tolerance_da", dest="fragment_tolerance_da", action="store_true")
    parser.set_defaults(fragment_tolerance_da=None)
    parser.add_argument("--precursor_tolerance_lower", type=float, default=None)
    parser.add_argument("--precursor_tolerance_upper", type=float, default=None)
    parser.add_argument("--fragment_tolerance_lower", type=float, default=None)
    parser.add_argument("--fragment_tolerance_upper", type=float, default=None)

    # Scoring settings
    parser.add_argument("--min_isotope_err", type=int, default=None)
    parser.add_argument("--max_isotope_err", type=int, default=None)
    parser.add_argument("--report_psms", type=int, default=None)
    parser.add_argument("--min_matched_peaks", type=int, default=None)
    parser.add_argument("--no_match_annotation", dest="annotate_matches", action="store_false")
    parser.set_defaults(annotate_matches=None)
    parser.add_argument("--score_type", type=str, default=None)
    parser.add_argument("--take_top_n", type=int, default=None)

    # Enzyme settings
    parser.add_argument("--missed_cleavages", type=int, default=None)
    parser.add_argument("--min_len", type=int, default=None)
    parser.add_argument("--max_len", type=int, default=None)
    parser.add_argument("--cleave_at", type=str, default=None)
    parser.add_argument("--restrict", type=str, default=None)
    parser.add_argument("--not_c_terminal", dest="c_terminal", action="store_false")
    parser.set_defaults(c_terminal=None)

    # Database settings
    parser.add_argument("--calibrate_mz", dest="calibrate_mz", action="store_true")
    parser.set_defaults(calibrate_mz=None)
    parser.add_argument("--no_decoys", dest="decoys", action="store_false")
    parser.set_defaults(decoys=None)
    parser.add_argument("--shuffle_decoys", dest="shuffle_decoys", action="store_true")
    parser.set_defaults(shuffle_decoys=None)
    parser.add_argument("--include_peptide_ends", dest="keep_ends", action="store_false")
    parser.set_defaults(keep_ends=None)
    parser.add_argument("--fragment_max_mz", type=float, default=None)
    parser.add_argument("--bucket_size", type=int, default=None)
    parser.add_argument("--max_fragment_charge", type=int, default=None)
    parser.add_argument("--randomize_fasta_split", dest="randomize_fasta_split", action="store_true")
    parser.set_defaults(randomize_fasta_split=None)

    # Re-scoring settings
    parser.add_argument("--re_score_num_splits", type=int, default=None)
    parser.add_argument("--re_score_metric", type=str, default=None)

    # FDR settings
    parser.add_argument("--fdr_threshold", type=float, default=None)
    parser.add_argument("--fdr_psm_method", type=str, default=None)
    parser.add_argument("--fdr_peptide_method", type=str, default=None)
    parser.add_argument("--fdr_score", type=str, default=None)

    # Other settings
    parser.add_argument("--num_threads", type=int, default=None)
    parser.add_argument("--no_remove_decoys", dest="remove_decoys", action="store_false")
    parser.set_defaults(remove_decoys=None)
    parser.add_argument("--no_balanced_re_score", dest="balanced_re_score", action="store_false")
    parser.set_defaults(balanced_re_score=None)
    parser.add_argument("--in_memory", dest="in_memory", action="store_true")
    parser.set_defaults(in_memory=None)
    parser.add_argument("--no_bruker_sdk", dest="bruker_sdk", action="store_false")
    parser.set_defaults(bruker_sdk=None)

    # Refinement settings (fine-tune the deep models on this run's data)
    parser.add_argument("--refine_rt", dest="refine_rt", action="store_true")
    parser.set_defaults(refine_rt=None)
    parser.add_argument("--refine_im", dest="refine_im", action="store_true")
    parser.set_defaults(refine_im=None)
    parser.add_argument("--refinement_verbose", dest="refinement_verbose", action="store_true")
    parser.set_defaults(refinement_verbose=None)

    # Batch sizes
    parser.add_argument("--intensity_prediction_batch_size", type=int, default=None)
    parser.add_argument("--model_fine_tune_batch_size", type=int, default=None)
    parser.add_argument("--sample_size_collision_energy_calibration", type=int, default=None)
    parser.add_argument("--tims2rescore_table", dest="tims2rescore_table", action="store_true")
    parser.set_defaults(tims2rescore_table=None)
    parser.add_argument("--use_mgf", action="store_true")
    parser.set_defaults(use_mgf=None)

    args = parser.parse_args()

    # Load the configuration from the specified file
    config = load_config(args.config)

    # Initialize parameters with defaults from the config file
    params = {
        'variable_modifications': config.get('variable_modifications', {}),
        'static_modifications': config.get('static_modifications', {}),
        'score_type': config.get('scoring', {}).get('score_type', 'openmshyperscore'),
        'report_psms': config.get('scoring', {}).get('report_psms', 5),
        'min_matched_peaks': config.get('scoring', {}).get('min_matched_peaks', 5),
        'annotate_matches': config.get('scoring', {}).get('annotate_matches', True),
        'max_fragment_charge': config.get('scoring', {}).get('max_fragment_charge', 2),
        'precursor_tolerance_da': config.get('precursor_tolerance', {}).get('use_da', False),
        'precursor_tolerance_lower': config.get('precursor_tolerance', {}).get('lower', -15.0),
        'precursor_tolerance_upper': config.get('precursor_tolerance', {}).get('upper', 15.0),
        'fragment_tolerance_da': config.get('fragment_tolerance', {}).get('use_da', False),
        'fragment_tolerance_lower': config.get('fragment_tolerance', {}).get('lower', -20.0),
        'fragment_tolerance_upper': config.get('fragment_tolerance', {}).get('upper', 20.0),
        'isolation_window_lower': config.get('isolation_window', {}).get('lower', -3.0),
        'isolation_window_upper': config.get('isolation_window', {}).get('upper', 3.0),
        'take_top_n': config.get('preprocessing', {}).get('take_top_n', 150),
        'missed_cleavages': config.get('enzyme', {}).get('missed_cleavages', 2),
        'min_len': config.get('enzyme', {}).get('min_len', 7),
        'max_len': config.get('enzyme', {}).get('max_len', 30),
        'cleave_at': config.get('enzyme', {}).get('cleave_at', 'KR'),
        'restrict': config.get('enzyme', {}).get('restrict', 'P'),
        'c_terminal': config.get('enzyme', {}).get('c_terminal', True),
        'decoys': config.get('database', {}).get('generate_decoys', True),
        'shuffle_decoys': config.get('database', {}).get('shuffle_decoys', False),
        'keep_ends': config.get('database', {}).get('keep_ends', True),
        'bucket_size': config.get('database', {}).get('bucket_size', 16384),
        'fragment_max_mz': config.get('search', {}).get('fragment_max_mz', 1700.0),
        'randomize_fasta_split': config.get('other', {}).get('randomize_fasta_split', False),
        're_score_num_splits': config.get('re_scoring', {}).get('re_score_num_splits', 5),
        're_score_metric': config.get('re_scoring', {}).get('re_score_metric', 'hyperscore'),
        'fdr_threshold': config.get('fdr', {}).get('fdr_threshold', 0.01),
        'fdr_psm_method': config.get('fdr', {}).get('fdr_psm_method', 'psm'),
        'fdr_peptide_method': config.get('fdr', {}).get('fdr_peptide_method', 'peptide_psm_peptide'),
        'fdr_score': config.get('fdr', {}).get('fdr_score', 're_score'),
        'num_threads': config.get('parallelization', {}).get('num_threads', -1),
        'remove_decoys': config.get('fdr', {}).get('remove_decoys', True),
        'balanced_re_score': config.get('re_scoring', {}).get('balanced_re_score', True),
        'calibrate_mz': config.get('other', {}).get('calibrate_mz', False),
        'in_memory': config.get('other', {}).get('in_memory', False),
        'bruker_sdk': config.get('other', {}).get('bruker_sdk', True),
        'refine_rt': config.get('refinement', {}).get('refine_rt', False),
        'refine_im': config.get('refinement', {}).get('refine_im', False),
        'refinement_verbose': config.get('refinement', {}).get('refinement_verbose', False),
        'intensity_prediction_batch_size': config.get('batch_sizes', {}).get('intensity_prediction_batch_size', 2048),
        'model_fine_tune_batch_size': config.get('batch_sizes', {}).get('model_fine_tune_batch_size', 1024),
        'sample_size_collision_energy_calibration': config.get('batch_sizes', {}).get('sample_size_collision_energy_calibration', 256),
        'verbose': config.get('other', {}).get('verbose', True),
        'fasta_batch_size': config.get('other', {}).get('fasta_batch_size', 1),
        're_score_mokapot': config.get('re_scoring', {}).get('re_score_mokapot', True),
        'tims2rescore_table': config.get('other', {}).get('tims2rescore_table', False),
        'use_mgf': config.get('other', {}).get('use_mgf', False),
        'min_isotope_err': config.get('scoring', {}).get('min_isotope_err', -1),
        'max_isotope_err': config.get('scoring', {}).get('max_isotope_err', 3),
    }

    # Override parameters with command-line arguments if provided
    # (works because every optional argparse default is None; see above).
    for key in vars(args):
        if getattr(args, key) is not None:
            params[key] = getattr(args, key)

    variable_modifications = params['variable_modifications']
    static_modifications = params['static_modifications']

    # NOTE: this lets a TOML `verbose = false` silence the run even though
    # the argparse default for `verbose` is True (not None).
    args.verbose = params['verbose']

    if args.verbose:
        print(f"Variable modifications to be applied: {variable_modifications}")
        print(f"Static modifications to be applied: {static_modifications}")

    paths = []  # discovered Bruker .d folders
    mgfs = []   # matching .mgf files (only populated in --use_mgf mode)

    # Check if path exists
    if not os.path.exists(args.path):
        print(f"Path {args.path} does not exist. Exiting.")
        sys.exit(1)

    if not os.path.isdir(args.path):
        print(f"Path {args.path} is not a directory. Exiting.")
        sys.exit(1)

    if not os.path.exists(args.fasta):
        print(f"Path {args.fasta} does not exist. Exiting.")
        sys.exit(1)

    # Recursively collect .d folders (they are directories, hence `dirs`).
    for root, dirs, _ in os.walk(args.path):
        for file in dirs:
            if file.endswith(".d"):
                path = os.path.join(root, file)
                if params['use_mgf']:
                    if args.verbose:
                        print(f"Looking for mgf in folder `{path}` ...")
                    mgf_path = None
                    mgf_path_cnt = 0
                    for potential_mgf_path in Path(path).iterdir():
                        if potential_mgf_path.suffix == ".mgf":
                            mgf_path = str(potential_mgf_path)
                            mgf_path_cnt += 1
                    # MGF mode requires exactly one .mgf per .d folder so
                    # mgfs[i] lines up with paths[i] below.
                    assert mgf_path_cnt == 1, f"Found {mgf_path_cnt} mgfs in folder `{path}`. We need exactly one."
                    mgfs.append(mgf_path)
                paths.append(path)

    # All outputs (logs, temp PSMs, result tables) go under <path>/imspy.
    write_folder_path = str(Path(args.path))

    if not os.path.exists(write_folder_path + "/imspy"):
        os.makedirs(write_folder_path + "/imspy")

    current_time = time.strftime("%Y%m%d-%H%M%S")
    logging.basicConfig(
        filename=f"{write_folder_path}/imspy/imspy-{current_time}.log",
        level=logging.INFO,
        format='%(asctime)s %(message)s'
    )

    # Resolve num_threads == -1 to the machine's CPU count.
    # NOTE(review): the resolution is also gated on args.verbose here; with
    # --no_verbose, -1 is passed downstream unchanged — confirm sagepy
    # interprets -1 as "use all threads".
    if params['num_threads'] == -1 and args.verbose:
        print(f"Using all available CPU threads: {os.cpu_count()} ...")
        params['num_threads'] = os.cpu_count()

    logging.info("Arguments settings:")
    for arg in vars(args):
        logging.info(f"{arg}: {getattr(args, arg)}")

    start_time = time.time()

    if args.verbose:
        print(f"found {len(paths)} RAW data folders in {args.path} ...")

    # Setup tolerances (Dalton or ppm depending on the *_da switches)
    if params['precursor_tolerance_da']:
        prec_tol = Tolerance(da=(params['precursor_tolerance_lower'], params['precursor_tolerance_upper']))
    else:
        prec_tol = Tolerance(ppm=(params['precursor_tolerance_lower'], params['precursor_tolerance_upper']))

    if params['fragment_tolerance_da']:
        frag_tol = Tolerance(da=(params['fragment_tolerance_lower'], params['fragment_tolerance_upper']))
    else:
        frag_tol = Tolerance(ppm=(params['fragment_tolerance_lower'], params['fragment_tolerance_upper']))

    score_type = ScoreType(params['score_type'])

    # One scorer instance is reused for every file and fasta split.
    scorer = Scorer(
        precursor_tolerance=prec_tol,
        fragment_tolerance=frag_tol,
        report_psms=params['report_psms'],
        min_matched_peaks=params['min_matched_peaks'],
        annotate_matches=params['annotate_matches'],
        max_fragment_charge=params['max_fragment_charge'],
        score_type=score_type,
        variable_mods=variable_modifications,
        static_mods=static_modifications,
        min_isotope_err=params['min_isotope_err'],
        max_isotope_err=params['max_isotope_err'],
    )

    if args.verbose:
        print("generating fasta digest ...")

    enzyme_builder = EnzymeBuilder(
        missed_cleavages=params['missed_cleavages'],
        min_len=params['min_len'],
        max_len=params['max_len'],
        cleave_at=params['cleave_at'],
        restrict=params['restrict'],
        c_terminal=params['c_terminal'],
    )

    # Read FASTA: a directory of .fasta files is concatenated into one string.
    if os.path.isdir(args.fasta):
        fasta_files = [os.path.join(args.fasta, f) for f in os.listdir(args.fasta) if f.endswith(".fasta")]
        fasta = ""
        for fasta_file in fasta_files:
            with open(fasta_file, 'r') as infile:
                fasta += infile.read()
    else:
        with open(args.fasta, 'r') as infile:
            fasta = infile.read()

    # Split the digest into batches to bound peak memory of the indexed DB.
    fastas = split_fasta(fasta, params['fasta_batch_size'], randomize=params['randomize_fasta_split'])
    indexed_db = None

    # Single split: build the database once, up front, and reuse it for every
    # file. Multiple splits: databases are (re)built inside the per-file loop.
    if len(fastas) == 1:
        indexed_db = create_database(
            fastas[0],
            static_modifications,
            variable_modifications,
            enzyme_builder,
            params['decoys'],
            params['bucket_size'],
            shuffle_decoys=params['shuffle_decoys'],
            keep_ends=params['keep_ends']
        )

    if args.verbose:
        print("loading deep learning models for intensity, retention time and ion mobility prediction ...")

    prosit_model = Prosit2023TimsTofWrapper(verbose=False)
    im_predictor = DeepPeptideIonMobilityApex(
        load_deep_ccs_predictor(),
        load_tokenizer_from_resources("tokenizer-ptm")
    )
    rt_predictor = DeepChromatographyApex(
        load_deep_retention_time_predictor(),
        load_tokenizer_from_resources("tokenizer-ptm"),
        verbose=True
    )

    # Process each file
    for file_id, path in enumerate(paths):
        if args.verbose:
            print(f"processing {path} ...")
            print(f"processing {file_id + 1} of {len(paths)} ...")

        ds_name = os.path.basename(path).split(".")[0]
        dataset = TimsDatasetDDA(str(path), in_memory=params['in_memory'], use_bruker_sdk=params['bruker_sdk'])

        # Retention-time range of the run, converted from seconds to minutes;
        # used below to project PSM RTs onto a 0-60 scale.
        rt_min = dataset.meta_data.Time.min() / 60.0
        rt_max = dataset.meta_data.Time.max() / 60.0

        if args.verbose:
            print("loading PASEF fragments ...")

        fragments = None

        if params['use_mgf']:
            # MGF mode: spectra arrive as ready-made sagepy queries, so the
            # aggregation/preprocessing pipeline below is skipped.
            mgf_path = mgfs[file_id]
            fragments = mgf_to_sagepy_query(mgf_path, top_n=params['take_top_n'])
        else:
            fragments = dataset.get_pasef_fragments(
                # NOTE(review): single-threaded when the Bruker SDK is used —
                # presumably the SDK is not safe for parallel reads; confirm.
                num_threads=params['num_threads'] if not params['bruker_sdk'] else 1
            )

            if args.verbose:
                print("aggregating re-fragmented PASEF frames ...")

            # Merge frames that re-fragmented the same precursor: raw spectra
            # are summed, every other column keeps its first occurrence.
            fragments = fragments.groupby('precursor_id').agg({
                'frame_id': 'first',
                'time': 'first',
                'precursor_id': 'first',
                'raw_data': 'sum',
                'scan_begin': 'first',
                'scan_end': 'first',
                'isolation_mz': 'first',
                'isolation_width': 'first',
                'collision_energy': 'first',
                'largest_peak_mz': 'first',
                'average_mz': 'first',
                'monoisotopic_mz': 'first',
                'charge': 'first',
                'average_scan': 'first',
                'intensity': 'first',
                'parent_id': 'first',
            })

            mobility = fragments.apply(lambda r: r.raw_data.get_inverse_mobility_along_scan_marginal(), axis=1)
            fragments['mobility'] = mobility

            # Spectrum id: random prefix + frame + precursor + dataset name,
            # to be unique across files when PSMs are pooled later.
            spec_id = fragments.apply(
                lambda r: str(np.random.randint(int(1e6))) + '-' + str(r['frame_id']) + '-' + str(r['precursor_id']) + '-' + ds_name,
                axis=1
            )
            fragments['spec_id'] = spec_id

            if args.verbose:
                print("loading precursor data ...")

            sage_precursor = fragments.apply(lambda r: Precursor(
                # fall back to the largest peak when the monoisotopic m/z is missing
                mz=sanitize_mz(r['monoisotopic_mz'], r['largest_peak_mz']),
                intensity=r['intensity'],
                charge=sanitize_charge(r['charge']),
                isolation_window=Tolerance(da=(params['isolation_window_lower'], params['isolation_window_upper'])),
                collision_energy=r.collision_energy,
                inverse_ion_mobility=r.mobility,
            ), axis=1)

            fragments['sage_precursor'] = sage_precursor

            if args.verbose:
                print("preprocessing spectra ...")

            processed_spec = fragments.apply(
                lambda r: get_searchable_spec(
                    precursor=r.sage_precursor,
                    raw_fragment_data=r.raw_data,
                    spec_processor=SpectrumProcessor(take_top_n=params['take_top_n'], deisotope=True),
                    spec_id=r.spec_id,
                    time=r['time'],
                ),
                axis=1
            )

            fragments['processed_spec'] = processed_spec

        if args.verbose:
            print(f"generated: {len(fragments)} spectra to be scored ...")
            print("creating search configuration ...")

        # One PSM dict per fasta split; merged after the loop.
        psm_dicts = []
        logging.info(f"Processing {ds_name} ...")

        for j, fasta in enumerate(fastas):
            if len(fastas) > 1:
                if args.verbose:
                    print(f"generating indexed database for fasta split {j + 1} of {len(fastas)} ...")

                indexed_db = create_database(
                    fasta,
                    static_modifications,
                    variable_modifications,
                    enzyme_builder,
                    params['decoys'],
                    params['bucket_size'],
                    shuffle_decoys=params['shuffle_decoys'],
                    keep_ends=params['keep_ends']
                )

            if args.verbose:
                print("searching database ...")

            psm_dict = scorer.score_collection_psm(
                db=indexed_db,
                # MGF queries are already searchable; otherwise use the
                # preprocessed spectra built above.
                spectrum_collection=fragments if params['use_mgf'] else fragments['processed_spec'].values,
                num_threads=params['num_threads'],
            )

            if params['calibrate_mz']:
                if params['use_mgf']:
                    raise NotImplementedError("Mass calibration is not yet supported in --use_mgf mode.")

                if args.verbose:
                    print("calibrating mz ...")

                # Estimate and apply a global ppm shift, then re-search with
                # the calibrated spectra.
                ppm_error = apply_mz_calibration(psm_dict, fragments)

                if args.verbose:
                    print(f"calibrated mz with error: {np.round(ppm_error, 2)} ppm ...")
                    print("re-scoring PSMs after mz calibration ...")

                psm_dict = scorer.score_collection_psm(
                    db=indexed_db,
                    spectrum_collection=fragments['processed_spec'].values,
                    num_threads=params['num_threads'],
                )

            # Tag every PSM with its source file (and calibration shift).
            for _, values in psm_dict.items():
                for value in values:
                    value.file_name = ds_name
                    if params['calibrate_mz']:
                        value.mz_calibration_ppm = ppm_error

            # NOTE(review): `counter` is computed but never used afterwards —
            # looks like leftover debug code.
            counter = 0
            for _, values in psm_dict.items():
                counter += len(values)

            psm_dicts.append(psm_dict)

        if args.verbose:
            print("merging PSMs ...")

        if len(psm_dicts) > 1:
            merged_dict = merge_dicts_with_merge_dict(psm_dicts)
        else:
            merged_dict = psm_dicts[0]

        # Flatten to a list, keeping only the top-5 ranked matches per spectrum.
        psm = []
        for _, values in merged_dict.items():
            psm.extend(list(filter(lambda p: p.sage_feature.rank <= 5, values)))

        # Project retention times onto a common 0-60 minute scale so the RT
        # predictor (trained on 60-min gradients) is comparable across runs.
        for p in psm:
            p.retention_time_projected = linear_map(p.retention_time, rt_min, rt_max, 0.0, 60.0)

        if args.verbose:
            print(f"generated {len(psm)} PSMs ...")

        # Calibrate collision energy on the top-scoring target PSMs.
        sample_size = params['sample_size_collision_energy_calibration']
        sample = list(sorted(psm, key=lambda x: x.hyperscore, reverse=True))[:sample_size]

        collision_energy_calibration_factor, _ = get_collision_energy_calibration_factor(
            list(filter(lambda match: match.decoy is not True, sample)),
            prosit_model,
            verbose=args.verbose,
        )

        for ps in psm:
            ps.collision_energy_calibrated = ps.collision_energy + collision_energy_calibration_factor

        if args.verbose:
            print("predicting ion intensities ...")

        # Decoys are predicted from their decoy sequence, targets from the
        # modified target sequence.
        intensity_pred = prosit_model.predict_intensities(
            [p.sequence_modified if p.decoy == False else p.sequence_decoy_modified for p in psm],
            np.array([p.charge for p in psm]),
            [p.collision_energy_calibrated for p in psm],
            batch_size=params['intensity_prediction_batch_size'],
            flatten=True,
        )

        psm = associate_fragment_ions_with_prosit_predicted_intensities(
            psm, intensity_pred, num_threads=params['num_threads']
        )

        if args.verbose:
            print("predicting ion mobilities ...")

        # Optionally fine-tune a fresh ion-mobility model on this run's data.
        if params['refine_im']:
            im_predictor = DeepPeptideIonMobilityApex(
                load_deep_ccs_predictor(),
                load_tokenizer_from_resources("tokenizer-ptm")
            )

            if args.verbose:
                print("refining ion mobility predictions ...")

            im_predictor.fine_tune_model(
                data=psm_collection_to_pandas(generate_balanced_im_dataset(psms=psm)),
                batch_size=params['model_fine_tune_batch_size'],
                re_compile=True,
                verbose=params['refinement_verbose'],
            )

        inv_mob = im_predictor.simulate_ion_mobilities(
            sequences=[x.sequence_modified if x.decoy == False else x.sequence_decoy_modified for x in psm],
            charges=[x.charge for x in psm],
            mz=[x.mono_mz_calculated for x in psm]
        )

        for mob, ps in zip(inv_mob, psm):
            ps.inverse_ion_mobility_predicted = mob

        # Without fine-tuning, correct the pretrained model by the mean
        # observed-minus-predicted offset instead.
        if not params['refine_im']:
            inv_mob_calibration_factor = np.mean(
                [x.inverse_ion_mobility - x.inverse_ion_mobility_predicted for x in psm]
            )

            if args.verbose:
                print(f"calibrated ion mobilities with factor: {np.round(inv_mob_calibration_factor, 4)} ...")

            for p in psm:
                p.inverse_ion_mobility_predicted += inv_mob_calibration_factor

        if args.verbose:
            print("predicting retention times ...")

        # Optionally fine-tune a fresh retention-time model on this run's data.
        if params['refine_rt']:
            rt_predictor = DeepChromatographyApex(
                load_deep_retention_time_predictor(),
                load_tokenizer_from_resources("tokenizer-ptm"),
                verbose=args.verbose
            )

            if args.verbose:
                print("refining retention time predictions ...")

            ds = psm_collection_to_pandas(generate_balanced_rt_dataset(psms=psm))
            rt_predictor.fine_tune_model(
                data=ds,
                batch_size=params['model_fine_tune_batch_size'],
                re_compile=True,
                verbose=params['refinement_verbose'],
            )

        rt_pred = rt_predictor.simulate_separation_times(
            sequences=[x.sequence_modified if x.decoy == False else x.sequence_decoy_modified for x in psm],
        )

        for rt, p in zip(rt_pred, psm):
            p.retention_time_predicted = rt

        for p in psm:
            p.sage_feature_file_id = file_id

        # Persist this file's PSMs compressed; they are re-loaded and pooled
        # across all files after the loop.
        bts = compress_psms(psm)

        if args.verbose:
            print("writing PSMs to temp file ...")

        write_psms_binary(byte_array=bts, folder_path=write_folder_path, file_name=ds_name)
        logging.info(f"Processed {ds_name} ...")

        if args.verbose:
            time_end_tmp = time.time()
            minutes, seconds = divmod(time_end_tmp - start_time, 60)
            print(f"file {ds_name} processed after {minutes} minutes and {seconds:.2f} seconds.")

    if args.verbose:
        print("loading all PSMs ...")

    # Pool the per-file PSM blobs written by write_psms_binary above.
    psms = []
    for file in os.listdir(write_folder_path + "/imspy/psm/"):
        if file.endswith(".bin"):
            f = open(os.path.join(write_folder_path + "/imspy/psm/", file), 'rb')
            data = f.read()
            f.close()
            psms.extend(decompress_psms(data))

    # q-values: with a single fasta split the still-live indexed_db allows
    # SAGE's native FDR; otherwise fall back to the re-implemented variants.
    if len(fastas) == 1:
        if args.verbose:
            print("calculating q-values using SAGE internal functions...")
        sage_fdr_psm(psms, indexed_db, use_hyper_score=True)
    else:
        if args.verbose:
            print("calculating q-values using SAGE-style re-implemented functions...")
        assign_sage_spectrum_q(psms, use_hyper_score=True)
        assign_sage_peptide_q(psms, use_hyper_score=True)
        assign_sage_protein_q(psms, use_hyper_score=True)

    # Deterministic ordering before the split-based re-scoring.
    psms = list(sorted(psms, key=lambda psm: (psm.spec_idx, psm.peptide_idx)))

    # Semi-supervised re-scoring with an SVM (Percolator-style).
    psms = rescore_psms(
        psm_collection=psms,
        verbose=args.verbose,
        model=SVC(probability=True),
        num_splits=params['re_score_num_splits'],
        balance=params['balanced_re_score'],
        score=params['re_score_metric'],
        num_threads=params['num_threads'],
    )

    bts = compress_psms(psms)

    if args.verbose:
        print("writing all re-scored PSMs to temp file ...")

    write_psms_binary(byte_array=bts, folder_path=write_folder_path, file_name="total_psms", total=True)

    PSM_pandas = psm_collection_to_pandas(psms)

    # Optional second re-scoring pass with mokapot on an exported PIN file.
    if params['re_score_mokapot']:
        if args.verbose:
            print("re-scoring PSMs using mokapot ...")

        if not os.path.exists(write_folder_path + "/imspy/mokapot"):
            os.makedirs(write_folder_path + "/imspy/mokapot")

        PSM_pin = transform_psm_to_mokapot_pin(PSM_pandas)
        PSM_pin.to_csv(f"{write_folder_path}" + "/imspy/mokapot/PSMs.pin", index=False, sep="\t")

        psms_moka = mokapot.read_pin(f"{write_folder_path}" + "/imspy/mokapot/PSMs.pin")
        results, models = mokapot.brew(psms_moka, max_workers=params['num_threads'])
        results.to_txt(dest_dir=f"{write_folder_path}" + "/imspy/mokapot/", decoys=True)

        # Drop re_score so the merge below takes it from psms_rescored only.
        PSM_pandas = PSM_pandas.drop(columns=["re_score"])

    if args.verbose:
        print(f"FDR calculation ...")

    # PSM-level target-decoy competition, filtered at the FDR threshold.
    psms_rescored = target_decoy_competition_pandas(
        psm_collection_to_pandas(psms),
        method=params['fdr_psm_method'],
        score=params['fdr_score']
    )

    if params['remove_decoys']:
        psms_rescored = psms_rescored[psms_rescored.decoy == False]

    psms_rescored = psms_rescored[(psms_rescored.q_value <= params['fdr_threshold'])]

    # Re-attach the full PSM feature columns to the surviving competitors.
    TDC = pd.merge(
        psms_rescored, PSM_pandas,
        left_on=["spec_idx", "match_idx", "decoy"],
        right_on=["spec_idx", "match_idx", "decoy"]
    ).sort_values(by="re_score", ascending=False)

    TDC.to_csv(f"{write_folder_path}" + "/imspy/PSMs.csv", index=False)

    if params['tims2rescore_table']:
        if args.verbose:
            print(f"Writing PSM table that can be passed to tims2rescore ...")

        if params['use_mgf']:
            print("Warning: tims2rescore does not support parsed MGF files due to duplicate precursor IDs.")

        # NOTE(review): ds_name here is whatever the LAST processed raw
        # folder was — looks wrong for multi-file runs; confirm intended.
        TDC_tims2rescore = parse_to_tims2rescore(TDC, from_mgf=params['use_mgf'], file_name=ds_name + ".d")
        TDC_tims2rescore.sort_values(by="filename", inplace=True)
        TDC_tims2rescore.to_csv(f"{write_folder_path}" + "/imspy/results.sagepy.tsv", sep="\t", index=False)

    # Peptide-level target-decoy competition, same filtering and merge.
    peptides_rescored = target_decoy_competition_pandas(
        psm_collection_to_pandas(psms),
        method=params['fdr_peptide_method'],
        score=params['fdr_score']
    )

    if params['remove_decoys']:
        peptides_rescored = peptides_rescored[peptides_rescored.decoy == False]

    peptides_rescored = peptides_rescored[(peptides_rescored.q_value <= params['fdr_threshold'])]

    TDC = pd.merge(
        peptides_rescored, PSM_pandas,
        left_on=["spec_idx", "match_idx", "decoy"],
        right_on=["spec_idx", "match_idx", "decoy"]
    ).sort_values(by="re_score", ascending=False)

    TDC.to_csv(f"{write_folder_path}" + "/imspy/Peptides.csv", index=False, encoding='utf-8')

    end_time = time.time()

    logging.info("Done processing all RAW files.")
    minutes, seconds = divmod(end_time - start_time, 60)

    if args.verbose:
        print("Done processing all RAW files.")
        print(f"Processed {len(paths)} RAW files in {minutes} minutes and {seconds:.2f} seconds.")

    logging.info(f"Processed {len(paths)} RAW files in {minutes} minutes and {seconds:.2f} seconds.")
833
+
834
+
835
# Script entry point: delegate to main() when executed directly.
if __name__ == "__main__":
    main()