imspy-search 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,289 @@
1
+ """IMSPY SAGE Rescore CLI - Re-score SAGE search results using deep learning features."""
2
+
3
+ import logging
4
+ import argparse
5
+ import sys
6
+ import os
7
+
8
+ import pandas as pd
9
+ import numpy as np
10
+
11
+ from imspy_predictors import (
12
+ DeepPeptideIonMobilityApex, load_deep_ccs_predictor,
13
+ DeepChromatographyApex, load_deep_retention_time_predictor,
14
+ Prosit2023TimsTofWrapper, load_tokenizer_from_resources,
15
+ )
16
+ from imspy_core.chemistry.utility import calculate_mz
17
+ from imspy_search.utility import linear_map
18
+ from sagepy.core.scoring import prosit_intensities_to_fragments_par
19
+ from sagepy.qfdr.tdc import target_decoy_competition_pandas
20
+
21
+ from imspy_search.sage_output_utility import (
22
+ re_score_psms, row_to_fragment, remove_substrings,
23
+ PatternReplacer, replace_tokens, cosim_from_dict,
24
+ fragments_to_dict, plot_summary
25
+ )
26
+
27
+ # Suppress pandas warnings about column assignment
28
+ pd.options.mode.chained_assignment = None
29
+
30
+
31
+ def main():
32
+ """Main entry point for imspy-rescore-sage CLI."""
33
+ parser = argparse.ArgumentParser(
34
+ description='IMSPY - SAGE Parser DDA - Re-score SAGE search results using imspy and sagepy.'
35
+ )
36
+
37
+ parser.add_argument("sage_results", help="The path to the SAGE results file")
38
+ parser.add_argument("sage_fragments", help="The path to the SAGE fragments file")
39
+ parser.add_argument("output", help="The path to where the output files should be created")
40
+
41
+ parser.add_argument(
42
+ "--tdc_method",
43
+ default="peptide_psm_peptide",
44
+ help="The target decoy competition method",
45
+ choices=["psm", "peptide_psm_only", "peptide_peptide_only", "peptide_psm_peptide"]
46
+ )
47
+
48
+ parser.add_argument(
49
+ "--num_splits",
50
+ default=5,
51
+ type=int,
52
+ help="The number of splits for cross-validation"
53
+ )
54
+
55
+ parser.add_argument(
56
+ "--no_balanced_split",
57
+ action="store_false",
58
+ dest="balance",
59
+ help="Do not balance the training dataset"
60
+ )
61
+ parser.set_defaults(balance=True)
62
+
63
+ parser.add_argument(
64
+ "--no_store_hyperscore",
65
+ action="store_false",
66
+ dest="store_hyperscore",
67
+ help="Do not store the results with the hyperscore"
68
+ )
69
+ parser.set_defaults(store_hyperscore=True)
70
+
71
+ parser.add_argument(
72
+ "--fine_tune_predictors",
73
+ action="store_true",
74
+ help="Fine tune the rt and inv-mob predictors"
75
+ )
76
+ parser.set_defaults(fine_tune_predictors=False)
77
+
78
+ parser.add_argument(
79
+ "--positive_example_q_max",
80
+ default=0.01,
81
+ type=float,
82
+ help="Maximum q-value allowed for positive examples"
83
+ )
84
+
85
+ parser.add_argument(
86
+ "--verbose",
87
+ action="store_true",
88
+ help="Print verbose output"
89
+ )
90
+ parser.set_defaults(verbose=False)
91
+
92
+ parser.add_argument(
93
+ "--no_summary_plot",
94
+ action="store_false",
95
+ dest="summary_plot",
96
+ help="Do not create a summary plot"
97
+ )
98
+ parser.set_defaults(summary_plot=True)
99
+
100
+ args = parser.parse_args()
101
+
102
+ # Check if files exist
103
+ if not os.path.exists(args.sage_results):
104
+ logging.error(f"The SAGE results file {args.sage_results} does not exist.")
105
+ sys.exit(1)
106
+
107
+ if not os.path.exists(args.sage_fragments):
108
+ logging.error(f"The SAGE fragments file {args.sage_fragments} does not exist.")
109
+ sys.exit(1)
110
+
111
+ # Read SAGE results
112
+ if args.sage_results.endswith(".tsv"):
113
+ results = pd.read_csv(args.sage_results, sep="\t")
114
+ elif args.sage_results.endswith(".parquet"):
115
+ results = pd.read_parquet(args.sage_results)
116
+ else:
117
+ logging.error(f"Unknown file format for SAGE results file {args.sage_results}.")
118
+ sys.exit(1)
119
+
120
+ # Read SAGE fragments
121
+ if args.sage_fragments.endswith(".tsv"):
122
+ fragments = pd.read_csv(args.sage_fragments, sep="\t")
123
+ elif args.sage_fragments.endswith(".parquet"):
124
+ fragments = pd.read_parquet(args.sage_fragments)
125
+ else:
126
+ logging.error(f"Unknown file format for SAGE fragments file {args.sage_fragments}.")
127
+ sys.exit(1)
128
+
129
+ logging.basicConfig(level=logging.INFO)
130
+
131
+ # Load models
132
+ prosit_model = Prosit2023TimsTofWrapper(verbose=False)
133
+ im_predictor = DeepPeptideIonMobilityApex(
134
+ load_deep_ccs_predictor(),
135
+ load_tokenizer_from_resources("tokenizer-ptm")
136
+ )
137
+ rt_predictor = DeepChromatographyApex(
138
+ load_deep_retention_time_predictor(),
139
+ load_tokenizer_from_resources("tokenizer-ptm"),
140
+ verbose=True
141
+ )
142
+
143
+ # Filter sequences by length
144
+ results["sequence_length"] = results.apply(lambda s: len(remove_substrings(s.peptide)), axis=1)
145
+ results_filtered = results[results.sequence_length <= 30]
146
+
147
+ results_filtered["decoy"] = results_filtered.apply(lambda r: r.label == -1, axis=1)
148
+
149
+ token_replacer = PatternReplacer(replace_tokens)
150
+ results_filtered["sequence"] = results_filtered.apply(lambda r: token_replacer.apply(r.peptide), axis=1)
151
+
152
+ results_filtered["mono_mz_calculated"] = results_filtered.apply(
153
+ lambda r: calculate_mz(r.calcmass, r.charge), axis=1
154
+ )
155
+ results_filtered["inverse_mobility_observed"] = results.ion_mobility
156
+ results_filtered["projected_rt"] = results_filtered.apply(
157
+ lambda r: linear_map(r.rt, old_min=results_filtered.rt.min(), old_max=results_filtered.rt.max()),
158
+ axis=1
159
+ )
160
+
161
+ results_filtered["match_idx"] = results_filtered.sequence
162
+ results_filtered["spec_idx"] = [str(x) for x in results_filtered.psm_id]
163
+ results_filtered["score"] = results_filtered.hyperscore
164
+ results_filtered["q_value"] = None
165
+
166
+ if len(results_filtered) < len(results):
167
+ s = len(results) - len(results_filtered)
168
+ logging.info(f"Removed {s} sequences with sequence length > 30.")
169
+
170
+ S = set(results_filtered.psm_id)
171
+
172
+ # Fine-tuning data
173
+ if args.fine_tune_predictors:
174
+ TDC_train = target_decoy_competition_pandas(results_filtered, method="psm")
175
+ TDC_train_f = TDC_train[(TDC_train.decoy == False) & (TDC_train.q_value <= 0.01)]
176
+ TDC_train_f["spec_idxi"] = [int(x) for x in TDC_train_f.spec_idx]
177
+ FT_data = pd.merge(TDC_train_f, results_filtered, left_on=["spec_idxi"], right_on=["psm_id"])
178
+
179
+ fragments = fragments[[f in S for f in fragments.psm_id.values]]
180
+
181
+ logging.info(f"Processing {len(results_filtered)} PSMs.")
182
+
183
+ # Group fragments by PSM
184
+ fragments_grouped = fragments.groupby("psm_id").agg({
185
+ "fragment_type": list,
186
+ "fragment_ordinals": list,
187
+ "fragment_charge": list,
188
+ "fragment_mz_calculated": list,
189
+ "fragment_mz_experimental": list,
190
+ "fragment_intensity": list,
191
+ }).reset_index()
192
+
193
+ fragments_grouped["fragments_observed"] = fragments_grouped.apply(lambda r: row_to_fragment(r), axis=1)
194
+ fragments_grouped = fragments_grouped[["psm_id", "fragments_observed"]]
195
+
196
+ logging.info("Predicting intensities...")
197
+
198
+ intensity_pred = prosit_model.predict_intensities(
199
+ results_filtered.sequence.values,
200
+ results_filtered.charge.values,
201
+ collision_energies=np.zeros_like(results_filtered.charge.values) + 30,
202
+ batch_size=2048,
203
+ flatten=True,
204
+ )
205
+
206
+ logging.info("Predicting peptide retention times...")
207
+
208
+ if args.fine_tune_predictors:
209
+ rt_predictor.fine_tune_model(data=FT_data, verbose=args.verbose)
210
+
211
+ rt_pred = rt_predictor.simulate_separation_times(sequences=results_filtered.sequence.values)
212
+
213
+ logging.info("Predicting ion mobilities...")
214
+
215
+ if args.fine_tune_predictors:
216
+ im_predictor.fine_tune_model(data=FT_data, verbose=args.verbose)
217
+
218
+ inv_mob = im_predictor.simulate_ion_mobilities(
219
+ sequences=results_filtered.sequence.values,
220
+ charges=results_filtered.charge.values,
221
+ mz=results_filtered.mono_mz_calculated.values,
222
+ )
223
+
224
+ results_filtered["inv_mob_predicted"] = inv_mob
225
+ results_filtered["rt_predicted"] = rt_pred
226
+ results_filtered["fragments_predicted"] = prosit_intensities_to_fragments_par(intensity_pred)
227
+
228
+ PSMS = pd.merge(results_filtered, fragments_grouped, on="psm_id")
229
+
230
+ PSMS["observed_dict"] = PSMS.apply(lambda r: fragments_to_dict(r.fragments_observed), axis=1)
231
+ PSMS["predicted_dict"] = PSMS.apply(lambda r: fragments_to_dict(r.fragments_predicted), axis=1)
232
+ PSMS["cosine_similarity"] = PSMS.apply(lambda s: cosim_from_dict(s.observed_dict, s.predicted_dict), axis=1)
233
+ PSMS["delta_rt"] = PSMS.projected_rt - PSMS.rt_predicted
234
+ PSMS["delta_ims"] = PSMS.ion_mobility - PSMS.inv_mob_predicted
235
+ PSMS["intensity_ms1"] = 0.0
236
+ PSMS["collision_energy"] = 0.0
237
+ PSMS = PSMS.rename(columns={
238
+ "ms2_intensity": "intensity_ms2",
239
+ "fragment_ppm": "average_ppm",
240
+ "precursor_ppm": "delta_mass"
241
+ })
242
+
243
+ logging.info("Re-scoring PSMs...")
244
+
245
+ RE_SCORE = re_score_psms(
246
+ PSMS,
247
+ num_splits=args.num_splits,
248
+ balance=args.balance,
249
+ positive_example_q_max=args.positive_example_q_max
250
+ )
251
+
252
+ PSMS = pd.merge(PSMS, RE_SCORE, on=["spec_idx", "rank"])
253
+
254
+ TDC = target_decoy_competition_pandas(PSMS, method=args.tdc_method, score="hyperscore")
255
+ TDC_rescore = target_decoy_competition_pandas(PSMS, method=args.tdc_method, score="re_score")
256
+
257
+ TDC = TDC.rename(columns={"match_idx": "peptide", "spec_idx": "psm_id"})
258
+ TDC_rescore = TDC_rescore.rename(columns={"match_idx": "peptide", "spec_idx": "psm_id"})
259
+
260
+ before = len(TDC[TDC.q_value <= 0.01])
261
+ after = len(TDC_rescore[TDC_rescore.q_value <= 0.01])
262
+ logging.info(f"Before re-scoring: {before} PSMs with q-value <= 0.01")
263
+ logging.info(f"After re-scoring: {after} PSMs with q-value <= 0.01")
264
+
265
+ if args.summary_plot:
266
+ TARGET = PSMS[PSMS.decoy == False]
267
+ DECOY = PSMS[PSMS.decoy]
268
+
269
+ logging.info("Creating summary plot...")
270
+ output_path = os.path.join(args.output, "summary_plot.png")
271
+ plot_summary(TARGET, DECOY, output_path, dpi=300)
272
+
273
+ output_path = os.path.dirname(args.output)
274
+ if not os.path.exists(output_path):
275
+ os.makedirs(output_path)
276
+
277
+ file_name = os.path.join(output_path, "imspy_sage_hyperscore.tsv")
278
+ file_name_rescore = os.path.join(output_path, "imspy_sage_rescore.tsv")
279
+
280
+ if args.store_hyperscore:
281
+ TDC.to_csv(file_name, sep="\t", index=False)
282
+ logging.info(f"Output file {file_name} saved.")
283
+
284
+ TDC_rescore.to_csv(file_name_rescore, sep="\t", index=False)
285
+ logging.info(f"Output file {file_name_rescore} saved.")
286
+
287
+
288
# Script entry point: delegate to main() when the module is executed directly.
if __name__ == "__main__":
    main()
@@ -0,0 +1,15 @@
1
+ raw_data_path = "/media/hd02/data/raw/dda/ccs/Raw_Yeast_Trp/"
2
+ fasta_path = "/media/hd02/data/fasta/yeast/yeast_proteome.fasta"
3
+ num_threads = -1
4
+ cleave_at = "KR"
5
+ restrict = "P"
6
+ n_terminal = false
7
+ verbose = true
8
+ fasta_batch_size = 1
9
+
10
+ [static_modifications]
11
+ C = "[UNIMOD:4]"
12
+
13
+ [variable_modifications]
14
+ M = ["[UNIMOD:35]"]
15
+ "[" = ["[UNIMOD:1]"]
@@ -0,0 +1,83 @@
1
+ # This file contains the modifications that are used in the database search.
2
+ # For a detailed description of the supported modification types, consult the SAGE documentation: https://sage-docs.vercel.app/docs/configuration#file
3
+ # note: variable modifications map each amino acid (or terminus) to a list of UNIMOD tokens, while static modifications map a residue to a single token string
4
+ [variable_modifications]
5
+ M = ["[UNIMOD:35]"] # Oxidation of methionine
6
+ "[" = ["[UNIMOD:1]"] # Acetylation of the peptide N-terminus of proteins
7
+
8
+ [static_modifications]
9
+
10
+ [scoring]
11
+ score_type = "openmshyperscore"
12
+ report_psms = 5
13
+ min_matched_peaks = 5
14
+ annotate_matches = true
15
+ max_fragment_charge = 2
16
+
17
+ [precursor_tolerance]
18
+ use_da = false
19
+ lower = -25.0
20
+ upper = 25.0
21
+
22
+ [fragment_tolerance]
23
+ use_da = false
24
+ lower = -20.0
25
+ upper = 20.0
26
+
27
+ [isolation_window]
28
+ lower = -3.0
29
+ upper = 3.0
30
+
31
+ [preprocessing]
32
+ take_top_n = 150
33
+
34
+ [enzyme]
35
+ missed_cleavages = 2
36
+ min_len = 7
37
+ max_len = 25
38
+ cleave_at = ""
39
+ restrict = ""
40
+ c_terminal = true
41
+
42
+ [database]
43
+ generate_decoys = true
44
+ shuffle_decoys = false
45
+ keep_ends = true
46
+ bucket_size = 16384
47
+
48
+ [search]
49
+ fragment_max_mz = 1700.0
50
+
51
+ [re_scoring]
52
+ re_score_num_splits = 5
53
+ balanced_re_score = true
54
+ re_score_metric = "hyperscore"
55
+ re_score_mokapot = true
56
+
57
+ [fdr]
58
+ fdr_threshold = 0.01
59
+ remove_decoys = true
60
+ fdr_psm_method = "psm"
61
+ fdr_peptide_method = "peptide_psm_peptide"
62
+ fdr_score = "re_score"
63
+
64
+ [parallelization]
65
+ num_threads = -1
66
+
67
+ [refinement]
68
+ refine_rt = false
69
+ refine_im = false
70
+ refinement_verbose = false
71
+
72
+ [batch_sizes]
73
+ intensity_prediction_batch_size = 2048
74
+ model_fine_tune_batch_size = 1024
75
+ sample_size_collision_energy_calibration = 256
76
+
77
+ [other]
78
+ calibrate_mz = false
79
+ in_memory = false
80
+ bruker_sdk = true
81
+ randomize_fasta_split = false
82
+ verbose = true
83
+ fasta_batch_size = 10
@@ -0,0 +1,84 @@
1
+ # This file contains the modifications that are used in the database search.
2
+ # For a detailed description of the supported modification types, consult the SAGE documentation: https://sage-docs.vercel.app/docs/configuration#file
3
+ # note: variable modifications map each amino acid (or terminus) to a list of UNIMOD tokens, while static modifications map a residue to a single token string
4
+ [variable_modifications]
5
+ M = ["[UNIMOD:35]"] # Oxidation of methionine
6
+ "[" = ["[UNIMOD:1]"] # Acetylation of the peptide N-terminus of proteins
7
+
8
+ [static_modifications]
9
+ C = "[UNIMOD:4]" # Carbamidomethylation of cysteine
10
+
11
+ [scoring]
12
+ score_type = "hyperscore"
13
+ report_psms = 5
14
+ min_matched_peaks = 5
15
+ annotate_matches = true
16
+ max_fragment_charge = 2
17
+
18
+ [precursor_tolerance]
19
+ use_da = false
20
+ lower = -15.0
21
+ upper = 15.0
22
+
23
+ [fragment_tolerance]
24
+ use_da = false
25
+ lower = -20.0
26
+ upper = 20.0
27
+
28
+ [isolation_window]
29
+ lower = -3.0
30
+ upper = 3.0
31
+
32
+ [preprocessing]
33
+ take_top_n = 150
34
+
35
+ [enzyme]
36
+ missed_cleavages = 2
37
+ min_len = 7
38
+ max_len = 30
39
+ cleave_at = "KR"
40
+ restrict = "P"
41
+ c_terminal = true
42
+
43
+ [database]
44
+ generate_decoys = true
45
+ shuffle_decoys = false
46
+ keep_ends = true
47
+ bucket_size = 16384
48
+
49
+ [search]
50
+ fragment_max_mz = 1700.0
51
+
52
+ [re_scoring]
53
+ re_score_num_splits = 5
54
+ balanced_re_score = true
55
+ re_score_metric = "hyperscore"
56
+ re_score_mokapot = true
57
+
58
+ [fdr]
59
+ fdr_threshold = 0.01
60
+ remove_decoys = true
61
+ fdr_psm_method = "psm"
62
+ fdr_peptide_method = "peptide_psm_peptide"
63
+ fdr_score = "re_score"
64
+
65
+ [parallelization]
66
+ num_threads = -1
67
+
68
+ [refinement]
69
+ refine_rt = true
70
+ refine_im = true
71
+ refinement_verbose = false
72
+
73
+ [batch_sizes]
74
+ intensity_prediction_batch_size = 2048
75
+ model_fine_tune_batch_size = 1024
76
+ sample_size_collision_energy_calibration = 256
77
+
78
+ [other]
79
+ calibrate_mz = true
80
+ in_memory = false
81
+ bruker_sdk = true
82
+ randomize_fasta_split = false
83
+ verbose = true
84
+ fasta_batch_size = 1
@@ -0,0 +1,209 @@
1
+ """Extensions to TimsDatasetDDA for sagepy integration.
2
+
3
+ This module provides methods that were removed from imspy-core's dda.py
4
+ to eliminate the sagepy dependency from the core package.
5
+ """
6
+
7
+ from typing import List, Optional
8
+ import pandas as pd
9
+
10
+ from sagepy.core import (
11
+ Precursor, ProcessedSpectrum, SpectrumProcessor, Tolerance
12
+ )
13
+
14
+ from imspy_core.timstof import TimsDatasetDDA
15
+
16
+ from imspy_search.utility import sanitize_mz, sanitize_charge, get_searchable_spec
17
+
18
+
19
def to_sage_precursor(
    row: pd.Series,
    isolation_window_lower: float = -3.0,
    isolation_window_upper: float = 3.0,
) -> Precursor:
    """Build a sagepy ``Precursor`` from one PASEF fragment row.

    Args:
        row: Pandas Series holding a single precursor's PASEF fragment data.
        isolation_window_lower: Lower bound of the isolation window (Da).
        isolation_window_upper: Upper bound of the isolation window (Da).

    Returns:
        The corresponding sagepy ``Precursor`` object.
    """
    window = Tolerance(da=(isolation_window_lower, isolation_window_upper))
    # 'mobility' is only present once computed upstream; fall back to None.
    inv_ion_mobility = row["mobility"] if "mobility" in row.index else None
    return Precursor(
        mz=sanitize_mz(row["monoisotopic_mz"], row["largest_peak_mz"]),
        intensity=row["intensity"],
        charge=sanitize_charge(row["charge"]),
        isolation_window=window,
        collision_energy=row["collision_energy"],
        inverse_ion_mobility=inv_ion_mobility,
    )
42
+
43
+
44
def get_sage_processed_precursors(
    dataset: TimsDatasetDDA,
    num_threads: int = 16,
    take_top_n: int = 150,
    isolation_window_lower: float = -3.0,
    isolation_window_upper: float = 3.0,
    ds_name: Optional[str] = None,
) -> pd.DataFrame:
    """Extract and process PASEF fragments as sagepy ProcessedSpectrum objects.

    This function extracts PASEF fragments from a TimsDatasetDDA, aggregates
    them by precursor ID, and converts them to sagepy ProcessedSpectrum objects
    suitable for database search.

    Args:
        dataset: TimsDatasetDDA object
        num_threads: Number of threads for extraction
        take_top_n: Number of top peaks to keep
        isolation_window_lower: Lower bound for isolation window (Da)
        isolation_window_upper: Upper bound for isolation window (Da)
        ds_name: Dataset name for spec_id generation (defaults to dataset path basename)

    Returns:
        DataFrame with columns: precursor_id, mobility, spec_id, sage_precursor, processed_spec
    """
    # Local import: keeps the module's top level limited to the listed imports.
    import os

    if ds_name is None:
        ds_name = os.path.basename(str(dataset.data_path))

    # Get PASEF fragments
    fragments = dataset.get_pasef_fragments(num_threads=num_threads)

    # Aggregate by precursor_id: one row per precursor. 'raw_data' is combined
    # with 'sum' (presumably concatenating the per-frame fragment data via the
    # object's __add__ — TODO confirm against the raw_data type); all other
    # columns keep the first occurrence.
    fragments = fragments.groupby('precursor_id').agg({
        'frame_id': 'first',
        'time': 'first',
        'precursor_id': 'first',
        'raw_data': 'sum',
        'scan_begin': 'first',
        'scan_end': 'first',
        'isolation_mz': 'first',
        'isolation_width': 'first',
        'collision_energy': 'first',
        'largest_peak_mz': 'first',
        'average_mz': 'first',
        'monoisotopic_mz': 'first',
        'charge': 'first',
        'average_scan': 'first',
        'intensity': 'first',
        'parent_id': 'first',
    })

    # Calculate mobility from each precursor's aggregated raw scan data
    mobility = fragments.apply(
        lambda r: r.raw_data.get_inverse_mobility_along_scan_marginal(),
        axis=1
    )
    fragments['mobility'] = mobility

    # Generate spec_id in the form "<frame_id>-<precursor_id>-<ds_name>"
    spec_id = fragments.apply(
        lambda r: str(r['frame_id']) + '-' + str(r['precursor_id']) + '-' + ds_name,
        axis=1
    )
    fragments['spec_id'] = spec_id

    # Create sage precursors (see to_sage_precursor for the field mapping)
    sage_precursor = fragments.apply(
        lambda r: to_sage_precursor(
            r,
            isolation_window_lower=isolation_window_lower,
            isolation_window_upper=isolation_window_upper
        ),
        axis=1
    )
    fragments['sage_precursor'] = sage_precursor

    # Create spectrum processor (shared across all rows; keeps top-N peaks)
    spec_processor = SpectrumProcessor(take_top_n=take_top_n)

    # Process spectra into searchable ProcessedSpectrum objects
    processed_spec = fragments.apply(
        lambda r: get_searchable_spec(
            precursor=r.sage_precursor,
            raw_fragment_data=r.raw_data,
            spec_processor=spec_processor,
            spec_id=r.spec_id,
            time=r['time'],
        ),
        axis=1
    )
    fragments['processed_spec'] = processed_spec

    return fragments
139
+
140
+
141
def get_processed_spectra_for_search(
    dataset: TimsDatasetDDA,
    num_threads: int = 16,
    take_top_n: int = 150,
    isolation_window_lower: float = -3.0,
    isolation_window_upper: float = 3.0,
) -> List[ProcessedSpectrum]:
    """Return the dataset's spectra as a plain list of ProcessedSpectrum.

    Thin convenience wrapper around :func:`get_sage_processed_precursors`
    that keeps only the ``processed_spec`` column.

    Args:
        dataset: TimsDatasetDDA object
        num_threads: Number of threads for extraction
        take_top_n: Number of top peaks to keep
        isolation_window_lower: Lower bound for isolation window (Da)
        isolation_window_upper: Upper bound for isolation window (Da)

    Returns:
        List of ProcessedSpectrum objects ready for database search.
    """
    processed = get_sage_processed_precursors(
        dataset=dataset,
        num_threads=num_threads,
        take_top_n=take_top_n,
        isolation_window_lower=isolation_window_lower,
        isolation_window_upper=isolation_window_upper,
    )
    return list(processed['processed_spec'])
172
+
173
+
174
def search_timstof_dda(
    dataset: TimsDatasetDDA,
    scorer,
    indexed_db,
    num_threads: int = 16,
    take_top_n: int = 150,
    isolation_window_lower: float = -3.0,
    isolation_window_upper: float = 3.0,
):
    """Run a sagepy database search over a TimsDatasetDDA.

    Extracts and processes the dataset's spectra, then scores them
    against the indexed database with the given scorer.

    Args:
        dataset: TimsDatasetDDA object
        scorer: sagepy Scorer object
        indexed_db: Indexed database from sagepy
        num_threads: Number of threads for extraction and search
        take_top_n: Number of top peaks to keep
        isolation_window_lower: Lower bound for isolation window (Da)
        isolation_window_upper: Upper bound for isolation window (Da)

    Returns:
        Dictionary of PSMs as produced by scorer.score_collection_psm
    """
    processed_spectra = get_processed_spectra_for_search(
        dataset=dataset,
        num_threads=num_threads,
        take_top_n=take_top_n,
        isolation_window_lower=isolation_window_lower,
        isolation_window_upper=isolation_window_upper,
    )
    return scorer.score_collection_psm(
        db=indexed_db,
        spectrum_collection=processed_spectra,
        num_threads=num_threads,
    )