imspy-search 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
+ """
+ imspy_search - Database search functionality for timsTOF proteomics data using sagepy.
+ 
+ This package provides database search, PSM rescoring, and FDR control for timsTOF DDA data.
+ 
+ Requires imspy-core and imspy-predictors for core data structures and ML predictors.
+ 
+ Core functionality:
+ - SAGE-based database search for timsTOF DDA data
+ - Machine learning-based PSM rescoring
+ - Target-decoy competition and q-value estimation
+ - MGF file parsing for Bruker DataAnalysis output
+ """
+ 
+ __version__ = "0.4.0"
+ 
+ # Core utility functions
+ from imspy_search.utility import (
+     # Data extraction and preprocessing
+     extract_timstof_dda_data,
+     get_searchable_spec,
+     get_ms1_ims_spectrum,
+     # FASTA handling
+     split_fasta,
+     # PSM handling
+     generate_training_data,
+     split_psms,
+     generate_balanced_rt_dataset,
+     generate_balanced_im_dataset,
+     # Helper functions
+     linear_map,
+     map_to_domain,
+     sanitize_charge,
+     sanitize_mz,
+     write_psms_binary,
+     merge_dicts_with_merge_dict,
+     check_memory,
+     # Output formatting
+     transform_psm_to_pin,
+     parse_to_tims2rescore,
+ )
+ 
+ # SAGE output processing
+ from imspy_search.sage_output_utility import (
+     re_score_psms as re_score_psms_lda,
+     generate_training_data as generate_training_data_df,
+     split_dataframe_randomly,
+     row_to_fragment,
+     remove_substrings,
+     PatternReplacer,
+     replace_tokens,
+     cosim_from_dict,
+     fragments_to_dict,
+     plot_summary,
+ )
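+ # Note: re_score_psms and generate_training_data from sage_output_utility are
+ # aliased (_lda, _df) so they do not collide with utility.generate_training_data
+ # above or rescoring.re_score_psms imported below.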
+ 
+ # MGF parsing
+ from imspy_search.mgf import (
+     mgf_to_sagepy_query,
+     iter_spectra,
+     parse_spectrum,
+ )
+ 
+ # Rescoring with deep learning features
+ from imspy_search.rescoring import (
+     re_score_psms,
+     create_feature_space,
+ )
+ 
+ # DDA extensions for sagepy integration
+ from imspy_search.dda_extensions import (
+     to_sage_precursor,
+     get_sage_processed_precursors,
+     get_processed_spectra_for_search,
+     search_timstof_dda,
+ )
+ 
+ __all__ = [
+     # Version
+     '__version__',
+     # Data extraction
+     'extract_timstof_dda_data',
+     'get_searchable_spec',
+     'get_ms1_ims_spectrum',
+     # FASTA handling
+     'split_fasta',
+     # PSM handling
+     'generate_training_data',
+     'split_psms',
+     'generate_balanced_rt_dataset',
+     'generate_balanced_im_dataset',
+     # Helper functions
+     'linear_map',
+     'map_to_domain',
+     'sanitize_charge',
+     'sanitize_mz',
+     'write_psms_binary',
+     'merge_dicts_with_merge_dict',
+     'check_memory',
+     # Output formatting
+     'transform_psm_to_pin',
+     'parse_to_tims2rescore',
+     # SAGE output processing
+     're_score_psms_lda',
+     'generate_training_data_df',
+     'split_dataframe_randomly',
+     'row_to_fragment',
+     'remove_substrings',
+     'PatternReplacer',
+     'replace_tokens',
+     'cosim_from_dict',
+     'fragments_to_dict',
+     'plot_summary',
+     # MGF parsing
+     'mgf_to_sagepy_query',
+     'iter_spectra',
+     'parse_spectrum',
+     # Rescoring
+     're_score_psms',
+     'create_feature_space',
+     # DDA extensions
+     'to_sage_precursor',
+     'get_sage_processed_precursors',
+     'get_processed_spectra_for_search',
+     'search_timstof_dda',
+ ]
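For orientation, a minimal usage sketch of the surface exported above. The keyword
arguments mirror how these helpers are called in the imspy_ccs CLI later in this
diff; treat the exact signatures as inferred from that code, not as a documented
contract:

    from imspy_search import split_fasta, create_feature_space

    # Split a large FASTA into batches to bound search-database memory use
    with open("proteome.fasta") as f:          # hypothetical input path
        batches = split_fasta(fasta=f.read(), num_splits=4, randomize=True)

    # Attach ML re-scoring features to PSMs produced by a sagepy Scorer
    psms = create_feature_space(psms=psm_list)  # psm_list: output of a scoring pass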
@@ -0,0 +1,11 @@
+ """CLI entry points for imspy-search."""
+ 
+ from imspy_search.cli.imspy_dda import main as dda_main
+ from imspy_search.cli.imspy_ccs import main as ccs_main
+ from imspy_search.cli.imspy_rescore_sage import main as rescore_sage_main
+ 
+ __all__ = [
+     'dda_main',
+     'ccs_main',
+     'rescore_sage_main',
+ ]
@@ -0,0 +1,322 @@
+ """IMSPY CCS CLI - Extract CCS from TIMS-TOF DDA data for machine learning training."""
+ 
+ import argparse
+ import os
+ import toml
+ import numpy as np
+ 
+ import mokapot
+ 
+ from imspy_core.timstof import TimsDatasetDDA
+ from sagepy.utility import create_sage_database, compress_psms, decompress_psms
+ from sagepy.rescore.utility import transform_psm_to_mokapot_pin
+ from sagepy.core import Precursor, Tolerance, Scorer, SpectrumProcessor
+ from imspy_search.utility import (
+     sanitize_mz, get_searchable_spec, write_psms_binary,
+     split_fasta, merge_dicts_with_merge_dict
+ )
+ from imspy_search.rescoring import create_feature_space
+ from sagepy.utility import psm_collection_to_pandas, apply_mz_calibration
+ 
+ 
+ def sanitize_charge(charge):
+     """Sanitize a charge value, falling back to 2+ when it cannot be parsed."""
+     try:
+         return int(charge)
+     except Exception:
+         return 2
+ 
+ 
+ def group_by_mobility(mobility, intensity):
+     """Sum intensities that share the same inverse-mobility value."""
+     r_dict = {}
+     for mob, i in zip(mobility, intensity):
+         r_dict[mob] = r_dict.get(mob, 0) + i
+     return np.array(list(r_dict.keys())), np.array(list(r_dict.values()))
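+ # Example: group_by_mobility([1.00, 1.00, 1.02], [10, 5, 3])
+ # -> (array([1.00, 1.02]), array([15, 3]))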
+ 
+ 
+ def main():
+     """Main entry point for the imspy-ccs CLI."""
+     parser = argparse.ArgumentParser(
+         description="Extract CCS from TIMS-TOF DDA data to create training examples for machine learning"
+     )
+     parser.add_argument("--raw_data_path", type=str, help="Path to the dataset.")
+     parser.add_argument("--fasta_path", type=str, help="Path to the FASTA file.")
+     parser.add_argument("--config", type=str, help="Path to a TOML configuration file.")
+     parser.add_argument("--num_threads", type=int, help="Number of threads for processing.")
+     parser.add_argument("--cleave_at", type=str, help="Residue to cleave at.")
+     parser.add_argument("--restrict", type=str, help="Restriction residues.")
+     parser.add_argument("--n_terminal", action="store_true", help="If provided, then c_terminal = False.")
+     parser.add_argument("--static_modifications", type=str, help="Static mods in TOML-compatible string form.")
+     parser.add_argument("--variable_modifications", type=str, help="Variable mods in TOML-compatible string form.")
+     parser.add_argument("--silent", action="store_true", help="Silent mode.")
+     parser.add_argument("--no_bruker_sdk", action="store_true", help="Do not use the Bruker SDK.")
+     parser.add_argument("--fasta_batch_size", type=int, help="Batch size for FASTA processing.")
+     parser.add_argument("--lazy", action="store_true", help="Skip existing outputs to avoid re-processing.")
+ 
+     temp_args, _ = parser.parse_known_args()
+ 
+     # Load config from TOML if provided; explicit command-line flags still win
+     config = {}
+     if temp_args.config and os.path.exists(temp_args.config):
+         with open(temp_args.config, "r") as f:
+             config = toml.load(f)
+ 
+     # Defaults: prefer config-file values, fall back to hard-coded ones
+     defaults = {
+         "raw_data_path": config.get("raw_data_path", None),
+         "fasta_path": config.get("fasta_path", None),
+         "num_threads": config.get("num_threads", -1),
+         "cleave_at": config.get("cleave_at", "KR"),
+         "restrict": config.get("restrict", "P"),
+         "c_terminal": config.get("c_terminal", True),
+         "n_terminal": config.get("n_terminal", False),
+         "static_modifications": config.get("static_modifications", {"C": "[UNIMOD:4]"}),
+         "variable_modifications": config.get("variable_modifications", {"M": ["[UNIMOD:35]"], "[": ["[UNIMOD:1]"]}),
+         "verbose": config.get("verbose", True),
+         "no_bruker_sdk": config.get("no_bruker_sdk", False),
+         "fasta_batch_size": config.get("fasta_batch_size", 1),
+         "lazy": config.get("lazy", False),
+     }
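+     # A config file passed via --config could look like this (TOML; values illustrative):
+     #   raw_data_path = "/data/dda_runs"
+     #   fasta_path = "/data/human_sp.fasta"
+     #   num_threads = 16
+     #   static_modifications = { "C" = "[UNIMOD:4]" }
+     #   variable_modifications = { "M" = ["[UNIMOD:35]"] }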
+ 
+     parser.set_defaults(**defaults)
+     args = parser.parse_args()
+ 
+     if args.silent:
+         args.verbose = False
+ 
+     if args.n_terminal:
+         args.c_terminal = False
+ 
+     if isinstance(args.static_modifications, str):
+         args.static_modifications = toml.loads(f"data = {args.static_modifications}")["data"]
+     if isinstance(args.variable_modifications, str):
+         args.variable_modifications = toml.loads(f"data = {args.variable_modifications}")["data"]
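+     # The "data = ..." wrapper above turns the raw CLI string into a valid TOML
+     # document, so e.g. --static_modifications '{"C" = "[UNIMOD:4]"}' parses to
+     # the dict {"C": "[UNIMOD:4]"}.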
+ 
+     if args.raw_data_path is None:
+         parser.error("raw_data_path is required (either via command line or config)")
+     if args.fasta_path is None:
+         parser.error("fasta_path is required (either via command line or config)")
+ 
+     if args.num_threads == -1:
+         args.num_threads = os.cpu_count()
+ 
+     if args.verbose:
+         print("Arguments:")
+         for arg, value in vars(args).items():
+             print(f"  {arg}: {value}")
+ 
+     scorer = Scorer(
+         precursor_tolerance=Tolerance(ppm=(-25.0, 25.0)),
+         fragment_tolerance=Tolerance(ppm=(-20.0, 20.0)),
+         report_psms=5,
+         min_matched_peaks=5,
+         annotate_matches=True,
+         static_mods=args.static_modifications,
+         variable_mods=args.variable_modifications,
+     )
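+     # Precursor matching uses a wide +/-25 ppm window; a global ppm error is
+     # estimated later via apply_mz_calibration and stored on each PSM.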
+ 
+     with open(args.fasta_path, "r") as f:
+         raw_fasta = f.read()
+     fastas = split_fasta(fasta=raw_fasta, num_splits=args.fasta_batch_size, randomize=True)
+ 
+     d_files = [f for f in os.listdir(args.raw_data_path) if f.endswith(".d")]
+     count = len(d_files)
+     if count == 0:
+         raise ValueError("No .d files found in the directory.")
+ 
+     current_count = 0
+ 
+     for file in d_files:
+         try:
+             current_count += 1
+             if args.verbose:
+                 print(f"Processing {file} ({current_count}/{count}) ...")
+ 
+             dataset_name = file.split(".")[0]
+             ds_path = os.path.join(args.raw_data_path, file)
+ 
+             psm_bin_path = os.path.join(ds_path, "imspy", "psm", f"{dataset_name}.bin")
+             parquet_path = os.path.join(ds_path, "imspy", f"{dataset_name}.parquet")
+ 
+             if args.lazy and os.path.isfile(psm_bin_path) and os.path.isfile(parquet_path):
+                 if args.verbose:
+                     print(f"  [LAZY MODE] Skipping '{file}' because outputs already exist.")
+                 continue
+ 
+             dataset = TimsDatasetDDA(ds_path, use_bruker_sdk=not args.no_bruker_sdk)
+             fragments = dataset.get_pasef_fragments(args.num_threads)
+ 
+             if args.verbose:
+                 print("Assembling re-fragmented precursors ...")
+ 
+             fragments = fragments.groupby('precursor_id').agg({
+                 'frame_id': 'first',
+                 'time': 'first',
+                 'precursor_id': 'first',
+                 'raw_data': 'sum',
+                 'scan_begin': 'first',
+                 'scan_end': 'first',
+                 'isolation_mz': 'first',
+                 'isolation_width': 'first',
+                 'collision_energy': 'first',
+                 'largest_peak_mz': 'first',
+                 'average_mz': 'first',
+                 'monoisotopic_mz': 'first',
+                 'charge': 'first',
+                 'average_scan': 'first',
+                 'intensity': 'first',
+                 'parent_id': 'first',
+             })
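+             # A precursor fragmented in several PASEF frames yields multiple rows;
+             # 'raw_data': 'sum' combines their raw spectra, while the remaining
+             # columns keep the value of the first occurrence.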
+ 
+             mobility = fragments.apply(
+                 lambda r: r.raw_data.get_inverse_mobility_along_scan_marginal(), axis=1
+             )
+             fragments['mobility'] = mobility
+             fragments['spec_id'] = fragments.apply(
+                 lambda r: f"{r['frame_id']}-{r['precursor_id']}-{dataset_name}", axis=1
+             )
+ 
+             if args.verbose:
+                 print("Extracting precursors ...")
+ 
+             fragments['sage_precursor'] = fragments.apply(
+                 lambda r: Precursor(
+                     mz=sanitize_mz(r['monoisotopic_mz'], r['largest_peak_mz']),
+                     intensity=r['intensity'],
+                     charge=sanitize_charge(r['charge']),
+                     isolation_window=Tolerance(da=(-3, 3)),
+                     collision_energy=r.collision_energy,
+                     inverse_ion_mobility=r.mobility,
+                 ),
+                 axis=1
+             )
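+             # sanitize_mz presumably falls back to the largest-peak m/z when the
+             # monoisotopic m/z is missing; sanitize_charge (above) defaults to 2+.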
+ 
+             if args.verbose:
+                 print("Extracting fragment spectra ...")
+ 
+             fragments['processed_spec'] = fragments.apply(
+                 lambda r: get_searchable_spec(
+                     precursor=r.sage_precursor,
+                     raw_fragment_data=r.raw_data,
+                     spec_processor=SpectrumProcessor(take_top_n=150),
+                     spec_id=r.spec_id,
+                     time=r['time']
+                 ), axis=1
+             )
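+             # SpectrumProcessor(take_top_n=150) retains the top 150 peaks per
+             # fragment spectrum before database search.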
+ 
+             if args.verbose:
+                 print("Scoring spectra ...")
+ 
+             psm_dicts = []
+             for i, fasta in enumerate(fastas):
+                 if args.verbose:
+                     print(f"Processing FASTA {i + 1}/{len(fastas)} ...")
+ 
+                 indexed_db = create_sage_database(
+                     fasta=fasta,
+                     cleave_at=args.cleave_at,
+                     restrict=args.restrict,
+                     static_mods=args.static_modifications,
+                     variable_mods=args.variable_modifications,
+                     c_terminal=args.c_terminal
+                 )
+                 psm_collection = scorer.score_collection_psm(
+                     db=indexed_db,
+                     spectrum_collection=fragments['processed_spec'].values,
+                     num_threads=args.num_threads
+                 )
+                 psm_dicts.append(psm_collection)
+ 
+             if len(psm_dicts) > 1:
+                 if args.verbose:
+                     print("Merging PSMs ...")
+                 psm_collection = merge_dicts_with_merge_dict(psm_dicts)
+             else:
+                 psm_collection = psm_dicts[0]
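+             # Searching one FASTA batch at a time bounds the size of the indexed
+             # database in memory; merge_dicts_with_merge_dict then re-unites the
+             # per-batch PSM dictionaries into a single collection.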
+ 
+             ppm_error = apply_mz_calibration(psm_collection, fragments)
+ 
+             for _, values in psm_collection.items():
+                 for value in values:
+                     value.file_name = dataset_name
+                     value.mz_calibration_ppm = ppm_error
+ 
+             psm_list = [psm for values in psm_collection.values() for psm in values]
+ 
+             if args.verbose:
+                 print("Creating re-scoring feature space ...")
+ 
+             psm_list = create_feature_space(psms=psm_list)
+ 
+             bts = compress_psms(psm_list)
+             write_psms_binary(
+                 byte_array=bts,
+                 folder_path=ds_path,
+                 file_name=f"{dataset_name}"
+             )
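+             # Feature-annotated PSMs are compressed and cached per run (the path
+             # checked by --lazy, <run>.d/imspy/psm/<run>.bin); the cross-run
+             # mokapot step below reads them back in.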
+ 
+             grouped = fragments.apply(lambda r: group_by_mobility(r.raw_data.mobility, r.raw_data.intensity), axis=1)
+             inv_mob, intensity = [x[0] for x in grouped], [x[1] for x in grouped]
+ 
+             fragments["inverse_ion_mobility"] = inv_mob
+             fragments["intensity"] = intensity
+ 
+             table = fragments[["spec_id", "monoisotopic_mz", "charge", "inverse_ion_mobility", "intensity"]]
+             table.to_parquet(parquet_path, index=False)
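+             # The parquet table pairs each precursor (spec_id, m/z, charge) with its
+             # inverse-ion-mobility profile (mobility values and summed intensities),
+             # the raw material for CCS model training. Note that this overwrites the
+             # scalar 'intensity' column with the per-mobility intensity arrays.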
+ 
+         except Exception as e:
+             print(f"Error processing {file}: {e}")
+ 
+     # Final mokapot re-scoring across all runs
+     total_psms = []
+     if args.verbose:
+         print("Loading PSMs ...")
+ 
+     final_pin_path = os.path.join(args.raw_data_path, "PSMs.pin")
+     mokapot_output = os.path.join(args.raw_data_path, "mokapot.psms.txt")
+ 
+     skip_final = False
+     if args.lazy and os.path.isfile(final_pin_path) and os.path.isfile(mokapot_output):
+         skip_final = True
+         if args.verbose:
+             print("  [LAZY MODE] Skipping final mokapot step because outputs already exist.")
+ 
+     if not skip_final:
+         tmp_count = 0
+         for file in d_files:
+             try:
+                 dataset_name = file.split(".")[0]
+                 psm_bin_path = os.path.join(args.raw_data_path, file, "imspy", "psm", f"{dataset_name}.bin")
+                 if not os.path.isfile(psm_bin_path):
+                     continue
+ 
+                 tmp_count += 1
+                 bts = np.fromfile(psm_bin_path, dtype=np.uint8)
+                 psm_list = decompress_psms(bts)
+                 total_psms.extend(psm_list)
+ 
+                 if args.verbose:
+                     print(f"Loaded {dataset_name} ({tmp_count}/{count})")
+ 
+             except Exception as e:
+                 print(f"Error loading {file}: {e}")
+ 
+         PSM_pandas = psm_collection_to_pandas(total_psms)
+ 
+         if args.verbose:
+             print("Creating mokapot pin ...")
+ 
+         PSM_pin = transform_psm_to_mokapot_pin(PSM_pandas, seq_modified=True)
+         PSM_pin.to_csv(final_pin_path, index=False, sep="\t")
+ 
+         psms_moka = mokapot.read_pin(final_pin_path)
+         results, _ = mokapot.brew(psms_moka, max_workers=args.num_threads)
+         results.to_txt(dest_dir=args.raw_data_path)
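+         # mokapot.brew fits its semi-supervised model on the combined PIN file;
+         # to_txt writes the confidence estimates (e.g. mokapot.psms.txt, the file
+         # the lazy check above looks for) into raw_data_path.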
+ 
+     if args.verbose and not skip_final:
+         print("Finished.")
+ 
+ 
+ if __name__ == "__main__":
+     main()