imspy-search 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imspy_search/__init__.py +126 -0
- imspy_search/cli/__init__.py +11 -0
- imspy_search/cli/imspy_ccs.py +322 -0
- imspy_search/cli/imspy_dda.py +836 -0
- imspy_search/cli/imspy_rescore_sage.py +289 -0
- imspy_search/configs/config_ccs.toml +15 -0
- imspy_search/configs/config_hla.toml +83 -0
- imspy_search/configs/config_tryptic.toml +84 -0
- imspy_search/dda_extensions.py +209 -0
- imspy_search/mgf.py +139 -0
- imspy_search/rescoring.py +166 -0
- imspy_search/sage_output_utility.py +318 -0
- imspy_search/utility.py +585 -0
- imspy_search-0.4.0.dist-info/METADATA +108 -0
- imspy_search-0.4.0.dist-info/RECORD +17 -0
- imspy_search-0.4.0.dist-info/WHEEL +4 -0
- imspy_search-0.4.0.dist-info/entry_points.txt +5 -0
imspy_search/utility.py
ADDED
|
@@ -0,0 +1,585 @@
|
|
|
1
|
+
"""Utility functions for database search operations."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import warnings
|
|
5
|
+
from typing import List, Tuple, Union, Dict, Optional
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import numpy as np
|
|
9
|
+
from numpy.typing import NDArray
|
|
10
|
+
|
|
11
|
+
from sagepy.core import (
|
|
12
|
+
Precursor, RawSpectrum, ProcessedSpectrum, SpectrumProcessor,
|
|
13
|
+
Representation, Tolerance, ProcessedIMSpectrum
|
|
14
|
+
)
|
|
15
|
+
from sagepy.core.scoring import Psm, merge_psm_dicts
|
|
16
|
+
from sagepy.utility import get_features, psm_collection_to_pandas
|
|
17
|
+
from sagepy.qfdr.tdc import target_decoy_competition_pandas
|
|
18
|
+
|
|
19
|
+
from imspy_core.timstof import TimsDatasetDDA
|
|
20
|
+
from imspy_core.timstof.frame import TimsFrame
|
|
21
|
+
from imspy_core.utility import linear_map
|
|
22
|
+
|
|
23
|
+
import ast
|
|
24
|
+
import re
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def check_memory(
        limit_in_gb: int = 16,
        msg: str = "Warning: System has only {total_ram_gb:.2f}GB of RAM, which is below the recommended {limit_in_gb}GB."):
    """Warn if total system RAM is below ``limit_in_gb`` gigabytes.

    Args:
        limit_in_gb: Recommended minimum amount of RAM in GiB.
        msg: Warning template with ``total_ram_gb`` and ``limit_in_gb`` fields.
    """
    # os.sysconf is POSIX-only; without it we cannot inspect physical memory.
    if not hasattr(os, "sysconf"):
        warnings.warn("Unable to determine system memory.")
        return

    page_size = os.sysconf("SC_PAGE_SIZE")
    page_count = os.sysconf("SC_PHYS_PAGES")
    total_ram_gb = (page_size * page_count) / (1024 ** 3)

    if total_ram_gb < limit_in_gb:
        warnings.warn(msg.format(total_ram_gb=total_ram_gb, limit_in_gb=limit_in_gb))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def peptide_length(peptide: str) -> int:
    """Return the number of residues in a peptide, ignoring [UNIMOD:X] tags.

    Args:
        peptide: Peptide sequence, possibly annotated with UNIMOD modifications.

    Returns:
        The length of the peptide without modification annotations.
    """
    return len(re.sub(r'\[UNIMOD:\d+\]', '', peptide))
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def parse_string_list(input_str: str) -> List[str]:
    """
    Takes a string representation of a list and converts it into an actual list of strings.

    Args:
        input_str: A string containing a list representation. An input that is
            already a list is returned unchanged.

    Returns:
        A list of strings parsed from the input string.

    Raises:
        ValueError: If the input cannot be parsed into a list.
    """
    if isinstance(input_str, list):
        return input_str

    try:
        parsed = ast.literal_eval(input_str)
    except (SyntaxError, ValueError):
        raise ValueError("Invalid list format")

    # literal_eval happily returns scalars (e.g. "42" -> 42); enforce the
    # documented contract that the result is a list.
    if not isinstance(parsed, list):
        raise ValueError("Invalid list format")
    return parsed
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def merge_dicts_with_merge_dict(dicts):
    """Fold multiple PSM dictionaries into a single merged dictionary.

    The first dictionary seeds the result; each subsequent dictionary is
    folded in via ``merge_psm_dicts``. Returns ``None`` for an empty input.
    """
    merged = None
    for position, current in enumerate(dicts):
        merged = current if position == 0 else merge_psm_dicts(current, merged)
    return merged
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def map_to_domain(data, gradient_length: float = 120.0):
    """
    Linearly rescale the input data into the interval [0, gradient_length].

    Args:
        data: list or numpy array of numerical values
        gradient_length: the upper limit of the target domain [0, l]

    Returns:
        List of values mapped into the domain [0, l].

    Raises:
        ValueError: If all input values are identical (no span to map).
    """
    lo = min(data)
    hi = max(data)

    if hi == lo:
        raise ValueError("All elements in data are the same. Linear mapping is not possible.")

    span = hi - lo
    return [gradient_length * (value - lo) / span for value in data]
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def sanitize_charge(charge: Optional[float]) -> Optional[int]:
    """Convert a possibly-missing charge value to an integer.

    Args:
        charge: Charge value as float; may be None or NaN.

    Returns:
        Charge as int, or None when the value is missing.
    """
    if charge is None or np.isnan(charge):
        return None
    return int(charge)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def sanitize_mz(mz: Optional[float], mz_highest: float) -> Optional[float]:
    """Return the mz value, falling back to ``mz_highest`` when it is missing.

    Args:
        mz: Mz value as float; may be None or NaN.
        mz_highest: Fallback mz value used when ``mz`` is missing.

    Returns:
        The original mz value, or ``mz_highest`` if mz is None/NaN.
    """
    if mz is None or np.isnan(mz):
        return mz_highest
    return mz
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def split_fasta(fasta: str, num_splits: int = 16, randomize: bool = True) -> List[str]:
    """Split a fasta file into multiple fasta files.

    Args:
        fasta: Fasta file content as string.
        num_splits: Number of splits fasta file should be split into.
        randomize: Whether to randomize the order of sequences before splitting.

    Returns:
        List of fasta files as strings, containing up to num_splits entries
        with a near-equal number of sequences each.
    """
    if num_splits == 1:
        return [fasta]

    # re.split strips the '>' from every entry except the first. Normalize ALL
    # entries to have no leading '>' BEFORE shuffling: otherwise the original
    # first record (which kept its '>') can land in the middle of a batch,
    # where re-joining with '\n>' would produce a corrupt '>>' header.
    entries = [e[1:] if e.startswith('>') else e for e in re.split(r'\n>', fasta)]
    print(f"Total number of sequences: {len(entries)} ...")

    if randomize:
        np.random.shuffle(entries)

    total_items = len(entries)
    items_per_batch = total_items // num_splits
    remainder = total_items % num_splits

    fastas = []
    start_index = 0

    for i in range(num_splits):
        # The first `remainder` batches carry one extra sequence each.
        extra = 1 if i < remainder else 0
        stop_index = start_index + items_per_batch + extra

        if start_index >= total_items:
            break

        # Re-attach the '>' header markers removed by the split.
        batch = '>' + '\n>'.join(entries[start_index:stop_index])
        fastas.append(batch)
        start_index = stop_index

    return fastas
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def get_ms1_ims_spectrum(
        raw_spectrum: TimsFrame,
        spec_processor: SpectrumProcessor,
        time: float,
        spec_id: str,
        file_id: int = 0,
        ms_level: int = 1) -> ProcessedIMSpectrum:
    """
    Build a SAGE-searchable MS1 ion-mobility spectrum from a raw frame.

    Args:
        raw_spectrum: TimsFrame holding the raw m/z, intensity and mobility arrays
        spec_processor: SpectrumProcessor used to post-process the raw spectrum
        time: retention time, also used as the ion injection time
        spec_id: unique spectrum identifier
        file_id: numeric file identifier
        ms_level: MS level of the spectrum (1 for precursor spectra)

    Returns:
        ProcessedIMSpectrum produced by the processor
    """
    # Cast peak arrays to float32 once, up front.
    mz_values = raw_spectrum.mz.astype(np.float32)
    intensities = raw_spectrum.intensity.astype(np.float32)
    mobilities = raw_spectrum.mobility.astype(np.float32)

    raw = RawSpectrum(
        file_id=file_id,
        ms_level=ms_level,
        spec_id=spec_id,
        precursors=[],
        representation=Representation(),
        scan_start_time=time,
        ion_injection_time=time,
        total_ion_current=np.sum(raw_spectrum.intensity),
        mz=mz_values,
        intensity=intensities,
        mobility=mobilities,
    )
    return spec_processor.process_with_mobility(raw)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def get_searchable_spec(
        precursor: Precursor,
        raw_fragment_data: TimsFrame,
        spec_processor: SpectrumProcessor,
        time: float,
        spec_id: str,
        file_id: int = 0,
        ms_level: int = 2) -> ProcessedSpectrum:
    """
    Build a SAGE-searchable MS2 spectrum from raw fragment data.

    Args:
        precursor: Precursor describing the isolated ion
        raw_fragment_data: TimsFrame with the raw fragment peaks
        spec_processor: SpectrumProcessor used to post-process the spectrum
        time: retention time, also used as the ion injection time
        spec_id: unique spectrum identifier
        file_id: numeric file identifier
        ms_level: MS level of the spectrum (2 for fragment spectra)

    Returns:
        ProcessedSpectrum produced by the processor
    """
    # Flatten the ion-mobility frame into a plain indexed m/z spectrum.
    flat = raw_fragment_data.to_indexed_mz_spectrum()

    raw = RawSpectrum(
        file_id=file_id,
        ms_level=ms_level,
        spec_id=spec_id,
        representation=Representation(),
        precursors=[precursor],
        scan_start_time=time,
        ion_injection_time=time,
        total_ion_current=np.sum(flat.intensity),
        mz=flat.mz.astype(np.float32),
        intensity=flat.intensity.astype(np.float32)
    )
    return spec_processor.process(raw)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def write_psms_binary(byte_array, folder_path: str, file_name: str, total: bool = False):
    """Write PSMs to a binary file.

    Args:
        byte_array: Byte data (anything accepted by ``bytearray``).
        folder_path: Output folder; the file is written below
            ``<folder_path>/imspy/psm/`` or, when ``total`` is True, directly
            below ``<folder_path>/imspy/``.
        file_name: File name without the ``.bin`` suffix.
        total: Whether to write to the top-level imspy folder instead of psm/.
    """
    psm_dir = os.path.join(folder_path, 'imspy', 'psm')
    os.makedirs(psm_dir, exist_ok=True)

    # Resolve the target path first so the stale-file removal below hits the
    # file actually being written (the original removed the psm/ copy even
    # when `total` wrote one directory level up).
    if total:
        target = os.path.join(folder_path, 'imspy', f'{file_name}.bin')
    else:
        target = os.path.join(psm_dir, f'{file_name}.bin')

    if os.path.exists(target):
        os.remove(target)

    with open(target, 'wb') as fh:
        fh.write(bytearray(byte_array))
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def generate_training_data(
        psms: List[Psm],
        method: str = "psm",
        q_max: float = 0.01,
        balance: bool = True
) -> Tuple[NDArray, NDArray]:
    """Generate training data for PSM re-scoring.

    Args:
        psms: List of PeptideSpectrumMatch objects
        method: Method to use for target-decoy competition
        q_max: Maximum q-value allowed for positive (target) examples
        balance: Whether to down-sample targets to the decoy count

    Returns:
        Tuple of X_train (features) and Y_train (labels)
    """
    PSM_pandas = psm_collection_to_pandas(psms)
    PSM_q = target_decoy_competition_pandas(PSM_pandas, method=method)
    PSM_pandas = PSM_pandas.drop(columns=["hyperscore"])

    TDC = pd.merge(PSM_q, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"],
                   right_on=["spec_idx", "match_idx", "decoy"])

    # Positive examples: confident targets below the q-value cutoff.
    TARGET = TDC[(TDC.decoy == False) & (TDC.q_value <= q_max)]
    X_target, Y_target = get_features(TARGET)

    # Negative examples: all decoys.
    DECOY = TDC[TDC.decoy]
    X_decoy, Y_decoy = get_features(DECOY)

    if balance:
        num_target = np.min((len(DECOY), len(TARGET)))
        # Sample WITHOUT replacement: num_target is at most len(X_target), and
        # np.random.choice's default replace=True would duplicate some target
        # rows while silently dropping others from the balanced set.
        target_indices = np.random.choice(np.arange(len(X_target)), size=num_target, replace=False)
        X_target = X_target[target_indices, :]
        Y_target = Y_target[target_indices]

    X_train = np.vstack((X_target, X_decoy))
    Y_train = np.hstack((Y_target, Y_decoy))

    return X_train, Y_train
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def split_psms(psms: List[Psm], num_splits: int = 10) -> List[List[Psm]]:
    """Partition PSMs into contiguous, near-equal chunks.

    The first ``len(psms) % num_splits`` chunks receive one extra element.

    Args:
        psms: List of PeptideSpectrumMatch objects
        num_splits: Number of chunks to produce

    Returns:
        List of num_splits chunks covering all input PSMs in order.
    """
    base_size, leftovers = divmod(len(psms), num_splits)
    chunks = []
    cursor = 0
    for chunk_idx in range(num_splits):
        size = base_size + (1 if chunk_idx < leftovers else 0)
        chunks.append(psms[cursor:cursor + size])
        cursor += size
    return chunks
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def generate_balanced_rt_dataset(psms: Union[List[Psm], Dict[str, List[Psm]]]) -> List[Psm]:
    """Generate a balanced retention-time dataset for training.

    Keeps only rank-1 PSMs whose spectrum passed target-decoy competition at
    1% FDR, de-duplicated by peptide sequence.
    """
    if isinstance(psms, dict):
        flat = [p for values in psms.values() for p in values]
    else:
        flat = psms

    table = psm_collection_to_pandas(flat)
    q_table = target_decoy_competition_pandas(table, method="psm", score="hyperscore")

    merged = pd.merge(
        q_table,
        table.drop(columns=["hyperscore"]),
        left_on=["spec_idx", "match_idx", "decoy"],
        right_on=["spec_idx", "match_idx", "decoy"],
    )
    confident = merged[(merged.decoy == False) & (merged.q_value <= 0.01)].drop_duplicates(subset="sequence")

    keep_ids = set(confident.spec_idx.values)
    return [p for p in flat if p.spec_idx in keep_ids and p.rank == 1]
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def generate_balanced_im_dataset(psms: Union[List[Psm], Dict[str, List[Psm]]]) -> List[Psm]:
    """Generate a balanced ion-mobility dataset for training.

    Keeps only rank-1 PSMs whose spectrum passed target-decoy competition at
    1% FDR, de-duplicated by (sequence, charge) pair.
    """
    if isinstance(psms, dict):
        flat = [p for values in psms.values() for p in values]
    else:
        flat = psms

    table = psm_collection_to_pandas(flat)
    q_table = target_decoy_competition_pandas(table, method="psm", score="hyperscore")

    merged = pd.merge(
        q_table,
        table.drop(columns=["hyperscore"]),
        left_on=["spec_idx", "match_idx", "decoy"],
        right_on=["spec_idx", "match_idx", "decoy"],
    )
    confident = merged[(merged.decoy == False) & (merged.q_value <= 0.01)].drop_duplicates(subset=["sequence", "charge"])

    keep_ids = set(confident.spec_idx.values)
    return [p for p in flat if p.spec_idx in keep_ids and p.rank == 1]
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def extract_timstof_dda_data(
        path: str,
        in_memory: bool = False,
        use_bruker_sdk: bool = False,
        isolation_window_lower: float = -3.0,
        isolation_window_upper: float = 3.0,
        take_top_n: int = 100,
        num_threads: int = 16,
) -> pd.DataFrame:
    """
    Extract TIMSTOF DDA data from bruker timsTOF TDF file.

    Args:
        path: Path to TIMSTOF DDA data
        in_memory: Whether to load data in memory
        use_bruker_sdk: Whether to use bruker SDK for data extraction
        isolation_window_lower: Lower bound for isolation window (Da)
        isolation_window_upper: Upper bound for isolation window (Da)
        take_top_n: Number of top peaks to take
        num_threads: Number of threads to use

    Returns:
        DataFrame containing timsTOF DDA data, one row per precursor, with
        added 'sage_precursor' and 'processed_spec' columns for searching.
    """
    ds_name = os.path.basename(path)

    dataset = TimsDatasetDDA(path, in_memory=in_memory, use_bruker_sdk=use_bruker_sdk)
    fragments = dataset.get_pasef_fragments(num_threads=num_threads)

    # Collapse per-frame fragment rows to one row per precursor: sum the raw
    # data, keep the first value of each metadata field.
    fragments = fragments.groupby('precursor_id').agg({
        'frame_id': 'first',
        'time': 'first',
        'precursor_id': 'first',
        'raw_data': 'sum',
        'scan_begin': 'first',
        'scan_end': 'first',
        'isolation_mz': 'first',
        'isolation_width': 'first',
        'collision_energy': 'first',
        'largest_peak_mz': 'first',
        'average_mz': 'first',
        'monoisotopic_mz': 'first',
        'charge': 'first',
        'average_scan': 'first',
        'intensity': 'first',
        'parent_id': 'first',
    })

    fragments['mobility'] = fragments.apply(
        lambda r: r.raw_data.get_inverse_mobility_along_scan_marginal(), axis=1)

    # Spectrum id encodes frame, precursor and dataset name.
    fragments['spec_id'] = fragments.apply(
        lambda r: str(r['frame_id']) + '-' + str(r['precursor_id']) + '-' + ds_name, axis=1)

    fragments['sage_precursor'] = fragments.apply(lambda r: Precursor(
        # Fall back to the largest peak m/z if no monoisotopic m/z was assigned.
        mz=sanitize_mz(r['monoisotopic_mz'], r['largest_peak_mz']),
        intensity=r['intensity'],
        charge=sanitize_charge(r['charge']),
        isolation_window=Tolerance(da=(isolation_window_lower, isolation_window_upper)),
        collision_energy=r.collision_energy,
        inverse_ion_mobility=r.mobility,
    ), axis=1)

    # Create the processor ONCE instead of one instance per row inside apply.
    processor = SpectrumProcessor(take_top_n=take_top_n)

    fragments['processed_spec'] = fragments.apply(
        lambda r: get_searchable_spec(
            precursor=r.sage_precursor,
            raw_fragment_data=r.raw_data,
            spec_processor=processor,
            spec_id=r.spec_id,
            time=r['time'],
        ),
        axis=1
    )

    return fragments
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def transform_psm_to_pin(psm_df):
    """Transform a PSM DataFrame into the PIN format used by Percolator.

    Selects the supported columns, renames them to the PIN schema, drops
    all-NaN feature columns and incomplete rows, encodes decoys as Label=-1
    (targets as +1), and assigns sequential scan numbers.
    """
    columns_map = {
        'spec_idx': 'SpecId',
        'decoy': 'Label',
        'charge': 'Charge',
        'sequence': 'Peptide',
        'hyperscore': 'Feature1',
        'isotope_error': 'Feature2',
        'delta_mass': 'Feature3',
        'delta_rt': 'Feature4',
        'delta_ims': 'Feature5',
        'matched_peaks': 'Feature6',
        'matched_intensity_pct': 'Feature7',
        'intensity_ms1': 'Feature8',
        'intensity_ms2': 'Feature9',
        'average_ppm': 'Feature10',
        'poisson': 'Feature11',
        'spectral_entropy_similarity': 'Feature12',
        'spectral_correlation_similarity_pearson': 'Feature13',
        'spectral_correlation_similarity_spearman': 'Feature14',
        'spectral_normalized_intensity_difference': 'Feature15',
        'collision_energy': 'Feature16',
        'delta_next': 'Feature17',
        'delta_best': 'Feature18',
        'longest_b': 'Feature19',
        'longest_y': 'Feature20',
        'longest_y_pct': 'Feature21',
    }

    renamed = psm_df[list(columns_map.keys())].rename(columns=columns_map)
    # Drop feature columns that carry no data at all, then incomplete rows.
    pin = renamed.dropna(axis=1, how='all').dropna()

    # Percolator convention: decoys are labelled -1, targets +1.
    pin['Label'] = pin['Label'].apply(lambda is_decoy: -1 if is_decoy else 1)
    pin['ScanNr'] = range(1, len(pin) + 1)

    return pin
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
# Column renaming scheme for tims2rescore compatibility: maps imspy/sagepy
# PSM column names (left) to the SAGE-style output names (right) consumed by
# parse_to_tims2rescore.
full_renaming_scheme = {
    'spec_idx': 'psm_id',
    'sequence_modified': 'peptide',
    'ims': 'ion_mobility',
    'predicted_ims': 'predicted_mobility',
    'delta_ims': 'delta_mobility',
    'discriminant_score': 'sage_discriminant_score',
    'delta_mass': 'precursor_ppm',
    'average_ppm': 'fragment_ppm'
}

# Output column selection and ordering used by parse_to_tims2rescore; every
# listed column must exist in the renamed PSM table.
sage_target_columns = [
    'psm_id', 'peptide', 'proteins', 'num_proteins', 'filename', 'scannr',
    'rank', 'label', 'expmass', 'calcmass', 'charge', 'peptide_len',
    'missed_cleavages', 'semi_enzymatic', 'isotope_error', 'precursor_ppm',
    'fragment_ppm', 'hyperscore', 'delta_next', 'delta_best', 'rt',
    'aligned_rt', 'predicted_rt', 'delta_rt_model', 'ion_mobility',
    'predicted_mobility', 'delta_mobility', 'matched_peaks', 'longest_b',
    'longest_y', 'longest_y_pct', 'matched_intensity_pct',
    'scored_candidates', 'poisson', 'sage_discriminant_score',
    'posterior_error', 'spectrum_q', 'peptide_q', 'protein_q',
    'ms2_intensity'
]
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
def list_to_semicolon_string(value):
    """Join a list of protein names with ';'; pass any other value through."""
    return ";".join(value) if isinstance(value, list) else value
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def parse_to_tims2rescore(TDC, from_mgf: bool = False, file_name: str = None):
    """Parse PSM results into a tims2rescore-compatible SAGE-style table.

    Args:
        TDC: DataFrame of target-decoy-competed PSMs.
        from_mgf: Whether the spec ids originate from an MGF file (changes how
            filename and scan number are derived from ``spec_idx``).
        file_name: File name to report when ``from_mgf`` is True.

    Returns:
        DataFrame restricted to ``sage_target_columns``, sorted by spectrum
        q-value, with sequential psm ids and decoy proteins prefixed "rev_".
    """
    df = TDC.copy()

    # Filename and (0-based) scan number are encoded in spec_idx, with a
    # different layout depending on the source format.
    if from_mgf:
        df["filename"] = file_name
        df["scannr"] = df.spec_idx.apply(lambda i: int(i.split("-")[1]) - 1)
    else:
        df["filename"] = df.spec_idx.apply(lambda s: '-'.join(s.split('-')[3:]) + ".d")
        df["scannr"] = df.spec_idx.apply(lambda s: int(s.split('-')[2]) - 1)

    df["num_proteins"] = df.proteins.apply(lambda protein: len(parse_string_list(protein)))
    df["label"] = df.decoy.apply(lambda b: -1 if b else 1)
    df["peptide_len"] = df.sequence.apply(peptide_length)
    df["semi_enzymatic"] = False

    df = df.rename(columns=full_renaming_scheme)
    df = df[sage_target_columns]

    # Integer-typed SAGE columns.
    for column in ("rank", "charge", "missed_cleavages", "semi_enzymatic",
                   "scored_candidates", "matched_peaks", "longest_b", "longest_y"):
        df[column] = df[column].astype(int)

    df["proteins"] = df.proteins.apply(list_to_semicolon_string)
    df["proteins"] = df["proteins"].astype(str)

    df = df.sort_values(by="spectrum_q", ascending=True)
    df["psm_id"] = range(1, len(df) + 1)

    def mark_decoy_protein(protein, label):
        # SAGE convention: decoy protein accessions carry a "rev_" prefix.
        return f"rev_{protein}" if label == -1 else protein

    df["proteins"] = df.apply(lambda row: mark_decoy_protein(row["proteins"], row["label"]), axis=1)

    return df
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: imspy-search
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Database search functionality for timsTOF proteomics data using sagepy.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Author: theGreatHerrLebert
|
|
7
|
+
Author-email: davidteschner@googlemail.com
|
|
8
|
+
Requires-Python: >=3.11,<3.14
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Requires-Dist: imspy-core (>=0.4.0)
|
|
14
|
+
Requires-Dist: imspy-predictors (>=0.4.0)
|
|
15
|
+
Requires-Dist: matplotlib (>=3.5)
|
|
16
|
+
Requires-Dist: mokapot (>=0.9.0)
|
|
17
|
+
Requires-Dist: numba (>=0.53)
|
|
18
|
+
Requires-Dist: numpy (>=1.24)
|
|
19
|
+
Requires-Dist: pandas (>=2.0)
|
|
20
|
+
Requires-Dist: sagepy (>=0.4.0)
|
|
21
|
+
Requires-Dist: scikit-learn (>=1.0)
|
|
22
|
+
Requires-Dist: scipy (>=1.7.1)
|
|
23
|
+
Requires-Dist: toml (>=0.10)
|
|
24
|
+
Requires-Dist: tqdm (>=4.66)
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# imspy-search
|
|
28
|
+
|
|
29
|
+
Database search functionality for timsTOF proteomics data using sagepy.
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install imspy-search
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Features
|
|
38
|
+
|
|
39
|
+
- **Database Search**: SAGE-based database search for timsTOF DDA data
|
|
40
|
+
- **PSM Rescoring**: Machine learning-based rescoring of peptide-spectrum matches
|
|
41
|
+
- **FDR Control**: Target-decoy competition and q-value estimation
|
|
42
|
+
- **MGF Support**: Parse and search Bruker DataAnalysis MGF files
|
|
43
|
+
- **CLI Tools**: Command-line interfaces for common workflows
|
|
44
|
+
|
|
45
|
+
## Quick Start
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from imspy_search import (
|
|
49
|
+
extract_timstof_dda_data,
|
|
50
|
+
get_searchable_spec,
|
|
51
|
+
generate_balanced_rt_dataset,
|
|
52
|
+
generate_balanced_im_dataset,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
# Extract DDA data for database search
|
|
56
|
+
fragments = extract_timstof_dda_data(
|
|
57
|
+
path="path/to/data.d",
|
|
58
|
+
num_threads=16,
|
|
59
|
+
)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## CLI Tools
|
|
63
|
+
|
|
64
|
+
### imspy-dda
|
|
65
|
+
Full DDA search pipeline with intensity prediction and rescoring:
|
|
66
|
+
```bash
|
|
67
|
+
imspy-dda /path/to/data /path/to/fasta.fasta --config config.toml
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### imspy-ccs
|
|
71
|
+
Extract CCS values from DDA data for machine learning:
|
|
72
|
+
```bash
|
|
73
|
+
imspy-ccs --raw_data_path /path/to/data --fasta_path /path/to/fasta.fasta
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### imspy-rescore-sage
|
|
77
|
+
Rescore SAGE search results with deep learning features:
|
|
78
|
+
```bash
|
|
79
|
+
imspy-rescore-sage results.tsv fragments.tsv /output/path
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Submodules
|
|
83
|
+
|
|
84
|
+
- **utility**: Core utility functions for database search
|
|
85
|
+
- **sage_output_utility**: SAGE output processing and rescoring
|
|
86
|
+
- **mgf**: MGF file parsing for sagepy queries
|
|
87
|
+
- **rescoring**: PSM rescoring with deep learning features
|
|
88
|
+
- **dda_extensions**: TimsDatasetDDA extensions for sagepy
|
|
89
|
+
- **cli/**: Command-line interface tools
|
|
90
|
+
|
|
91
|
+
## Dependencies
|
|
92
|
+
|
|
93
|
+
- **imspy-core**: Core data structures (required)
|
|
94
|
+
- **imspy-predictors**: ML predictors for CCS, RT, intensity (required)
|
|
95
|
+
- **sagepy**: SAGE database search framework (required)
|
|
96
|
+
- **mokapot**: Machine learning for PSM scoring (required)
|
|
97
|
+
|
|
98
|
+
## Related Packages
|
|
99
|
+
|
|
100
|
+
- **imspy-core**: Core data structures and timsTOF readers
|
|
101
|
+
- **imspy-predictors**: ML-based predictors
|
|
102
|
+
- **imspy-simulation**: Simulation tools for timsTOF data
|
|
103
|
+
- **imspy-vis**: Visualization tools
|
|
104
|
+
|
|
105
|
+
## License
|
|
106
|
+
|
|
107
|
+
MIT License - see LICENSE file for details.
|
|
108
|
+
|