imspy-search 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,585 @@
+ """Utility functions for database search operations."""
+
+ import ast
+ import os
+ import re
+ import warnings
+ from typing import List, Tuple, Union, Dict, Optional
+
+ import pandas as pd
+ import numpy as np
+ from numpy.typing import NDArray
+
+ from sagepy.core import (
+     Precursor, RawSpectrum, ProcessedSpectrum, SpectrumProcessor,
+     Representation, Tolerance, ProcessedIMSpectrum
+ )
+ from sagepy.core.scoring import Psm, merge_psm_dicts
+ from sagepy.utility import get_features, psm_collection_to_pandas
+ from sagepy.qfdr.tdc import target_decoy_competition_pandas
+
+ from imspy_core.timstof import TimsDatasetDDA
+ from imspy_core.timstof.frame import TimsFrame
+ from imspy_core.utility import linear_map
+
+
+ def check_memory(
+         limit_in_gb: int = 16,
+         msg: str = "Warning: System has only {total_ram_gb:.2f}GB of RAM, which is below the recommended {limit_in_gb}GB."):
+     """Check if system has sufficient memory."""
+     if hasattr(os, "sysconf"):
+         total_ram_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
+         total_ram_gb = total_ram_bytes / (1024 ** 3)
+         if total_ram_gb < limit_in_gb:
+             msg = msg.format(total_ram_gb=total_ram_gb, limit_in_gb=limit_in_gb)
+             warnings.warn(msg)
+     else:
+         warnings.warn("Unable to determine system memory.")
+
+
+ def peptide_length(peptide: str) -> int:
+     """
+     Takes a peptide sequence as a string and returns its length,
+     excluding [UNIMOD:X] modifications.
+
+     Args:
+         peptide: A peptide sequence with possible UNIMOD modifications.
+
+     Returns:
+         The length of the peptide without modifications.
+     """
+     cleaned_peptide = re.sub(r'\[UNIMOD:\d+\]', '', peptide)
+     return len(cleaned_peptide)
+
+
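+ # Example (illustrative): modification tags are stripped before counting, so a
+ # modified and an unmodified sequence report the same length:
+ #
+ #     peptide_length("PEPTIC[UNIMOD:4]R")  # -> 7, the [UNIMOD:4] tag is ignored
+
+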
+ def parse_string_list(input_str: str) -> List[str]:
+     """
+     Takes a string representation of a list and converts it into an actual list of strings.
+
+     Args:
+         input_str: A string containing a list representation.
+
+     Returns:
+         A list of strings parsed from the input string.
+     """
+     if isinstance(input_str, list):
+         return input_str
+
+     try:
+         return ast.literal_eval(input_str)
+     except (SyntaxError, ValueError):
+         raise ValueError("Invalid list format")
+
+
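+ # Example (illustrative): a stringified protein list, as it may appear in a
+ # results table, round-trips back to a Python list:
+ #
+ #     parse_string_list("['sp|P12345|A', 'sp|P67890|B']")
+ #     # -> ['sp|P12345|A', 'sp|P67890|B']
+
+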
+ def merge_dicts_with_merge_dict(dicts):
+     """Merge multiple PSM dictionaries."""
+     d = None
+     for i, item in enumerate(dicts):
+         if i == 0:
+             d = item
+         else:
+             d = merge_psm_dicts(item, d)
+     return d
+
+
+ def map_to_domain(data, gradient_length: float = 120.0):
+     """
+     Maps the input data linearly into the domain [0, gradient_length].
+
+     Args:
+         data: list or numpy array of numerical values
+         gradient_length: the upper limit of the target domain [0, gradient_length]
+
+     Returns:
+         mapped_data: list of values mapped into the domain [0, gradient_length]
+     """
+     min_val = min(data)
+     max_val = max(data)
+
+     if max_val == min_val:
+         raise ValueError("All elements in data are the same. Linear mapping is not possible.")
+
+     mapped_data = [(gradient_length * (x - min_val) / (max_val - min_val)) for x in data]
+     return mapped_data
+
+
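+ # Example (illustrative): values spanning [10, 30] map onto a 120 minute
+ # gradient as [0, 60, 120]:
+ #
+ #     map_to_domain([10, 20, 30], gradient_length=120.0)  # -> [0.0, 60.0, 120.0]
+
+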
+ def sanitize_charge(charge: Optional[float]) -> Optional[int]:
+     """Sanitize charge value.
+
+     Args:
+         charge: Charge value as float.
+
+     Returns:
+         Charge value as int, or None if the charge is missing or NaN.
+     """
+     if charge is not None and not np.isnan(charge):
+         return int(charge)
+     return None
+
+
+ def sanitize_mz(mz: Optional[float], mz_highest: float) -> float:
+     """Sanitize mz value.
+
+     Args:
+         mz: Mz value as float.
+         mz_highest: Highest mz value, used as fallback.
+
+     Returns:
+         The mz value as float, or mz_highest if mz is missing or NaN.
+     """
+     if mz is not None and not np.isnan(mz):
+         return mz
+     return mz_highest
+
+
+ def split_fasta(fasta: str, num_splits: int = 16, randomize: bool = True) -> List[str]:
+     """Split a fasta file into multiple fasta files.
+
+     Args:
+         fasta: Fasta file as string.
+         num_splits: Number of splits fasta file should be split into.
+         randomize: Whether to randomize the order of sequences before splitting.
+
+     Returns:
+         List of up to num_splits fasta files as strings, each with an approximately equal number of sequences.
+     """
+     if num_splits == 1:
+         return [fasta]
+
+     split_strings = re.split(r'\n>', fasta)
+     print(f"Total number of sequences: {len(split_strings)} ...")
+
+     # after splitting on '\n>', only the first record keeps its leading '>';
+     # strip it so every record is bare, otherwise shuffling would move the
+     # marker into the middle of a batch and duplicate it on re-joining
+     if split_strings and split_strings[0].startswith('>'):
+         split_strings[0] = split_strings[0][1:]
+
+     if randomize:
+         np.random.shuffle(split_strings)
+
+     total_items = len(split_strings)
+     items_per_batch = total_items // num_splits
+     remainder = total_items % num_splits
+
+     fastas = []
+     start_index = 0
+
+     for i in range(num_splits):
+         extra = 1 if i < remainder else 0
+         stop_index = start_index + items_per_batch + extra
+
+         if start_index >= total_items:
+             break
+
+         # re-attach the '>' header marker to every record in the batch
+         batch = '>' + '\n>'.join(split_strings[start_index:stop_index])
+
+         fastas.append(batch)
+         start_index = stop_index
+
+     return fastas
+
+
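+ # Usage sketch (illustrative): split a FASTA string read from disk into four
+ # chunks while keeping the original record order:
+ #
+ #     with open("proteins.fasta") as f:
+ #         chunks = split_fasta(, num_splits=4, randomize=False)
+
+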
+ def get_ms1_ims_spectrum(
+         raw_spectrum: TimsFrame,
+         spec_processor: SpectrumProcessor,
+         time: float,
+         spec_id: str,
+         file_id: int = 0,
+         ms_level: int = 1) -> ProcessedIMSpectrum:
+     """
+     Get SAGE searchable spectrum from raw data.
+
+     Args:
+         raw_spectrum: TimsFrame object
+         spec_processor: SpectrumProcessor object
+         time: scan start time
+         spec_id: spectrum identifier
+         file_id: file identifier
+         ms_level: MS level
+
+     Returns:
+         ProcessedIMSpectrum object
+     """
+     spec = RawSpectrum(
+         file_id=file_id,
+         ms_level=ms_level,
+         spec_id=spec_id,
+         precursors=[],
+         representation=Representation(),
+         scan_start_time=time,
+         ion_injection_time=time,
+         total_ion_current=np.sum(raw_spectrum.intensity),
+         mz=raw_spectrum.mz.astype(np.float32),
+         intensity=raw_spectrum.intensity.astype(np.float32),
+         mobility=raw_spectrum.mobility.astype(np.float32),
+     )
+     return spec_processor.process_with_mobility(spec)
+
+
+ def get_searchable_spec(
+         precursor: Precursor,
+         raw_fragment_data: TimsFrame,
+         spec_processor: SpectrumProcessor,
+         time: float,
+         spec_id: str,
+         file_id: int = 0,
+         ms_level: int = 2) -> ProcessedSpectrum:
+     """
+     Get SAGE searchable spectrum from raw data.
+
+     Args:
+         precursor: Precursor object
+         raw_fragment_data: TimsFrame object
+         spec_processor: SpectrumProcessor object
+         time: scan start time
+         spec_id: spectrum identifier
+         file_id: file identifier
+         ms_level: MS level
+
+     Returns:
+         ProcessedSpectrum object
+     """
+     flat_spec = raw_fragment_data.to_indexed_mz_spectrum()
+
+     spec = RawSpectrum(
+         file_id=file_id,
+         ms_level=ms_level,
+         spec_id=spec_id,
+         representation=Representation(),
+         precursors=[precursor],
+         scan_start_time=time,
+         ion_injection_time=time,
+         total_ion_current=np.sum(flat_spec.intensity),
+         mz=flat_spec.mz.astype(np.float32),
+         intensity=flat_spec.intensity.astype(np.float32)
+     )
+
+     processed_spec = spec_processor.process(spec)
+     return processed_spec
+
+
+ def write_psms_binary(byte_array, folder_path: str, file_name: str, total: bool = False):
+     """Write PSMs to binary file.
+
+     Args:
+         byte_array: Byte array
+         folder_path: Folder path
+         file_name: File name
+         total: Whether to write to the total folder instead of the psm subfolder
+     """
+     os.makedirs(f'{folder_path}/imspy/psm', exist_ok=True)
+
+     if total:
+         file_path = f'{folder_path}/imspy/{file_name}.bin'
+     else:
+         file_path = f'{folder_path}/imspy/psm/{file_name}.bin'
+
+     # remove a stale file at the actual target path before writing
+     if os.path.exists(file_path):
+         os.remove(file_path)
+
+     with open(file_path, 'wb') as file:
+         file.write(bytearray(byte_array))
+
+
+ def generate_training_data(
+         psms: List[Psm],
+         method: str = "psm",
+         q_max: float = 0.01,
+         balance: bool = True
+ ) -> Tuple[NDArray, NDArray]:
+     """Generate training data.
+
+     Args:
+         psms: List of Psm objects
+         method: Method to use for training data generation
+         q_max: Maximum q-value allowed for positive examples
+         balance: Whether to balance the dataset
+
+     Returns:
+         Tuple of X_train and Y_train
+     """
+     PSM_pandas = psm_collection_to_pandas(psms)
+     PSM_q = target_decoy_competition_pandas(PSM_pandas, method=method)
+     PSM_pandas = PSM_pandas.drop(columns=["hyperscore"])
+
+     TDC = pd.merge(PSM_q, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"],
+                    right_on=["spec_idx", "match_idx", "decoy"])
+
+     TARGET = TDC[(~TDC.decoy) & (TDC.q_value <= q_max)]
+     X_target, Y_target = get_features(TARGET)
+
+     DECOY = TDC[TDC.decoy]
+     X_decoy, Y_decoy = get_features(DECOY)
+
+     if balance:
+         # sub-sample targets without replacement so that targets and decoys
+         # contribute equally to the training set
+         num_target = min(len(X_target), len(X_decoy))
+         target_indices = np.random.choice(np.arange(len(X_target)), size=num_target, replace=False)
+         X_target = X_target[target_indices, :]
+         Y_target = Y_target[target_indices]
+
+     X_train = np.vstack((X_target, X_decoy))
+     Y_train = np.hstack((Y_target, Y_decoy))
+
+     return X_train, Y_train
+
+
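+ # Usage sketch (illustrative): the returned feature matrix and labels can be
+ # fed to any scikit-learn style classifier for PSM rescoring:
+ #
+ #     from sklearn.linear_model import LogisticRegression
+ #     X_train, Y_train = generate_training_data(psm_list, q_max=0.01)
+ #     clf = LogisticRegression().fit(X_train, Y_train)
+
+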
+ def split_psms(psms: List[Psm], num_splits: int = 10) -> List[List[Psm]]:
+     """Split PSMs into multiple splits.
+
+     Args:
+         psms: List of Psm objects
+         num_splits: Number of splits
+
+     Returns:
+         List of splits
+     """
+     split_size = len(psms) // num_splits
+     remainder = len(psms) % num_splits
+     splits = []
+     start_index = 0
+
+     for i in range(num_splits):
+         end_index = start_index + split_size + (1 if i < remainder else 0)
+         splits.append(psms[start_index:end_index])
+         start_index = end_index
+
+     return splits
+
+
+ def generate_balanced_rt_dataset(psms: Union[List[Psm], Dict[str, List[Psm]]]) -> List[Psm]:
+     """Generate balanced retention time dataset for training."""
+     psm_list = []
+     if isinstance(psms, dict):
+         for key in psms:
+             psm_list.extend(psms[key])
+     else:
+         psm_list = psms
+
+     PSM_pandas = psm_collection_to_pandas(psm_list)
+     PSM_q = target_decoy_competition_pandas(PSM_pandas, method="psm", score="hyperscore")
+     PSM_pandas_dropped = PSM_pandas.drop(columns=["hyperscore"])
+
+     TDC = pd.merge(PSM_q, PSM_pandas_dropped, left_on=["spec_idx", "match_idx", "decoy"],
+                    right_on=["spec_idx", "match_idx", "decoy"])
+     TDC = TDC[(~TDC.decoy) & (TDC.q_value <= 0.01)].drop_duplicates(subset="sequence")
+
+     id_set = set(TDC.spec_idx.values)
+     r_list = list(filter(lambda p: p.spec_idx in id_set and p.rank == 1, psm_list))
+
+     return r_list
+
+
+ def generate_balanced_im_dataset(psms: Union[List[Psm], Dict[str, List[Psm]]]) -> List[Psm]:
+     """Generate balanced ion mobility dataset for training."""
+     psm_list = []
+     if isinstance(psms, dict):
+         for key in psms:
+             psm_list.extend(psms[key])
+     else:
+         psm_list = psms
+
+     PSM_pandas = psm_collection_to_pandas(psm_list)
+     PSM_q = target_decoy_competition_pandas(PSM_pandas, method="psm", score="hyperscore")
+     PSM_pandas_dropped = PSM_pandas.drop(columns=["hyperscore"])
+
+     TDC = pd.merge(PSM_q, PSM_pandas_dropped, left_on=["spec_idx", "match_idx", "decoy"],
+                    right_on=["spec_idx", "match_idx", "decoy"])
+     TDC = TDC[(~TDC.decoy) & (TDC.q_value <= 0.01)].drop_duplicates(subset=["sequence", "charge"])
+     id_set = set(TDC.spec_idx.values)
+
+     im_list = list(filter(lambda p: p.spec_idx in id_set and p.rank == 1, psm_list))
+     return im_list
+
+
+ def extract_timstof_dda_data(
+         path: str,
+         in_memory: bool = False,
+         use_bruker_sdk: bool = False,
+         isolation_window_lower: float = -3.0,
+         isolation_window_upper: float = 3.0,
+         take_top_n: int = 100,
+         num_threads: int = 16,
+ ) -> pd.DataFrame:
+     """
+     Extract timsTOF DDA data from a Bruker timsTOF TDF file.
+
+     Args:
+         path: Path to timsTOF DDA data
+         in_memory: Whether to load data in memory
+         use_bruker_sdk: Whether to use the Bruker SDK for data extraction
+         isolation_window_lower: Lower bound for isolation window (Da)
+         isolation_window_upper: Upper bound for isolation window (Da)
+         take_top_n: Number of top peaks to take
+         num_threads: Number of threads to use
+
+     Returns:
+         DataFrame containing timsTOF DDA data
+     """
+     ds_name = os.path.basename(path)
+
+     dataset = TimsDatasetDDA(path, in_memory=in_memory, use_bruker_sdk=use_bruker_sdk)
+     fragments = dataset.get_pasef_fragments(num_threads=num_threads)
+
+     # collapse PASEF fragments belonging to the same precursor into one row,
+     # summing the raw fragment data
+     fragments = fragments.groupby('precursor_id').agg({
+         'frame_id': 'first',
+         'time': 'first',
+         'precursor_id': 'first',
+         'raw_data': 'sum',
+         'scan_begin': 'first',
+         'scan_end': 'first',
+         'isolation_mz': 'first',
+         'isolation_width': 'first',
+         'collision_energy': 'first',
+         'largest_peak_mz': 'first',
+         'average_mz': 'first',
+         'monoisotopic_mz': 'first',
+         'charge': 'first',
+         'average_scan': 'first',
+         'intensity': 'first',
+         'parent_id': 'first',
+     })
+
+     mobility = fragments.apply(lambda r: r.raw_data.get_inverse_mobility_along_scan_marginal(), axis=1)
+     fragments['mobility'] = mobility
+
+     spec_id = fragments.apply(lambda r: str(r['frame_id']) + '-' + str(r['precursor_id']) + '-' + ds_name, axis=1)
+     fragments['spec_id'] = spec_id
+
+     sage_precursor = fragments.apply(lambda r: Precursor(
+         mz=sanitize_mz(r['monoisotopic_mz'], r['largest_peak_mz']),
+         intensity=r['intensity'],
+         charge=sanitize_charge(r['charge']),
+         isolation_window=Tolerance(da=(isolation_window_lower, isolation_window_upper)),
+         collision_energy=r.collision_energy,
+         inverse_ion_mobility=r.mobility,
+     ), axis=1)
+
+     fragments['sage_precursor'] = sage_precursor
+
+     processed_spec = fragments.apply(
+         lambda r: get_searchable_spec(
+             precursor=r.sage_precursor,
+             raw_fragment_data=r.raw_data,
+             spec_processor=SpectrumProcessor(take_top_n=take_top_n),
+             spec_id=r.spec_id,
+             time=r['time'],
+         ),
+         axis=1
+     )
+
+     fragments['processed_spec'] = processed_spec
+
+     return fragments
+
+
+ def transform_psm_to_pin(psm_df):
+     """Transform PSM DataFrame to PIN format for Percolator."""
+     columns_map = {
+         'spec_idx': 'SpecId',
+         'decoy': 'Label',
+         'charge': 'Charge',
+         'sequence': 'Peptide',
+         'hyperscore': 'Feature1',
+         'isotope_error': 'Feature2',
+         'delta_mass': 'Feature3',
+         'delta_rt': 'Feature4',
+         'delta_ims': 'Feature5',
+         'matched_peaks': 'Feature6',
+         'matched_intensity_pct': 'Feature7',
+         'intensity_ms1': 'Feature8',
+         'intensity_ms2': 'Feature9',
+         'average_ppm': 'Feature10',
+         'poisson': 'Feature11',
+         'spectral_entropy_similarity': 'Feature12',
+         'spectral_correlation_similarity_pearson': 'Feature13',
+         'spectral_correlation_similarity_spearman': 'Feature14',
+         'spectral_normalized_intensity_difference': 'Feature15',
+         'collision_energy': 'Feature16',
+         'delta_next': 'Feature17',
+         'delta_best': 'Feature18',
+         'longest_b': 'Feature19',
+         'longest_y': 'Feature20',
+         'longest_y_pct': 'Feature21',
+     }
+
+     psm_df = psm_df[list(columns_map.keys())]
+     df_pin = psm_df.rename(columns=columns_map)
+     df_pin_clean = df_pin.dropna(axis=1, how='all')
+     df_pin_clean = df_pin_clean.dropna()
+
+     df_pin_clean['Label'] = df_pin_clean['Label'].apply(lambda x: -1 if x else 1)
+     df_pin_clean['ScanNr'] = range(1, len(df_pin_clean) + 1)
+
+     return df_pin_clean
+
+
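+ # Usage sketch (illustrative): write a tab-separated Percolator input file
+ # from a PSM table:
+ #
+ #     df_pin = transform_psm_to_pin(psm_df)
+ #     df_pin.to_csv("psms.pin", sep="\t", index=False)
+
+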
+ # Column renaming scheme for tims2rescore compatibility
+ full_renaming_scheme = {
+     'spec_idx': 'psm_id',
+     'sequence_modified': 'peptide',
+     'ims': 'ion_mobility',
+     'predicted_ims': 'predicted_mobility',
+     'delta_ims': 'delta_mobility',
+     'discriminant_score': 'sage_discriminant_score',
+     'delta_mass': 'precursor_ppm',
+     'average_ppm': 'fragment_ppm'
+ }
+
+ sage_target_columns = [
+     'psm_id', 'peptide', 'proteins', 'num_proteins', 'filename', 'scannr',
+     'rank', 'label', 'expmass', 'calcmass', 'charge', 'peptide_len',
+     'missed_cleavages', 'semi_enzymatic', 'isotope_error', 'precursor_ppm',
+     'fragment_ppm', 'hyperscore', 'delta_next', 'delta_best', 'rt',
+     'aligned_rt', 'predicted_rt', 'delta_rt_model', 'ion_mobility',
+     'predicted_mobility', 'delta_mobility', 'matched_peaks', 'longest_b',
+     'longest_y', 'longest_y_pct', 'matched_intensity_pct',
+     'scored_candidates', 'poisson', 'sage_discriminant_score',
+     'posterior_error', 'spectrum_q', 'peptide_q', 'protein_q',
+     'ms2_intensity'
+ ]
+
+
+ def list_to_semicolon_string(value):
+     """Converts a list of proteins into a semicolon-separated string."""
+     if isinstance(value, list):
+         return ";".join(value)
+     return value
+
+
+ def parse_to_tims2rescore(TDC, from_mgf: bool = False, file_name: str = None):
+     """Parse PSM results to tims2rescore-compatible format."""
+     TDC_tmp = TDC.copy()
+     TDC_tmp["filename"] = file_name if from_mgf else TDC_tmp.spec_idx.apply(lambda s: '-'.join(s.split('-')[3:]) + ".d")
+     TDC_tmp["scannr"] = TDC_tmp.spec_idx.apply(lambda i: int(i.split("-")[1]) - 1) if from_mgf else TDC_tmp.spec_idx.apply(lambda s: int(s.split('-')[2]) - 1)
+     TDC_tmp["num_proteins"] = TDC_tmp.proteins.apply(lambda protein: len(parse_string_list(protein)))
+     TDC_tmp["label"] = TDC_tmp.decoy.apply(lambda b: -1 if b else 1)
+     TDC_tmp["peptide_len"] = TDC_tmp.sequence.apply(peptide_length)
+     TDC_tmp["semi_enzymatic"] = False
+     TDC_tmp = TDC_tmp.rename(columns=full_renaming_scheme)
+     TDC_tmp = TDC_tmp[sage_target_columns]
+     TDC_tmp["rank"] = TDC_tmp["rank"].astype(int)
+     TDC_tmp["charge"] = TDC_tmp["charge"].astype(int)
+     TDC_tmp["missed_cleavages"] = TDC_tmp["missed_cleavages"].astype(int)
+     TDC_tmp["semi_enzymatic"] = TDC_tmp["semi_enzymatic"].astype(int)
+     TDC_tmp["scored_candidates"] = TDC_tmp["scored_candidates"].astype(int)
+     TDC_tmp["matched_peaks"] = TDC_tmp["matched_peaks"].astype(int)
+     TDC_tmp["longest_b"] = TDC_tmp["longest_b"].astype(int)
+     TDC_tmp["longest_y"] = TDC_tmp["longest_y"].astype(int)
+     TDC_tmp["proteins"] = TDC_tmp.proteins.apply(list_to_semicolon_string)
+     TDC_tmp["proteins"] = TDC_tmp["proteins"].astype(str)
+
+     TDC_tmp = TDC_tmp.sort_values(by="spectrum_q", ascending=True)
+     TDC_tmp["psm_id"] = range(1, len(TDC_tmp) + 1)
+
+     def add_rev_prefix(protein, label):
+         # decoy hits (label == -1) get the conventional 'rev_' protein prefix
+         if label == -1:
+             return f"rev_{protein}"
+         return protein
+
+     TDC_tmp["proteins"] = TDC_tmp.apply(lambda row: add_rev_prefix(row["proteins"], row["label"]), axis=1)
+
+     return TDC_tmp
@@ -0,0 +1,108 @@
+ Metadata-Version: 2.4
+ Name: imspy-search
+ Version: 0.4.0
+ Summary: Database search functionality for timsTOF proteomics data using sagepy.
+ License-Expression: MIT
+ Author: theGreatHerrLebert
+ Author-email: davidteschner@googlemail.com
+ Requires-Python: >=3.11,<3.14
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: imspy-core (>=0.4.0)
+ Requires-Dist: imspy-predictors (>=0.4.0)
+ Requires-Dist: matplotlib (>=3.5)
+ Requires-Dist: mokapot (>=0.9.0)
+ Requires-Dist: numba (>=0.53)
+ Requires-Dist: numpy (>=1.24)
+ Requires-Dist: pandas (>=2.0)
+ Requires-Dist: sagepy (>=0.4.0)
+ Requires-Dist: scikit-learn (>=1.0)
+ Requires-Dist: scipy (>=1.7.1)
+ Requires-Dist: toml (>=0.10)
+ Requires-Dist: tqdm (>=4.66)
+ Description-Content-Type: text/markdown
+
+ # imspy-search
+
+ Database search functionality for timsTOF proteomics data using sagepy.
+
+ ## Installation
+
+ ```bash
+ pip install imspy-search
+ ```
+
+ ## Features
+
+ - **Database Search**: SAGE-based database search for timsTOF DDA data
+ - **PSM Rescoring**: Machine learning-based rescoring of peptide-spectrum matches
+ - **FDR Control**: Target-decoy competition and q-value estimation (see the sketch below)
+ - **MGF Support**: Parse and search Bruker DataAnalysis MGF files
+ - **CLI Tools**: Command-line interfaces for common workflows
+
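+ A minimal sketch of the FDR-control step, assuming a list of sagepy `Psm`
+ objects named `psm_list` (the helper functions are part of `sagepy`, which this
+ package uses internally):
+
+ ```python
+ from sagepy.utility import psm_collection_to_pandas
+ from sagepy.qfdr.tdc import target_decoy_competition_pandas
+
+ # PSM-level target-decoy competition with q-value estimation
+ psm_df = psm_collection_to_pandas(psm_list)
+ psm_q = target_decoy_competition_pandas(psm_df, method="psm", score="hyperscore")
+
+ # keep confident target identifications at 1% FDR
+ hits = psm_q[(~psm_q.decoy) & (psm_q.q_value <= 0.01)]
+ ```
+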
+ ## Quick Start
+
+ ```python
+ from imspy_search import (
+     extract_timstof_dda_data,
+     get_searchable_spec,
+     generate_balanced_rt_dataset,
+     generate_balanced_im_dataset,
+ )
+
+ # Extract DDA data for database search
+ fragments = extract_timstof_dda_data(
+     path="path/to/data.d",
+     num_threads=16,
+ )
+ ```
+
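+ Once a search has produced PSMs, the balanced-dataset helpers imported above
+ can assemble training data for retention-time and ion-mobility predictors. A
+ minimal sketch, assuming `psm_list` is a list of sagepy `Psm` objects:
+
+ ```python
+ # confident, rank-1, sequence-unique PSMs for RT model training
+ rt_psms = generate_balanced_rt_dataset(psm_list)
+
+ # confident, rank-1, (sequence, charge)-unique PSMs for ion mobility training
+ im_psms = generate_balanced_im_dataset(psm_list)
+ ```
+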
+ ## CLI Tools
+
+ ### imspy-dda
+ Full DDA search pipeline with intensity prediction and rescoring:
+ ```bash
+ imspy-dda /path/to/data /path/to/fasta.fasta --config config.toml
+ ```
+
+ ### imspy-ccs
+ Extract CCS values from DDA data for machine learning:
+ ```bash
+ imspy-ccs --raw_data_path /path/to/data --fasta_path /path/to/fasta.fasta
+ ```
+
+ ### imspy-rescore-sage
+ Rescore SAGE search results with deep learning features:
+ ```bash
+ imspy-rescore-sage results.tsv fragments.tsv /output/path
+ ```
+
+ ## Submodules
+
+ - **utility**: Core utility functions for database search
+ - **sage_output_utility**: SAGE output processing and rescoring
+ - **mgf**: MGF file parsing for sagepy queries
+ - **rescoring**: PSM rescoring with deep learning features
+ - **dda_extensions**: TimsDatasetDDA extensions for sagepy
+ - **cli/**: Command-line interface tools
+
+ ## Dependencies
+
+ - **imspy-core**: Core data structures (required)
+ - **imspy-predictors**: ML predictors for CCS, RT, intensity (required)
+ - **sagepy**: SAGE database search framework (required)
+ - **mokapot**: Machine learning for PSM scoring (required)
+
+ ## Related Packages
+
+ - **imspy-core**: Core data structures and timsTOF readers
+ - **imspy-predictors**: ML-based predictors
+ - **imspy-simulation**: Simulation tools for timsTOF data
+ - **imspy-vis**: Visualization tools
+
+ ## License
+
+ MIT License - see LICENSE file for details.
+