imspy-search 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imspy_search-0.4.0/PKG-INFO +108 -0
- imspy_search-0.4.0/README.md +81 -0
- imspy_search-0.4.0/pyproject.toml +44 -0
- imspy_search-0.4.0/src/imspy_search/__init__.py +126 -0
- imspy_search-0.4.0/src/imspy_search/cli/__init__.py +11 -0
- imspy_search-0.4.0/src/imspy_search/cli/imspy_ccs.py +322 -0
- imspy_search-0.4.0/src/imspy_search/cli/imspy_dda.py +836 -0
- imspy_search-0.4.0/src/imspy_search/cli/imspy_rescore_sage.py +289 -0
- imspy_search-0.4.0/src/imspy_search/configs/config_ccs.toml +15 -0
- imspy_search-0.4.0/src/imspy_search/configs/config_hla.toml +83 -0
- imspy_search-0.4.0/src/imspy_search/configs/config_tryptic.toml +84 -0
- imspy_search-0.4.0/src/imspy_search/dda_extensions.py +209 -0
- imspy_search-0.4.0/src/imspy_search/mgf.py +139 -0
- imspy_search-0.4.0/src/imspy_search/rescoring.py +166 -0
- imspy_search-0.4.0/src/imspy_search/sage_output_utility.py +318 -0
- imspy_search-0.4.0/src/imspy_search/utility.py +585 -0
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: imspy-search
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Database search functionality for timsTOF proteomics data using sagepy.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Author: theGreatHerrLebert
|
|
7
|
+
Author-email: davidteschner@googlemail.com
|
|
8
|
+
Requires-Python: >=3.11,<3.14
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Requires-Dist: imspy-core (>=0.4.0)
|
|
14
|
+
Requires-Dist: imspy-predictors (>=0.4.0)
|
|
15
|
+
Requires-Dist: matplotlib (>=3.5)
|
|
16
|
+
Requires-Dist: mokapot (>=0.9.0)
|
|
17
|
+
Requires-Dist: numba (>=0.53)
|
|
18
|
+
Requires-Dist: numpy (>=1.24)
|
|
19
|
+
Requires-Dist: pandas (>=2.0)
|
|
20
|
+
Requires-Dist: sagepy (>=0.4.0)
|
|
21
|
+
Requires-Dist: scikit-learn (>=1.0)
|
|
22
|
+
Requires-Dist: scipy (>=1.7.1)
|
|
23
|
+
Requires-Dist: toml (>=0.10)
|
|
24
|
+
Requires-Dist: tqdm (>=4.66)
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# imspy-search
|
|
28
|
+
|
|
29
|
+
Database search functionality for timsTOF proteomics data using sagepy.
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install imspy-search
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Features
|
|
38
|
+
|
|
39
|
+
- **Database Search**: SAGE-based database search for timsTOF DDA data
|
|
40
|
+
- **PSM Rescoring**: Machine learning-based rescoring of peptide-spectrum matches
|
|
41
|
+
- **FDR Control**: Target-decoy competition and q-value estimation
|
|
42
|
+
- **MGF Support**: Parse and search Bruker DataAnalysis MGF files
|
|
43
|
+
- **CLI Tools**: Command-line interfaces for common workflows
|
|
44
|
+
|
|
45
|
+
## Quick Start
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from imspy_search import (
|
|
49
|
+
extract_timstof_dda_data,
|
|
50
|
+
get_searchable_spec,
|
|
51
|
+
generate_balanced_rt_dataset,
|
|
52
|
+
generate_balanced_im_dataset,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
# Extract DDA data for database search
|
|
56
|
+
fragments = extract_timstof_dda_data(
|
|
57
|
+
path="path/to/data.d",
|
|
58
|
+
num_threads=16,
|
|
59
|
+
)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## CLI Tools
|
|
63
|
+
|
|
64
|
+
### imspy-dda
|
|
65
|
+
Full DDA search pipeline with intensity prediction and rescoring:
|
|
66
|
+
```bash
|
|
67
|
+
imspy-dda /path/to/data /path/to/fasta.fasta --config config.toml
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### imspy-ccs
|
|
71
|
+
Extract CCS values from DDA data for machine learning:
|
|
72
|
+
```bash
|
|
73
|
+
imspy-ccs --raw_data_path /path/to/data --fasta_path /path/to/fasta.fasta
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### imspy-rescore-sage
|
|
77
|
+
Rescore SAGE search results with deep learning features:
|
|
78
|
+
```bash
|
|
79
|
+
imspy-rescore-sage results.tsv fragments.tsv /output/path
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Submodules
|
|
83
|
+
|
|
84
|
+
- **utility**: Core utility functions for database search
|
|
85
|
+
- **sage_output_utility**: SAGE output processing and rescoring
|
|
86
|
+
- **mgf**: MGF file parsing for sagepy queries
|
|
87
|
+
- **rescoring**: PSM rescoring with deep learning features
|
|
88
|
+
- **dda_extensions**: TimsDatasetDDA extensions for sagepy
|
|
89
|
+
- **cli/**: Command-line interface tools
|
|
90
|
+
|
|
91
|
+
## Dependencies
|
|
92
|
+
|
|
93
|
+
- **imspy-core**: Core data structures (required)
|
|
94
|
+
- **imspy-predictors**: ML predictors for CCS, RT, intensity (required)
|
|
95
|
+
- **sagepy**: SAGE database search framework (required)
|
|
96
|
+
- **mokapot**: Machine learning for PSM scoring (required)
|
|
97
|
+
|
|
98
|
+
## Related Packages
|
|
99
|
+
|
|
100
|
+
- **imspy-core**: Core data structures and timsTOF readers
|
|
101
|
+
- **imspy-predictors**: ML-based predictors
|
|
102
|
+
- **imspy-simulation**: Simulation tools for timsTOF data
|
|
103
|
+
- **imspy-vis**: Visualization tools
|
|
104
|
+
|
|
105
|
+
## License
|
|
106
|
+
|
|
107
|
+
MIT License - see LICENSE file for details.
|
|
108
|
+
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# imspy-search
|
|
2
|
+
|
|
3
|
+
Database search functionality for timsTOF proteomics data using sagepy.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install imspy-search
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Features
|
|
12
|
+
|
|
13
|
+
- **Database Search**: SAGE-based database search for timsTOF DDA data
|
|
14
|
+
- **PSM Rescoring**: Machine learning-based rescoring of peptide-spectrum matches
|
|
15
|
+
- **FDR Control**: Target-decoy competition and q-value estimation
|
|
16
|
+
- **MGF Support**: Parse and search Bruker DataAnalysis MGF files
|
|
17
|
+
- **CLI Tools**: Command-line interfaces for common workflows
|
|
18
|
+
|
|
19
|
+
## Quick Start
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
from imspy_search import (
|
|
23
|
+
extract_timstof_dda_data,
|
|
24
|
+
get_searchable_spec,
|
|
25
|
+
generate_balanced_rt_dataset,
|
|
26
|
+
generate_balanced_im_dataset,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
# Extract DDA data for database search
|
|
30
|
+
fragments = extract_timstof_dda_data(
|
|
31
|
+
path="path/to/data.d",
|
|
32
|
+
num_threads=16,
|
|
33
|
+
)
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## CLI Tools
|
|
37
|
+
|
|
38
|
+
### imspy-dda
|
|
39
|
+
Full DDA search pipeline with intensity prediction and rescoring:
|
|
40
|
+
```bash
|
|
41
|
+
imspy-dda /path/to/data /path/to/fasta.fasta --config config.toml
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### imspy-ccs
|
|
45
|
+
Extract CCS values from DDA data for machine learning:
|
|
46
|
+
```bash
|
|
47
|
+
imspy-ccs --raw_data_path /path/to/data --fasta_path /path/to/fasta.fasta
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### imspy-rescore-sage
|
|
51
|
+
Rescore SAGE search results with deep learning features:
|
|
52
|
+
```bash
|
|
53
|
+
imspy-rescore-sage results.tsv fragments.tsv /output/path
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Submodules
|
|
57
|
+
|
|
58
|
+
- **utility**: Core utility functions for database search
|
|
59
|
+
- **sage_output_utility**: SAGE output processing and rescoring
|
|
60
|
+
- **mgf**: MGF file parsing for sagepy queries
|
|
61
|
+
- **rescoring**: PSM rescoring with deep learning features
|
|
62
|
+
- **dda_extensions**: TimsDatasetDDA extensions for sagepy
|
|
63
|
+
- **cli/**: Command-line interface tools
|
|
64
|
+
|
|
65
|
+
## Dependencies
|
|
66
|
+
|
|
67
|
+
- **imspy-core**: Core data structures (required)
|
|
68
|
+
- **imspy-predictors**: ML predictors for CCS, RT, intensity (required)
|
|
69
|
+
- **sagepy**: SAGE database search framework (required)
|
|
70
|
+
- **mokapot**: Machine learning for PSM scoring (required)
|
|
71
|
+
|
|
72
|
+
## Related Packages
|
|
73
|
+
|
|
74
|
+
- **imspy-core**: Core data structures and timsTOF readers
|
|
75
|
+
- **imspy-predictors**: ML-based predictors
|
|
76
|
+
- **imspy-simulation**: Simulation tools for timsTOF data
|
|
77
|
+
- **imspy-vis**: Visualization tools
|
|
78
|
+
|
|
79
|
+
## License
|
|
80
|
+
|
|
81
|
+
MIT License - see LICENSE file for details.
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "imspy-search"
|
|
3
|
+
version = "0.4.0"
|
|
4
|
+
description = "Database search functionality for timsTOF proteomics data using sagepy."
|
|
5
|
+
authors = [
|
|
6
|
+
{ name = "theGreatHerrLebert", email = "davidteschner@googlemail.com" }
|
|
7
|
+
]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
license = "MIT"
|
|
10
|
+
requires-python = ">=3.11,<3.14"
|
|
11
|
+
|
|
12
|
+
dependencies = [
|
|
13
|
+
"imspy-core>=0.4.0",
|
|
14
|
+
"imspy-predictors>=0.4.0",
|
|
15
|
+
"sagepy>=0.4.0",
|
|
16
|
+
"mokapot>=0.9.0",
|
|
17
|
+
"scikit-learn>=1.0",
|
|
18
|
+
"pandas>=2.0",
|
|
19
|
+
"numpy>=1.24",
|
|
20
|
+
"numba>=0.53",
|
|
21
|
+
"toml>=0.10",
|
|
22
|
+
"tqdm>=4.66",
|
|
23
|
+
"matplotlib>=3.5",
|
|
24
|
+
"scipy>=1.7.1",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.scripts]
|
|
28
|
+
imspy-dda = "imspy_search.cli.imspy_dda:main"
|
|
29
|
+
imspy-ccs = "imspy_search.cli.imspy_ccs:main"
|
|
30
|
+
imspy-rescore-sage = "imspy_search.cli.imspy_rescore_sage:main"
|
|
31
|
+
|
|
32
|
+
[build-system]
|
|
33
|
+
requires = ["poetry-core"]
|
|
34
|
+
build-backend = "poetry.core.masonry.api"
|
|
35
|
+
|
|
36
|
+
[tool.poetry.group.dev.dependencies]
|
|
37
|
+
pytest = "^8.0.0"
|
|
38
|
+
pytest-cov = "^4.1.0"
|
|
39
|
+
|
|
40
|
+
[tool.pytest.ini_options]
|
|
41
|
+
testpaths = ["tests"]
|
|
42
|
+
python_files = ["test_*.py"]
|
|
43
|
+
python_functions = ["test_*"]
|
|
44
|
+
addopts = "-v --tb=short"
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""
|
|
2
|
+
imspy_search - Database search functionality for timsTOF proteomics data using sagepy.
|
|
3
|
+
|
|
4
|
+
This package provides database search, PSM rescoring, and FDR control for timsTOF DDA data.
|
|
5
|
+
|
|
6
|
+
Requires imspy-core and imspy-predictors for core data structures and ML predictors.
|
|
7
|
+
|
|
8
|
+
Core functionality:
|
|
9
|
+
- SAGE-based database search for timsTOF DDA data
|
|
10
|
+
- Machine learning-based PSM rescoring
|
|
11
|
+
- Target-decoy competition and q-value estimation
|
|
12
|
+
- MGF file parsing for Bruker DataAnalysis output
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
__version__ = "0.4.0"
|
|
16
|
+
|
|
17
|
+
# Core utility functions
|
|
18
|
+
from imspy_search.utility import (
|
|
19
|
+
# Data extraction and preprocessing
|
|
20
|
+
extract_timstof_dda_data,
|
|
21
|
+
get_searchable_spec,
|
|
22
|
+
get_ms1_ims_spectrum,
|
|
23
|
+
# FASTA handling
|
|
24
|
+
split_fasta,
|
|
25
|
+
# PSM handling
|
|
26
|
+
generate_training_data,
|
|
27
|
+
split_psms,
|
|
28
|
+
generate_balanced_rt_dataset,
|
|
29
|
+
generate_balanced_im_dataset,
|
|
30
|
+
# Helper functions
|
|
31
|
+
linear_map,
|
|
32
|
+
map_to_domain,
|
|
33
|
+
sanitize_charge,
|
|
34
|
+
sanitize_mz,
|
|
35
|
+
write_psms_binary,
|
|
36
|
+
merge_dicts_with_merge_dict,
|
|
37
|
+
check_memory,
|
|
38
|
+
# Output formatting
|
|
39
|
+
transform_psm_to_pin,
|
|
40
|
+
parse_to_tims2rescore,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
# SAGE output processing
|
|
44
|
+
from imspy_search.sage_output_utility import (
|
|
45
|
+
re_score_psms as re_score_psms_lda,
|
|
46
|
+
generate_training_data as generate_training_data_df,
|
|
47
|
+
split_dataframe_randomly,
|
|
48
|
+
row_to_fragment,
|
|
49
|
+
remove_substrings,
|
|
50
|
+
PatternReplacer,
|
|
51
|
+
replace_tokens,
|
|
52
|
+
cosim_from_dict,
|
|
53
|
+
fragments_to_dict,
|
|
54
|
+
plot_summary,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
# MGF parsing
|
|
58
|
+
from imspy_search.mgf import (
|
|
59
|
+
mgf_to_sagepy_query,
|
|
60
|
+
iter_spectra,
|
|
61
|
+
parse_spectrum,
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# Rescoring with deep learning features
|
|
65
|
+
from imspy_search.rescoring import (
|
|
66
|
+
re_score_psms,
|
|
67
|
+
create_feature_space,
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# DDA extensions for sagepy integration
|
|
71
|
+
from imspy_search.dda_extensions import (
|
|
72
|
+
to_sage_precursor,
|
|
73
|
+
get_sage_processed_precursors,
|
|
74
|
+
get_processed_spectra_for_search,
|
|
75
|
+
search_timstof_dda,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
__all__ = [
|
|
79
|
+
# Version
|
|
80
|
+
'__version__',
|
|
81
|
+
# Data extraction
|
|
82
|
+
'extract_timstof_dda_data',
|
|
83
|
+
'get_searchable_spec',
|
|
84
|
+
'get_ms1_ims_spectrum',
|
|
85
|
+
# FASTA handling
|
|
86
|
+
'split_fasta',
|
|
87
|
+
# PSM handling
|
|
88
|
+
'generate_training_data',
|
|
89
|
+
'split_psms',
|
|
90
|
+
'generate_balanced_rt_dataset',
|
|
91
|
+
'generate_balanced_im_dataset',
|
|
92
|
+
# Helper functions
|
|
93
|
+
'linear_map',
|
|
94
|
+
'map_to_domain',
|
|
95
|
+
'sanitize_charge',
|
|
96
|
+
'sanitize_mz',
|
|
97
|
+
'write_psms_binary',
|
|
98
|
+
'merge_dicts_with_merge_dict',
|
|
99
|
+
'check_memory',
|
|
100
|
+
# Output formatting
|
|
101
|
+
'transform_psm_to_pin',
|
|
102
|
+
'parse_to_tims2rescore',
|
|
103
|
+
# SAGE output processing
|
|
104
|
+
're_score_psms_lda',
|
|
105
|
+
'generate_training_data_df',
|
|
106
|
+
'split_dataframe_randomly',
|
|
107
|
+
'row_to_fragment',
|
|
108
|
+
'remove_substrings',
|
|
109
|
+
'PatternReplacer',
|
|
110
|
+
'replace_tokens',
|
|
111
|
+
'cosim_from_dict',
|
|
112
|
+
'fragments_to_dict',
|
|
113
|
+
'plot_summary',
|
|
114
|
+
# MGF parsing
|
|
115
|
+
'mgf_to_sagepy_query',
|
|
116
|
+
'iter_spectra',
|
|
117
|
+
'parse_spectrum',
|
|
118
|
+
# Rescoring
|
|
119
|
+
're_score_psms',
|
|
120
|
+
'create_feature_space',
|
|
121
|
+
# DDA extensions
|
|
122
|
+
'to_sage_precursor',
|
|
123
|
+
'get_sage_processed_precursors',
|
|
124
|
+
'get_processed_spectra_for_search',
|
|
125
|
+
'search_timstof_dda',
|
|
126
|
+
]
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""CLI entry points for imspy-search."""
|
|
2
|
+
|
|
3
|
+
from imspy_search.cli.imspy_dda import main as dda_main
|
|
4
|
+
from imspy_search.cli.imspy_ccs import main as ccs_main
|
|
5
|
+
from imspy_search.cli.imspy_rescore_sage import main as rescore_sage_main
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
'dda_main',
|
|
9
|
+
'ccs_main',
|
|
10
|
+
'rescore_sage_main',
|
|
11
|
+
]
|
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
"""IMSPY CCS CLI - Extract CCS from TIMS-TOF DDA data for machine learning training."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import os
|
|
5
|
+
import toml
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
import mokapot
|
|
9
|
+
|
|
10
|
+
from imspy_core.timstof import TimsDatasetDDA
|
|
11
|
+
from sagepy.utility import create_sage_database, compress_psms, decompress_psms
|
|
12
|
+
from sagepy.rescore.utility import transform_psm_to_mokapot_pin
|
|
13
|
+
from sagepy.core import Precursor, Tolerance, Scorer, SpectrumProcessor
|
|
14
|
+
from imspy_search.utility import (
|
|
15
|
+
sanitize_mz, get_searchable_spec, write_psms_binary,
|
|
16
|
+
split_fasta, merge_dicts_with_merge_dict
|
|
17
|
+
)
|
|
18
|
+
from imspy_search.rescoring import create_feature_space
|
|
19
|
+
from sagepy.utility import psm_collection_to_pandas, apply_mz_calibration
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def sanitize_charge(charge):
    """Return *charge* coerced to ``int``, defaulting to 2 when unparseable.

    Precursor charge states coming from the raw data can be missing
    (``None``/NaN) or otherwise malformed; charge 2 is used as the fallback
    default (matching the module-level helper of the same name re-exported
    from ``imspy_search.utility``).

    Args:
        charge: Raw charge value (int, float, str, ``None``, NaN, ...).

    Returns:
        int: The parsed charge state, or 2 if conversion fails.
    """
    try:
        return int(charge)
    except (TypeError, ValueError):
        # int() raises TypeError for None/unsupported types and ValueError
        # for NaN / non-numeric strings; catching only these avoids masking
        # unrelated bugs that a bare `except Exception` would swallow.
        return 2
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def group_by_mobility(mobility, intensity):
    """Sum intensities that share the same mobility value.

    Args:
        mobility: Sequence of inverse-mobility values (may contain duplicates).
        intensity: Sequence of intensity values aligned with ``mobility``.

    Returns:
        A pair of numpy arrays ``(unique_mobilities, summed_intensities)``,
        ordered by first appearance of each mobility value.
    """
    accumulated = {}
    for mobility_value, intensity_value in zip(mobility, intensity):
        if mobility_value in accumulated:
            accumulated[mobility_value] += intensity_value
        else:
            accumulated[mobility_value] = intensity_value
    unique_mobilities = np.array(list(accumulated.keys()))
    summed_intensities = np.array(list(accumulated.values()))
    return unique_mobilities, summed_intensities
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def main():
    """Main entry point for the ``imspy-ccs`` CLI.

    Pipeline (per ``*.d`` dataset found under ``--raw_data_path``):
      1. Read PASEF fragments and re-assemble precursors per ``precursor_id``.
      2. Build sagepy ``Precursor``/spectrum objects and score them against
         a SAGE database built from the FASTA (optionally split into batches).
      3. Apply m/z calibration, compute re-scoring features, and write the
         compressed PSMs plus a per-dataset parquet of mobility profiles.
    Finally, all per-dataset PSMs are combined into one pin file and
    re-scored with mokapot.

    CLI flags can be supplied directly or via a TOML ``--config`` file;
    explicit command-line values override config values.
    """
    parser = argparse.ArgumentParser(
        description="Extract CCS from TIMS-TOF DDA data to create training examples for machine learning"
    )
    parser.add_argument("--raw_data_path", type=str, help="Path to the dataset.")
    parser.add_argument("--fasta_path", type=str, help="Path to the FASTA file.")
    parser.add_argument("--config", type=str, help="Path to a TOML configuration file.")
    parser.add_argument("--num_threads", type=int, help="Number of threads for processing.")
    parser.add_argument("--cleave_at", type=str, help="Residue to cleave at.")
    parser.add_argument("--restrict", type=str, help="Restriction residues.")
    parser.add_argument("--n_terminal", action="store_true", help="If provided, then c_terminal = False.")
    parser.add_argument("--static_modifications", type=str, help="Static mods in TOML-compatible string form.")
    parser.add_argument("--variable_modifications", type=str, help="Variable mods in TOML-compatible string form.")
    parser.add_argument("--silent", action="store_true", help="Silent mode.")
    parser.add_argument("--no_bruker_sdk", action="store_true", help="Do not use Bruker SDK.")
    parser.add_argument("--fasta_batch_size", type=int, help="Batch size for FASTA processing.")
    parser.add_argument("--lazy", action="store_true", help="Skip existing outputs to avoid re-processing.")

    # First pass: only resolve --config so its values can become parser defaults.
    temp_args, _ = parser.parse_known_args()

    # Load config from TOML if provided
    config = {}
    if temp_args.config and os.path.exists(temp_args.config):
        with open(temp_args.config, "r") as f:
            config = toml.load(f)

    # Set defaults (config value wins over the hard-coded fallback; explicit
    # CLI flags still override both via parse_args below).
    defaults = {
        "raw_data_path": config.get("raw_data_path", None),
        "fasta_path": config.get("fasta_path", None),
        "num_threads": config.get("num_threads", -1),  # -1 -> use all CPUs
        "cleave_at": config.get("cleave_at", "KR"),  # tryptic cleavage
        "restrict": config.get("restrict", "P"),
        "c_terminal": config.get("c_terminal", True),
        "n_terminal": config.get("n_terminal", False),
        # Fixed carbamidomethylation of cysteine by default.
        "static_modifications": config.get("static_modifications", {"C": "[UNIMOD:4]"}),
        # Variable Met oxidation and N-term acetylation ("[" denotes N-term).
        "variable_modifications": config.get("variable_modifications", {"M": ["[UNIMOD:35]"], "[": ["[UNIMOD:1]"]}),
        "verbose": config.get("verbose", True),
        "no_bruker_sdk": config.get("no_bruker_sdk", False),
        "fasta_batch_size": config.get("fasta_batch_size", 1),
        "lazy": config.get("lazy", False),
    }

    parser.set_defaults(**defaults)
    args = parser.parse_args()

    if args.silent:
        args.verbose = False

    # --n_terminal flips digestion terminality; c_terminal is the default.
    if args.n_terminal:
        args.c_terminal = False

    # Mods passed on the command line arrive as TOML-compatible strings;
    # wrap them in a dummy key so toml can parse the inline table/array.
    if isinstance(args.static_modifications, str):
        args.static_modifications = toml.loads(f"data = {args.static_modifications}")["data"]
    if isinstance(args.variable_modifications, str):
        args.variable_modifications = toml.loads(f"data = {args.variable_modifications}")["data"]

    if args.raw_data_path is None:
        parser.error("raw_data_path is required (either via command line or config)")
    if args.fasta_path is None:
        parser.error("fasta_path is required (either via command line or config)")

    if args.num_threads == -1:
        args.num_threads = os.cpu_count()

    if args.verbose:
        print("Arguments:")
        for arg, value in vars(args).items():
            print(f"  {arg}: {value}")

    # Scorer configured once and re-used for every dataset / FASTA batch.
    scorer = Scorer(
        precursor_tolerance=Tolerance(ppm=(-25.0, 25.0)),
        fragment_tolerance=Tolerance(ppm=(-20.0, 20.0)),
        report_psms=5,
        min_matched_peaks=5,
        annotate_matches=True,
        static_mods=args.static_modifications,
        variable_mods=args.variable_modifications,
    )

    with open(args.fasta_path, "r") as f:
        raw_fasta = f.read()
    # Splitting the FASTA lowers peak memory of the indexed database;
    # randomize spreads proteins evenly across the batches.
    fastas = split_fasta(fasta=raw_fasta, num_splits=args.fasta_batch_size, randomize=True)

    d_files = [f for f in os.listdir(args.raw_data_path) if f.endswith(".d")]
    count = len(d_files)
    if count == 0:
        raise ValueError("No .d files found in the directory.")

    current_count = 0

    for file in d_files:
        # Best-effort per dataset: a failure in one .d folder is reported
        # but does not abort processing of the remaining datasets.
        try:
            current_count += 1
            if args.verbose:
                print(f"Processing {file} ({current_count}/{count}) ...")

            dataset_name = file.split(".")[0]
            ds_path = os.path.join(args.raw_data_path, file)

            # Output locations used both for writing and for the lazy check.
            psm_bin_path = os.path.join(ds_path, "imspy", "psm", f"{dataset_name}.bin")
            parquet_path = os.path.join(ds_path, "imspy", f"{dataset_name}.parquet")

            if args.lazy and os.path.isfile(psm_bin_path) and os.path.isfile(parquet_path):
                if args.verbose:
                    print(f"  [LAZY MODE] Skipping '{file}' because outputs already exist.")
                continue

            dataset = TimsDatasetDDA(ds_path, use_bruker_sdk=not args.no_bruker_sdk)
            fragments = dataset.get_pasef_fragments(args.num_threads)

            if args.verbose:
                print("Assembling re-fragmented precursors ...")

            # Merge re-fragmented events of the same precursor: raw spectra
            # are summed, all other fields keep the first occurrence.
            fragments = fragments.groupby('precursor_id').agg({
                'frame_id': 'first',
                'time': 'first',
                'precursor_id': 'first',
                'raw_data': 'sum',
                'scan_begin': 'first',
                'scan_end': 'first',
                'isolation_mz': 'first',
                'isolation_width': 'first',
                'collision_energy': 'first',
                'largest_peak_mz': 'first',
                'average_mz': 'first',
                'monoisotopic_mz': 'first',
                'charge': 'first',
                'average_scan': 'first',
                'intensity': 'first',
                'parent_id': 'first',
            })

            mobility = fragments.apply(
                lambda r: r.raw_data.get_inverse_mobility_along_scan_marginal(), axis=1
            )
            fragments['mobility'] = mobility
            # spec_id encodes frame, precursor, and dataset so PSMs from
            # different datasets can be merged later without collisions.
            fragments['spec_id'] = fragments.apply(
                lambda r: f"{r['frame_id']}-{r['precursor_id']}-{dataset_name}", axis=1
            )

            if args.verbose:
                print("Extracting precursors ...")

            fragments['sage_precursor'] = fragments.apply(
                lambda r: Precursor(
                    # Prefer monoisotopic m/z, fall back to the largest peak.
                    mz=sanitize_mz(r['monoisotopic_mz'], r['largest_peak_mz']),
                    intensity=r['intensity'],
                    charge=sanitize_charge(r['charge']),
                    isolation_window=Tolerance(da=(-3, 3)),
                    collision_energy=r.collision_energy,
                    inverse_ion_mobility=r.mobility,
                ),
                axis=1
            )

            if args.verbose:
                print("Extracting fragment spectra ...")

            fragments['processed_spec'] = fragments.apply(
                lambda r: get_searchable_spec(
                    precursor=r.sage_precursor,
                    raw_fragment_data=r.raw_data,
                    # Keep only the 150 most intense peaks per spectrum.
                    spec_processor=SpectrumProcessor(take_top_n=150),
                    spec_id=r.spec_id,
                    time=r['time']
                ), axis=1
            )

            if args.verbose:
                print("Scoring spectra ...")

            # Score the same spectra against each FASTA batch separately,
            # then merge the per-batch PSM dictionaries.
            psm_dicts = []
            for i, fasta in enumerate(fastas):
                if args.verbose:
                    print(f"Processing FASTA {i + 1}/{len(fastas)} ...")

                indexed_db = create_sage_database(
                    fasta=fasta,
                    cleave_at=args.cleave_at,
                    restrict=args.restrict,
                    static_mods=args.static_modifications,
                    variable_mods=args.variable_modifications,
                    c_terminal=args.c_terminal
                )
                psm_collection = scorer.score_collection_psm(
                    db=indexed_db,
                    spectrum_collection=fragments['processed_spec'].values,
                    num_threads=args.num_threads
                )
                psm_dicts.append(psm_collection)

            if len(psm_dicts) > 1:
                if args.verbose:
                    print("Merging PSMs ...")
                psm_collection = merge_dicts_with_merge_dict(psm_dicts)
            else:
                psm_collection = psm_dicts[0]

            # Global m/z recalibration; the resulting ppm error is attached
            # to every PSM for downstream feature generation.
            ppm_error = apply_mz_calibration(psm_collection, fragments)

            for _, values in psm_collection.items():
                for value in values:
                    value.file_name = dataset_name
                    value.mz_calibration_ppm = ppm_error

            # Flatten {spec_id: [psm, ...]} into one list.
            psm_list = [psm for values in psm_collection.values() for psm in values]

            if args.verbose:
                print("Creating re-scoring feature space ...")

            psm_list = create_feature_space(psms=psm_list)

            # Persist compressed PSMs under <ds_path>/imspy/psm/<name>.bin.
            bts = compress_psms(psm_list)
            write_psms_binary(
                byte_array=bts,
                folder_path=ds_path,
                file_name=f"{dataset_name}"
            )

            # Per-precursor ion-mobility profiles (mobility vs summed intensity).
            I = fragments.apply(lambda r: group_by_mobility(r.raw_data.mobility, r.raw_data.intensity), axis=1)
            inv_mob, intensity = [x[0] for x in I], [x[1] for x in I]

            # NOTE: this overwrites the scalar precursor 'intensity' column
            # with the per-mobility intensity arrays for the parquet export.
            fragments["inverse_ion_mobility"] = inv_mob
            fragments["intensity"] = intensity

            F = fragments[["spec_id", "monoisotopic_mz", "charge", "inverse_ion_mobility", "intensity"]]
            F.to_parquet(parquet_path, index=False)

        except Exception as e:
            # Deliberate broad catch: report and continue with the next file.
            print(f"Error processing {file}: {e}")

    # Final mokapot re-scoring
    total_psms = []
    if args.verbose:
        print("Loading PSMs ...")

    final_pin_path = os.path.join(args.raw_data_path, "PSMs.pin")
    mokapot_output = os.path.join(args.raw_data_path, "mokapot.psms.txt")

    skip_final = False
    if args.lazy and os.path.isfile(final_pin_path) and os.path.isfile(mokapot_output):
        skip_final = True
        if args.verbose:
            print("  [LAZY MODE] Skipping final mokapot step because outputs already exist.")

    if not skip_final:
        # Reload every per-dataset PSM binary written above (or by a
        # previous run) and pool them for joint re-scoring.
        tmp_count = 0
        for file in d_files:
            try:
                dataset_name = file.split(".")[0]
                psm_bin_path = os.path.join(args.raw_data_path, file, "imspy", "psm", f"{dataset_name}.bin")
                if not os.path.isfile(psm_bin_path):
                    continue

                tmp_count += 1
                bts = np.fromfile(psm_bin_path, dtype=np.uint8)
                psm_list = decompress_psms(bts)
                total_psms.extend(psm_list)

                if args.verbose:
                    print(f"Loaded {dataset_name} ({tmp_count}/{count})")

            except Exception as e:
                print(f"Error loading {file}: {e}")

        PSM_pandas = psm_collection_to_pandas(total_psms)

        if args.verbose:
            print("Creating mokapot pin ...")

        PSM_pin = transform_psm_to_mokapot_pin(PSM_pandas, seq_modified=True)
        PSM_pin.to_csv(final_pin_path, index=False, sep="\t")

        # mokapot semi-supervised re-scoring; writes mokapot.psms.txt etc.
        # into the raw data directory.
        psms_moka = mokapot.read_pin(final_pin_path)
        results, _ = mokapot.brew(psms_moka, max_workers=args.num_threads)
        results.to_txt(dest_dir=args.raw_data_path)

    if args.verbose and not skip_final:
        print("Finished.")
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
# Allow direct execution of this module; equivalent to the `imspy-ccs`
# console script declared in pyproject.toml ([project.scripts]).
if __name__ == "__main__":
    main()
|