metaumbra 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaumbra-1.1.0/LICENSE +29 -0
- metaumbra-1.1.0/MANIFEST.in +3 -0
- metaumbra-1.1.0/PKG-INFO +133 -0
- metaumbra-1.1.0/README.md +97 -0
- metaumbra-1.1.0/pyproject.toml +59 -0
- metaumbra-1.1.0/setup.cfg +4 -0
- metaumbra-1.1.0/src/metaumbra/__init__.py +5 -0
- metaumbra-1.1.0/src/metaumbra/__main__.py +14 -0
- metaumbra-1.1.0/src/metaumbra/__version__.py +5 -0
- metaumbra-1.1.0/src/metaumbra/assets/baner.png +0 -0
- metaumbra-1.1.0/src/metaumbra/assets/metaumbra_icon.png +0 -0
- metaumbra-1.1.0/src/metaumbra/assets/workflow.png +0 -0
- metaumbra-1.1.0/src/metaumbra/cli.py +268 -0
- metaumbra-1.1.0/src/metaumbra/digest.py +768 -0
- metaumbra-1.1.0/src/metaumbra/gui.py +2110 -0
- metaumbra-1.1.0/src/metaumbra/scoring.py +1737 -0
- metaumbra-1.1.0/src/metaumbra/workflows.py +399 -0
- metaumbra-1.1.0/src/metaumbra.egg-info/PKG-INFO +133 -0
- metaumbra-1.1.0/src/metaumbra.egg-info/SOURCES.txt +21 -0
- metaumbra-1.1.0/src/metaumbra.egg-info/dependency_links.txt +1 -0
- metaumbra-1.1.0/src/metaumbra.egg-info/entry_points.txt +5 -0
- metaumbra-1.1.0/src/metaumbra.egg-info/requires.txt +18 -0
- metaumbra-1.1.0/src/metaumbra.egg-info/top_level.txt +1 -0
metaumbra-1.1.0/LICENSE
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
BSD 3-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026, MetaUmbra contributors
|
|
4
|
+
All rights reserved.
|
|
5
|
+
|
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
|
8
|
+
|
|
9
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
10
|
+
list of conditions and the following disclaimer.
|
|
11
|
+
|
|
12
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
13
|
+
this list of conditions and the following disclaimer in the documentation
|
|
14
|
+
and/or other materials provided with the distribution.
|
|
15
|
+
|
|
16
|
+
3. Neither the name of the copyright holder nor the names of its
|
|
17
|
+
contributors may be used to endorse or promote products derived from
|
|
18
|
+
this software without specific prior written permission.
|
|
19
|
+
|
|
20
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
21
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
22
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
23
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
24
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
25
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
26
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
27
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
28
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
29
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
metaumbra-1.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: metaumbra
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Summary: Genome-level presence inference from metaproteomic peptide lists.
|
|
5
|
+
License-Expression: BSD-3-Clause
|
|
6
|
+
Keywords: metaproteomics,proteomics,bioinformatics,genome inference,peptides
|
|
7
|
+
Classifier: Development Status :: 4 - Beta
|
|
8
|
+
Classifier: Intended Audience :: Science/Research
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: rpg==2.0.5
|
|
22
|
+
Requires-Dist: numpy
|
|
23
|
+
Requires-Dist: pandas
|
|
24
|
+
Requires-Dist: tqdm
|
|
25
|
+
Provides-Extra: gui
|
|
26
|
+
Requires-Dist: PySide6; extra == "gui"
|
|
27
|
+
Provides-Extra: parquet
|
|
28
|
+
Requires-Dist: pyarrow; extra == "parquet"
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: build; extra == "dev"
|
|
31
|
+
Requires-Dist: twine; extra == "dev"
|
|
32
|
+
Provides-Extra: all
|
|
33
|
+
Requires-Dist: PySide6; extra == "all"
|
|
34
|
+
Requires-Dist: pyarrow; extra == "all"
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
|
|
37
|
+
# MetaUmbra
|
|
38
|
+
![MetaUmbra banner](src/metaumbra/assets/baner.png)
|
|
39
|
+
|
|
40
|
+
## Genome-level presence inference from metaproteomic peptides
|
|
41
|
+
|
|
42
|
+
MetaUmbra converts identified metaproteomic peptides into statistically supported genome presence calls. It evaluates each candidate genome using both unique and shared peptide evidence and reports genome-level p-values, BH-adjusted q-values, and presence scores.
|
|
43
|
+
|
|
44
|
+
## Main features
|
|
45
|
+
|
|
46
|
+
- Evaluate candidate genome support from metaproteomic peptide tables
|
|
47
|
+
- Build genome-specific theoretical peptide references from protein FASTA files
|
|
48
|
+
- Support user-defined genome collections, including isolate genomes, strain panels, and MAG catalogs
|
|
49
|
+
- Use both unique and shared peptide evidence for genome presence inference
|
|
50
|
+
- Report genome-level p-values, BH-adjusted q-values, and presence scores
|
|
51
|
+
- Provide GUI, command-line, and Python workflow support
|
|
52
|
+
- Support peptide tables from common metaproteomics workflows such as DIA-NN and MaxQuant
|
|
53
|
+
|
|
54
|
+
## Workflow overview
|
|
55
|
+
![MetaUmbra workflow overview](src/metaumbra/assets/workflow.png)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
## Installation
|
|
59
|
+
|
|
60
|
+
MetaUmbra requires Python 3.10 or newer.
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install ".[all]"
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Usage
|
|
67
|
+
|
|
68
|
+
MetaUmbra can be used through either the graphical interface or the command line.
|
|
69
|
+
|
|
70
|
+
### Graphical interface
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
metaumbra-gui
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
The GUI supports FASTA digestion, peptide table loading, genome presence scoring, and result export.
|
|
77
|
+
|
|
78
|
+
### Command line
|
|
79
|
+
|
|
80
|
+
MetaUmbra provides separate commands for the main workflow steps:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
metaumbra digest --help
|
|
84
|
+
metaumbra score --help
|
|
85
|
+
metaumbra extract-parquet --help
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
A typical workflow is:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
metaumbra digest ...
|
|
92
|
+
metaumbra score ...
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Use `metaumbra extract-parquet ...` to convert DIA-NN parquet reports to peptide TSV files before scoring.
|
|
96
|
+
|
|
97
|
+
## Input
|
|
98
|
+
|
|
99
|
+
MetaUmbra requires:
|
|
100
|
+
|
|
101
|
+
- Protein FASTA files, with one FASTA file per genome
|
|
102
|
+
- An observed peptide table containing peptide sequences
|
|
103
|
+
|
|
104
|
+
Optional inputs include peptide scores, peptide-level error values, decoy flags, and genome lineage annotations.
|
|
105
|
+
|
|
106
|
+
## Output
|
|
107
|
+
|
|
108
|
+
The main output is a TSV table containing genome-level evidence and significance values.
|
|
109
|
+
|
|
110
|
+
Key output columns include:
|
|
111
|
+
|
|
112
|
+
| Column | Description |
|
|
113
|
+
| --- | --- |
|
|
114
|
+
| `genome_id` | Candidate genome identifier |
|
|
115
|
+
| `num_peptides_matched` | Number of observed peptides matched to the genome |
|
|
116
|
+
| `num_peptides_unique` | Number of matched peptides unique to the genome |
|
|
117
|
+
| `weighted_evidence` | Total degeneracy-weighted peptide evidence |
|
|
118
|
+
| `weighted_evidence_shared` | Weighted evidence from shared peptides |
|
|
119
|
+
| `p_presence` | Genome-level p-value |
|
|
120
|
+
| `q_presence` | BH-adjusted genome-level q-value |
|
|
121
|
+
| `presence_score` | Ranking score based on q-value |
|
|
122
|
+
|
|
123
|
+
## Citation
|
|
124
|
+
|
|
125
|
+
If you use MetaUmbra, please cite:
|
|
126
|
+
|
|
127
|
+
> Wu Q, Ning Z, Zhang A, Cheng K, Figeys D. MetaUmbra: Statistically Controlled Genome-Level Presence Inference from Metaproteomic Peptides.
|
|
128
|
+
|
|
129
|
+
A formal citation will be added after publication.
|
|
130
|
+
|
|
131
|
+
## Contact
|
|
132
|
+
|
|
133
|
+
For questions or issues, please use the GitHub issue tracker or contact the corresponding author listed in the associated manuscript.
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# MetaUmbra
|
|
2
|
+
![MetaUmbra banner](src/metaumbra/assets/baner.png)
|
|
3
|
+
|
|
4
|
+
## Genome-level presence inference from metaproteomic peptides
|
|
5
|
+
|
|
6
|
+
MetaUmbra converts identified metaproteomic peptides into statistically supported genome presence calls. It evaluates each candidate genome using both unique and shared peptide evidence and reports genome-level p-values, BH-adjusted q-values, and presence scores.
|
|
7
|
+
|
|
8
|
+
## Main features
|
|
9
|
+
|
|
10
|
+
- Evaluate candidate genome support from metaproteomic peptide tables
|
|
11
|
+
- Build genome-specific theoretical peptide references from protein FASTA files
|
|
12
|
+
- Support user-defined genome collections, including isolate genomes, strain panels, and MAG catalogs
|
|
13
|
+
- Use both unique and shared peptide evidence for genome presence inference
|
|
14
|
+
- Report genome-level p-values, BH-adjusted q-values, and presence scores
|
|
15
|
+
- Provide GUI, command-line, and Python workflow support
|
|
16
|
+
- Support peptide tables from common metaproteomics workflows such as DIA-NN and MaxQuant
|
|
17
|
+
|
|
18
|
+
## Workflow overview
|
|
19
|
+
![MetaUmbra workflow overview](src/metaumbra/assets/workflow.png)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
MetaUmbra requires Python 3.10 or newer.
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install ".[all]"
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Usage
|
|
31
|
+
|
|
32
|
+
MetaUmbra can be used through either the graphical interface or the command line.
|
|
33
|
+
|
|
34
|
+
### Graphical interface
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
metaumbra-gui
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
The GUI supports FASTA digestion, peptide table loading, genome presence scoring, and result export.
|
|
41
|
+
|
|
42
|
+
### Command line
|
|
43
|
+
|
|
44
|
+
MetaUmbra provides separate commands for the main workflow steps:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
metaumbra digest --help
|
|
48
|
+
metaumbra score --help
|
|
49
|
+
metaumbra extract-parquet --help
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
A typical workflow is:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
metaumbra digest ...
|
|
56
|
+
metaumbra score ...
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Use `metaumbra extract-parquet ...` to convert DIA-NN parquet reports to peptide TSV files before scoring.
|
|
60
|
+
|
|
61
|
+
## Input
|
|
62
|
+
|
|
63
|
+
MetaUmbra requires:
|
|
64
|
+
|
|
65
|
+
- Protein FASTA files, with one FASTA file per genome
|
|
66
|
+
- An observed peptide table containing peptide sequences
|
|
67
|
+
|
|
68
|
+
Optional inputs include peptide scores, peptide-level error values, decoy flags, and genome lineage annotations.
|
|
69
|
+
|
|
70
|
+
## Output
|
|
71
|
+
|
|
72
|
+
The main output is a TSV table containing genome-level evidence and significance values.
|
|
73
|
+
|
|
74
|
+
Key output columns include:
|
|
75
|
+
|
|
76
|
+
| Column | Description |
|
|
77
|
+
| --- | --- |
|
|
78
|
+
| `genome_id` | Candidate genome identifier |
|
|
79
|
+
| `num_peptides_matched` | Number of observed peptides matched to the genome |
|
|
80
|
+
| `num_peptides_unique` | Number of matched peptides unique to the genome |
|
|
81
|
+
| `weighted_evidence` | Total degeneracy-weighted peptide evidence |
|
|
82
|
+
| `weighted_evidence_shared` | Weighted evidence from shared peptides |
|
|
83
|
+
| `p_presence` | Genome-level p-value |
|
|
84
|
+
| `q_presence` | BH-adjusted genome-level q-value |
|
|
85
|
+
| `presence_score` | Ranking score based on q-value |
|
|
86
|
+
|
|
87
|
+
## Citation
|
|
88
|
+
|
|
89
|
+
If you use MetaUmbra, please cite:
|
|
90
|
+
|
|
91
|
+
> Wu Q, Ning Z, Zhang A, Cheng K, Figeys D. MetaUmbra: Statistically Controlled Genome-Level Presence Inference from Metaproteomic Peptides.
|
|
92
|
+
|
|
93
|
+
A formal citation will be added after publication.
|
|
94
|
+
|
|
95
|
+
## Contact
|
|
96
|
+
|
|
97
|
+
For questions or issues, please use the GitHub issue tracker or contact the corresponding author listed in the associated manuscript.
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "metaumbra"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Genome-level presence inference from metaproteomic peptide lists."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "BSD-3-Clause"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
keywords = ["metaproteomics", "proteomics", "bioinformatics", "genome inference", "peptides"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 4 - Beta",
|
|
15
|
+
"Intended Audience :: Science/Research",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Programming Language :: Python :: 3.14",
|
|
24
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"rpg==2.0.5",
|
|
28
|
+
"numpy",
|
|
29
|
+
"pandas",
|
|
30
|
+
"tqdm",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
gui = ["PySide6"]
|
|
35
|
+
parquet = ["pyarrow"]
|
|
36
|
+
dev = ["build", "twine"]
|
|
37
|
+
all = [
|
|
38
|
+
"PySide6",
|
|
39
|
+
"pyarrow",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.scripts]
|
|
43
|
+
metaumbra = "metaumbra.cli:main"
|
|
44
|
+
|
|
45
|
+
[project.gui-scripts]
|
|
46
|
+
metaumbra-gui = "metaumbra.gui:main"
|
|
47
|
+
|
|
48
|
+
[tool.setuptools]
|
|
49
|
+
package-dir = {"" = "src"}
|
|
50
|
+
include-package-data = true
|
|
51
|
+
|
|
52
|
+
[tool.setuptools.dynamic]
|
|
53
|
+
version = {attr = "metaumbra.__version__.__version__"}
|
|
54
|
+
|
|
55
|
+
[tool.setuptools.packages.find]
|
|
56
|
+
where = ["src"]
|
|
57
|
+
|
|
58
|
+
[tool.setuptools.package-data]
|
|
59
|
+
metaumbra = ["assets/*.png"]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Entry point that runs the MetaUmbra command-line interface.

Works both when the package is executed with a package context (relative
import) and when this file is executed directly as a script (absolute import
after extending ``sys.path``).
"""

import sys
from pathlib import Path

if not __package__:
    # No package context (``__package__`` is None or ""): make the directory
    # that contains the ``metaumbra`` package importable, then import the CLI
    # by its absolute name.
    sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
    from metaumbra.cli import main
else:
    from .cli import main


if __name__ == "__main__":
    # Forward the CLI's integer return value as the process exit status.
    sys.exit(main(sys.argv[1:]))
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
if __package__ in {None, ""}:
|
|
10
|
+
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
|
11
|
+
from metaumbra import __version__
|
|
12
|
+
from metaumbra.workflows import (
|
|
13
|
+
DigestConfig,
|
|
14
|
+
ParquetExtractionConfig,
|
|
15
|
+
ScoringConfig,
|
|
16
|
+
run_digest_workflow,
|
|
17
|
+
run_parquet_extraction_workflow,
|
|
18
|
+
run_scoring_workflow,
|
|
19
|
+
)
|
|
20
|
+
else:
|
|
21
|
+
from . import __version__
|
|
22
|
+
from .workflows import (
|
|
23
|
+
DigestConfig,
|
|
24
|
+
ParquetExtractionConfig,
|
|
25
|
+
ScoringConfig,
|
|
26
|
+
run_digest_workflow,
|
|
27
|
+
run_parquet_extraction_workflow,
|
|
28
|
+
run_scoring_workflow,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _print_result(payload: dict[str, Any]) -> None:
    """Print ``payload`` to standard output as JSON.

    The output is indented by two spaces and non-ASCII characters are written
    verbatim instead of being escaped.
    """
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    print(rendered)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _add_common_version_flag(parser: argparse.ArgumentParser) -> None:
    """Register a ``--version`` option on ``parser``.

    The option uses argparse's ``version`` action with the text
    ``"<prog> <package version>"``; ``%(prog)s`` is expanded by argparse.
    """
    version_text = f"%(prog)s {__version__}"
    parser.add_argument("--version", action="version", version=version_text)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level ``metaumbra`` argument parser.

    The parser exposes ``--version`` plus four subcommands, registered in this
    order: ``gui``, ``digest``, ``score`` and ``extract-parquet``. The chosen
    subcommand name is stored in ``args.command`` (``None`` when omitted).

    Returns:
        The fully configured :class:`argparse.ArgumentParser`.
    """
    parser = argparse.ArgumentParser(
        prog="metaumbra",
        description="MetaUmbra packaging-friendly command line interface.",
    )
    _add_common_version_flag(parser)
    subparsers = parser.add_subparsers(dest="command")

    # Registration order determines the order shown in ``--help``.
    _add_gui_subparser(subparsers)
    _add_digest_subparser(subparsers)
    _add_score_subparser(subparsers)
    _add_parquet_subparser(subparsers)

    return parser


def _add_gui_subparser(subparsers: Any) -> None:
    """Register the ``gui`` subcommand (no options besides ``--version``)."""
    gui_parser = subparsers.add_parser("gui", help="Launch the Qt GUI.")
    _add_common_version_flag(gui_parser)


def _add_digest_subparser(subparsers: Any) -> None:
    """Register the ``digest`` subcommand and its FASTA digestion options."""
    digest_parser = subparsers.add_parser("digest", help="Digest FASTA files into peptide tables.")
    _add_common_version_flag(digest_parser)

    # Exactly one input source must be given: a single file or a directory.
    digest_input = digest_parser.add_mutually_exclusive_group(required=True)
    digest_input.add_argument("--input-file", help="Single FASTA file to digest.")
    digest_input.add_argument("--input-dir", help="Directory of FASTA files to digest.")

    digest_parser.add_argument("--output-file", help="Output TSV path for single-file mode.")
    digest_parser.add_argument("--output-dir", help="Output directory for directory mode.")
    digest_parser.add_argument("--enzyme-id", default="42", help="RPG enzyme ID. Default: 42 (Trypsin).")
    digest_parser.add_argument("--min-length", type=int, default=7, help="Minimum peptide length.")
    digest_parser.add_argument("--max-length", type=int, default=30, help="Maximum peptide length.")
    digest_parser.add_argument("--max-miscleavages", type=int, default=2, help="Maximum missed cleavages.")
    digest_parser.add_argument("--processes", type=int, help="Worker process count.")
    digest_parser.add_argument(
        "--full-header",
        action="store_true",
        help="Keep full FASTA headers instead of truncating at the first space.",
    )
    digest_parser.add_argument(
        "--no-skip-existing",
        action="store_true",
        help="Rebuild existing output files in directory mode.",
    )
    digest_parser.add_argument(
        "--quiet",
        action="store_true",
        help="Reduce runtime log output.",
    )


def _add_score_subparser(subparsers: Any) -> None:
    """Register the ``score`` subcommand and its genome scoring options."""
    score_parser = subparsers.add_parser("score", help="Score genome presence from peptide observations.")
    _add_common_version_flag(score_parser)

    # Required inputs and output.
    score_parser.add_argument("--peptide-table", required=True, help="Observed peptide TSV path.")
    score_parser.add_argument(
        "--genome-digest-dir",
        action="append",
        required=True,
        help="Genome digest directory. Repeat for multiple directories.",
    )
    score_parser.add_argument("--output", required=True, help="Output TSV path.")

    # Peptide table column names and filtering.
    score_parser.add_argument("--peptide-seq-col", default="Sequence", help="Peptide sequence column name.")
    score_parser.add_argument("--peptide-score-col", default="score", help="Peptide score column name.")
    score_parser.add_argument("--peptide-error-col", default="Q.Value", help="Peptide error/FDR column name.")
    score_parser.add_argument("--peptide-error-cutoff", type=float, default=0.05, help="Peptide error cutoff.")
    score_parser.add_argument(
        "--peptide-decoy-flag-col",
        default="Reverse",
        help="Optional decoy flag column. Pass an empty string to disable it.",
    )
    score_parser.add_argument("--decoy-flag-value", default="+", help="Decoy marker value.")
    score_parser.add_argument("--num-workers", type=int, help="Worker process count.")

    # Genome selection.
    score_parser.add_argument(
        "--selected-genome-id",
        action="append",
        default=[],
        help="Restrict scoring to specific genome IDs. Repeat as needed.",
    )
    score_parser.add_argument(
        "--exclude-genome-id",
        action="append",
        default=[],
        help="Genome IDs to exclude. Repeat as needed.",
    )

    # Optional lineage annotations.
    score_parser.add_argument("--lineage-table", default="", help="Optional genome lineage table.")
    score_parser.add_argument("--lineage-genome-id-col", default="", help="Genome ID column in the lineage table.")
    score_parser.add_argument("--lineage-lineage-col", default="", help="Lineage column in the lineage table.")

    # Caching and output toggles.
    score_parser.add_argument("--cache-path", default="", help="Optional matched peptide cache path.")
    score_parser.add_argument(
        "--use-cache-if-exists",
        action="store_true",
        help="Reuse an existing matched peptide cache if available.",
    )
    score_parser.add_argument(
        "--no-save-cache",
        action="store_true",
        help="Do not persist matched peptide cache output.",
    )
    score_parser.add_argument(
        "--no-compute-coverage",
        action="store_true",
        help="Skip cumulative coverage calculations.",
    )
    score_parser.add_argument(
        "--no-export-temp",
        action="store_true",
        help="Skip temporary artifact exports.",
    )
    score_parser.add_argument(
        "--return-full-table",
        action="store_true",
        help="Return and write the full internal result table.",
    )


def _add_parquet_subparser(subparsers: Any) -> None:
    """Register the ``extract-parquet`` subcommand and its conversion options."""
    parquet_parser = subparsers.add_parser(
        "extract-parquet",
        help="Extract selected columns from a parquet peptide table into TSV.",
    )
    _add_common_version_flag(parquet_parser)
    parquet_parser.add_argument("--input", required=True, help="Input parquet file.")
    parquet_parser.add_argument("--output", required=True, help="Output TSV file.")
    parquet_parser.add_argument(
        "--input-column",
        action="append",
        default=[],
        help="Input column to extract. Repeat to control order.",
    )
    parquet_parser.add_argument(
        "--output-column",
        action="append",
        default=[],
        help="Output column name. Repeat to match --input-column order.",
    )
    parquet_parser.add_argument("--batch-size", type=int, default=65536, help="Parquet streaming batch size.")
    parquet_parser.add_argument("--force", action="store_true", help="Overwrite an existing TSV output.")
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _run_gui() -> int:
    """Launch the Qt GUI and return a process exit code.

    The GUI module is imported only when this command is selected, presumably
    so the other subcommands work without the optional GUI dependency
    installed (TODO confirm against ``gui.py``).

    Returns:
        ``0`` after the GUI entry point returns.
    """
    # Mirror the module-level import fallback: a relative import raises
    # ImportError when this file is executed directly as a script, because
    # there is no parent package in that case.
    if __package__ in {None, ""}:
        from metaumbra.gui import main as gui_main
    else:
        from .gui import main as gui_main

    gui_main()
    return 0
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _run_digest(args: argparse.Namespace) -> int:
    """Run the ``digest`` subcommand.

    Chooses file or directory mode from ``args.input_file``, checks that the
    matching output option was supplied, runs the digest workflow and prints
    its result as JSON.

    Args:
        args: Parsed arguments of the ``digest`` subcommand.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If the output option matching the input mode is missing.
    """
    if args.input_file:
        input_mode = "file"
        if not args.output_file:
            raise SystemExit("--output-file is required when using --input-file.")
    else:
        input_mode = "directory"
        if not args.output_dir:
            raise SystemExit("--output-dir is required when using --input-dir.")

    # Unset path options are passed to the config as empty strings, not None.
    settings = {
        "input_mode": input_mode,
        "input_file": args.input_file or "",
        "input_dir": args.input_dir or "",
        "output_file": args.output_file or "",
        "output_dir": args.output_dir or "",
        "enzyme_id": str(args.enzyme_id),
        "min_length": args.min_length,
        "max_length": args.max_length,
        "max_num_miscleavages": args.max_miscleavages,
        "processes": args.processes,
        # The CLI flags are negative switches; the config takes the positive form.
        "short_header": not args.full_header,
        "verbose": not args.quiet,
        "skip_existing": not args.no_skip_existing,
    }
    result = run_digest_workflow(DigestConfig(**settings))
    _print_result(result)
    return 0
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _run_score(args: argparse.Namespace) -> int:
    """Run the ``score`` subcommand.

    Maps the parsed command-line options onto a ``ScoringConfig``, runs the
    scoring workflow and prints its result as JSON.

    Args:
        args: Parsed arguments of the ``score`` subcommand.

    Returns:
        ``0`` on success.
    """
    settings = {
        "peptide_table_path": args.peptide_table,
        "genome_lineage_table_path": args.lineage_table,
        "genome_lineage_genome_id_col": args.lineage_genome_id_col,
        "genome_lineage_lineage_col": args.lineage_lineage_col,
        "genome_digest_dirs": args.genome_digest_dir,
        "selected_genome_ids": args.selected_genome_id,
        "output_tsv_path": args.output,
        "peptide_seq_col": args.peptide_seq_col,
        "peptide_score_col": args.peptide_score_col,
        "peptide_error_col": args.peptide_error_col,
        "peptide_error_cutoff": args.peptide_error_cutoff,
        "peptide_decoy_flag_col": args.peptide_decoy_flag_col,
        "decoy_flag_value": args.decoy_flag_value,
        "exclude_genome_ids": args.exclude_genome_id,
        "num_workers": args.num_workers,
        "matched_peptides_cache_path": args.cache_path,
        # ``--no-*`` switches are inverted into the positive config options.
        "save_matched_peptides_cache": not args.no_save_cache,
        "use_cache_if_exists": args.use_cache_if_exists,
        "compute_coverage": not args.no_compute_coverage,
        "export_temp": not args.no_export_temp,
        "return_full_table": args.return_full_table,
    }
    result = run_scoring_workflow(ScoringConfig(**settings))
    _print_result(result)
    return 0
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _run_parquet_extraction(args: argparse.Namespace) -> int:
    """Run the ``extract-parquet`` subcommand.

    Resolves the input/output column lists (each falls back to a built-in
    four-column default when its option was not given), checks that they pair
    up one-to-one, runs the extraction workflow and prints its result as JSON.

    Args:
        args: Parsed arguments of the ``extract-parquet`` subcommand.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If the resolved input and output column lists differ in
            length, e.g. when only one of ``--input-column`` /
            ``--output-column`` was customised.
    """
    input_columns = args.input_column or ["Run", "Stripped.Sequence", "Evidence", "Q.Value"]
    output_columns = args.output_column or ["Run", "Sequence", "score", "Q.Value"]

    # The two lists default independently, so customising only one of them can
    # leave them misaligned. Fail early with an actionable message instead of
    # handing a mismatched mapping to the workflow.
    if len(input_columns) != len(output_columns):
        raise SystemExit(
            "--input-column and --output-column must describe the same number of columns; "
            f"got {len(input_columns)} input column(s) and {len(output_columns)} output column(s) "
            "after applying defaults."
        )

    config = ParquetExtractionConfig(
        input_parquet_path=args.input,
        output_tsv_path=args.output,
        input_columns=input_columns,
        output_columns=output_columns,
        batch_size=args.batch_size,
        force=args.force,
    )
    _print_result(run_parquet_extraction_workflow(config))
    return 0
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and run the selected MetaUmbra subcommand.

    Args:
        argv: Argument list without the program name. ``None`` lets argparse
            read the arguments from ``sys.argv``.

    Returns:
        The exit code of the executed subcommand, or ``0`` after printing the
        help text when no subcommand was given.

    Raises:
        SystemExit: If the parsed command name is not one of the known
            subcommands.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    command = args.command

    if command is None:
        parser.print_help()
        return 0

    # ``gui`` is the only handler that takes no parsed arguments.
    if command == "gui":
        return _run_gui()

    handlers = {
        "digest": _run_digest,
        "score": _run_score,
        "extract-parquet": _run_parquet_extraction,
    }
    handler = handlers.get(command)
    if handler is None:
        raise SystemExit(f"Unknown command: {command}")
    return handler(args)
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
if __name__ == "__main__":
    # Use the CLI's integer return value as the process exit status.
    sys.exit(main(sys.argv[1:]))
|