DeepMutSim 1.0.0__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepmutsim-1.1.0/PKG-INFO +172 -0
- deepmutsim-1.1.0/README.md +142 -0
- deepmutsim-1.1.0/pyproject.toml +51 -0
- deepmutsim-1.1.0/src/DeepMutSim.egg-info/PKG-INFO +172 -0
- {deepmutsim-1.0.0 → deepmutsim-1.1.0}/src/DeepMutSim.egg-info/SOURCES.txt +5 -1
- deepmutsim-1.1.0/src/DeepMutSim.egg-info/requires.txt +5 -0
- deepmutsim-1.1.0/src/deepmutsim/__init__.py +34 -0
- deepmutsim-1.1.0/src/deepmutsim/_fetch.py +92 -0
- deepmutsim-1.1.0/src/deepmutsim/_variants.py +412 -0
- deepmutsim-1.1.0/tests/test_fetch.py +77 -0
- deepmutsim-1.1.0/tests/test_variants.py +327 -0
- deepmutsim-1.0.0/PKG-INFO +0 -79
- deepmutsim-1.0.0/README.md +0 -63
- deepmutsim-1.0.0/pyproject.toml +0 -26
- deepmutsim-1.0.0/src/DeepMutSim.egg-info/PKG-INFO +0 -79
- deepmutsim-1.0.0/src/DeepMutSim.egg-info/requires.txt +0 -1
- deepmutsim-1.0.0/src/deepmutsim/__init__.py +0 -611
- {deepmutsim-1.0.0 → deepmutsim-1.1.0}/LICENSE +0 -0
- {deepmutsim-1.0.0 → deepmutsim-1.1.0}/setup.cfg +0 -0
- {deepmutsim-1.0.0 → deepmutsim-1.1.0}/src/DeepMutSim.egg-info/dependency_links.txt +0 -0
- {deepmutsim-1.0.0 → deepmutsim-1.1.0}/src/DeepMutSim.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: DeepMutSim
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Summary: Enumerate all possible SNVs for MANE transcripts using HGVS nomenclature.
|
|
5
|
+
Author-email: Liu Sun <sunliu@yxnu.edu.cn>, Jian Yang <yangjian@yxnu.edu.cn>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/liu-sun/DeepMutSim
|
|
8
|
+
Project-URL: Issues, https://github.com/liu-sun/DeepMutSim/issues
|
|
9
|
+
Project-URL: Repository, https://github.com/liu-sun/DeepMutSim
|
|
10
|
+
Keywords: bioinformatics,hgvs,variant,mane,ncbi,entrez,mutation
|
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: biopython
|
|
26
|
+
Provides-Extra: test
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
28
|
+
Requires-Dist: pytest-cov>=4.0; extra == "test"
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
|
|
31
|
+
# DeepMutSim
|
|
32
|
+
|
|
33
|
+
Generate all possible single nucleotide variants (SNVs) for MANE transcripts
|
|
34
|
+
using [HGVS nomenclature][hgvs]. DeepMutSim queries NCBI Entrez to fetch the
|
|
35
|
+
MANE Select / MANE Plus Clinical transcript for any human gene and enumerates
|
|
36
|
+
every possible substitution — coding, UTR, splice sites, and protein-level.
|
|
37
|
+
|
|
38
|
+
[hgvs]: https://varnomen.hgvs.org/
|
|
39
|
+
|
|
40
|
+
## Features
|
|
41
|
+
|
|
42
|
+
| Function | Output | Description |
|
|
43
|
+
|---|---|---|
|
|
44
|
+
| `cds(gene)` | `list[tuple]` | All CDS SNVs with protein consequence (1-letter & 3-letter) |
|
|
45
|
+
| `missense(gene)` | `list[tuple]` | Missense variants (codon substitutions that change the amino acid) |
|
|
46
|
+
| `codon_sub(gene)` | `list[str]` | All codon substitutions (silent + missense, incl. multi-base `delins`) |
|
|
47
|
+
| `aa_sub(gene)` | `list[tuple]` | All single amino-acid substitutions |
|
|
48
|
+
| `utr5(gene)` | `list[str]` | All 5' UTR SNVs (negative HGVS numbering) |
|
|
49
|
+
| `utr3(gene)` | `list[str]` | All 3' UTR SNVs (`c.*` numbering) |
|
|
50
|
+
| `splice_site(gene)` | `list[str]` | Canonical splice donor/acceptor ±1, ±2 variants |
|
|
51
|
+
|
|
52
|
+
## Installation
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install deepmutsim
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
DeepMutSim requires Python ≥3.8 and [Biopython][biopython].
|
|
59
|
+
|
|
60
|
+
[biopython]: https://biopython.org/
|
|
61
|
+
|
|
62
|
+
## Configuration
|
|
63
|
+
|
|
64
|
+
Two environment variables configure NCBI Entrez access; `EMAIL` is **mandatory** and `API_KEY` is recommended:
|
|
65
|
+
|
|
66
|
+
| Variable | Required | Description |
|
|
67
|
+
|---|---|---|
|
|
68
|
+
| `EMAIL` | Yes | Your email address (NCBI policy) |
|
|
69
|
+
| `API_KEY` | Recommended | NCBI API key — raises the rate limit from 3 to 10 req/s |
|
|
70
|
+
|
|
71
|
+
[Obtain an API key][ncbi-key] from your NCBI account settings.
|
|
72
|
+
|
|
73
|
+
[ncbi-key]: https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
|
|
74
|
+
|
|
75
|
+
**Linux / macOS:**
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
export EMAIL="your.email@example.com"
|
|
79
|
+
export API_KEY="your_api_key_here"
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
**Windows (PowerShell):**
|
|
83
|
+
|
|
84
|
+
```powershell
|
|
85
|
+
$env:EMAIL = "your.email@example.com"
|
|
86
|
+
$env:API_KEY = "your_api_key_here"
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Quick start
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
import deepmutsim
|
|
93
|
+
|
|
94
|
+
# All CDS single-nucleotide variants
|
|
95
|
+
variants = deepmutsim.cds("INS")
|
|
96
|
+
# [('NM_000207.3:c.1A>G', 'NP_000198.1:p.(M1?)', 'NP_000198.1:p.(Met1?)'), ...]
|
|
97
|
+
|
|
98
|
+
# All missense variants
|
|
99
|
+
missense = deepmutsim.missense("TP53")
|
|
100
|
+
|
|
101
|
+
# All 5' UTR SNVs
|
|
102
|
+
utr5_vars = deepmutsim.utr5("BRCA1")
|
|
103
|
+
|
|
104
|
+
# All canonical splice site variants
|
|
105
|
+
splice = deepmutsim.splice_site("CFTR")
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## API reference
|
|
109
|
+
|
|
110
|
+
### Query helpers
|
|
111
|
+
|
|
112
|
+
- **`nm(gene)`** — Fetch the MANE nucleotide record (GenBank).
|
|
113
|
+
- **`np(gene)`** — Fetch the MANE protein record (FASTA).
|
|
114
|
+
- **`nc(gene)`** — Fetch the primary assembly RefSeq accession.
|
|
115
|
+
|
|
116
|
+
### CDS variants
|
|
117
|
+
|
|
118
|
+
- **`cds(gene)`** → `list[tuple[str, str, str]]`
|
|
119
|
+
Every possible single-nucleotide substitution across the coding sequence.
|
|
120
|
+
Each entry is `(c.HGVS, p.HGVS_1letter, p.HGVS_3letter)`. The initiator
|
|
121
|
+
methionine is always reported as `M1?` / `Met1?`.
|
|
122
|
+
|
|
123
|
+
- **`missense(gene)`** → `list[tuple[str, str, str]]`
|
|
124
|
+
Codon substitutions that change the encoded amino acid. Includes multi-base
|
|
125
|
+
changes (reported as `delins`). Same tuple format as `cds()`.
|
|
126
|
+
|
|
127
|
+
- **`codon_sub(gene)`** → `list[str]`
|
|
128
|
+
All possible codon substitutions (silent + missense). Single-base changes
|
|
129
|
+
use `X>Y` notation; multi-base changes use `delins`.
|
|
130
|
+
|
|
131
|
+
### UTR variants
|
|
132
|
+
|
|
133
|
+
- **`utr5(gene)`** → `list[str]`
|
|
134
|
+
All SNVs in the 5' untranslated region. Positions use negative HGVS
|
|
135
|
+
numbering (`c.-59`, `c.-58`, …, `c.-1`).
|
|
136
|
+
|
|
137
|
+
- **`utr3(gene)`** → `list[str]`
|
|
138
|
+
All SNVs in the 3' untranslated region. Positions use `c.*` numbering
|
|
139
|
+
(`c.*1`, `c.*2`, …).
|
|
140
|
+
|
|
141
|
+
### Splice site variants
|
|
142
|
+
|
|
143
|
+
- **`splice_site(gene)`** → `list[str]`
|
|
144
|
+
Canonical donor (+1, +2) and acceptor (−2, −1) positions of every intron
|
|
145
|
+
within the CDS boundaries. Prefixed with the genomic RefSeq accession.
|
|
146
|
+
|
|
147
|
+
### Protein-level variants
|
|
148
|
+
|
|
149
|
+
- **`aa_sub(gene)`** → `list[tuple[str, str]]`
|
|
150
|
+
All single amino-acid substitutions. Each entry is
|
|
151
|
+
`(p.HGVS_1letter, p.HGVS_3letter)`. The initiator methionine is reported
|
|
152
|
+
as `M1?` / `Met1?`.
|
|
153
|
+
|
|
154
|
+
## Development
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
# Editable install with test dependencies
|
|
158
|
+
pip install -e ".[test]"
|
|
159
|
+
|
|
160
|
+
# Run tests (no network required — 38 tests)
|
|
161
|
+
pytest tests/ -v
|
|
162
|
+
|
|
163
|
+
# Run tests with coverage
|
|
164
|
+
pytest --cov=deepmutsim --cov-report=term-missing
|
|
165
|
+
|
|
166
|
+
# Build a distribution
|
|
167
|
+
python -m build
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## License
|
|
171
|
+
|
|
172
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
# DeepMutSim
|
|
2
|
+
|
|
3
|
+
Generate all possible single nucleotide variants (SNVs) for MANE transcripts
|
|
4
|
+
using [HGVS nomenclature][hgvs]. DeepMutSim queries NCBI Entrez to fetch the
|
|
5
|
+
MANE Select / MANE Plus Clinical transcript for any human gene and enumerates
|
|
6
|
+
every possible substitution — coding, UTR, splice sites, and protein-level.
|
|
7
|
+
|
|
8
|
+
[hgvs]: https://varnomen.hgvs.org/
|
|
9
|
+
|
|
10
|
+
## Features
|
|
11
|
+
|
|
12
|
+
| Function | Output | Description |
|
|
13
|
+
|---|---|---|
|
|
14
|
+
| `cds(gene)` | `list[tuple]` | All CDS SNVs with protein consequence (1-letter & 3-letter) |
|
|
15
|
+
| `missense(gene)` | `list[tuple]` | Missense variants (codon substitutions that change the amino acid) |
|
|
16
|
+
| `codon_sub(gene)` | `list[str]` | All codon substitutions (silent + missense, incl. multi-base `delins`) |
|
|
17
|
+
| `aa_sub(gene)` | `list[tuple]` | All single amino-acid substitutions |
|
|
18
|
+
| `utr5(gene)` | `list[str]` | All 5' UTR SNVs (negative HGVS numbering) |
|
|
19
|
+
| `utr3(gene)` | `list[str]` | All 3' UTR SNVs (`c.*` numbering) |
|
|
20
|
+
| `splice_site(gene)` | `list[str]` | Canonical splice donor/acceptor ±1, ±2 variants |
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install deepmutsim
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
DeepMutSim requires Python ≥3.8 and [Biopython][biopython].
|
|
29
|
+
|
|
30
|
+
[biopython]: https://biopython.org/
|
|
31
|
+
|
|
32
|
+
## Configuration
|
|
33
|
+
|
|
34
|
+
Two environment variables configure NCBI Entrez access; `EMAIL` is **mandatory** and `API_KEY` is recommended:
|
|
35
|
+
|
|
36
|
+
| Variable | Required | Description |
|
|
37
|
+
|---|---|---|
|
|
38
|
+
| `EMAIL` | Yes | Your email address (NCBI policy) |
|
|
39
|
+
| `API_KEY` | Recommended | NCBI API key — raises the rate limit from 3 to 10 req/s |
|
|
40
|
+
|
|
41
|
+
[Obtain an API key][ncbi-key] from your NCBI account settings.
|
|
42
|
+
|
|
43
|
+
[ncbi-key]: https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
|
|
44
|
+
|
|
45
|
+
**Linux / macOS:**
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
export EMAIL="your.email@example.com"
|
|
49
|
+
export API_KEY="your_api_key_here"
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
**Windows (PowerShell):**
|
|
53
|
+
|
|
54
|
+
```powershell
|
|
55
|
+
$env:EMAIL = "your.email@example.com"
|
|
56
|
+
$env:API_KEY = "your_api_key_here"
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Quick start
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
import deepmutsim
|
|
63
|
+
|
|
64
|
+
# All CDS single-nucleotide variants
|
|
65
|
+
variants = deepmutsim.cds("INS")
|
|
66
|
+
# [('NM_000207.3:c.1A>G', 'NP_000198.1:p.(M1?)', 'NP_000198.1:p.(Met1?)'), ...]
|
|
67
|
+
|
|
68
|
+
# All missense variants
|
|
69
|
+
missense = deepmutsim.missense("TP53")
|
|
70
|
+
|
|
71
|
+
# All 5' UTR SNVs
|
|
72
|
+
utr5_vars = deepmutsim.utr5("BRCA1")
|
|
73
|
+
|
|
74
|
+
# All canonical splice site variants
|
|
75
|
+
splice = deepmutsim.splice_site("CFTR")
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## API reference
|
|
79
|
+
|
|
80
|
+
### Query helpers
|
|
81
|
+
|
|
82
|
+
- **`nm(gene)`** — Fetch the MANE nucleotide record (GenBank).
|
|
83
|
+
- **`np(gene)`** — Fetch the MANE protein record (FASTA).
|
|
84
|
+
- **`nc(gene)`** — Fetch the primary assembly RefSeq accession.
|
|
85
|
+
|
|
86
|
+
### CDS variants
|
|
87
|
+
|
|
88
|
+
- **`cds(gene)`** → `list[tuple[str, str, str]]`
|
|
89
|
+
Every possible single-nucleotide substitution across the coding sequence.
|
|
90
|
+
Each entry is `(c.HGVS, p.HGVS_1letter, p.HGVS_3letter)`. The initiator
|
|
91
|
+
methionine is always reported as `M1?` / `Met1?`.
|
|
92
|
+
|
|
93
|
+
- **`missense(gene)`** → `list[tuple[str, str, str]]`
|
|
94
|
+
Codon substitutions that change the encoded amino acid. Includes multi-base
|
|
95
|
+
changes (reported as `delins`). Same tuple format as `cds()`.
|
|
96
|
+
|
|
97
|
+
- **`codon_sub(gene)`** → `list[str]`
|
|
98
|
+
All possible codon substitutions (silent + missense). Single-base changes
|
|
99
|
+
use `X>Y` notation; multi-base changes use `delins`.
|
|
100
|
+
|
|
101
|
+
### UTR variants
|
|
102
|
+
|
|
103
|
+
- **`utr5(gene)`** → `list[str]`
|
|
104
|
+
All SNVs in the 5' untranslated region. Positions use negative HGVS
|
|
105
|
+
numbering (`c.-59`, `c.-58`, …, `c.-1`).
|
|
106
|
+
|
|
107
|
+
- **`utr3(gene)`** → `list[str]`
|
|
108
|
+
All SNVs in the 3' untranslated region. Positions use `c.*` numbering
|
|
109
|
+
(`c.*1`, `c.*2`, …).
|
|
110
|
+
|
|
111
|
+
### Splice site variants
|
|
112
|
+
|
|
113
|
+
- **`splice_site(gene)`** → `list[str]`
|
|
114
|
+
Canonical donor (+1, +2) and acceptor (−2, −1) positions of every intron
|
|
115
|
+
within the CDS boundaries. Prefixed with the genomic RefSeq accession.
|
|
116
|
+
|
|
117
|
+
### Protein-level variants
|
|
118
|
+
|
|
119
|
+
- **`aa_sub(gene)`** → `list[tuple[str, str]]`
|
|
120
|
+
All single amino-acid substitutions. Each entry is
|
|
121
|
+
`(p.HGVS_1letter, p.HGVS_3letter)`. The initiator methionine is reported
|
|
122
|
+
as `M1?` / `Met1?`.
|
|
123
|
+
|
|
124
|
+
## Development
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
# Editable install with test dependencies
|
|
128
|
+
pip install -e ".[test]"
|
|
129
|
+
|
|
130
|
+
# Run tests (no network required — 38 tests)
|
|
131
|
+
pytest tests/ -v
|
|
132
|
+
|
|
133
|
+
# Run tests with coverage
|
|
134
|
+
pytest --cov=deepmutsim --cov-report=term-missing
|
|
135
|
+
|
|
136
|
+
# Build a distribution
|
|
137
|
+
python -m build
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## License
|
|
141
|
+
|
|
142
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "DeepMutSim"
|
|
7
|
+
version = "1.1.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="Liu Sun", email="sunliu@yxnu.edu.cn" },
|
|
10
|
+
{ name="Jian Yang", email="yangjian@yxnu.edu.cn" },
|
|
11
|
+
]
|
|
12
|
+
description = "Enumerate all possible SNVs for MANE transcripts using HGVS nomenclature."
|
|
13
|
+
readme = "README.md"
|
|
14
|
+
license = "MIT"
|
|
15
|
+
requires-python = ">=3.8"
|
|
16
|
+
keywords = ["bioinformatics", "hgvs", "variant", "mane", "ncbi", "entrez", "mutation"]
|
|
17
|
+
dependencies = [
|
|
18
|
+
"biopython",
|
|
19
|
+
]
|
|
20
|
+
classifiers = [
|
|
21
|
+
"Development Status :: 5 - Production/Stable",
|
|
22
|
+
"Intended Audience :: Science/Research",
|
|
23
|
+
"Operating System :: OS Independent",
|
|
24
|
+
"Programming Language :: Python :: 3",
|
|
25
|
+
"Programming Language :: Python :: 3.8",
|
|
26
|
+
"Programming Language :: Python :: 3.9",
|
|
27
|
+
"Programming Language :: Python :: 3.10",
|
|
28
|
+
"Programming Language :: Python :: 3.11",
|
|
29
|
+
"Programming Language :: Python :: 3.12",
|
|
30
|
+
"Programming Language :: Python :: 3.13",
|
|
31
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
test = [
|
|
36
|
+
"pytest>=7.0",
|
|
37
|
+
"pytest-cov>=4.0",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Homepage = "https://github.com/liu-sun/DeepMutSim"
|
|
42
|
+
Issues = "https://github.com/liu-sun/DeepMutSim/issues"
|
|
43
|
+
Repository = "https://github.com/liu-sun/DeepMutSim"
|
|
44
|
+
|
|
45
|
+
[tool.pytest.ini_options]
|
|
46
|
+
testpaths = ["tests"]
|
|
47
|
+
pythonpath = ["src"]
|
|
48
|
+
addopts = [
|
|
49
|
+
"--doctest-modules",
|
|
50
|
+
"-v",
|
|
51
|
+
]
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: DeepMutSim
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Summary: Enumerate all possible SNVs for MANE transcripts using HGVS nomenclature.
|
|
5
|
+
Author-email: Liu Sun <sunliu@yxnu.edu.cn>, Jian Yang <yangjian@yxnu.edu.cn>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/liu-sun/DeepMutSim
|
|
8
|
+
Project-URL: Issues, https://github.com/liu-sun/DeepMutSim/issues
|
|
9
|
+
Project-URL: Repository, https://github.com/liu-sun/DeepMutSim
|
|
10
|
+
Keywords: bioinformatics,hgvs,variant,mane,ncbi,entrez,mutation
|
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: biopython
|
|
26
|
+
Provides-Extra: test
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
28
|
+
Requires-Dist: pytest-cov>=4.0; extra == "test"
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
|
|
31
|
+
# DeepMutSim
|
|
32
|
+
|
|
33
|
+
Generate all possible single nucleotide variants (SNVs) for MANE transcripts
|
|
34
|
+
using [HGVS nomenclature][hgvs]. DeepMutSim queries NCBI Entrez to fetch the
|
|
35
|
+
MANE Select / MANE Plus Clinical transcript for any human gene and enumerates
|
|
36
|
+
every possible substitution — coding, UTR, splice sites, and protein-level.
|
|
37
|
+
|
|
38
|
+
[hgvs]: https://varnomen.hgvs.org/
|
|
39
|
+
|
|
40
|
+
## Features
|
|
41
|
+
|
|
42
|
+
| Function | Output | Description |
|
|
43
|
+
|---|---|---|
|
|
44
|
+
| `cds(gene)` | `list[tuple]` | All CDS SNVs with protein consequence (1-letter & 3-letter) |
|
|
45
|
+
| `missense(gene)` | `list[tuple]` | Missense variants (codon substitutions that change the amino acid) |
|
|
46
|
+
| `codon_sub(gene)` | `list[str]` | All codon substitutions (silent + missense, incl. multi-base `delins`) |
|
|
47
|
+
| `aa_sub(gene)` | `list[tuple]` | All single amino-acid substitutions |
|
|
48
|
+
| `utr5(gene)` | `list[str]` | All 5' UTR SNVs (negative HGVS numbering) |
|
|
49
|
+
| `utr3(gene)` | `list[str]` | All 3' UTR SNVs (`c.*` numbering) |
|
|
50
|
+
| `splice_site(gene)` | `list[str]` | Canonical splice donor/acceptor ±1, ±2 variants |
|
|
51
|
+
|
|
52
|
+
## Installation
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install deepmutsim
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
DeepMutSim requires Python ≥3.8 and [Biopython][biopython].
|
|
59
|
+
|
|
60
|
+
[biopython]: https://biopython.org/
|
|
61
|
+
|
|
62
|
+
## Configuration
|
|
63
|
+
|
|
64
|
+
Two environment variables configure NCBI Entrez access; `EMAIL` is **mandatory** and `API_KEY` is recommended:
|
|
65
|
+
|
|
66
|
+
| Variable | Required | Description |
|
|
67
|
+
|---|---|---|
|
|
68
|
+
| `EMAIL` | Yes | Your email address (NCBI policy) |
|
|
69
|
+
| `API_KEY` | Recommended | NCBI API key — raises the rate limit from 3 to 10 req/s |
|
|
70
|
+
|
|
71
|
+
[Obtain an API key][ncbi-key] from your NCBI account settings.
|
|
72
|
+
|
|
73
|
+
[ncbi-key]: https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
|
|
74
|
+
|
|
75
|
+
**Linux / macOS:**
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
export EMAIL="your.email@example.com"
|
|
79
|
+
export API_KEY="your_api_key_here"
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
**Windows (PowerShell):**
|
|
83
|
+
|
|
84
|
+
```powershell
|
|
85
|
+
$env:EMAIL = "your.email@example.com"
|
|
86
|
+
$env:API_KEY = "your_api_key_here"
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Quick start
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
import deepmutsim
|
|
93
|
+
|
|
94
|
+
# All CDS single-nucleotide variants
|
|
95
|
+
variants = deepmutsim.cds("INS")
|
|
96
|
+
# [('NM_000207.3:c.1A>G', 'NP_000198.1:p.(M1?)', 'NP_000198.1:p.(Met1?)'), ...]
|
|
97
|
+
|
|
98
|
+
# All missense variants
|
|
99
|
+
missense = deepmutsim.missense("TP53")
|
|
100
|
+
|
|
101
|
+
# All 5' UTR SNVs
|
|
102
|
+
utr5_vars = deepmutsim.utr5("BRCA1")
|
|
103
|
+
|
|
104
|
+
# All canonical splice site variants
|
|
105
|
+
splice = deepmutsim.splice_site("CFTR")
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## API reference
|
|
109
|
+
|
|
110
|
+
### Query helpers
|
|
111
|
+
|
|
112
|
+
- **`nm(gene)`** — Fetch the MANE nucleotide record (GenBank).
|
|
113
|
+
- **`np(gene)`** — Fetch the MANE protein record (FASTA).
|
|
114
|
+
- **`nc(gene)`** — Fetch the primary assembly RefSeq accession.
|
|
115
|
+
|
|
116
|
+
### CDS variants
|
|
117
|
+
|
|
118
|
+
- **`cds(gene)`** → `list[tuple[str, str, str]]`
|
|
119
|
+
Every possible single-nucleotide substitution across the coding sequence.
|
|
120
|
+
Each entry is `(c.HGVS, p.HGVS_1letter, p.HGVS_3letter)`. The initiator
|
|
121
|
+
methionine is always reported as `M1?` / `Met1?`.
|
|
122
|
+
|
|
123
|
+
- **`missense(gene)`** → `list[tuple[str, str, str]]`
|
|
124
|
+
Codon substitutions that change the encoded amino acid. Includes multi-base
|
|
125
|
+
changes (reported as `delins`). Same tuple format as `cds()`.
|
|
126
|
+
|
|
127
|
+
- **`codon_sub(gene)`** → `list[str]`
|
|
128
|
+
All possible codon substitutions (silent + missense). Single-base changes
|
|
129
|
+
use `X>Y` notation; multi-base changes use `delins`.
|
|
130
|
+
|
|
131
|
+
### UTR variants
|
|
132
|
+
|
|
133
|
+
- **`utr5(gene)`** → `list[str]`
|
|
134
|
+
All SNVs in the 5' untranslated region. Positions use negative HGVS
|
|
135
|
+
numbering (`c.-59`, `c.-58`, …, `c.-1`).
|
|
136
|
+
|
|
137
|
+
- **`utr3(gene)`** → `list[str]`
|
|
138
|
+
All SNVs in the 3' untranslated region. Positions use `c.*` numbering
|
|
139
|
+
(`c.*1`, `c.*2`, …).
|
|
140
|
+
|
|
141
|
+
### Splice site variants
|
|
142
|
+
|
|
143
|
+
- **`splice_site(gene)`** → `list[str]`
|
|
144
|
+
Canonical donor (+1, +2) and acceptor (−2, −1) positions of every intron
|
|
145
|
+
within the CDS boundaries. Prefixed with the genomic RefSeq accession.
|
|
146
|
+
|
|
147
|
+
### Protein-level variants
|
|
148
|
+
|
|
149
|
+
- **`aa_sub(gene)`** → `list[tuple[str, str]]`
|
|
150
|
+
All single amino-acid substitutions. Each entry is
|
|
151
|
+
`(p.HGVS_1letter, p.HGVS_3letter)`. The initiator methionine is reported
|
|
152
|
+
as `M1?` / `Met1?`.
|
|
153
|
+
|
|
154
|
+
## Development
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
# Editable install with test dependencies
|
|
158
|
+
pip install -e ".[test]"
|
|
159
|
+
|
|
160
|
+
# Run tests (no network required — 38 tests)
|
|
161
|
+
pytest tests/ -v
|
|
162
|
+
|
|
163
|
+
# Run tests with coverage
|
|
164
|
+
pytest --cov=deepmutsim --cov-report=term-missing
|
|
165
|
+
|
|
166
|
+
# Build a distribution
|
|
167
|
+
python -m build
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## License
|
|
171
|
+
|
|
172
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -6,4 +6,8 @@ src/DeepMutSim.egg-info/SOURCES.txt
|
|
|
6
6
|
src/DeepMutSim.egg-info/dependency_links.txt
|
|
7
7
|
src/DeepMutSim.egg-info/requires.txt
|
|
8
8
|
src/DeepMutSim.egg-info/top_level.txt
|
|
9
|
-
src/deepmutsim/__init__.py
|
|
9
|
+
src/deepmutsim/__init__.py
|
|
10
|
+
src/deepmutsim/_fetch.py
|
|
11
|
+
src/deepmutsim/_variants.py
|
|
12
|
+
tests/test_fetch.py
|
|
13
|
+
tests/test_variants.py
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""DeepMutSim — generate all possible SNVs for MANE transcripts in HGVS notation.
|
|
2
|
+
|
|
3
|
+
Usage::
|
|
4
|
+
|
|
5
|
+
import deepmutsim
|
|
6
|
+
deepmutsim.cds("INS") # all CDS single-nucleotide variants
|
|
7
|
+
deepmutsim.missense("INS") # all missense variants
|
|
8
|
+
deepmutsim.aa_sub("INS") # all amino-acid substitutions
|
|
9
|
+
deepmutsim.utr5("INS") # all 5'UTR SNVs
|
|
10
|
+
deepmutsim.utr3("INS") # all 3'UTR SNVs
|
|
11
|
+
deepmutsim.splice_site("INS") # canonical splice-site SNVs
|
|
12
|
+
deepmutsim.codon_sub("INS") # all codon substitutions
|
|
13
|
+
|
|
14
|
+
Requires the ``EMAIL`` environment variable (and, optionally, ``API_KEY``) for NCBI Entrez access.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ._fetch import _configure_entrez, nm, np, nc
|
|
18
|
+
from ._variants import aa_sub, cds, codon_sub, missense, splice_site, utr3, utr5
|
|
19
|
+
|
|
20
|
+
# Configure NCBI Entrez credentials at import time (raises KeyError if EMAIL is unset).
|
|
21
|
+
_configure_entrez()
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"nm",
|
|
25
|
+
"np",
|
|
26
|
+
"nc",
|
|
27
|
+
"cds",
|
|
28
|
+
"utr5",
|
|
29
|
+
"utr3",
|
|
30
|
+
"splice_site",
|
|
31
|
+
"aa_sub",
|
|
32
|
+
"codon_sub",
|
|
33
|
+
"missense",
|
|
34
|
+
]
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""NCBI Entrez query helpers for fetching MANE transcript records.
|
|
2
|
+
|
|
3
|
+
All functions require the ``EMAIL`` environment variable (``API_KEY`` is optional but recommended) to be set
|
|
4
|
+
before use. Call :func:`_configure_entrez` once, or set them through
|
|
5
|
+
``Bio.Entrez.email`` / ``Bio.Entrez.api_key`` directly.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
from Bio import Entrez, SeqIO
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _configure_entrez():
    """Set NCBI Entrez credentials from environment variables.

    Reads ``EMAIL`` (mandatory) and ``API_KEY`` (recommended) and assigns them
    to ``Bio.Entrez.email`` and ``Bio.Entrez.api_key``.

    Raises
    ------
    KeyError
        If the ``EMAIL`` environment variable is not set.
    """
    Entrez.email = os.environ["EMAIL"]
    # An unset or blank API_KEY must be stored as None, not "": Biopython only
    # omits request parameters whose value is None, so an empty string would
    # be sent to NCBI as a (blank) API key.
    Entrez.api_key = os.environ.get("API_KEY") or None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def nm(gene: str):
    """Fetch the MANE Select / MANE Plus Clinical nucleotide record for *gene*.

    Runs an Entrez ``esearch`` against the ``nucleotide`` database, then an
    ``efetch`` of the returned IDs in GenBank text format.

    Parameters
    ----------
    gene : str
        Gene symbol (e.g. ``"INS"``, ``"TP53"``).

    Returns
    -------
    Bio.SeqRecord.SeqRecord
        GenBank record of the MANE transcript.

    Notes
    -----
    The fetched data is parsed with ``SeqIO.read``, which expects exactly one
    record. NOTE(review): a gene matching zero or several transcripts will
    presumably fail there with ``ValueError`` -- confirm desired handling.
    """
    stream = Entrez.esearch(
        db="nucleotide",
        term=f'{gene}[Gene Name] AND ("MANE Select"[Keyword] OR "MANE Plus Clinical"[keyword])',
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()

    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        return SeqIO.read(stream, "genbank")
    finally:
        stream.close()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def np(gene: str):
    """Fetch the MANE Select / MANE Plus Clinical protein record for *gene*.

    Runs an Entrez ``esearch`` against the ``protein`` database, then an
    ``efetch`` of the returned IDs in FASTA text format.

    Parameters
    ----------
    gene : str
        Gene symbol (e.g. ``"INS"``, ``"TP53"``).

    Returns
    -------
    Bio.SeqRecord.SeqRecord
        FASTA record of the MANE protein.

    Notes
    -----
    The fetched data is parsed with ``SeqIO.read``, which expects exactly one
    record. NOTE(review): a gene matching zero or several proteins will
    presumably fail there with ``ValueError`` -- confirm desired handling.
    """
    stream = Entrez.esearch(
        db="protein",
        term=f'{gene}[Gene Name] AND ("MANE Select"[Keyword] OR "MANE Plus Clinical"[keyword])',
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()

    stream = Entrez.efetch(
        db="protein", id=record["IdList"], rettype="fasta", retmode="text"
    )
    try:
        return SeqIO.read(stream, "fasta")
    finally:
        stream.close()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def nc(gene: str) -> str:
    """Fetch the primary assembly RefSeq accession for *gene*.

    Runs an Entrez ``esearch`` against the ``nucleotide`` database restricted
    to human "Primary Assembly" titles, then an ``efetch`` of the returned IDs
    with ``rettype="acc"``.

    Parameters
    ----------
    gene : str
        Gene symbol (e.g. ``"INS"``, ``"TP53"``).

    Returns
    -------
    str
        RefSeq accession of the primary assembly (e.g. ``"NC_000011.10"``).
        The raw response text is returned with surrounding whitespace
        stripped. NOTE(review): if the search matches several IDs the text
        presumably holds one accession per line -- confirm against callers.
    """
    stream = Entrez.esearch(
        db="nucleotide",
        term=f'{gene}[Gene Name] AND "Primary Assembly"[Title] AND human[Organism]',
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()

    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="acc", retmode="text"
    )
    try:
        return stream.read().strip()
    finally:
        stream.close()
|