varsim 1.0.6__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- varsim/__init__.py +16 -423
- varsim/core.py +432 -0
- {varsim-1.0.6.dist-info → varsim-1.0.7.dist-info}/METADATA +1 -1
- varsim-1.0.7.dist-info/RECORD +7 -0
- varsim-1.0.6.dist-info/RECORD +0 -6
- {varsim-1.0.6.dist-info → varsim-1.0.7.dist-info}/WHEEL +0 -0
- {varsim-1.0.6.dist-info → varsim-1.0.7.dist-info}/licenses/LICENSE +0 -0
- {varsim-1.0.6.dist-info → varsim-1.0.7.dist-info}/top_level.txt +0 -0
varsim/__init__.py
CHANGED
@@ -1,432 +1,25 @@
|
|
1
|
-
import
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
1
|
+
from .core import (
|
2
|
+
cds,
|
3
|
+
utr5,
|
4
|
+
utr3,
|
5
|
+
splicing,
|
6
|
+
inframe_del,
|
7
|
+
inframe_dup,
|
8
|
+
frameshift_del,
|
9
|
+
frameshift_dup,
|
10
|
+
aa_sub,
|
11
|
+
missense,
|
10
12
|
)
|
11
|
-
from Bio.Seq import Seq
|
12
|
-
from Bio.SeqFeature import SimpleLocation
|
13
|
-
from Bio.SeqUtils import seq3
|
14
13
|
|
15
14
|
__all__ = [
|
16
|
-
"frameshift_dup",
|
17
|
-
"frameshift_del",
|
18
15
|
"cds",
|
19
|
-
"inframe_dup",
|
20
|
-
"inframe_del",
|
21
|
-
"splicing",
|
22
16
|
"utr5",
|
23
17
|
"utr3",
|
18
|
+
"splicing",
|
19
|
+
"inframe_del",
|
20
|
+
"inframe_dup",
|
21
|
+
"frameshift_del",
|
22
|
+
"frameshift_dup",
|
24
23
|
"aa_sub",
|
25
24
|
"missense",
|
26
25
|
]
|
27
|
-
|
28
|
-
Entrez.email = os.environ["EMAIL"]
|
29
|
-
Entrez.api_key = os.environ["API_KEY"]
|
30
|
-
codons = standard_dna_table.forward_table.keys()
|
31
|
-
|
32
|
-
|
33
|
-
def cds(gene: str) -> list:
|
34
|
-
variants = []
|
35
|
-
stream = Entrez.esearch(
|
36
|
-
db="nucleotide",
|
37
|
-
term=f'{gene}[Gene Name] "mane select"[Keyword]',
|
38
|
-
)
|
39
|
-
record = Entrez.read(stream)
|
40
|
-
stream = Entrez.efetch(
|
41
|
-
db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
|
42
|
-
)
|
43
|
-
seqrecord = SeqIO.read(stream, "genbank")
|
44
|
-
for feature in seqrecord.features:
|
45
|
-
if feature.type == "CDS":
|
46
|
-
protein = "".join(feature.qualifiers.get("translation"))
|
47
|
-
protein_id = "".join(feature.qualifiers.get("protein_id"))
|
48
|
-
cds = feature.extract(seqrecord).seq
|
49
|
-
for index, codon in enumerate(range(0, len(cds) - 3, 3)):
|
50
|
-
for base in unambiguous_dna_letters:
|
51
|
-
if base != cds[codon]:
|
52
|
-
seq = Seq(base) + cds[codon + 1 : codon + 3]
|
53
|
-
if protein[index] != seq.translate():
|
54
|
-
variants.append(
|
55
|
-
(
|
56
|
-
f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base}",
|
57
|
-
f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
|
58
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
|
59
|
-
)
|
60
|
-
)
|
61
|
-
else:
|
62
|
-
variants.append(
|
63
|
-
(
|
64
|
-
f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base}",
|
65
|
-
f"{protein_id}:p.{protein[index]}{index + 1}=",
|
66
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
|
67
|
-
)
|
68
|
-
)
|
69
|
-
if base != cds[codon + 1]:
|
70
|
-
seq = cds[codon] + Seq(base) + cds[codon + 2]
|
71
|
-
if protein[index] != seq.translate():
|
72
|
-
variants.append(
|
73
|
-
(
|
74
|
-
f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base}",
|
75
|
-
f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
|
76
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
|
77
|
-
)
|
78
|
-
)
|
79
|
-
else:
|
80
|
-
variants.append(
|
81
|
-
(
|
82
|
-
f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base}",
|
83
|
-
f"{protein_id}:p.{protein[index]}{index + 1}=",
|
84
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
|
85
|
-
)
|
86
|
-
)
|
87
|
-
if base != cds[codon + 2]:
|
88
|
-
seq = cds[codon : codon + 2] + Seq(base)
|
89
|
-
if protein[index] != seq.translate():
|
90
|
-
variants.append(
|
91
|
-
(
|
92
|
-
f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base}",
|
93
|
-
f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
|
94
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
|
95
|
-
)
|
96
|
-
)
|
97
|
-
else:
|
98
|
-
variants.append(
|
99
|
-
(
|
100
|
-
f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base}",
|
101
|
-
f"{protein_id}:p.{protein[index]}{index + 1}=",
|
102
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
|
103
|
-
)
|
104
|
-
)
|
105
|
-
return variants
|
106
|
-
|
107
|
-
|
108
|
-
def utr5(gene: str) -> list:
|
109
|
-
variants = []
|
110
|
-
stream = Entrez.esearch(
|
111
|
-
db="nucleotide",
|
112
|
-
term=f'{gene}[Gene Name] "mane select"[Keyword]',
|
113
|
-
)
|
114
|
-
record = Entrez.read(stream)
|
115
|
-
stream = Entrez.efetch(
|
116
|
-
db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
|
117
|
-
)
|
118
|
-
seqrecord = SeqIO.read(stream, "genbank")
|
119
|
-
for feature in seqrecord.features:
|
120
|
-
if feature.type == "CDS":
|
121
|
-
utr5 = SimpleLocation(0, feature.location.start).extract(seqrecord).seq
|
122
|
-
for index in range(len(utr5)):
|
123
|
-
for base in unambiguous_dna_letters:
|
124
|
-
if base != utr5[index]:
|
125
|
-
variants.append(
|
126
|
-
(
|
127
|
-
f"{seqrecord.id}:c.{index - len(utr5)}{utr5[index]}>{base}",
|
128
|
-
"",
|
129
|
-
"",
|
130
|
-
)
|
131
|
-
)
|
132
|
-
return variants
|
133
|
-
|
134
|
-
|
135
|
-
def utr3(gene: str) -> list:
|
136
|
-
variants = []
|
137
|
-
stream = Entrez.esearch(
|
138
|
-
db="nucleotide",
|
139
|
-
term=f'{gene}[Gene Name] "mane select"[Keyword]',
|
140
|
-
)
|
141
|
-
record = Entrez.read(stream)
|
142
|
-
stream = Entrez.efetch(
|
143
|
-
db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
|
144
|
-
)
|
145
|
-
seqrecord = SeqIO.read(stream, "genbank")
|
146
|
-
for feature in seqrecord.features:
|
147
|
-
if feature.type == "CDS":
|
148
|
-
utr3 = (
|
149
|
-
SimpleLocation(feature.location.end, len(seqrecord))
|
150
|
-
.extract(seqrecord)
|
151
|
-
.seq
|
152
|
-
)
|
153
|
-
for index in range(len(utr3)):
|
154
|
-
for base in unambiguous_dna_letters:
|
155
|
-
if base != utr3[index]:
|
156
|
-
variants.append(
|
157
|
-
(
|
158
|
-
f"{seqrecord.id}:c.*{index + 1}{utr3[index]}>{base}",
|
159
|
-
"",
|
160
|
-
"",
|
161
|
-
)
|
162
|
-
)
|
163
|
-
return variants
|
164
|
-
|
165
|
-
|
166
|
-
def splicing(gene: str) -> list:
|
167
|
-
variants = []
|
168
|
-
exon = []
|
169
|
-
stream = Entrez.esearch(
|
170
|
-
db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]'
|
171
|
-
)
|
172
|
-
record = Entrez.read(stream)
|
173
|
-
|
174
|
-
stream = Entrez.efetch(
|
175
|
-
db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
|
176
|
-
)
|
177
|
-
seqrecord = SeqIO.read(stream, "genbank")
|
178
|
-
splicing = []
|
179
|
-
variants = []
|
180
|
-
start = 0
|
181
|
-
end = 0
|
182
|
-
for feature in seqrecord.features:
|
183
|
-
if feature.type == "CDS":
|
184
|
-
start = feature.location.start
|
185
|
-
end = feature.location.end
|
186
|
-
for feature in seqrecord.features:
|
187
|
-
if feature.type == "exon":
|
188
|
-
if feature.location.start < start and feature.location.end < start:
|
189
|
-
splicing.extend(
|
190
|
-
(
|
191
|
-
feature.location.start - start - 1,
|
192
|
-
feature.location.end - start - 1,
|
193
|
-
)
|
194
|
-
)
|
195
|
-
elif feature.location.start < start and feature.location.end > start:
|
196
|
-
splicing.extend(
|
197
|
-
(feature.location.start - start - 1, feature.location.end - start)
|
198
|
-
)
|
199
|
-
else:
|
200
|
-
splicing.extend(
|
201
|
-
(feature.location.start - start, feature.location.end - start)
|
202
|
-
)
|
203
|
-
|
204
|
-
for coordinate in range(1, len(splicing) - 1, 2):
|
205
|
-
site = splicing[coordinate], splicing[coordinate] + 1
|
206
|
-
for base in unambiguous_dna_letters:
|
207
|
-
if base != "G":
|
208
|
-
variants.append((f"{seqrecord.id}:c.{site[0]}+1G>{base}"))
|
209
|
-
if base != "T":
|
210
|
-
variants.append((f"{seqrecord.id}:c.{site[0]}+2T>{base}"))
|
211
|
-
if base != "A":
|
212
|
-
variants.append((f"{seqrecord.id}:c.{site[1]}-2A>{base}"))
|
213
|
-
if base != "G":
|
214
|
-
variants.append((f"{seqrecord.id}:c.{site[1]}-1G>{base}"))
|
215
|
-
return variants
|
216
|
-
|
217
|
-
|
218
|
-
def aa_sub(gene: str) -> list:
|
219
|
-
variants = []
|
220
|
-
term = f'{gene}[Gene Name] AND "mane select"[keyword]'
|
221
|
-
stream = Entrez.esearch(db="protein", term=term)
|
222
|
-
record = Entrez.read(stream)
|
223
|
-
|
224
|
-
stream = Entrez.efetch(
|
225
|
-
db="protein", rettype="gp", retmode="text", id=record["IdList"]
|
226
|
-
)
|
227
|
-
seqrecord = SeqIO.read(stream, "genbank")
|
228
|
-
for index, residue in enumerate(seqrecord.seq, 1):
|
229
|
-
for aa in protein_letters:
|
230
|
-
if aa != residue:
|
231
|
-
variants.append(
|
232
|
-
(
|
233
|
-
f"{seqrecord.id}:p.{residue}{index}{aa}",
|
234
|
-
f"{seqrecord.id}:p.{protein_letters_1to3[residue]}{index}{protein_letters_1to3[aa]}",
|
235
|
-
)
|
236
|
-
)
|
237
|
-
return variants
|
238
|
-
|
239
|
-
|
240
|
-
def missense(gene: str) -> list:
|
241
|
-
variants = []
|
242
|
-
term = f'{gene}[Gene Name] "mane select"[keyword]'
|
243
|
-
stream = Entrez.esearch(db="nucleotide", term=term)
|
244
|
-
record = Entrez.read(stream)
|
245
|
-
stream = Entrez.efetch(
|
246
|
-
db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
|
247
|
-
)
|
248
|
-
seqrecord = SeqIO.read(stream, "genbank")
|
249
|
-
for feature in seqrecord.features:
|
250
|
-
if feature.type == "CDS":
|
251
|
-
protein = "".join(feature.qualifiers.get("translation"))
|
252
|
-
protein_id = "".join(feature.qualifiers.get("protein_id"))
|
253
|
-
cds = feature.location.extract(seqrecord).seq
|
254
|
-
for index, codon in enumerate(range(0, len(cds) - 3, 3)):
|
255
|
-
for base in codons:
|
256
|
-
if base != cds[codon : codon + 3]:
|
257
|
-
seq = Seq(base)
|
258
|
-
if protein[index] != seq.translate():
|
259
|
-
if (
|
260
|
-
base[0] == cds[codon]
|
261
|
-
and base[1] == cds[codon + 1]
|
262
|
-
and base[2] != cds[codon + 2]
|
263
|
-
):
|
264
|
-
variants.append(
|
265
|
-
(
|
266
|
-
f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base[2]}",
|
267
|
-
f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
|
268
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
|
269
|
-
)
|
270
|
-
)
|
271
|
-
elif (
|
272
|
-
base[0] == cds[codon]
|
273
|
-
and base[1] != cds[codon + 1]
|
274
|
-
and base[2] == cds[codon + 2]
|
275
|
-
):
|
276
|
-
variants.append(
|
277
|
-
(
|
278
|
-
f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base[1]}",
|
279
|
-
f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
|
280
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
|
281
|
-
)
|
282
|
-
)
|
283
|
-
elif (
|
284
|
-
base[0] != cds[codon]
|
285
|
-
and base[1] == cds[codon + 1]
|
286
|
-
and base[2] == cds[codon + 2]
|
287
|
-
):
|
288
|
-
variants.append(
|
289
|
-
(
|
290
|
-
f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base[0]}",
|
291
|
-
f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
|
292
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
|
293
|
-
)
|
294
|
-
)
|
295
|
-
else:
|
296
|
-
variants.append(
|
297
|
-
(
|
298
|
-
f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}>{base}",
|
299
|
-
f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
|
300
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
|
301
|
-
)
|
302
|
-
)
|
303
|
-
else:
|
304
|
-
if (
|
305
|
-
base[0] == cds[codon]
|
306
|
-
and base[1] == cds[codon + 1]
|
307
|
-
and base[2] != cds[codon + 2]
|
308
|
-
):
|
309
|
-
variants.append(
|
310
|
-
(
|
311
|
-
f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base[2]}",
|
312
|
-
f"{protein_id}:p.{protein[index]}{index + 1}=",
|
313
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
|
314
|
-
)
|
315
|
-
)
|
316
|
-
elif (
|
317
|
-
base[0] == cds[codon]
|
318
|
-
and base[1] != cds[codon + 1]
|
319
|
-
and base[2] == cds[codon + 2]
|
320
|
-
):
|
321
|
-
variants.append(
|
322
|
-
(
|
323
|
-
f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base[1]}",
|
324
|
-
f"{protein_id}:p.{protein[index]}{index + 1}=",
|
325
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
|
326
|
-
)
|
327
|
-
)
|
328
|
-
elif (
|
329
|
-
base[0] != cds[codon]
|
330
|
-
and base[1] == cds[codon + 1]
|
331
|
-
and base[2] == cds[codon + 2]
|
332
|
-
):
|
333
|
-
variants.append(
|
334
|
-
(
|
335
|
-
f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base[0]}",
|
336
|
-
f"{protein_id}:p.{protein[index]}{index + 1}=",
|
337
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
|
338
|
-
)
|
339
|
-
)
|
340
|
-
else:
|
341
|
-
variants.append(
|
342
|
-
(
|
343
|
-
f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}>{base}",
|
344
|
-
f"{protein_id}:p.{protein[index]}{index + 1}=",
|
345
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
|
346
|
-
)
|
347
|
-
)
|
348
|
-
return variants
|
349
|
-
|
350
|
-
|
351
|
-
def inframe_del(gene: str) -> list:
|
352
|
-
variants = []
|
353
|
-
term = f'{gene}[Gene Name] "mane select"[keyword]'
|
354
|
-
stream = Entrez.esearch(db="nucleotide", term=term)
|
355
|
-
record = Entrez.read(stream)
|
356
|
-
stream = Entrez.efetch(
|
357
|
-
db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
|
358
|
-
)
|
359
|
-
seqrecord = SeqIO.read(stream, "genbank")
|
360
|
-
for feature in seqrecord.features:
|
361
|
-
if feature.type == "CDS":
|
362
|
-
protein = "".join(feature.qualifiers.get("translation"))
|
363
|
-
protein_id = "".join(feature.qualifiers.get("protein_id"))
|
364
|
-
cds = feature.location.extract(seqrecord).seq
|
365
|
-
for index, codon in enumerate(range(0, len(cds) - 3, 3)):
|
366
|
-
variants.append(
|
367
|
-
(
|
368
|
-
f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}del",
|
369
|
-
f"{protein_id}:p.{protein[index]}{index + 1}del",
|
370
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}del",
|
371
|
-
)
|
372
|
-
)
|
373
|
-
return variants
|
374
|
-
|
375
|
-
|
376
|
-
def inframe_dup(gene: str) -> list:
|
377
|
-
variants = []
|
378
|
-
term = f'{gene}[Gene Name] "mane select"[keyword]'
|
379
|
-
stream = Entrez.esearch(db="nucleotide", term=term)
|
380
|
-
record = Entrez.read(stream)
|
381
|
-
stream = Entrez.efetch(
|
382
|
-
db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
|
383
|
-
)
|
384
|
-
seqrecord = SeqIO.read(stream, "genbank")
|
385
|
-
for feature in seqrecord.features:
|
386
|
-
if feature.type == "CDS":
|
387
|
-
protein = "".join(feature.qualifiers.get("translation"))
|
388
|
-
protein_id = "".join(feature.qualifiers.get("protein_id"))
|
389
|
-
cds = feature.location.extract(seqrecord).seq
|
390
|
-
for index, codon in enumerate(range(0, len(cds) - 3, 3)):
|
391
|
-
variants.append(
|
392
|
-
(
|
393
|
-
f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}dup",
|
394
|
-
f"{protein_id}:p.{protein[index]}{index + 1}dup",
|
395
|
-
f"{protein_id}:p.{seq3(protein[index])}{index + 1}dup",
|
396
|
-
)
|
397
|
-
)
|
398
|
-
return variants
|
399
|
-
|
400
|
-
|
401
|
-
def frameshift_dup(gene: str) -> list:
|
402
|
-
variants = []
|
403
|
-
term = f'{gene}[Gene Name] "mane select"[keyword]'
|
404
|
-
stream = Entrez.esearch(db="nucleotide", term=term)
|
405
|
-
record = Entrez.read(stream)
|
406
|
-
stream = Entrez.efetch(
|
407
|
-
db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
|
408
|
-
)
|
409
|
-
seqrecord = SeqIO.read(stream, "genbank")
|
410
|
-
for feature in seqrecord.features:
|
411
|
-
if feature.type == "CDS":
|
412
|
-
cds = feature.location.extract(seqrecord).seq
|
413
|
-
for index, base in enumerate(cds, start=1):
|
414
|
-
variants.append((f"{seqrecord.id}:c.{str(index) + base}dup",))
|
415
|
-
return variants
|
416
|
-
|
417
|
-
|
418
|
-
def frameshift_del(gene: str) -> list:
|
419
|
-
variants = []
|
420
|
-
term = f'{gene}[Gene Name] "mane select"[keyword]'
|
421
|
-
stream = Entrez.esearch(db="nucleotide", term=term)
|
422
|
-
record = Entrez.read(stream)
|
423
|
-
stream = Entrez.efetch(
|
424
|
-
db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
|
425
|
-
)
|
426
|
-
seqrecord = SeqIO.read(stream, "genbank")
|
427
|
-
for feature in seqrecord.features:
|
428
|
-
if feature.type == "CDS":
|
429
|
-
cds = feature.location.extract(seqrecord).seq
|
430
|
-
for index, base in enumerate(cds, start=1):
|
431
|
-
variants.append((f"{seqrecord.id}:c.{str(index) + base}del",))
|
432
|
-
return variants
|
varsim/core.py
ADDED
@@ -0,0 +1,432 @@
|
|
1
|
+
import os
|
2
|
+
|
3
|
+
from Bio import Entrez, SeqIO
|
4
|
+
from Bio.Data.CodonTable import standard_dna_table
|
5
|
+
from Bio.Data.IUPACData import (
|
6
|
+
unambiguous_dna_letters,
|
7
|
+
protein_letters,
|
8
|
+
protein_letters_1to3,
|
9
|
+
protein_letters_3to1,
|
10
|
+
)
|
11
|
+
from Bio.Seq import Seq
|
12
|
+
from Bio.SeqFeature import SimpleLocation
|
13
|
+
from Bio.SeqUtils import seq3
|
14
|
+
|
15
|
+
__all__ = [
|
16
|
+
"frameshift_dup",
|
17
|
+
"frameshift_del",
|
18
|
+
"cds",
|
19
|
+
"inframe_dup",
|
20
|
+
"inframe_del",
|
21
|
+
"splicing",
|
22
|
+
"utr5",
|
23
|
+
"utr3",
|
24
|
+
"aa_sub",
|
25
|
+
"missense",
|
26
|
+
]
|
27
|
+
|
28
|
+
# NCBI credentials are taken from the environment. ``os.environ.get`` is used
# so that importing the package does not fail with a bare ``KeyError`` when a
# variable is unset; in that case the attribute is simply set to ``None``.
# NOTE(review): ``None`` is believed to be Biopython's own default for both
# attributes (anonymous, lower-rate access) — confirm against Bio.Entrez docs.
Entrez.email = os.environ.get("EMAIL")
Entrez.api_key = os.environ.get("API_KEY")

# Codons of the standard DNA table's forward (codon -> amino acid) mapping;
# iterated by ``missense`` as the set of candidate replacement codons.
codons = standard_dna_table.forward_table.keys()
|
31
|
+
|
32
|
+
|
33
|
+
def cds(gene: str) -> list:
    """Enumerate every single-nucleotide substitution in a gene's coding sequence.

    Searches the NCBI ``nucleotide`` database for ``gene`` restricted to the
    "mane select" keyword, downloads the hit as a GenBank record and, for each
    ``CDS`` feature, replaces every base of every codon except the final one
    with each of the other unambiguous DNA letters.

    Args:
        gene: Gene name placed in the ``[Gene Name]`` field of the Entrez query.

    Returns:
        A list of 3-tuples ``(cdna, protein_1letter, protein_3letter)``.
        ``cdna`` looks like ``<record id>:c.<pos><ref>><alt>``; the protein
        strings look like ``<protein id>:p.<ref><pos><alt>``, or end in ``=``
        when the substituted codon translates to the same residue.
    """
    variants = []

    stream = Entrez.esearch(
        db="nucleotide",
        term=f'{gene}[Gene Name] "mane select"[Keyword]',
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP handle even if parsing fails.
        stream.close()

    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.extract(seqrecord).seq
        # The last three bases are skipped (presumably the stop codon, which
        # has no residue in the ``translation`` qualifier).
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            ref_codon = str(coding[codon : codon + 3])
            ref_aa = protein[index]
            for base in unambiguous_dna_letters:
                # Emission order: per candidate base, codon positions 1, 2, 3.
                for offset in range(3):
                    ref_base = ref_codon[offset]
                    if base == ref_base:
                        continue
                    alt_codon = ref_codon[:offset] + base + ref_codon[offset + 1 :]
                    alt_aa = str(Seq(alt_codon).translate())
                    cdna = f"{seqrecord.id}:c.{codon + offset + 1}{ref_base}>{base}"
                    if ref_aa != alt_aa:
                        variants.append(
                            (
                                cdna,
                                f"{protein_id}:p.{ref_aa}{index + 1}{alt_aa}",
                                f"{protein_id}:p.{seq3(ref_aa)}{index + 1}{seq3(alt_aa)}",
                            )
                        )
                    else:
                        variants.append(
                            (
                                cdna,
                                f"{protein_id}:p.{ref_aa}{index + 1}=",
                                f"{protein_id}:p.{seq3(ref_aa)}{index + 1}=",
                            )
                        )
    return variants
|
106
|
+
|
107
|
+
|
108
|
+
def utr5(gene: str) -> list:
    """Enumerate every single-nucleotide substitution upstream of the CDS.

    Fetches the "mane select" GenBank record for ``gene`` from the NCBI
    ``nucleotide`` database and, for each ``CDS`` feature, substitutes every
    base located between the start of the record and the start of that CDS.

    Args:
        gene: Gene name placed in the ``[Gene Name]`` field of the Entrez query.

    Returns:
        A list of 3-tuples ``(cdna, "", "")``. ``cdna`` looks like
        ``<record id>:c.-<n><ref>><alt>``, where ``-1`` is the base immediately
        before the CDS. The two empty strings fill the protein slots.
    """
    variants = []

    stream = Entrez.esearch(
        db="nucleotide",
        term=f'{gene}[Gene Name] "mane select"[Keyword]',
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP handle even if parsing fails.
        stream.close()

    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        upstream = SimpleLocation(0, feature.location.start).extract(seqrecord).seq
        length = len(upstream)
        for index, ref_base in enumerate(upstream):
            # index - length runs from -length up to -1.
            position = index - length
            for base in unambiguous_dna_letters:
                if base != ref_base:
                    variants.append(
                        (
                            f"{seqrecord.id}:c.{position}{ref_base}>{base}",
                            "",
                            "",
                        )
                    )
    return variants
|
133
|
+
|
134
|
+
|
135
|
+
def utr3(gene: str) -> list:
    """Enumerate every single-nucleotide substitution downstream of the CDS.

    Fetches the "mane select" GenBank record for ``gene`` from the NCBI
    ``nucleotide`` database and, for each ``CDS`` feature, substitutes every
    base located between the end of that CDS and the end of the record.

    Args:
        gene: Gene name placed in the ``[Gene Name]`` field of the Entrez query.

    Returns:
        A list of 3-tuples ``(cdna, "", "")``. ``cdna`` looks like
        ``<record id>:c.*<n><ref>><alt>``, where ``*1`` is the base immediately
        after the CDS. The two empty strings fill the protein slots.
    """
    variants = []

    stream = Entrez.esearch(
        db="nucleotide",
        term=f'{gene}[Gene Name] "mane select"[Keyword]',
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP handle even if parsing fails.
        stream.close()

    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        downstream = (
            SimpleLocation(feature.location.end, len(seqrecord))
            .extract(seqrecord)
            .seq
        )
        for position, ref_base in enumerate(downstream, start=1):
            for base in unambiguous_dna_letters:
                if base != ref_base:
                    variants.append(
                        (
                            f"{seqrecord.id}:c.*{position}{ref_base}>{base}",
                            "",
                            "",
                        )
                    )
    return variants
|
164
|
+
|
165
|
+
|
166
|
+
def splicing(gene: str) -> list:
    """Enumerate substitutions at the canonical splice sites of a transcript.

    Fetches the "mane select" GenBank record for ``gene`` from the NCBI
    ``nucleotide`` database, reads the ``CDS`` and ``exon`` features and, for
    every exon except the last, emits the substitutions of an assumed ``GT``
    donor (``+1G``, ``+2T``) after the exon and an assumed ``AG`` acceptor
    (``-2A``, ``-1G``) before the following base. The record is not consulted
    for the actual intronic bases.

    Positions are written relative to the CDS: negative numbers upstream of
    the first coding base, plain numbers inside the CDS and ``*<n>``
    downstream of the last CDS base. There is no position 0, so an exon ending
    on the base just before the CDS gives ``c.-1+...`` / ``c.1-...``.

    Args:
        gene: Gene name placed in the ``[Gene Name]`` field of the Entrez query.

    Returns:
        A list of plain strings (not tuples, unlike the sibling functions),
        each like ``<record id>:c.<pos>+1G><alt>``.
    """
    variants = []

    stream = Entrez.esearch(
        db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]'
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP handle even if parsing fails.
        stream.close()

    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    cds_start = 0  # 0-based offset of the first coding base
    cds_end = None  # 1-based position of the last CDS base; None if no CDS
    exon_ends = []  # 1-based position of the last base of each exon
    for feature in seqrecord.features:
        if feature.type == "CDS":
            cds_start = int(feature.location.start)
            cds_end = int(feature.location.end)
        elif feature.type == "exon":
            exon_ends.append(int(feature.location.end))

    def c_position(position: int) -> str:
        """Convert a 1-based transcript position to a CDS-relative position."""
        if position <= cds_start:
            # Upstream of the CDS: the base just before it is -1.
            return str(position - cds_start - 1)
        if cds_end is not None and position > cds_end:
            # Downstream of the CDS: the base just after it is *1.
            return f"*{position - cds_end}"
        return str(position - cds_start)

    # The last exon has no downstream junction, so it is left out.
    for exon_end in exon_ends[:-1]:
        donor = c_position(exon_end)
        acceptor = c_position(exon_end + 1)
        for base in unambiguous_dna_letters:
            if base != "G":
                variants.append(f"{seqrecord.id}:c.{donor}+1G>{base}")
            if base != "T":
                variants.append(f"{seqrecord.id}:c.{donor}+2T>{base}")
            if base != "A":
                variants.append(f"{seqrecord.id}:c.{acceptor}-2A>{base}")
            if base != "G":
                variants.append(f"{seqrecord.id}:c.{acceptor}-1G>{base}")
    return variants
|
216
|
+
|
217
|
+
|
218
|
+
def aa_sub(gene: str) -> list:
    """Enumerate every single amino-acid substitution of a gene's protein.

    Searches the NCBI ``protein`` database for ``gene`` restricted to the
    "mane select" keyword, downloads the hit as a GenPept record and replaces
    each residue with every other letter in ``protein_letters``.

    Three-letter codes come from ``seq3`` rather than ``protein_letters_1to3``
    so that a reference residue outside the 20 standard letters does not
    raise ``KeyError``.

    Args:
        gene: Gene name placed in the ``[Gene Name]`` field of the Entrez query.

    Returns:
        A list of 2-tuples ``(protein_1letter, protein_3letter)`` of the form
        ``<record id>:p.<ref><pos><alt>``, with 1-based positions.
    """
    variants = []
    term = f'{gene}[Gene Name] AND "mane select"[keyword]'

    stream = Entrez.esearch(db="protein", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP handle even if parsing fails.
        stream.close()

    stream = Entrez.efetch(
        db="protein", rettype="gp", retmode="text", id=record["IdList"]
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for index, residue in enumerate(seqrecord.seq, 1):
        residue_3letter = seq3(residue)
        for aa in protein_letters:
            if aa != residue:
                variants.append(
                    (
                        f"{seqrecord.id}:p.{residue}{index}{aa}",
                        f"{seqrecord.id}:p.{residue_3letter}{index}{seq3(aa)}",
                    )
                )
    return variants
|
238
|
+
|
239
|
+
|
240
|
+
def missense(gene: str) -> list:
    """Enumerate every codon replacement in a gene's coding sequence.

    Fetches the "mane select" GenBank record for ``gene`` from the NCBI
    ``nucleotide`` database and, for each ``CDS`` feature, replaces every
    codon except the final one with each other codon in the module-level
    ``codons`` collection.

    A replacement that differs from the reference at exactly one position is
    written as a single-base substitution. Any other replacement is written
    over the whole codon.

    Args:
        gene: Gene name placed in the ``[Gene Name]`` field of the Entrez query.

    Returns:
        A list of 3-tuples ``(cdna, protein_1letter, protein_3letter)``. The
        protein strings end in ``=`` when the replacement codon encodes the
        same residue as the reference.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'

    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP handle even if parsing fails.
        stream.close()

    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        # The last three bases are skipped (presumably the stop codon).
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            ref_codon = str(coding[codon : codon + 3])
            ref_aa = protein[index]
            for alt_codon in codons:
                if alt_codon == ref_codon:
                    continue
                alt_aa = str(Seq(alt_codon).translate())
                changed = [
                    offset
                    for offset in range(3)
                    if alt_codon[offset] != ref_codon[offset]
                ]
                if len(changed) == 1:
                    offset = changed[0]
                    cdna = (
                        f"{seqrecord.id}:c.{codon + offset + 1}"
                        f"{ref_codon[offset]}>{alt_codon[offset]}"
                    )
                else:
                    # NOTE(review): HGVS describes multi-base changes with
                    # "delins"; the existing ">" form is kept so callers see
                    # the same strings as before.
                    cdna = (
                        f"{seqrecord.id}:c.{codon + 1}_{codon + 3}"
                        f"{ref_codon}>{alt_codon}"
                    )
                if ref_aa != alt_aa:
                    variants.append(
                        (
                            cdna,
                            f"{protein_id}:p.{ref_aa}{index + 1}{alt_aa}",
                            f"{protein_id}:p.{seq3(ref_aa)}{index + 1}{seq3(alt_aa)}",
                        )
                    )
                else:
                    variants.append(
                        (
                            cdna,
                            f"{protein_id}:p.{ref_aa}{index + 1}=",
                            f"{protein_id}:p.{seq3(ref_aa)}{index + 1}=",
                        )
                    )
    return variants
|
349
|
+
|
350
|
+
|
351
|
+
def inframe_del(gene: str) -> list:
    """Enumerate the deletion of each whole codon in a gene's coding sequence.

    Fetches the "mane select" GenBank record for ``gene`` from the NCBI
    ``nucleotide`` database and, for each ``CDS`` feature, emits one deletion
    per codon, excluding the final three bases.

    Args:
        gene: Gene name placed in the ``[Gene Name]`` field of the Entrez query.

    Returns:
        A list of 3-tuples ``(cdna, protein_1letter, protein_3letter)``, e.g.
        ``<record id>:c.<first>_<last><codon>del`` and
        ``<protein id>:p.<residue><pos>del``.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'

    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP handle even if parsing fails.
        stream.close()

    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            residue = protein[index]
            variants.append(
                (
                    f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{coding[codon:codon + 3]}del",
                    f"{protein_id}:p.{residue}{index + 1}del",
                    f"{protein_id}:p.{seq3(residue)}{index + 1}del",
                )
            )
    return variants
|
374
|
+
|
375
|
+
|
376
|
+
def inframe_dup(gene: str) -> list:
    """Enumerate the duplication of each whole codon in a gene's coding sequence.

    Fetches the "mane select" GenBank record for ``gene`` from the NCBI
    ``nucleotide`` database and, for each ``CDS`` feature, emits one
    duplication per codon, excluding the final three bases.

    Args:
        gene: Gene name placed in the ``[Gene Name]`` field of the Entrez query.

    Returns:
        A list of 3-tuples ``(cdna, protein_1letter, protein_3letter)``, e.g.
        ``<record id>:c.<first>_<last><codon>dup`` and
        ``<protein id>:p.<residue><pos>dup``.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'

    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP handle even if parsing fails.
        stream.close()

    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            residue = protein[index]
            variants.append(
                (
                    f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{coding[codon:codon + 3]}dup",
                    f"{protein_id}:p.{residue}{index + 1}dup",
                    f"{protein_id}:p.{seq3(residue)}{index + 1}dup",
                )
            )
    return variants
|
399
|
+
|
400
|
+
|
401
|
+
def frameshift_dup(gene: str) -> list:
    """Enumerate the duplication of each single base of a gene's coding sequence.

    Fetches the "mane select" GenBank record for ``gene`` from the NCBI
    ``nucleotide`` database and, for each ``CDS`` feature, emits one
    single-base duplication per position of the extracted sequence.

    Args:
        gene: Gene name placed in the ``[Gene Name]`` field of the Entrez query.

    Returns:
        A list of 1-tuples ``(cdna,)`` of the form
        ``<record id>:c.<pos><base>dup``, with 1-based positions.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'

    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP handle even if parsing fails.
        stream.close()

    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        coding = feature.location.extract(seqrecord).seq
        variants.extend(
            (f"{seqrecord.id}:c.{position}{base}dup",)
            for position, base in enumerate(coding, start=1)
        )
    return variants
|
416
|
+
|
417
|
+
|
418
|
+
def frameshift_del(gene: str) -> list:
    """Enumerate the deletion of each single base of a gene's coding sequence.

    Fetches the "mane select" GenBank record for ``gene`` from the NCBI
    ``nucleotide`` database and, for each ``CDS`` feature, emits one
    single-base deletion per position of the extracted sequence.

    Args:
        gene: Gene name placed in the ``[Gene Name]`` field of the Entrez query.

    Returns:
        A list of 1-tuples ``(cdna,)`` of the form
        ``<record id>:c.<pos><base>del``, with 1-based positions.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'

    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP handle even if parsing fails.
        stream.close()

    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        coding = feature.location.extract(seqrecord).seq
        variants.extend(
            (f"{seqrecord.id}:c.{position}{base}del",)
            for position, base in enumerate(coding, start=1)
        )
    return variants
|
@@ -0,0 +1,7 @@
|
|
1
|
+
varsim/__init__.py,sha256=_Lb0mutPhj2XKnvssCgsxqGMk2LFmcJRmRA9EXS10ZU,368
|
2
|
+
varsim/core.py,sha256=ZIHKGBIp-8_seM6suoUCe2A8OgOL0gR8Oocu-IpCe_k,17712
|
3
|
+
varsim-1.0.7.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
4
|
+
varsim-1.0.7.dist-info/METADATA,sha256=-fHxAfvRqDftFLfz-oYZL9fTrxQtTh5Mpf3Jk61mVGw,2464
|
5
|
+
varsim-1.0.7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
6
|
+
varsim-1.0.7.dist-info/top_level.txt,sha256=2fLprhnBvkF-7VEOzGcpKoodqW08HjyNbVzM6emJrTI,7
|
7
|
+
varsim-1.0.7.dist-info/RECORD,,
|
varsim-1.0.6.dist-info/RECORD
DELETED
@@ -1,6 +0,0 @@
|
|
1
|
-
varsim/__init__.py,sha256=ZIHKGBIp-8_seM6suoUCe2A8OgOL0gR8Oocu-IpCe_k,17712
|
2
|
-
varsim-1.0.6.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
3
|
-
varsim-1.0.6.dist-info/METADATA,sha256=p205IC4VbHE2OXj7KexqchrFKTp0Ema67c37s4O3rFs,2464
|
4
|
-
varsim-1.0.6.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
5
|
-
varsim-1.0.6.dist-info/top_level.txt,sha256=2fLprhnBvkF-7VEOzGcpKoodqW08HjyNbVzM6emJrTI,7
|
6
|
-
varsim-1.0.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|