varsim 1.0.4__tar.gz → 1.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: varsim
3
- Version: 1.0.4
3
+ Version: 1.0.5
4
4
  Summary: Variant Simulator
5
5
  Author-email: Liu Sun <sunliu@yxnu.edu.cn>, Jian Yang <yangjian@yxnu.edu.cn>
6
6
  Project-URL: Homepage, https://github.com/liu-sun/VarSim
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "varsim"
7
- version = "1.0.4"
7
+ version = "1.0.5"
8
8
  authors = [
9
9
  { name="Liu Sun", email="sunliu@yxnu.edu.cn" },
10
10
  { name="Jian Yang", email="yangjian@yxnu.edu.cn" },
@@ -0,0 +1,433 @@
1
+ import os
2
+
3
+ from Bio import Entrez, SeqIO
4
+ from Bio.Data.CodonTable import standard_dna_table
5
+ from Bio.Data.IUPACData import (
6
+ unambiguous_dna_letters,
7
+ protein_letters,
8
+ protein_letters_1to3,
9
+ protein_letters_3to1,
10
+ )
11
+ from Bio.Seq import Seq
12
+ from Bio.SeqFeature import SimpleLocation
13
+ from Bio.SeqUtils import seq3
14
+
15
# Contact address and API key for NCBI E-utilities, taken from the
# environment.  ``.get`` is used so that merely importing this module does not
# raise ``KeyError`` when the variables are unset; both attributes are then
# left as ``None``.
Entrez.email = os.environ.get("EMAIL")
Entrez.api_key = os.environ.get("API_KEY")
# Codons that are keys of the standard DNA table's forward
# (codon -> amino acid) mapping; iterated by ``missense``.
codons = standard_dna_table.forward_table.keys()
18
+
19
+
20
def cds(gene: str) -> list:
    """List every single-nucleotide substitution in a gene's coding sequence.

    The nucleotide database is searched for ``gene`` with the
    "mane select" keyword and the matching GenBank record is downloaded.
    For each CDS feature, every base of every codon (except the final one)
    is replaced by each of the other unambiguous DNA letters.

    Args:
        gene: Gene symbol used in the ``[Gene Name]`` search field.

    Returns:
        A list of 3-tuples ``(c_hgvs, p_hgvs_1letter, p_hgvs_3letter)``.
        Substitutions that leave the amino acid unchanged are reported with
        ``=`` in both protein descriptions.
    """
    stream = Entrez.esearch(
        db="nucleotide",
        term=f'{gene}[Gene Name] "mane select"[Keyword]',
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.extract(seqrecord).seq
        # The range stops before the last codon -- presumably the stop codon,
        # which has no residue in the "translation" qualifier.
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            ref_codon = str(coding[codon : codon + 3])
            ref_aa = protein[index]
            # Iteration order (base, then codon position) matches the
            # order in which variants were historically emitted.
            for base in unambiguous_dna_letters:
                for offset in range(3):
                    ref_base = ref_codon[offset]
                    if base == ref_base:
                        continue
                    mutant = Seq(
                        ref_codon[:offset] + base + ref_codon[offset + 1 :]
                    )
                    alt_aa = str(mutant.translate())
                    if alt_aa != ref_aa:
                        one_letter = f"{protein_id}:p.{ref_aa}{index + 1}{alt_aa}"
                        three_letter = (
                            f"{protein_id}:p.{seq3(ref_aa)}{index + 1}{seq3(alt_aa)}"
                        )
                    else:
                        one_letter = f"{protein_id}:p.{ref_aa}{index + 1}="
                        three_letter = f"{protein_id}:p.{seq3(ref_aa)}{index + 1}="
                    variants.append(
                        (
                            f"{seqrecord.id}:c.{codon + offset + 1}{ref_base}>{base}",
                            one_letter,
                            three_letter,
                        )
                    )
    return variants
93
+
94
+
95
def utr5(gene: str) -> list:
    """List every single-nucleotide substitution upstream of the CDS.

    The GenBank record found for ``gene`` with the "mane select" keyword is
    downloaded; the sequence from the start of the record up to the start of
    each CDS feature is mutated base by base.

    Args:
        gene: Gene symbol used in the ``[Gene Name]`` search field.

    Returns:
        A list of 3-tuples ``(c_hgvs, "", "")``.  Positions are negative
        offsets counted back from the first CDS base (the last upstream base
        is ``-1``); the two protein slots are always empty strings.
    """
    stream = Entrez.esearch(
        db="nucleotide",
        term=f'{gene}[Gene Name] "mane select"[Keyword]',
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        upstream = SimpleLocation(0, feature.location.start).extract(seqrecord).seq
        length = len(upstream)
        for index, ref_base in enumerate(upstream):
            for base in unambiguous_dna_letters:
                if base != ref_base:
                    variants.append(
                        (
                            f"{seqrecord.id}:c.{index - length}{ref_base}>{base}",
                            "",
                            "",
                        )
                    )
    return variants
120
+
121
+
122
def utr3(gene: str) -> list:
    """List every single-nucleotide substitution downstream of the CDS.

    The GenBank record found for ``gene`` with the "mane select" keyword is
    downloaded; the sequence from the end of each CDS feature to the end of
    the record is mutated base by base.

    Args:
        gene: Gene symbol used in the ``[Gene Name]`` search field.

    Returns:
        A list of 3-tuples ``(c_hgvs, "", "")``.  Positions are written as
        ``*N`` with ``N`` counted from the first base after the CDS; the two
        protein slots are always empty strings.
    """
    stream = Entrez.esearch(
        db="nucleotide",
        term=f'{gene}[Gene Name] "mane select"[Keyword]',
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        downstream = (
            SimpleLocation(feature.location.end, len(seqrecord))
            .extract(seqrecord)
            .seq
        )
        for position, ref_base in enumerate(downstream, start=1):
            for base in unambiguous_dna_letters:
                if base != ref_base:
                    variants.append(
                        (
                            f"{seqrecord.id}:c.*{position}{ref_base}>{base}",
                            "",
                            "",
                        )
                    )
    return variants
151
+
152
+
153
def splicing(gene: str) -> list:
    """List substitutions at the splice sites flanking each exon junction.

    The GenBank record found for ``gene`` with the "mane select" keyword is
    downloaded.  For every exon feature except the last one, the end of the
    exon is taken as a junction and substitutions are generated at ``+1``
    and ``+2`` after it and at ``-2`` and ``-1`` before the following
    transcript base.

    The reference bases are hard-coded as ``G``/``T`` (donor) and ``A``/``G``
    (acceptor); the actual intron sequence is never read from the record.

    Args:
        gene: Gene symbol used in the ``[Gene Name]`` search field.

    Returns:
        A list of plain strings (not tuples, unlike the other generators in
        this module), each an HGVS-style ``c.`` description.
    """
    stream = Entrez.esearch(
        db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]'
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    # CDS bounds as 0-based, end-exclusive offsets into the record.  Both stay
    # 0 when the record has no CDS feature; if there are several, the last
    # one wins.
    start = 0
    end = 0
    for feature in seqrecord.features:
        if feature.type == "CDS":
            start = int(feature.location.start)
            end = int(feature.location.end)

    def label(position: int) -> str:
        """Return the ``c.`` coordinate of a 1-based position in the record."""
        if position <= start:
            # Upstream of the CDS: -1 is the base just before it; there is
            # no position 0.
            return str(position - start - 1)
        if end and position > end:
            # Downstream of the CDS: numbered *1, *2, ...
            return f"*{position - end}"
        return str(position - start)

    # A 0-based exclusive end equals the 1-based position of the exon's last
    # base.  Exon features are assumed to be listed in transcript order.
    exon_ends = [
        int(feature.location.end)
        for feature in seqrecord.features
        if feature.type == "exon"
    ]

    variants = []
    # The last exon has no downstream junction.
    for last_base in exon_ends[:-1]:
        donor = label(last_base)
        acceptor = label(last_base + 1)
        for base in unambiguous_dna_letters:
            if base != "G":
                variants.append(f"{seqrecord.id}:c.{donor}+1G>{base}")
            if base != "T":
                variants.append(f"{seqrecord.id}:c.{donor}+2T>{base}")
            if base != "A":
                variants.append(f"{seqrecord.id}:c.{acceptor}-2A>{base}")
            if base != "G":
                variants.append(f"{seqrecord.id}:c.{acceptor}-1G>{base}")
    return variants
203
+
204
+
205
def aa_sub(gene: str) -> list:
    """List every single amino-acid substitution of a gene's protein.

    The protein database is searched for ``gene`` with the "mane select"
    keyword and the matching GenPept record is downloaded.  Each residue is
    replaced by every other letter in ``protein_letters``.

    Args:
        gene: Gene symbol used in the ``[Gene Name]`` search field.

    Returns:
        A list of 2-tuples ``(p_hgvs_1letter, p_hgvs_3letter)``.
    """
    term = f'{gene}[Gene Name] AND "mane select"[keyword]'
    stream = Entrez.esearch(db="protein", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="protein", rettype="gp", retmode="text", id=record["IdList"]
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for index, residue in enumerate(seqrecord.seq, 1):
        # seq3 is used for the reference residue because the record may
        # contain letters that are not keys of protein_letters_1to3 (e.g.
        # "U" or "X"), which would otherwise raise KeyError.
        ref_three = seq3(residue)
        for aa in protein_letters:
            if aa != residue:
                variants.append(
                    (
                        f"{seqrecord.id}:p.{residue}{index}{aa}",
                        f"{seqrecord.id}:p.{ref_three}{index}{protein_letters_1to3[aa]}",
                    )
                )
    return variants
225
+
226
+
227
def missense(gene: str) -> list:
    """List every codon replacement in a gene's coding sequence.

    The GenBank record found for ``gene`` with the "mane select" keyword is
    downloaded.  For each CDS feature, every codon (except the final one) is
    replaced by each other codon in the module-level ``codons`` collection.

    Despite the name, replacements that leave the amino acid unchanged are
    also returned; they carry ``=`` in the protein descriptions.

    Args:
        gene: Gene symbol used in the ``[Gene Name]`` search field.

    Returns:
        A list of 3-tuples ``(c_hgvs, p_hgvs_1letter, p_hgvs_3letter)``.
        When exactly one base differs the DNA change is written as a
        single-base substitution; otherwise the whole codon is written as
        ``c.<first>_<last><ref>><alt>``.
    """
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    # Translate each candidate codon once instead of once per CDS codon.
    translated = {triplet: str(Seq(triplet).translate()) for triplet in codons}

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        # The range stops before the last codon -- presumably the stop codon,
        # which has no residue in the "translation" qualifier.
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            ref_codon = str(coding[codon : codon + 3])
            ref_aa = protein[index]
            for alt_codon in codons:
                if alt_codon == ref_codon:
                    continue

                # DNA-level description: single base vs. whole codon.
                changed = [
                    offset
                    for offset in range(3)
                    if alt_codon[offset] != ref_codon[offset]
                ]
                if len(changed) == 1:
                    offset = changed[0]
                    dna = (
                        f"{seqrecord.id}:c.{codon + offset + 1}"
                        f"{ref_codon[offset]}>{alt_codon[offset]}"
                    )
                else:
                    # NOTE(review): this multi-base form is kept as emitted
                    # historically; confirm whether a "delins" description
                    # is wanted instead.
                    dna = (
                        f"{seqrecord.id}:c.{codon + 1}_{codon + 3}"
                        f"{ref_codon}>{alt_codon}"
                    )

                # Protein-level description: changed residue vs. "=".
                alt_aa = translated[alt_codon]
                if alt_aa != ref_aa:
                    one_letter = f"{protein_id}:p.{ref_aa}{index + 1}{alt_aa}"
                    three_letter = (
                        f"{protein_id}:p.{seq3(ref_aa)}{index + 1}{seq3(alt_aa)}"
                    )
                else:
                    one_letter = f"{protein_id}:p.{ref_aa}{index + 1}="
                    three_letter = f"{protein_id}:p.{seq3(ref_aa)}{index + 1}="

                variants.append((dna, one_letter, three_letter))
    return variants
336
+
337
+
338
def inframe_del(gene: str) -> list:
    """List the deletion of each whole codon in a gene's coding sequence.

    The GenBank record found for ``gene`` with the "mane select" keyword is
    downloaded.  For each CDS feature, one variant is produced per codon
    (except the final one).

    Args:
        gene: Gene symbol used in the ``[Gene Name]`` search field.

    Returns:
        A list of 3-tuples ``(c_hgvs, p_hgvs_1letter, p_hgvs_3letter)``, each
        ending in ``del``.
    """
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        # The range stops before the last codon -- presumably the stop codon,
        # which has no residue in the "translation" qualifier.
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            triplet = coding[codon : codon + 3]
            residue = protein[index]
            variants.append(
                (
                    f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{triplet}del",
                    f"{protein_id}:p.{residue}{index + 1}del",
                    f"{protein_id}:p.{seq3(residue)}{index + 1}del",
                )
            )
    return variants
361
+
362
+
363
def inframe_dup(gene: str) -> list:
    """List the duplication of each whole codon in a gene's coding sequence.

    The GenBank record found for ``gene`` with the "mane select" keyword is
    downloaded.  For each CDS feature, one variant is produced per codon
    (except the final one).

    Args:
        gene: Gene symbol used in the ``[Gene Name]`` search field.

    Returns:
        A list of 3-tuples ``(c_hgvs, p_hgvs_1letter, p_hgvs_3letter)``, each
        ending in ``dup``.
    """
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        # The range stops before the last codon -- presumably the stop codon,
        # which has no residue in the "translation" qualifier.
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            triplet = coding[codon : codon + 3]
            residue = protein[index]
            variants.append(
                (
                    f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{triplet}dup",
                    f"{protein_id}:p.{residue}{index + 1}dup",
                    f"{protein_id}:p.{seq3(residue)}{index + 1}dup",
                )
            )
    return variants
386
+
387
+
388
def frameshift_dup(gene: str) -> list:
    """List the duplication of each single base in a gene's coding sequence.

    The GenBank record found for ``gene`` with the "mane select" keyword is
    downloaded.  One variant is produced for every base of every CDS
    feature, including the final codon.

    Args:
        gene: Gene symbol used in the ``[Gene Name]`` search field.

    Returns:
        A list of 1-tuples, each holding a ``c.<position><base>dup`` string
        with ``position`` counted from 1 at the first CDS base.
    """
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        coding = feature.location.extract(seqrecord).seq
        variants.extend(
            (f"{seqrecord.id}:c.{position}{base}dup",)
            for position, base in enumerate(coding, start=1)
        )
    return variants
403
+
404
+
405
def frameshift_del(gene: str) -> list:
    """List the deletion of each single base in a gene's coding sequence.

    The GenBank record found for ``gene`` with the "mane select" keyword is
    downloaded.  One variant is produced for every base of every CDS
    feature, including the final codon.

    Args:
        gene: Gene symbol used in the ``[Gene Name]`` search field.

    Returns:
        A list of 1-tuples, each holding a ``c.<position><base>del`` string
        with ``position`` counted from 1 at the first CDS base.
    """
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        coding = feature.location.extract(seqrecord).seq
        variants.extend(
            (f"{seqrecord.id}:c.{position}{base}del",)
            for position, base in enumerate(coding, start=1)
        )
    return variants
420
+
421
+
422
# Public API: the variant generators exported by ``from <module> import *``.
# Module-level helpers such as ``codons`` are intentionally not listed.
__all__ = [
    "frameshift_dup",
    "frameshift_del",
    "cds",
    "inframe_dup",
    "inframe_del",
    "splicing",
    "utr5",
    "utr3",
    "aa_sub",
    "missense",
]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: varsim
3
- Version: 1.0.4
3
+ Version: 1.0.5
4
4
  Summary: Variant Simulator
5
5
  Author-email: Liu Sun <sunliu@yxnu.edu.cn>, Jian Yang <yangjian@yxnu.edu.cn>
6
6
  Project-URL: Homepage, https://github.com/liu-sun/VarSim
@@ -1,270 +0,0 @@
1
- import os
2
-
3
- from Bio import Entrez, SeqIO
4
- from Bio.Data.CodonTable import standard_dna_table
5
- from Bio.Data.IUPACData import (unambiguous_dna_letters, protein_letters, protein_letters_1to3, protein_letters_3to1, )
6
- from Bio.Seq import Seq
7
- from Bio.SeqFeature import SimpleLocation
8
- from Bio.SeqUtils import seq3
9
-
10
- Entrez.email = os.environ["EMAIL"]
11
- Entrez.api_key = os.environ["API_KEY"]
12
- codons = standard_dna_table.forward_table.keys()
13
-
14
-
15
- def cds(gene: str) -> list:
16
- variants = []
17
- stream = Entrez.esearch(db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]', )
18
- record = Entrez.read(stream)
19
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
20
- seqrecord = SeqIO.read(stream, "genbank")
21
- for feature in seqrecord.features:
22
- if feature.type == "CDS":
23
- protein = "".join(feature.qualifiers.get("translation"))
24
- protein_id = "".join(feature.qualifiers.get("protein_id"))
25
- cds = feature.extract(seqrecord).seq
26
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
27
- for base in unambiguous_dna_letters:
28
- if base != cds[codon]:
29
- seq = Seq(base) + cds[codon + 1: codon + 3]
30
- if protein[index] != seq.translate():
31
- variants.append((f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base}",
32
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
33
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",))
34
- else:
35
- variants.append((f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base}",
36
- f"{protein_id}:p.{protein[index]}{index + 1}=",
37
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",))
38
- if base != cds[codon + 1]:
39
- seq = cds[codon] + Seq(base) + cds[codon + 2]
40
- if protein[index] != seq.translate():
41
- variants.append((f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base}",
42
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
43
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",))
44
- else:
45
- variants.append((f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base}",
46
- f"{protein_id}:p.{protein[index]}{index + 1}=",
47
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",))
48
- if base != cds[codon + 2]:
49
- seq = cds[codon: codon + 2] + Seq(base)
50
- if protein[index] != seq.translate():
51
- variants.append((f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base}",
52
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
53
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",))
54
- else:
55
- variants.append((f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base}",
56
- f"{protein_id}:p.{protein[index]}{index + 1}=",
57
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",))
58
- return variants
59
-
60
-
61
- def utr5(gene: str) -> list:
62
- variants = []
63
- stream = Entrez.esearch(db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]', )
64
- record = Entrez.read(stream)
65
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
66
- seqrecord = SeqIO.read(stream, "genbank")
67
- for feature in seqrecord.features:
68
- if feature.type == "CDS":
69
- utr5 = SimpleLocation(0, feature.location.start).extract(seqrecord).seq
70
- for index in range(len(utr5)):
71
- for base in unambiguous_dna_letters:
72
- if base != utr5[index]:
73
- variants.append((f"{seqrecord.id}:c.{index - len(utr5)}{utr5[index]}>{base}", "", "",))
74
- return variants
75
-
76
-
77
- def utr3(gene: str) -> list:
78
- variants = []
79
- stream = Entrez.esearch(db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]', )
80
- record = Entrez.read(stream)
81
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
82
- seqrecord = SeqIO.read(stream, "genbank")
83
- for feature in seqrecord.features:
84
- if feature.type == "CDS":
85
- utr3 = (SimpleLocation(feature.location.end, len(seqrecord)).extract(seqrecord).seq)
86
- for index in range(len(utr3)):
87
- for base in unambiguous_dna_letters:
88
- if base != utr3[index]:
89
- variants.append((f"{seqrecord.id}:c.*{index + 1}{utr3[index]}>{base}", "", "",))
90
- return variants
91
-
92
-
93
- def splicing(gene: str) -> list:
94
- variants = []
95
- exon = []
96
- stream = Entrez.esearch(db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]')
97
- record = Entrez.read(stream)
98
-
99
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
100
- seqrecord = SeqIO.read(stream, "genbank")
101
- splicing = []
102
- variants = []
103
- start = 0
104
- end = 0
105
- for feature in seqrecord.features:
106
- if feature.type == "CDS":
107
- start = feature.location.start
108
- end = feature.location.end
109
- for feature in seqrecord.features:
110
- if feature.type == "exon":
111
- if feature.location.start < start and feature.location.end < start:
112
- splicing.extend((feature.location.start - start - 1, feature.location.end - start - 1,))
113
- elif feature.location.start < start and feature.location.end > start:
114
- splicing.extend((feature.location.start - start - 1, feature.location.end - start))
115
- else:
116
- splicing.extend((feature.location.start - start, feature.location.end - start))
117
-
118
- for coordinate in range(1, len(splicing) - 1, 2):
119
- site = splicing[coordinate], splicing[coordinate] + 1
120
- for base in unambiguous_dna_letters:
121
- if base != "G":
122
- variants.append((f"{seqrecord.id}:c.{site[0]}+1G>{base}"))
123
- if base != "T":
124
- variants.append((f"{seqrecord.id}:c.{site[0]}+2T>{base}"))
125
- if base != "A":
126
- variants.append((f"{seqrecord.id}:c.{site[1]}-2A>{base}"))
127
- if base != "G":
128
- variants.append((f"{seqrecord.id}:c.{site[1]}-1G>{base}"))
129
- return variants
130
-
131
-
132
- def aa_sub(gene: str) -> list:
133
- variants = []
134
- term = f'{gene}[Gene Name] AND "mane select"[keyword]'
135
- stream = Entrez.esearch(db="protein", term=term)
136
- record = Entrez.read(stream)
137
-
138
- stream = Entrez.efetch(db="protein", rettype="gp", retmode="text", id=record["IdList"])
139
- seqrecord = SeqIO.read(stream, "genbank")
140
- for index, residue in enumerate(seqrecord.seq, 1):
141
- for aa in protein_letters:
142
- if aa != residue:
143
- variants.append((f"{seqrecord.id}:p.{residue}{index}{aa}",
144
- f"{seqrecord.id}:p.{protein_letters_1to3[residue]}{index}{protein_letters_1to3[aa]}",))
145
- return variants
146
-
147
-
148
- def missense(gene: str) -> list:
149
- variants = []
150
- term = f'{gene}[Gene Name] "mane select"[keyword]'
151
- stream = Entrez.esearch(db="nucleotide", term=term)
152
- record = Entrez.read(stream)
153
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
154
- seqrecord = SeqIO.read(stream, "genbank")
155
- for feature in seqrecord.features:
156
- if feature.type == "CDS":
157
- protein = "".join(feature.qualifiers.get("translation"))
158
- protein_id = "".join(feature.qualifiers.get("protein_id"))
159
- cds = feature.location.extract(seqrecord).seq
160
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
161
- for base in codons:
162
- if base != cds[codon: codon + 3]:
163
- seq = Seq(base)
164
- if protein[index] != seq.translate():
165
- if (base[0] == cds[codon] and base[1] == cds[codon + 1] and base[2] != cds[codon + 2]):
166
- variants.append((f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base[2]}",
167
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
168
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",))
169
- elif (base[0] == cds[codon] and base[1] != cds[codon + 1] and base[2] == cds[codon + 2]):
170
- variants.append((f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base[1]}",
171
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
172
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",))
173
- elif (base[0] != cds[codon] and base[1] == cds[codon + 1] and base[2] == cds[codon + 2]):
174
- variants.append((f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base[0]}",
175
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
176
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",))
177
- else:
178
- variants.append((f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}>{base}",
179
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
180
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",))
181
- else:
182
- if (base[0] == cds[codon] and base[1] == cds[codon + 1] and base[2] != cds[codon + 2]):
183
- variants.append((f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base[2]}",
184
- f"{protein_id}:p.{protein[index]}{index + 1}=",
185
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",))
186
- elif (base[0] == cds[codon] and base[1] != cds[codon + 1] and base[2] == cds[codon + 2]):
187
- variants.append((f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base[1]}",
188
- f"{protein_id}:p.{protein[index]}{index + 1}=",
189
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",))
190
- elif (base[0] != cds[codon] and base[1] == cds[codon + 1] and base[2] == cds[codon + 2]):
191
- variants.append((f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base[0]}",
192
- f"{protein_id}:p.{protein[index]}{index + 1}=",
193
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",))
194
- else:
195
- variants.append((f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}>{base}",
196
- f"{protein_id}:p.{protein[index]}{index + 1}=",
197
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",))
198
- return variants
199
-
200
-
201
- def inframe_del(gene: str) -> list:
202
- variants = []
203
- term = f'{gene}[Gene Name] "mane select"[keyword]'
204
- stream = Entrez.esearch(db="nucleotide", term=term)
205
- record = Entrez.read(stream)
206
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
207
- seqrecord = SeqIO.read(stream, "genbank")
208
- for feature in seqrecord.features:
209
- if feature.type == "CDS":
210
- protein = "".join(feature.qualifiers.get("translation"))
211
- protein_id = "".join(feature.qualifiers.get("protein_id"))
212
- cds = feature.location.extract(seqrecord).seq
213
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
214
- variants.append((f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}del",
215
- f"{protein_id}:p.{protein[index]}{index + 1}del",
216
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}del",))
217
- return variants
218
-
219
-
220
- def inframe_dup(gene: str) -> list:
221
- variants = []
222
- term = f'{gene}[Gene Name] "mane select"[keyword]'
223
- stream = Entrez.esearch(db="nucleotide", term=term)
224
- record = Entrez.read(stream)
225
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
226
- seqrecord = SeqIO.read(stream, "genbank")
227
- for feature in seqrecord.features:
228
- if feature.type == "CDS":
229
- protein = "".join(feature.qualifiers.get("translation"))
230
- protein_id = "".join(feature.qualifiers.get("protein_id"))
231
- cds = feature.location.extract(seqrecord).seq
232
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
233
- variants.append((f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}dup",
234
- f"{protein_id}:p.{protein[index]}{index + 1}dup",
235
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}dup",))
236
- return variants
237
-
238
-
239
- def frameshift_dup(gene: str) -> list:
240
- variants = []
241
- term = f'{gene}[Gene Name] "mane select"[keyword]'
242
- stream = Entrez.esearch(db="nucleotide", term=term)
243
- record = Entrez.read(stream)
244
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
245
- seqrecord = SeqIO.read(stream, "genbank")
246
- for feature in seqrecord.features:
247
- if feature.type == "CDS":
248
- cds = feature.location.extract(seqrecord).seq
249
- for index, base in enumerate(cds, start=1):
250
- variants.append((f"{seqrecord.id}:c.{str(index) + base}dup",))
251
- return variants
252
-
253
-
254
- def frameshift_del(gene: str) -> list:
255
- variants = []
256
- term = f'{gene}[Gene Name] "mane select"[keyword]'
257
- stream = Entrez.esearch(db="nucleotide", term=term)
258
- record = Entrez.read(stream)
259
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
260
- seqrecord = SeqIO.read(stream, "genbank")
261
- for feature in seqrecord.features:
262
- if feature.type == "CDS":
263
- cds = feature.location.extract(seqrecord).seq
264
- for index, base in enumerate(cds, start=1):
265
- variants.append((f"{seqrecord.id}:c.{str(index) + base}del",))
266
- return variants
267
-
268
-
269
- if __name__ == "__main__":
270
- print(frameshift_del("INS"))
File without changes
File without changes
File without changes