varsim 1.0.4__tar.gz → 1.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: varsim
3
- Version: 1.0.4
3
+ Version: 1.0.6
4
4
  Summary: Variant Simulator
5
5
  Author-email: Liu Sun <sunliu@yxnu.edu.cn>, Jian Yang <yangjian@yxnu.edu.cn>
6
6
  Project-URL: Homepage, https://github.com/liu-sun/VarSim
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "varsim"
7
- version = "1.0.4"
7
+ version = "1.0.6"
8
8
  authors = [
9
9
  { name="Liu Sun", email="sunliu@yxnu.edu.cn" },
10
10
  { name="Jian Yang", email="yangjian@yxnu.edu.cn" },
@@ -0,0 +1,432 @@
1
+ import os
2
+
3
+ from Bio import Entrez, SeqIO
4
+ from Bio.Data.CodonTable import standard_dna_table
5
+ from Bio.Data.IUPACData import (
6
+ unambiguous_dna_letters,
7
+ protein_letters,
8
+ protein_letters_1to3,
9
+ protein_letters_3to1,
10
+ )
11
+ from Bio.Seq import Seq
12
+ from Bio.SeqFeature import SimpleLocation
13
+ from Bio.SeqUtils import seq3
14
+
15
# Public API of the module: one generator function per variant class.
__all__ = [
    "frameshift_dup",
    "frameshift_del",
    "cds",
    "inframe_dup",
    "inframe_del",
    "splicing",
    "utr5",
    "utr3",
    "aa_sub",
    "missense",
]

# NCBI credentials are taken from the environment.  ``.get`` is used so that a
# missing variable leaves the Biopython default (``None``) in place instead of
# aborting the import with a bare ``KeyError``.
Entrez.email = os.environ.get("EMAIL")
Entrez.api_key = os.environ.get("API_KEY")

# All codons that have an entry in the standard table's forward (codon ->
# amino acid) mapping; used by ``missense`` as the set of replacement codons.
codons = standard_dna_table.forward_table.keys()
31
+
32
+
33
def cds(gene: str) -> list:
    """Enumerate every single-nucleotide substitution in a gene's coding sequence.

    The transcript is the record returned by an Entrez nucleotide search for
    ``<gene>[Gene Name] "mane select"[Keyword]``.

    Args:
        gene: Gene symbol used in the Entrez search term.

    Returns:
        A list of 3-tuples ``(c_variant, p_variant_1letter, p_variant_3letter)``.
        When the substituted codon translates to the same residue as the
        reference, the protein descriptions end in ``=``.

    Raises:
        ValueError: Raised by ``SeqIO.read`` when the fetch does not yield
            exactly one GenBank record.
    """
    variants = []
    stream = Entrez.esearch(
        db="nucleotide",
        term=f'{gene}[Gene Name] "mane select"[Keyword]',
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.extract(seqrecord).seq
        # Stopping at len(coding) - 3 leaves the final codon untouched
        # (presumably the stop codon, which has no residue in ``protein``).
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            ref_codon = coding[codon : codon + 3]
            ref_aa = protein[index]
            # Base-major, then position-in-codon: same emission order as the
            # three hand-written branches this loop replaces.
            for base in unambiguous_dna_letters:
                for offset in range(3):
                    ref_base = ref_codon[offset]
                    if base == ref_base:
                        continue
                    alt_codon = ref_codon[:offset] + base + ref_codon[offset + 1 :]
                    alt_aa = str(alt_codon.translate())
                    if alt_aa != ref_aa:
                        one_letter = f"{protein_id}:p.{ref_aa}{index + 1}{alt_aa}"
                        three_letter = (
                            f"{protein_id}:p.{seq3(ref_aa)}{index + 1}{seq3(alt_aa)}"
                        )
                    else:
                        one_letter = f"{protein_id}:p.{ref_aa}{index + 1}="
                        three_letter = f"{protein_id}:p.{seq3(ref_aa)}{index + 1}="
                    variants.append(
                        (
                            f"{seqrecord.id}:c.{codon + offset + 1}{ref_base}>{base}",
                            one_letter,
                            three_letter,
                        )
                    )
    return variants
106
+
107
+
108
def utr5(gene: str) -> list:
    """Enumerate every single-nucleotide substitution upstream of the CDS.

    The transcript is the record returned by an Entrez nucleotide search for
    ``<gene>[Gene Name] "mane select"[Keyword]``.  Positions are numbered
    negatively, counting back from the first CDS base (``c.-1`` is the base
    immediately before it).

    Args:
        gene: Gene symbol used in the Entrez search term.

    Returns:
        A list of 3-tuples ``(c_variant, "", "")``; the two empty strings keep
        the tuple shape used by the coding-sequence generators.

    Raises:
        ValueError: Raised by ``SeqIO.read`` when the fetch does not yield
            exactly one GenBank record.
    """
    variants = []
    stream = Entrez.esearch(
        db="nucleotide",
        term=f'{gene}[Gene Name] "mane select"[Keyword]',
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        # Everything from the start of the record up to the CDS start.
        upstream = SimpleLocation(0, feature.location.start).extract(seqrecord).seq
        length = len(upstream)
        for index, ref_base in enumerate(upstream):
            for base in unambiguous_dna_letters:
                if base != ref_base:
                    variants.append(
                        (
                            f"{seqrecord.id}:c.{index - length}{ref_base}>{base}",
                            "",
                            "",
                        )
                    )
    return variants
133
+
134
+
135
def utr3(gene: str) -> list:
    """Enumerate every single-nucleotide substitution downstream of the CDS.

    The transcript is the record returned by an Entrez nucleotide search for
    ``<gene>[Gene Name] "mane select"[Keyword]``.  Positions are written as
    ``c.*N``, where ``*1`` is the first base after the CDS end.

    Args:
        gene: Gene symbol used in the Entrez search term.

    Returns:
        A list of 3-tuples ``(c_variant, "", "")``; the two empty strings keep
        the tuple shape used by the coding-sequence generators.

    Raises:
        ValueError: Raised by ``SeqIO.read`` when the fetch does not yield
            exactly one GenBank record.
    """
    variants = []
    stream = Entrez.esearch(
        db="nucleotide",
        term=f'{gene}[Gene Name] "mane select"[Keyword]',
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        # Everything from the CDS end to the end of the record.
        downstream = (
            SimpleLocation(feature.location.end, len(seqrecord))
            .extract(seqrecord)
            .seq
        )
        for position, ref_base in enumerate(downstream, start=1):
            for base in unambiguous_dna_letters:
                if base != ref_base:
                    variants.append(
                        (
                            f"{seqrecord.id}:c.*{position}{ref_base}>{base}",
                            "",
                            "",
                        )
                    )
    return variants
164
+
165
+
166
def splicing(gene: str) -> list:
    """Enumerate substitutions at the +1/+2 and -2/-1 positions of every exon junction.

    The transcript is the record returned by an Entrez nucleotide search for
    ``<gene>[Gene Name] "mane select"[Keyword]``.  The record is a spliced
    transcript, so intronic reference bases are not read from it: they are
    hard-coded as ``G``/``T`` at +1/+2 and ``A``/``G`` at -2/-1.

    Junction coordinates are expressed relative to the CDS:

    * bases before the CDS start are negative (``c.-N``, no position 0),
    * bases inside the CDS count from 1,
    * bases at or after the CDS end are written ``c.*N``.

    Args:
        gene: Gene symbol used in the Entrez search term.

    Returns:
        A list of variant description strings (plain ``str`` items, not
        tuples), three alternative bases for each of the four intronic
        positions around every junction.

    Raises:
        ValueError: Raised by ``SeqIO.read`` when the fetch does not yield
            exactly one GenBank record.
    """
    variants = []
    stream = Entrez.esearch(
        db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]'
    )
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    # Without a CDS feature everything is numbered from the record start and
    # nothing is treated as 3' UTR.
    cds_start = 0
    cds_end = None
    for feature in seqrecord.features:
        if feature.type == "CDS":
            cds_start = int(feature.location.start)
            cds_end = int(feature.location.end)

    def c_position(offset: int) -> str:
        """Convert a 0-based transcript offset to a CDS-relative position."""
        if offset < cds_start:
            return str(offset - cds_start)
        if cds_end is not None and offset >= cds_end:
            return f"*{offset - cds_end + 1}"
        return str(offset - cds_start + 1)

    exon_ends = [
        int(feature.location.end)
        for feature in seqrecord.features
        if feature.type == "exon"
    ]
    # The end of the last exon is the transcript end, not a junction.
    for exon_end in exon_ends[:-1]:
        donor = c_position(exon_end - 1)  # last base of the upstream exon
        acceptor = c_position(exon_end)  # first base of the downstream exon
        for base in unambiguous_dna_letters:
            if base != "G":
                variants.append(f"{seqrecord.id}:c.{donor}+1G>{base}")
            if base != "T":
                variants.append(f"{seqrecord.id}:c.{donor}+2T>{base}")
            if base != "A":
                variants.append(f"{seqrecord.id}:c.{acceptor}-2A>{base}")
            if base != "G":
                variants.append(f"{seqrecord.id}:c.{acceptor}-1G>{base}")
    return variants
216
+
217
+
218
def aa_sub(gene: str) -> list:
    """Enumerate every amino-acid substitution at every position of a protein.

    The protein is the record returned by an Entrez protein search for
    ``<gene>[Gene Name] AND "mane select"[keyword]``.  Each residue is replaced
    by every letter of ``protein_letters`` other than itself.

    Args:
        gene: Gene symbol used in the Entrez search term.

    Returns:
        A list of 2-tuples ``(p_variant_1letter, p_variant_3letter)``.

    Raises:
        ValueError: Raised by ``SeqIO.read`` when the fetch does not yield
            exactly one GenBank record.
    """
    variants = []
    term = f'{gene}[Gene Name] AND "mane select"[keyword]'
    stream = Entrez.esearch(db="protein", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="protein", rettype="gp", retmode="text", id=record["IdList"]
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for index, residue in enumerate(seqrecord.seq, 1):
        # seq3 yields the same codes as protein_letters_1to3 for the standard
        # residues but also covers letters outside that table (e.g. "U"),
        # where a direct dictionary lookup would raise KeyError.
        residue3 = seq3(residue)
        for aa in protein_letters:
            if aa != residue:
                variants.append(
                    (
                        f"{seqrecord.id}:p.{residue}{index}{aa}",
                        f"{seqrecord.id}:p.{residue3}{index}{seq3(aa)}",
                    )
                )
    return variants
238
+
239
+
240
def missense(gene: str) -> list:
    """Enumerate every codon replacement at every codon of a gene's coding sequence.

    The transcript is the record returned by an Entrez nucleotide search for
    ``<gene>[Gene Name] "mane select"[keyword]``.  Each reference codon is
    replaced by every other codon in the module-level ``codons`` collection.

    A replacement that differs from the reference at exactly one base is
    written as a single-nucleotide substitution; any other replacement is
    written over the whole codon as ``c.<first>_<last><ref>><alt>``.

    Args:
        gene: Gene symbol used in the Entrez search term.

    Returns:
        A list of 3-tuples ``(c_variant, p_variant_1letter, p_variant_3letter)``.
        When the replacement codon encodes the reference residue, the protein
        descriptions end in ``=``.

    Raises:
        ValueError: Raised by ``SeqIO.read`` when the fetch does not yield
            exactly one GenBank record.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    # Translate each candidate codon once instead of once per CDS position.
    # Dict order follows ``codons``, so the emission order is unchanged.
    translations = {alt: str(Seq(alt).translate()) for alt in codons}

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        # Stopping at len(coding) - 3 leaves the final codon untouched
        # (presumably the stop codon, which has no residue in ``protein``).
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            ref_codon = str(coding[codon : codon + 3])
            ref_aa = protein[index]
            for alt_codon, alt_aa in translations.items():
                if alt_codon == ref_codon:
                    continue
                changed = [
                    offset for offset in range(3) if alt_codon[offset] != ref_codon[offset]
                ]
                if len(changed) == 1:
                    offset = changed[0]
                    c_variant = (
                        f"{seqrecord.id}:c.{codon + offset + 1}"
                        f"{ref_codon[offset]}>{alt_codon[offset]}"
                    )
                else:
                    # NOTE(review): the ``>`` form over a multi-base range is kept
                    # as the original emits it; confirm whether consumers expect
                    # the HGVS ``delins`` form instead.
                    c_variant = (
                        f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{ref_codon}>{alt_codon}"
                    )
                if alt_aa != ref_aa:
                    one_letter = f"{protein_id}:p.{ref_aa}{index + 1}{alt_aa}"
                    three_letter = (
                        f"{protein_id}:p.{seq3(ref_aa)}{index + 1}{seq3(alt_aa)}"
                    )
                else:
                    one_letter = f"{protein_id}:p.{ref_aa}{index + 1}="
                    three_letter = f"{protein_id}:p.{seq3(ref_aa)}{index + 1}="
                variants.append((c_variant, one_letter, three_letter))
    return variants
349
+
350
+
351
def inframe_del(gene: str) -> list:
    """Enumerate the deletion of each whole codon of a gene's coding sequence.

    The transcript is the record returned by an Entrez nucleotide search for
    ``<gene>[Gene Name] "mane select"[keyword]``.

    Args:
        gene: Gene symbol used in the Entrez search term.

    Returns:
        A list of 3-tuples ``(c_variant, p_variant_1letter, p_variant_3letter)``,
        one per codon except the final one.

    Raises:
        ValueError: Raised by ``SeqIO.read`` when the fetch does not yield
            exactly one GenBank record.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        # Stopping at len(coding) - 3 leaves the final codon untouched
        # (presumably the stop codon, which has no residue in ``protein``).
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            residue = protein[index]
            variants.append(
                (
                    f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{coding[codon:codon + 3]}del",
                    f"{protein_id}:p.{residue}{index + 1}del",
                    f"{protein_id}:p.{seq3(residue)}{index + 1}del",
                )
            )
    return variants
374
+
375
+
376
def inframe_dup(gene: str) -> list:
    """Enumerate the duplication of each whole codon of a gene's coding sequence.

    The transcript is the record returned by an Entrez nucleotide search for
    ``<gene>[Gene Name] "mane select"[keyword]``.

    Args:
        gene: Gene symbol used in the Entrez search term.

    Returns:
        A list of 3-tuples ``(c_variant, p_variant_1letter, p_variant_3letter)``,
        one per codon except the final one.

    Raises:
        ValueError: Raised by ``SeqIO.read`` when the fetch does not yield
            exactly one GenBank record.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        # Stopping at len(coding) - 3 leaves the final codon untouched
        # (presumably the stop codon, which has no residue in ``protein``).
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            residue = protein[index]
            variants.append(
                (
                    f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{coding[codon:codon + 3]}dup",
                    f"{protein_id}:p.{residue}{index + 1}dup",
                    f"{protein_id}:p.{seq3(residue)}{index + 1}dup",
                )
            )
    return variants
399
+
400
+
401
def frameshift_dup(gene: str) -> list:
    """Enumerate the duplication of each single base of a gene's coding sequence.

    The transcript is the record returned by an Entrez nucleotide search for
    ``<gene>[Gene Name] "mane select"[keyword]``.

    Args:
        gene: Gene symbol used in the Entrez search term.

    Returns:
        A list of 1-tuples ``(c_variant,)``, one per base of the CDS, with
        positions counted from 1.

    Raises:
        ValueError: Raised by ``SeqIO.read`` when the fetch does not yield
            exactly one GenBank record.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        coding = feature.location.extract(seqrecord).seq
        for position, base in enumerate(coding, start=1):
            variants.append((f"{seqrecord.id}:c.{position}{base}dup",))
    return variants
416
+
417
+
418
def frameshift_del(gene: str) -> list:
    """Enumerate the deletion of each single base of a gene's coding sequence.

    The transcript is the record returned by an Entrez nucleotide search for
    ``<gene>[Gene Name] "mane select"[keyword]``.

    Args:
        gene: Gene symbol used in the Entrez search term.

    Returns:
        A list of 1-tuples ``(c_variant,)``, one per base of the CDS, with
        positions counted from 1.

    Raises:
        ValueError: Raised by ``SeqIO.read`` when the fetch does not yield
            exactly one GenBank record.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    )
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        coding = feature.location.extract(seqrecord).seq
        for position, base in enumerate(coding, start=1):
            variants.append((f"{seqrecord.id}:c.{position}{base}del",))
    return variants
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: varsim
3
- Version: 1.0.4
3
+ Version: 1.0.6
4
4
  Summary: Variant Simulator
5
5
  Author-email: Liu Sun <sunliu@yxnu.edu.cn>, Jian Yang <yangjian@yxnu.edu.cn>
6
6
  Project-URL: Homepage, https://github.com/liu-sun/VarSim
@@ -1,270 +0,0 @@
1
- import os
2
-
3
- from Bio import Entrez, SeqIO
4
- from Bio.Data.CodonTable import standard_dna_table
5
- from Bio.Data.IUPACData import (unambiguous_dna_letters, protein_letters, protein_letters_1to3, protein_letters_3to1, )
6
- from Bio.Seq import Seq
7
- from Bio.SeqFeature import SimpleLocation
8
- from Bio.SeqUtils import seq3
9
-
10
- Entrez.email = os.environ["EMAIL"]
11
- Entrez.api_key = os.environ["API_KEY"]
12
- codons = standard_dna_table.forward_table.keys()
13
-
14
-
15
- def cds(gene: str) -> list:
16
- variants = []
17
- stream = Entrez.esearch(db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]', )
18
- record = Entrez.read(stream)
19
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
20
- seqrecord = SeqIO.read(stream, "genbank")
21
- for feature in seqrecord.features:
22
- if feature.type == "CDS":
23
- protein = "".join(feature.qualifiers.get("translation"))
24
- protein_id = "".join(feature.qualifiers.get("protein_id"))
25
- cds = feature.extract(seqrecord).seq
26
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
27
- for base in unambiguous_dna_letters:
28
- if base != cds[codon]:
29
- seq = Seq(base) + cds[codon + 1: codon + 3]
30
- if protein[index] != seq.translate():
31
- variants.append((f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base}",
32
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
33
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",))
34
- else:
35
- variants.append((f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base}",
36
- f"{protein_id}:p.{protein[index]}{index + 1}=",
37
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",))
38
- if base != cds[codon + 1]:
39
- seq = cds[codon] + Seq(base) + cds[codon + 2]
40
- if protein[index] != seq.translate():
41
- variants.append((f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base}",
42
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
43
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",))
44
- else:
45
- variants.append((f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base}",
46
- f"{protein_id}:p.{protein[index]}{index + 1}=",
47
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",))
48
- if base != cds[codon + 2]:
49
- seq = cds[codon: codon + 2] + Seq(base)
50
- if protein[index] != seq.translate():
51
- variants.append((f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base}",
52
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
53
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",))
54
- else:
55
- variants.append((f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base}",
56
- f"{protein_id}:p.{protein[index]}{index + 1}=",
57
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",))
58
- return variants
59
-
60
-
61
- def utr5(gene: str) -> list:
62
- variants = []
63
- stream = Entrez.esearch(db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]', )
64
- record = Entrez.read(stream)
65
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
66
- seqrecord = SeqIO.read(stream, "genbank")
67
- for feature in seqrecord.features:
68
- if feature.type == "CDS":
69
- utr5 = SimpleLocation(0, feature.location.start).extract(seqrecord).seq
70
- for index in range(len(utr5)):
71
- for base in unambiguous_dna_letters:
72
- if base != utr5[index]:
73
- variants.append((f"{seqrecord.id}:c.{index - len(utr5)}{utr5[index]}>{base}", "", "",))
74
- return variants
75
-
76
-
77
- def utr3(gene: str) -> list:
78
- variants = []
79
- stream = Entrez.esearch(db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]', )
80
- record = Entrez.read(stream)
81
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
82
- seqrecord = SeqIO.read(stream, "genbank")
83
- for feature in seqrecord.features:
84
- if feature.type == "CDS":
85
- utr3 = (SimpleLocation(feature.location.end, len(seqrecord)).extract(seqrecord).seq)
86
- for index in range(len(utr3)):
87
- for base in unambiguous_dna_letters:
88
- if base != utr3[index]:
89
- variants.append((f"{seqrecord.id}:c.*{index + 1}{utr3[index]}>{base}", "", "",))
90
- return variants
91
-
92
-
93
- def splicing(gene: str) -> list:
94
- variants = []
95
- exon = []
96
- stream = Entrez.esearch(db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]')
97
- record = Entrez.read(stream)
98
-
99
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
100
- seqrecord = SeqIO.read(stream, "genbank")
101
- splicing = []
102
- variants = []
103
- start = 0
104
- end = 0
105
- for feature in seqrecord.features:
106
- if feature.type == "CDS":
107
- start = feature.location.start
108
- end = feature.location.end
109
- for feature in seqrecord.features:
110
- if feature.type == "exon":
111
- if feature.location.start < start and feature.location.end < start:
112
- splicing.extend((feature.location.start - start - 1, feature.location.end - start - 1,))
113
- elif feature.location.start < start and feature.location.end > start:
114
- splicing.extend((feature.location.start - start - 1, feature.location.end - start))
115
- else:
116
- splicing.extend((feature.location.start - start, feature.location.end - start))
117
-
118
- for coordinate in range(1, len(splicing) - 1, 2):
119
- site = splicing[coordinate], splicing[coordinate] + 1
120
- for base in unambiguous_dna_letters:
121
- if base != "G":
122
- variants.append((f"{seqrecord.id}:c.{site[0]}+1G>{base}"))
123
- if base != "T":
124
- variants.append((f"{seqrecord.id}:c.{site[0]}+2T>{base}"))
125
- if base != "A":
126
- variants.append((f"{seqrecord.id}:c.{site[1]}-2A>{base}"))
127
- if base != "G":
128
- variants.append((f"{seqrecord.id}:c.{site[1]}-1G>{base}"))
129
- return variants
130
-
131
-
132
- def aa_sub(gene: str) -> list:
133
- variants = []
134
- term = f'{gene}[Gene Name] AND "mane select"[keyword]'
135
- stream = Entrez.esearch(db="protein", term=term)
136
- record = Entrez.read(stream)
137
-
138
- stream = Entrez.efetch(db="protein", rettype="gp", retmode="text", id=record["IdList"])
139
- seqrecord = SeqIO.read(stream, "genbank")
140
- for index, residue in enumerate(seqrecord.seq, 1):
141
- for aa in protein_letters:
142
- if aa != residue:
143
- variants.append((f"{seqrecord.id}:p.{residue}{index}{aa}",
144
- f"{seqrecord.id}:p.{protein_letters_1to3[residue]}{index}{protein_letters_1to3[aa]}",))
145
- return variants
146
-
147
-
148
- def missense(gene: str) -> list:
149
- variants = []
150
- term = f'{gene}[Gene Name] "mane select"[keyword]'
151
- stream = Entrez.esearch(db="nucleotide", term=term)
152
- record = Entrez.read(stream)
153
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
154
- seqrecord = SeqIO.read(stream, "genbank")
155
- for feature in seqrecord.features:
156
- if feature.type == "CDS":
157
- protein = "".join(feature.qualifiers.get("translation"))
158
- protein_id = "".join(feature.qualifiers.get("protein_id"))
159
- cds = feature.location.extract(seqrecord).seq
160
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
161
- for base in codons:
162
- if base != cds[codon: codon + 3]:
163
- seq = Seq(base)
164
- if protein[index] != seq.translate():
165
- if (base[0] == cds[codon] and base[1] == cds[codon + 1] and base[2] != cds[codon + 2]):
166
- variants.append((f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base[2]}",
167
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
168
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",))
169
- elif (base[0] == cds[codon] and base[1] != cds[codon + 1] and base[2] == cds[codon + 2]):
170
- variants.append((f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base[1]}",
171
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
172
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",))
173
- elif (base[0] != cds[codon] and base[1] == cds[codon + 1] and base[2] == cds[codon + 2]):
174
- variants.append((f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base[0]}",
175
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
176
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",))
177
- else:
178
- variants.append((f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}>{base}",
179
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
180
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",))
181
- else:
182
- if (base[0] == cds[codon] and base[1] == cds[codon + 1] and base[2] != cds[codon + 2]):
183
- variants.append((f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base[2]}",
184
- f"{protein_id}:p.{protein[index]}{index + 1}=",
185
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",))
186
- elif (base[0] == cds[codon] and base[1] != cds[codon + 1] and base[2] == cds[codon + 2]):
187
- variants.append((f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base[1]}",
188
- f"{protein_id}:p.{protein[index]}{index + 1}=",
189
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",))
190
- elif (base[0] != cds[codon] and base[1] == cds[codon + 1] and base[2] == cds[codon + 2]):
191
- variants.append((f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base[0]}",
192
- f"{protein_id}:p.{protein[index]}{index + 1}=",
193
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",))
194
- else:
195
- variants.append((f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}>{base}",
196
- f"{protein_id}:p.{protein[index]}{index + 1}=",
197
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",))
198
- return variants
199
-
200
-
201
- def inframe_del(gene: str) -> list:
202
- variants = []
203
- term = f'{gene}[Gene Name] "mane select"[keyword]'
204
- stream = Entrez.esearch(db="nucleotide", term=term)
205
- record = Entrez.read(stream)
206
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
207
- seqrecord = SeqIO.read(stream, "genbank")
208
- for feature in seqrecord.features:
209
- if feature.type == "CDS":
210
- protein = "".join(feature.qualifiers.get("translation"))
211
- protein_id = "".join(feature.qualifiers.get("protein_id"))
212
- cds = feature.location.extract(seqrecord).seq
213
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
214
- variants.append((f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}del",
215
- f"{protein_id}:p.{protein[index]}{index + 1}del",
216
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}del",))
217
- return variants
218
-
219
-
220
- def inframe_dup(gene: str) -> list:
221
- variants = []
222
- term = f'{gene}[Gene Name] "mane select"[keyword]'
223
- stream = Entrez.esearch(db="nucleotide", term=term)
224
- record = Entrez.read(stream)
225
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
226
- seqrecord = SeqIO.read(stream, "genbank")
227
- for feature in seqrecord.features:
228
- if feature.type == "CDS":
229
- protein = "".join(feature.qualifiers.get("translation"))
230
- protein_id = "".join(feature.qualifiers.get("protein_id"))
231
- cds = feature.location.extract(seqrecord).seq
232
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
233
- variants.append((f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}dup",
234
- f"{protein_id}:p.{protein[index]}{index + 1}dup",
235
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}dup",))
236
- return variants
237
-
238
-
239
- def frameshift_dup(gene: str) -> list:
240
- variants = []
241
- term = f'{gene}[Gene Name] "mane select"[keyword]'
242
- stream = Entrez.esearch(db="nucleotide", term=term)
243
- record = Entrez.read(stream)
244
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
245
- seqrecord = SeqIO.read(stream, "genbank")
246
- for feature in seqrecord.features:
247
- if feature.type == "CDS":
248
- cds = feature.location.extract(seqrecord).seq
249
- for index, base in enumerate(cds, start=1):
250
- variants.append((f"{seqrecord.id}:c.{str(index) + base}dup",))
251
- return variants
252
-
253
-
254
- def frameshift_del(gene: str) -> list:
255
- variants = []
256
- term = f'{gene}[Gene Name] "mane select"[keyword]'
257
- stream = Entrez.esearch(db="nucleotide", term=term)
258
- record = Entrez.read(stream)
259
- stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
260
- seqrecord = SeqIO.read(stream, "genbank")
261
- for feature in seqrecord.features:
262
- if feature.type == "CDS":
263
- cds = feature.location.extract(seqrecord).seq
264
- for index, base in enumerate(cds, start=1):
265
- variants.append((f"{seqrecord.id}:c.{str(index) + base}del",))
266
- return variants
267
-
268
-
269
- if __name__ == "__main__":
270
- print(frameshift_del("INS"))
File without changes
File without changes
File without changes