varsim 1.0.5__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
varsim/__init__.py CHANGED
@@ -1,433 +1,25 @@
1
- import os
2
-
3
- from Bio import Entrez, SeqIO
4
- from Bio.Data.CodonTable import standard_dna_table
5
- from Bio.Data.IUPACData import (
6
- unambiguous_dna_letters,
7
- protein_letters,
8
- protein_letters_1to3,
9
- protein_letters_3to1,
1
+ from .core import (
2
+ cds,
3
+ utr5,
4
+ utr3,
5
+ splicing,
6
+ inframe_del,
7
+ inframe_dup,
8
+ frameshift_del,
9
+ frameshift_dup,
10
+ aa_sub,
11
+ missense,
10
12
  )
11
- from Bio.Seq import Seq
12
- from Bio.SeqFeature import SimpleLocation
13
- from Bio.SeqUtils import seq3
14
-
15
- Entrez.email = os.environ["EMAIL"]
16
- Entrez.api_key = os.environ["API_KEY"]
17
- codons = standard_dna_table.forward_table.keys()
18
-
19
-
20
- def cds(gene: str) -> list:
21
- variants = []
22
- stream = Entrez.esearch(
23
- db="nucleotide",
24
- term=f'{gene}[Gene Name] "mane select"[Keyword]',
25
- )
26
- record = Entrez.read(stream)
27
- stream = Entrez.efetch(
28
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
29
- )
30
- seqrecord = SeqIO.read(stream, "genbank")
31
- for feature in seqrecord.features:
32
- if feature.type == "CDS":
33
- protein = "".join(feature.qualifiers.get("translation"))
34
- protein_id = "".join(feature.qualifiers.get("protein_id"))
35
- cds = feature.extract(seqrecord).seq
36
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
37
- for base in unambiguous_dna_letters:
38
- if base != cds[codon]:
39
- seq = Seq(base) + cds[codon + 1 : codon + 3]
40
- if protein[index] != seq.translate():
41
- variants.append(
42
- (
43
- f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base}",
44
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
45
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
46
- )
47
- )
48
- else:
49
- variants.append(
50
- (
51
- f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base}",
52
- f"{protein_id}:p.{protein[index]}{index + 1}=",
53
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
54
- )
55
- )
56
- if base != cds[codon + 1]:
57
- seq = cds[codon] + Seq(base) + cds[codon + 2]
58
- if protein[index] != seq.translate():
59
- variants.append(
60
- (
61
- f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base}",
62
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
63
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
64
- )
65
- )
66
- else:
67
- variants.append(
68
- (
69
- f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base}",
70
- f"{protein_id}:p.{protein[index]}{index + 1}=",
71
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
72
- )
73
- )
74
- if base != cds[codon + 2]:
75
- seq = cds[codon : codon + 2] + Seq(base)
76
- if protein[index] != seq.translate():
77
- variants.append(
78
- (
79
- f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base}",
80
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
81
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
82
- )
83
- )
84
- else:
85
- variants.append(
86
- (
87
- f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base}",
88
- f"{protein_id}:p.{protein[index]}{index + 1}=",
89
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
90
- )
91
- )
92
- return variants
93
-
94
-
95
- def utr5(gene: str) -> list:
96
- variants = []
97
- stream = Entrez.esearch(
98
- db="nucleotide",
99
- term=f'{gene}[Gene Name] "mane select"[Keyword]',
100
- )
101
- record = Entrez.read(stream)
102
- stream = Entrez.efetch(
103
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
104
- )
105
- seqrecord = SeqIO.read(stream, "genbank")
106
- for feature in seqrecord.features:
107
- if feature.type == "CDS":
108
- utr5 = SimpleLocation(0, feature.location.start).extract(seqrecord).seq
109
- for index in range(len(utr5)):
110
- for base in unambiguous_dna_letters:
111
- if base != utr5[index]:
112
- variants.append(
113
- (
114
- f"{seqrecord.id}:c.{index - len(utr5)}{utr5[index]}>{base}",
115
- "",
116
- "",
117
- )
118
- )
119
- return variants
120
-
121
-
122
- def utr3(gene: str) -> list:
123
- variants = []
124
- stream = Entrez.esearch(
125
- db="nucleotide",
126
- term=f'{gene}[Gene Name] "mane select"[Keyword]',
127
- )
128
- record = Entrez.read(stream)
129
- stream = Entrez.efetch(
130
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
131
- )
132
- seqrecord = SeqIO.read(stream, "genbank")
133
- for feature in seqrecord.features:
134
- if feature.type == "CDS":
135
- utr3 = (
136
- SimpleLocation(feature.location.end, len(seqrecord))
137
- .extract(seqrecord)
138
- .seq
139
- )
140
- for index in range(len(utr3)):
141
- for base in unambiguous_dna_letters:
142
- if base != utr3[index]:
143
- variants.append(
144
- (
145
- f"{seqrecord.id}:c.*{index + 1}{utr3[index]}>{base}",
146
- "",
147
- "",
148
- )
149
- )
150
- return variants
151
-
152
-
153
- def splicing(gene: str) -> list:
154
- variants = []
155
- exon = []
156
- stream = Entrez.esearch(
157
- db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]'
158
- )
159
- record = Entrez.read(stream)
160
-
161
- stream = Entrez.efetch(
162
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
163
- )
164
- seqrecord = SeqIO.read(stream, "genbank")
165
- splicing = []
166
- variants = []
167
- start = 0
168
- end = 0
169
- for feature in seqrecord.features:
170
- if feature.type == "CDS":
171
- start = feature.location.start
172
- end = feature.location.end
173
- for feature in seqrecord.features:
174
- if feature.type == "exon":
175
- if feature.location.start < start and feature.location.end < start:
176
- splicing.extend(
177
- (
178
- feature.location.start - start - 1,
179
- feature.location.end - start - 1,
180
- )
181
- )
182
- elif feature.location.start < start and feature.location.end > start:
183
- splicing.extend(
184
- (feature.location.start - start - 1, feature.location.end - start)
185
- )
186
- else:
187
- splicing.extend(
188
- (feature.location.start - start, feature.location.end - start)
189
- )
190
-
191
- for coordinate in range(1, len(splicing) - 1, 2):
192
- site = splicing[coordinate], splicing[coordinate] + 1
193
- for base in unambiguous_dna_letters:
194
- if base != "G":
195
- variants.append((f"{seqrecord.id}:c.{site[0]}+1G>{base}"))
196
- if base != "T":
197
- variants.append((f"{seqrecord.id}:c.{site[0]}+2T>{base}"))
198
- if base != "A":
199
- variants.append((f"{seqrecord.id}:c.{site[1]}-2A>{base}"))
200
- if base != "G":
201
- variants.append((f"{seqrecord.id}:c.{site[1]}-1G>{base}"))
202
- return variants
203
-
204
-
205
- def aa_sub(gene: str) -> list:
206
- variants = []
207
- term = f'{gene}[Gene Name] AND "mane select"[keyword]'
208
- stream = Entrez.esearch(db="protein", term=term)
209
- record = Entrez.read(stream)
210
-
211
- stream = Entrez.efetch(
212
- db="protein", rettype="gp", retmode="text", id=record["IdList"]
213
- )
214
- seqrecord = SeqIO.read(stream, "genbank")
215
- for index, residue in enumerate(seqrecord.seq, 1):
216
- for aa in protein_letters:
217
- if aa != residue:
218
- variants.append(
219
- (
220
- f"{seqrecord.id}:p.{residue}{index}{aa}",
221
- f"{seqrecord.id}:p.{protein_letters_1to3[residue]}{index}{protein_letters_1to3[aa]}",
222
- )
223
- )
224
- return variants
225
-
226
-
227
- def missense(gene: str) -> list:
228
- variants = []
229
- term = f'{gene}[Gene Name] "mane select"[keyword]'
230
- stream = Entrez.esearch(db="nucleotide", term=term)
231
- record = Entrez.read(stream)
232
- stream = Entrez.efetch(
233
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
234
- )
235
- seqrecord = SeqIO.read(stream, "genbank")
236
- for feature in seqrecord.features:
237
- if feature.type == "CDS":
238
- protein = "".join(feature.qualifiers.get("translation"))
239
- protein_id = "".join(feature.qualifiers.get("protein_id"))
240
- cds = feature.location.extract(seqrecord).seq
241
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
242
- for base in codons:
243
- if base != cds[codon : codon + 3]:
244
- seq = Seq(base)
245
- if protein[index] != seq.translate():
246
- if (
247
- base[0] == cds[codon]
248
- and base[1] == cds[codon + 1]
249
- and base[2] != cds[codon + 2]
250
- ):
251
- variants.append(
252
- (
253
- f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base[2]}",
254
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
255
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
256
- )
257
- )
258
- elif (
259
- base[0] == cds[codon]
260
- and base[1] != cds[codon + 1]
261
- and base[2] == cds[codon + 2]
262
- ):
263
- variants.append(
264
- (
265
- f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base[1]}",
266
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
267
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
268
- )
269
- )
270
- elif (
271
- base[0] != cds[codon]
272
- and base[1] == cds[codon + 1]
273
- and base[2] == cds[codon + 2]
274
- ):
275
- variants.append(
276
- (
277
- f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base[0]}",
278
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
279
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
280
- )
281
- )
282
- else:
283
- variants.append(
284
- (
285
- f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}>{base}",
286
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
287
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
288
- )
289
- )
290
- else:
291
- if (
292
- base[0] == cds[codon]
293
- and base[1] == cds[codon + 1]
294
- and base[2] != cds[codon + 2]
295
- ):
296
- variants.append(
297
- (
298
- f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base[2]}",
299
- f"{protein_id}:p.{protein[index]}{index + 1}=",
300
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
301
- )
302
- )
303
- elif (
304
- base[0] == cds[codon]
305
- and base[1] != cds[codon + 1]
306
- and base[2] == cds[codon + 2]
307
- ):
308
- variants.append(
309
- (
310
- f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base[1]}",
311
- f"{protein_id}:p.{protein[index]}{index + 1}=",
312
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
313
- )
314
- )
315
- elif (
316
- base[0] != cds[codon]
317
- and base[1] == cds[codon + 1]
318
- and base[2] == cds[codon + 2]
319
- ):
320
- variants.append(
321
- (
322
- f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base[0]}",
323
- f"{protein_id}:p.{protein[index]}{index + 1}=",
324
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
325
- )
326
- )
327
- else:
328
- variants.append(
329
- (
330
- f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}>{base}",
331
- f"{protein_id}:p.{protein[index]}{index + 1}=",
332
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
333
- )
334
- )
335
- return variants
336
-
337
-
338
- def inframe_del(gene: str) -> list:
339
- variants = []
340
- term = f'{gene}[Gene Name] "mane select"[keyword]'
341
- stream = Entrez.esearch(db="nucleotide", term=term)
342
- record = Entrez.read(stream)
343
- stream = Entrez.efetch(
344
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
345
- )
346
- seqrecord = SeqIO.read(stream, "genbank")
347
- for feature in seqrecord.features:
348
- if feature.type == "CDS":
349
- protein = "".join(feature.qualifiers.get("translation"))
350
- protein_id = "".join(feature.qualifiers.get("protein_id"))
351
- cds = feature.location.extract(seqrecord).seq
352
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
353
- variants.append(
354
- (
355
- f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}del",
356
- f"{protein_id}:p.{protein[index]}{index + 1}del",
357
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}del",
358
- )
359
- )
360
- return variants
361
-
362
-
363
- def inframe_dup(gene: str) -> list:
364
- variants = []
365
- term = f'{gene}[Gene Name] "mane select"[keyword]'
366
- stream = Entrez.esearch(db="nucleotide", term=term)
367
- record = Entrez.read(stream)
368
- stream = Entrez.efetch(
369
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
370
- )
371
- seqrecord = SeqIO.read(stream, "genbank")
372
- for feature in seqrecord.features:
373
- if feature.type == "CDS":
374
- protein = "".join(feature.qualifiers.get("translation"))
375
- protein_id = "".join(feature.qualifiers.get("protein_id"))
376
- cds = feature.location.extract(seqrecord).seq
377
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
378
- variants.append(
379
- (
380
- f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}dup",
381
- f"{protein_id}:p.{protein[index]}{index + 1}dup",
382
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}dup",
383
- )
384
- )
385
- return variants
386
-
387
-
388
- def frameshift_dup(gene: str) -> list:
389
- variants = []
390
- term = f'{gene}[Gene Name] "mane select"[keyword]'
391
- stream = Entrez.esearch(db="nucleotide", term=term)
392
- record = Entrez.read(stream)
393
- stream = Entrez.efetch(
394
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
395
- )
396
- seqrecord = SeqIO.read(stream, "genbank")
397
- for feature in seqrecord.features:
398
- if feature.type == "CDS":
399
- cds = feature.location.extract(seqrecord).seq
400
- for index, base in enumerate(cds, start=1):
401
- variants.append((f"{seqrecord.id}:c.{str(index) + base}dup",))
402
- return variants
403
-
404
-
405
- def frameshift_del(gene: str) -> list:
406
- variants = []
407
- term = f'{gene}[Gene Name] "mane select"[keyword]'
408
- stream = Entrez.esearch(db="nucleotide", term=term)
409
- record = Entrez.read(stream)
410
- stream = Entrez.efetch(
411
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
412
- )
413
- seqrecord = SeqIO.read(stream, "genbank")
414
- for feature in seqrecord.features:
415
- if feature.type == "CDS":
416
- cds = feature.location.extract(seqrecord).seq
417
- for index, base in enumerate(cds, start=1):
418
- variants.append((f"{seqrecord.id}:c.{str(index) + base}del",))
419
- return variants
420
-
421
13
 
422
14
  __all__ = [
423
- "frameshift_dup",
424
- "frameshift_del",
425
15
  "cds",
426
- "inframe_dup",
427
- "inframe_del",
428
- "splicing",
429
16
  "utr5",
430
17
  "utr3",
18
+ "splicing",
19
+ "inframe_del",
20
+ "inframe_dup",
21
+ "frameshift_del",
22
+ "frameshift_dup",
431
23
  "aa_sub",
432
24
  "missense",
433
25
  ]
varsim/core.py ADDED
@@ -0,0 +1,432 @@
1
+ import os
2
+
3
+ from Bio import Entrez, SeqIO
4
+ from Bio.Data.CodonTable import standard_dna_table
5
+ from Bio.Data.IUPACData import (
6
+ unambiguous_dna_letters,
7
+ protein_letters,
8
+ protein_letters_1to3,
9
+ protein_letters_3to1,
10
+ )
11
+ from Bio.Seq import Seq
12
+ from Bio.SeqFeature import SimpleLocation
13
+ from Bio.SeqUtils import seq3
14
+
15
+ __all__ = [
16
+ "frameshift_dup",
17
+ "frameshift_del",
18
+ "cds",
19
+ "inframe_dup",
20
+ "inframe_del",
21
+ "splicing",
22
+ "utr5",
23
+ "utr3",
24
+ "aa_sub",
25
+ "missense",
26
+ ]
27
+
28
+ Entrez.email = os.environ["EMAIL"]
29
+ Entrez.api_key = os.environ["API_KEY"]
30
+ codons = standard_dna_table.forward_table.keys()
31
+
32
+
33
+ def cds(gene: str) -> list:
34
+ variants = []
35
+ stream = Entrez.esearch(
36
+ db="nucleotide",
37
+ term=f'{gene}[Gene Name] "mane select"[Keyword]',
38
+ )
39
+ record = Entrez.read(stream)
40
+ stream = Entrez.efetch(
41
+ db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
42
+ )
43
+ seqrecord = SeqIO.read(stream, "genbank")
44
+ for feature in seqrecord.features:
45
+ if feature.type == "CDS":
46
+ protein = "".join(feature.qualifiers.get("translation"))
47
+ protein_id = "".join(feature.qualifiers.get("protein_id"))
48
+ cds = feature.extract(seqrecord).seq
49
+ for index, codon in enumerate(range(0, len(cds) - 3, 3)):
50
+ for base in unambiguous_dna_letters:
51
+ if base != cds[codon]:
52
+ seq = Seq(base) + cds[codon + 1 : codon + 3]
53
+ if protein[index] != seq.translate():
54
+ variants.append(
55
+ (
56
+ f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base}",
57
+ f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
58
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
59
+ )
60
+ )
61
+ else:
62
+ variants.append(
63
+ (
64
+ f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base}",
65
+ f"{protein_id}:p.{protein[index]}{index + 1}=",
66
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
67
+ )
68
+ )
69
+ if base != cds[codon + 1]:
70
+ seq = cds[codon] + Seq(base) + cds[codon + 2]
71
+ if protein[index] != seq.translate():
72
+ variants.append(
73
+ (
74
+ f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base}",
75
+ f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
76
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
77
+ )
78
+ )
79
+ else:
80
+ variants.append(
81
+ (
82
+ f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base}",
83
+ f"{protein_id}:p.{protein[index]}{index + 1}=",
84
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
85
+ )
86
+ )
87
+ if base != cds[codon + 2]:
88
+ seq = cds[codon : codon + 2] + Seq(base)
89
+ if protein[index] != seq.translate():
90
+ variants.append(
91
+ (
92
+ f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base}",
93
+ f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
94
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
95
+ )
96
+ )
97
+ else:
98
+ variants.append(
99
+ (
100
+ f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base}",
101
+ f"{protein_id}:p.{protein[index]}{index + 1}=",
102
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
103
+ )
104
+ )
105
+ return variants
106
+
107
+
108
+ def utr5(gene: str) -> list:
109
+ variants = []
110
+ stream = Entrez.esearch(
111
+ db="nucleotide",
112
+ term=f'{gene}[Gene Name] "mane select"[Keyword]',
113
+ )
114
+ record = Entrez.read(stream)
115
+ stream = Entrez.efetch(
116
+ db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
117
+ )
118
+ seqrecord = SeqIO.read(stream, "genbank")
119
+ for feature in seqrecord.features:
120
+ if feature.type == "CDS":
121
+ utr5 = SimpleLocation(0, feature.location.start).extract(seqrecord).seq
122
+ for index in range(len(utr5)):
123
+ for base in unambiguous_dna_letters:
124
+ if base != utr5[index]:
125
+ variants.append(
126
+ (
127
+ f"{seqrecord.id}:c.{index - len(utr5)}{utr5[index]}>{base}",
128
+ "",
129
+ "",
130
+ )
131
+ )
132
+ return variants
133
+
134
+
135
+ def utr3(gene: str) -> list:
136
+ variants = []
137
+ stream = Entrez.esearch(
138
+ db="nucleotide",
139
+ term=f'{gene}[Gene Name] "mane select"[Keyword]',
140
+ )
141
+ record = Entrez.read(stream)
142
+ stream = Entrez.efetch(
143
+ db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
144
+ )
145
+ seqrecord = SeqIO.read(stream, "genbank")
146
+ for feature in seqrecord.features:
147
+ if feature.type == "CDS":
148
+ utr3 = (
149
+ SimpleLocation(feature.location.end, len(seqrecord))
150
+ .extract(seqrecord)
151
+ .seq
152
+ )
153
+ for index in range(len(utr3)):
154
+ for base in unambiguous_dna_letters:
155
+ if base != utr3[index]:
156
+ variants.append(
157
+ (
158
+ f"{seqrecord.id}:c.*{index + 1}{utr3[index]}>{base}",
159
+ "",
160
+ "",
161
+ )
162
+ )
163
+ return variants
164
+
165
+
166
+ def splicing(gene: str) -> list:
167
+ variants = []
168
+ exon = []
169
+ stream = Entrez.esearch(
170
+ db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]'
171
+ )
172
+ record = Entrez.read(stream)
173
+
174
+ stream = Entrez.efetch(
175
+ db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
176
+ )
177
+ seqrecord = SeqIO.read(stream, "genbank")
178
+ splicing = []
179
+ variants = []
180
+ start = 0
181
+ end = 0
182
+ for feature in seqrecord.features:
183
+ if feature.type == "CDS":
184
+ start = feature.location.start
185
+ end = feature.location.end
186
+ for feature in seqrecord.features:
187
+ if feature.type == "exon":
188
+ if feature.location.start < start and feature.location.end < start:
189
+ splicing.extend(
190
+ (
191
+ feature.location.start - start - 1,
192
+ feature.location.end - start - 1,
193
+ )
194
+ )
195
+ elif feature.location.start < start and feature.location.end > start:
196
+ splicing.extend(
197
+ (feature.location.start - start - 1, feature.location.end - start)
198
+ )
199
+ else:
200
+ splicing.extend(
201
+ (feature.location.start - start, feature.location.end - start)
202
+ )
203
+
204
+ for coordinate in range(1, len(splicing) - 1, 2):
205
+ site = splicing[coordinate], splicing[coordinate] + 1
206
+ for base in unambiguous_dna_letters:
207
+ if base != "G":
208
+ variants.append((f"{seqrecord.id}:c.{site[0]}+1G>{base}"))
209
+ if base != "T":
210
+ variants.append((f"{seqrecord.id}:c.{site[0]}+2T>{base}"))
211
+ if base != "A":
212
+ variants.append((f"{seqrecord.id}:c.{site[1]}-2A>{base}"))
213
+ if base != "G":
214
+ variants.append((f"{seqrecord.id}:c.{site[1]}-1G>{base}"))
215
+ return variants
216
+
217
+
218
+ def aa_sub(gene: str) -> list:
219
+ variants = []
220
+ term = f'{gene}[Gene Name] AND "mane select"[keyword]'
221
+ stream = Entrez.esearch(db="protein", term=term)
222
+ record = Entrez.read(stream)
223
+
224
+ stream = Entrez.efetch(
225
+ db="protein", rettype="gp", retmode="text", id=record["IdList"]
226
+ )
227
+ seqrecord = SeqIO.read(stream, "genbank")
228
+ for index, residue in enumerate(seqrecord.seq, 1):
229
+ for aa in protein_letters:
230
+ if aa != residue:
231
+ variants.append(
232
+ (
233
+ f"{seqrecord.id}:p.{residue}{index}{aa}",
234
+ f"{seqrecord.id}:p.{protein_letters_1to3[residue]}{index}{protein_letters_1to3[aa]}",
235
+ )
236
+ )
237
+ return variants
238
+
239
+
240
+ def missense(gene: str) -> list:
241
+ variants = []
242
+ term = f'{gene}[Gene Name] "mane select"[keyword]'
243
+ stream = Entrez.esearch(db="nucleotide", term=term)
244
+ record = Entrez.read(stream)
245
+ stream = Entrez.efetch(
246
+ db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
247
+ )
248
+ seqrecord = SeqIO.read(stream, "genbank")
249
+ for feature in seqrecord.features:
250
+ if feature.type == "CDS":
251
+ protein = "".join(feature.qualifiers.get("translation"))
252
+ protein_id = "".join(feature.qualifiers.get("protein_id"))
253
+ cds = feature.location.extract(seqrecord).seq
254
+ for index, codon in enumerate(range(0, len(cds) - 3, 3)):
255
+ for base in codons:
256
+ if base != cds[codon : codon + 3]:
257
+ seq = Seq(base)
258
+ if protein[index] != seq.translate():
259
+ if (
260
+ base[0] == cds[codon]
261
+ and base[1] == cds[codon + 1]
262
+ and base[2] != cds[codon + 2]
263
+ ):
264
+ variants.append(
265
+ (
266
+ f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base[2]}",
267
+ f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
268
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
269
+ )
270
+ )
271
+ elif (
272
+ base[0] == cds[codon]
273
+ and base[1] != cds[codon + 1]
274
+ and base[2] == cds[codon + 2]
275
+ ):
276
+ variants.append(
277
+ (
278
+ f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base[1]}",
279
+ f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
280
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
281
+ )
282
+ )
283
+ elif (
284
+ base[0] != cds[codon]
285
+ and base[1] == cds[codon + 1]
286
+ and base[2] == cds[codon + 2]
287
+ ):
288
+ variants.append(
289
+ (
290
+ f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base[0]}",
291
+ f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
292
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
293
+ )
294
+ )
295
+ else:
296
+ variants.append(
297
+ (
298
+ f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}>{base}",
299
+ f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
300
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
301
+ )
302
+ )
303
+ else:
304
+ if (
305
+ base[0] == cds[codon]
306
+ and base[1] == cds[codon + 1]
307
+ and base[2] != cds[codon + 2]
308
+ ):
309
+ variants.append(
310
+ (
311
+ f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base[2]}",
312
+ f"{protein_id}:p.{protein[index]}{index + 1}=",
313
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
314
+ )
315
+ )
316
+ elif (
317
+ base[0] == cds[codon]
318
+ and base[1] != cds[codon + 1]
319
+ and base[2] == cds[codon + 2]
320
+ ):
321
+ variants.append(
322
+ (
323
+ f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base[1]}",
324
+ f"{protein_id}:p.{protein[index]}{index + 1}=",
325
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
326
+ )
327
+ )
328
+ elif (
329
+ base[0] != cds[codon]
330
+ and base[1] == cds[codon + 1]
331
+ and base[2] == cds[codon + 2]
332
+ ):
333
+ variants.append(
334
+ (
335
+ f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base[0]}",
336
+ f"{protein_id}:p.{protein[index]}{index + 1}=",
337
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
338
+ )
339
+ )
340
+ else:
341
+ variants.append(
342
+ (
343
+ f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}>{base}",
344
+ f"{protein_id}:p.{protein[index]}{index + 1}=",
345
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
346
+ )
347
+ )
348
+ return variants
349
+
350
+
351
+ def inframe_del(gene: str) -> list:
352
+ variants = []
353
+ term = f'{gene}[Gene Name] "mane select"[keyword]'
354
+ stream = Entrez.esearch(db="nucleotide", term=term)
355
+ record = Entrez.read(stream)
356
+ stream = Entrez.efetch(
357
+ db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
358
+ )
359
+ seqrecord = SeqIO.read(stream, "genbank")
360
+ for feature in seqrecord.features:
361
+ if feature.type == "CDS":
362
+ protein = "".join(feature.qualifiers.get("translation"))
363
+ protein_id = "".join(feature.qualifiers.get("protein_id"))
364
+ cds = feature.location.extract(seqrecord).seq
365
+ for index, codon in enumerate(range(0, len(cds) - 3, 3)):
366
+ variants.append(
367
+ (
368
+ f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}del",
369
+ f"{protein_id}:p.{protein[index]}{index + 1}del",
370
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}del",
371
+ )
372
+ )
373
+ return variants
374
+
375
+
376
+ def inframe_dup(gene: str) -> list:
377
+ variants = []
378
+ term = f'{gene}[Gene Name] "mane select"[keyword]'
379
+ stream = Entrez.esearch(db="nucleotide", term=term)
380
+ record = Entrez.read(stream)
381
+ stream = Entrez.efetch(
382
+ db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
383
+ )
384
+ seqrecord = SeqIO.read(stream, "genbank")
385
+ for feature in seqrecord.features:
386
+ if feature.type == "CDS":
387
+ protein = "".join(feature.qualifiers.get("translation"))
388
+ protein_id = "".join(feature.qualifiers.get("protein_id"))
389
+ cds = feature.location.extract(seqrecord).seq
390
+ for index, codon in enumerate(range(0, len(cds) - 3, 3)):
391
+ variants.append(
392
+ (
393
+ f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}dup",
394
+ f"{protein_id}:p.{protein[index]}{index + 1}dup",
395
+ f"{protein_id}:p.{seq3(protein[index])}{index + 1}dup",
396
+ )
397
+ )
398
+ return variants
399
+
400
+
401
+ def frameshift_dup(gene: str) -> list:
402
+ variants = []
403
+ term = f'{gene}[Gene Name] "mane select"[keyword]'
404
+ stream = Entrez.esearch(db="nucleotide", term=term)
405
+ record = Entrez.read(stream)
406
+ stream = Entrez.efetch(
407
+ db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
408
+ )
409
+ seqrecord = SeqIO.read(stream, "genbank")
410
+ for feature in seqrecord.features:
411
+ if feature.type == "CDS":
412
+ cds = feature.location.extract(seqrecord).seq
413
+ for index, base in enumerate(cds, start=1):
414
+ variants.append((f"{seqrecord.id}:c.{str(index) + base}dup",))
415
+ return variants
416
+
417
+
418
+ def frameshift_del(gene: str) -> list:
419
+ variants = []
420
+ term = f'{gene}[Gene Name] "mane select"[keyword]'
421
+ stream = Entrez.esearch(db="nucleotide", term=term)
422
+ record = Entrez.read(stream)
423
+ stream = Entrez.efetch(
424
+ db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
425
+ )
426
+ seqrecord = SeqIO.read(stream, "genbank")
427
+ for feature in seqrecord.features:
428
+ if feature.type == "CDS":
429
+ cds = feature.location.extract(seqrecord).seq
430
+ for index, base in enumerate(cds, start=1):
431
+ variants.append((f"{seqrecord.id}:c.{str(index) + base}del",))
432
+ return variants
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: varsim
3
- Version: 1.0.5
3
+ Version: 1.0.7
4
4
  Summary: Variant Simulator
5
5
  Author-email: Liu Sun <sunliu@yxnu.edu.cn>, Jian Yang <yangjian@yxnu.edu.cn>
6
6
  Project-URL: Homepage, https://github.com/liu-sun/VarSim
@@ -0,0 +1,7 @@
1
+ varsim/__init__.py,sha256=_Lb0mutPhj2XKnvssCgsxqGMk2LFmcJRmRA9EXS10ZU,368
2
+ varsim/core.py,sha256=ZIHKGBIp-8_seM6suoUCe2A8OgOL0gR8Oocu-IpCe_k,17712
3
+ varsim-1.0.7.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
4
+ varsim-1.0.7.dist-info/METADATA,sha256=-fHxAfvRqDftFLfz-oYZL9fTrxQtTh5Mpf3Jk61mVGw,2464
5
+ varsim-1.0.7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
6
+ varsim-1.0.7.dist-info/top_level.txt,sha256=2fLprhnBvkF-7VEOzGcpKoodqW08HjyNbVzM6emJrTI,7
7
+ varsim-1.0.7.dist-info/RECORD,,
@@ -1,6 +0,0 @@
1
- varsim/__init__.py,sha256=r2jXdCqlbLaJsNGXum0B9hBoX0gpGMNQ_aIqt2GyXr8,17714
2
- varsim-1.0.5.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
3
- varsim-1.0.5.dist-info/METADATA,sha256=8Uo9_Gn2Pg3Mkct-ZC7oJ9la2ZfddqvduepCUwnorPA,2464
4
- varsim-1.0.5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
5
- varsim-1.0.5.dist-info/top_level.txt,sha256=2fLprhnBvkF-7VEOzGcpKoodqW08HjyNbVzM6emJrTI,7
6
- varsim-1.0.5.dist-info/RECORD,,
File without changes