varsim 1.0.6__py3-none-any.whl → 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
varsim/__init__.py CHANGED
@@ -1,432 +1,25 @@
1
- import os
2
-
3
- from Bio import Entrez, SeqIO
4
- from Bio.Data.CodonTable import standard_dna_table
5
- from Bio.Data.IUPACData import (
6
- unambiguous_dna_letters,
7
- protein_letters,
8
- protein_letters_1to3,
9
- protein_letters_3to1,
1
+ from ._core import (
2
+ cds,
3
+ utr5,
4
+ utr3,
5
+ splicing,
6
+ inframe_del,
7
+ inframe_dup,
8
+ frameshift_del,
9
+ frameshift_dup,
10
+ aa_sub,
11
+ missense,
10
12
  )
11
- from Bio.Seq import Seq
12
- from Bio.SeqFeature import SimpleLocation
13
- from Bio.SeqUtils import seq3
14
13
 
15
14
  __all__ = [
16
- "frameshift_dup",
17
- "frameshift_del",
18
15
  "cds",
19
- "inframe_dup",
20
- "inframe_del",
21
- "splicing",
22
16
  "utr5",
23
17
  "utr3",
18
+ "splicing",
19
+ "inframe_del",
20
+ "inframe_dup",
21
+ "frameshift_del",
22
+ "frameshift_dup",
24
23
  "aa_sub",
25
24
  "missense",
26
25
  ]
27
-
28
- Entrez.email = os.environ["EMAIL"]
29
- Entrez.api_key = os.environ["API_KEY"]
30
- codons = standard_dna_table.forward_table.keys()
31
-
32
-
33
- def cds(gene: str) -> list:
34
- variants = []
35
- stream = Entrez.esearch(
36
- db="nucleotide",
37
- term=f'{gene}[Gene Name] "mane select"[Keyword]',
38
- )
39
- record = Entrez.read(stream)
40
- stream = Entrez.efetch(
41
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
42
- )
43
- seqrecord = SeqIO.read(stream, "genbank")
44
- for feature in seqrecord.features:
45
- if feature.type == "CDS":
46
- protein = "".join(feature.qualifiers.get("translation"))
47
- protein_id = "".join(feature.qualifiers.get("protein_id"))
48
- cds = feature.extract(seqrecord).seq
49
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
50
- for base in unambiguous_dna_letters:
51
- if base != cds[codon]:
52
- seq = Seq(base) + cds[codon + 1 : codon + 3]
53
- if protein[index] != seq.translate():
54
- variants.append(
55
- (
56
- f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base}",
57
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
58
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
59
- )
60
- )
61
- else:
62
- variants.append(
63
- (
64
- f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base}",
65
- f"{protein_id}:p.{protein[index]}{index + 1}=",
66
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
67
- )
68
- )
69
- if base != cds[codon + 1]:
70
- seq = cds[codon] + Seq(base) + cds[codon + 2]
71
- if protein[index] != seq.translate():
72
- variants.append(
73
- (
74
- f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base}",
75
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
76
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
77
- )
78
- )
79
- else:
80
- variants.append(
81
- (
82
- f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base}",
83
- f"{protein_id}:p.{protein[index]}{index + 1}=",
84
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
85
- )
86
- )
87
- if base != cds[codon + 2]:
88
- seq = cds[codon : codon + 2] + Seq(base)
89
- if protein[index] != seq.translate():
90
- variants.append(
91
- (
92
- f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base}",
93
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
94
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
95
- )
96
- )
97
- else:
98
- variants.append(
99
- (
100
- f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base}",
101
- f"{protein_id}:p.{protein[index]}{index + 1}=",
102
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
103
- )
104
- )
105
- return variants
106
-
107
-
108
- def utr5(gene: str) -> list:
109
- variants = []
110
- stream = Entrez.esearch(
111
- db="nucleotide",
112
- term=f'{gene}[Gene Name] "mane select"[Keyword]',
113
- )
114
- record = Entrez.read(stream)
115
- stream = Entrez.efetch(
116
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
117
- )
118
- seqrecord = SeqIO.read(stream, "genbank")
119
- for feature in seqrecord.features:
120
- if feature.type == "CDS":
121
- utr5 = SimpleLocation(0, feature.location.start).extract(seqrecord).seq
122
- for index in range(len(utr5)):
123
- for base in unambiguous_dna_letters:
124
- if base != utr5[index]:
125
- variants.append(
126
- (
127
- f"{seqrecord.id}:c.{index - len(utr5)}{utr5[index]}>{base}",
128
- "",
129
- "",
130
- )
131
- )
132
- return variants
133
-
134
-
135
- def utr3(gene: str) -> list:
136
- variants = []
137
- stream = Entrez.esearch(
138
- db="nucleotide",
139
- term=f'{gene}[Gene Name] "mane select"[Keyword]',
140
- )
141
- record = Entrez.read(stream)
142
- stream = Entrez.efetch(
143
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
144
- )
145
- seqrecord = SeqIO.read(stream, "genbank")
146
- for feature in seqrecord.features:
147
- if feature.type == "CDS":
148
- utr3 = (
149
- SimpleLocation(feature.location.end, len(seqrecord))
150
- .extract(seqrecord)
151
- .seq
152
- )
153
- for index in range(len(utr3)):
154
- for base in unambiguous_dna_letters:
155
- if base != utr3[index]:
156
- variants.append(
157
- (
158
- f"{seqrecord.id}:c.*{index + 1}{utr3[index]}>{base}",
159
- "",
160
- "",
161
- )
162
- )
163
- return variants
164
-
165
-
166
- def splicing(gene: str) -> list:
167
- variants = []
168
- exon = []
169
- stream = Entrez.esearch(
170
- db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]'
171
- )
172
- record = Entrez.read(stream)
173
-
174
- stream = Entrez.efetch(
175
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
176
- )
177
- seqrecord = SeqIO.read(stream, "genbank")
178
- splicing = []
179
- variants = []
180
- start = 0
181
- end = 0
182
- for feature in seqrecord.features:
183
- if feature.type == "CDS":
184
- start = feature.location.start
185
- end = feature.location.end
186
- for feature in seqrecord.features:
187
- if feature.type == "exon":
188
- if feature.location.start < start and feature.location.end < start:
189
- splicing.extend(
190
- (
191
- feature.location.start - start - 1,
192
- feature.location.end - start - 1,
193
- )
194
- )
195
- elif feature.location.start < start and feature.location.end > start:
196
- splicing.extend(
197
- (feature.location.start - start - 1, feature.location.end - start)
198
- )
199
- else:
200
- splicing.extend(
201
- (feature.location.start - start, feature.location.end - start)
202
- )
203
-
204
- for coordinate in range(1, len(splicing) - 1, 2):
205
- site = splicing[coordinate], splicing[coordinate] + 1
206
- for base in unambiguous_dna_letters:
207
- if base != "G":
208
- variants.append((f"{seqrecord.id}:c.{site[0]}+1G>{base}"))
209
- if base != "T":
210
- variants.append((f"{seqrecord.id}:c.{site[0]}+2T>{base}"))
211
- if base != "A":
212
- variants.append((f"{seqrecord.id}:c.{site[1]}-2A>{base}"))
213
- if base != "G":
214
- variants.append((f"{seqrecord.id}:c.{site[1]}-1G>{base}"))
215
- return variants
216
-
217
-
218
- def aa_sub(gene: str) -> list:
219
- variants = []
220
- term = f'{gene}[Gene Name] AND "mane select"[keyword]'
221
- stream = Entrez.esearch(db="protein", term=term)
222
- record = Entrez.read(stream)
223
-
224
- stream = Entrez.efetch(
225
- db="protein", rettype="gp", retmode="text", id=record["IdList"]
226
- )
227
- seqrecord = SeqIO.read(stream, "genbank")
228
- for index, residue in enumerate(seqrecord.seq, 1):
229
- for aa in protein_letters:
230
- if aa != residue:
231
- variants.append(
232
- (
233
- f"{seqrecord.id}:p.{residue}{index}{aa}",
234
- f"{seqrecord.id}:p.{protein_letters_1to3[residue]}{index}{protein_letters_1to3[aa]}",
235
- )
236
- )
237
- return variants
238
-
239
-
240
- def missense(gene: str) -> list:
241
- variants = []
242
- term = f'{gene}[Gene Name] "mane select"[keyword]'
243
- stream = Entrez.esearch(db="nucleotide", term=term)
244
- record = Entrez.read(stream)
245
- stream = Entrez.efetch(
246
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
247
- )
248
- seqrecord = SeqIO.read(stream, "genbank")
249
- for feature in seqrecord.features:
250
- if feature.type == "CDS":
251
- protein = "".join(feature.qualifiers.get("translation"))
252
- protein_id = "".join(feature.qualifiers.get("protein_id"))
253
- cds = feature.location.extract(seqrecord).seq
254
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
255
- for base in codons:
256
- if base != cds[codon : codon + 3]:
257
- seq = Seq(base)
258
- if protein[index] != seq.translate():
259
- if (
260
- base[0] == cds[codon]
261
- and base[1] == cds[codon + 1]
262
- and base[2] != cds[codon + 2]
263
- ):
264
- variants.append(
265
- (
266
- f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base[2]}",
267
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
268
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
269
- )
270
- )
271
- elif (
272
- base[0] == cds[codon]
273
- and base[1] != cds[codon + 1]
274
- and base[2] == cds[codon + 2]
275
- ):
276
- variants.append(
277
- (
278
- f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base[1]}",
279
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
280
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
281
- )
282
- )
283
- elif (
284
- base[0] != cds[codon]
285
- and base[1] == cds[codon + 1]
286
- and base[2] == cds[codon + 2]
287
- ):
288
- variants.append(
289
- (
290
- f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base[0]}",
291
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
292
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
293
- )
294
- )
295
- else:
296
- variants.append(
297
- (
298
- f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}>{base}",
299
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
300
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
301
- )
302
- )
303
- else:
304
- if (
305
- base[0] == cds[codon]
306
- and base[1] == cds[codon + 1]
307
- and base[2] != cds[codon + 2]
308
- ):
309
- variants.append(
310
- (
311
- f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base[2]}",
312
- f"{protein_id}:p.{protein[index]}{index + 1}=",
313
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
314
- )
315
- )
316
- elif (
317
- base[0] == cds[codon]
318
- and base[1] != cds[codon + 1]
319
- and base[2] == cds[codon + 2]
320
- ):
321
- variants.append(
322
- (
323
- f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base[1]}",
324
- f"{protein_id}:p.{protein[index]}{index + 1}=",
325
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
326
- )
327
- )
328
- elif (
329
- base[0] != cds[codon]
330
- and base[1] == cds[codon + 1]
331
- and base[2] == cds[codon + 2]
332
- ):
333
- variants.append(
334
- (
335
- f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base[0]}",
336
- f"{protein_id}:p.{protein[index]}{index + 1}=",
337
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
338
- )
339
- )
340
- else:
341
- variants.append(
342
- (
343
- f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}>{base}",
344
- f"{protein_id}:p.{protein[index]}{index + 1}=",
345
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
346
- )
347
- )
348
- return variants
349
-
350
-
351
- def inframe_del(gene: str) -> list:
352
- variants = []
353
- term = f'{gene}[Gene Name] "mane select"[keyword]'
354
- stream = Entrez.esearch(db="nucleotide", term=term)
355
- record = Entrez.read(stream)
356
- stream = Entrez.efetch(
357
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
358
- )
359
- seqrecord = SeqIO.read(stream, "genbank")
360
- for feature in seqrecord.features:
361
- if feature.type == "CDS":
362
- protein = "".join(feature.qualifiers.get("translation"))
363
- protein_id = "".join(feature.qualifiers.get("protein_id"))
364
- cds = feature.location.extract(seqrecord).seq
365
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
366
- variants.append(
367
- (
368
- f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}del",
369
- f"{protein_id}:p.{protein[index]}{index + 1}del",
370
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}del",
371
- )
372
- )
373
- return variants
374
-
375
-
376
- def inframe_dup(gene: str) -> list:
377
- variants = []
378
- term = f'{gene}[Gene Name] "mane select"[keyword]'
379
- stream = Entrez.esearch(db="nucleotide", term=term)
380
- record = Entrez.read(stream)
381
- stream = Entrez.efetch(
382
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
383
- )
384
- seqrecord = SeqIO.read(stream, "genbank")
385
- for feature in seqrecord.features:
386
- if feature.type == "CDS":
387
- protein = "".join(feature.qualifiers.get("translation"))
388
- protein_id = "".join(feature.qualifiers.get("protein_id"))
389
- cds = feature.location.extract(seqrecord).seq
390
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
391
- variants.append(
392
- (
393
- f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}dup",
394
- f"{protein_id}:p.{protein[index]}{index + 1}dup",
395
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}dup",
396
- )
397
- )
398
- return variants
399
-
400
-
401
- def frameshift_dup(gene: str) -> list:
402
- variants = []
403
- term = f'{gene}[Gene Name] "mane select"[keyword]'
404
- stream = Entrez.esearch(db="nucleotide", term=term)
405
- record = Entrez.read(stream)
406
- stream = Entrez.efetch(
407
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
408
- )
409
- seqrecord = SeqIO.read(stream, "genbank")
410
- for feature in seqrecord.features:
411
- if feature.type == "CDS":
412
- cds = feature.location.extract(seqrecord).seq
413
- for index, base in enumerate(cds, start=1):
414
- variants.append((f"{seqrecord.id}:c.{str(index) + base}dup",))
415
- return variants
416
-
417
-
418
- def frameshift_del(gene: str) -> list:
419
- variants = []
420
- term = f'{gene}[Gene Name] "mane select"[keyword]'
421
- stream = Entrez.esearch(db="nucleotide", term=term)
422
- record = Entrez.read(stream)
423
- stream = Entrez.efetch(
424
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
425
- )
426
- seqrecord = SeqIO.read(stream, "genbank")
427
- for feature in seqrecord.features:
428
- if feature.type == "CDS":
429
- cds = feature.location.extract(seqrecord).seq
430
- for index, base in enumerate(cds, start=1):
431
- variants.append((f"{seqrecord.id}:c.{str(index) + base}del",))
432
- return variants
varsim/_core.py ADDED
@@ -0,0 +1,432 @@
1
+ import os
2
+
3
+ from Bio import Entrez, SeqIO
4
+ from Bio.Data.CodonTable import standard_dna_table
5
+ from Bio.Data.IUPACData import (
6
+ unambiguous_dna_letters,
7
+ protein_letters,
8
+ protein_letters_1to3,
9
+ protein_letters_3to1,
10
+ )
11
+ from Bio.Seq import Seq
12
+ from Bio.SeqFeature import SimpleLocation
13
+ from Bio.SeqUtils import seq3
14
+
15
# Public names of this module, listed in definition order.
__all__ = [
    "cds",
    "utr5",
    "utr3",
    "splicing",
    "aa_sub",
    "missense",
    "inframe_del",
    "inframe_dup",
    "frameshift_dup",
    "frameshift_del",
]

# Entrez credentials are read from the environment at import time; a missing
# EMAIL or API_KEY variable raises KeyError while the module is being imported.
Entrez.email = os.environ["EMAIL"]
Entrez.api_key = os.environ["API_KEY"]

# Codon strings that key the standard DNA table's forward (codon -> amino
# acid) mapping; used by ``missense`` as the set of replacement codons.
codons = standard_dna_table.forward_table.keys()
31
+
32
+
33
def cds(gene: str) -> list:
    """Enumerate single-nucleotide substitutions across a gene's coding sequence.

    The gene's MANE Select transcript is looked up in the NCBI nucleotide
    database. For every codon of each CDS feature except the last one, each
    of the three positions is replaced by every other unambiguous DNA base
    and the resulting codon is translated.

    Args:
        gene: Gene name placed in the Entrez ``[Gene Name]`` search field.

    Returns:
        A list of ``(c_variant, p_variant_1letter, p_variant_3letter)``
        tuples. The protein descriptions end in ``=`` when the substituted
        codon translates to the amino acid annotated at that position.
    """
    variants = []
    # Context managers close the Entrez handles even if parsing fails.
    with Entrez.esearch(
        db="nucleotide",
        term=f'{gene}[Gene Name] "mane select"[Keyword]',
    ) as stream:
        record = Entrez.read(stream)
    with Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    ) as stream:
        seqrecord = SeqIO.read(stream, "genbank")

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.extract(seqrecord).seq
        # The range stops before the final codon, so the last triplet of the
        # CDS (presumably the stop codon) is never mutated.
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            ref_aa = protein[index]
            for base in unambiguous_dna_letters:
                # Output order is kept as: per base, positions 1, 2, 3.
                for offset in range(3):
                    ref_base = coding[codon + offset]
                    if base == ref_base:
                        continue
                    mutated = (
                        coding[codon : codon + offset]
                        + base
                        + coding[codon + offset + 1 : codon + 3]
                    )
                    alt_aa = str(mutated.translate())
                    c_name = (
                        f"{seqrecord.id}:c.{codon + offset + 1}{ref_base}>{base}"
                    )
                    if ref_aa != alt_aa:
                        variants.append(
                            (
                                c_name,
                                f"{protein_id}:p.{ref_aa}{index + 1}{alt_aa}",
                                f"{protein_id}:p.{seq3(ref_aa)}{index + 1}{seq3(alt_aa)}",
                            )
                        )
                    else:
                        variants.append(
                            (
                                c_name,
                                f"{protein_id}:p.{ref_aa}{index + 1}=",
                                f"{protein_id}:p.{seq3(ref_aa)}{index + 1}=",
                            )
                        )
    return variants
106
+
107
+
108
def utr5(gene: str) -> list:
    """Enumerate single-nucleotide substitutions upstream of a gene's CDS.

    The gene's MANE Select transcript is looked up in the NCBI nucleotide
    database, and every base between the start of the record and the start
    of each CDS feature is replaced by every other unambiguous DNA base.

    Args:
        gene: Gene name placed in the Entrez ``[Gene Name]`` search field.

    Returns:
        A list of ``(c_variant, "", "")`` tuples; the two protein slots are
        always empty strings.
    """
    variants = []
    # Context managers close the Entrez handles even if parsing fails.
    with Entrez.esearch(
        db="nucleotide",
        term=f'{gene}[Gene Name] "mane select"[Keyword]',
    ) as stream:
        record = Entrez.read(stream)
    with Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    ) as stream:
        seqrecord = SeqIO.read(stream, "genbank")

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        upstream = SimpleLocation(0, feature.location.start).extract(seqrecord).seq
        length = len(upstream)
        for index, ref_base in enumerate(upstream):
            # Negative numbering: the base just before the CDS start is -1.
            position = index - length
            variants.extend(
                (f"{seqrecord.id}:c.{position}{ref_base}>{base}", "", "")
                for base in unambiguous_dna_letters
                if base != ref_base
            )
    return variants
133
+
134
+
135
def utr3(gene: str) -> list:
    """Enumerate single-nucleotide substitutions downstream of a gene's CDS.

    The gene's MANE Select transcript is looked up in the NCBI nucleotide
    database, and every base between the end of each CDS feature and the end
    of the record is replaced by every other unambiguous DNA base.

    Args:
        gene: Gene name placed in the Entrez ``[Gene Name]`` search field.

    Returns:
        A list of ``(c_variant, "", "")`` tuples; the two protein slots are
        always empty strings.
    """
    variants = []
    # Context managers close the Entrez handles even if parsing fails.
    with Entrez.esearch(
        db="nucleotide",
        term=f'{gene}[Gene Name] "mane select"[Keyword]',
    ) as stream:
        record = Entrez.read(stream)
    with Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    ) as stream:
        seqrecord = SeqIO.read(stream, "genbank")

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        downstream = (
            SimpleLocation(feature.location.end, len(seqrecord))
            .extract(seqrecord)
            .seq
        )
        # "*N" numbering: the first base after the CDS end is *1.
        for position, ref_base in enumerate(downstream, start=1):
            variants.extend(
                (f"{seqrecord.id}:c.*{position}{ref_base}>{base}", "", "")
                for base in unambiguous_dna_letters
                if base != ref_base
            )
    return variants
164
+
165
+
166
def splicing(gene: str) -> list:
    """Enumerate substitutions at the +1/+2 and -2/-1 splice positions.

    The gene's MANE Select transcript is looked up in the NCBI nucleotide
    database. For the junction after every ``exon`` feature except the last,
    the function emits substitutions of an assumed ``GT`` at the donor
    positions (+1, +2) and an assumed ``AG`` at the acceptor positions
    (-2, -1). The intronic reference bases are hard-coded, not read from a
    genomic sequence.

    Positions in front of the CDS are numbered negatively and positions
    after the CDS end use the ``*N`` form, matching ``utr5`` and ``utr3``;
    position 0 is never produced.

    Args:
        gene: Gene name placed in the Entrez ``[Gene Name]`` search field.

    Returns:
        A list of plain strings (not tuples), one per variant.
    """
    variants = []
    # Context managers close the Entrez handles even if parsing fails.
    with Entrez.esearch(
        db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]'
    ) as stream:
        record = Entrez.read(stream)
    with Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    ) as stream:
        seqrecord = SeqIO.read(stream, "genbank")

    # Without a CDS feature these bounds number every base as offset + 1.
    cds_start, cds_end = 0, len(seqrecord)
    for feature in seqrecord.features:
        if feature.type == "CDS":
            cds_start = int(feature.location.start)
            cds_end = int(feature.location.end)

    def c_position(offset: int) -> str:
        """Return the c. coordinate of the base at 0-based record *offset*."""
        if offset < cds_start:
            return str(offset - cds_start)
        if offset >= cds_end:
            return f"*{offset - cds_end + 1}"
        return str(offset - cds_start + 1)

    exon_ends = [
        int(feature.location.end)
        for feature in seqrecord.features
        if feature.type == "exon"
    ]
    # The last exon has no downstream junction, so it is skipped.
    for exon_end in exon_ends[:-1]:
        donor = c_position(exon_end - 1)  # last base of this exon
        acceptor = c_position(exon_end)  # first base of the next exon
        for base in unambiguous_dna_letters:
            if base != "G":
                variants.append(f"{seqrecord.id}:c.{donor}+1G>{base}")
            if base != "T":
                variants.append(f"{seqrecord.id}:c.{donor}+2T>{base}")
            if base != "A":
                variants.append(f"{seqrecord.id}:c.{acceptor}-2A>{base}")
            if base != "G":
                variants.append(f"{seqrecord.id}:c.{acceptor}-1G>{base}")
    return variants
216
+
217
+
218
def aa_sub(gene: str) -> list:
    """Enumerate every single amino-acid substitution of a gene's protein.

    The gene's MANE Select protein is looked up in the NCBI protein
    database, and each residue is replaced by every other letter in
    ``protein_letters``.

    Args:
        gene: Gene name placed in the Entrez ``[Gene Name]`` search field.

    Returns:
        A list of ``(p_variant_1letter, p_variant_3letter)`` tuples.
    """
    variants = []
    term = f'{gene}[Gene Name] AND "mane select"[keyword]'
    # Context managers close the Entrez handles even if parsing fails.
    with Entrez.esearch(db="protein", term=term) as stream:
        record = Entrez.read(stream)
    with Entrez.efetch(
        db="protein", rettype="gp", retmode="text", id=record["IdList"]
    ) as stream:
        seqrecord = SeqIO.read(stream, "genbank")

    # seq3 is used instead of a protein_letters_1to3 lookup so that residues
    # outside the 20 standard letters (e.g. "U") do not raise KeyError.
    three_letter = {aa: seq3(aa) for aa in protein_letters}
    for index, residue in enumerate(seqrecord.seq, 1):
        residue3 = seq3(residue)
        variants.extend(
            (
                f"{seqrecord.id}:p.{residue}{index}{aa}",
                f"{seqrecord.id}:p.{residue3}{index}{three_letter[aa]}",
            )
            for aa in protein_letters
            if aa != residue
        )
    return variants
238
+
239
+
240
def missense(gene: str) -> list:
    """Enumerate whole-codon replacements across a gene's coding sequence.

    The gene's MANE Select transcript is looked up in the NCBI nucleotide
    database. Every codon of each CDS feature except the last one is replaced
    by every other codon in the module-level ``codons`` collection.

    A replacement that differs from the reference codon at exactly one
    position is described as a single-base substitution; otherwise the whole
    codon range is described as ``c.<first>_<last><ref>><alt>``. Despite the
    function's name, replacements that keep the annotated amino acid are
    also returned, with a protein description ending in ``=``.

    Args:
        gene: Gene name placed in the Entrez ``[Gene Name]`` search field.

    Returns:
        A list of ``(c_variant, p_variant_1letter, p_variant_3letter)``
        tuples.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    # Context managers close the Entrez handles even if parsing fails.
    with Entrez.esearch(db="nucleotide", term=term) as stream:
        record = Entrez.read(stream)
    with Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    ) as stream:
        seqrecord = SeqIO.read(stream, "genbank")

    # Translate each candidate codon once instead of once per CDS position.
    codon_aa = {triplet: str(Seq(triplet).translate()) for triplet in codons}

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        # The range stops before the final codon, so the last triplet of the
        # CDS (presumably the stop codon) is never replaced.
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            ref = str(coding[codon : codon + 3])
            ref_aa = protein[index]
            ref_aa3 = seq3(ref_aa)
            for base in codons:
                if base == ref:
                    continue
                changed = [i for i in range(3) if base[i] != ref[i]]
                if len(changed) == 1:
                    offset = changed[0]
                    c_name = (
                        f"{seqrecord.id}:c.{codon + offset + 1}"
                        f"{ref[offset]}>{base[offset]}"
                    )
                else:
                    c_name = f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{ref}>{base}"
                alt_aa = codon_aa[base]
                if ref_aa != alt_aa:
                    variants.append(
                        (
                            c_name,
                            f"{protein_id}:p.{ref_aa}{index + 1}{alt_aa}",
                            f"{protein_id}:p.{ref_aa3}{index + 1}{seq3(alt_aa)}",
                        )
                    )
                else:
                    variants.append(
                        (
                            c_name,
                            f"{protein_id}:p.{ref_aa}{index + 1}=",
                            f"{protein_id}:p.{ref_aa3}{index + 1}=",
                        )
                    )
    return variants
349
+
350
+
351
def inframe_del(gene: str) -> list:
    """Enumerate single-codon deletions across a gene's coding sequence.

    The gene's MANE Select transcript is looked up in the NCBI nucleotide
    database, and one deletion is emitted for every codon of each CDS
    feature except the last one.

    Args:
        gene: Gene name placed in the Entrez ``[Gene Name]`` search field.

    Returns:
        A list of ``(c_variant, p_variant_1letter, p_variant_3letter)``
        tuples, each description ending in ``del``.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    # Context managers close the Entrez handles even if parsing fails.
    with Entrez.esearch(db="nucleotide", term=term) as stream:
        record = Entrez.read(stream)
    with Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    ) as stream:
        seqrecord = SeqIO.read(stream, "genbank")

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        # The range stops before the final codon of the CDS.
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            residue = protein[index]
            triplet = coding[codon : codon + 3]
            variants.append(
                (
                    f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{triplet}del",
                    f"{protein_id}:p.{residue}{index + 1}del",
                    f"{protein_id}:p.{seq3(residue)}{index + 1}del",
                )
            )
    return variants
374
+
375
+
376
def inframe_dup(gene: str) -> list:
    """Enumerate single-codon duplications across a gene's coding sequence.

    The gene's MANE Select transcript is looked up in the NCBI nucleotide
    database, and one duplication is emitted for every codon of each CDS
    feature except the last one.

    Args:
        gene: Gene name placed in the Entrez ``[Gene Name]`` search field.

    Returns:
        A list of ``(c_variant, p_variant_1letter, p_variant_3letter)``
        tuples, each description ending in ``dup``.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    # Context managers close the Entrez handles even if parsing fails.
    with Entrez.esearch(db="nucleotide", term=term) as stream:
        record = Entrez.read(stream)
    with Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    ) as stream:
        seqrecord = SeqIO.read(stream, "genbank")

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        # The range stops before the final codon of the CDS.
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            residue = protein[index]
            triplet = coding[codon : codon + 3]
            variants.append(
                (
                    f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{triplet}dup",
                    f"{protein_id}:p.{residue}{index + 1}dup",
                    f"{protein_id}:p.{seq3(residue)}{index + 1}dup",
                )
            )
    return variants
399
+
400
+
401
def frameshift_dup(gene: str) -> list:
    """Enumerate single-base duplications across a gene's coding sequence.

    The gene's MANE Select transcript is looked up in the NCBI nucleotide
    database, and one duplication is emitted for every base of each CDS
    feature, including the final codon.

    Args:
        gene: Gene name placed in the Entrez ``[Gene Name]`` search field.

    Returns:
        A list of 1-tuples ``(c_variant,)``, each description ending in
        ``dup``.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    # Context managers close the Entrez handles even if parsing fails.
    with Entrez.esearch(db="nucleotide", term=term) as stream:
        record = Entrez.read(stream)
    with Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    ) as stream:
        seqrecord = SeqIO.read(stream, "genbank")

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        coding = feature.location.extract(seqrecord).seq
        variants.extend(
            (f"{seqrecord.id}:c.{position}{base}dup",)
            for position, base in enumerate(coding, start=1)
        )
    return variants
416
+
417
+
418
def frameshift_del(gene: str) -> list:
    """Enumerate single-base deletions across a gene's coding sequence.

    The gene's MANE Select transcript is looked up in the NCBI nucleotide
    database, and one deletion is emitted for every base of each CDS
    feature, including the final codon.

    Args:
        gene: Gene name placed in the Entrez ``[Gene Name]`` search field.

    Returns:
        A list of 1-tuples ``(c_variant,)``, each description ending in
        ``del``.
    """
    variants = []
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    # Context managers close the Entrez handles even if parsing fails.
    with Entrez.esearch(db="nucleotide", term=term) as stream:
        record = Entrez.read(stream)
    with Entrez.efetch(
        db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
    ) as stream:
        seqrecord = SeqIO.read(stream, "genbank")

    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        coding = feature.location.extract(seqrecord).seq
        variants.extend(
            (f"{seqrecord.id}:c.{position}{base}del",)
            for position, base in enumerate(coding, start=1)
        )
    return variants
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: varsim
3
- Version: 1.0.6
3
+ Version: 1.0.8
4
4
  Summary: Variant Simulator
5
5
  Author-email: Liu Sun <sunliu@yxnu.edu.cn>, Jian Yang <yangjian@yxnu.edu.cn>
6
6
  Project-URL: Homepage, https://github.com/liu-sun/VarSim
@@ -0,0 +1,7 @@
1
+ varsim/__init__.py,sha256=JaMqmsarXIhBjA7LU2xo8JzGVxQthcRbwuUBf4VFQKI,369
2
+ varsim/_core.py,sha256=ZIHKGBIp-8_seM6suoUCe2A8OgOL0gR8Oocu-IpCe_k,17712
3
+ varsim-1.0.8.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
4
+ varsim-1.0.8.dist-info/METADATA,sha256=3IScDeREX_hKNqFbyGemKmyHvgBQ84-tflM7cixFxe0,2464
5
+ varsim-1.0.8.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
6
+ varsim-1.0.8.dist-info/top_level.txt,sha256=2fLprhnBvkF-7VEOzGcpKoodqW08HjyNbVzM6emJrTI,7
7
+ varsim-1.0.8.dist-info/RECORD,,
@@ -1,6 +0,0 @@
1
- varsim/__init__.py,sha256=ZIHKGBIp-8_seM6suoUCe2A8OgOL0gR8Oocu-IpCe_k,17712
2
- varsim-1.0.6.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
3
- varsim-1.0.6.dist-info/METADATA,sha256=p205IC4VbHE2OXj7KexqchrFKTp0Ema67c37s4O3rFs,2464
4
- varsim-1.0.6.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
5
- varsim-1.0.6.dist-info/top_level.txt,sha256=2fLprhnBvkF-7VEOzGcpKoodqW08HjyNbVzM6emJrTI,7
6
- varsim-1.0.6.dist-info/RECORD,,
File without changes