varsim 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
varsim/__init__.py ADDED
@@ -0,0 +1,270 @@
1
+ import os
2
+
3
+ from Bio import Entrez, SeqIO
4
+ from Bio.Data.CodonTable import standard_dna_table
5
+ from Bio.Data.IUPACData import (unambiguous_dna_letters, protein_letters, protein_letters_1to3, protein_letters_3to1, )
6
+ from Bio.Seq import Seq
7
+ from Bio.SeqFeature import SimpleLocation
8
+ from Bio.SeqUtils import seq3
9
+
10
# NCBI E-utilities credentials are read from the environment.  ``.get`` is
# used so that importing the package does not raise KeyError when EMAIL or
# API_KEY is unset; the corresponding Entrez setting is then left as None.
Entrez.email = os.environ.get("EMAIL")
Entrez.api_key = os.environ.get("API_KEY")
# Keys of the standard DNA forward codon table; iterated by ``missense`` as
# the set of candidate replacement codons.
codons = standard_dna_table.forward_table.keys()
13
+
14
+
15
def cds(gene: str) -> list:
    """Enumerate every single-nucleotide substitution in the coding sequence.

    The nucleotide record matching ``gene`` and the "mane select" keyword is
    fetched from NCBI.  For each codon of every CDS feature except the last
    codon, each position is substituted with every other letter of
    ``unambiguous_dna_letters`` and the resulting codon is translated.

    Args:
        gene: Gene name used in the Entrez search term.

    Returns:
        A list of 3-tuples ``(c_change, p_change_1letter, p_change_3letter)``.
        When the substitution does not change the amino acid, the protein
        descriptions end with ``=`` instead of the new residue.
    """
    stream = Entrez.esearch(db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]')
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.extract(seqrecord).seq
        # Stopping at len - 3 leaves the final codon out of the scan.
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            ref_codon = str(coding[codon:codon + 3])
            ref_aa = protein[index]
            ref_aa3 = seq3(ref_aa)
            for base in unambiguous_dna_letters:
                # Emit position 1, then 2, then 3 for each replacement base.
                for offset in range(3):
                    if base == ref_codon[offset]:
                        continue
                    alt_codon = ref_codon[:offset] + base + ref_codon[offset + 1:]
                    alt_aa = str(Seq(alt_codon).translate())
                    if alt_aa != ref_aa:
                        short_change, long_change = alt_aa, seq3(alt_aa)
                    else:
                        short_change = long_change = "="
                    variants.append((
                        f"{seqrecord.id}:c.{codon + offset + 1}{ref_codon[offset]}>{base}",
                        f"{protein_id}:p.{ref_aa}{index + 1}{short_change}",
                        f"{protein_id}:p.{ref_aa3}{index + 1}{long_change}",
                    ))
    return variants
59
+
60
+
61
def utr5(gene: str) -> list:
    """Enumerate every single-nucleotide substitution upstream of the CDS.

    The region runs from the start of the fetched record up to the start of
    each CDS feature.  Positions are written as negative ``c.`` offsets, with
    the base immediately before the CDS numbered ``-1``.

    Args:
        gene: Gene name used in the Entrez search term.

    Returns:
        A list of 3-tuples ``(c_change, "", "")``; the two protein slots are
        always empty strings.
    """
    stream = Entrez.esearch(db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]')
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        upstream = SimpleLocation(0, feature.location.start).extract(seqrecord).seq
        length = len(upstream)
        for index, ref in enumerate(upstream):
            for base in unambiguous_dna_letters:
                if base != ref:
                    variants.append((f"{seqrecord.id}:c.{index - length}{ref}>{base}", "", ""))
    return variants
75
+
76
+
77
def utr3(gene: str) -> list:
    """Enumerate every single-nucleotide substitution downstream of the CDS.

    The region runs from the end of each CDS feature to the end of the
    fetched record.  Positions are written as ``c.*N``, with the first base
    after the CDS numbered ``*1``.

    Args:
        gene: Gene name used in the Entrez search term.

    Returns:
        A list of 3-tuples ``(c_change, "", "")``; the two protein slots are
        always empty strings.
    """
    stream = Entrez.esearch(db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]')
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        downstream = SimpleLocation(feature.location.end, len(seqrecord)).extract(seqrecord).seq
        for index, ref in enumerate(downstream):
            for base in unambiguous_dna_letters:
                if base != ref:
                    variants.append((f"{seqrecord.id}:c.*{index + 1}{ref}>{base}", "", ""))
    return variants
91
+
92
+
93
def splicing(gene: str) -> list:
    """Enumerate substitutions at the +1/+2 and -2/-1 intronic positions.

    Exon features of the fetched transcript record are used to locate
    exon-exon junctions.  The record holds no intron sequence, so the intronic
    reference bases are hard-coded as ``G``/``T`` at +1/+2 and ``A``/``G`` at
    -2/-1.  For every junction, each of the four positions is replaced by
    every other letter of ``unambiguous_dna_letters``.

    Junction anchors are numbered relative to the last CDS feature found:
    negative upstream of the CDS (no position 0), 1-based inside it, and
    ``*N`` downstream of it, matching the numbering used by ``utr3``.

    Args:
        gene: Gene name used in the Entrez search term.

    Returns:
        A list of plain strings (not tuples), one per variant.
    """
    stream = Entrez.esearch(db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]')
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    # Without a CDS feature, positions fall back to plain 1-based numbering.
    cds_start = 0
    cds_end = None
    for feature in seqrecord.features:
        if feature.type == "CDS":
            cds_start = int(feature.location.start)
            cds_end = int(feature.location.end)

    def c_position(index: int) -> str:
        """Convert a 0-based record index to its ``c.`` position label."""
        if index < cds_start:
            # Upstream of the CDS: -1 is the base just before it.
            return str(index - cds_start)
        if cds_end is not None and index >= cds_end:
            # Downstream of the CDS: *1 is the base just after it.
            return f"*{index - cds_end + 1}"
        return str(index - cds_start + 1)

    exon_ends = [
        int(feature.location.end)
        for feature in seqrecord.features
        if feature.type == "exon"
    ]

    variants = []
    # The last exon has no downstream junction, so it is skipped.
    for exon_end in exon_ends[:-1]:
        # exon_end is exclusive: exon_end - 1 is the last base of this exon
        # and exon_end is taken as the first base of the following exon.
        donor = c_position(exon_end - 1)
        acceptor = c_position(exon_end)
        for base in unambiguous_dna_letters:
            if base != "G":
                variants.append(f"{seqrecord.id}:c.{donor}+1G>{base}")
            if base != "T":
                variants.append(f"{seqrecord.id}:c.{donor}+2T>{base}")
            if base != "A":
                variants.append(f"{seqrecord.id}:c.{acceptor}-2A>{base}")
            if base != "G":
                variants.append(f"{seqrecord.id}:c.{acceptor}-1G>{base}")
    return variants
130
+
131
+
132
def aa_sub(gene: str) -> list:
    """Enumerate every single amino-acid substitution of the protein.

    The protein record matching ``gene`` and the "mane select" keyword is
    fetched from NCBI, and each residue is replaced by every other letter of
    ``protein_letters``.

    Args:
        gene: Gene name used in the Entrez search term.

    Returns:
        A list of 2-tuples ``(p_change_1letter, p_change_3letter)``.
    """
    term = f'{gene}[Gene Name] AND "mane select"[keyword]'
    stream = Entrez.esearch(db="protein", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(db="protein", rettype="gp", retmode="text", id=record["IdList"])
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    # Three-letter codes of the candidate residues, computed once.
    three_letter = {aa: seq3(aa) for aa in protein_letters}

    variants = []
    for index, residue in enumerate(seqrecord.seq, 1):
        # seq3 is used instead of protein_letters_1to3 so that a reference
        # residue outside the 20 standard letters does not raise KeyError.
        residue3 = seq3(residue)
        for aa in protein_letters:
            if aa != residue:
                variants.append((
                    f"{seqrecord.id}:p.{residue}{index}{aa}",
                    f"{seqrecord.id}:p.{residue3}{index}{three_letter[aa]}",
                ))
    return variants
146
+
147
+
148
def missense(gene: str) -> list:
    """Enumerate every codon replacement in the coding sequence.

    For each codon of every CDS feature except the last codon, the codon is
    replaced by every other codon in the module-level ``codons`` collection.
    A replacement that differs at exactly one position is written as a
    single-base substitution; any other replacement is written over the whole
    codon as ``c.<first>_<last><ref>><alt>``.

    Args:
        gene: Gene name used in the Entrez search term.

    Returns:
        A list of 3-tuples ``(c_change, p_change_1letter, p_change_3letter)``.
        When the replacement codon encodes the same amino acid, the protein
        descriptions end with ``=`` instead of the new residue.
    """
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        # Stopping at len - 3 leaves the final codon out of the scan.
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            ref_codon = str(coding[codon:codon + 3])
            ref_aa = protein[index]
            ref_aa3 = seq3(ref_aa)
            for base in codons:
                if base == ref_codon:
                    continue
                changed = [pos for pos in range(3) if base[pos] != ref_codon[pos]]
                if len(changed) == 1:
                    pos = changed[0]
                    c_change = f"{seqrecord.id}:c.{codon + pos + 1}{ref_codon[pos]}>{base[pos]}"
                else:
                    # Two or three bases differ: describe the whole codon.
                    c_change = f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{ref_codon}>{base}"
                alt_aa = str(Seq(base).translate())
                if alt_aa != ref_aa:
                    short_change, long_change = alt_aa, seq3(alt_aa)
                else:
                    short_change = long_change = "="
                variants.append((
                    c_change,
                    f"{protein_id}:p.{ref_aa}{index + 1}{short_change}",
                    f"{protein_id}:p.{ref_aa3}{index + 1}{long_change}",
                ))
    return variants
199
+
200
+
201
def inframe_del(gene: str) -> list:
    """Enumerate the deletion of each whole codon in the coding sequence.

    Every codon of every CDS feature except the last codon yields one variant
    that removes its three bases and the corresponding residue.

    Args:
        gene: Gene name used in the Entrez search term.

    Returns:
        A list of 3-tuples ``(c_change, p_change_1letter, p_change_3letter)``,
        each ending in ``del``.
    """
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        # Stopping at len - 3 leaves the final codon out of the scan.
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            residue = protein[index]
            variants.append((
                f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{coding[codon:codon + 3]}del",
                f"{protein_id}:p.{residue}{index + 1}del",
                f"{protein_id}:p.{seq3(residue)}{index + 1}del",
            ))
    return variants
218
+
219
+
220
def inframe_dup(gene: str) -> list:
    """Enumerate the duplication of each whole codon in the coding sequence.

    Every codon of every CDS feature except the last codon yields one variant
    that duplicates its three bases and the corresponding residue.

    Args:
        gene: Gene name used in the Entrez search term.

    Returns:
        A list of 3-tuples ``(c_change, p_change_1letter, p_change_3letter)``,
        each ending in ``dup``.
    """
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        protein = "".join(feature.qualifiers.get("translation"))
        protein_id = "".join(feature.qualifiers.get("protein_id"))
        coding = feature.location.extract(seqrecord).seq
        # Stopping at len - 3 leaves the final codon out of the scan.
        for index, codon in enumerate(range(0, len(coding) - 3, 3)):
            residue = protein[index]
            variants.append((
                f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{coding[codon:codon + 3]}dup",
                f"{protein_id}:p.{residue}{index + 1}dup",
                f"{protein_id}:p.{seq3(residue)}{index + 1}dup",
            ))
    return variants
237
+
238
+
239
def frameshift_dup(gene: str) -> list:
    """Enumerate the duplication of each single base in the coding sequence.

    Every base of every CDS feature, including the final codon, yields one
    variant numbered from 1.

    Args:
        gene: Gene name used in the Entrez search term.

    Returns:
        A list of 1-tuples ``(c_change,)``, each ending in ``dup``.
    """
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        coding = feature.location.extract(seqrecord).seq
        variants.extend(
            (f"{seqrecord.id}:c.{position}{base}dup",)
            for position, base in enumerate(coding, start=1)
        )
    return variants
252
+
253
+
254
def frameshift_del(gene: str) -> list:
    """Enumerate the deletion of each single base in the coding sequence.

    Every base of every CDS feature, including the final codon, yields one
    variant numbered from 1.

    Args:
        gene: Gene name used in the Entrez search term.

    Returns:
        A list of 1-tuples ``(c_change,)``, each ending in ``del``.
    """
    term = f'{gene}[Gene Name] "mane select"[keyword]'
    stream = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the HTTP connection even if parsing fails.
        stream.close()
    stream = Entrez.efetch(db="nucleotide", id=record["IdList"], rettype="gb", retmode="text")
    try:
        seqrecord = SeqIO.read(stream, "genbank")
    finally:
        stream.close()

    variants = []
    for feature in seqrecord.features:
        if feature.type != "CDS":
            continue
        coding = feature.location.extract(seqrecord).seq
        variants.extend(
            (f"{seqrecord.id}:c.{position}{base}del",)
            for position, base in enumerate(coding, start=1)
        )
    return variants
267
+
268
+
269
if __name__ == "__main__":
    # Manual smoke run: print the single-base deletions generated for INS.
    demo_variants = frameshift_del("INS")
    print(demo_variants)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: varsim
3
- Version: 1.0.3
3
+ Version: 1.0.4
4
4
  Summary: Variant Simulator
5
5
  Author-email: Liu Sun <sunliu@yxnu.edu.cn>, Jian Yang <yangjian@yxnu.edu.cn>
6
6
  Project-URL: Homepage, https://github.com/liu-sun/VarSim
@@ -0,0 +1,6 @@
1
+ varsim/__init__.py,sha256=pZL4Wz3LJBoYvBdbNXByTx6qtCSkHqzBbR2dBWl7E60,14702
2
+ varsim-1.0.4.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
3
+ varsim-1.0.4.dist-info/METADATA,sha256=4W43kr56JnS5Jpw-U6E_wDEpepZYSYnojsdV_YgI-1M,2464
4
+ varsim-1.0.4.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
5
+ varsim-1.0.4.dist-info/top_level.txt,sha256=2fLprhnBvkF-7VEOzGcpKoodqW08HjyNbVzM6emJrTI,7
6
+ varsim-1.0.4.dist-info/RECORD,,
@@ -0,0 +1 @@
1
+ varsim
VarSim/__init__.py DELETED
@@ -1,414 +0,0 @@
1
- import os
2
-
3
- from Bio import Entrez, SeqIO
4
- from Bio.Data.IUPACData import (
5
- unambiguous_dna_letters,
6
- protein_letters,
7
- protein_letters_1to3,
8
- protein_letters_3to1,
9
- )
10
- from Bio.Seq import Seq
11
- from Bio.SeqUtils import seq3
12
- from Bio.SeqFeature import SimpleLocation
13
- from Bio.Data.CodonTable import standard_dna_table
14
-
15
- Entrez.email = os.environ["EMAIL"]
16
- Entrez.api_key = os.environ["API_KEY"]
17
- codons = standard_dna_table.forward_table.keys()
18
-
19
-
20
- def cds(gene: str) -> list:
21
- variants = []
22
- stream = Entrez.esearch(
23
- db="nucleotide",
24
- term=f'{gene}[Gene Name] "mane select"[Keyword]',
25
- )
26
- record = Entrez.read(stream)
27
- stream = Entrez.efetch(
28
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
29
- )
30
- seqrecord = SeqIO.read(stream, "genbank")
31
- for feature in seqrecord.features:
32
- if feature.type == "CDS":
33
- protein = "".join(feature.qualifiers.get("translation"))
34
- protein_id = "".join(feature.qualifiers.get("protein_id"))
35
- cds = feature.extract(seqrecord).seq
36
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
37
- for base in unambiguous_dna_letters:
38
- if base != cds[codon]:
39
- seq = Seq(base) + cds[codon + 1 : codon + 3]
40
- if protein[index] != seq.translate():
41
- variants.append(
42
- (
43
- f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base}",
44
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
45
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
46
- )
47
- )
48
- else:
49
- variants.append(
50
- (
51
- f"{seqrecord.id}:c.{codon + 1}{cds[codon]}>{base}",
52
- f"{protein_id}:p.{protein[index]}{index + 1}=",
53
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
54
- )
55
- )
56
- if base != cds[codon + 1]:
57
- seq = cds[codon] + Seq(base) + cds[codon + 2]
58
- if protein[index] != seq.translate():
59
- variants.append(
60
- (
61
- f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base}",
62
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
63
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
64
- )
65
- )
66
- else:
67
- variants.append(
68
- (
69
- f"{seqrecord.id}:c.{codon + 2}{cds[codon + 1]}>{base}",
70
- f"{protein_id}:p.{protein[index]}{index + 1}=",
71
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
72
- )
73
- )
74
- if base != cds[codon + 2]:
75
- seq = cds[codon : codon + 2] + Seq(base)
76
- if protein[index] != seq.translate():
77
- variants.append(
78
- (
79
- f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base}",
80
- f"{protein_id}:p.{protein[index]}{index + 1}{seq.translate()}",
81
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}{seq3(seq.translate())}",
82
- )
83
- )
84
- else:
85
- variants.append(
86
- (
87
- f"{seqrecord.id}:c.{codon + 3}{cds[codon + 2]}>{base}",
88
- f"{protein_id}:p.{protein[index]}{index + 1}=",
89
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}=",
90
- )
91
- )
92
- return variants
93
-
94
-
95
- def utr5(gene: str) -> list:
96
- variants = []
97
- stream = Entrez.esearch(
98
- db="nucleotide",
99
- term=f'{gene}[Gene Name] "mane select"[Keyword]',
100
- )
101
- record = Entrez.read(stream)
102
- stream = Entrez.efetch(
103
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
104
- )
105
- seqrecord = SeqIO.read(stream, "genbank")
106
- for feature in seqrecord.features:
107
- if feature.type == "CDS":
108
- utr5 = SimpleLocation(0, feature.location.start).extract(seqrecord).seq
109
- for index in range(len(utr5)):
110
- for base in unambiguous_dna_letters:
111
- if base != utr5[index]:
112
- variants.append(
113
- (
114
- f"{seqrecord.id}:c.{index - len(utr5)}{utr5[index]}>{base}",
115
- "",
116
- "",
117
- )
118
- )
119
- return variants
120
-
121
-
122
- def utr3(gene: str) -> list:
123
- variants = []
124
- stream = Entrez.esearch(
125
- db="nucleotide",
126
- term=f'{gene}[Gene Name] "mane select"[Keyword]',
127
- )
128
- record = Entrez.read(stream)
129
- stream = Entrez.efetch(
130
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
131
- )
132
- seqrecord = SeqIO.read(stream, "genbank")
133
- for feature in seqrecord.features:
134
- if feature.type == "CDS":
135
- utr3 = (
136
- SimpleLocation(feature.location.end, len(seqrecord))
137
- .extract(seqrecord)
138
- .seq
139
- )
140
- for index in range(len(utr3)):
141
- for base in unambiguous_dna_letters:
142
- if base != utr3[index]:
143
- variants.append(
144
- (
145
- f"{seqrecord.id}:c.*{index + 1}{utr3[index]}>{base}",
146
- "",
147
- "",
148
- )
149
- )
150
- return variants
151
-
152
-
153
- def splicing(gene: str) -> list:
154
- variants = []
155
- exon = []
156
- stream = Entrez.esearch(
157
- db="nucleotide", term=f'{gene}[Gene Name] "mane select"[Keyword]'
158
- )
159
- record = Entrez.read(stream)
160
-
161
- stream = Entrez.efetch(
162
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
163
- )
164
- seqrecord = SeqIO.read(stream, "genbank")
165
- splicing = []
166
- variants = []
167
- start = 0
168
- end = 0
169
- for feature in seqrecord.features:
170
- if feature.type == "CDS":
171
- start = feature.location.start
172
- end = feature.location.end
173
- for feature in seqrecord.features:
174
- if feature.type == "exon":
175
- if feature.location.start < start and feature.location.end < start:
176
- splicing.extend(
177
- (
178
- feature.location.start - start - 1,
179
- feature.location.end - start - 1,
180
- )
181
- )
182
- elif feature.location.start < start and feature.location.end > start:
183
- splicing.extend(
184
- (feature.location.start - start - 1, feature.location.end - start)
185
- )
186
- else:
187
- splicing.extend(
188
- (feature.location.start - start, feature.location.end - start)
189
- )
190
-
191
- for coordinate in range(1, len(splicing) - 1, 2):
192
- site = splicing[coordinate], splicing[coordinate] + 1
193
- for base in unambiguous_dna_letters:
194
- if base != "G":
195
- variants.append((f"{seqrecord.id}:c.{site[0]}+1G>{base}"))
196
- if base != "T":
197
- variants.append((f"{seqrecord.id}:c.{site[0]}+2T>{base}"))
198
- if base != "A":
199
- variants.append((f"{seqrecord.id}:c.{site[1]}-2A>{base}"))
200
- if base != "G":
201
- variants.append((f"{seqrecord.id}:c.{site[1]}-1G>{base}"))
202
- return variants
203
-
204
-
205
- def aa_sub(gene: str) -> list:
206
- variants = []
207
- term = f'{gene}[Gene Name] AND "mane select"[keyword]'
208
- stream = Entrez.esearch(db="protein", term=term)
209
- record = Entrez.read(stream)
210
-
211
- stream = Entrez.efetch(
212
- db="protein", rettype="gp", retmode="text", id=record["IdList"]
213
- )
214
- seqrecord = SeqIO.read(stream, "genbank")
215
- for index, residue in enumerate(seqrecord.seq, 1):
216
- for aa in protein_letters:
217
- if aa != residue:
218
- variants.append(
219
- (
220
- f"{seqrecord.id}:p.{residue}{index}{aa}",
221
- f"{seqrecord.id}:p.{protein_letters_1to3[residue]}{
222
- index}{protein_letters_1to3[aa]}",
223
- )
224
- )
225
- return variants
226
-
227
-
228
- def missense(gene: str) -> list:
229
- variants = []
230
- term = f'{gene}[Gene Name] "mane select"[keyword]'
231
- stream = Entrez.esearch(db="nucleotide", term=term)
232
- record = Entrez.read(stream)
233
- stream = Entrez.efetch(
234
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
235
- )
236
- seqrecord = SeqIO.read(stream, "genbank")
237
- for feature in seqrecord.features:
238
- if feature.type == "CDS":
239
- protein = "".join(feature.qualifiers.get("translation"))
240
- protein_id = "".join(feature.qualifiers.get("protein_id"))
241
- cds = feature.location.extract(seqrecord).seq
242
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
243
- for base in codons:
244
- if base != cds[codon : codon + 3]:
245
- seq = Seq(base)
246
- if protein[index] != seq.translate():
247
- if (
248
- base[0] == cds[codon]
249
- and base[1] == cds[codon + 1]
250
- and base[2] != cds[codon + 2]
251
- ):
252
- variants.append(
253
- (
254
- f"{seqrecord.id}:c.{
255
- codon + 3}{cds[codon + 2]}>{base[2]}",
256
- f"{protein_id}:p.{protein[index]}{
257
- index + 1}{seq.translate()}",
258
- f"{protein_id}:p.{seq3(protein[index])}{
259
- index + 1}{seq3(seq.translate())}",
260
- )
261
- )
262
- elif (
263
- base[0] == cds[codon]
264
- and base[1] != cds[codon + 1]
265
- and base[2] == cds[codon + 2]
266
- ):
267
- variants.append(
268
- (
269
- f"{seqrecord.id}:c.{
270
- codon + 2}{cds[codon + 1]}>{base[1]}",
271
- f"{protein_id}:p.{protein[index]}{
272
- index + 1}{seq.translate()}",
273
- f"{protein_id}:p.{seq3(protein[index])}{
274
- index + 1}{seq3(seq.translate())}",
275
- )
276
- )
277
- elif (
278
- base[0] != cds[codon]
279
- and base[1] == cds[codon + 1]
280
- and base[2] == cds[codon + 2]
281
- ):
282
- variants.append(
283
- (
284
- f"{seqrecord.id}:c.{
285
- codon + 1}{cds[codon]}>{base[0]}",
286
- f"{protein_id}:p.{protein[index]}{
287
- index + 1}{seq.translate()}",
288
- f"{protein_id}:p.{seq3(protein[index])}{
289
- index + 1}{seq3(seq.translate())}",
290
- )
291
- )
292
- else:
293
- variants.append(
294
- (
295
- f"{seqrecord.id}:c.{codon + 1}_{codon +
296
- 3}{cds[codon:codon + 3]}>{base}",
297
- f"{protein_id}:p.{protein[index]}{
298
- index + 1}{seq.translate()}",
299
- f"{protein_id}:p.{seq3(protein[index])}{
300
- index + 1}{seq3(seq.translate())}",
301
- )
302
- )
303
- else:
304
- if (
305
- base[0] == cds[codon]
306
- and base[1] == cds[codon + 1]
307
- and base[2] != cds[codon + 2]
308
- ):
309
- variants.append(
310
- (
311
- f"{seqrecord.id}:c.{
312
- codon + 3}{cds[codon + 2]}>{base[2]}",
313
- f"{protein_id}:p.{
314
- protein[index]}{index + 1}=",
315
- f"{protein_id}:p.{seq3(protein[index])}{
316
- index + 1}=",
317
- )
318
- )
319
- elif (
320
- base[0] == cds[codon]
321
- and base[1] != cds[codon + 1]
322
- and base[2] == cds[codon + 2]
323
- ):
324
- variants.append(
325
- (
326
- f"{seqrecord.id}:c.{
327
- codon + 2}{cds[codon + 1]}>{base[1]}",
328
- f"{protein_id}:p.{
329
- protein[index]}{index + 1}=",
330
- f"{protein_id}:p.{seq3(protein[index])}{
331
- index + 1}=",
332
- )
333
- )
334
- elif (
335
- base[0] != cds[codon]
336
- and base[1] == cds[codon + 1]
337
- and base[2] == cds[codon + 2]
338
- ):
339
- variants.append(
340
- (
341
- f"{seqrecord.id}:c.{
342
- codon + 1}{cds[codon]}>{base[0]}",
343
- f"{protein_id}:p.{
344
- protein[index]}{index + 1}=",
345
- f"{protein_id}:p.{seq3(protein[index])}{
346
- index + 1}=",
347
- )
348
- )
349
- else:
350
- variants.append(
351
- (
352
- f"{seqrecord.id}:c.{codon + 1}_{codon +
353
- 3}{cds[codon:codon + 3]}>{base}",
354
- f"{protein_id}:p.{
355
- protein[index]}{index + 1}=",
356
- f"{protein_id}:p.{seq3(protein[index])}{
357
- index + 1}=",
358
- )
359
- )
360
- return variants
361
-
362
-
363
- def inframe_del(gene: str) -> list:
364
- variants = []
365
- term = f'{gene}[Gene Name] "mane select"[keyword]'
366
- stream = Entrez.esearch(db="nucleotide", term=term)
367
- record = Entrez.read(stream)
368
- stream = Entrez.efetch(
369
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
370
- )
371
- seqrecord = SeqIO.read(stream, "genbank")
372
- for feature in seqrecord.features:
373
- if feature.type == "CDS":
374
- protein = "".join(feature.qualifiers.get("translation"))
375
- protein_id = "".join(feature.qualifiers.get("protein_id"))
376
- cds = feature.location.extract(seqrecord).seq
377
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
378
- variants.append(
379
- (
380
- f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}del",
381
- f"{protein_id}:p.{protein[index]}{index + 1}del",
382
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}del",
383
- )
384
- )
385
- return variants
386
-
387
-
388
- def inframe_dup(gene: str) -> list:
389
- variants = []
390
- term = f'{gene}[Gene Name] "mane select"[keyword]'
391
- stream = Entrez.esearch(db="nucleotide", term=term)
392
- record = Entrez.read(stream)
393
- stream = Entrez.efetch(
394
- db="nucleotide", id=record["IdList"], rettype="gb", retmode="text"
395
- )
396
- seqrecord = SeqIO.read(stream, "genbank")
397
- for feature in seqrecord.features:
398
- if feature.type == "CDS":
399
- protein = "".join(feature.qualifiers.get("translation"))
400
- protein_id = "".join(feature.qualifiers.get("protein_id"))
401
- cds = feature.location.extract(seqrecord).seq
402
- for index, codon in enumerate(range(0, len(cds) - 3, 3)):
403
- variants.append(
404
- (
405
- f"{seqrecord.id}:c.{codon + 1}_{codon + 3}{cds[codon:codon + 3]}dup",
406
- f"{protein_id}:p.{protein[index]}{index + 1}dup",
407
- f"{protein_id}:p.{seq3(protein[index])}{index + 1}dup",
408
- )
409
- )
410
- return variants
411
-
412
-
413
- if __name__ == "__main__":
414
- print(splicing("TMPRSS6"))
@@ -1,6 +0,0 @@
1
- VarSim/__init__.py,sha256=6bFR9e1vaNb4Ow1pBvraet1o-5WXtkGnenmlkLjGFKg,17291
2
- varsim-1.0.3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
3
- varsim-1.0.3.dist-info/METADATA,sha256=8HrhM1PXMWht3ca3CBshTNq26ZgV2DytKTW5INsPSoE,2464
4
- varsim-1.0.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
5
- varsim-1.0.3.dist-info/top_level.txt,sha256=k7Z7TmZCty_ldWkOo_O6Nw15AZ2d55Sj8v7GKtu_Pzo,7
6
- varsim-1.0.3.dist-info/RECORD,,
@@ -1 +0,0 @@
1
- VarSim
File without changes