pydna 5.5.4__py3-none-any.whl → 5.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydna/__init__.py +24 -193
- pydna/_pretty.py +8 -8
- pydna/_thermodynamic_data.py +3 -3
- pydna/alphabet.py +995 -0
- pydna/amplicon.py +19 -24
- pydna/amplify.py +75 -95
- pydna/assembly.py +64 -81
- pydna/assembly2.py +283 -294
- pydna/codon.py +4 -4
- pydna/common_sub_strings.py +6 -8
- pydna/contig.py +203 -10
- pydna/design.py +176 -60
- pydna/download.py +6 -15
- pydna/dseq.py +1794 -718
- pydna/dseqrecord.py +170 -169
- pydna/gateway.py +6 -6
- pydna/gel.py +5 -5
- pydna/genbank.py +43 -46
- pydna/genbankfixer.py +89 -92
- pydna/ladders.py +11 -12
- pydna/oligonucleotide_hybridization.py +124 -0
- pydna/opencloning_models.py +187 -60
- pydna/parsers.py +45 -32
- pydna/primer.py +4 -4
- pydna/primer_screen.py +833 -0
- pydna/readers.py +14 -9
- pydna/seq.py +137 -47
- pydna/seqrecord.py +54 -62
- pydna/sequence_picker.py +2 -5
- pydna/sequence_regex.py +6 -6
- pydna/tm.py +17 -17
- pydna/types.py +19 -19
- pydna/utils.py +97 -75
- {pydna-5.5.4.dist-info → pydna-5.5.5.dist-info}/METADATA +8 -8
- pydna-5.5.5.dist-info/RECORD +43 -0
- {pydna-5.5.4.dist-info → pydna-5.5.5.dist-info}/WHEEL +1 -1
- pydna/conftest.py +0 -42
- pydna/genbankfile.py +0 -42
- pydna/genbankrecord.py +0 -168
- pydna/goldengate.py +0 -45
- pydna/ligate.py +0 -62
- pydna/user_cloning.py +0 -29
- pydna-5.5.4.dist-info/RECORD +0 -46
- {pydna-5.5.4.dist-info → pydna-5.5.5.dist-info}/licenses/LICENSE.txt +0 -0
pydna/gateway.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
from Bio.Seq import reverse_complement
|
|
3
|
-
from pydna.dseqrecord import Dseqrecord
|
|
3
|
+
from pydna.dseqrecord import Dseqrecord
|
|
4
4
|
import re
|
|
5
|
-
import itertools
|
|
5
|
+
import itertools
|
|
6
6
|
from Bio.SeqFeature import SimpleLocation, SeqFeature
|
|
7
7
|
from pydna.utils import shift_location
|
|
8
8
|
from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer
|
|
@@ -79,7 +79,7 @@ primer_design_attB = {
|
|
|
79
79
|
|
|
80
80
|
|
|
81
81
|
def gateway_overlap(
|
|
82
|
-
seqx:
|
|
82
|
+
seqx: Dseqrecord, seqy: Dseqrecord, reaction: str, greedy: bool
|
|
83
83
|
) -> list[tuple[int, int, int]]:
|
|
84
84
|
"""
|
|
85
85
|
Find gateway overlaps. If greedy is True, it uses a more greedy consensus site to find attP sites,
|
|
@@ -110,7 +110,7 @@ def gateway_overlap(
|
|
|
110
110
|
if len(matches_y) == 0:
|
|
111
111
|
continue
|
|
112
112
|
|
|
113
|
-
for match_x, match_y in
|
|
113
|
+
for match_x, match_y in itertools.product(matches_x, matches_y):
|
|
114
114
|
# Find the overlap sequence within each match, and use the
|
|
115
115
|
# core 7 pbs that are constant
|
|
116
116
|
overlap_x = re.search(overlap_regex, match_x.group())
|
|
@@ -133,7 +133,7 @@ def gateway_overlap(
|
|
|
133
133
|
|
|
134
134
|
|
|
135
135
|
def find_gateway_sites(
|
|
136
|
-
seq:
|
|
136
|
+
seq: Dseqrecord, greedy: bool
|
|
137
137
|
) -> dict[str, list[SimpleLocation]]:
|
|
138
138
|
"""Find all gateway sites in a sequence and return a dictionary with the name and positions of the sites."""
|
|
139
139
|
gateway_sites = gateway_sites_greedy if greedy else gateway_sites_conservative
|
|
@@ -154,7 +154,7 @@ def find_gateway_sites(
|
|
|
154
154
|
return out
|
|
155
155
|
|
|
156
156
|
|
|
157
|
-
def annotate_gateway_sites(seq:
|
|
157
|
+
def annotate_gateway_sites(seq: Dseqrecord, greedy: bool) -> Dseqrecord:
|
|
158
158
|
sites = find_gateway_sites(seq, greedy)
|
|
159
159
|
for site in sites:
|
|
160
160
|
for loc in sites[site]:
|
pydna/gel.py
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
|
|
10
10
|
"""docstring."""
|
|
11
11
|
|
|
12
|
-
import math
|
|
12
|
+
import math
|
|
13
13
|
from pydna.ladders import GeneRuler_1kb_plus as _mwstd
|
|
14
14
|
|
|
15
15
|
|
|
@@ -31,8 +31,8 @@ def gel(
|
|
|
31
31
|
samples=None, gel_length=600, margin=50, interpolator=interpolator(mwstd=_mwstd)
|
|
32
32
|
):
|
|
33
33
|
import numpy as np
|
|
34
|
-
from PIL import Image
|
|
35
|
-
from PIL import ImageDraw
|
|
34
|
+
from PIL import Image
|
|
35
|
+
from PIL import ImageDraw
|
|
36
36
|
|
|
37
37
|
"""docstring."""
|
|
38
38
|
max_intensity = 256
|
|
@@ -54,7 +54,7 @@ def gel(
|
|
|
54
54
|
|
|
55
55
|
for lane_number, lane in enumerate(samples):
|
|
56
56
|
for band in lane:
|
|
57
|
-
log =
|
|
57
|
+
log = math.log(len(band), 10)
|
|
58
58
|
height = (band.m() / (240 * log)) * 1e10
|
|
59
59
|
peak_centre = interpolator(len(band)) * scale + start
|
|
60
60
|
max_spread = 10
|
|
@@ -68,7 +68,7 @@ def gel(
|
|
|
68
68
|
y2 = peak_centre + i
|
|
69
69
|
intensity = (
|
|
70
70
|
height
|
|
71
|
-
*
|
|
71
|
+
* math.exp(
|
|
72
72
|
-float(((y1 - peak_centre) ** 2)) / (2 * (band_spread**2))
|
|
73
73
|
)
|
|
74
74
|
* max_intensity
|
pydna/genbank.py
CHANGED
|
@@ -11,21 +11,17 @@ The function can be used if the environmental variable **pydna_email** has
|
|
|
11
11
|
been set to a valid email address. The easiest way to do this permanantly is to edit the
|
|
12
12
|
`pydna.ini` file. See the documentation of :func:`pydna.open_config_folder`"""
|
|
13
13
|
|
|
14
|
-
# from pydna.utils import memorize as _memorize
|
|
15
|
-
from pydna.genbankrecord import GenbankRecord as _GenbankRecord
|
|
16
|
-
from pydna.readers import read as _read
|
|
17
14
|
|
|
18
|
-
from
|
|
19
|
-
from
|
|
20
|
-
|
|
21
|
-
import os as _os
|
|
15
|
+
from pydna.opencloning_models import NCBISequenceSource
|
|
16
|
+
from pydna.readers import read
|
|
17
|
+
from pydna.dseqrecord import Dseqrecord
|
|
22
18
|
|
|
23
|
-
|
|
19
|
+
from Bio import Entrez
|
|
20
|
+
from Bio.SeqFeature import SimpleLocation
|
|
24
21
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
# TODO http://httpbin.org/ use for testing?
|
|
22
|
+
from typing import Literal, Optional
|
|
23
|
+
import re
|
|
24
|
+
import os
|
|
29
25
|
|
|
30
26
|
|
|
31
27
|
class Genbank:
|
|
@@ -54,15 +50,11 @@ class Genbank:
|
|
|
54
50
|
*,
|
|
55
51
|
tool: str = "pydna",
|
|
56
52
|
) -> None:
|
|
57
|
-
if not
|
|
58
|
-
r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}", users_email,
|
|
53
|
+
if not re.match(
|
|
54
|
+
r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}", users_email, re.IGNORECASE
|
|
59
55
|
):
|
|
60
56
|
raise ValueError("email address {} is not valid.".format(users_email))
|
|
61
57
|
|
|
62
|
-
# _module_logger.info("#### Genbank ititiation ####")
|
|
63
|
-
# _module_logger.info("Genbank initiated with email: %s", users_email)
|
|
64
|
-
# _module_logger.info("Genbank initiated with tool : %s", tool)
|
|
65
|
-
|
|
66
58
|
if users_email == "someone@example.com":
|
|
67
59
|
raise ValueError(
|
|
68
60
|
"you have to set your email address in order to download from Genbank"
|
|
@@ -78,10 +70,10 @@ class Genbank:
|
|
|
78
70
|
def nucleotide(
|
|
79
71
|
self,
|
|
80
72
|
item: str,
|
|
81
|
-
seq_start:
|
|
82
|
-
seq_stop:
|
|
83
|
-
strand:
|
|
84
|
-
) ->
|
|
73
|
+
seq_start: Optional[int] = None,
|
|
74
|
+
seq_stop: Optional[int] = None,
|
|
75
|
+
strand: Literal[1, 2] = 1,
|
|
76
|
+
) -> Dseqrecord:
|
|
85
77
|
"""This method downloads a genbank nuclotide record from genbank. This method is
|
|
86
78
|
cached by default. This can be controlled by editing the **pydna_cached_funcs** environment
|
|
87
79
|
variable. The best way to do this permanently is to edit the edit the
|
|
@@ -120,7 +112,7 @@ class Genbank:
|
|
|
120
112
|
"2", 2, "-" or "-1", the antisense (Crick) strand is returned, otherwise
|
|
121
113
|
the sense (Watson) strand is returned.
|
|
122
114
|
|
|
123
|
-
Result is returned as a :class:`
|
|
115
|
+
Result is returned as a :class:`Dseqrecord` object.
|
|
124
116
|
|
|
125
117
|
References
|
|
126
118
|
----------
|
|
@@ -129,15 +121,15 @@ class Genbank:
|
|
|
129
121
|
.. [#] http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
|
|
130
122
|
"""
|
|
131
123
|
matches = (
|
|
132
|
-
(1,
|
|
124
|
+
(1, re.search(r"(REGION:\s(?P<start>\d+)\.\.(?P<stop>\d+))", item)),
|
|
133
125
|
(
|
|
134
126
|
2,
|
|
135
|
-
|
|
127
|
+
re.search(
|
|
136
128
|
r"(REGION: complement\((?P<start>\d+)\.\.(?P<stop>\d+)\))", item
|
|
137
129
|
),
|
|
138
130
|
),
|
|
139
|
-
(1,
|
|
140
|
-
(2,
|
|
131
|
+
(1, re.search(r"(:|\s)(?P<start>\d+)-(?P<stop>\d+)", item)),
|
|
132
|
+
(2, re.search(r"(:|\s)c(?P<start>\d+)-(?P<stop>\d+)", item)),
|
|
141
133
|
)
|
|
142
134
|
|
|
143
135
|
for strand_, match in matches:
|
|
@@ -156,18 +148,10 @@ class Genbank:
|
|
|
156
148
|
except (KeyError, AttributeError):
|
|
157
149
|
strand = 1
|
|
158
150
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
# _module_logger.info("start %s", seq_start)
|
|
162
|
-
# _module_logger.info("stop %s", seq_stop)
|
|
163
|
-
|
|
164
|
-
# _module_logger.info("strand %s", str(strand))
|
|
151
|
+
Entrez.email = self.email
|
|
152
|
+
Entrez.tool = self.tool
|
|
165
153
|
|
|
166
|
-
|
|
167
|
-
_Entrez.tool = self.tool
|
|
168
|
-
|
|
169
|
-
# _module_logger.info("Entrez.email %s", self.email)
|
|
170
|
-
text = _Entrez.efetch(
|
|
154
|
+
text = Entrez.efetch(
|
|
171
155
|
db="nuccore",
|
|
172
156
|
id=item,
|
|
173
157
|
rettype="gbwithparts",
|
|
@@ -177,14 +161,30 @@ class Genbank:
|
|
|
177
161
|
retmode="text",
|
|
178
162
|
).read()
|
|
179
163
|
|
|
180
|
-
|
|
164
|
+
result = read(text)
|
|
165
|
+
# TODO: Address this for cases where only one is defined
|
|
166
|
+
if seq_start is not None and seq_stop is not None:
|
|
167
|
+
location = SimpleLocation(
|
|
168
|
+
int(seq_start) - 1, int(seq_stop), -1 if strand == 2 else strand
|
|
169
|
+
)
|
|
170
|
+
elif seq_start is None and seq_stop is None:
|
|
171
|
+
location = None
|
|
172
|
+
elif seq_stop is not None:
|
|
173
|
+
location = SimpleLocation(0, int(seq_stop), -1 if strand == 2 else strand)
|
|
174
|
+
else:
|
|
175
|
+
st = int(seq_start) - 1
|
|
176
|
+
location = SimpleLocation(
|
|
177
|
+
st, st + len(result), -1 if strand == 2 else strand
|
|
178
|
+
)
|
|
181
179
|
|
|
182
|
-
|
|
183
|
-
|
|
180
|
+
result.source = NCBISequenceSource(
|
|
181
|
+
repository_id=item,
|
|
182
|
+
coordinates=location,
|
|
184
183
|
)
|
|
184
|
+
return result
|
|
185
185
|
|
|
186
186
|
|
|
187
|
-
def genbank(accession: str = "CS570233.1", *args, **kwargs) ->
|
|
187
|
+
def genbank(accession: str = "CS570233.1", *args, email=None, **kwargs) -> Dseqrecord:
|
|
188
188
|
"""
|
|
189
189
|
Download a genbank nuclotide record.
|
|
190
190
|
|
|
@@ -229,9 +229,6 @@ def genbank(accession: str = "CS570233.1", *args, **kwargs) -> _GenbankRecord:
|
|
|
229
229
|
//
|
|
230
230
|
|
|
231
231
|
"""
|
|
232
|
-
email =
|
|
233
|
-
# _module_logger.info("#### genbank function called ####")
|
|
234
|
-
# _module_logger.info("email %s", email)
|
|
235
|
-
# _module_logger.info("accession %s", email)
|
|
232
|
+
email = email or os.getenv("pydna_email")
|
|
236
233
|
gb = Genbank(email)
|
|
237
234
|
return gb.nucleotide(accession, *args, **kwargs)
|
pydna/genbankfixer.py
CHANGED
|
@@ -24,63 +24,63 @@ This should not be a difficult fix. The returned result has two properties,
|
|
|
24
24
|
which is the formatted genbank string."""
|
|
25
25
|
|
|
26
26
|
|
|
27
|
-
import re
|
|
28
|
-
import pyparsing as
|
|
27
|
+
import re
|
|
28
|
+
import pyparsing as pp
|
|
29
29
|
|
|
30
30
|
GoodLocus = (
|
|
31
|
-
|
|
32
|
-
+
|
|
33
|
-
+
|
|
34
|
-
+
|
|
35
|
-
+
|
|
36
|
-
+ (
|
|
31
|
+
pp.Literal("LOCUS")
|
|
32
|
+
+ pp.Word(pp.alphas + pp.nums + "-_()." + "\\").setResultsName("name")
|
|
33
|
+
+ pp.Word(pp.nums).setResultsName("size")
|
|
34
|
+
+ pp.Suppress(pp.CaselessLiteral("bp"))
|
|
35
|
+
+ pp.Word(pp.alphas + "-").setResultsName("seqtype")
|
|
36
|
+
+ (pp.CaselessLiteral("linear") | pp.CaselessLiteral("circular")).setResultsName(
|
|
37
37
|
"topology"
|
|
38
38
|
)
|
|
39
|
-
+
|
|
40
|
-
+
|
|
39
|
+
+ pp.Optional(pp.Word(pp.alphas), default=" ").setResultsName("divcode")
|
|
40
|
+
+ pp.Regex(r"(\d{2})-(\S{3})-(\d{4})").setResultsName("date")
|
|
41
41
|
)
|
|
42
42
|
|
|
43
43
|
# Older versions of ApE don't include a LOCUS name! Need separate def for this case:
|
|
44
44
|
BrokenLocus1 = (
|
|
45
|
-
|
|
46
|
-
+
|
|
47
|
-
+
|
|
48
|
-
+
|
|
49
|
-
+ (
|
|
45
|
+
pp.Literal("LOCUS").setResultsName("name")
|
|
46
|
+
+ pp.Word(pp.nums).setResultsName("size")
|
|
47
|
+
+ pp.Suppress(pp.CaselessLiteral("bp"))
|
|
48
|
+
+ pp.Word(pp.alphas + "-").setResultsName("seqtype")
|
|
49
|
+
+ (pp.CaselessLiteral("linear") | pp.CaselessLiteral("circular")).setResultsName(
|
|
50
50
|
"topology"
|
|
51
51
|
)
|
|
52
|
-
+
|
|
53
|
-
+
|
|
52
|
+
+ pp.Optional(pp.Word(pp.alphas), default=" ").setResultsName("divcode")
|
|
53
|
+
+ pp.Regex(r"(\d{2})-(\S{3})-(\d{4})").setResultsName("date")
|
|
54
54
|
)
|
|
55
55
|
|
|
56
56
|
# LOCUS YEplac181 5741 bp DNA SYN
|
|
57
57
|
BrokenLocus2 = (
|
|
58
|
-
|
|
59
|
-
+
|
|
60
|
-
+
|
|
61
|
-
+
|
|
62
|
-
+
|
|
63
|
-
+
|
|
64
|
-
|
|
58
|
+
pp.Literal("LOCUS")
|
|
59
|
+
+ pp.Word(pp.alphas + pp.nums + "-_()." + "\\").setResultsName("name")
|
|
60
|
+
+ pp.Word(pp.nums).setResultsName("size")
|
|
61
|
+
+ pp.Suppress(pp.CaselessLiteral("bp"))
|
|
62
|
+
+ pp.Word(pp.alphas + "-").setResultsName("seqtype")
|
|
63
|
+
+ pp.Optional(
|
|
64
|
+
pp.CaselessLiteral("linear") | pp.CaselessLiteral("circular"),
|
|
65
65
|
default="linear",
|
|
66
66
|
).setResultsName("topology")
|
|
67
|
-
+
|
|
68
|
-
+
|
|
67
|
+
+ pp.Optional(pp.Word(pp.alphas), default=" ").setResultsName("divcode")
|
|
68
|
+
+ pp.Regex(r"(\d{2})-(\S{3})-(\d{4})").setResultsName("date")
|
|
69
69
|
)
|
|
70
70
|
|
|
71
71
|
BrokenLocus3 = (
|
|
72
|
-
|
|
73
|
-
+
|
|
74
|
-
+
|
|
75
|
-
+
|
|
76
|
-
+
|
|
77
|
-
+
|
|
78
|
-
|
|
72
|
+
pp.Literal("LOCUS")
|
|
73
|
+
+ pp.Word(pp.alphas + pp.nums + "-_()." + "\\").setResultsName("name")
|
|
74
|
+
+ pp.Word(pp.nums).setResultsName("size")
|
|
75
|
+
+ pp.Suppress(pp.CaselessLiteral("bp"))
|
|
76
|
+
+ pp.Word(pp.alphas + "-").setResultsName("seqtype")
|
|
77
|
+
+ pp.Optional(
|
|
78
|
+
pp.CaselessLiteral("linear") | pp.CaselessLiteral("circular"),
|
|
79
79
|
default="linear",
|
|
80
80
|
).setResultsName("topology")
|
|
81
|
-
+
|
|
82
|
-
+
|
|
83
|
-
|
|
81
|
+
+ pp.Word(pp.alphas).setResultsName("divcode")
|
|
82
|
+
+ pp.Optional(
|
|
83
|
+
pp.Regex(r"(\d{2})-(\S{3})-(\d{4})").setResultsName("date"),
|
|
84
84
|
default="19-MAR-1970",
|
|
85
85
|
).setResultsName("date")
|
|
86
86
|
)
|
|
@@ -95,14 +95,13 @@ LocusEntry = GoodLocus | BrokenLocus1 | BrokenLocus2 | BrokenLocus3
|
|
|
95
95
|
# (Though these entries are generally useless when it comes to hacking on DNA)
|
|
96
96
|
|
|
97
97
|
# All entries in a genbank file headed by an all-caps title with no space between start-of-line and title
|
|
98
|
-
CapWord =
|
|
98
|
+
CapWord = pp.Word("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
|
99
99
|
# after titled line, all subsequent lines have to have at least one space in front of them
|
|
100
100
|
# this is how we split up the genbank record
|
|
101
|
-
SpacedLine =
|
|
101
|
+
SpacedLine = pp.White(min=1) + pp.CharsNotIn("\n") + pp.LineEnd()
|
|
102
102
|
# HeaderLine = CapWord + CharsNotIn("\n") + LineEnd()
|
|
103
|
-
GenericEntry =
|
|
104
|
-
CapWord
|
|
105
|
-
+ _pp.Combine(_pp.CharsNotIn("\n") + _pp.LineEnd() + _pp.ZeroOrMore(SpacedLine))
|
|
103
|
+
GenericEntry = pp.Group(
|
|
104
|
+
CapWord + pp.Combine(pp.CharsNotIn("\n") + pp.LineEnd() + pp.ZeroOrMore(SpacedLine))
|
|
106
105
|
).setResultsName("generics", listAllMatches=True)
|
|
107
106
|
|
|
108
107
|
|
|
@@ -135,28 +134,28 @@ GenericEntry = _pp.Group(
|
|
|
135
134
|
#
|
|
136
135
|
# if you don't know where something is, don't use it or guess and move on
|
|
137
136
|
|
|
138
|
-
LPAREN =
|
|
139
|
-
RPAREN =
|
|
140
|
-
SEP =
|
|
137
|
+
LPAREN = pp.Suppress("(")
|
|
138
|
+
RPAREN = pp.Suppress(")")
|
|
139
|
+
SEP = pp.Suppress(pp.Literal(".."))
|
|
141
140
|
|
|
142
141
|
# recognize numbers w. < & > uncertainty specs, then strip the <> chars to make it fixed
|
|
143
|
-
gbIndex =
|
|
142
|
+
gbIndex = pp.Word(pp.nums + "<>").setParseAction(
|
|
144
143
|
lambda s, l_, t: int(t[0].replace("<", "").replace(">", ""))
|
|
145
144
|
)
|
|
146
|
-
SimpleSlice =
|
|
145
|
+
SimpleSlice = pp.Group(gbIndex + SEP + gbIndex) | pp.Group(gbIndex).setParseAction(
|
|
147
146
|
lambda s, l_, t: [[t[0][0], t[0][0]]]
|
|
148
147
|
)
|
|
149
148
|
|
|
150
149
|
# recursive def for nested function syntax: f( g(), g() )
|
|
151
|
-
complexSlice =
|
|
150
|
+
complexSlice = pp.Forward()
|
|
152
151
|
(
|
|
153
152
|
complexSlice
|
|
154
|
-
<< (
|
|
153
|
+
<< (pp.Literal("complement") | pp.Literal("join"))
|
|
155
154
|
+ LPAREN
|
|
156
|
-
+ (
|
|
155
|
+
+ (pp.delimitedList(complexSlice) | pp.delimitedList(SimpleSlice))
|
|
157
156
|
+ RPAREN
|
|
158
157
|
)
|
|
159
|
-
featLocation =
|
|
158
|
+
featLocation = pp.Group(SimpleSlice | complexSlice)
|
|
160
159
|
|
|
161
160
|
|
|
162
161
|
def parseGBLoc(s, l_, t):
|
|
@@ -183,7 +182,7 @@ featLocation.setParseAction(parseGBLoc)
|
|
|
183
182
|
|
|
184
183
|
|
|
185
184
|
def strip_multiline(s, l_, t):
|
|
186
|
-
whitespace =
|
|
185
|
+
whitespace = re.compile("[\n]{1}[ ]+")
|
|
187
186
|
return whitespace.sub(" ", t[0])
|
|
188
187
|
|
|
189
188
|
|
|
@@ -192,59 +191,57 @@ def toInt(s, l_, t):
|
|
|
192
191
|
|
|
193
192
|
|
|
194
193
|
# Quoted KeyVal: /key="value"
|
|
195
|
-
QuoteFeaturekeyval =
|
|
196
|
-
|
|
197
|
-
+
|
|
198
|
-
+
|
|
199
|
-
+
|
|
194
|
+
QuoteFeaturekeyval = pp.Group(
|
|
195
|
+
pp.Suppress("/")
|
|
196
|
+
+ pp.Word(pp.alphas + pp.nums + "_-")
|
|
197
|
+
+ pp.Suppress("=")
|
|
198
|
+
+ pp.QuotedString('"', multiline=True).setParseAction(strip_multiline)
|
|
200
199
|
)
|
|
201
200
|
|
|
202
201
|
# UnQuoted KeyVal: /key=value (I'm assuming it doesn't do multilines this way? wrong! ApE does store long labels this way! sigh.)
|
|
203
202
|
# NoQuoteFeaturekeyval = Group(Suppress('/') + Word(alphas+nums+"_-") + Suppress('=') + OneOrMore(CharsNotIn("\n")) )
|
|
204
203
|
keyvalspacedline = (
|
|
205
|
-
|
|
206
|
-
+
|
|
207
|
-
+
|
|
208
|
-
+
|
|
204
|
+
pp.White(exact=21)
|
|
205
|
+
+ pp.CharsNotIn("/")
|
|
206
|
+
+ pp.OneOrMore(pp.CharsNotIn("\n"))
|
|
207
|
+
+ pp.LineEnd()
|
|
209
208
|
)
|
|
210
|
-
NoQuoteFeaturekeyval =
|
|
211
|
-
|
|
212
|
-
+
|
|
213
|
-
+
|
|
214
|
-
+
|
|
215
|
-
_pp.CharsNotIn("\n") + _pp.LineEnd() + _pp.ZeroOrMore(keyvalspacedline)
|
|
216
|
-
)
|
|
209
|
+
NoQuoteFeaturekeyval = pp.Group(
|
|
210
|
+
pp.Suppress("/")
|
|
211
|
+
+ pp.Word(pp.alphas + pp.nums + "_-")
|
|
212
|
+
+ pp.Suppress("=")
|
|
213
|
+
+ pp.Combine(pp.CharsNotIn("\n") + pp.LineEnd() + pp.ZeroOrMore(keyvalspacedline))
|
|
217
214
|
)
|
|
218
215
|
|
|
219
216
|
# Special Case for Numerical Vals: /bases=12 OR /bases="12"
|
|
220
|
-
NumFeaturekeyval =
|
|
221
|
-
|
|
222
|
-
+
|
|
223
|
-
+
|
|
224
|
-
+ (
|
|
225
|
-
| (
|
|
217
|
+
NumFeaturekeyval = pp.Group(
|
|
218
|
+
pp.Suppress("/")
|
|
219
|
+
+ pp.Word(pp.alphas + pp.nums + "_-")
|
|
220
|
+
+ pp.Suppress("=")
|
|
221
|
+
+ (pp.Suppress('"') + pp.Word(pp.nums).setParseAction(toInt) + pp.Suppress('"'))
|
|
222
|
+
| (pp.Word(pp.nums).setParseAction(toInt))
|
|
226
223
|
)
|
|
227
224
|
|
|
228
225
|
# Key Only KeyVal: /pseudo
|
|
229
226
|
# post-parse convert it into a pair to resemble the structure of the first three cases i.e. [pseudo, True]
|
|
230
|
-
FlagFeaturekeyval =
|
|
231
|
-
|
|
227
|
+
FlagFeaturekeyval = pp.Group(
|
|
228
|
+
pp.Suppress("/") + pp.Word(pp.alphas + pp.nums + "_-")
|
|
232
229
|
).setParseAction(lambda s, l_, t: [[t[0][0], True]])
|
|
233
230
|
|
|
234
|
-
Feature =
|
|
235
|
-
|
|
231
|
+
Feature = pp.Group(
|
|
232
|
+
pp.Word(pp.alphas + pp.nums + "_-").setParseAction(
|
|
236
233
|
lambda s, l_, t: [["type", t[0]]]
|
|
237
234
|
)
|
|
238
235
|
+ featLocation.setResultsName("location")
|
|
239
|
-
+
|
|
236
|
+
+ pp.OneOrMore(
|
|
240
237
|
NumFeaturekeyval | QuoteFeaturekeyval | NoQuoteFeaturekeyval | FlagFeaturekeyval
|
|
241
238
|
)
|
|
242
239
|
)
|
|
243
240
|
|
|
244
241
|
FeaturesEntry = (
|
|
245
|
-
|
|
246
|
-
+
|
|
247
|
-
+
|
|
242
|
+
pp.Literal("FEATURES")
|
|
243
|
+
+ pp.Literal("Location/Qualifiers")
|
|
244
|
+
+ pp.Group(pp.OneOrMore(Feature)).setResultsName("features")
|
|
248
245
|
)
|
|
249
246
|
|
|
250
247
|
# ===============================================================================
|
|
@@ -252,12 +249,12 @@ FeaturesEntry = (
|
|
|
252
249
|
|
|
253
250
|
# sequence is just a column-spaced big table of dna nucleotides
|
|
254
251
|
# should it recognize full IUPAC alphabet? NCBI uses n for unknown region
|
|
255
|
-
Sequence =
|
|
256
|
-
|
|
252
|
+
Sequence = pp.OneOrMore(
|
|
253
|
+
pp.Suppress(pp.Word(pp.nums)) + pp.OneOrMore(pp.Word("ACGTacgtNn"))
|
|
257
254
|
)
|
|
258
255
|
|
|
259
256
|
# Group( ) hides the setResultsName names def'd inside, such that one needs to first access this group and then access the dict of contents inside
|
|
260
|
-
SequenceEntry =
|
|
257
|
+
SequenceEntry = pp.Suppress(pp.Literal("ORIGIN")) + Sequence.setParseAction(
|
|
261
258
|
lambda s, l_, t: "".join(t)
|
|
262
259
|
).setResultsName("sequence")
|
|
263
260
|
|
|
@@ -266,13 +263,13 @@ SequenceEntry = _pp.Suppress(_pp.Literal("ORIGIN")) + Sequence.setParseAction(
|
|
|
266
263
|
# Final GenBank Parser
|
|
267
264
|
|
|
268
265
|
# GB files with multiple records split by "//" sequence at beginning of line
|
|
269
|
-
GBEnd =
|
|
266
|
+
GBEnd = pp.Literal("//")
|
|
270
267
|
|
|
271
268
|
# Begin w. LOCUS, slurp all entries, then stop at the end!
|
|
272
|
-
GB = LocusEntry +
|
|
269
|
+
GB = LocusEntry + pp.OneOrMore(FeaturesEntry | SequenceEntry | GenericEntry) + GBEnd
|
|
273
270
|
|
|
274
271
|
# NCBI often returns sets of GB files
|
|
275
|
-
multipleGB =
|
|
272
|
+
multipleGB = pp.OneOrMore(pp.Group(GB))
|
|
276
273
|
|
|
277
274
|
# ===============================================================================
|
|
278
275
|
# End Genbank Parser
|
|
@@ -284,7 +281,7 @@ multipleGB = _pp.OneOrMore(_pp.Group(GB))
|
|
|
284
281
|
|
|
285
282
|
|
|
286
283
|
def strip_indent(str):
|
|
287
|
-
whitespace =
|
|
284
|
+
whitespace = re.compile("[\n]{1}(COMMENT){0,1}[ ]+")
|
|
288
285
|
return whitespace.sub("\n", str)
|
|
289
286
|
|
|
290
287
|
|
|
@@ -588,9 +585,9 @@ def gbtext_clean(gbtext):
|
|
|
588
585
|
|
|
589
586
|
jseqlist = toJSON(gbtext)
|
|
590
587
|
jseq = jseqlist.pop()
|
|
591
|
-
from collections import namedtuple
|
|
592
|
-
from pydna._pretty import pretty_str as
|
|
588
|
+
from collections import namedtuple
|
|
589
|
+
from pydna._pretty import pretty_str as ps
|
|
593
590
|
|
|
594
|
-
Result =
|
|
595
|
-
result = Result(
|
|
591
|
+
Result = namedtuple("Result", "gbtext jseq")
|
|
592
|
+
result = Result(ps(toGB(jseq).strip()), jseq)
|
|
596
593
|
return result
|
pydna/ladders.py
CHANGED
|
@@ -16,17 +16,16 @@ a gel image. Exampel can be found in scripts/molecular_weight_standards.ods.
|
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
from pydna.fakeseq import FakeSeq
|
|
19
|
+
from pydna.fakeseq import FakeSeq
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
PennStateLadder = [
|
|
23
|
-
|
|
24
|
-
for n in (10000, 7750, 5000, 4000, 3000, 2000, 1500, 1000, 750, 500)
|
|
23
|
+
FakeSeq(int(n)) for n in (10000, 7750, 5000, 4000, 3000, 2000, 1500, 1000, 750, 500)
|
|
25
24
|
]
|
|
26
25
|
|
|
27
26
|
|
|
28
27
|
GeneRuler_1kb = [
|
|
29
|
-
|
|
28
|
+
FakeSeq(int(n))
|
|
30
29
|
for n in (
|
|
31
30
|
10000,
|
|
32
31
|
8000,
|
|
@@ -49,7 +48,7 @@ GeneRuler_1kb = [
|
|
|
49
48
|
# https://docs.google.com/spreadsheets/d/1vN0y75ibxPrG6yJQjq1uF2FXP0L-qGSn_fzInUHeTs4/edit#gid=0
|
|
50
49
|
|
|
51
50
|
GeneRuler_1kb_plus = [
|
|
52
|
-
|
|
51
|
+
FakeSeq(ln, n=n * 1e-15, rf=rf)
|
|
53
52
|
for ln, n, rf in (
|
|
54
53
|
# (length, fmol, Rf )
|
|
55
54
|
(20000, 1.538, 0.000),
|
|
@@ -72,7 +71,7 @@ GeneRuler_1kb_plus = [
|
|
|
72
71
|
|
|
73
72
|
|
|
74
73
|
HI_LO_DNA_MARKER = [
|
|
75
|
-
|
|
74
|
+
FakeSeq(ln, n=n * 1e-15, rf=rf)
|
|
76
75
|
for ln, n, rf in (
|
|
77
76
|
# (length, fmol, Rf )
|
|
78
77
|
(10000, 4.545, 0.000),
|
|
@@ -121,16 +120,16 @@ HI_LO_DNA_MARKER = [
|
|
|
121
120
|
|
|
122
121
|
FakeGel = [
|
|
123
122
|
[
|
|
124
|
-
|
|
125
|
-
|
|
123
|
+
FakeSeq(1000),
|
|
124
|
+
FakeSeq(2000),
|
|
126
125
|
],
|
|
127
126
|
[
|
|
128
|
-
|
|
129
|
-
|
|
127
|
+
FakeSeq(3000),
|
|
128
|
+
FakeSeq(4000),
|
|
130
129
|
],
|
|
131
130
|
[
|
|
132
|
-
|
|
133
|
-
|
|
131
|
+
FakeSeq(5000),
|
|
132
|
+
FakeSeq(6000),
|
|
134
133
|
],
|
|
135
134
|
PennStateLadder,
|
|
136
135
|
]
|