pydna 5.5.4__py3-none-any.whl → 5.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydna/__init__.py +30 -195
- pydna/_pretty.py +8 -8
- pydna/_thermodynamic_data.py +3 -3
- pydna/all.py +1 -12
- pydna/alphabet.py +995 -0
- pydna/amplicon.py +19 -24
- pydna/amplify.py +75 -95
- pydna/assembly.py +64 -81
- pydna/assembly2.py +375 -310
- pydna/codon.py +4 -4
- pydna/common_sub_strings.py +6 -8
- pydna/contig.py +203 -10
- pydna/design.py +176 -60
- pydna/dseq.py +1788 -718
- pydna/dseqrecord.py +197 -179
- pydna/gateway.py +6 -6
- pydna/gel.py +5 -5
- pydna/genbank.py +43 -46
- pydna/genbankfixer.py +89 -92
- pydna/ladders.py +11 -12
- pydna/oligonucleotide_hybridization.py +124 -0
- pydna/opencloning_models.py +187 -60
- pydna/parsers.py +45 -32
- pydna/primer.py +4 -4
- pydna/primer_screen.py +833 -0
- pydna/readers.py +14 -9
- pydna/seq.py +137 -47
- pydna/seqrecord.py +54 -62
- pydna/sequence_picker.py +2 -5
- pydna/sequence_regex.py +6 -6
- pydna/tm.py +17 -17
- pydna/types.py +19 -19
- pydna/utils.py +97 -75
- {pydna-5.5.4.dist-info → pydna-5.5.6.dist-info}/METADATA +8 -8
- pydna-5.5.6.dist-info/RECORD +42 -0
- {pydna-5.5.4.dist-info → pydna-5.5.6.dist-info}/WHEEL +1 -1
- pydna/conftest.py +0 -42
- pydna/download.py +0 -32
- pydna/genbankfile.py +0 -42
- pydna/genbankrecord.py +0 -168
- pydna/goldengate.py +0 -45
- pydna/ligate.py +0 -62
- pydna/user_cloning.py +0 -29
- pydna-5.5.4.dist-info/RECORD +0 -46
- {pydna-5.5.4.dist-info → pydna-5.5.6.dist-info}/licenses/LICENSE.txt +0 -0
pydna/dseq.py
CHANGED
|
@@ -1,10 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2013-2023 by Björn Johansson. All rights reserved.
|
|
5
|
-
# This code is part of the Python-dna distribution and governed by its
|
|
6
|
-
# license. Please see the LICENSE.txt file that should have been included
|
|
7
|
-
# as part of this package.
|
|
8
4
|
"""Provides the Dseq class for handling double stranded DNA sequences.
|
|
9
5
|
|
|
10
6
|
Dseq is a subclass of :class:`Bio.Seq.Seq`. The Dseq class
|
|
@@ -14,87 +10,217 @@ which can hold more meta data.
|
|
|
14
10
|
The Dseq class support the notion of circular and linear DNA topology.
|
|
15
11
|
"""
|
|
16
12
|
|
|
13
|
+
import itertools
|
|
14
|
+
import re
|
|
15
|
+
import copy
|
|
16
|
+
import sys
|
|
17
|
+
import math
|
|
18
|
+
import inspect
|
|
19
|
+
from typing import List, Tuple, Union
|
|
17
20
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
21
|
+
from Bio.Restriction import RestrictionBatch
|
|
22
|
+
from Bio.Restriction import CommOnly
|
|
23
|
+
|
|
24
|
+
from seguid import ldseguid
|
|
25
|
+
from seguid import cdseguid
|
|
26
|
+
|
|
27
|
+
from pydna.seq import Seq
|
|
28
|
+
from Bio.Seq import _SeqAbstractBaseClass
|
|
29
|
+
from Bio.Data.IUPACData import unambiguous_dna_weights
|
|
30
|
+
from Bio.Data.IUPACData import unambiguous_rna_weights
|
|
31
|
+
from Bio.Data.IUPACData import atom_weights
|
|
32
|
+
from pydna._pretty import pretty_str
|
|
33
|
+
from pydna.utils import rc
|
|
34
|
+
from pydna.utils import flatten
|
|
35
|
+
from pydna.utils import cuts_overlap
|
|
36
|
+
|
|
37
|
+
from pydna.alphabet import basepair_dict
|
|
38
|
+
from pydna.alphabet import dscode_to_watson_table
|
|
39
|
+
from pydna.alphabet import dscode_to_crick_table
|
|
40
|
+
from pydna.alphabet import regex_ds_melt_factory
|
|
41
|
+
from pydna.alphabet import regex_ss_melt_factory
|
|
42
|
+
from pydna.alphabet import dscode_to_full_sequence_table
|
|
43
|
+
from pydna.alphabet import dscode_to_watson_tail_table
|
|
44
|
+
from pydna.alphabet import dscode_to_crick_tail_table
|
|
45
|
+
from pydna.alphabet import complement_table_for_dscode
|
|
46
|
+
from pydna.alphabet import letters_not_in_dscode
|
|
47
|
+
from pydna.alphabet import get_parts
|
|
48
|
+
from pydna.alphabet import representation_tuple
|
|
49
|
+
from pydna.alphabet import dsbreaks
|
|
50
|
+
|
|
51
|
+
from pydna.common_sub_strings import common_sub_strings
|
|
52
|
+
from pydna.types import DseqType, EnzymesType, CutSiteType
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Sequences larger than this gets a truncated representation.
|
|
56
|
+
length_limit_for_repr = 30
|
|
57
|
+
placeholder = letters_not_in_dscode[-1]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class CircularBytes(bytes):
|
|
61
|
+
"""
|
|
62
|
+
A circular bytes sequence: indexing and slicing wrap around index 0.
|
|
63
|
+
"""
|
|
23
64
|
|
|
24
|
-
|
|
25
|
-
|
|
65
|
+
def __new__(cls, value: bytes | bytearray | memoryview):
|
|
66
|
+
return super().__new__(cls, bytes(value))
|
|
67
|
+
|
|
68
|
+
def __getitem__(self, key):
|
|
69
|
+
n = len(self)
|
|
70
|
+
if n == 0:
|
|
71
|
+
if isinstance(key, slice):
|
|
72
|
+
return self.__class__(b"")
|
|
73
|
+
raise IndexError("CircularBytes index out of range (empty bytes)")
|
|
74
|
+
|
|
75
|
+
if isinstance(key, int):
|
|
76
|
+
return super().__getitem__(key % n)
|
|
77
|
+
|
|
78
|
+
if isinstance(key, slice):
|
|
79
|
+
start, stop, step = key.start, key.stop, key.step
|
|
80
|
+
step = 1 if step is None else step
|
|
81
|
+
if step == 0:
|
|
82
|
+
raise ValueError("slice step cannot be zero")
|
|
83
|
+
|
|
84
|
+
if step > 0:
|
|
85
|
+
start = 0 if start is None else start
|
|
86
|
+
stop = n if stop is None else stop
|
|
87
|
+
while stop <= start:
|
|
88
|
+
stop += n
|
|
89
|
+
rng = range(start, stop, step)
|
|
90
|
+
else:
|
|
91
|
+
start = (n - 1) if start is None else start
|
|
92
|
+
stop = -1 if stop is None else stop
|
|
93
|
+
while stop >= start:
|
|
94
|
+
stop -= n
|
|
95
|
+
rng = range(start, stop, step)
|
|
96
|
+
|
|
97
|
+
limit = n if step % n == 0 else n * 2
|
|
98
|
+
out = bytearray()
|
|
99
|
+
count = 0
|
|
100
|
+
for i in rng:
|
|
101
|
+
out.append(super().__getitem__(i % n))
|
|
102
|
+
count += 1
|
|
103
|
+
if count > limit:
|
|
104
|
+
break
|
|
105
|
+
return self.__class__(bytes(out))
|
|
26
106
|
|
|
27
|
-
|
|
28
|
-
from seguid import ldseguid as _ldseguid
|
|
29
|
-
from seguid import cdseguid as _cdseguid
|
|
107
|
+
return super().__getitem__(key)
|
|
30
108
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
109
|
+
def cutaround(self, start: int, length: int) -> bytes:
|
|
110
|
+
"""
|
|
111
|
+
Return a circular slice of given length starting at index `start`.
|
|
112
|
+
Can exceed len(self), wrapping around as needed.
|
|
34
113
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
114
|
+
Examples
|
|
115
|
+
--------
|
|
116
|
+
s = CircularBytes(b"ABCDE")
|
|
117
|
+
assert s.cutaround(3, 7) == b"DEABCDE"
|
|
118
|
+
assert s.cutaround(-1, 4) == b"EABC"
|
|
119
|
+
"""
|
|
120
|
+
n = len(self)
|
|
121
|
+
if n == 0 or length <= 0:
|
|
122
|
+
return self.__class__(b"")
|
|
123
|
+
|
|
124
|
+
start %= n
|
|
125
|
+
out = bytearray()
|
|
126
|
+
for i in range(length):
|
|
127
|
+
out.append(self[(start + i) % n])
|
|
128
|
+
return self.__class__(bytes(out))
|
|
129
|
+
|
|
130
|
+
def find(
|
|
131
|
+
self,
|
|
132
|
+
sub: bytes | bytearray | memoryview | str,
|
|
133
|
+
start: int = 0,
|
|
134
|
+
end: int | None = None,
|
|
135
|
+
) -> int:
|
|
136
|
+
"""
|
|
137
|
+
Find a subsequence in the circular sequence, possibly
|
|
138
|
+
wrapping across the origin.
|
|
139
|
+
Returns -1 if not found.
|
|
140
|
+
"""
|
|
141
|
+
n = len(self)
|
|
142
|
+
if n == 0:
|
|
143
|
+
return -1
|
|
144
|
+
|
|
145
|
+
end = n if end is None else min(end, n)
|
|
146
|
+
doubled = self + self
|
|
147
|
+
try:
|
|
148
|
+
sub = sub.encode("ascii")
|
|
149
|
+
except AttributeError:
|
|
150
|
+
pass
|
|
151
|
+
|
|
152
|
+
pos = doubled.find(bytes(sub), start, n + len(sub) - 1)
|
|
153
|
+
|
|
154
|
+
if pos == -1 or pos >= n:
|
|
155
|
+
return -1
|
|
156
|
+
return pos
|
|
38
157
|
|
|
39
158
|
|
|
40
|
-
|
|
159
|
+
class Dseq(Seq):
|
|
160
|
+
"""Dseq describes a double stranded DNA fragment, linear or circular.
|
|
41
161
|
|
|
42
|
-
|
|
162
|
+
Dseq can be initiated in two ways, using two strings, each representing the
|
|
163
|
+
Watson (upper, sense) strand, the Crick (lower, antisense) strand and an
|
|
164
|
+
optional value describing the stagger betwen the strands on the left side (ovhg).
|
|
43
165
|
|
|
166
|
+
Alternatively, a single string represenation using dsIUPAC codes can be used.
|
|
167
|
+
If a single string is used, the letters of that string are interpreted as base
|
|
168
|
+
pairs rather than single bases. For example "A" would indicate the basepair
|
|
169
|
+
"A/T". An expanded IUPAC code is used where the letters PEXI have been assigned
|
|
170
|
+
to GATC on the Watson strand with no paring base on the Crick strand G/"", A/"",
|
|
171
|
+
T/"" and C/"". The letters QFZJ have been assigned the opposite base pairs with
|
|
172
|
+
an empty Watson strand ""/G, ""/A, ""/T, and ""/C.
|
|
173
|
+
|
|
174
|
+
::
|
|
175
|
+
|
|
176
|
+
PEXIGATCQFZJ would indicate the linear double-stranded fragment:
|
|
177
|
+
|
|
178
|
+
GATCGATC
|
|
179
|
+
CTAGCTAG
|
|
44
180
|
|
|
45
|
-
class Dseq(_Seq):
|
|
46
|
-
"""Dseq holds information for a double stranded DNA fragment.
|
|
47
181
|
|
|
48
|
-
Dseq also holds information describing the topology of
|
|
49
|
-
the DNA fragment (linear or circular).
|
|
50
182
|
|
|
51
183
|
Parameters
|
|
52
184
|
----------
|
|
53
185
|
watson : str
|
|
54
|
-
a string representing the
|
|
186
|
+
a string representing the Watson (sense) DNA strand or a basepair
|
|
187
|
+
represenation.
|
|
55
188
|
|
|
56
189
|
crick : str, optional
|
|
57
|
-
a string representing the
|
|
190
|
+
a string representing the Crick (antisense) DNA strand.
|
|
58
191
|
|
|
59
192
|
ovhg : int, optional
|
|
60
193
|
A positive or negative number to describe the stagger between the
|
|
61
|
-
|
|
194
|
+
Watson and Crick strands.
|
|
62
195
|
see below for a detailed explanation.
|
|
63
196
|
|
|
64
|
-
linear : bool, optional
|
|
65
|
-
True indicates that sequence is linear, False that it is circular.
|
|
66
|
-
|
|
67
197
|
circular : bool, optional
|
|
68
198
|
True indicates that sequence is circular, False that it is linear.
|
|
69
199
|
|
|
70
200
|
|
|
71
201
|
Examples
|
|
72
202
|
--------
|
|
73
|
-
Dseq is a subclass of the Biopython Seq
|
|
74
|
-
strings representing the
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
in the 5' end of the fragment.
|
|
203
|
+
Dseq is a subclass of the Biopython Bio.Seq.Seq class. The constructor
|
|
204
|
+
can accept two strings representing the Watson (sense) and Crick(antisense)
|
|
205
|
+
DNA strands. These are interpreted as single stranded DNA. There is a check
|
|
206
|
+
for complementarity between the strands.
|
|
78
207
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
There are three ways of creating a Dseq object directly listed below, but you can also
|
|
83
|
-
use the function Dseq.from_full_sequence_and_overhangs() to create a Dseq:
|
|
208
|
+
If the DNA molecule is staggered on the left side, an integer ovhg
|
|
209
|
+
(overhang) must be given, describing the stagger between the Watson and Crick strand
|
|
210
|
+
in the 5' end of the fragment.
|
|
84
211
|
|
|
85
|
-
|
|
212
|
+
Additionally, the optional boolean parameter circular can be given to indicate if the
|
|
213
|
+
DNA molecule is circular.
|
|
86
214
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
ttt
|
|
215
|
+
The most common usage of the Dseq class is probably not to use it directly, but to
|
|
216
|
+
create it as part of a Dseqrecord object (see :class:`pydna.dseqrecord.Dseqrecord`).
|
|
217
|
+
This works in the same way as for the relationship between the :class:`Bio.Seq.Seq` and
|
|
218
|
+
:class:`Bio.SeqRecord.SeqRecord` classes in Biopython.
|
|
92
219
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
is created automatically from the watson strand.
|
|
220
|
+
There are multiple ways of creating a Dseq object directly listed below, but you can also
|
|
221
|
+
use the function Dseq.from_full_sequence_and_overhangs() to create a Dseq:
|
|
96
222
|
|
|
97
|
-
Two arguments (string, string):
|
|
223
|
+
Two arguments (string, string), no overhang provided:
|
|
98
224
|
|
|
99
225
|
>>> from pydna.dseq import Dseq
|
|
100
226
|
>>> Dseq("gggaaat","ttt")
|
|
@@ -102,16 +228,14 @@ class Dseq(_Seq):
|
|
|
102
228
|
gggaaat
|
|
103
229
|
ttt
|
|
104
230
|
|
|
105
|
-
If
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
slow. The length of the annealing sequences have to be at least
|
|
109
|
-
half the length of the shortest of the strands.
|
|
231
|
+
If Watson and Crick are given, but not ovhg, an attempt will be made to find the best annealing
|
|
232
|
+
between the strands. There are important limitations to this. If there are several ways to
|
|
233
|
+
anneal the strands, this will fail. For long fragments it is quite slow.
|
|
110
234
|
|
|
111
235
|
Three arguments (string, string, ovhg=int):
|
|
112
236
|
|
|
113
|
-
The ovhg parameter is an integer describing the length of the
|
|
114
|
-
|
|
237
|
+
The ovhg parameter is an integer describing the length of the Crick strand overhang on the
|
|
238
|
+
left side (the 5' end of Watson strand).
|
|
115
239
|
|
|
116
240
|
The ovhg parameter controls the stagger at the five prime end::
|
|
117
241
|
|
|
@@ -134,53 +258,51 @@ class Dseq(_Seq):
|
|
|
134
258
|
|
|
135
259
|
Example of creating Dseq objects with different amounts of stagger:
|
|
136
260
|
|
|
137
|
-
>>> Dseq(watson="
|
|
261
|
+
>>> Dseq(watson="att", crick="acata", ovhg=-2)
|
|
138
262
|
Dseq(-7)
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
>>> Dseq(watson="
|
|
263
|
+
att
|
|
264
|
+
ataca
|
|
265
|
+
>>> Dseq(watson="ata",crick="acata",ovhg=-1)
|
|
142
266
|
Dseq(-6)
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
>>> Dseq(watson="
|
|
267
|
+
ata
|
|
268
|
+
ataca
|
|
269
|
+
>>> Dseq(watson="taa",crick="actta",ovhg=0)
|
|
146
270
|
Dseq(-5)
|
|
147
|
-
|
|
271
|
+
taa
|
|
148
272
|
attca
|
|
149
|
-
>>> Dseq(watson="
|
|
273
|
+
>>> Dseq(watson="aag",crick="actta",ovhg=1)
|
|
150
274
|
Dseq(-5)
|
|
151
|
-
|
|
275
|
+
aag
|
|
152
276
|
attca
|
|
153
277
|
>>> Dseq(watson="agt",crick="actta",ovhg=2)
|
|
154
278
|
Dseq(-5)
|
|
155
279
|
agt
|
|
156
280
|
attca
|
|
157
281
|
|
|
158
|
-
If the ovhg parameter is specified a
|
|
159
|
-
|
|
282
|
+
If the ovhg parameter is specified a Crick strand also needs to be supplied, or
|
|
283
|
+
an exception is raised.
|
|
160
284
|
|
|
161
285
|
>>> Dseq(watson="agt", ovhg=2)
|
|
162
286
|
Traceback (most recent call last):
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
else:
|
|
166
|
-
ValueError: ovhg defined without crick strand!
|
|
167
|
-
|
|
287
|
+
...
|
|
288
|
+
ValueError: ovhg (overhang) defined without a crick strand.
|
|
168
289
|
|
|
169
|
-
The shape of the fragment is set by circular = True, False
|
|
170
290
|
|
|
171
|
-
|
|
172
|
-
circular = True.
|
|
291
|
+
The shape or topology of the fragment is set by the circular parameter, True or False (default).
|
|
173
292
|
|
|
174
|
-
|
|
175
|
-
>>> Dseq("aaa","ttt")
|
|
293
|
+
>>> Dseq("aaa", "ttt", ovhg = 0) # A linear sequence by default
|
|
176
294
|
Dseq(-3)
|
|
177
295
|
aaa
|
|
178
296
|
ttt
|
|
179
|
-
>>> Dseq("aaa","ttt",ovhg=0)
|
|
297
|
+
>>> Dseq("aaa", "ttt", ovhg = 0, circular = False) # A linear sequence if circular is False
|
|
180
298
|
Dseq(-3)
|
|
181
299
|
aaa
|
|
182
300
|
ttt
|
|
183
|
-
>>> Dseq("aaa","ttt",ovhg=
|
|
301
|
+
>>> Dseq("aaa", "ttt", ovhg = 0, circular = True) # A circular sequence
|
|
302
|
+
Dseq(o3)
|
|
303
|
+
aaa
|
|
304
|
+
ttt
|
|
305
|
+
>>> Dseq("aaa", "ttt", ovhg=1, circular = False)
|
|
184
306
|
Dseq(-4)
|
|
185
307
|
aaa
|
|
186
308
|
ttt
|
|
@@ -210,6 +332,18 @@ class Dseq(_Seq):
|
|
|
210
332
|
-4
|
|
211
333
|
>>>
|
|
212
334
|
|
|
335
|
+
|
|
336
|
+
dsIUPAC [#]_ is an nn extension to the IUPAC alphabet used to describe ss regions:
|
|
337
|
+
|
|
338
|
+
::
|
|
339
|
+
|
|
340
|
+
aaaGATC GATCccc ad-hoc representations
|
|
341
|
+
CTAGttt gggCTAG
|
|
342
|
+
|
|
343
|
+
QFZJaaaPEXI PEXIcccQFZJ dsIUPAC
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
|
|
213
347
|
Coercing to string
|
|
214
348
|
|
|
215
349
|
>>> str(a)
|
|
@@ -295,46 +429,76 @@ class Dseq(_Seq):
|
|
|
295
429
|
|
|
296
430
|
"""
|
|
297
431
|
|
|
298
|
-
trunc = 30
|
|
299
|
-
|
|
300
432
|
def __init__(
|
|
301
433
|
self,
|
|
302
|
-
watson:
|
|
303
|
-
crick:
|
|
434
|
+
watson: Union[str, bytes],
|
|
435
|
+
crick: Union[str, bytes, None] = None,
|
|
304
436
|
ovhg=None,
|
|
305
437
|
circular=False,
|
|
306
438
|
pos=0,
|
|
307
439
|
):
|
|
308
|
-
if isinstance(watson, bytes):
|
|
309
|
-
watson
|
|
310
|
-
|
|
311
|
-
|
|
440
|
+
if isinstance(watson, (bytes, bytearray)):
|
|
441
|
+
# watson is decoded to a string if needed.
|
|
442
|
+
watson = watson.decode("ascii")
|
|
443
|
+
if isinstance(crick, (bytes, bytearray)):
|
|
444
|
+
# crick is decoded to a string if needed.
|
|
445
|
+
crick = crick.decode("ascii")
|
|
312
446
|
|
|
313
447
|
if crick is None:
|
|
314
448
|
if ovhg is not None:
|
|
315
|
-
raise ValueError("ovhg defined without crick strand
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
449
|
+
raise ValueError("ovhg (overhang) defined without a crick strand.")
|
|
450
|
+
"""
|
|
451
|
+
Giving only the watson string implies inferring the Crick complementary strand
|
|
452
|
+
from the Watson sequence. The watson string can contain dscode letters wich will
|
|
453
|
+
be interpreted as outlined in the pydna.alphabet module.
|
|
454
|
+
|
|
455
|
+
The _data property must be a byte string for compatibility with
|
|
456
|
+
Biopython Bio.Seq.Seq
|
|
457
|
+
"""
|
|
458
|
+
data = watson
|
|
459
|
+
self._data = data.encode("ascii")
|
|
319
460
|
|
|
320
|
-
else:
|
|
321
|
-
|
|
322
|
-
|
|
461
|
+
else:
|
|
462
|
+
"""
|
|
463
|
+
Crick strand given, ovhg is optional. An important consequence is that the
|
|
464
|
+
watson and crick strands are interpreted as single stranded DNA that is
|
|
465
|
+
supposed to anneal.
|
|
466
|
+
|
|
467
|
+
If ovhg was not given, we try to guess the value below. This will fail
|
|
468
|
+
if there are two or more ways to anneal with equal length of the double
|
|
469
|
+
stranded part.
|
|
470
|
+
"""
|
|
471
|
+
if ovhg is None: # ovhg not given, try to guess from sequences
|
|
472
|
+
limit = int(math.log(len(watson)) / math.log(4))
|
|
473
|
+
olaps = common_sub_strings(
|
|
323
474
|
str(watson).lower(),
|
|
324
|
-
str(
|
|
325
|
-
|
|
475
|
+
str(rc(crick).lower()),
|
|
476
|
+
limit,
|
|
326
477
|
)
|
|
478
|
+
|
|
479
|
+
"""No overlaps found, strands do not anneal"""
|
|
327
480
|
if len(olaps) == 0:
|
|
328
481
|
raise ValueError(
|
|
329
|
-
"Could not anneal the two strands."
|
|
482
|
+
"Could not anneal the two strands."
|
|
483
|
+
f" looked for annealing with at least {limit} basepairs"
|
|
484
|
+
" Please provide and overhang value (ovhg parameter)"
|
|
330
485
|
)
|
|
331
486
|
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
487
|
+
"""
|
|
488
|
+
We extract the positions and length of the first (longest) overlap,
|
|
489
|
+
since common_sub_strings sorts the overlaps by length, longest first.
|
|
490
|
+
"""
|
|
335
491
|
|
|
336
|
-
|
|
337
|
-
|
|
492
|
+
(pos_watson, pos_crick, longest_olap_length), *rest = olaps
|
|
493
|
+
|
|
494
|
+
"""
|
|
495
|
+
We see if there is another overlap of the same length
|
|
496
|
+
This means that annealing is ambigous. User should provide
|
|
497
|
+
and ovhg value.
|
|
498
|
+
"""
|
|
499
|
+
if any(
|
|
500
|
+
olap_length >= longest_olap_length for _, _, olap_length in rest
|
|
501
|
+
):
|
|
338
502
|
raise ValueError(
|
|
339
503
|
"More than one way of annealing the"
|
|
340
504
|
" strands. Please provide ovhg value"
|
|
@@ -342,120 +506,80 @@ class Dseq(_Seq):
|
|
|
342
506
|
|
|
343
507
|
ovhg = pos_crick - pos_watson
|
|
344
508
|
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
]
|
|
354
|
-
),
|
|
355
|
-
encoding="ASCII",
|
|
356
|
-
)
|
|
509
|
+
"""
|
|
510
|
+
Pad both strands on left side ovhg spaces
|
|
511
|
+
a negative number gives no padding,
|
|
512
|
+
"""
|
|
513
|
+
sense = ovhg * " " + watson
|
|
514
|
+
antisense = -ovhg * " " + crick[::-1]
|
|
515
|
+
|
|
516
|
+
max_len = max(len(sense), len(antisense))
|
|
357
517
|
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
_rc(crick[-ovhg:])
|
|
375
|
-
+ watson
|
|
376
|
-
+ _rc(crick[: len(crick) - ovhg - len(watson)]),
|
|
377
|
-
encoding="ASCII",
|
|
378
|
-
)
|
|
379
|
-
else: # ovhg < 0
|
|
380
|
-
if -ovhg + len(crick) > len(watson):
|
|
381
|
-
self._data = bytes(
|
|
382
|
-
watson + _rc(crick[: -ovhg + len(crick) - len(watson)]),
|
|
383
|
-
encoding="ASCII",
|
|
384
|
-
)
|
|
385
|
-
else:
|
|
386
|
-
self._data = bytes(watson, encoding="ASCII")
|
|
518
|
+
"""pad both strands on right side to same size."""
|
|
519
|
+
sense = sense.ljust(max_len)
|
|
520
|
+
antisense = antisense.ljust(max_len)
|
|
521
|
+
"""both strands padded so that bsepairs align"""
|
|
522
|
+
assert len(sense) == len(antisense)
|
|
523
|
+
|
|
524
|
+
data = []
|
|
525
|
+
|
|
526
|
+
for w, c in zip(sense, antisense):
|
|
527
|
+
try:
|
|
528
|
+
data.append(basepair_dict[w, c])
|
|
529
|
+
except KeyError as err:
|
|
530
|
+
print(f"Base mismatch in representation {err}")
|
|
531
|
+
raise ValueError(f"Base mismatch in representation: {err}")
|
|
532
|
+
data = "".join(data).strip()
|
|
533
|
+
self._data = data.encode("ascii")
|
|
387
534
|
|
|
388
535
|
self.circular = circular
|
|
389
|
-
self.watson = _pretty_str(watson)
|
|
390
|
-
self.crick = _pretty_str(crick)
|
|
391
|
-
self.length = len(self._data)
|
|
392
|
-
self.ovhg = ovhg
|
|
393
536
|
self.pos = pos
|
|
394
537
|
|
|
538
|
+
if circular:
|
|
539
|
+
data += data[0:1]
|
|
540
|
+
|
|
541
|
+
dsb = dsbreaks(data)
|
|
542
|
+
|
|
543
|
+
if dsb:
|
|
544
|
+
msg = "".join(dsb)
|
|
545
|
+
raise ValueError(
|
|
546
|
+
f"Molecule is internally split in {len(dsb)} location(s):\n\n{msg}".strip()
|
|
547
|
+
)
|
|
548
|
+
|
|
395
549
|
@classmethod
|
|
396
|
-
def quick(
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
):
|
|
404
|
-
obj = cls.__new__(cls) # Does not call __init__
|
|
405
|
-
obj.watson = _pretty_str(watson)
|
|
406
|
-
obj.crick = _pretty_str(crick)
|
|
407
|
-
obj.ovhg = ovhg
|
|
550
|
+
def quick(cls, data: bytes, *args, circular=False, pos=0, **kwargs):
|
|
551
|
+
"""Fastest way to instantiate an object of the Dseq class.
|
|
552
|
+
|
|
553
|
+
No checks of parameters are made.
|
|
554
|
+
Does not call Bio.Seq.Seq.__init__() which has lots of time consuming checks.
|
|
555
|
+
"""
|
|
556
|
+
obj = cls.__new__(cls)
|
|
408
557
|
obj.circular = circular
|
|
409
|
-
obj.length = max(len(watson) + max(0, ovhg), len(crick) + max(0, -ovhg))
|
|
410
558
|
obj.pos = pos
|
|
411
|
-
|
|
412
|
-
cb = bytes(crick, encoding="ASCII")
|
|
413
|
-
obj._data = (
|
|
414
|
-
_rc(cb[-max(0, ovhg) or len(cb) :])
|
|
415
|
-
+ wb
|
|
416
|
-
+ _rc(cb[: max(0, len(cb) - ovhg - len(wb))])
|
|
417
|
-
)
|
|
418
|
-
return obj
|
|
559
|
+
obj._data = data
|
|
419
560
|
|
|
420
|
-
@classmethod
|
|
421
|
-
def from_string(
|
|
422
|
-
cls,
|
|
423
|
-
dna: str,
|
|
424
|
-
*args,
|
|
425
|
-
# linear=True,
|
|
426
|
-
circular=False,
|
|
427
|
-
**kwargs,
|
|
428
|
-
):
|
|
429
|
-
obj = cls.__new__(cls) # Does not call __init__
|
|
430
|
-
obj.watson = _pretty_str(dna)
|
|
431
|
-
obj.crick = _pretty_str(_rc(dna))
|
|
432
|
-
obj.ovhg = 0
|
|
433
|
-
obj.circular = circular
|
|
434
|
-
# obj._linear = linear
|
|
435
|
-
obj.length = len(dna)
|
|
436
|
-
obj.pos = 0
|
|
437
|
-
obj._data = bytes(dna, encoding="ASCII")
|
|
438
561
|
return obj
|
|
439
562
|
|
|
440
563
|
@classmethod
|
|
441
564
|
def from_representation(cls, dsdna: str, *args, **kwargs):
|
|
442
|
-
obj = cls.__new__(cls)
|
|
443
|
-
w, c, *r = [ln for ln in dsdna.splitlines() if ln]
|
|
444
|
-
ovhg = obj.ovhg = len(w) - len(w.lstrip()) - (len(c) - len(c.lstrip()))
|
|
445
|
-
watson = obj.watson = _pretty_str(w.strip())
|
|
446
|
-
crick = obj.crick = _pretty_str(c.strip()[::-1])
|
|
565
|
+
obj = cls.__new__(cls)
|
|
447
566
|
obj.circular = False
|
|
448
|
-
# obj._linear = True
|
|
449
|
-
obj.length = max(len(watson) + max(0, ovhg), len(crick) + max(0, -ovhg))
|
|
450
567
|
obj.pos = 0
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
)
|
|
458
|
-
|
|
568
|
+
clean = inspect.cleandoc("\n" + dsdna)
|
|
569
|
+
watson, crick = [
|
|
570
|
+
ln
|
|
571
|
+
for ln in clean.splitlines()
|
|
572
|
+
if ln.strip() and not ln.strip().startswith("Dseq(")
|
|
573
|
+
]
|
|
574
|
+
ovhgw = len(watson) - len(watson.lstrip())
|
|
575
|
+
ovhgc = -(len(crick) - len(crick.lstrip()))
|
|
576
|
+
|
|
577
|
+
ovhg = ovhgw or ovhgc
|
|
578
|
+
|
|
579
|
+
watson = watson.strip()
|
|
580
|
+
crick = crick.strip()[::-1]
|
|
581
|
+
|
|
582
|
+
return Dseq(watson, crick, ovhg)
|
|
459
583
|
|
|
460
584
|
@classmethod
|
|
461
585
|
def from_full_sequence_and_overhangs(
|
|
@@ -522,111 +646,177 @@ class Dseq(_Seq):
|
|
|
522
646
|
|
|
523
647
|
return Dseq(watson, crick=crick, ovhg=crick_ovhg)
|
|
524
648
|
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
# each other"""
|
|
530
|
-
# return self._ovhg
|
|
531
|
-
|
|
532
|
-
# @property
|
|
533
|
-
# def linear(self):
|
|
534
|
-
# """The linear property can not be set directly.
|
|
535
|
-
# Use an empty slice [:] to create a linear object."""
|
|
536
|
-
# return self._linear
|
|
537
|
-
|
|
538
|
-
# @property
|
|
539
|
-
# def circular(self):
|
|
540
|
-
# """The circular property can not be set directly.
|
|
541
|
-
# Use :meth:`looped` to create a circular Dseq object"""
|
|
542
|
-
# return self._circular
|
|
649
|
+
@property
|
|
650
|
+
def watson(self) -> str:
|
|
651
|
+
"""
|
|
652
|
+
The watson (upper) strand of the double stranded fragment 5'-3'.
|
|
543
653
|
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
MW = (A x 313.2) + (T x 304.2) +
|
|
549
|
-
(C x 289.2) + (G x 329.2) +
|
|
550
|
-
(N x 308.9) + 79.0
|
|
551
|
-
"""
|
|
552
|
-
nts = (self.watson + self.crick).lower()
|
|
553
|
-
|
|
554
|
-
return (
|
|
555
|
-
313.2 * nts.count("a")
|
|
556
|
-
+ 304.2 * nts.count("t")
|
|
557
|
-
+ 289.2 * nts.count("c")
|
|
558
|
-
+ 329.2 * nts.count("g")
|
|
559
|
-
+ 308.9 * nts.count("n")
|
|
560
|
-
+ 79.0
|
|
561
|
-
)
|
|
654
|
+
Returns
|
|
655
|
+
-------
|
|
656
|
+
TYPE
|
|
657
|
+
DESCRIPTION.
|
|
562
658
|
|
|
563
|
-
|
|
564
|
-
""
|
|
659
|
+
"""
|
|
660
|
+
return self._data.decode("ascii").translate(dscode_to_watson_table).strip()
|
|
565
661
|
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
aAa
|
|
571
|
-
tTt
|
|
572
|
-
>>> my_seq.upper()
|
|
573
|
-
Dseq(-3)
|
|
574
|
-
AAA
|
|
575
|
-
TTT
|
|
662
|
+
@property
|
|
663
|
+
def crick(self) -> str:
|
|
664
|
+
"""
|
|
665
|
+
The crick (lower) strand of the double stranded fragment 5'-3'.
|
|
576
666
|
|
|
577
667
|
Returns
|
|
578
668
|
-------
|
|
579
|
-
|
|
580
|
-
|
|
669
|
+
TYPE
|
|
670
|
+
DESCRIPTION.
|
|
581
671
|
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
pydna.dseq.Dseq.lower
|
|
672
|
+
"""
|
|
673
|
+
return self._data.decode("ascii").translate(dscode_to_crick_table).strip()[::-1]
|
|
585
674
|
|
|
675
|
+
@property
|
|
676
|
+
def left_ovhg(self) -> int:
|
|
586
677
|
"""
|
|
587
|
-
|
|
588
|
-
self.watson.upper(),
|
|
589
|
-
self.crick.upper(),
|
|
590
|
-
ovhg=self.ovhg,
|
|
591
|
-
# linear=self.linear,
|
|
592
|
-
circular=self.circular,
|
|
593
|
-
pos=self.pos,
|
|
594
|
-
)
|
|
678
|
+
The 5' overhang of the lower strand compared the the upper.
|
|
595
679
|
|
|
596
|
-
|
|
597
|
-
"""Return a lower case copy of the sequence.
|
|
680
|
+
See module docstring for more information.
|
|
598
681
|
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
682
|
+
Returns
|
|
683
|
+
-------
|
|
684
|
+
TYPE
|
|
685
|
+
DESCRIPTION.
|
|
686
|
+
|
|
687
|
+
"""
|
|
688
|
+
parts = self.get_parts()
|
|
689
|
+
if parts.single_watson or parts.single_crick:
|
|
690
|
+
return None
|
|
691
|
+
return -len(parts.sticky_left5) or len(parts.sticky_left3)
|
|
692
|
+
|
|
693
|
+
ovhg = left_ovhg
|
|
694
|
+
|
|
695
|
+
@property
|
|
696
|
+
def right_ovhg(self) -> int:
|
|
697
|
+
"""Overhang at the right side (end)."""
|
|
698
|
+
parts = self.get_parts()
|
|
699
|
+
if parts.single_watson or parts.single_crick:
|
|
700
|
+
return None
|
|
701
|
+
return -len(parts.sticky_right5) or len(parts.sticky_right3)
|
|
702
|
+
|
|
703
|
+
watson_ovhg = right_ovhg
|
|
704
|
+
|
|
705
|
+
def __str__(self) -> str:
|
|
706
|
+
"""
|
|
707
|
+
A string representation of the sequence. The returned string
|
|
708
|
+
is the watson strand of a blunt version of the sequence.
|
|
709
|
+
|
|
710
|
+
>>> ds = Dseq.from_representation(
|
|
711
|
+
... '''
|
|
712
|
+
... GAATTC
|
|
713
|
+
... TAA
|
|
714
|
+
... ''')
|
|
715
|
+
|
|
716
|
+
>>> str(ds)
|
|
717
|
+
'GAATTC'
|
|
718
|
+
>>> ds = Dseq.from_representation(
|
|
719
|
+
... '''
|
|
720
|
+
... ATT
|
|
721
|
+
... CTTAAG
|
|
722
|
+
... ''')
|
|
723
|
+
|
|
724
|
+
>>> str(ds)
|
|
725
|
+
'GAATTC'
|
|
609
726
|
|
|
610
727
|
Returns
|
|
611
728
|
-------
|
|
612
|
-
|
|
613
|
-
|
|
729
|
+
str
|
|
730
|
+
A string representation of the sequence.
|
|
614
731
|
|
|
615
|
-
|
|
732
|
+
"""
|
|
733
|
+
return bytes(self).decode("ascii")
|
|
734
|
+
|
|
735
|
+
to_blunt_string = __str__ # alias of __str__ # TODO: consider removing
|
|
736
|
+
|
|
737
|
+
def __bytes__(self) -> bytes:
|
|
738
|
+
return self._data.translate(dscode_to_full_sequence_table)
|
|
739
|
+
|
|
740
|
+
def mw(self) -> float:
|
|
741
|
+
"""The molecular weight of the DNA/RNA molecule in g/mol.
|
|
742
|
+
|
|
743
|
+
The molecular weight data in Biopython Bio.Data.IUPACData
|
|
744
|
+
is used. The DNA is assumed to have a 5'-phosphate as many
|
|
745
|
+
DNA fragments from restriction digestion do:
|
|
746
|
+
|
|
747
|
+
::
|
|
748
|
+
|
|
749
|
+
P - G-A-T-T-A-C-A - OH
|
|
750
|
+
| | | | | | |
|
|
751
|
+
OH - C-T-A-A-T-G-T - P
|
|
752
|
+
|
|
753
|
+
The molecular weights listed in the unambiguous_dna_weights
|
|
754
|
+
dictionary refers to free monophosphate nucleotides.
|
|
755
|
+
One water molecule is removed for every phopshodiester bond
|
|
756
|
+
formed between nucleotides. For linear molecules, the weight
|
|
757
|
+
of one water molecule is added to account for the terminal
|
|
758
|
+
hydroxyl group and a hydrogen on the 5' terminal phosphate
|
|
759
|
+
group.
|
|
760
|
+
|
|
761
|
+
::
|
|
762
|
+
|
|
763
|
+
P - G---A---T - OH P - C---A - OH
|
|
764
|
+
| | | | |
|
|
765
|
+
OH - C---T---A---A---T---G---T - P
|
|
766
|
+
|
|
767
|
+
If the DNA is discontinuous, the internal 5'- end is assumed
|
|
768
|
+
to have a phosphate and the 3'- a hydroxyl group:
|
|
769
|
+
|
|
770
|
+
|
|
771
|
+
Examples
|
|
616
772
|
--------
|
|
617
|
-
pydna.dseq
|
|
618
|
-
""
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
)
|
|
773
|
+
>>> from pydna.dseq import Dseq
|
|
774
|
+
>>> ds_lin_obj = Dseq("GATTACA")
|
|
775
|
+
>>> ds_lin_obj
|
|
776
|
+
Dseq(-7)
|
|
777
|
+
GATTACA
|
|
778
|
+
CTAATGT
|
|
779
|
+
>>> round(ds_lin_obj.mw(), 1)
|
|
780
|
+
4359.8
|
|
781
|
+
>>> ds_circ_obj = Dseq("GATTACA", circular = True)
|
|
782
|
+
>>> round(ds_circ_obj.mw(), 1)
|
|
783
|
+
4323.8
|
|
784
|
+
>>> ssobj = Dseq("PEXXEIE")
|
|
785
|
+
>>> ssobj
|
|
786
|
+
Dseq(-7)
|
|
787
|
+
GATTACA
|
|
788
|
+
<BLANKLINE>
|
|
789
|
+
>>> round(ssobj.mw(), 1)
|
|
790
|
+
2184.4
|
|
791
|
+
>>> ds_lin_obj2 = Dseq("GATZFCA")
|
|
792
|
+
>>> ds_lin_obj2
|
|
793
|
+
Dseq(-7)
|
|
794
|
+
GAT CA
|
|
795
|
+
CTAATGT
|
|
796
|
+
>>> round(ds_lin_obj2.mw(), 1)
|
|
797
|
+
3724.4
|
|
798
|
+
"""
|
|
799
|
+
|
|
800
|
+
h2o = atom_weights["H"] * 2 + atom_weights["O"]
|
|
801
|
+
|
|
802
|
+
mwd = unambiguous_rna_weights | unambiguous_dna_weights | {" ": 0}
|
|
803
|
+
|
|
804
|
+
watsn_weight = sum(mwd[nt] - h2o for nt in self.watson.upper())
|
|
805
|
+
crick_weight = sum(mwd[nt] - h2o for nt in self.crick.upper())
|
|
806
|
+
|
|
807
|
+
watsn_weight += h2o * len(re.findall(r" +", self.watson))
|
|
808
|
+
crick_weight += h2o * len(re.findall(r" +", self.crick))
|
|
809
|
+
|
|
810
|
+
if watsn_weight and not self.circular:
|
|
811
|
+
watsn_weight += h2o
|
|
812
|
+
|
|
813
|
+
if crick_weight and not self.circular:
|
|
814
|
+
crick_weight += h2o
|
|
815
|
+
|
|
816
|
+
return watsn_weight + crick_weight
|
|
627
817
|
|
|
628
818
|
def find(
|
|
629
|
-
self, sub:
|
|
819
|
+
self, sub: Union[_SeqAbstractBaseClass, str, bytes], start=0, end=sys.maxsize
|
|
630
820
|
) -> int:
|
|
631
821
|
"""This method behaves like the python string method of the same name.
|
|
632
822
|
|
|
@@ -635,6 +825,8 @@ class Dseq(_Seq):
|
|
|
635
825
|
|
|
636
826
|
Returns -1 if the subsequence is NOT found.
|
|
637
827
|
|
|
828
|
+
The search is case sensitive.
|
|
829
|
+
|
|
638
830
|
Parameters
|
|
639
831
|
----------
|
|
640
832
|
|
|
@@ -650,80 +842,51 @@ class Dseq(_Seq):
|
|
|
650
842
|
Examples
|
|
651
843
|
--------
|
|
652
844
|
>>> from pydna.dseq import Dseq
|
|
653
|
-
>>> seq = Dseq("
|
|
845
|
+
>>> seq = Dseq("agtaagt")
|
|
654
846
|
>>> seq
|
|
655
|
-
Dseq(-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
>>> seq.find("
|
|
659
|
-
|
|
660
|
-
>>> seq = Dseq(watson="
|
|
847
|
+
Dseq(-7)
|
|
848
|
+
agtaagt
|
|
849
|
+
tcattca
|
|
850
|
+
>>> seq.find("taa")
|
|
851
|
+
2
|
|
852
|
+
>>> seq = Dseq(watson="agta",crick="actta",ovhg=-2)
|
|
661
853
|
>>> seq
|
|
662
854
|
Dseq(-7)
|
|
663
|
-
|
|
855
|
+
agta
|
|
664
856
|
attca
|
|
665
857
|
>>> seq.find("taa")
|
|
858
|
+
-1
|
|
859
|
+
>>> seq = Dseq(watson="agta",crick="actta",ovhg=-2)
|
|
860
|
+
>>> seq
|
|
861
|
+
Dseq(-7)
|
|
862
|
+
agta
|
|
863
|
+
attca
|
|
864
|
+
>>> seq.find("ta")
|
|
666
865
|
2
|
|
667
866
|
"""
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
return _Seq.find(self, sub, start, end)
|
|
671
|
-
|
|
672
|
-
return (_pretty_str(self) + _pretty_str(self)).find(sub, start, end)
|
|
673
|
-
|
|
674
|
-
def __getitem__(self, sl: slice) -> "Dseq":
|
|
675
|
-
"""Returns a subsequence. This method is used by the slice notation"""
|
|
676
|
-
|
|
677
|
-
if not self.circular:
|
|
678
|
-
x = len(self.crick) - self.ovhg - len(self.watson)
|
|
679
|
-
|
|
680
|
-
sns = (self.ovhg * " " + self.watson + x * " ")[sl]
|
|
681
|
-
asn = (-self.ovhg * " " + self.crick[::-1] + -x * " ")[sl]
|
|
682
|
-
|
|
683
|
-
ovhg = max(
|
|
684
|
-
(len(sns) - len(sns.lstrip()), -len(asn) + len(asn.lstrip())), key=abs
|
|
685
|
-
)
|
|
686
|
-
|
|
687
|
-
return Dseq(
|
|
688
|
-
sns.strip(),
|
|
689
|
-
asn[::-1].strip(),
|
|
690
|
-
ovhg=ovhg,
|
|
691
|
-
# linear=True
|
|
692
|
-
)
|
|
867
|
+
if self.circular:
|
|
868
|
+
result = CircularBytes(self._data).find(sub, start, end)
|
|
693
869
|
else:
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
self.watson[sl],
|
|
700
|
-
self.crick[::-1][sl][::-1],
|
|
701
|
-
ovhg=0,
|
|
702
|
-
# linear=True
|
|
703
|
-
)
|
|
704
|
-
else:
|
|
705
|
-
try:
|
|
706
|
-
stp = abs(sl.step)
|
|
707
|
-
except TypeError:
|
|
708
|
-
stp = 1
|
|
709
|
-
start = sl.start
|
|
710
|
-
stop = sl.stop
|
|
711
|
-
|
|
712
|
-
w = (
|
|
713
|
-
self.watson[(start or len(self)) :: stp]
|
|
714
|
-
+ self.watson[: (stop or 0) : stp]
|
|
715
|
-
)
|
|
716
|
-
c = (
|
|
717
|
-
self.crick[len(self) - stop :: stp]
|
|
718
|
-
+ self.crick[: len(self) - start : stp]
|
|
719
|
-
)
|
|
870
|
+
result = super().find(sub, start, end)
|
|
871
|
+
return result
|
|
872
|
+
|
|
873
|
+
def __contains__(self, sub: [str, bytes]) -> bool:
|
|
874
|
+
return self.find(sub) != -1
|
|
720
875
|
|
|
721
|
-
|
|
876
|
+
def __getitem__(self, sl: [slice, int]) -> DseqType:
|
|
877
|
+
if isinstance(sl, int):
|
|
878
|
+
sl = slice(sl, sl + 1, 1)
|
|
879
|
+
sl = slice(sl.start, sl.stop, sl.step)
|
|
880
|
+
if self.circular:
|
|
881
|
+
cb = CircularBytes(self._data)
|
|
882
|
+
return self.quick(cb[sl])
|
|
883
|
+
return super().__getitem__(sl)
|
|
722
884
|
|
|
723
885
|
def __eq__(self, other: DseqType) -> bool:
|
|
724
886
|
"""Compare to another Dseq object OR an object that implements
|
|
725
|
-
watson, crick and ovhg properties.
|
|
726
|
-
|
|
887
|
+
watson, crick and ovhg properties.
|
|
888
|
+
|
|
889
|
+
This comparison is case insensitive.
|
|
727
890
|
|
|
728
891
|
"""
|
|
729
892
|
try:
|
|
@@ -738,85 +901,15 @@ class Dseq(_Seq):
|
|
|
738
901
|
same = False
|
|
739
902
|
return same
|
|
740
903
|
|
|
741
|
-
def __repr__(self):
|
|
742
|
-
"""Returns a representation of the sequence, truncated if
|
|
743
|
-
longer than 30 bp"""
|
|
744
|
-
|
|
745
|
-
if len(self) > Dseq.trunc:
|
|
746
|
-
if self.ovhg > 0:
|
|
747
|
-
d = self.crick[-self.ovhg :][::-1]
|
|
748
|
-
hej = len(d)
|
|
749
|
-
if len(d) > 10:
|
|
750
|
-
d = "{}..{}".format(d[:4], d[-4:])
|
|
751
|
-
a = len(d) * " "
|
|
752
|
-
|
|
753
|
-
elif self.ovhg < 0:
|
|
754
|
-
a = self.watson[: max(0, -self.ovhg)]
|
|
755
|
-
hej = len(a)
|
|
756
|
-
if len(a) > 10:
|
|
757
|
-
a = "{}..{}".format(a[:4], a[-4:])
|
|
758
|
-
d = len(a) * " "
|
|
759
|
-
else:
|
|
760
|
-
a = ""
|
|
761
|
-
d = ""
|
|
762
|
-
hej = 0
|
|
763
|
-
|
|
764
|
-
x = self.ovhg + len(self.watson) - len(self.crick)
|
|
765
|
-
|
|
766
|
-
if x > 0:
|
|
767
|
-
c = self.watson[len(self.crick) - self.ovhg :]
|
|
768
|
-
y = len(c)
|
|
769
|
-
if len(c) > 10:
|
|
770
|
-
c = "{}..{}".format(c[:4], c[-4:])
|
|
771
|
-
f = len(c) * " "
|
|
772
|
-
elif x < 0:
|
|
773
|
-
f = self.crick[:-x][::-1]
|
|
774
|
-
y = len(f)
|
|
775
|
-
if len(f) > 10:
|
|
776
|
-
f = "{}..{}".format(f[:4], f[-4:])
|
|
777
|
-
c = len(f) * " "
|
|
778
|
-
else:
|
|
779
|
-
c = ""
|
|
780
|
-
f = ""
|
|
781
|
-
y = 0
|
|
782
|
-
|
|
783
|
-
L = len(self) - hej - y
|
|
784
|
-
x1 = -min(0, self.ovhg)
|
|
785
|
-
x2 = x1 + L
|
|
786
|
-
x3 = -min(0, x)
|
|
787
|
-
x4 = x3 + L
|
|
788
|
-
|
|
789
|
-
b = self.watson[x1:x2]
|
|
790
|
-
e = self.crick[x3:x4][::-1]
|
|
791
|
-
|
|
792
|
-
if len(b) > 10:
|
|
793
|
-
b = "{}..{}".format(b[:4], b[-4:])
|
|
794
|
-
e = "{}..{}".format(e[:4], e[-4:])
|
|
795
|
-
|
|
796
|
-
return _pretty_str(
|
|
797
|
-
"{klass}({top}{size})\n" "{a}{b}{c}\n" "{d}{e}{f}"
|
|
798
|
-
).format(
|
|
799
|
-
klass=self.__class__.__name__,
|
|
800
|
-
top={False: "-", True: "o"}[self.circular],
|
|
801
|
-
size=len(self),
|
|
802
|
-
a=a,
|
|
803
|
-
b=b,
|
|
804
|
-
c=c,
|
|
805
|
-
d=d,
|
|
806
|
-
e=e,
|
|
807
|
-
f=f,
|
|
808
|
-
)
|
|
904
|
+
def __repr__(self, lim: int = length_limit_for_repr) -> pretty_str:
|
|
809
905
|
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
-self.ovhg * " " + self.crick[::-1],
|
|
818
|
-
)
|
|
819
|
-
)
|
|
906
|
+
header = f"{self.__class__.__name__}({({False: '-', True: 'o'}[self.circular])}{len(self)})"
|
|
907
|
+
|
|
908
|
+
w, c = representation_tuple(
|
|
909
|
+
self._data.decode("ascii"), length_limit_for_repr=length_limit_for_repr
|
|
910
|
+
)
|
|
911
|
+
|
|
912
|
+
return pretty_str(header + "\n" + w + "\n" + c)
|
|
820
913
|
|
|
821
914
|
def reverse_complement(self) -> "Dseq":
|
|
822
915
|
"""Dseq object where watson and crick have switched places.
|
|
@@ -839,22 +932,29 @@ class Dseq(_Seq):
|
|
|
839
932
|
>>>
|
|
840
933
|
|
|
841
934
|
"""
|
|
842
|
-
return Dseq.quick(
|
|
843
|
-
self.crick,
|
|
844
|
-
self.watson,
|
|
845
|
-
ovhg=len(self.watson) - len(self.crick) + self.ovhg,
|
|
846
|
-
circular=self.circular,
|
|
847
|
-
)
|
|
935
|
+
return Dseq.quick(rc(self._data), circular=self.circular)
|
|
848
936
|
|
|
849
937
|
rc = reverse_complement # alias for reverse_complement
|
|
850
938
|
|
|
851
939
|
def shifted(self: DseqType, shift: int) -> DseqType:
|
|
852
|
-
"""
|
|
940
|
+
"""
|
|
941
|
+
Shifted copy of a circular Dseq object.
|
|
942
|
+
|
|
943
|
+
>>> ds = Dseq("TAAG", circular = True)
|
|
944
|
+
>>> ds.shifted(1) # First bp moved to right side:
|
|
945
|
+
Dseq(o4)
|
|
946
|
+
AAGT
|
|
947
|
+
TTCA
|
|
948
|
+
>>> ds.shifted(-1) # Last bp moved to left side:
|
|
949
|
+
Dseq(o4)
|
|
950
|
+
GTAA
|
|
951
|
+
CATT
|
|
952
|
+
"""
|
|
853
953
|
if not self.circular:
|
|
854
954
|
raise TypeError("DNA is not circular.")
|
|
855
955
|
shift = shift % len(self)
|
|
856
956
|
if not shift:
|
|
857
|
-
return
|
|
957
|
+
return copy.deepcopy(self)
|
|
858
958
|
else:
|
|
859
959
|
return (self[shift:] + self[:shift]).looped()
|
|
860
960
|
|
|
@@ -876,19 +976,30 @@ class Dseq(_Seq):
|
|
|
876
976
|
Dseq(o8)
|
|
877
977
|
catcgatc
|
|
878
978
|
gtagctag
|
|
879
|
-
>>>
|
|
979
|
+
>>> b = Dseq("iatcgatj")
|
|
980
|
+
>>> b
|
|
880
981
|
Dseq(-8)
|
|
881
982
|
catcgat
|
|
882
983
|
tagctag
|
|
883
|
-
>>>
|
|
984
|
+
>>> b.looped()
|
|
985
|
+
Dseq(o7)
|
|
986
|
+
catcgat
|
|
987
|
+
gtagcta
|
|
988
|
+
>>> c = Dseq("jatcgati")
|
|
989
|
+
>>> c
|
|
990
|
+
Dseq(-8)
|
|
991
|
+
atcgatc
|
|
992
|
+
gtagcta
|
|
993
|
+
>>> c.looped()
|
|
884
994
|
Dseq(o7)
|
|
885
995
|
catcgat
|
|
886
996
|
gtagcta
|
|
887
|
-
>>>
|
|
997
|
+
>>> d = Dseq("ietcgazj")
|
|
998
|
+
>>> d
|
|
888
999
|
Dseq(-8)
|
|
889
1000
|
catcga
|
|
890
1001
|
agctag
|
|
891
|
-
>>>
|
|
1002
|
+
>>> d.looped()
|
|
892
1003
|
Traceback (most recent call last):
|
|
893
1004
|
File "<stdin>", line 1, in <module>
|
|
894
1005
|
File "/usr/local/lib/python2.7/dist-packages/pydna/dsdna.py", line 357, in looped
|
|
@@ -899,116 +1010,116 @@ class Dseq(_Seq):
|
|
|
899
1010
|
|
|
900
1011
|
"""
|
|
901
1012
|
if self.circular:
|
|
902
|
-
return
|
|
1013
|
+
return copy.deepcopy(self)
|
|
1014
|
+
|
|
903
1015
|
type5, sticky5 = self.five_prime_end()
|
|
904
1016
|
type3, sticky3 = self.three_prime_end()
|
|
905
|
-
if type5 == type3 and str(sticky5) == str(_rc(sticky3)):
|
|
906
|
-
nseq = self.__class__.quick(
|
|
907
|
-
self.watson,
|
|
908
|
-
self.crick[-self.ovhg :] + self.crick[: -self.ovhg],
|
|
909
|
-
ovhg=0,
|
|
910
|
-
# linear=False,
|
|
911
|
-
circular=True,
|
|
912
|
-
)
|
|
913
|
-
# assert len(nseq.crick) == len(nseq.watson)
|
|
914
|
-
return nseq
|
|
915
|
-
else:
|
|
916
|
-
raise TypeError(
|
|
917
|
-
"DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!"
|
|
918
|
-
)
|
|
919
1017
|
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
TypeError is raised.
|
|
1018
|
+
err = TypeError(
|
|
1019
|
+
"DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!"
|
|
1020
|
+
)
|
|
924
1021
|
|
|
925
|
-
|
|
1022
|
+
if type5 != type3:
|
|
1023
|
+
raise err
|
|
926
1024
|
|
|
927
|
-
|
|
928
|
-
|
|
1025
|
+
try:
|
|
1026
|
+
# Test if sticky ends are compatible
|
|
1027
|
+
self + self
|
|
1028
|
+
except TypeError:
|
|
1029
|
+
raise err
|
|
929
1030
|
|
|
930
|
-
|
|
931
|
-
>>> a=Dseq("catcgatc", circular=True)
|
|
932
|
-
>>> a
|
|
933
|
-
Dseq(o8)
|
|
934
|
-
catcgatc
|
|
935
|
-
gtagctag
|
|
936
|
-
>>> a[:]
|
|
937
|
-
Dseq(-8)
|
|
938
|
-
catcgatc
|
|
939
|
-
gtagctag
|
|
940
|
-
>>>
|
|
1031
|
+
new = self.cast_to_ds_left()[: len(self) - len(sticky3)]
|
|
941
1032
|
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
from pydna import _PydnaDeprecationWarning
|
|
1033
|
+
new.circular = True
|
|
1034
|
+
return new
|
|
945
1035
|
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
selfcopy.circular = False
|
|
956
|
-
return selfcopy # self.__class__(self.watson, linear=True)
|
|
1036
|
+
def five_prime_end(self) -> Tuple[str, str]:
|
|
1037
|
+
"""Returns a 2-tuple of strings describing the structure of the 5' end of
|
|
1038
|
+
the DNA fragment.
|
|
1039
|
+
|
|
1040
|
+
The tuple contains (type, sticky) where type is "5'", "3'", "blunt" or "single".
|
|
1041
|
+
sticky is always in lower case and contains the sequence of the
|
|
1042
|
+
protruding end in 5'-3' direction.
|
|
1043
|
+
|
|
1044
|
+
See examples below:
|
|
957
1045
|
|
|
958
|
-
def five_prime_end(self) -> _Tuple[str, str]:
|
|
959
|
-
"""Returns a tuple describing the structure of the 5' end of
|
|
960
|
-
the DNA fragment
|
|
961
1046
|
|
|
962
1047
|
Examples
|
|
963
1048
|
--------
|
|
964
1049
|
>>> from pydna.dseq import Dseq
|
|
965
|
-
>>> a=Dseq("
|
|
1050
|
+
>>> a = Dseq("aa", "tttg", ovhg=2)
|
|
966
1051
|
>>> a
|
|
967
|
-
Dseq(-
|
|
968
|
-
|
|
969
|
-
|
|
1052
|
+
Dseq(-4)
|
|
1053
|
+
aa
|
|
1054
|
+
gttt
|
|
970
1055
|
>>> a.five_prime_end()
|
|
971
|
-
('
|
|
972
|
-
>>> a=Dseq("
|
|
1056
|
+
("3'", 'tg')
|
|
1057
|
+
>>> a = Dseq("caaa", "tt", ovhg=-2)
|
|
973
1058
|
>>> a
|
|
974
1059
|
Dseq(-4)
|
|
975
|
-
|
|
976
|
-
|
|
1060
|
+
caaa
|
|
1061
|
+
tt
|
|
977
1062
|
>>> a.five_prime_end()
|
|
978
|
-
("
|
|
979
|
-
>>> a=Dseq("
|
|
1063
|
+
("5'", 'ca')
|
|
1064
|
+
>>> a = Dseq("aa", "tt")
|
|
980
1065
|
>>> a
|
|
981
|
-
Dseq(-
|
|
982
|
-
|
|
983
|
-
|
|
1066
|
+
Dseq(-2)
|
|
1067
|
+
aa
|
|
1068
|
+
tt
|
|
984
1069
|
>>> a.five_prime_end()
|
|
985
|
-
(
|
|
986
|
-
>>>
|
|
1070
|
+
('blunt', '')
|
|
987
1071
|
|
|
988
1072
|
See also
|
|
989
1073
|
--------
|
|
990
1074
|
pydna.dseq.Dseq.three_prime_end
|
|
991
1075
|
|
|
992
1076
|
"""
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
1077
|
+
|
|
1078
|
+
# See docstring for function pydna.utils.get_parts for details
|
|
1079
|
+
# on what is contained in parts.
|
|
1080
|
+
parts = self.get_parts()
|
|
1081
|
+
|
|
1082
|
+
sticky5 = parts.sticky_left5.translate(dscode_to_watson_table)
|
|
1083
|
+
|
|
1084
|
+
sticky3 = parts.sticky_left3.translate(dscode_to_crick_table)[::-1]
|
|
1085
|
+
|
|
1086
|
+
single_watson = parts.single_watson.translate(dscode_to_watson_table)
|
|
1087
|
+
|
|
1088
|
+
single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1]
|
|
1089
|
+
|
|
1090
|
+
# The walrus operator returns the value being assigned, so
|
|
1091
|
+
# we can test if it is empty or not.
|
|
1092
|
+
if sticky := single_watson:
|
|
1093
|
+
type_ = "single"
|
|
1094
|
+
elif sticky := single_crick:
|
|
1095
|
+
type_ = "single"
|
|
1096
|
+
elif sticky5 == sticky3 == "":
|
|
1097
|
+
type_, sticky = "blunt", ""
|
|
1098
|
+
elif sticky := sticky5:
|
|
999
1099
|
type_ = "5'"
|
|
1000
|
-
elif
|
|
1001
|
-
sticky = self.crick[-self.ovhg :].lower()
|
|
1100
|
+
elif sticky := sticky3:
|
|
1002
1101
|
type_ = "3'"
|
|
1003
|
-
else:
|
|
1004
|
-
sticky = ""
|
|
1005
|
-
type_ = "blunt"
|
|
1006
|
-
return type_, sticky
|
|
1007
1102
|
|
|
1008
|
-
|
|
1103
|
+
return type_, sticky.lower()
|
|
1104
|
+
|
|
1105
|
+
def three_prime_end(self) -> Tuple[str, str]:
|
|
1009
1106
|
"""Returns a tuple describing the structure of the 3' end of
|
|
1010
1107
|
the DNA fragment
|
|
1011
1108
|
|
|
1109
|
+
>>> a = Dseq("aa", "gttt", ovhg=0)
|
|
1110
|
+
>>> a
|
|
1111
|
+
Dseq(-4)
|
|
1112
|
+
aa
|
|
1113
|
+
tttg
|
|
1114
|
+
>>> a.three_prime_end()
|
|
1115
|
+
("5'", 'gt')
|
|
1116
|
+
>>> a = Dseq("aaac", "tt", ovhg=0)
|
|
1117
|
+
>>> a
|
|
1118
|
+
Dseq(-4)
|
|
1119
|
+
aaac
|
|
1120
|
+
tt
|
|
1121
|
+
>>> a.three_prime_end()
|
|
1122
|
+
("3'", 'ac')
|
|
1012
1123
|
>>> from pydna.dseq import Dseq
|
|
1013
1124
|
>>> a=Dseq("aaa", "ttt")
|
|
1014
1125
|
>>> a
|
|
@@ -1017,21 +1128,6 @@ class Dseq(_Seq):
|
|
|
1017
1128
|
ttt
|
|
1018
1129
|
>>> a.three_prime_end()
|
|
1019
1130
|
('blunt', '')
|
|
1020
|
-
>>> a=Dseq("aaa", "ttt", ovhg=1)
|
|
1021
|
-
>>> a
|
|
1022
|
-
Dseq(-4)
|
|
1023
|
-
aaa
|
|
1024
|
-
ttt
|
|
1025
|
-
>>> a.three_prime_end()
|
|
1026
|
-
("3'", 'a')
|
|
1027
|
-
>>> a=Dseq("aaa", "ttt", ovhg=-1)
|
|
1028
|
-
>>> a
|
|
1029
|
-
Dseq(-4)
|
|
1030
|
-
aaa
|
|
1031
|
-
ttt
|
|
1032
|
-
>>> a.three_prime_end()
|
|
1033
|
-
("5'", 't')
|
|
1034
|
-
>>>
|
|
1035
1131
|
|
|
1036
1132
|
See also
|
|
1037
1133
|
--------
|
|
@@ -1039,42 +1135,73 @@ class Dseq(_Seq):
|
|
|
1039
1135
|
|
|
1040
1136
|
"""
|
|
1041
1137
|
|
|
1042
|
-
|
|
1138
|
+
# See docstring for function pydna.utils.get_parts for details
|
|
1139
|
+
# on what is contained in parts.
|
|
1140
|
+
parts = self.get_parts()
|
|
1141
|
+
|
|
1142
|
+
sticky5 = parts.sticky_right5.translate(dscode_to_crick_table)[::-1]
|
|
1143
|
+
|
|
1144
|
+
sticky3 = parts.sticky_right3.translate(dscode_to_watson_table)
|
|
1145
|
+
|
|
1146
|
+
single_watson = parts.single_watson.translate(dscode_to_watson_table)
|
|
1147
|
+
|
|
1148
|
+
single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1]
|
|
1043
1149
|
|
|
1044
|
-
|
|
1045
|
-
|
|
1150
|
+
# The walrus operator returns the value being assigned, so
|
|
1151
|
+
# we can test if it is empty or not.
|
|
1152
|
+
if sticky := single_watson:
|
|
1153
|
+
type_ = "single"
|
|
1154
|
+
elif sticky := single_crick:
|
|
1155
|
+
type_ = "single"
|
|
1156
|
+
elif sticky5 == sticky3 == "":
|
|
1157
|
+
type_, sticky = "blunt", ""
|
|
1158
|
+
elif sticky := sticky5:
|
|
1046
1159
|
type_ = "5'"
|
|
1047
|
-
elif
|
|
1048
|
-
sticky = self.watson[-ovhg:].lower()
|
|
1160
|
+
elif sticky := sticky3:
|
|
1049
1161
|
type_ = "3'"
|
|
1050
|
-
else:
|
|
1051
|
-
sticky = ""
|
|
1052
|
-
type_ = "blunt"
|
|
1053
|
-
return type_, sticky
|
|
1054
1162
|
|
|
1055
|
-
|
|
1056
|
-
"""Returns the overhang of the watson strand at the three prime."""
|
|
1057
|
-
return len(self.watson) - len(self.crick) + self.ovhg
|
|
1163
|
+
return type_, sticky.lower()
|
|
1058
1164
|
|
|
1059
|
-
def __add__(self: DseqType, other: DseqType) -> DseqType:
|
|
1060
|
-
"""
|
|
1165
|
+
def __add__(self: DseqType, other: [DseqType, str, bytes]) -> DseqType:
|
|
1166
|
+
"""
|
|
1167
|
+
Adding two Dseq objects together.
|
|
1168
|
+
|
|
1169
|
+
>>> ds = Dseq("a", "t", ovhg=0)
|
|
1170
|
+
>>> ds
|
|
1171
|
+
Dseq(-1)
|
|
1172
|
+
a
|
|
1173
|
+
t
|
|
1174
|
+
>>> ds + ds
|
|
1175
|
+
Dseq(-2)
|
|
1176
|
+
aa
|
|
1177
|
+
tt
|
|
1178
|
+
>>> "g" + ds # adding a string of left side returns a Dseq
|
|
1179
|
+
Dseq(-2)
|
|
1180
|
+
ga
|
|
1181
|
+
ct
|
|
1182
|
+
>>> ds + "c" # adding a string of right side returns a Dseq
|
|
1183
|
+
Dseq(-2)
|
|
1184
|
+
ac
|
|
1185
|
+
tg
|
|
1061
1186
|
|
|
1062
|
-
Add other Dseq object at the end of the sequence.
|
|
1063
|
-
Type error is raised if any of the points below are fulfilled:
|
|
1064
1187
|
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
prime sticky end of other.
|
|
1188
|
+
Parameters
|
|
1189
|
+
----------
|
|
1190
|
+
other : [DseqType, str, bytes]
|
|
1191
|
+
Object to be added.
|
|
1070
1192
|
|
|
1071
|
-
|
|
1193
|
+
Raises
|
|
1194
|
+
------
|
|
1195
|
+
TypeError
|
|
1196
|
+
Preventing adding to a circular sequence.
|
|
1072
1197
|
|
|
1073
|
-
|
|
1074
|
-
|
|
1198
|
+
Returns
|
|
1199
|
+
-------
|
|
1200
|
+
DseqType
|
|
1201
|
+
A new Dseq object.
|
|
1075
1202
|
|
|
1076
1203
|
"""
|
|
1077
|
-
|
|
1204
|
+
|
|
1078
1205
|
if self.circular:
|
|
1079
1206
|
raise TypeError("circular DNA cannot be ligated!")
|
|
1080
1207
|
try:
|
|
@@ -1083,60 +1210,85 @@ class Dseq(_Seq):
|
|
|
1083
1210
|
except AttributeError:
|
|
1084
1211
|
pass
|
|
1085
1212
|
|
|
1213
|
+
# If other evaluates to False, return a copy of self.
|
|
1214
|
+
if not other:
|
|
1215
|
+
return copy.deepcopy(self)
|
|
1216
|
+
# If self evaluates to False, return a copy of other.
|
|
1217
|
+
elif not self:
|
|
1218
|
+
return copy.deepcopy(other)
|
|
1219
|
+
|
|
1220
|
+
# get right side end properties for self.
|
|
1086
1221
|
self_type, self_tail = self.three_prime_end()
|
|
1087
|
-
other_type, other_tail = other.five_prime_end()
|
|
1088
1222
|
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1223
|
+
try:
|
|
1224
|
+
other_type, other_tail = other.five_prime_end()
|
|
1225
|
+
except AttributeError:
|
|
1226
|
+
# if other does not have the expected properties
|
|
1227
|
+
# most likely it is a string that can be cast as
|
|
1228
|
+
# a Dseq.
|
|
1229
|
+
other_type, other_tail = "blunt", ""
|
|
1230
|
+
other = Dseq(other)
|
|
1231
|
+
|
|
1232
|
+
err = TypeError("sticky ends not compatible!")
|
|
1233
|
+
|
|
1234
|
+
# The sticky ends have to be of the same type
|
|
1235
|
+
# or
|
|
1236
|
+
# one or both of them is "single", indicating a single-stranded molecule.
|
|
1237
|
+
if (self_type != other_type) and ("single" not in (self_type, other_type)):
|
|
1238
|
+
raise err
|
|
1239
|
+
|
|
1240
|
+
# tail length has to be equal for two phosphodiester bonds to form
|
|
1241
|
+
if len(self_tail) != len(other_tail):
|
|
1242
|
+
raise err
|
|
1243
|
+
|
|
1244
|
+
# Each basepair is checked against the pydna.alphabet basepair_dict
|
|
1245
|
+
# which contains the permitted base pairings.
|
|
1246
|
+
for w, c in zip(self_tail, other_tail[::-1]):
|
|
1247
|
+
try:
|
|
1248
|
+
basepair_dict[(w, c)]
|
|
1249
|
+
except KeyError:
|
|
1250
|
+
raise err
|
|
1251
|
+
|
|
1252
|
+
return self.__class__(
|
|
1253
|
+
self.watson + other.watson, other.crick + self.crick, self.ovhg
|
|
1254
|
+
)
|
|
1100
1255
|
|
|
1101
1256
|
def __mul__(self: DseqType, number: int) -> DseqType:
|
|
1102
1257
|
if not isinstance(number, int):
|
|
1103
1258
|
raise TypeError(
|
|
1104
|
-
"TypeError: can't multiply Dseq by non-int of type {}"
|
|
1105
|
-
type(number)
|
|
1106
|
-
)
|
|
1259
|
+
"TypeError: can't multiply Dseq" f" by non-int of type {type(number)}"
|
|
1107
1260
|
)
|
|
1108
|
-
|
|
1109
|
-
return self.__class__("")
|
|
1110
|
-
new = _copy.deepcopy(self)
|
|
1111
|
-
for i in range(number - 1):
|
|
1112
|
-
new += self
|
|
1113
|
-
return new
|
|
1261
|
+
return Dseq("").join(list(itertools.repeat(self, number)))
|
|
1114
1262
|
|
|
1115
|
-
def
|
|
1263
|
+
def _fill_in_left(self: DseqType, nucleotides: str) -> str:
|
|
1116
1264
|
stuffer = ""
|
|
1117
1265
|
type, se = self.five_prime_end()
|
|
1118
1266
|
if type == "5'":
|
|
1119
|
-
for n in
|
|
1267
|
+
for n in rc(se):
|
|
1120
1268
|
if n in nucleotides:
|
|
1121
1269
|
stuffer += n
|
|
1122
1270
|
else:
|
|
1123
1271
|
break
|
|
1124
1272
|
return self.crick + stuffer, self.ovhg + len(stuffer)
|
|
1125
1273
|
|
|
1126
|
-
def
|
|
1274
|
+
def _fill_in_right(self: DseqType, nucleotides: str) -> str:
|
|
1127
1275
|
stuffer = ""
|
|
1128
1276
|
type, se = self.three_prime_end()
|
|
1129
1277
|
if type == "5'":
|
|
1130
|
-
for n in
|
|
1278
|
+
for n in rc(se):
|
|
1131
1279
|
if n in nucleotides:
|
|
1132
1280
|
stuffer += n
|
|
1133
1281
|
else:
|
|
1134
1282
|
break
|
|
1135
1283
|
return self.watson + stuffer
|
|
1136
1284
|
|
|
1137
|
-
def fill_in(self, nucleotides:
|
|
1285
|
+
def fill_in(self, nucleotides: Union[None, str] = None) -> DseqType:
|
|
1138
1286
|
"""Fill in of five prime protruding end with a DNA polymerase
|
|
1139
|
-
that has only DNA polymerase activity (such as
|
|
1287
|
+
that has only DNA polymerase activity (such as Exo-Klenow [#]_).
|
|
1288
|
+
Exo-Klenow is a modified version of the Klenow fragment of E.
|
|
1289
|
+
coli DNA polymerase I, which has been engineered to lack both
|
|
1290
|
+
3'-5' proofreading and 5'-3' exonuclease activities.
|
|
1291
|
+
|
|
1140
1292
|
and any combination of A, G, C or T. Default are all four
|
|
1141
1293
|
nucleotides together.
|
|
1142
1294
|
|
|
@@ -1149,15 +1301,6 @@ class Dseq(_Seq):
|
|
|
1149
1301
|
--------
|
|
1150
1302
|
|
|
1151
1303
|
>>> from pydna.dseq import Dseq
|
|
1152
|
-
>>> a=Dseq("aaa", "ttt")
|
|
1153
|
-
>>> a
|
|
1154
|
-
Dseq(-3)
|
|
1155
|
-
aaa
|
|
1156
|
-
ttt
|
|
1157
|
-
>>> a.fill_in()
|
|
1158
|
-
Dseq(-3)
|
|
1159
|
-
aaa
|
|
1160
|
-
ttt
|
|
1161
1304
|
>>> b=Dseq("caaa", "cttt")
|
|
1162
1305
|
>>> b
|
|
1163
1306
|
Dseq(-5)
|
|
@@ -1184,7 +1327,15 @@ class Dseq(_Seq):
|
|
|
1184
1327
|
Dseq(-5)
|
|
1185
1328
|
aaac
|
|
1186
1329
|
gttt
|
|
1187
|
-
>>>
|
|
1330
|
+
>>> a=Dseq("aaa", "ttt")
|
|
1331
|
+
>>> a
|
|
1332
|
+
Dseq(-3)
|
|
1333
|
+
aaa
|
|
1334
|
+
ttt
|
|
1335
|
+
>>> a.fill_in()
|
|
1336
|
+
Dseq(-3)
|
|
1337
|
+
aaa
|
|
1338
|
+
ttt
|
|
1188
1339
|
|
|
1189
1340
|
References
|
|
1190
1341
|
----------
|
|
@@ -1195,32 +1346,31 @@ class Dseq(_Seq):
|
|
|
1195
1346
|
nucleotides = "GATCRYWSMKHBVDN"
|
|
1196
1347
|
|
|
1197
1348
|
nucleotides = set(nucleotides.lower() + nucleotides.upper())
|
|
1198
|
-
crick, ovhg = self.
|
|
1199
|
-
watson = self.
|
|
1349
|
+
crick, ovhg = self._fill_in_left(nucleotides)
|
|
1350
|
+
watson = self._fill_in_right(nucleotides)
|
|
1200
1351
|
return Dseq(watson, crick, ovhg)
|
|
1201
1352
|
|
|
1202
|
-
|
|
1203
|
-
return _Seq(self.watson).transcribe()
|
|
1204
|
-
|
|
1205
|
-
def translate(
|
|
1206
|
-
self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
|
|
1207
|
-
) -> _Seq:
|
|
1208
|
-
return _Seq(
|
|
1209
|
-
_translate_str(str(self), table, stop_symbol, to_stop, cds, gap=gap)
|
|
1210
|
-
)
|
|
1353
|
+
klenow = fill_in # alias
|
|
1211
1354
|
|
|
1212
|
-
def
|
|
1355
|
+
def nibble_to_blunt(self) -> DseqType:
|
|
1213
1356
|
"""
|
|
1214
|
-
Simulates treatment a nuclease with 5'-3' and 3'-5' single
|
|
1357
|
+
Simulates treatment with a nuclease with both 5'-3' and 3'-5' single
|
|
1215
1358
|
strand specific exonuclease activity (such as mung bean nuclease [#]_)
|
|
1216
1359
|
|
|
1360
|
+
Mung bean nuclease is a nuclease enzyme derived from mung bean sprouts
|
|
1361
|
+
that preferentially degrades single-stranded DNA and RNA into
|
|
1362
|
+
5'-phosphate- and 3'-hydroxyl-containing nucleotides.
|
|
1363
|
+
|
|
1364
|
+
Treatment results in blunt DNA, regardless of whether the protruding end
|
|
1365
|
+
is 5' or 3'.
|
|
1366
|
+
|
|
1217
1367
|
::
|
|
1218
1368
|
|
|
1219
1369
|
ggatcc -> gatcc
|
|
1220
1370
|
ctaggg ctagg
|
|
1221
1371
|
|
|
1222
|
-
ggatcc ->
|
|
1223
|
-
tcctag
|
|
1372
|
+
ggatcc -> ggatc
|
|
1373
|
+
tcctag cctag
|
|
1224
1374
|
|
|
1225
1375
|
>>> from pydna.dseq import Dseq
|
|
1226
1376
|
>>> b=Dseq("caaa", "cttt")
|
|
@@ -1250,19 +1400,60 @@ class Dseq(_Seq):
|
|
|
1250
1400
|
|
|
1251
1401
|
|
|
1252
1402
|
"""
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1403
|
+
parts = self.get_parts()
|
|
1404
|
+
return self.__class__(parts.middle)
|
|
1405
|
+
|
|
1406
|
+
mung = nibble_to_blunt
|
|
1407
|
+
|
|
1408
|
+
def T4(self, nucleotides=None) -> DseqType:
|
|
1409
|
+
"""
|
|
1410
|
+
Fill in 5' protruding ends and nibble 3' protruding ends.
|
|
1411
|
+
|
|
1412
|
+
This is done using a DNA polymerase providing 3'-5' nuclease activity
|
|
1413
|
+
such as T4 DNA polymerase. This can be done in presence of any
|
|
1414
|
+
combination of the four nucleotides A, G, C or T.
|
|
1415
|
+
|
|
1416
|
+
T4 DNA polymerase is widely used to “polish” DNA ends because of its
|
|
1417
|
+
strong 3'-5' exonuclease activity: in the absence of dNTPs, it chews
|
|
1418
|
+
back 3′ overhangs to create blunt ends; in the presence of limiting
|
|
1419
|
+
dNTPs, it can fill in 5′ overhangs; and by carefully controlling
|
|
1420
|
+
reaction time, temperature, and nucleotide supply, you can generate
|
|
1421
|
+
defined recessed or blunt termini.
|
|
1422
|
+
|
|
1423
|
+
Tuning the nucleotide set can facilitate engineering of partial
|
|
1424
|
+
sticky ends. Default are all four nucleotides together.
|
|
1425
|
+
|
|
1426
|
+
::
|
|
1427
|
+
|
|
1428
|
+
aaagatc-3 aaa 3' ends are always removed.
|
|
1429
|
+
||| ---> ||| A and T needed or the molecule will
|
|
1430
|
+
3-ctagttt ttt degrade completely.
|
|
1431
|
+
|
|
1432
|
+
|
|
1433
|
+
|
|
1434
|
+
5-gatcaaa gatcaaaGATC 5' ends are filled in the
|
|
1435
|
+
||| ---> ||||||||||| presence of GATC
|
|
1436
|
+
tttctag-5 CTAGtttctag
|
|
1437
|
+
|
|
1438
|
+
|
|
1439
|
+
|
|
1440
|
+
5-gatcaaa gatcaaaGAT 5' ends are partially filled in the
|
|
1441
|
+
||| ---> ||||||||| presence of GAT to produce a 1 nt
|
|
1442
|
+
tttctag-5 TAGtttctag 5' overhang
|
|
1443
|
+
|
|
1444
|
+
|
|
1445
|
+
|
|
1446
|
+
5-gatcaaa gatcaaaGA 5' ends are partially filled in the
|
|
1447
|
+
||| ---> ||||||| presence of GA to produce a 2 nt
|
|
1448
|
+
tttctag-5 AGtttctag 5' overhang
|
|
1449
|
+
|
|
1450
|
+
|
|
1451
|
+
|
|
1452
|
+
5-gatcaaa gatcaaaG 5' ends are partially filled in the
|
|
1453
|
+
||| ---> ||||| presence of G to produce a 3 nt
|
|
1454
|
+
tttctag-5 Gtttctag 5' overhang
|
|
1455
|
+
|
|
1258
1456
|
|
|
1259
|
-
def T4(self, nucleotides=None) -> "Dseq":
|
|
1260
|
-
"""Fill in five prime protruding ends and chewing back
|
|
1261
|
-
three prime protruding ends by a DNA polymerase providing both
|
|
1262
|
-
5'-3' DNA polymerase activity and 3'-5' nuclease acitivty
|
|
1263
|
-
(such as T4 DNA polymerase). This can be done in presence of any
|
|
1264
|
-
combination of the four A, G, C or T. Removing one or more nucleotides
|
|
1265
|
-
can facilitate engineering of sticky ends. Default are all four nucleotides together.
|
|
1266
1457
|
|
|
1267
1458
|
Parameters
|
|
1268
1459
|
----------
|
|
@@ -1273,29 +1464,31 @@ class Dseq(_Seq):
|
|
|
1273
1464
|
--------
|
|
1274
1465
|
|
|
1275
1466
|
>>> from pydna.dseq import Dseq
|
|
1276
|
-
>>> a=Dseq(
|
|
1467
|
+
>>> a = Dseq.from_representation(
|
|
1468
|
+
... '''
|
|
1469
|
+
... gatcaaa
|
|
1470
|
+
... tttctag
|
|
1471
|
+
... ''')
|
|
1277
1472
|
>>> a
|
|
1278
|
-
Dseq(-
|
|
1279
|
-
|
|
1280
|
-
|
|
1473
|
+
Dseq(-11)
|
|
1474
|
+
gatcaaa
|
|
1475
|
+
tttctag
|
|
1281
1476
|
>>> a.T4()
|
|
1282
|
-
Dseq(-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
>>> a.T4("
|
|
1286
|
-
Dseq(-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
>>> a.T4("
|
|
1290
|
-
Dseq(-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
>>> a.T4("
|
|
1294
|
-
Dseq(-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
>>>
|
|
1298
|
-
|
|
1477
|
+
Dseq(-11)
|
|
1478
|
+
gatcaaagatc
|
|
1479
|
+
ctagtttctag
|
|
1480
|
+
>>> a.T4("GAT")
|
|
1481
|
+
Dseq(-11)
|
|
1482
|
+
gatcaaagat
|
|
1483
|
+
tagtttctag
|
|
1484
|
+
>>> a.T4("GA")
|
|
1485
|
+
Dseq(-11)
|
|
1486
|
+
gatcaaaga
|
|
1487
|
+
agtttctag
|
|
1488
|
+
>>> a.T4("G")
|
|
1489
|
+
Dseq(-11)
|
|
1490
|
+
gatcaaag
|
|
1491
|
+
gtttctag
|
|
1299
1492
|
"""
|
|
1300
1493
|
|
|
1301
1494
|
if not nucleotides:
|
|
@@ -1303,7 +1496,7 @@ class Dseq(_Seq):
|
|
|
1303
1496
|
nucleotides = set(nucleotides.lower() + nucleotides.upper())
|
|
1304
1497
|
type, se = self.five_prime_end()
|
|
1305
1498
|
if type == "5'":
|
|
1306
|
-
crick, ovhg = self.
|
|
1499
|
+
crick, ovhg = self._fill_in_left(nucleotides)
|
|
1307
1500
|
else:
|
|
1308
1501
|
if type == "3'":
|
|
1309
1502
|
ovhg = 0
|
|
@@ -1323,7 +1516,7 @@ class Dseq(_Seq):
|
|
|
1323
1516
|
watson = self.watson
|
|
1324
1517
|
type, se = self.three_prime_end()
|
|
1325
1518
|
if type == "5'":
|
|
1326
|
-
watson = self.
|
|
1519
|
+
watson = self._fill_in_right(nucleotides)
|
|
1327
1520
|
else:
|
|
1328
1521
|
if type == "3'":
|
|
1329
1522
|
watson = self.watson[: -len(se)]
|
|
@@ -1337,32 +1530,305 @@ class Dseq(_Seq):
|
|
|
1337
1530
|
|
|
1338
1531
|
t4 = T4 # alias for the T4 method.
|
|
1339
1532
|
|
|
1340
|
-
def
|
|
1341
|
-
"""
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1533
|
+
def nibble_five_prime_left(self: DseqType, n: int = 1) -> DseqType:
|
|
1534
|
+
"""
|
|
1535
|
+
5' => 3' resection at the left side (start) of the molecule.
|
|
1536
|
+
|
|
1537
|
+
The argument n indicates the number of nucleotides that are to be
|
|
1538
|
+
removed. The outcome of this depends on the structure of the molecule.
|
|
1539
|
+
See the two examples below:
|
|
1540
|
+
|
|
1541
|
+
The figure below indicates a recess of length two from a blunt DNA
|
|
1542
|
+
fragment. The resulting DNA fragment has a 3' protruding single strand.
|
|
1543
|
+
|
|
1544
|
+
::
|
|
1545
|
+
|
|
1546
|
+
gatc tc
|
|
1547
|
+
|||| --> ||
|
|
1548
|
+
ctag ctag
|
|
1549
|
+
|
|
1550
|
+
|
|
1551
|
+
The figure below indicates a recess of length two from a DNA fragment
|
|
1552
|
+
with a 5' sticky end resulting in a blunt sequence.
|
|
1553
|
+
|
|
1554
|
+
::
|
|
1555
|
+
|
|
1556
|
+
ttgatc gatc
|
|
1557
|
+
|||| --> ||||
|
|
1558
|
+
ctag ctag
|
|
1559
|
+
|
|
1560
|
+
|
|
1561
|
+
>>> from pydna.dseq import Dseq
|
|
1562
|
+
>>> ds = Dseq("gatc")
|
|
1563
|
+
>>> ds
|
|
1564
|
+
Dseq(-4)
|
|
1565
|
+
gatc
|
|
1566
|
+
ctag
|
|
1567
|
+
>>> ds.nibble_five_prime_left(2)
|
|
1568
|
+
Dseq(-4)
|
|
1569
|
+
tc
|
|
1570
|
+
ctag
|
|
1571
|
+
>>> ds.nibble_five_prime_left(3)
|
|
1572
|
+
Dseq(-4)
|
|
1573
|
+
c
|
|
1574
|
+
ctag
|
|
1575
|
+
>>> ds.nibble_five_prime_left(4)
|
|
1576
|
+
Dseq(-4)
|
|
1577
|
+
<BLANKLINE>
|
|
1578
|
+
ctag
|
|
1579
|
+
>>> ds = Dseq.from_representation(
|
|
1580
|
+
... '''
|
|
1581
|
+
... GGgatc
|
|
1582
|
+
... ctag
|
|
1583
|
+
... ''')
|
|
1584
|
+
>>> ds
|
|
1585
|
+
Dseq(-6)
|
|
1586
|
+
GGgatc
|
|
1587
|
+
ctag
|
|
1588
|
+
>>> ds.nibble_five_prime_left(2)
|
|
1589
|
+
Dseq(-4)
|
|
1590
|
+
gatc
|
|
1591
|
+
ctag
|
|
1592
|
+
|
|
1593
|
+
Parameters
|
|
1594
|
+
----------
|
|
1595
|
+
n : int, optional
|
|
1596
|
+
The default is 1. This is the number of nucleotides removed.
|
|
1597
|
+
|
|
1598
|
+
Returns
|
|
1599
|
+
-------
|
|
1600
|
+
DseqType
|
|
1601
|
+
A new Dseq object resected by n nucleotides at the left 5' end.
|
|
1602
|
+
|
|
1603
|
+
"""
|
|
1604
|
+
n += max(0, self.ovhg or 0)
|
|
1605
|
+
return Dseq(
|
|
1606
|
+
self._data[:n]
|
|
1607
|
+
.translate(dscode_to_crick_table)
|
|
1608
|
+
.translate(complement_table_for_dscode)
|
|
1609
|
+
.translate(dscode_to_crick_tail_table)
|
|
1610
|
+
.lstrip()
|
|
1611
|
+
+ self._data[n:]
|
|
1612
|
+
)
|
|
1613
|
+
|
|
1614
|
+
def nibble_five_prime_right(self: DseqType, n: int = 1) -> DseqType:
    """
    Resect the right side (end) of the molecule in the 5' => 3' direction.

    ``n`` nucleotides are removed from the strand whose 5' end is on the
    right side (the lower strand in the figures). The shape of the result
    depends on the structure of the right end, see the two examples below.

    Removing two nucleotides from a blunt fragment leaves a 3' protruding
    single strand:

    ::

        gatc           gatc
        ||||    -->    ||
        ctag           ct

    Removing two nucleotides from a fragment with a two nucleotide 5'
    sticky end leaves a blunt end:

    ::

        gatc           gatc
        ||||    -->    ||||
        ctagtt         ctag


    >>> from pydna.dseq import Dseq
    >>> ds = Dseq("gatc")
    >>> ds
    Dseq(-4)
    gatc
    ctag
    >>> ds.nibble_five_prime_right(2)
    Dseq(-4)
    gatc
    ct
    >>> ds.nibble_five_prime_right(3)
    Dseq(-4)
    gatc
    c
    >>> ds.nibble_five_prime_right(4)
    Dseq(-4)
    gatc
    <BLANKLINE>
    >>> ds = Dseq.from_representation(
    ... '''
    ... gatc
    ... ctagGG
    ... ''')
    >>> ds.nibble_five_prime_right(2)
    Dseq(-4)
    gatc
    ctag

    Parameters
    ----------
    n : int, optional
        Number of nucleotides to remove. The default is 1.

    Returns
    -------
    DseqType
        A new Dseq object; the original is not modified.
    """
    split = len(self) - n
    right_ovhg = self.right_ovhg
    if right_ovhg is None:
        # No right overhang value available: fall back to the full length.
        right_ovhg = len(self)
    # A positive right overhang moves the split point further to the left.
    split -= max(0, right_ovhg)

    untouched = self._data[:split]
    # NOTE(review): this segment is the right-hand end of the molecule, yet
    # leading (not trailing) whitespace is stripped, exactly as in the
    # left-hand nibble methods - confirm that rstrip() was not intended.
    resected = (
        self._data[split:]
        .translate(dscode_to_watson_table)
        .translate(dscode_to_watson_tail_table)
        .lstrip()
    )
    return Dseq(untouched + resected)
|
|
1679
|
+
|
|
1680
|
+
exo1_front = nibble_five_prime_left # TODO: consider using the new names
|
|
1681
|
+
exo1_end = nibble_five_prime_right # TODO: consider using the new names
|
|
1682
|
+
|
|
1683
|
+
def nibble_three_prime_left(self: DseqType, n=1) -> DseqType:
    """
    Resect the left side (beginning) of the molecule in the 3' => 5' direction.

    ``n`` nucleotides are removed from the strand whose 3' end is on the
    left side (the lower strand in the figures). The shape of the result
    depends on the structure of the left end, see the two examples below.

    Removing two nucleotides from a blunt fragment leaves a 5' protruding
    single strand:

    ::

        gatc           gatc
        ||||    -->      ||
        ctag             ag

    Removing two nucleotides from a fragment with a two nucleotide 3'
    sticky end leaves a blunt end:

    ::

          gatc           gatc
          ||||    -->    ||||
        ttctag           ctag


    >>> from pydna.dseq import Dseq
    >>> ds = Dseq("gatc")
    >>> ds
    Dseq(-4)
    gatc
    ctag
    >>> ds.nibble_three_prime_left(2)
    Dseq(-4)
    gatc
      ag
    >>> ds.nibble_three_prime_left(3)
    Dseq(-4)
    gatc
       g
    >>> ds.nibble_three_prime_left(4)
    Dseq(-4)
    gatc
    <BLANKLINE>
    >>> ds = Dseq.from_representation(
    ... '''
    ...   gatc
    ... CCctag
    ... ''')
    >>> ds
    Dseq(-6)
      gatc
    CCctag
    >>> ds.nibble_three_prime_left(2)
    Dseq(-4)
    gatc
    ctag

    Parameters
    ----------
    n : int, optional
        Number of nucleotides to remove. The default is 1.

    Returns
    -------
    DseqType
        A new Dseq object; the original is not modified.
    """
    left_ovhg = self.ovhg
    if left_ovhg is None:
        # No left overhang value available: fall back to the full length.
        left_ovhg = len(self)
    # A negative overhang moves the split point further to the right.
    split = n - min(0, left_ovhg)

    # Only the left-hand segment is rewritten; leading whitespace left over
    # after the translation is dropped.
    resected = (
        self._data[:split]
        .translate(dscode_to_watson_table)
        .translate(dscode_to_watson_tail_table)
        .lstrip()
    )
    untouched = self._data[split:]
    return Dseq(resected + untouched)
|
|
1751
|
+
|
|
1752
|
+
def nibble_three_prime_right(self: DseqType, n=1) -> DseqType:
    """
    Resect the right side (end) of the molecule in the 3' => 5' direction.

    ``n`` nucleotides are removed from the strand whose 3' end is on the
    right side (the upper strand in the figures). The shape of the result
    depends on the structure of the right end, see the two examples below.

    Removing two nucleotides from a blunt fragment leaves a 5' protruding
    single strand:

    ::

        gatc           ga
        ||||    -->    ||
        ctag           ctag

    Removing two nucleotides from a fragment with a two nucleotide 3'
    sticky end leaves a blunt end:

    ::

        gatctt         gatc
        ||||    -->    ||||
        ctag           ctag


    >>> from pydna.dseq import Dseq
    >>> ds = Dseq("gatc")
    >>> ds
    Dseq(-4)
    gatc
    ctag
    >>> ds.nibble_three_prime_right(2)
    Dseq(-4)
    ga
    ctag
    >>> ds.nibble_three_prime_right(3)
    Dseq(-4)
    g
    ctag
    >>> ds.nibble_three_prime_right(4)
    Dseq(-4)
    <BLANKLINE>
    ctag
    >>> ds = Dseq.from_representation(
    ... '''
    ... gatcCC
    ... ctag
    ... ''')
    >>> ds.nibble_three_prime_right(2)
    Dseq(-4)
    gatc
    ctag

    Parameters
    ----------
    n : int, optional
        Number of nucleotides to remove. The default is 1.

    Returns
    -------
    DseqType
        A new Dseq object; the original is not modified.
    """
    split = len(self) - n
    right_ovhg = self.right_ovhg
    if right_ovhg is None:
        # No right overhang value available: fall back to the full length.
        right_ovhg = len(self)
    # A negative right overhang moves the split point further to the left.
    split += min(0, right_ovhg)

    untouched = self._data[:split]
    # NOTE(review): this segment is the right-hand end of the molecule, yet
    # leading (not trailing) whitespace is stripped, exactly as in the
    # left-hand nibble methods - confirm that rstrip() was not intended.
    resected = (
        self._data[split:]
        .translate(dscode_to_crick_table)
        .translate(complement_table_for_dscode)
        .translate(dscode_to_crick_tail_table)
        .lstrip()
    )
    return Dseq(untouched + resected)
|
|
1352
1818
|
|
|
1353
1819
|
def no_cutters(
    self, batch: Union[RestrictionBatch, None] = None
) -> RestrictionBatch:
    """Return the enzymes of a RestrictionBatch that do not cut the sequence.

    Parameters
    ----------
    batch : RestrictionBatch, optional
        Enzymes to test. ``CommOnly`` is used when omitted.

    Returns
    -------
    RestrictionBatch
        The enzymes for which the search reported no sites.
    """
    enzymes = CommOnly if batch is None else batch
    hits = enzymes.search(self)
    # Keep the enzymes whose list of sites is empty.
    return RestrictionBatch([enz for enz, sites in hits.items() if not sites])
|
|
1362
1828
|
|
|
1363
1829
|
def unique_cutters(
|
|
1364
|
-
self, batch:
|
|
1365
|
-
) ->
|
|
1830
|
+
self, batch: Union[RestrictionBatch, None] = None
|
|
1831
|
+
) -> RestrictionBatch:
|
|
1366
1832
|
"""Enzymes in a RestrictionBatch cutting sequence once."""
|
|
1367
1833
|
if batch is None:
|
|
1368
1834
|
batch = CommOnly
|
|
@@ -1371,44 +1837,42 @@ class Dseq(_Seq):
|
|
|
1371
1837
|
once_cutters = unique_cutters # alias for unique_cutters
|
|
1372
1838
|
|
|
1373
1839
|
def twice_cutters(
    self, batch: Union[RestrictionBatch, None] = None
) -> RestrictionBatch:
    """Return the enzymes of a RestrictionBatch that cut the sequence twice.

    Thin wrapper around ``n_cutters`` with ``n=2``. ``CommOnly`` is used
    when no batch is given.
    """
    enzymes = CommOnly if batch is None else batch
    return self.n_cutters(n=2, batch=enzymes)
|
|
1380
1846
|
|
|
1381
1847
|
def n_cutters(
    self, n=3, batch: Union[RestrictionBatch, None] = None
) -> RestrictionBatch:
    """Return the enzymes of a RestrictionBatch that cut exactly n times.

    Parameters
    ----------
    n : int, optional
        Required number of sites. The default is 3.
    batch : RestrictionBatch, optional
        Enzymes to test. ``CommOnly`` is used when omitted.

    Returns
    -------
    RestrictionBatch
        The enzymes for which the search reported exactly ``n`` sites.
    """
    enzymes = CommOnly if batch is None else batch
    hits = enzymes.search(self)
    return RestrictionBatch([enz for enz, sites in hits.items() if len(sites) == n])
|
|
1390
1856
|
|
|
1391
|
-
def cutters(self, batch: Union[RestrictionBatch, None] = None) -> RestrictionBatch:
    """Return the enzymes of a RestrictionBatch that cut at least once.

    Parameters
    ----------
    batch : RestrictionBatch, optional
        Enzymes to test. ``CommOnly`` is used when omitted.

    Returns
    -------
    RestrictionBatch
        The enzymes for which the search reported one or more sites.
    """
    enzymes = CommOnly if batch is None else batch
    hits = enzymes.search(self)
    # Keep the enzymes whose list of sites is not empty.
    return RestrictionBatch([enz for enz, sites in hits.items() if sites])
|
|
1400
1864
|
|
|
1401
1865
|
def seguid(self) -> str:
    """SEGUID checksum for the sequence.

    Circular molecules are passed to ``cdseguid`` and linear molecules to
    ``ldseguid``. For linear molecules both strands are padded with "-" so
    that staggered ends are part of the checksum input.

    Returns
    -------
    str
        The checksum returned by ``cdseguid`` or ``ldseguid``.
    """
    if self.circular:
        return cdseguid(
            self.watson.upper(), self.crick.upper(), alphabet="{DNA-extended}"
        )

    # NOTE(review): the linear branch adds ",AU" to the alphabet while the
    # circular branch does not - confirm that this difference is intended.
    ovhg = self.ovhg
    len_diff = len(self.crick) - len(self.watson)
    # Multiplying "-" by a negative number gives "", so each strand is only
    # padded on the side(s) where the other strand protrudes.
    w = f"{ovhg * '-'}{self.watson}{'-' * (-ovhg + len_diff)}".upper()
    c = f"{'-' * (ovhg - len_diff)}{self.crick}{-ovhg * '-'}".upper()
    return ldseguid(w, c, alphabet="{DNA-extended},AU")
|
|
1413
1877
|
|
|
1414
1878
|
def isblunt(self) -> bool:
|
|
@@ -1449,29 +1913,113 @@ class Dseq(_Seq):
|
|
|
1449
1913
|
>>> a.isblunt()
|
|
1450
1914
|
False
|
|
1451
1915
|
"""
|
|
1452
|
-
|
|
1453
|
-
|
|
1916
|
+
parts = self.get_parts()
|
|
1917
|
+
|
|
1918
|
+
return not any(
|
|
1919
|
+
(
|
|
1920
|
+
parts.sticky_right5,
|
|
1921
|
+
parts.sticky_right3,
|
|
1922
|
+
parts.sticky_left3,
|
|
1923
|
+
parts.sticky_left5,
|
|
1924
|
+
self.circular,
|
|
1925
|
+
)
|
|
1454
1926
|
)
|
|
1455
1927
|
|
|
1456
|
-
def
|
|
1457
|
-
"""
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1928
|
+
def terminal_transferase(self, nucleotides: str = "a") -> DseqType:
    """
    Add nucleotides to the 3' ends of both strands.

    Terminal deoxynucleotidyl transferase (TdT) is a template-independent
    DNA polymerase that adds nucleotides to the 3'-OH ends of DNA, typically
    single-stranded or recessed 3' ends. In cloning, it is classically used
    to create homopolymer tails (e.g. poly-dG on a vector and poly-dC on an
    insert) so that fragments can anneal via complementary overhangs
    ("tailing" cloning).

    This activity is also present in some DNA polymerases, such as Taq
    polymerase. This property is used in the popular T/A cloning
    protocol ([#]_).

    ::

        gct           gcta
        |||    -->    |||
        cga          acga


    >>> from pydna.dseq import Dseq
    >>> a = Dseq("gct")
    >>> a
    Dseq(-3)
    gct
    cga
    >>> a.terminal_transferase()
    Dseq(-5)
     gcta
    acga
    >>> a.terminal_transferase("G")
    Dseq(-5)
     gctG
    Gcga

    Parameters
    ----------
    nucleotides : str, optional
        Nucleotide(s) appended to the 3' end of each strand.
        The default is "a".

    Returns
    -------
    DseqType
        A new Dseq object with both strands extended.

    References
    ----------
    .. [#] https://en.wikipedia.org/wiki/TA_cloning

    """
    ovhg = self.ovhg
    # The crick strand grows at the left side of the molecule, so the
    # stagger increases by the number of added nucleotides.
    # NOTE(review): the stagger is only adjusted for ovhg >= 0 although the
    # crick strand is lengthened in every case - confirm the intended result
    # for molecules with a negative ovhg.
    if self.ovhg >= 0:
        ovhg += len(nucleotides)
    return Dseq(self.watson + nucleotides, self.crick + nucleotides, ovhg)
|
|
1473
1982
|
|
|
1474
|
-
def
|
|
1983
|
+
def user(self) -> DseqType:
    """
    USER Enzyme treatment.

    USER Enzyme is a mixture of Uracil DNA glycosylase (UDG) and the
    DNA glycosylase-lyase Endonuclease VIII.

    UDG catalyses the excision of a uracil base, forming an abasic
    or apyrimidinic site (AP site). Endonuclease VIII removes the AP
    site, creating a gap in the DNA strand.

    ::

        tagaagtaggUat           tagaagtagg at
        |||||||||||||   --->    |||||||||| ||
        atcUtcatccata           atc tcatccata


    >>> a = Dseq("tagaagtaggUat", "atcUtcatccata"[::-1], 0)
    >>> a
    Dseq(-13)
    tagaagtaggUat
    atcutcatccAta
    >>> a.user()
    Dseq(-13)
    tagaagtagg at
    atc tcatccAta


    Returns
    -------
    DseqType
        DNA fragment with uracil bases removed.

    """
    # Replace the codes U/u and O/o with Z/z and E/e. Judging from the
    # example above these are the base-paired uracil codes and the codes of
    # the partner base left alone on the opposite strand.
    uracil_excision = bytes.maketrans(b"UuOo", b"ZzEe")
    gapped = self._data.translate(uracil_excision)
    return Dseq(gapped)
|
|
2021
|
+
|
|
2022
|
+
def cut(self: DseqType, *enzymes: EnzymesType) -> Tuple[DseqType, ...]:
|
|
1475
2023
|
"""Returns a list of linear Dseq fragments produced in the digestion.
|
|
1476
2024
|
If there are no cuts, an empty list is returned.
|
|
1477
2025
|
|
|
@@ -1522,11 +2070,73 @@ class Dseq(_Seq):
|
|
|
1522
2070
|
return tuple(self.apply_cut(*cs) for cs in cutsite_pairs)
|
|
1523
2071
|
|
|
1524
2072
|
def cutsite_is_valid(self, cutsite: CutSiteType) -> bool:
|
|
1525
|
-
"""
|
|
2073
|
+
"""
|
|
2074
|
+
Check if a cutsite is valid.
|
|
2075
|
+
|
|
2076
|
+
A cutsite is a nested 2-tuple with this form:
|
|
2077
|
+
|
|
2078
|
+
((cut_watson, ovhg), enz), for example ((396, -4), EcoRI)
|
|
2079
|
+
|
|
2080
|
+
The cut_watson (positive integer) is the cut position of the sequence as for example
|
|
2081
|
+
returned by the Bio.Restriction module.
|
|
2082
|
+
|
|
2083
|
+
The ovhg (overhang, positive or negative integer or 0) has the same meaning as
|
|
2084
|
+
for restriction enzymes in the Bio.Restriction module and for
|
|
2085
|
+
pydna.dseq.Dseq objects (see docstring for this module and example below)
|
|
2086
|
+
|
|
2087
|
+
Enzyme can be None.
|
|
2088
|
+
|
|
2089
|
+
::
|
|
2090
|
+
|
|
2091
|
+
Enzyme overhang
|
|
2092
|
+
|
|
2093
|
+
EcoRI -4 --GAATTC-- --G AATTC--
|
|
2094
|
+
|||||| --> | |
|
|
2095
|
+
--CTTAAG-- --CTTAA G--
|
|
2096
|
+
|
|
2097
|
+
KpnI 4 --GGTACC-- --GGTAC C--
|
|
2098
|
+
|||||| --> | |
|
|
2099
|
+
--CCATGG-- --C CATGG--
|
|
2100
|
+
|
|
2101
|
+
SmaI 0 --CCCGGG-- --CCC GGG--
|
|
2102
|
+
|||||| --> ||| |||
|
|
2103
|
+
--GGGCCC-- --GGG CCC--
|
|
2104
|
+
|
|
2105
|
+
|
|
2106
|
+
>>> from Bio.Restriction import EcoRI, KpnI, SmaI
|
|
2107
|
+
>>> EcoRI.ovhg
|
|
2108
|
+
-4
|
|
2109
|
+
>>> KpnI.ovhg
|
|
2110
|
+
4
|
|
2111
|
+
>>> SmaI.ovhg
|
|
2112
|
+
0
|
|
2113
|
+
|
|
2114
|
+
Returns False if:
|
|
2115
|
+
|
|
1526
2116
|
- Cut positions fall outside the sequence (could be moved to Biopython)
|
|
2117
|
+
TODO: example
|
|
2118
|
+
|
|
1527
2119
|
- Overhang is not double stranded
|
|
2120
|
+
TODO: example
|
|
2121
|
+
|
|
1528
2122
|
- Recognition site is not double stranded or is outside the sequence
|
|
2123
|
+
TODO: example
|
|
2124
|
+
|
|
1529
2125
|
- For enzymes that cut twice, it checks that at least one possibility is valid
|
|
2126
|
+
TODO: example
|
|
2127
|
+
|
|
2128
|
+
|
|
2129
|
+
|
|
2130
|
+
Parameters
|
|
2131
|
+
----------
|
|
2132
|
+
cutsite : CutSiteType
|
|
2133
|
+
DESCRIPTION.
|
|
2134
|
+
|
|
2135
|
+
Returns
|
|
2136
|
+
-------
|
|
2137
|
+
bool
|
|
2138
|
+
True if cutsite can cut the DNA fragment.
|
|
2139
|
+
|
|
1530
2140
|
"""
|
|
1531
2141
|
|
|
1532
2142
|
assert cutsite is not None, "cutsite is None"
|
|
@@ -1536,7 +2146,7 @@ class Dseq(_Seq):
|
|
|
1536
2146
|
|
|
1537
2147
|
# The overhang is double stranded
|
|
1538
2148
|
overhang_dseq = self[watson:crick] if ovhg < 0 else self[crick:watson]
|
|
1539
|
-
if overhang_dseq.ovhg != 0 or overhang_dseq.watson_ovhg
|
|
2149
|
+
if overhang_dseq.ovhg != 0 or overhang_dseq.watson_ovhg != 0:
|
|
1540
2150
|
return False
|
|
1541
2151
|
|
|
1542
2152
|
# The recognition site is double stranded and within the sequence
|
|
@@ -1550,7 +2160,7 @@ class Dseq(_Seq):
|
|
|
1550
2160
|
if (
|
|
1551
2161
|
len(recognition_site) == 0
|
|
1552
2162
|
or recognition_site.ovhg != 0
|
|
1553
|
-
or recognition_site.watson_ovhg
|
|
2163
|
+
or recognition_site.watson_ovhg != 0
|
|
1554
2164
|
):
|
|
1555
2165
|
if enz is None or enz.scd5 is None:
|
|
1556
2166
|
return False
|
|
@@ -1569,20 +2179,22 @@ class Dseq(_Seq):
|
|
|
1569
2179
|
if (
|
|
1570
2180
|
len(recognition_site) == 0
|
|
1571
2181
|
or recognition_site.ovhg != 0
|
|
1572
|
-
or recognition_site.watson_ovhg
|
|
2182
|
+
or recognition_site.watson_ovhg != 0
|
|
1573
2183
|
):
|
|
1574
2184
|
return False
|
|
1575
2185
|
|
|
1576
2186
|
return True
|
|
1577
2187
|
|
|
1578
|
-
def get_cutsites(self: DseqType, *enzymes: EnzymesType) ->
|
|
2188
|
+
def get_cutsites(self: DseqType, *enzymes: EnzymesType) -> List[CutSiteType]:
|
|
1579
2189
|
"""Returns a list of cutsites, represented represented as `((cut_watson, ovhg), enz)`:
|
|
1580
2190
|
|
|
1581
2191
|
- `cut_watson` is a positive integer contained in `[0,len(seq))`, where `seq` is the sequence
|
|
1582
2192
|
that will be cut. It represents the position of the cut on the watson strand, using the full
|
|
1583
2193
|
sequence as a reference. By "full sequence" I mean the one you would get from `str(Dseq)`.
|
|
2194
|
+
|
|
1584
2195
|
- `ovhg` is the overhang left after the cut. It has the same meaning as `ovhg` in
|
|
1585
2196
|
the `Bio.Restriction` enzyme objects, or pydna's `Dseq` property.
|
|
2197
|
+
|
|
1586
2198
|
- `enz` is the enzyme object. It's not necessary to perform the cut, but can be
|
|
1587
2199
|
used to keep track of which enzyme was used.
|
|
1588
2200
|
|
|
@@ -1592,7 +2204,7 @@ class Dseq(_Seq):
|
|
|
1592
2204
|
Parameters
|
|
1593
2205
|
----------
|
|
1594
2206
|
|
|
1595
|
-
enzymes : Union[
|
|
2207
|
+
enzymes : Union[RestrictionBatch,list[_AbstractCut]]
|
|
1596
2208
|
|
|
1597
2209
|
Returns
|
|
1598
2210
|
-------
|
|
@@ -1628,11 +2240,11 @@ class Dseq(_Seq):
|
|
|
1628
2240
|
|
|
1629
2241
|
"""
|
|
1630
2242
|
|
|
1631
|
-
if len(enzymes) == 1 and isinstance(enzymes[0],
|
|
2243
|
+
if len(enzymes) == 1 and isinstance(enzymes[0], RestrictionBatch):
|
|
1632
2244
|
# argument is probably a RestrictionBatch
|
|
1633
2245
|
enzymes = [e for e in enzymes[0]]
|
|
1634
2246
|
|
|
1635
|
-
enzymes =
|
|
2247
|
+
enzymes = list(dict.fromkeys(flatten(enzymes))) # remove duplicate enzymes
|
|
1636
2248
|
out = list()
|
|
1637
2249
|
for e in enzymes:
|
|
1638
2250
|
# Positions of the cut on the watson strand. They are 1-based, so we subtract
|
|
@@ -1643,7 +2255,7 @@ class Dseq(_Seq):
|
|
|
1643
2255
|
|
|
1644
2256
|
return sorted([cutsite for cutsite in out if self.cutsite_is_valid(cutsite)])
|
|
1645
2257
|
|
|
1646
|
-
def left_end_position(self) ->
|
|
2258
|
+
def left_end_position(self) -> Tuple[int, int]:
|
|
1647
2259
|
"""
|
|
1648
2260
|
The index in the full sequence of the watson and crick start positions.
|
|
1649
2261
|
|
|
@@ -1660,7 +2272,7 @@ class Dseq(_Seq):
|
|
|
1660
2272
|
return self.ovhg, 0
|
|
1661
2273
|
return 0, -self.ovhg
|
|
1662
2274
|
|
|
1663
|
-
def right_end_position(self) ->
|
|
2275
|
+
def right_end_position(self) -> Tuple[int, int]:
|
|
1664
2276
|
"""The index in the full sequence of the watson and crick end positions.
|
|
1665
2277
|
|
|
1666
2278
|
full sequence (str(self)) for all three cases is AAA
|
|
@@ -1672,13 +2284,210 @@ class Dseq(_Seq):
|
|
|
1672
2284
|
```
|
|
1673
2285
|
|
|
1674
2286
|
"""
|
|
1675
|
-
if self.watson_ovhg
|
|
1676
|
-
return len(self) + self.watson_ovhg
|
|
1677
|
-
return len(self), len(self) - self.watson_ovhg
|
|
2287
|
+
if self.watson_ovhg < 0:
|
|
2288
|
+
return len(self) + self.watson_ovhg, len(self)
|
|
2289
|
+
return len(self), len(self) - self.watson_ovhg
|
|
2290
|
+
|
|
2291
|
+
def get_ss_meltsites(
    self: DseqType, length: int
) -> tuple[list[tuple[int, int]], list[tuple[int, int]]]:
    """
    Single stranded DNA melt sites

    Two lists of 2-tuples of integers are returned, the first for the
    watson strand and the second for the crick strand. Each tuple
    (`(from, to)`) contains the start and end positions of a stretch,
    shorter than or equal to `length`, that can be released as a single
    strand.

    In the example below, the middle 2 nt part is released from the
    molecule.

    ::

        tagaa ta gtatg
        ||||| || |||||     -->   [(6,8)], []
        atcttcatccatac

        tagaagtaggtatg
        ||||| || |||||     -->   [], [(6,8)]
        atctt at catac


    The output of this method is used in the `melt_ss_dna` method in order
    to determine the start and end positions of single stranded regions.

    See get_ds_meltsites for melting ds sequences.

    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> ds = Dseq("tagaaqtaqgtatg")
    >>> ds
    Dseq(-14)
    tagaa ta gtatg
    atcttcatccatac
    >>> cutsites = ds.get_ss_meltsites(2)
    >>> cutsites
    ([(6, 8)], [])
    >>> ds[6:8]
    Dseq(-2)
    ta
    at
    >>> ds = Dseq("tagaaptapgtatg")
    >>> ds
    Dseq(-14)
    tagaagtaggtatg
    atctt at catac
    >>> cutsites = ds.get_ss_meltsites(2)
    >>> cutsites
    ([], [(6, 8)])

    Parameters
    ----------
    length : int
        Maximum length of the stretches to report; passed on to
        `regex_ss_melt_factory`.

    Returns
    -------
    tuple[list[tuple[int, int]], list[tuple[int, int]]]
        `(watson_cuts, crick_cuts)`.
    """

    regex = regex_ss_melt_factory(length)

    if self.circular:
        # Pad the template on both sides so that stretches close to the
        # origin of a circular molecule can be matched.
        spacer = length
        cutfrom = self._data[-length:] + self._data + self._data[:length]
    else:
        spacer = 0
        cutfrom = self._data

    watson_cuts = []
    crick_cuts = []

    for m in regex.finditer(cutfrom):
        # Match offsets refer to the padded template, which starts `spacer`
        # positions before self._data: subtract the padding to get sequence
        # coordinates (same convention as get_ds_meltsites).
        # NOTE(review): matches that span the origin are not normalised and
        # may give coordinates outside 0..len(self).
        cutpair = (m.start() - spacer, m.end() - spacer)

        if m.lastgroup == "watson":
            watson_cuts.append(cutpair)
        else:
            assert m.lastgroup == "crick"
            crick_cuts.append(cutpair)

    return watson_cuts, crick_cuts
|
|
2371
|
+
|
|
2372
|
+
def get_ds_meltsites(self: DseqType, length: int) -> List[CutSiteType]:
    """
    Double stranded DNA melt sites

    DNA molecules can fall apart by melting if they have internal single
    stranded regions. In the example below, the molecule has two gaps
    on opposite strands, two nucleotides apart, which means that it is
    held together by two basepairs only.

    This molecule can melt into two separate double stranded molecules,
    each with a 3 nt 3' overhang, as depicted below.

    ::

        tagaagta gtatg           tagaagta      gtatg
        ||||| || |||||    -->    |||||         |||||
        atctt atccatac           atctt      atccatac


    A list of 2-tuples is returned. Each tuple (`((cut_watson, ovhg), None)`)
    contains the cut position and the overhang value in the same format as
    returned by the get_cutsites method for restriction enzymes.

    Note that this function deals with melting that results in two double
    stranded DNA molecules.

    See get_ss_meltsites for melting of single stranded regions from
    molecules.

    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> ds = Dseq("tagaaptaqgtatg")
    >>> ds
    Dseq(-14)
    tagaagta gtatg
    atctt atccatac
    >>> cutsite = ds.get_ds_meltsites(2)
    >>> cutsite
    [((8, 2), None)]

    Parameters
    ----------
    length : int
        Passed on to `regex_ds_melt_factory`. An empty tuple is returned
        for values below 1.

    Returns
    -------
    List[CutSiteType]
        One `((cut_watson, ovhg), None)` entry per match.
    """

    if length < 1:
        return tuple()

    pattern = regex_ds_melt_factory(length)

    # Circular molecules are padded on both sides so that matches close to
    # the origin can be found; `offset` converts back to sequence positions.
    if self.circular:
        offset = length
        template = self._data[-length:] + self._data + self._data[:length]
    else:
        offset = 0
        template = self._data

    sites = []

    for hit in pattern.finditer(template):
        start, end = hit.span()
        if hit.lastgroup == "watson":
            # Cut at the end of the match, positive overhang.
            site = (end - offset, end - start), None
        else:
            assert hit.lastgroup == "crick"
            # Cut at the start of the match, negative overhang.
            site = (start - offset, start - end), None
        sites.append(site)

    return sites
|
|
2440
|
+
|
|
2441
|
+
def cast_to_ds_right(self):
    """
    Make the right end of the molecule double stranded.

    A sticky right end, on either strand, is converted to double stranded
    sequence. The left end and the middle part are kept as they are, and
    the result is always linear.

    ::

        NNNN               NNNNGATC
        ||||       -->     ||||||||
        NNNNCTAG           NNNNCTAG


        NNNNGATC           NNNNGATC
        ||||       -->     ||||||||
        NNNN               NNNNCTAG
    """
    parts = self.get_parts()

    left_end = parts.sticky_left5 or parts.sticky_left3
    right_end = parts.sticky_right5 or parts.sticky_right3
    # Convert the single stranded codes of the right end to double stranded.
    filled_right = right_end.translate(dscode_to_full_sequence_table)

    return self.__class__(left_end + parts.middle + filled_right, circular=False)
|
|
2462
|
+
|
|
2463
|
+
def cast_to_ds(self):
    """Sequentially call cast_to_ds_left and cast_to_ds_right."""
    left_filled = self.cast_to_ds_left()
    return left_filled.cast_to_ds_right()
|
|
2466
|
+
|
|
2467
|
+
def cast_to_ds_left(self):
    """
    Make the left end of the molecule double stranded.

    A sticky left end, on either strand, is converted to double stranded
    sequence. The middle part and the right end are kept as they are, and
    the result is always linear.

    ::

        GATCNNNN           GATCNNNN
            ||||   -->     ||||||||
            NNNN           CTAGNNNN

            NNNN           GATCNNNN
            ||||   -->     ||||||||
        CTAGNNNN           CTAGNNNN
    """
    parts = self.get_parts()

    left_end = parts.sticky_left5 or parts.sticky_left3
    right_end = parts.sticky_right5 or parts.sticky_right3
    # Convert the single stranded codes of the left end to double stranded.
    filled_left = left_end.translate(dscode_to_full_sequence_table)

    return self.__class__(filled_left + parts.middle + right_end, circular=False)
|
|
1678
2487
|
|
|
1679
2488
|
def get_cut_parameters(
|
|
1680
|
-
self, cut:
|
|
1681
|
-
) ->
|
|
2489
|
+
self, cut: Union[CutSiteType, None], is_left: bool
|
|
2490
|
+
) -> Tuple[int, int, int]:
|
|
1682
2491
|
"""For a given cut expressed as ((cut_watson, ovhg), enz), returns
|
|
1683
2492
|
a tuple (cut_watson, cut_crick, ovhg).
|
|
1684
2493
|
|
|
@@ -1703,7 +2512,169 @@ class Dseq(_Seq):
|
|
|
1703
2512
|
if is_left:
|
|
1704
2513
|
return *self.left_end_position(), self.ovhg
|
|
1705
2514
|
# In the right end, the overhang does not matter
|
|
1706
|
-
return *self.right_end_position(), self.watson_ovhg
|
|
2515
|
+
return *self.right_end_position(), self.watson_ovhg
|
|
2516
|
+
|
|
2517
|
+
def melt(self, length):
    """
    Melt the molecule into the parts that separate at a given length.

    Single strands found by `melt_ss_dna` are released first. The
    remaining molecule is then split at the sites reported by
    `get_ds_meltsites`.

    Parameters
    ----------
    length : int
        Passed on to `melt_ss_dna` and `get_ds_meltsites`. A falsy value
        or a value below 1 gives an empty tuple.

    Returns
    -------
    tuple
        The released single strands followed by the double stranded
        fragments. If strands were released but the remaining molecule
        was not split, the remaining molecule is the last element. An
        empty tuple is returned if nothing melts.

    """
    if not length or length < 1:
        return tuple()

    # First get rid of the single stranded sequences.
    remainder, shed_strands = self.melt_ss_dna(length)

    meltsites = remainder.get_ds_meltsites(length)
    # NOTE(review): the pairs are computed by `self`, not by `remainder` -
    # presumably equivalent because shedding does not change the length.
    fragments = tuple(
        remainder.apply_cut(*pair) for pair in self.get_cutsite_pairs(meltsites)
    )

    if shed_strands and not fragments:
        # Strands were released but the rest stays in one piece.
        fragments = (remainder,)

    return tuple(shed_strands) + fragments
|
|
2547
|
+
|
|
2548
|
+
def melt_ss_dna(self, length) -> tuple["Dseq", list["Dseq"]]:
    """
    Melt to separate single stranded DNA

    Single stranded DNA molecules shorter than or equal to `length` are
    shed from a double stranded DNA molecule without affecting the length
    of the remaining molecule.

    In the examples below, the middle 2 nt part is released from the
    molecule.

    ::

        tagaa ta gtatg           tagaa    gtatg     ta
        ||||| || |||||    -->    |||||    |||||  +  ||
        atcttcatccatac           atcttcatccatac

        tagaagtaggtatg           tagaagtaggtatg
        ||||| || |||||    -->    |||||    |||||  +  ||
        atctt at catac           atctt    catac     at


    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> ds = Dseq("tagaaqtaqgtatg")
    >>> ds
    Dseq(-14)
    tagaa ta gtatg
    atcttcatccatac
    >>> new, strands = ds.melt_ss_dna(2)
    >>> new
    Dseq(-14)
    tagaa    gtatg
    atcttcatccatac
    >>> strands[0]
    Dseq(-2)
    ta
    <BLANKLINE>
    >>> ds = Dseq("tagaaptapgtatg")
    >>> ds
    Dseq(-14)
    tagaagtaggtatg
    atctt at catac
    >>> new, strands = ds.melt_ss_dna(2)
    >>> new
    Dseq(-14)
    tagaagtaggtatg
    atctt    catac
    >>> strands[0]
    Dseq(-2)
    <BLANKLINE>
    at

    Parameters
    ----------
    length : int
        Passed on to `get_ss_meltsites`.

    Returns
    -------
    tuple[Dseq, list[Dseq]]
        The remaining molecule and the list of released strands, as
        returned by `shed_ss_dna`.
    """

    # Locate the stretches on each strand, then remove them.
    watson_sites, crick_sites = self.get_ss_meltsites(length)
    remainder, released = self.shed_ss_dna(watson_sites, crick_sites)
    return remainder, released
|
|
2608
|
+
|
|
2609
|
+
def shed_ss_dna(
    self,
    watson_cutpairs: Union[list[tuple[int, int]], None] = None,
    crick_cutpairs: Union[list[tuple[int, int]], None] = None,
) -> tuple["Dseq", list["Dseq"]]:
    """
    Separate parts of one of the DNA strands

    Each `(start, end)` pair is a slice of the sequence. The slice is
    removed from the watson or crick strand of a copy of the molecule and
    returned as a separate single stranded Dseq object.

    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> ds = Dseq("tagaagtaggtatg")
    >>> ds
    Dseq(-14)
    tagaagtaggtatg
    atcttcatccatac
    >>> new, strands = ds.shed_ss_dna([(6, 8)],[])
    >>> new
    Dseq(-14)
    tagaag  ggtatg
    atcttcatccatac
    >>> strands[0]
    Dseq(-2)
    ta
    <BLANKLINE>
    >>> new, strands = ds.shed_ss_dna([],[(6, 8)])
    >>> new
    Dseq(-14)
    tagaagtaggtatg
    atcttc  ccatac
    >>> strands[0]
    Dseq(-2)
    <BLANKLINE>
    at
    >>> ds = Dseq("tagaagtaggtatg")
    >>> new, (strand1, strand2) = ds.shed_ss_dna([(6, 8), (9, 11)],[])
    >>> new
    Dseq(-14)
    tagaag  g  atg
    atcttcatccatac
    >>> strand1
    Dseq(-2)
    ta
    <BLANKLINE>
    >>> strand2
    Dseq(-2)
    gt
    <BLANKLINE>

    Parameters
    ----------
    watson_cutpairs : list[tuple[int, int]], optional
        Slices to remove from the watson strand. The default is None,
        meaning no slices.
    crick_cutpairs : list[tuple[int, int]], optional
        Slices to remove from the crick strand. The default is None,
        meaning no slices.

    Returns
    -------
    tuple[Dseq, list[Dseq]]
        The remaining molecule and the released strands, watson strands
        first, each group in the order given.
    """

    watson_cutpairs = watson_cutpairs or list()
    crick_cutpairs = crick_cutpairs or list()
    strands = []

    # Work on a mutable copy; self._data is left untouched.
    new = bytearray(self._data)

    for x, y in watson_cutpairs:
        stuffer = new[x:y]  # slicing a bytearray gives an independent copy
        # The released strand keeps the watson part of the slice ...
        strands.append(Dseq.quick(stuffer.translate(dscode_to_watson_tail_table)))
        # ... and the molecule keeps the crick part.
        new[x:y] = stuffer.translate(dscode_to_crick_tail_table)

    for x, y in crick_cutpairs:
        stuffer = new[x:y]
        # The released strand keeps the crick part of the slice ...
        strands.append(Dseq.quick(stuffer.translate(dscode_to_crick_tail_table)))
        # ... and the molecule keeps the watson part.
        new[x:y] = stuffer.translate(dscode_to_watson_tail_table)

    return Dseq.quick(new), strands
|
|
1707
2678
|
|
|
1708
2679
|
def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq":
|
|
1709
2680
|
"""Extracts a subfragment of the sequence between two cuts.
|
|
@@ -1760,25 +2731,22 @@ class Dseq(_Seq):
|
|
|
1760
2731
|
GttCTTAA
|
|
1761
2732
|
|
|
1762
2733
|
"""
|
|
1763
|
-
if
|
|
2734
|
+
if cuts_overlap(left_cut, right_cut, len(self)):
|
|
1764
2735
|
raise ValueError("Cuts by {} {} overlap.".format(left_cut[1], right_cut[1]))
|
|
1765
2736
|
|
|
1766
2737
|
left_watson, left_crick, ovhg_left = self.get_cut_parameters(left_cut, True)
|
|
1767
2738
|
right_watson, right_crick, _ = self.get_cut_parameters(right_cut, False)
|
|
1768
2739
|
return Dseq(
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
len(self) - right_crick : len(self) - left_crick
|
|
1774
|
-
]
|
|
1775
|
-
),
|
|
2740
|
+
self[left_watson:right_watson]._data.translate(dscode_to_watson_table),
|
|
2741
|
+
self[left_crick:right_crick]
|
|
2742
|
+
.reverse_complement()
|
|
2743
|
+
._data.translate(dscode_to_watson_table),
|
|
1776
2744
|
ovhg=ovhg_left,
|
|
1777
2745
|
)
|
|
1778
2746
|
|
|
1779
2747
|
def get_cutsite_pairs(
|
|
1780
|
-
self, cutsites:
|
|
1781
|
-
) ->
|
|
2748
|
+
self, cutsites: List[CutSiteType]
|
|
2749
|
+
) -> List[Tuple[Union[None, CutSiteType], Union[None, CutSiteType]]]:
|
|
1782
2750
|
"""Returns pairs of cutsites that render the edges of the resulting fragments.
|
|
1783
2751
|
|
|
1784
2752
|
A fragment produced by restriction is represented by a tuple of length 2 that
|
|
@@ -1828,3 +2796,105 @@ class Dseq(_Seq):
|
|
|
1828
2796
|
cutsites.append(cutsites[0])
|
|
1829
2797
|
|
|
1830
2798
|
return list(zip(cutsites, cutsites[1:]))
|
|
2799
|
+
|
|
2800
|
+
def get_parts(self):
    """
    Split this sequence's dscode into its structural parts.

    The dscode held in ``self._data`` is decoded as ASCII and passed to the
    module-level ``get_parts`` function; whatever that function returns is
    returned unchanged. According to the example below, the result is a
    ``DseqParts`` instance with seven string fields, in this order:

    ====== =================
    index  field name
    ====== =================
    0      ``sticky_left5``
    1      ``sticky_left3``
    2      ``middle``
    3      ``sticky_right3``
    4      ``sticky_right5``
    5      ``single_watson``
    6      ``single_crick``
    ====== =================

    The fields map onto a molecule as follows:

    ::

         "sticky_left5"
         |
         |      "sticky_right5"
         |      |
        ---    ---
        GGGATCC
           TAGGTCA
           ----
             |
             "middle"


         "sticky_left3"
         |
         |      "sticky_right3"
         |      |
        ---    ---
           ATCCAGT
        CCCTAGG
           ----
             |
             "middle"


        "single_watson" (only an upper strand)
         |
        -------
        ATCCAGT
        |||||||


        "single_crick" (only a lower strand)
         |
        -------

        |||||||
        CCCTAGG

    Some fields are mutually exclusive, so at least one field of each group
    below is always an empty string:

    * 0 or 1, not both: the left end has either a 5' or a 3' sticky end.
    * 2 or 5 or 6: the molecule either has a double stranded region or is
      entirely single stranded (watson only or crick only).
    * 3 or 4, not both: the right end has either a 5' or a 3' sticky end.

    Internal single stranded regions are not identified separately; if
    present, they are contained in ``middle``.

    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> ds = Dseq("PPPATCFQZ")
    >>> ds
    Dseq(-9)
    GGGATC
       TAGTCA
    >>> parts = ds.get_parts()
    >>> parts
    DseqParts(sticky_left5='PPP', sticky_left3='', middle='ATC', sticky_right3='', sticky_right5='FQZ', single_watson='', single_crick='')
    >>> Dseq(parts.sticky_left5)
    Dseq(-3)
    GGG
    <BLANKLINE>
    >>> Dseq(parts.middle)
    Dseq(-3)
    ATC
    TAG
    >>> Dseq(parts.sticky_right5)
    Dseq(-3)
    <BLANKLINE>
    TCA

    Returns
    -------
    namedtuple
        ``DseqParts`` with seven string fields describing the DNA molecule:
        DseqParts(sticky_left5='', sticky_left3='',
                  middle='',
                  sticky_right3='', sticky_right5='',
                  single_watson='', single_crick='')

    """
    # The parsing helper works on text, so decode the stored bytes first.
    dscode = self._data.decode("ascii")
    # Inside a method body the bare name ``get_parts`` does not see the class
    # scope, so this calls the module-level function, not this method.
    return get_parts(dscode)
|