pydna 5.5.4__py3-none-any.whl → 5.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydna/dseq.py CHANGED
@@ -1,10 +1,6 @@
1
1
  #!/usr/bin/env python3
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2013-2023 by Björn Johansson. All rights reserved.
5
- # This code is part of the Python-dna distribution and governed by its
6
- # license. Please see the LICENSE.txt file that should have been included
7
- # as part of this package.
8
4
  """Provides the Dseq class for handling double stranded DNA sequences.
9
5
 
10
6
  Dseq is a subclass of :class:`Bio.Seq.Seq`. The Dseq class
@@ -14,87 +10,217 @@ which can hold more meta data.
14
10
  The Dseq class supports the notion of circular and linear DNA topology.
15
11
  """
16
12
 
13
+ import itertools
14
+ import re
15
+ import copy
16
+ import sys
17
+ import math
18
+ import inspect
19
+ from typing import List, Tuple, Union
17
20
 
18
- import copy as _copy
19
- import itertools as _itertools
20
- import re as _re
21
- import sys as _sys
22
- import math as _math
21
+ from Bio.Restriction import RestrictionBatch
22
+ from Bio.Restriction import CommOnly
23
+
24
+ from seguid import ldseguid
25
+ from seguid import cdseguid
26
+
27
+ from pydna.seq import Seq
28
+ from Bio.Seq import _SeqAbstractBaseClass
29
+ from Bio.Data.IUPACData import unambiguous_dna_weights
30
+ from Bio.Data.IUPACData import unambiguous_rna_weights
31
+ from Bio.Data.IUPACData import atom_weights
32
+ from pydna._pretty import pretty_str
33
+ from pydna.utils import rc
34
+ from pydna.utils import flatten
35
+ from pydna.utils import cuts_overlap
36
+
37
+ from pydna.alphabet import basepair_dict
38
+ from pydna.alphabet import dscode_to_watson_table
39
+ from pydna.alphabet import dscode_to_crick_table
40
+ from pydna.alphabet import regex_ds_melt_factory
41
+ from pydna.alphabet import regex_ss_melt_factory
42
+ from pydna.alphabet import dscode_to_full_sequence_table
43
+ from pydna.alphabet import dscode_to_watson_tail_table
44
+ from pydna.alphabet import dscode_to_crick_tail_table
45
+ from pydna.alphabet import complement_table_for_dscode
46
+ from pydna.alphabet import letters_not_in_dscode
47
+ from pydna.alphabet import get_parts
48
+ from pydna.alphabet import representation_tuple
49
+ from pydna.alphabet import dsbreaks
50
+
51
+ from pydna.common_sub_strings import common_sub_strings
52
+ from pydna.types import DseqType, EnzymesType, CutSiteType
53
+
54
+
55
+ # Sequences larger than this get a truncated representation.
56
+ length_limit_for_repr = 30
57
+ placeholder = letters_not_in_dscode[-1]
58
+
59
+
60
+ class CircularBytes(bytes):
61
+ """
62
+ A circular bytes sequence: indexing and slicing wrap around index 0.
63
+ """
23
64
 
24
- from pydna.seq import Seq as _Seq
25
- from Bio.Seq import _translate_str, _SeqAbstractBaseClass
65
+ def __new__(cls, value: bytes | bytearray | memoryview):
66
+ return super().__new__(cls, bytes(value))
67
+
68
+ def __getitem__(self, key):
69
+ n = len(self)
70
+ if n == 0:
71
+ if isinstance(key, slice):
72
+ return self.__class__(b"")
73
+ raise IndexError("CircularBytes index out of range (empty bytes)")
74
+
75
+ if isinstance(key, int):
76
+ return super().__getitem__(key % n)
77
+
78
+ if isinstance(key, slice):
79
+ start, stop, step = key.start, key.stop, key.step
80
+ step = 1 if step is None else step
81
+ if step == 0:
82
+ raise ValueError("slice step cannot be zero")
83
+
84
+ if step > 0:
85
+ start = 0 if start is None else start
86
+ stop = n if stop is None else stop
87
+ while stop <= start:
88
+ stop += n
89
+ rng = range(start, stop, step)
90
+ else:
91
+ start = (n - 1) if start is None else start
92
+ stop = -1 if stop is None else stop
93
+ while stop >= start:
94
+ stop -= n
95
+ rng = range(start, stop, step)
96
+
97
+ limit = n if step % n == 0 else n * 2
98
+ out = bytearray()
99
+ count = 0
100
+ for i in rng:
101
+ out.append(super().__getitem__(i % n))
102
+ count += 1
103
+ if count > limit:
104
+ break
105
+ return self.__class__(bytes(out))
26
106
 
27
- from pydna._pretty import pretty_str as _pretty_str
28
- from seguid import ldseguid as _ldseguid
29
- from seguid import cdseguid as _cdseguid
107
+ return super().__getitem__(key)
30
108
 
31
- from pydna.utils import rc as _rc
32
- from pydna.utils import flatten as _flatten
33
- from pydna.utils import cuts_overlap as _cuts_overlap
109
+ def cutaround(self, start: int, length: int) -> bytes:
110
+ """
111
+ Return a circular slice of given length starting at index `start`.
112
+ Can exceed len(self), wrapping around as needed.
34
113
 
35
- from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
36
- from Bio.Restriction import RestrictionBatch as _RestrictionBatch
37
- from Bio.Restriction import CommOnly
114
+ Examples
115
+ --------
116
+ s = CircularBytes(b"ABCDE")
117
+ assert s.cutaround(3, 7) == b"DEABCDE"
118
+ assert s.cutaround(-1, 4) == b"EABC"
119
+ """
120
+ n = len(self)
121
+ if n == 0 or length <= 0:
122
+ return self.__class__(b"")
123
+
124
+ start %= n
125
+ out = bytearray()
126
+ for i in range(length):
127
+ out.append(self[(start + i) % n])
128
+ return self.__class__(bytes(out))
129
+
130
+ def find(
131
+ self,
132
+ sub: bytes | bytearray | memoryview | str,
133
+ start: int = 0,
134
+ end: int | None = None,
135
+ ) -> int:
136
+ """
137
+ Find a subsequence in the circular sequence, possibly
138
+ wrapping across the origin.
139
+ Returns -1 if not found.
140
+ """
141
+ n = len(self)
142
+ if n == 0:
143
+ return -1
144
+
145
+ end = n if end is None else min(end, n)
146
+ doubled = self + self
147
+ try:
148
+ sub = sub.encode("ascii")
149
+ except AttributeError:
150
+ pass
151
+
152
+ pos = doubled.find(bytes(sub), start, n + len(sub) - 1)
153
+
154
+ if pos == -1 or pos >= n:
155
+ return -1
156
+ return pos
38
157
 
39
158
 
40
- from .types import DseqType, EnzymesType, CutSiteType
159
+ class Dseq(Seq):
160
+ """Dseq describes a double stranded DNA fragment, linear or circular.
41
161
 
42
- from typing import List as _List, Tuple as _Tuple, Union as _Union
162
+ Dseq can be initiated in two ways, using two strings, each representing the
163
+ Watson (upper, sense) strand, the Crick (lower, antisense) strand and an
164
+ optional value describing the stagger between the strands on the left side (ovhg).
43
165
 
166
+ Alternatively, a single string representation using dsIUPAC codes can be used.
167
+ If a single string is used, the letters of that string are interpreted as base
168
+ pairs rather than single bases. For example "A" would indicate the basepair
169
+ "A/T". An expanded IUPAC code is used where the letters PEXI have been assigned
170
+ to GATC on the Watson strand with no pairing base on the Crick strand G/"", A/"",
171
+ T/"" and C/"". The letters QFZJ have been assigned the opposite base pairs with
172
+ an empty Watson strand ""/G, ""/A, ""/T, and ""/C.
173
+
174
+ ::
175
+
176
+ PEXIGATCQFZJ would indicate the linear double-stranded fragment:
177
+
178
+ GATCGATC
179
+ CTAGCTAG
44
180
 
45
- class Dseq(_Seq):
46
- """Dseq holds information for a double stranded DNA fragment.
47
181
 
48
- Dseq also holds information describing the topology of
49
- the DNA fragment (linear or circular).
50
182
 
51
183
  Parameters
52
184
  ----------
53
185
  watson : str
54
- a string representing the watson (sense) DNA strand.
186
+ a string representing the Watson (sense) DNA strand or a basepair
187
+ representation.
55
188
 
56
189
  crick : str, optional
57
- a string representing the crick (antisense) DNA strand.
190
+ a string representing the Crick (antisense) DNA strand.
58
191
 
59
192
  ovhg : int, optional
60
193
  A positive or negative number to describe the stagger between the
61
- watson and crick strands.
194
+ Watson and Crick strands.
62
195
  see below for a detailed explanation.
63
196
 
64
- linear : bool, optional
65
- True indicates that sequence is linear, False that it is circular.
66
-
67
197
  circular : bool, optional
68
198
  True indicates that sequence is circular, False that it is linear.
69
199
 
70
200
 
71
201
  Examples
72
202
  --------
73
- Dseq is a subclass of the Biopython Seq object. It stores two
74
- strings representing the watson (sense) and crick(antisense) strands.
75
- two properties called linear and circular, and a numeric value ovhg
76
- (overhang) describing the stagger for the watson and crick strand
77
- in the 5' end of the fragment.
203
+ Dseq is a subclass of the Biopython Bio.Seq.Seq class. The constructor
204
+ can accept two strings representing the Watson (sense) and Crick (antisense)
205
+ DNA strands. These are interpreted as single stranded DNA. There is a check
206
+ for complementarity between the strands.
78
207
 
79
- The most common usage is probably to create a Dseq object as a
80
- part of a Dseqrecord object (see :class:`pydna.dseqrecord.Dseqrecord`).
81
-
82
- There are three ways of creating a Dseq object directly listed below, but you can also
83
- use the function Dseq.from_full_sequence_and_overhangs() to create a Dseq:
208
+ If the DNA molecule is staggered on the left side, an integer ovhg
209
+ (overhang) must be given, describing the stagger between the Watson and Crick strand
210
+ in the 5' end of the fragment.
84
211
 
85
- Only one argument (string):
212
+ Additionally, the optional boolean parameter circular can be given to indicate if the
213
+ DNA molecule is circular.
86
214
 
87
- >>> from pydna.dseq import Dseq
88
- >>> Dseq("aaa")
89
- Dseq(-3)
90
- aaa
91
- ttt
215
+ The most common usage of the Dseq class is probably not to use it directly, but to
216
+ create it as part of a Dseqrecord object (see :class:`pydna.dseqrecord.Dseqrecord`).
217
+ This works in the same way as for the relationship between the :class:`Bio.Seq.Seq` and
218
+ :class:`Bio.SeqRecord.SeqRecord` classes in Biopython.
92
219
 
93
- The given string will be interpreted as the watson strand of a
94
- blunt, linear double stranded sequence object. The crick strand
95
- is created automatically from the watson strand.
220
+ There are multiple ways of creating a Dseq object directly listed below, but you can also
221
+ use the function Dseq.from_full_sequence_and_overhangs() to create a Dseq:
96
222
 
97
- Two arguments (string, string):
223
+ Two arguments (string, string), no overhang provided:
98
224
 
99
225
  >>> from pydna.dseq import Dseq
100
226
  >>> Dseq("gggaaat","ttt")
@@ -102,16 +228,14 @@ class Dseq(_Seq):
102
228
  gggaaat
103
229
  ttt
104
230
 
105
- If both watson and crick are given, but not ovhg an attempt
106
- will be made to find the best annealing between the strands.
107
- There are limitations to this. For long fragments it is quite
108
- slow. The length of the annealing sequences have to be at least
109
- half the length of the shortest of the strands.
231
+ If Watson and Crick are given, but not ovhg, an attempt will be made to find the best annealing
232
+ between the strands. There are important limitations to this. If there are several ways to
233
+ anneal the strands, this will fail. For long fragments it is quite slow.
110
234
 
111
235
  Three arguments (string, string, ovhg=int):
112
236
 
113
- The ovhg parameter is an integer describing the length of the
114
- crick strand overhang in the 5' end of the molecule.
237
+ The ovhg parameter is an integer describing the length of the Crick strand overhang on the
238
+ left side (the 5' end of Watson strand).
115
239
 
116
240
  The ovhg parameter controls the stagger at the five prime end::
117
241
 
@@ -134,53 +258,51 @@ class Dseq(_Seq):
134
258
 
135
259
  Example of creating Dseq objects with different amounts of stagger:
136
260
 
137
- >>> Dseq(watson="agt", crick="actta", ovhg=-2)
261
+ >>> Dseq(watson="att", crick="acata", ovhg=-2)
138
262
  Dseq(-7)
139
- agt
140
- attca
141
- >>> Dseq(watson="agt",crick="actta",ovhg=-1)
263
+ att
264
+ ataca
265
+ >>> Dseq(watson="ata",crick="acata",ovhg=-1)
142
266
  Dseq(-6)
143
- agt
144
- attca
145
- >>> Dseq(watson="agt",crick="actta",ovhg=0)
267
+ ata
268
+ ataca
269
+ >>> Dseq(watson="taa",crick="actta",ovhg=0)
146
270
  Dseq(-5)
147
- agt
271
+ taa
148
272
  attca
149
- >>> Dseq(watson="agt",crick="actta",ovhg=1)
273
+ >>> Dseq(watson="aag",crick="actta",ovhg=1)
150
274
  Dseq(-5)
151
- agt
275
+ aag
152
276
  attca
153
277
  >>> Dseq(watson="agt",crick="actta",ovhg=2)
154
278
  Dseq(-5)
155
279
  agt
156
280
  attca
157
281
 
158
- If the ovhg parameter is specified a crick strand also
159
- needs to be supplied, otherwise an exception is raised.
282
+ If the ovhg parameter is specified a Crick strand also needs to be supplied, or
283
+ an exception is raised.
160
284
 
161
285
  >>> Dseq(watson="agt", ovhg=2)
162
286
  Traceback (most recent call last):
163
- File "<stdin>", line 1, in <module>
164
- File "/usr/local/lib/python2.7/dist-packages/pydna_/dsdna.py", line 169, in __init__
165
- else:
166
- ValueError: ovhg defined without crick strand!
167
-
287
+ ...
288
+ ValueError: ovhg (overhang) defined without a crick strand.
168
289
 
169
- The shape of the fragment is set by circular = True, False
170
290
 
171
- Note that both ends of the DNA fragment has to be compatible to set
172
- circular = True.
291
+ The shape or topology of the fragment is set by the circular parameter, True or False (default).
173
292
 
174
-
175
- >>> Dseq("aaa","ttt")
293
+ >>> Dseq("aaa", "ttt", ovhg = 0) # A linear sequence by default
176
294
  Dseq(-3)
177
295
  aaa
178
296
  ttt
179
- >>> Dseq("aaa","ttt",ovhg=0)
297
+ >>> Dseq("aaa", "ttt", ovhg = 0, circular = False) # A linear sequence if circular is False
180
298
  Dseq(-3)
181
299
  aaa
182
300
  ttt
183
- >>> Dseq("aaa","ttt",ovhg=1)
301
+ >>> Dseq("aaa", "ttt", ovhg = 0, circular = True) # A circular sequence
302
+ Dseq(o3)
303
+ aaa
304
+ ttt
305
+ >>> Dseq("aaa", "ttt", ovhg=1, circular = False)
184
306
  Dseq(-4)
185
307
  aaa
186
308
  ttt
@@ -210,6 +332,18 @@ class Dseq(_Seq):
210
332
  -4
211
333
  >>>
212
334
 
335
+
336
+ dsIUPAC [#]_ is an extension to the IUPAC alphabet used to describe ss regions:
337
+
338
+ ::
339
+
340
+ aaaGATC GATCccc ad-hoc representations
341
+ CTAGttt gggCTAG
342
+
343
+ QFZJaaaPEXI PEXIcccQFZJ dsIUPAC
344
+
345
+
346
+
213
347
  Coercing to string
214
348
 
215
349
  >>> str(a)
@@ -295,46 +429,76 @@ class Dseq(_Seq):
295
429
 
296
430
  """
297
431
 
298
- trunc = 30
299
-
300
432
  def __init__(
301
433
  self,
302
- watson: _Union[str, bytes],
303
- crick: _Union[str, bytes, None] = None,
434
+ watson: Union[str, bytes],
435
+ crick: Union[str, bytes, None] = None,
304
436
  ovhg=None,
305
437
  circular=False,
306
438
  pos=0,
307
439
  ):
308
- if isinstance(watson, bytes):
309
- watson = watson.decode("ASCII")
310
- if isinstance(crick, bytes):
311
- crick = crick.decode("ASCII")
440
+ if isinstance(watson, (bytes, bytearray)):
441
+ # watson is decoded to a string if needed.
442
+ watson = watson.decode("ascii")
443
+ if isinstance(crick, (bytes, bytearray)):
444
+ # crick is decoded to a string if needed.
445
+ crick = crick.decode("ascii")
312
446
 
313
447
  if crick is None:
314
448
  if ovhg is not None:
315
- raise ValueError("ovhg defined without crick strand!")
316
- crick = _rc(watson)
317
- ovhg = 0
318
- self._data = bytes(watson, encoding="ASCII")
449
+ raise ValueError("ovhg (overhang) defined without a crick strand.")
450
+ """
451
+ Giving only the watson string implies inferring the Crick complementary strand
452
+ from the Watson sequence. The watson string can contain dscode letters which will
453
+ be interpreted as outlined in the pydna.alphabet module.
454
+
455
+ The _data property must be a byte string for compatibility with
456
+ Biopython Bio.Seq.Seq
457
+ """
458
+ data = watson
459
+ self._data = data.encode("ascii")
319
460
 
320
- else: # crick strand given
321
- if ovhg is None: # ovhg not given
322
- olaps = _common_sub_strings(
461
+ else:
462
+ """
463
+ Crick strand given, ovhg is optional. An important consequence is that the
464
+ watson and crick strands are interpreted as single stranded DNA that is
465
+ supposed to anneal.
466
+
467
+ If ovhg was not given, we try to guess the value below. This will fail
468
+ if there are two or more ways to anneal with equal length of the double
469
+ stranded part.
470
+ """
471
+ if ovhg is None: # ovhg not given, try to guess from sequences
472
+ limit = int(math.log(len(watson)) / math.log(4))
473
+ olaps = common_sub_strings(
323
474
  str(watson).lower(),
324
- str(_rc(crick).lower()),
325
- int(_math.log(len(watson)) / _math.log(4)),
475
+ str(rc(crick).lower()),
476
+ limit,
326
477
  )
478
+
479
+ """No overlaps found, strands do not anneal"""
327
480
  if len(olaps) == 0:
328
481
  raise ValueError(
329
- "Could not anneal the two strands." " Please provide ovhg value"
482
+ "Could not anneal the two strands."
483
+ f" looked for annealing with at least {limit} basepairs"
484
+ " Please provide and overhang value (ovhg parameter)"
330
485
  )
331
486
 
332
- # We extract the positions and length of the first (longest) overlap, since
333
- # common_sub_strings sorts the overlaps by length.
334
- pos_watson, pos_crick, longest_olap_length = olaps[0]
487
+ """
488
+ We extract the positions and length of the first (longest) overlap,
489
+ since common_sub_strings sorts the overlaps by length, longest first.
490
+ """
335
491
 
336
- # We see if there is another overlap of the same length
337
- if any(olap[2] >= longest_olap_length for olap in olaps[1:]):
492
+ (pos_watson, pos_crick, longest_olap_length), *rest = olaps
493
+
494
+ """
495
+ We see if there is another overlap of the same length
496
+ This means that annealing is ambiguous. User should provide
497
+ an ovhg value.
498
+ """
499
+ if any(
500
+ olap_length >= longest_olap_length for _, _, olap_length in rest
501
+ ):
338
502
  raise ValueError(
339
503
  "More than one way of annealing the"
340
504
  " strands. Please provide ovhg value"
@@ -342,120 +506,80 @@ class Dseq(_Seq):
342
506
 
343
507
  ovhg = pos_crick - pos_watson
344
508
 
345
- sns = (ovhg * " ") + _pretty_str(watson)
346
- asn = (-ovhg * " ") + _pretty_str(_rc(crick))
347
-
348
- self._data = bytes(
349
- "".join(
350
- [
351
- a.strip() or b.strip()
352
- for a, b in _itertools.zip_longest(sns, asn, fillvalue=" ")
353
- ]
354
- ),
355
- encoding="ASCII",
356
- )
509
+ """
510
+ Pad both strands on left side ovhg spaces
511
+ a negative number gives no padding,
512
+ """
513
+ sense = ovhg * " " + watson
514
+ antisense = -ovhg * " " + crick[::-1]
515
+
516
+ max_len = max(len(sense), len(antisense))
357
517
 
358
- else: # ovhg given
359
- if ovhg == 0:
360
- if len(watson) >= len(crick):
361
- self._data = bytes(watson, encoding="ASCII")
362
- else:
363
- self._data = bytes(
364
- watson + _rc(crick[: len(crick) - len(watson)]),
365
- encoding="ASCII",
366
- )
367
- elif ovhg > 0:
368
- if ovhg + len(watson) > len(crick):
369
- self._data = bytes(
370
- _rc(crick[-ovhg:]) + watson, encoding="ASCII"
371
- )
372
- else:
373
- self._data = bytes(
374
- _rc(crick[-ovhg:])
375
- + watson
376
- + _rc(crick[: len(crick) - ovhg - len(watson)]),
377
- encoding="ASCII",
378
- )
379
- else: # ovhg < 0
380
- if -ovhg + len(crick) > len(watson):
381
- self._data = bytes(
382
- watson + _rc(crick[: -ovhg + len(crick) - len(watson)]),
383
- encoding="ASCII",
384
- )
385
- else:
386
- self._data = bytes(watson, encoding="ASCII")
518
+ """pad both strands on right side to same size."""
519
+ sense = sense.ljust(max_len)
520
+ antisense = antisense.ljust(max_len)
521
+ """both strands padded so that basepairs align"""
522
+ assert len(sense) == len(antisense)
523
+
524
+ data = []
525
+
526
+ for w, c in zip(sense, antisense):
527
+ try:
528
+ data.append(basepair_dict[w, c])
529
+ except KeyError as err:
530
+ print(f"Base mismatch in representation {err}")
531
+ raise ValueError(f"Base mismatch in representation: {err}")
532
+ data = "".join(data).strip()
533
+ self._data = data.encode("ascii")
387
534
 
388
535
  self.circular = circular
389
- self.watson = _pretty_str(watson)
390
- self.crick = _pretty_str(crick)
391
- self.length = len(self._data)
392
- self.ovhg = ovhg
393
536
  self.pos = pos
394
537
 
538
+ if circular:
539
+ data += data[0:1]
540
+
541
+ dsb = dsbreaks(data)
542
+
543
+ if dsb:
544
+ msg = "".join(dsb)
545
+ raise ValueError(
546
+ f"Molecule is internally split in {len(dsb)} location(s):\n\n{msg}".strip()
547
+ )
548
+
395
549
  @classmethod
396
- def quick(
397
- cls,
398
- watson: str,
399
- crick: str,
400
- ovhg=0,
401
- circular=False,
402
- pos=0,
403
- ):
404
- obj = cls.__new__(cls) # Does not call __init__
405
- obj.watson = _pretty_str(watson)
406
- obj.crick = _pretty_str(crick)
407
- obj.ovhg = ovhg
550
+ def quick(cls, data: bytes, *args, circular=False, pos=0, **kwargs):
551
+ """Fastest way to instantiate an object of the Dseq class.
552
+
553
+ No checks of parameters are made.
554
+ Does not call Bio.Seq.Seq.__init__() which has lots of time consuming checks.
555
+ """
556
+ obj = cls.__new__(cls)
408
557
  obj.circular = circular
409
- obj.length = max(len(watson) + max(0, ovhg), len(crick) + max(0, -ovhg))
410
558
  obj.pos = pos
411
- wb = bytes(watson, encoding="ASCII")
412
- cb = bytes(crick, encoding="ASCII")
413
- obj._data = (
414
- _rc(cb[-max(0, ovhg) or len(cb) :])
415
- + wb
416
- + _rc(cb[: max(0, len(cb) - ovhg - len(wb))])
417
- )
418
- return obj
559
+ obj._data = data
419
560
 
420
- @classmethod
421
- def from_string(
422
- cls,
423
- dna: str,
424
- *args,
425
- # linear=True,
426
- circular=False,
427
- **kwargs,
428
- ):
429
- obj = cls.__new__(cls) # Does not call __init__
430
- obj.watson = _pretty_str(dna)
431
- obj.crick = _pretty_str(_rc(dna))
432
- obj.ovhg = 0
433
- obj.circular = circular
434
- # obj._linear = linear
435
- obj.length = len(dna)
436
- obj.pos = 0
437
- obj._data = bytes(dna, encoding="ASCII")
438
561
  return obj
439
562
 
440
563
  @classmethod
441
564
  def from_representation(cls, dsdna: str, *args, **kwargs):
442
- obj = cls.__new__(cls) # Does not call __init__
443
- w, c, *r = [ln for ln in dsdna.splitlines() if ln]
444
- ovhg = obj.ovhg = len(w) - len(w.lstrip()) - (len(c) - len(c.lstrip()))
445
- watson = obj.watson = _pretty_str(w.strip())
446
- crick = obj.crick = _pretty_str(c.strip()[::-1])
565
+ obj = cls.__new__(cls)
447
566
  obj.circular = False
448
- # obj._linear = True
449
- obj.length = max(len(watson) + max(0, ovhg), len(crick) + max(0, -ovhg))
450
567
  obj.pos = 0
451
- wb = bytes(watson, encoding="ASCII")
452
- cb = bytes(crick, encoding="ASCII")
453
- obj._data = (
454
- _rc(cb[-max(0, ovhg) or len(cb) :])
455
- + wb
456
- + _rc(cb[: max(0, len(cb) - ovhg - len(wb))])
457
- )
458
- return obj
568
+ clean = inspect.cleandoc("\n" + dsdna)
569
+ watson, crick = [
570
+ ln
571
+ for ln in clean.splitlines()
572
+ if ln.strip() and not ln.strip().startswith("Dseq(")
573
+ ]
574
+ ovhgw = len(watson) - len(watson.lstrip())
575
+ ovhgc = -(len(crick) - len(crick.lstrip()))
576
+
577
+ ovhg = ovhgw or ovhgc
578
+
579
+ watson = watson.strip()
580
+ crick = crick.strip()[::-1]
581
+
582
+ return Dseq(watson, crick, ovhg)
459
583
 
460
584
  @classmethod
461
585
  def from_full_sequence_and_overhangs(
@@ -522,111 +646,177 @@ class Dseq(_Seq):
522
646
 
523
647
  return Dseq(watson, crick=crick, ovhg=crick_ovhg)
524
648
 
525
- # @property
526
- # def ovhg(self):
527
- # """The ovhg property. This cannot be set directly, but is a
528
- # consequence of how the watson and crick strands anneal to
529
- # each other"""
530
- # return self._ovhg
531
-
532
- # @property
533
- # def linear(self):
534
- # """The linear property can not be set directly.
535
- # Use an empty slice [:] to create a linear object."""
536
- # return self._linear
537
-
538
- # @property
539
- # def circular(self):
540
- # """The circular property can not be set directly.
541
- # Use :meth:`looped` to create a circular Dseq object"""
542
- # return self._circular
649
+ @property
650
+ def watson(self) -> str:
651
+ """
652
+ The watson (upper) strand of the double stranded fragment 5'-3'.
543
653
 
544
- def mw(self) -> float:
545
- """This method returns the molecular weight of the DNA molecule
546
- in g/mol. The following formula is used::
547
-
548
- MW = (A x 313.2) + (T x 304.2) +
549
- (C x 289.2) + (G x 329.2) +
550
- (N x 308.9) + 79.0
551
- """
552
- nts = (self.watson + self.crick).lower()
553
-
554
- return (
555
- 313.2 * nts.count("a")
556
- + 304.2 * nts.count("t")
557
- + 289.2 * nts.count("c")
558
- + 329.2 * nts.count("g")
559
- + 308.9 * nts.count("n")
560
- + 79.0
561
- )
654
+ Returns
655
+ -------
656
+ TYPE
657
+ DESCRIPTION.
562
658
 
563
- def upper(self: DseqType) -> DseqType:
564
- """Return an upper case copy of the sequence.
659
+ """
660
+ return self._data.decode("ascii").translate(dscode_to_watson_table).strip()
565
661
 
566
- >>> from pydna.dseq import Dseq
567
- >>> my_seq = Dseq("aAa")
568
- >>> my_seq
569
- Dseq(-3)
570
- aAa
571
- tTt
572
- >>> my_seq.upper()
573
- Dseq(-3)
574
- AAA
575
- TTT
662
+ @property
663
+ def crick(self) -> str:
664
+ """
665
+ The crick (lower) strand of the double stranded fragment 5'-3'.
576
666
 
577
667
  Returns
578
668
  -------
579
- Dseq
580
- Dseq object in uppercase
669
+ TYPE
670
+ DESCRIPTION.
581
671
 
582
- See also
583
- --------
584
- pydna.dseq.Dseq.lower
672
+ """
673
+ return self._data.decode("ascii").translate(dscode_to_crick_table).strip()[::-1]
585
674
 
675
+ @property
676
+ def left_ovhg(self) -> int:
586
677
  """
587
- return self.quick(
588
- self.watson.upper(),
589
- self.crick.upper(),
590
- ovhg=self.ovhg,
591
- # linear=self.linear,
592
- circular=self.circular,
593
- pos=self.pos,
594
- )
678
+ The 5' overhang of the lower strand compared to the upper.
595
679
 
596
- def lower(self: DseqType) -> DseqType:
597
- """Return a lower case copy of the sequence.
680
+ See module docstring for more information.
598
681
 
599
- >>> from pydna.dseq import Dseq
600
- >>> my_seq = Dseq("aAa")
601
- >>> my_seq
602
- Dseq(-3)
603
- aAa
604
- tTt
605
- >>> my_seq.lower()
606
- Dseq(-3)
607
- aaa
608
- ttt
682
+ Returns
683
+ -------
684
+ TYPE
685
+ DESCRIPTION.
686
+
687
+ """
688
+ parts = self.get_parts()
689
+ if parts.single_watson or parts.single_crick:
690
+ return None
691
+ return -len(parts.sticky_left5) or len(parts.sticky_left3)
692
+
693
+ ovhg = left_ovhg
694
+
695
+ @property
696
+ def right_ovhg(self) -> int:
697
+ """Overhang at the right side (end)."""
698
+ parts = self.get_parts()
699
+ if parts.single_watson or parts.single_crick:
700
+ return None
701
+ return -len(parts.sticky_right5) or len(parts.sticky_right3)
702
+
703
+ watson_ovhg = right_ovhg
704
+
705
+ def __str__(self) -> str:
706
+ """
707
+ A string representation of the sequence. The returned string
708
+ is the watson strand of a blunt version of the sequence.
709
+
710
+ >>> ds = Dseq.from_representation(
711
+ ... '''
712
+ ... GAATTC
713
+ ... TAA
714
+ ... ''')
715
+
716
+ >>> str(ds)
717
+ 'GAATTC'
718
+ >>> ds = Dseq.from_representation(
719
+ ... '''
720
+ ... ATT
721
+ ... CTTAAG
722
+ ... ''')
723
+
724
+ >>> str(ds)
725
+ 'GAATTC'
609
726
 
610
727
  Returns
611
728
  -------
612
- Dseq
613
- Dseq object in lowercase
729
+ str
730
+ A string representation of the sequence.
614
731
 
615
- See also
732
+ """
733
+ return bytes(self).decode("ascii")
734
+
735
+ to_blunt_string = __str__ # alias of __str__ # TODO: consider removing
736
+
737
+ def __bytes__(self) -> bytes:
738
+ return self._data.translate(dscode_to_full_sequence_table)
739
+
740
+ def mw(self) -> float:
741
+ """The molecular weight of the DNA/RNA molecule in g/mol.
742
+
743
+ The molecular weight data in Biopython Bio.Data.IUPACData
744
+ is used. The DNA is assumed to have a 5'-phosphate as many
745
+ DNA fragments from restriction digestion do:
746
+
747
+ ::
748
+
749
+ P - G-A-T-T-A-C-A - OH
750
+ | | | | | | |
751
+ OH - C-T-A-A-T-G-T - P
752
+
753
+ The molecular weights listed in the unambiguous_dna_weights
754
+ dictionary refer to free monophosphate nucleotides.
755
+ One water molecule is removed for every phosphodiester bond
756
+ formed between nucleotides. For linear molecules, the weight
757
+ of one water molecule is added to account for the terminal
758
+ hydroxyl group and a hydrogen on the 5' terminal phosphate
759
+ group.
760
+
761
+ ::
762
+
763
+ P - G---A---T - OH P - C---A - OH
764
+ | | | | |
765
+ OH - C---T---A---A---T---G---T - P
766
+
767
+ If the DNA is discontinuous, the internal 5'- end is assumed
768
+ to have a phosphate and the 3'- a hydroxyl group:
769
+
770
+
771
+ Examples
616
772
  --------
617
- pydna.dseq.Dseq.upper
618
- """
619
- return self.quick(
620
- self.watson.lower(),
621
- self.crick.lower(),
622
- ovhg=self.ovhg,
623
- # linear=self.linear,
624
- circular=self.circular,
625
- pos=self.pos,
626
- )
773
+ >>> from pydna.dseq import Dseq
774
+ >>> ds_lin_obj = Dseq("GATTACA")
775
+ >>> ds_lin_obj
776
+ Dseq(-7)
777
+ GATTACA
778
+ CTAATGT
779
+ >>> round(ds_lin_obj.mw(), 1)
780
+ 4359.8
781
+ >>> ds_circ_obj = Dseq("GATTACA", circular = True)
782
+ >>> round(ds_circ_obj.mw(), 1)
783
+ 4323.8
784
+ >>> ssobj = Dseq("PEXXEIE")
785
+ >>> ssobj
786
+ Dseq(-7)
787
+ GATTACA
788
+ <BLANKLINE>
789
+ >>> round(ssobj.mw(), 1)
790
+ 2184.4
791
+ >>> ds_lin_obj2 = Dseq("GATZFCA")
792
+ >>> ds_lin_obj2
793
+ Dseq(-7)
794
+ GAT CA
795
+ CTAATGT
796
+ >>> round(ds_lin_obj2.mw(), 1)
797
+ 3724.4
798
+ """
799
+
800
+ h2o = atom_weights["H"] * 2 + atom_weights["O"]
801
+
802
+ mwd = unambiguous_rna_weights | unambiguous_dna_weights | {" ": 0}
803
+
804
+ watsn_weight = sum(mwd[nt] - h2o for nt in self.watson.upper())
805
+ crick_weight = sum(mwd[nt] - h2o for nt in self.crick.upper())
806
+
807
+ watsn_weight += h2o * len(re.findall(r" +", self.watson))
808
+ crick_weight += h2o * len(re.findall(r" +", self.crick))
809
+
810
+ if watsn_weight and not self.circular:
811
+ watsn_weight += h2o
812
+
813
+ if crick_weight and not self.circular:
814
+ crick_weight += h2o
815
+
816
+ return watsn_weight + crick_weight
627
817
 
628
818
  def find(
629
- self, sub: _Union[_SeqAbstractBaseClass, str, bytes], start=0, end=_sys.maxsize
819
+ self, sub: Union[_SeqAbstractBaseClass, str, bytes], start=0, end=sys.maxsize
630
820
  ) -> int:
631
821
  """This method behaves like the python string method of the same name.
632
822
 
@@ -635,6 +825,8 @@ class Dseq(_Seq):
635
825
 
636
826
  Returns -1 if the subsequence is NOT found.
637
827
 
828
+ The search is case sensitive.
829
+
638
830
  Parameters
639
831
  ----------
640
832
 
@@ -650,80 +842,51 @@ class Dseq(_Seq):
650
842
  Examples
651
843
  --------
652
844
  >>> from pydna.dseq import Dseq
653
- >>> seq = Dseq("atcgactgacgtgtt")
845
+ >>> seq = Dseq("agtaagt")
654
846
  >>> seq
655
- Dseq(-15)
656
- atcgactgacgtgtt
657
- tagctgactgcacaa
658
- >>> seq.find("gac")
659
- 3
660
- >>> seq = Dseq(watson="agt",crick="actta",ovhg=-2)
847
+ Dseq(-7)
848
+ agtaagt
849
+ tcattca
850
+ >>> seq.find("taa")
851
+ 2
852
+ >>> seq = Dseq(watson="agta",crick="actta",ovhg=-2)
661
853
  >>> seq
662
854
  Dseq(-7)
663
- agt
855
+ agta
664
856
  attca
665
857
  >>> seq.find("taa")
858
+ -1
859
+ >>> seq = Dseq(watson="agta",crick="actta",ovhg=-2)
860
+ >>> seq
861
+ Dseq(-7)
862
+ agta
863
+ attca
864
+ >>> seq.find("ta")
666
865
  2
667
866
  """
668
-
669
- if not self.circular:
670
- return _Seq.find(self, sub, start, end)
671
-
672
- return (_pretty_str(self) + _pretty_str(self)).find(sub, start, end)
673
-
674
- def __getitem__(self, sl: slice) -> "Dseq":
675
- """Returns a subsequence. This method is used by the slice notation"""
676
-
677
- if not self.circular:
678
- x = len(self.crick) - self.ovhg - len(self.watson)
679
-
680
- sns = (self.ovhg * " " + self.watson + x * " ")[sl]
681
- asn = (-self.ovhg * " " + self.crick[::-1] + -x * " ")[sl]
682
-
683
- ovhg = max(
684
- (len(sns) - len(sns.lstrip()), -len(asn) + len(asn.lstrip())), key=abs
685
- )
686
-
687
- return Dseq(
688
- sns.strip(),
689
- asn[::-1].strip(),
690
- ovhg=ovhg,
691
- # linear=True
692
- )
867
+ if self.circular:
868
+ result = CircularBytes(self._data).find(sub, start, end)
693
869
  else:
694
- sl = slice(sl.start or 0, sl.stop or len(self), sl.step)
695
- if sl.start > len(self) or sl.stop > len(self):
696
- return Dseq("")
697
- if sl.start < sl.stop:
698
- return Dseq(
699
- self.watson[sl],
700
- self.crick[::-1][sl][::-1],
701
- ovhg=0,
702
- # linear=True
703
- )
704
- else:
705
- try:
706
- stp = abs(sl.step)
707
- except TypeError:
708
- stp = 1
709
- start = sl.start
710
- stop = sl.stop
711
-
712
- w = (
713
- self.watson[(start or len(self)) :: stp]
714
- + self.watson[: (stop or 0) : stp]
715
- )
716
- c = (
717
- self.crick[len(self) - stop :: stp]
718
- + self.crick[: len(self) - start : stp]
719
- )
870
+ result = super().find(sub, start, end)
871
+ return result
872
+
873
+ def __contains__(self, sub: [str, bytes]) -> bool:
874
+ return self.find(sub) != -1
720
875
 
721
- return Dseq(w, c, ovhg=0) # , linear=True)
876
+ def __getitem__(self, sl: [slice, int]) -> DseqType:
877
+ if isinstance(sl, int):
878
+ sl = slice(sl, sl + 1, 1)
879
+ sl = slice(sl.start, sl.stop, sl.step)
880
+ if self.circular:
881
+ cb = CircularBytes(self._data)
882
+ return self.quick(cb[sl])
883
+ return super().__getitem__(sl)
722
884
 
723
885
  def __eq__(self, other: DseqType) -> bool:
724
886
  """Compare to another Dseq object OR an object that implements
725
- watson, crick and ovhg properties. This comparison is case
726
- insensitive.
887
+ watson, crick and ovhg properties.
888
+
889
+ This comparison is case insensitive.
727
890
 
728
891
  """
729
892
  try:
@@ -738,85 +901,15 @@ class Dseq(_Seq):
738
901
  same = False
739
902
  return same
740
903
 
741
- def __repr__(self):
742
- """Returns a representation of the sequence, truncated if
743
- longer than 30 bp"""
744
-
745
- if len(self) > Dseq.trunc:
746
- if self.ovhg > 0:
747
- d = self.crick[-self.ovhg :][::-1]
748
- hej = len(d)
749
- if len(d) > 10:
750
- d = "{}..{}".format(d[:4], d[-4:])
751
- a = len(d) * " "
752
-
753
- elif self.ovhg < 0:
754
- a = self.watson[: max(0, -self.ovhg)]
755
- hej = len(a)
756
- if len(a) > 10:
757
- a = "{}..{}".format(a[:4], a[-4:])
758
- d = len(a) * " "
759
- else:
760
- a = ""
761
- d = ""
762
- hej = 0
763
-
764
- x = self.ovhg + len(self.watson) - len(self.crick)
765
-
766
- if x > 0:
767
- c = self.watson[len(self.crick) - self.ovhg :]
768
- y = len(c)
769
- if len(c) > 10:
770
- c = "{}..{}".format(c[:4], c[-4:])
771
- f = len(c) * " "
772
- elif x < 0:
773
- f = self.crick[:-x][::-1]
774
- y = len(f)
775
- if len(f) > 10:
776
- f = "{}..{}".format(f[:4], f[-4:])
777
- c = len(f) * " "
778
- else:
779
- c = ""
780
- f = ""
781
- y = 0
782
-
783
- L = len(self) - hej - y
784
- x1 = -min(0, self.ovhg)
785
- x2 = x1 + L
786
- x3 = -min(0, x)
787
- x4 = x3 + L
788
-
789
- b = self.watson[x1:x2]
790
- e = self.crick[x3:x4][::-1]
791
-
792
- if len(b) > 10:
793
- b = "{}..{}".format(b[:4], b[-4:])
794
- e = "{}..{}".format(e[:4], e[-4:])
795
-
796
- return _pretty_str(
797
- "{klass}({top}{size})\n" "{a}{b}{c}\n" "{d}{e}{f}"
798
- ).format(
799
- klass=self.__class__.__name__,
800
- top={False: "-", True: "o"}[self.circular],
801
- size=len(self),
802
- a=a,
803
- b=b,
804
- c=c,
805
- d=d,
806
- e=e,
807
- f=f,
808
- )
904
+ def __repr__(self, lim: int = length_limit_for_repr) -> pretty_str:
809
905
 
810
- else:
811
- return _pretty_str(
812
- "{}({}{})\n{}\n{}".format(
813
- self.__class__.__name__,
814
- {False: "-", True: "o"}[self.circular],
815
- len(self),
816
- self.ovhg * " " + self.watson,
817
- -self.ovhg * " " + self.crick[::-1],
818
- )
819
- )
906
+ header = f"{self.__class__.__name__}({({False: '-', True: 'o'}[self.circular])}{len(self)})"
907
+
908
+ w, c = representation_tuple(
909
+ self._data.decode("ascii"), length_limit_for_repr=length_limit_for_repr
910
+ )
911
+
912
+ return pretty_str(header + "\n" + w + "\n" + c)
820
913
 
821
914
  def reverse_complement(self) -> "Dseq":
822
915
  """Dseq object where watson and crick have switched places.
@@ -839,22 +932,29 @@ class Dseq(_Seq):
839
932
  >>>
840
933
 
841
934
  """
842
- return Dseq.quick(
843
- self.crick,
844
- self.watson,
845
- ovhg=len(self.watson) - len(self.crick) + self.ovhg,
846
- circular=self.circular,
847
- )
935
+ return Dseq.quick(rc(self._data), circular=self.circular)
848
936
 
849
937
  rc = reverse_complement # alias for reverse_complement
850
938
 
851
939
  def shifted(self: DseqType, shift: int) -> DseqType:
852
- """Shifted version of a circular Dseq object."""
940
+ """
941
+ Shifted copy of a circular Dseq object.
942
+
943
+ >>> ds = Dseq("TAAG", circular = True)
944
+ >>> ds.shifted(1) # First bp moved to right side:
945
+ Dseq(o4)
946
+ AAGT
947
+ TTCA
948
+ >>> ds.shifted(-1) # Last bp moved to left side:
949
+ Dseq(o4)
950
+ GTAA
951
+ CATT
952
+ """
853
953
  if not self.circular:
854
954
  raise TypeError("DNA is not circular.")
855
955
  shift = shift % len(self)
856
956
  if not shift:
857
- return _copy.deepcopy(self)
957
+ return copy.deepcopy(self)
858
958
  else:
859
959
  return (self[shift:] + self[:shift]).looped()
860
960
 
@@ -876,19 +976,30 @@ class Dseq(_Seq):
876
976
  Dseq(o8)
877
977
  catcgatc
878
978
  gtagctag
879
- >>> a.T4("t")
979
+ >>> b = Dseq("iatcgatj")
980
+ >>> b
880
981
  Dseq(-8)
881
982
  catcgat
882
983
  tagctag
883
- >>> a.T4("t").looped()
984
+ >>> b.looped()
985
+ Dseq(o7)
986
+ catcgat
987
+ gtagcta
988
+ >>> c = Dseq("jatcgati")
989
+ >>> c
990
+ Dseq(-8)
991
+ atcgatc
992
+ gtagcta
993
+ >>> c.looped()
884
994
  Dseq(o7)
885
995
  catcgat
886
996
  gtagcta
887
- >>> a.T4("a")
997
+ >>> d = Dseq("ietcgazj")
998
+ >>> d
888
999
  Dseq(-8)
889
1000
  catcga
890
1001
  agctag
891
- >>> a.T4("a").looped()
1002
+ >>> d.looped()
892
1003
  Traceback (most recent call last):
893
1004
  File "<stdin>", line 1, in <module>
894
1005
  File "/usr/local/lib/python2.7/dist-packages/pydna/dsdna.py", line 357, in looped
@@ -899,116 +1010,116 @@ class Dseq(_Seq):
899
1010
 
900
1011
  """
901
1012
  if self.circular:
902
- return _copy.deepcopy(self)
1013
+ return copy.deepcopy(self)
1014
+
903
1015
  type5, sticky5 = self.five_prime_end()
904
1016
  type3, sticky3 = self.three_prime_end()
905
- if type5 == type3 and str(sticky5) == str(_rc(sticky3)):
906
- nseq = self.__class__.quick(
907
- self.watson,
908
- self.crick[-self.ovhg :] + self.crick[: -self.ovhg],
909
- ovhg=0,
910
- # linear=False,
911
- circular=True,
912
- )
913
- # assert len(nseq.crick) == len(nseq.watson)
914
- return nseq
915
- else:
916
- raise TypeError(
917
- "DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!"
918
- )
919
1017
 
920
- def tolinear(self: DseqType) -> DseqType: # pragma: no cover
921
- """Returns a blunt, linear copy of a circular Dseq object. This can
922
- only be done if the Dseq object is circular, otherwise a
923
- TypeError is raised.
1018
+ err = TypeError(
1019
+ "DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!"
1020
+ )
924
1021
 
925
- This method is deprecated, use slicing instead. See example below.
1022
+ if type5 != type3:
1023
+ raise err
926
1024
 
927
- Examples
928
- --------
1025
+ try:
1026
+ # Test if sticky ends are compatible
1027
+ self + self
1028
+ except TypeError:
1029
+ raise err
929
1030
 
930
- >>> from pydna.dseq import Dseq
931
- >>> a=Dseq("catcgatc", circular=True)
932
- >>> a
933
- Dseq(o8)
934
- catcgatc
935
- gtagctag
936
- >>> a[:]
937
- Dseq(-8)
938
- catcgatc
939
- gtagctag
940
- >>>
1031
+ new = self.cast_to_ds_left()[: len(self) - len(sticky3)]
941
1032
 
942
- """
943
- import warnings as _warnings
944
- from pydna import _PydnaDeprecationWarning
1033
+ new.circular = True
1034
+ return new
945
1035
 
946
- _warnings.warn(
947
- "tolinear method is obsolete; "
948
- "please use obj[:] "
949
- "instead of obj.tolinear().",
950
- _PydnaDeprecationWarning,
951
- )
952
- if not self.circular:
953
- raise TypeError("DNA is not circular.\n")
954
- selfcopy = _copy.deepcopy(self)
955
- selfcopy.circular = False
956
- return selfcopy # self.__class__(self.watson, linear=True)
1036
+ def five_prime_end(self) -> Tuple[str, str]:
1037
+ """Returns a 2-tuple of strings describing the structure of the 5' end of
1038
+ the DNA fragment.
1039
+
1040
+ The tuple contains (type, sticky) where type is "5'", "3'", "blunt" or "single".
1041
+ sticky is always in lower case and contains the sequence of the
1042
+ protruding end in 5'-3' direction.
1043
+
1044
+ See examples below:
957
1045
 
958
- def five_prime_end(self) -> _Tuple[str, str]:
959
- """Returns a tuple describing the structure of the 5' end of
960
- the DNA fragment
961
1046
 
962
1047
  Examples
963
1048
  --------
964
1049
  >>> from pydna.dseq import Dseq
965
- >>> a=Dseq("aaa", "ttt")
1050
+ >>> a = Dseq("aa", "tttg", ovhg=2)
966
1051
  >>> a
967
- Dseq(-3)
968
- aaa
969
- ttt
1052
+ Dseq(-4)
1053
+ aa
1054
+ gttt
970
1055
  >>> a.five_prime_end()
971
- ('blunt', '')
972
- >>> a=Dseq("aaa", "ttt", ovhg=1)
1056
+ ("3'", 'tg')
1057
+ >>> a = Dseq("caaa", "tt", ovhg=-2)
973
1058
  >>> a
974
1059
  Dseq(-4)
975
- aaa
976
- ttt
1060
+ caaa
1061
+ tt
977
1062
  >>> a.five_prime_end()
978
- ("3'", 't')
979
- >>> a=Dseq("aaa", "ttt", ovhg=-1)
1063
+ ("5'", 'ca')
1064
+ >>> a = Dseq("aa", "tt")
980
1065
  >>> a
981
- Dseq(-4)
982
- aaa
983
- ttt
1066
+ Dseq(-2)
1067
+ aa
1068
+ tt
984
1069
  >>> a.five_prime_end()
985
- ("5'", 'a')
986
- >>>
1070
+ ('blunt', '')
987
1071
 
988
1072
  See also
989
1073
  --------
990
1074
  pydna.dseq.Dseq.three_prime_end
991
1075
 
992
1076
  """
993
- if self.watson and not self.crick:
994
- return "5'", self.watson.lower()
995
- if not self.watson and self.crick:
996
- return "3'", self.crick.lower()
997
- if self.ovhg < 0:
998
- sticky = self.watson[: -self.ovhg].lower()
1077
+
1078
+ # See docstring for function pydna.utils.get_parts for details
1079
+ # on what is contained in parts.
1080
+ parts = self.get_parts()
1081
+
1082
+ sticky5 = parts.sticky_left5.translate(dscode_to_watson_table)
1083
+
1084
+ sticky3 = parts.sticky_left3.translate(dscode_to_crick_table)[::-1]
1085
+
1086
+ single_watson = parts.single_watson.translate(dscode_to_watson_table)
1087
+
1088
+ single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1]
1089
+
1090
+ # The walrus operator returns the value being assigned, so
1091
+ # we can test if it is empty or not.
1092
+ if sticky := single_watson:
1093
+ type_ = "single"
1094
+ elif sticky := single_crick:
1095
+ type_ = "single"
1096
+ elif sticky5 == sticky3 == "":
1097
+ type_, sticky = "blunt", ""
1098
+ elif sticky := sticky5:
999
1099
  type_ = "5'"
1000
- elif self.ovhg > 0:
1001
- sticky = self.crick[-self.ovhg :].lower()
1100
+ elif sticky := sticky3:
1002
1101
  type_ = "3'"
1003
- else:
1004
- sticky = ""
1005
- type_ = "blunt"
1006
- return type_, sticky
1007
1102
 
1008
- def three_prime_end(self) -> _Tuple[str, str]:
1103
+ return type_, sticky.lower()
1104
+
1105
+ def three_prime_end(self) -> Tuple[str, str]:
1009
1106
  """Returns a tuple describing the structure of the 3' end of
1010
1107
  the DNA fragment
1011
1108
 
1109
+ >>> a = Dseq("aa", "gttt", ovhg=0)
1110
+ >>> a
1111
+ Dseq(-4)
1112
+ aa
1113
+ tttg
1114
+ >>> a.three_prime_end()
1115
+ ("5'", 'gt')
1116
+ >>> a = Dseq("aaac", "tt", ovhg=0)
1117
+ >>> a
1118
+ Dseq(-4)
1119
+ aaac
1120
+ tt
1121
+ >>> a.three_prime_end()
1122
+ ("3'", 'ac')
1012
1123
  >>> from pydna.dseq import Dseq
1013
1124
  >>> a=Dseq("aaa", "ttt")
1014
1125
  >>> a
@@ -1017,21 +1128,6 @@ class Dseq(_Seq):
1017
1128
  ttt
1018
1129
  >>> a.three_prime_end()
1019
1130
  ('blunt', '')
1020
- >>> a=Dseq("aaa", "ttt", ovhg=1)
1021
- >>> a
1022
- Dseq(-4)
1023
- aaa
1024
- ttt
1025
- >>> a.three_prime_end()
1026
- ("3'", 'a')
1027
- >>> a=Dseq("aaa", "ttt", ovhg=-1)
1028
- >>> a
1029
- Dseq(-4)
1030
- aaa
1031
- ttt
1032
- >>> a.three_prime_end()
1033
- ("5'", 't')
1034
- >>>
1035
1131
 
1036
1132
  See also
1037
1133
  --------
@@ -1039,42 +1135,73 @@ class Dseq(_Seq):
1039
1135
 
1040
1136
  """
1041
1137
 
1042
- ovhg = len(self.watson) - len(self.crick) + self.ovhg
1138
+ # See docstring for function pydna.utils.get_parts for details
1139
+ # on what is contained in parts.
1140
+ parts = self.get_parts()
1141
+
1142
+ sticky5 = parts.sticky_right5.translate(dscode_to_crick_table)[::-1]
1143
+
1144
+ sticky3 = parts.sticky_right3.translate(dscode_to_watson_table)
1145
+
1146
+ single_watson = parts.single_watson.translate(dscode_to_watson_table)
1147
+
1148
+ single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1]
1043
1149
 
1044
- if ovhg < 0:
1045
- sticky = self.crick[:-ovhg].lower()
1150
+ # The walrus operator returns the value being assigned, so
1151
+ # we can test if it is empty or not.
1152
+ if sticky := single_watson:
1153
+ type_ = "single"
1154
+ elif sticky := single_crick:
1155
+ type_ = "single"
1156
+ elif sticky5 == sticky3 == "":
1157
+ type_, sticky = "blunt", ""
1158
+ elif sticky := sticky5:
1046
1159
  type_ = "5'"
1047
- elif ovhg > 0:
1048
- sticky = self.watson[-ovhg:].lower()
1160
+ elif sticky := sticky3:
1049
1161
  type_ = "3'"
1050
- else:
1051
- sticky = ""
1052
- type_ = "blunt"
1053
- return type_, sticky
1054
1162
 
1055
- def watson_ovhg(self) -> int:
1056
- """Returns the overhang of the watson strand at the three prime."""
1057
- return len(self.watson) - len(self.crick) + self.ovhg
1163
+ return type_, sticky.lower()
1058
1164
 
1059
- def __add__(self: DseqType, other: DseqType) -> DseqType:
1060
- """Simulates ligation between two DNA fragments.
1165
+ def __add__(self: DseqType, other: [DseqType, str, bytes]) -> DseqType:
1166
+ """
1167
+ Adding two Dseq objects together.
1168
+
1169
+ >>> ds = Dseq("a", "t", ovhg=0)
1170
+ >>> ds
1171
+ Dseq(-1)
1172
+ a
1173
+ t
1174
+ >>> ds + ds
1175
+ Dseq(-2)
1176
+ aa
1177
+ tt
1178
+ >>> "g" + ds # adding a string on the left side returns a Dseq
1179
+ Dseq(-2)
1180
+ ga
1181
+ ct
1182
+ >>> ds + "c" # adding a string on the right side returns a Dseq
1183
+ Dseq(-2)
1184
+ ac
1185
+ tg
1061
1186
 
1062
- Add other Dseq object at the end of the sequence.
1063
- Type error is raised if any of the points below are fulfilled:
1064
1187
 
1065
- * one or more objects are circular
1066
- * if three prime sticky end of self is not the same type
1067
- (5' or 3') as the sticky end of other
1068
- * three prime sticky end of self complementary with five
1069
- prime sticky end of other.
1188
+ Parameters
1189
+ ----------
1190
+ other : [DseqType, str, bytes]
1191
+ Object to be added.
1070
1192
 
1071
- Phosphorylation and dephosphorylation is not considered.
1193
+ Raises
1194
+ ------
1195
+ TypeError
1196
+ Preventing adding to a circular sequence.
1072
1197
 
1073
- DNA is allways presumed to have the necessary 5' phospate
1074
- group necessary for ligation.
1198
+ Returns
1199
+ -------
1200
+ DseqType
1201
+ A new Dseq object.
1075
1202
 
1076
1203
  """
1077
- # test for circular DNA
1204
+
1078
1205
  if self.circular:
1079
1206
  raise TypeError("circular DNA cannot be ligated!")
1080
1207
  try:
@@ -1083,60 +1210,85 @@ class Dseq(_Seq):
1083
1210
  except AttributeError:
1084
1211
  pass
1085
1212
 
1213
+ # If other evaluates to False, return a copy of self.
1214
+ if not other:
1215
+ return copy.deepcopy(self)
1216
+ # If self evaluates to False, return a copy of other.
1217
+ elif not self:
1218
+ return copy.deepcopy(other)
1219
+
1220
+ # get right side end properties for self.
1086
1221
  self_type, self_tail = self.three_prime_end()
1087
- other_type, other_tail = other.five_prime_end()
1088
1222
 
1089
- if self_type == other_type and str(self_tail) == str(_rc(other_tail)):
1090
- answer = Dseq.quick(
1091
- self.watson + other.watson, other.crick + self.crick, self.ovhg
1092
- )
1093
- elif not self:
1094
- answer = _copy.deepcopy(other)
1095
- elif not other:
1096
- answer = _copy.deepcopy(self)
1097
- else:
1098
- raise TypeError("sticky ends not compatible!")
1099
- return answer
1223
+ try:
1224
+ other_type, other_tail = other.five_prime_end()
1225
+ except AttributeError:
1226
+ # if other does not have the expected properties
1227
+ # most likely it is a string that can be cast as
1228
+ # a Dseq.
1229
+ other_type, other_tail = "blunt", ""
1230
+ other = Dseq(other)
1231
+
1232
+ err = TypeError("sticky ends not compatible!")
1233
+
1234
+ # The sticky ends have to be of the same type
1235
+ # or
1236
+ # one or both of them is "single", indicating a single-stranded molecule.
1237
+ if (self_type != other_type) and ("single" not in (self_type, other_type)):
1238
+ raise err
1239
+
1240
+ # tail length has to be equal for two phosphodiester bonds to form
1241
+ if len(self_tail) != len(other_tail):
1242
+ raise err
1243
+
1244
+ # Each basepair is checked against the pydna.alphabet basepair_dict
1245
+ # which contains the permitted base pairings.
1246
+ for w, c in zip(self_tail, other_tail[::-1]):
1247
+ try:
1248
+ basepair_dict[(w, c)]
1249
+ except KeyError:
1250
+ raise err
1251
+
1252
+ return self.__class__(
1253
+ self.watson + other.watson, other.crick + self.crick, self.ovhg
1254
+ )
1100
1255
 
1101
1256
  def __mul__(self: DseqType, number: int) -> DseqType:
1102
1257
  if not isinstance(number, int):
1103
1258
  raise TypeError(
1104
- "TypeError: can't multiply Dseq by non-int of type {}".format(
1105
- type(number)
1106
- )
1259
+ "TypeError: can't multiply Dseq" f" by non-int of type {type(number)}"
1107
1260
  )
1108
- if number <= 0:
1109
- return self.__class__("")
1110
- new = _copy.deepcopy(self)
1111
- for i in range(number - 1):
1112
- new += self
1113
- return new
1261
+ return Dseq("").join(list(itertools.repeat(self, number)))
1114
1262
 
1115
- def _fill_in_five_prime(self: DseqType, nucleotides: str) -> str:
1263
+ def _fill_in_left(self: DseqType, nucleotides: str) -> str:
1116
1264
  stuffer = ""
1117
1265
  type, se = self.five_prime_end()
1118
1266
  if type == "5'":
1119
- for n in _rc(se):
1267
+ for n in rc(se):
1120
1268
  if n in nucleotides:
1121
1269
  stuffer += n
1122
1270
  else:
1123
1271
  break
1124
1272
  return self.crick + stuffer, self.ovhg + len(stuffer)
1125
1273
 
1126
- def _fill_in_three_prime(self: DseqType, nucleotides: str) -> str:
1274
+ def _fill_in_right(self: DseqType, nucleotides: str) -> str:
1127
1275
  stuffer = ""
1128
1276
  type, se = self.three_prime_end()
1129
1277
  if type == "5'":
1130
- for n in _rc(se):
1278
+ for n in rc(se):
1131
1279
  if n in nucleotides:
1132
1280
  stuffer += n
1133
1281
  else:
1134
1282
  break
1135
1283
  return self.watson + stuffer
1136
1284
 
1137
- def fill_in(self, nucleotides: _Union[None, str] = None) -> "Dseq":
1285
+ def fill_in(self, nucleotides: Union[None, str] = None) -> DseqType:
1138
1286
  """Fill in of five prime protruding end with a DNA polymerase
1139
- that has only DNA polymerase activity (such as exo-klenow [#]_)
1287
+ that has only DNA polymerase activity (such as Exo-Klenow [#]_).
1288
+ Exo-Klenow is a modified version of the Klenow fragment of E.
1289
+ coli DNA polymerase I, which has been engineered to lack both
1290
+ 3'-5' proofreading and 5'-3' exonuclease activities.
1291
+
1140
1292
  and any combination of A, G, C or T. Default are all four
1141
1293
  nucleotides together.
1142
1294
 
@@ -1149,15 +1301,6 @@ class Dseq(_Seq):
1149
1301
  --------
1150
1302
 
1151
1303
  >>> from pydna.dseq import Dseq
1152
- >>> a=Dseq("aaa", "ttt")
1153
- >>> a
1154
- Dseq(-3)
1155
- aaa
1156
- ttt
1157
- >>> a.fill_in()
1158
- Dseq(-3)
1159
- aaa
1160
- ttt
1161
1304
  >>> b=Dseq("caaa", "cttt")
1162
1305
  >>> b
1163
1306
  Dseq(-5)
@@ -1184,7 +1327,15 @@ class Dseq(_Seq):
1184
1327
  Dseq(-5)
1185
1328
  aaac
1186
1329
  gttt
1187
- >>>
1330
+ >>> a=Dseq("aaa", "ttt")
1331
+ >>> a
1332
+ Dseq(-3)
1333
+ aaa
1334
+ ttt
1335
+ >>> a.fill_in()
1336
+ Dseq(-3)
1337
+ aaa
1338
+ ttt
1188
1339
 
1189
1340
  References
1190
1341
  ----------
@@ -1195,32 +1346,31 @@ class Dseq(_Seq):
1195
1346
  nucleotides = "GATCRYWSMKHBVDN"
1196
1347
 
1197
1348
  nucleotides = set(nucleotides.lower() + nucleotides.upper())
1198
- crick, ovhg = self._fill_in_five_prime(nucleotides)
1199
- watson = self._fill_in_three_prime(nucleotides)
1349
+ crick, ovhg = self._fill_in_left(nucleotides)
1350
+ watson = self._fill_in_right(nucleotides)
1200
1351
  return Dseq(watson, crick, ovhg)
1201
1352
 
1202
- def transcribe(self) -> _Seq:
1203
- return _Seq(self.watson).transcribe()
1204
-
1205
- def translate(
1206
- self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
1207
- ) -> _Seq:
1208
- return _Seq(
1209
- _translate_str(str(self), table, stop_symbol, to_stop, cds, gap=gap)
1210
- )
1353
+ klenow = fill_in # alias
1211
1354
 
1212
- def mung(self) -> "Dseq":
1355
+ def nibble_to_blunt(self) -> DseqType:
1213
1356
  """
1214
- Simulates treatment a nuclease with 5'-3' and 3'-5' single
1357
+ Simulates treatment with a nuclease with both 5'-3' and 3'-5' single
1215
1358
  strand specific exonuclease activity (such as mung bean nuclease [#]_)
1216
1359
 
1360
+ Mung bean nuclease is a nuclease enzyme derived from mung bean sprouts
1361
+ that preferentially degrades single-stranded DNA and RNA into
1362
+ 5'-phosphate- and 3'-hydroxyl-containing nucleotides.
1363
+
1364
+ Treatment results in blunt DNA, regardless of whether the protruding end
1365
+ is 5' or 3'.
1366
+
1217
1367
  ::
1218
1368
 
1219
1369
  ggatcc -> gatcc
1220
1370
  ctaggg ctagg
1221
1371
 
1222
- ggatcc -> ggatc
1223
- tcctag cctag
1372
+ ggatcc -> ggatc
1373
+ tcctag cctag
1224
1374
 
1225
1375
  >>> from pydna.dseq import Dseq
1226
1376
  >>> b=Dseq("caaa", "cttt")
@@ -1250,19 +1400,60 @@ class Dseq(_Seq):
1250
1400
 
1251
1401
 
1252
1402
  """
1253
- return Dseq(
1254
- self.watson[
1255
- max(0, -self.ovhg) : min(len(self.watson), len(self.crick) - self.ovhg)
1256
- ]
1257
- )
1403
+ parts = self.get_parts()
1404
+ return self.__class__(parts.middle)
1405
+
1406
+ mung = nibble_to_blunt
1407
+
1408
+ def T4(self, nucleotides=None) -> DseqType:
1409
+ """
1410
+ Fill in 5' protruding ends and nibble 3' protruding ends.
1411
+
1412
+ This is done using a DNA polymerase providing 3'-5' nuclease activity
1413
+ such as T4 DNA polymerase. This can be done in presence of any
1414
+ combination of the four nucleotides A, G, C or T.
1415
+
1416
+ T4 DNA polymerase is widely used to “polish” DNA ends because of its
1417
+ strong 3'-5' exonuclease activity: in the absence of dNTPs, it chews
1418
+ back 3′ overhangs to create blunt ends; in the presence of limiting
1419
+ dNTPs, it can fill in 5′ overhangs; and by carefully controlling
1420
+ reaction time, temperature, and nucleotide supply, you can generate
1421
+ defined recessed or blunt termini.
1422
+
1423
+ Tuning the nucleotide set can facilitate engineering of partial
1424
+ sticky ends. Default are all four nucleotides together.
1425
+
1426
+ ::
1427
+
1428
+ aaagatc-3 aaa 3' ends are always removed.
1429
+ ||| ---> ||| A and T needed or the molecule will
1430
+ 3-ctagttt ttt degrade completely.
1431
+
1432
+
1433
+
1434
+ 5-gatcaaa gatcaaaGATC 5' ends are filled in the
1435
+ ||| ---> ||||||||||| presence of GATC
1436
+ tttctag-5 CTAGtttctag
1437
+
1438
+
1439
+
1440
+ 5-gatcaaa gatcaaaGAT 5' ends are partially filled in the
1441
+ ||| ---> ||||||||| presence of GAT to produce a 1 nt
1442
+ tttctag-5 TAGtttctag 5' overhang
1443
+
1444
+
1445
+
1446
+ 5-gatcaaa gatcaaaGA 5' ends are partially filled in the
1447
+ ||| ---> ||||||| presence of GA to produce a 2 nt
1448
+ tttctag-5 AGtttctag 5' overhang
1449
+
1450
+
1451
+
1452
+ 5-gatcaaa gatcaaaG 5' ends are partially filled in the
1453
+ ||| ---> ||||| presence of G to produce a 3 nt
1454
+ tttctag-5 Gtttctag 5' overhang
1455
+
1258
1456
 
1259
- def T4(self, nucleotides=None) -> "Dseq":
1260
- """Fill in five prime protruding ends and chewing back
1261
- three prime protruding ends by a DNA polymerase providing both
1262
- 5'-3' DNA polymerase activity and 3'-5' nuclease acitivty
1263
- (such as T4 DNA polymerase). This can be done in presence of any
1264
- combination of the four A, G, C or T. Removing one or more nucleotides
1265
- can facilitate engineering of sticky ends. Default are all four nucleotides together.
1266
1457
 
1267
1458
  Parameters
1268
1459
  ----------
@@ -1273,29 +1464,31 @@ class Dseq(_Seq):
1273
1464
  --------
1274
1465
 
1275
1466
  >>> from pydna.dseq import Dseq
1276
- >>> a=Dseq("gatcgatc")
1467
+ >>> a = Dseq.from_representation(
1468
+ ... '''
1469
+ ... gatcaaa
1470
+ ... tttctag
1471
+ ... ''')
1277
1472
  >>> a
1278
- Dseq(-8)
1279
- gatcgatc
1280
- ctagctag
1473
+ Dseq(-11)
1474
+ gatcaaa
1475
+ tttctag
1281
1476
  >>> a.T4()
1282
- Dseq(-8)
1283
- gatcgatc
1284
- ctagctag
1285
- >>> a.T4("t")
1286
- Dseq(-8)
1287
- gatcgat
1288
- tagctag
1289
- >>> a.T4("a")
1290
- Dseq(-8)
1291
- gatcga
1292
- agctag
1293
- >>> a.T4("g")
1294
- Dseq(-8)
1295
- gatcg
1296
- gctag
1297
- >>>
1298
-
1477
+ Dseq(-11)
1478
+ gatcaaagatc
1479
+ ctagtttctag
1480
+ >>> a.T4("GAT")
1481
+ Dseq(-11)
1482
+ gatcaaagat
1483
+ tagtttctag
1484
+ >>> a.T4("GA")
1485
+ Dseq(-11)
1486
+ gatcaaaga
1487
+ agtttctag
1488
+ >>> a.T4("G")
1489
+ Dseq(-11)
1490
+ gatcaaag
1491
+ gtttctag
1299
1492
  """
1300
1493
 
1301
1494
  if not nucleotides:
@@ -1303,7 +1496,7 @@ class Dseq(_Seq):
1303
1496
  nucleotides = set(nucleotides.lower() + nucleotides.upper())
1304
1497
  type, se = self.five_prime_end()
1305
1498
  if type == "5'":
1306
- crick, ovhg = self._fill_in_five_prime(nucleotides)
1499
+ crick, ovhg = self._fill_in_left(nucleotides)
1307
1500
  else:
1308
1501
  if type == "3'":
1309
1502
  ovhg = 0
@@ -1323,7 +1516,7 @@ class Dseq(_Seq):
1323
1516
  watson = self.watson
1324
1517
  type, se = self.three_prime_end()
1325
1518
  if type == "5'":
1326
- watson = self._fill_in_three_prime(nucleotides)
1519
+ watson = self._fill_in_right(nucleotides)
1327
1520
  else:
1328
1521
  if type == "3'":
1329
1522
  watson = self.watson[: -len(se)]
@@ -1337,32 +1530,311 @@ class Dseq(_Seq):
1337
1530
 
1338
1531
  t4 = T4 # alias for the T4 method.
1339
1532
 
1340
- def exo1_front(self: DseqType, n=1) -> DseqType:
1341
- """5'-3' resection at the start (left side) of the molecule."""
1342
- d = _copy.deepcopy(self)
1343
- d.ovhg += n
1344
- d.watson = d.watson[n:]
1345
- return d
1533
+ def nibble_five_prime_left(self: DseqType, n: int = 1) -> DseqType:
1534
+ """
1535
+ 5' => 3' resection at the left side (start) of the molecule.
1536
+
1537
+ The argument n indicates the number of nucleotides that are to be
1538
+ removed. The outcome of this depends on the structure of the molecule.
1539
+ See the two examples below:
1540
+
1541
+ The figure below indicates a recess of length two from a blunt DNA
1542
+ fragment. The resulting DNA fragment has a 3' protruding single strand.
1543
+
1544
+ ::
1545
+
1546
+ gatc tc
1547
+ |||| --> ||
1548
+ ctag ctag
1549
+
1550
+
1551
+ The figure below indicates a recess of length two from a DNA fragment
1552
+ with a 5' sticky end resulting in a blunt sequence.
1553
+
1554
+ ::
1555
+
1556
+ ttgatc gatc
1557
+ |||| --> ||||
1558
+ ctag ctag
1559
+
1560
+
1561
+ >>> from pydna.dseq import Dseq
1562
+ >>> ds = Dseq("gatc")
1563
+ >>> ds
1564
+ Dseq(-4)
1565
+ gatc
1566
+ ctag
1567
+ >>> ds.nibble_five_prime_left(2)
1568
+ Dseq(-4)
1569
+ tc
1570
+ ctag
1571
+ >>> ds.nibble_five_prime_left(3)
1572
+ Dseq(-4)
1573
+ c
1574
+ ctag
1575
+ >>> ds.nibble_five_prime_left(4)
1576
+ Dseq(-4)
1577
+ <BLANKLINE>
1578
+ ctag
1579
+ >>> ds = Dseq.from_representation(
1580
+ ... '''
1581
+ ... GGgatc
1582
+ ... ctag
1583
+ ... ''')
1584
+ >>> ds
1585
+ Dseq(-6)
1586
+ GGgatc
1587
+ ctag
1588
+ >>> ds.nibble_five_prime_left(2)
1589
+ Dseq(-4)
1590
+ gatc
1591
+ ctag
1592
+
1593
+ Parameters
1594
+ ----------
1595
+ n : int, optional
1596
+ The default is 1. This is the number of nucleotides removed.
1597
+
1598
+ Returns
1599
+ -------
1600
+ DseqType
1601
+ A new Dseq object with n nucleotides resected from the 5' end on the left side.
1602
+
1603
+ """
1604
+ recessed = copy.deepcopy(self)
1605
+ n += max(0, self.ovhg or 0)
1606
+ recessed = Dseq(
1607
+ self._data[:n]
1608
+ .translate(dscode_to_crick_table)
1609
+ .translate(complement_table_for_dscode)
1610
+ .translate(dscode_to_crick_tail_table)
1611
+ .lstrip()
1612
+ + self._data[n:]
1613
+ )
1614
+ return recessed
1615
+
1616
+ def nibble_five_prime_right(self: DseqType, n: int = 1) -> DseqType:
1617
+ """
1618
+ 5' => 3' resection at the right side (end) of the molecule.
1619
+
1620
+ The argument n indicates the number of nucleotides that are to be
1621
+ removed. The outcome of this depends on the structure of the molecule.
1622
+ See the two examples below:
1623
+
1624
+ The figure below indicates a recess of length two from a blunt DNA
1625
+ fragment. The resulting DNA fragment has a 3' protruding single strand.
1626
+
1627
+ ::
1628
+
1629
+ gatc gatc
1630
+ |||| --> ||
1631
+ ctag ct
1632
+
1633
+ The figure below indicates a recess of length two from a DNA fragment
1634
+ with a 5' sticky end resulting in a blunt sequence.
1346
1635
 
1347
- def exo1_end(self: DseqType, n=1) -> DseqType:
1348
- """5'-3' resection at the end (right side) of the molecule."""
1349
- d = _copy.deepcopy(self)
1350
- d.crick = d.crick[n:]
1351
- return d
1636
+ ::
1637
+
1638
+ gatc gatc
1639
+ |||| --> ||||
1640
+ ctagtt ctag
1641
+
1642
+
1643
+ >>> from pydna.dseq import Dseq
1644
+ >>> ds = Dseq("gatc")
1645
+ >>> ds
1646
+ Dseq(-4)
1647
+ gatc
1648
+ ctag
1649
+ >>> ds.nibble_five_prime_right(2)
1650
+ Dseq(-4)
1651
+ gatc
1652
+ ct
1653
+ >>> ds.nibble_five_prime_right(3)
1654
+ Dseq(-4)
1655
+ gatc
1656
+ c
1657
+ >>> ds.nibble_five_prime_right(4)
1658
+ Dseq(-4)
1659
+ gatc
1660
+ <BLANKLINE>
1661
+ >>> ds = Dseq.from_representation(
1662
+ ... '''
1663
+ ... gatc
1664
+ ... ctagGG
1665
+ ... ''')
1666
+ >>> ds.nibble_five_prime_right(2)
1667
+ Dseq(-4)
1668
+ gatc
1669
+ ctag
1670
+ """
1671
+ recessed = copy.deepcopy(self)
1672
+ n = len(self) - n
1673
+ ovhg = len(self) if self.right_ovhg is None else self.right_ovhg
1674
+ n -= max(0, ovhg)
1675
+ recessed = Dseq(
1676
+ self._data[:n]
1677
+ + self._data[n:]
1678
+ .translate(dscode_to_watson_table)
1679
+ .translate(dscode_to_watson_tail_table)
1680
+ .lstrip()
1681
+ )
1682
+ return recessed
1683
+
1684
+ exo1_front = nibble_five_prime_left # TODO: consider using the new names
1685
+ exo1_end = nibble_five_prime_right # TODO: consider using the new names
1686
+
1687
+ def nibble_three_prime_left(self: DseqType, n=1) -> DseqType:
1688
+ """
1689
+ 3' => 5' resection at the left side (beginning) of the molecule.
1690
+
1691
+ The argument n indicates the number of nucleotides that are to be
1692
+ removed. The outcome of this depends on the structure of the molecule.
1693
+ See the two examples below:
1694
+
1695
+ The figure below indicates a recess of length two from a blunt DNA
1696
+ fragment. The resulting DNA fragment has a 5' protruding single strand.
1697
+
1698
+ ::
1699
+
1700
+ gatc gatc
1701
+ |||| --> ||
1702
+ ctag ag
1703
+
1704
+ The figure below indicates a recess of length two from a DNA fragment
1705
+ with a 3' sticky end resulting in a blunt sequence.
1706
+
1707
+ ::
1708
+
1709
+ gatc gatc
1710
+ |||| --> ||||
1711
+ ttctag ctag
1712
+
1713
+
1714
+ >>> from pydna.dseq import Dseq
1715
+ >>> ds = Dseq("gatc")
1716
+ >>> ds
1717
+ Dseq(-4)
1718
+ gatc
1719
+ ctag
1720
+ >>> ds.nibble_three_prime_left(2)
1721
+ Dseq(-4)
1722
+ gatc
1723
+ ag
1724
+ >>> ds.nibble_three_prime_left(3)
1725
+ Dseq(-4)
1726
+ gatc
1727
+ g
1728
+ >>> ds.nibble_three_prime_left(4)
1729
+ Dseq(-4)
1730
+ gatc
1731
+ <BLANKLINE>
1732
+ >>> ds = Dseq.from_representation(
1733
+ ... '''
1734
+ ... gatc
1735
+ ... CCctag
1736
+ ... ''')
1737
+ >>> ds
1738
+ Dseq(-6)
1739
+ gatc
1740
+ CCctag
1741
+ >>> ds.nibble_three_prime_left(2)
1742
+ Dseq(-4)
1743
+ gatc
1744
+ ctag
1745
+ """
1746
+ ovhg = len(self) if self.ovhg is None else self.ovhg
1747
+ n -= min(0, ovhg)
1748
+ recessed = Dseq(
1749
+ self._data[:n]
1750
+ .translate(dscode_to_watson_table)
1751
+ .translate(dscode_to_watson_tail_table)
1752
+ .lstrip()
1753
+ + self._data[n:]
1754
+ )
1755
+ return recessed
1756
+
1757
+ def nibble_three_prime_right(self: DseqType, n=1) -> DseqType:
1758
+ """
1759
+ 3' => 5' resection at the right side (end) of the molecule.
1760
+
1761
+ The argument n indicates the number of nucleotides that are to be
1762
+ removed. The outcome of this depends on the structure of the molecule.
1763
+ See the two examples below:
1764
+
1765
+ The figure below indicates a recess of length two from a blunt DNA
1766
+ fragment. The resulting DNA fragment has a 5' protruding single strand.
1767
+
1768
+ ::
1769
+
1770
+ gatc ga
1771
+ |||| --> ||
1772
+ ctag ctag
1773
+
1774
+ The figure below indicates a recess of length two from a DNA fragment
1775
+ with a 3' sticky end resulting in a blunt sequence.
1776
+
1777
+ ::
1778
+
1779
+ gatctt gatc
1780
+ |||| --> ||||
1781
+ ctag ctag
1782
+
1783
+
1784
+ >>> from pydna.dseq import Dseq
1785
+ >>> ds = Dseq("gatc")
1786
+ >>> ds
1787
+ Dseq(-4)
1788
+ gatc
1789
+ ctag
1790
+ >>> ds.nibble_three_prime_right(2)
1791
+ Dseq(-4)
1792
+ ga
1793
+ ctag
1794
+ >>> ds.nibble_three_prime_right(3)
1795
+ Dseq(-4)
1796
+ g
1797
+ ctag
1798
+ >>> ds.nibble_three_prime_right(4)
1799
+ Dseq(-4)
1800
+ <BLANKLINE>
1801
+ ctag
1802
+ >>> ds = Dseq.from_representation(
1803
+ ... '''
1804
+ ... gatcCC
1805
+ ... ctag
1806
+ ... ''')
1807
+ >>> ds.nibble_three_prime_right(2)
1808
+ Dseq(-4)
1809
+ gatc
1810
+ ctag
1811
+ """
1812
+ n = len(self) - n
1813
+ ovhg = len(self) if self.right_ovhg is None else self.right_ovhg
1814
+ n += min(0, ovhg)
1815
+ recessed = Dseq(
1816
+ self._data[:n]
1817
+ + self._data[n:]
1818
+ .translate(dscode_to_crick_table)
1819
+ .translate(complement_table_for_dscode)
1820
+ .translate(dscode_to_crick_tail_table)
1821
+ .lstrip()
1822
+ )
1823
+ return recessed
1352
1824
 
1353
1825
  def no_cutters(
1354
- self, batch: _Union[_RestrictionBatch, None] = None
1355
- ) -> _RestrictionBatch:
1826
+ self, batch: Union[RestrictionBatch, None] = None
1827
+ ) -> RestrictionBatch:
1356
1828
  """Enzymes in a RestrictionBatch not cutting sequence."""
1357
1829
  if batch is None:
1358
1830
  batch = CommOnly
1359
1831
  ana = batch.search(self)
1360
1832
  ncut = {enz: sitelist for (enz, sitelist) in ana.items() if not sitelist}
1361
- return _RestrictionBatch(ncut)
1833
+ return RestrictionBatch(ncut)
1362
1834
 
1363
1835
  def unique_cutters(
1364
- self, batch: _Union[_RestrictionBatch, None] = None
1365
- ) -> _RestrictionBatch:
1836
+ self, batch: Union[RestrictionBatch, None] = None
1837
+ ) -> RestrictionBatch:
1366
1838
  """Enzymes in a RestrictionBatch cutting sequence once."""
1367
1839
  if batch is None:
1368
1840
  batch = CommOnly
@@ -1371,44 +1843,42 @@ class Dseq(_Seq):
1371
1843
  once_cutters = unique_cutters # alias for unique_cutters
1372
1844
 
1373
1845
  def twice_cutters(
1374
- self, batch: _Union[_RestrictionBatch, None] = None
1375
- ) -> _RestrictionBatch:
1846
+ self, batch: Union[RestrictionBatch, None] = None
1847
+ ) -> RestrictionBatch:
1376
1848
  """Enzymes in a RestrictionBatch cutting sequence twice."""
1377
1849
  if batch is None:
1378
1850
  batch = CommOnly
1379
1851
  return self.n_cutters(n=2, batch=batch)
1380
1852
 
1381
1853
  def n_cutters(
1382
- self, n=3, batch: _Union[_RestrictionBatch, None] = None
1383
- ) -> _RestrictionBatch:
1854
+ self, n=3, batch: Union[RestrictionBatch, None] = None
1855
+ ) -> RestrictionBatch:
1384
1856
  """Enzymes in a RestrictionBatch cutting n times."""
1385
1857
  if batch is None:
1386
1858
  batch = CommOnly
1387
1859
  ana = batch.search(self)
1388
1860
  ncut = {enz: sitelist for (enz, sitelist) in ana.items() if len(sitelist) == n}
1389
- return _RestrictionBatch(ncut)
1861
+ return RestrictionBatch(ncut)
1390
1862
 
1391
- def cutters(
1392
- self, batch: _Union[_RestrictionBatch, None] = None
1393
- ) -> _RestrictionBatch:
1863
+ def cutters(self, batch: Union[RestrictionBatch, None] = None) -> RestrictionBatch:
1394
1864
  """Enzymes in a RestrictionBatch cutting sequence at least once."""
1395
1865
  if batch is None:
1396
1866
  batch = CommOnly
1397
1867
  ana = batch.search(self)
1398
1868
  ncut = {enz: sitelist for (enz, sitelist) in ana.items() if sitelist}
1399
- return _RestrictionBatch(ncut)
1869
+ return RestrictionBatch(ncut)
1400
1870
 
1401
1871
  def seguid(self) -> str:
1402
1872
  """SEGUID checksum for the sequence."""
1403
1873
  if self.circular:
1404
- cs = _cdseguid(
1874
+ cs = cdseguid(
1405
1875
  self.watson.upper(), self.crick.upper(), alphabet="{DNA-extended}"
1406
1876
  )
1407
1877
  else:
1408
1878
  """docstring."""
1409
1879
  w = f"{self.ovhg * '-'}{self.watson}{'-' * (-self.ovhg + len(self.crick) - len(self.watson))}".upper()
1410
1880
  c = f"{'-' * (self.ovhg + len(self.watson) - len(self.crick))}{self.crick}{-self.ovhg * '-'}".upper()
1411
- cs = _ldseguid(w, c, alphabet="{DNA-extended}")
1881
+ cs = ldseguid(w, c, alphabet="{DNA-extended}")
1412
1882
  return cs
1413
1883
 
1414
1884
  def isblunt(self) -> bool:
@@ -1449,29 +1919,113 @@ class Dseq(_Seq):
1449
1919
  >>> a.isblunt()
1450
1920
  False
1451
1921
  """
1452
- return (
1453
- self.ovhg == 0 and len(self.watson) == len(self.crick) and not self.circular
1922
+ parts = self.get_parts()
1923
+
1924
+ return not any(
1925
+ (
1926
+ parts.sticky_right5,
1927
+ parts.sticky_right3,
1928
+ parts.sticky_left3,
1929
+ parts.sticky_left5,
1930
+ self.circular,
1931
+ )
1454
1932
  )
1455
1933
 
1456
- def cas9(self, RNA: str) -> _Tuple[slice, ...]:
1457
- """docstring."""
1458
- bRNA = bytes(RNA, "ASCII")
1459
- slices = []
1460
- cuts = [0]
1461
- for m in _re.finditer(bRNA, self._data):
1462
- cuts.append(m.start() + 17)
1463
- cuts.append(self.length)
1464
- slices = tuple(slice(x, y, 1) for x, y in zip(cuts, cuts[1:]))
1465
- return slices
1466
-
1467
- def terminal_transferase(self, nucleotides="a") -> "Dseq":
1468
- """docstring."""
1934
+ def terminal_transferase(self, nucleotides: str = "a") -> DseqType:
1935
+ """
1936
+ Terminal deoxynucleotidyl transferase (TdT) is a template-independent
1937
+ DNA polymerase that adds nucleotides to the 3′-OH ends of DNA, typically
1938
+ single-stranded or recessed 3′ ends. In cloning, it’s classically used
1939
+ to create homopolymer tails (e.g. poly-dG on a vector and poly-dC on an insert)
1940
+ so that fragments can anneal via complementary overhangs (“tailing” cloning).
1941
+
1942
+ This activity is also present in some DNA polymerases, such as Taq polymerase.
1943
+ This property is used in the popular T/A cloning protocol ([#]_).
1944
+
1945
+ ::
1946
+
1947
+ gct gcta
1948
+ ||| --> |||
1949
+ cga acga
1950
+
1951
+
1952
+
1953
+ >>> from pydna.dseq import Dseq
1954
+ >>> a = Dseq("aa")
1955
+ >>> a = Dseq("gct")
1956
+ >>> a
1957
+ Dseq(-3)
1958
+ gct
1959
+ cga
1960
+ >>> a.terminal_transferase()
1961
+ Dseq(-5)
1962
+ gcta
1963
+ acga
1964
+ >>> a.terminal_transferase("G")
1965
+ Dseq(-5)
1966
+ gctG
1967
+ Gcga
1968
+
1969
+ Parameters
1970
+ ----------
1971
+ nucleotides : str, optional
1972
+ The default is "a".
1973
+
1974
+ Returns
1975
+ -------
1976
+ DseqType
1977
+ A new Dseq with `nucleotides` appended to both the watson and crick strands.
1978
+
1979
+ References
1980
+ ----------
1981
+ .. [#] https://en.wikipedia.org/wiki/TA_cloning
1982
+
1983
+ """
1469
1984
  ovhg = self.ovhg
1470
1985
  if self.ovhg >= 0:
1471
1986
  ovhg += len(nucleotides)
1472
1987
  return Dseq(self.watson + nucleotides, self.crick + nucleotides, ovhg)
1473
1988
 
1474
- def cut(self: DseqType, *enzymes: EnzymesType) -> _Tuple[DseqType, ...]:
1989
+ def user(self) -> DseqType:
1990
+ """
1991
+ USER Enzyme treatment.
1992
+
1993
+ USER Enzyme is a mixture of Uracil DNA glycosylase (UDG) and the
1994
+ DNA glycosylase-lyase Endonuclease VIII.
1995
+
1996
+ UDG catalyses the excision of a uracil base, forming an abasic
1997
+ or apyrimidinic site (AP site). Endonuclease VIII removes the AP
1998
+ site creating a DNA gap.
1999
+
2000
+ ::
2001
+
2002
+ tagaagtaggUat tagaagtagg at
2003
+ ||||||||||||| ---> |||||||||| ||
2004
+ atcUtcatccata atc tcatccata
2005
+
2006
+
2007
+
2008
+ >>> a = Dseq("tagaagtaggUat", "atcUtcatccata"[::-1], 0)
2009
+ >>> a
2010
+ Dseq(-13)
2011
+ tagaagtaggUat
2012
+ atcutcatccAta
2013
+ >>> a.user()
2014
+ Dseq(-13)
2015
+ tagaagtagg at
2016
+ atc tcatccAta
2017
+
2018
+
2019
+ Returns
2020
+ -------
2021
+ DseqType
2022
+ DNA fragment with uracil bases removed.
2023
+
2024
+ """
2025
+
2026
+ return Dseq(self._data.translate(bytes.maketrans(b"UuOo", b"ZzEe")))
2027
+
2028
+ def cut(self: DseqType, *enzymes: EnzymesType) -> Tuple[DseqType, ...]:
1475
2029
  """Returns a list of linear Dseq fragments produced in the digestion.
1476
2030
  If there are no cuts, an empty list is returned.
1477
2031
 
@@ -1522,11 +2076,73 @@ class Dseq(_Seq):
1522
2076
  return tuple(self.apply_cut(*cs) for cs in cutsite_pairs)
1523
2077
 
1524
2078
  def cutsite_is_valid(self, cutsite: CutSiteType) -> bool:
1525
- """Returns False if:
2079
+ """
2080
+ Check if a cutsite is valid.
2081
+
2082
+ A cutsite is a nested 2-tuple with this form:
2083
+
2084
+ ((cut_watson, ovhg), enz), for example ((396, -4), EcoRI)
2085
+
2086
+ The cut_watson (positive integer) is the cut position of the sequence as for example
2087
+ returned by the Bio.Restriction module.
2088
+
2089
+ The ovhg (overhang, positive or negative integer or 0) has the same meaning as
2090
+ for restriction enzymes in the Bio.Restriction module and for
2091
+ pydna.dseq.Dseq objects (see docstring for this module and example below)
2092
+
2093
+ Enzyme can be None.
2094
+
2095
+ ::
2096
+
2097
+ Enzyme overhang
2098
+
2099
+ EcoRI -4 --GAATTC-- --G AATTC--
2100
+ |||||| --> | |
2101
+ --CTTAAG-- --CTTAA G--
2102
+
2103
+ KpnI 4 --GGTACC-- --GGTAC C--
2104
+ |||||| --> | |
2105
+ --CCATGG-- --C CATGG--
2106
+
2107
+ SmaI 0 --CCCGGG-- --CCC GGG--
2108
+ |||||| --> ||| |||
2109
+ --GGGCCC-- --GGG CCC--
2110
+
2111
+
2112
+ >>> from Bio.Restriction import EcoRI, KpnI, SmaI
2113
+ >>> EcoRI.ovhg
2114
+ -4
2115
+ >>> KpnI.ovhg
2116
+ 4
2117
+ >>> SmaI.ovhg
2118
+ 0
2119
+
2120
+ Returns False if:
2121
+
1526
2122
  - Cut positions fall outside the sequence (could be moved to Biopython)
2123
+ TODO: example
2124
+
1527
2125
  - Overhang is not double stranded
2126
+ TODO: example
2127
+
1528
2128
  - Recognition site is not double stranded or is outside the sequence
2129
+ TODO: example
2130
+
1529
2131
  - For enzymes that cut twice, it checks that at least one possibility is valid
2132
+ TODO: example
2133
+
2134
+
2135
+
2136
+ Parameters
2137
+ ----------
2138
+ cutsite : CutSiteType
2139
+ A cutsite of the form ((cut_watson, ovhg), enz); enz may be None.
2140
+
2141
+ Returns
2142
+ -------
2143
+ bool
2144
+ True if cutsite can cut the DNA fragment.
2145
+
1530
2146
  """
1531
2147
 
1532
2148
  assert cutsite is not None, "cutsite is None"
@@ -1536,7 +2152,7 @@ class Dseq(_Seq):
1536
2152
 
1537
2153
  # The overhang is double stranded
1538
2154
  overhang_dseq = self[watson:crick] if ovhg < 0 else self[crick:watson]
1539
- if overhang_dseq.ovhg != 0 or overhang_dseq.watson_ovhg() != 0:
2155
+ if overhang_dseq.ovhg != 0 or overhang_dseq.watson_ovhg != 0:
1540
2156
  return False
1541
2157
 
1542
2158
  # The recognition site is double stranded and within the sequence
@@ -1550,7 +2166,7 @@ class Dseq(_Seq):
1550
2166
  if (
1551
2167
  len(recognition_site) == 0
1552
2168
  or recognition_site.ovhg != 0
1553
- or recognition_site.watson_ovhg() != 0
2169
+ or recognition_site.watson_ovhg != 0
1554
2170
  ):
1555
2171
  if enz is None or enz.scd5 is None:
1556
2172
  return False
@@ -1569,20 +2185,22 @@ class Dseq(_Seq):
1569
2185
  if (
1570
2186
  len(recognition_site) == 0
1571
2187
  or recognition_site.ovhg != 0
1572
- or recognition_site.watson_ovhg() != 0
2188
+ or recognition_site.watson_ovhg != 0
1573
2189
  ):
1574
2190
  return False
1575
2191
 
1576
2192
  return True
1577
2193
 
1578
- def get_cutsites(self: DseqType, *enzymes: EnzymesType) -> _List[CutSiteType]:
2194
+ def get_cutsites(self: DseqType, *enzymes: EnzymesType) -> List[CutSiteType]:
1579
2195
  """Returns a list of cutsites, represented represented as `((cut_watson, ovhg), enz)`:
1580
2196
 
1581
2197
  - `cut_watson` is a positive integer contained in `[0,len(seq))`, where `seq` is the sequence
1582
2198
  that will be cut. It represents the position of the cut on the watson strand, using the full
1583
2199
  sequence as a reference. By "full sequence" I mean the one you would get from `str(Dseq)`.
2200
+
1584
2201
  - `ovhg` is the overhang left after the cut. It has the same meaning as `ovhg` in
1585
2202
  the `Bio.Restriction` enzyme objects, or pydna's `Dseq` property.
2203
+
1586
2204
  - `enz` is the enzyme object. It's not necessary to perform the cut, but can be
1587
2205
  used to keep track of which enzyme was used.
1588
2206
 
@@ -1592,7 +2210,7 @@ class Dseq(_Seq):
1592
2210
  Parameters
1593
2211
  ----------
1594
2212
 
1595
- enzymes : Union[_RestrictionBatch,list[_AbstractCut]]
2213
+ enzymes : Union[RestrictionBatch,list[_AbstractCut]]
1596
2214
 
1597
2215
  Returns
1598
2216
  -------
@@ -1628,11 +2246,11 @@ class Dseq(_Seq):
1628
2246
 
1629
2247
  """
1630
2248
 
1631
- if len(enzymes) == 1 and isinstance(enzymes[0], _RestrictionBatch):
2249
+ if len(enzymes) == 1 and isinstance(enzymes[0], RestrictionBatch):
1632
2250
  # argument is probably a RestrictionBatch
1633
2251
  enzymes = [e for e in enzymes[0]]
1634
2252
 
1635
- enzymes = _flatten(enzymes)
2253
+ enzymes = flatten(enzymes)
1636
2254
  out = list()
1637
2255
  for e in enzymes:
1638
2256
  # Positions of the cut on the watson strand. They are 1-based, so we subtract
@@ -1643,7 +2261,7 @@ class Dseq(_Seq):
1643
2261
 
1644
2262
  return sorted([cutsite for cutsite in out if self.cutsite_is_valid(cutsite)])
1645
2263
 
1646
- def left_end_position(self) -> _Tuple[int, int]:
2264
+ def left_end_position(self) -> Tuple[int, int]:
1647
2265
  """
1648
2266
  The index in the full sequence of the watson and crick start positions.
1649
2267
 
@@ -1660,7 +2278,7 @@ class Dseq(_Seq):
1660
2278
  return self.ovhg, 0
1661
2279
  return 0, -self.ovhg
1662
2280
 
1663
- def right_end_position(self) -> _Tuple[int, int]:
2281
+ def right_end_position(self) -> Tuple[int, int]:
1664
2282
  """The index in the full sequence of the watson and crick end positions.
1665
2283
 
1666
2284
  full sequence (str(self)) for all three cases is AAA
@@ -1672,13 +2290,210 @@ class Dseq(_Seq):
1672
2290
  ```
1673
2291
 
1674
2292
  """
1675
- if self.watson_ovhg() < 0:
1676
- return len(self) + self.watson_ovhg(), len(self)
1677
- return len(self), len(self) - self.watson_ovhg()
2293
+ if self.watson_ovhg < 0:
2294
+ return len(self) + self.watson_ovhg, len(self)
2295
+ return len(self), len(self) - self.watson_ovhg
2296
+
2297
+ def get_ss_meltsites(self: DseqType, length: int) -> tuple[int, int]:
2298
+ """
2299
+ Single stranded DNA melt sites
2300
+
2301
+ Two lists of 2-tuples of integers are returned. Each tuple
2302
+ (`((from, to))`) contains the start and end positions of a single
2303
+ stranded region, shorter or equal to `length`.
2304
+
2305
+ In the example below, the middle 2 nt part is released from the
2306
+ molecule.
2307
+
2308
+ ::
2309
+
2310
+
2311
+ tagaa ta gtatg
2312
+ ||||| || ||||| --> [(6,8)], []
2313
+ atcttcatccatac
2314
+
2315
+ tagaagtaggtatg
2316
+ ||||| || ||||| --> [], [(6,8)]
2317
+ atctt at catac
2318
+
2319
+
2320
+
2321
+
2322
+ The output of this method is used in the `melt_ss_dna` method in order
2323
+ to determine the start and end positions of single stranded regions.
2324
+
2325
+ See get_ds_meltsites for melting ds sequences.
2326
+
2327
+ Examples
2328
+ --------
2329
+ >>> from pydna.dseq import Dseq
2330
+ >>> ds = Dseq("tagaaqtaqgtatg")
2331
+ >>> ds
2332
+ Dseq(-14)
2333
+ tagaa ta gtatg
2334
+ atcttcatccatac
2335
+ >>> cutsites = ds.get_ss_meltsites(2)
2336
+ >>> cutsites
2337
+ ([(6, 8)], [])
2338
+ >>> ds[6:8]
2339
+ Dseq(-2)
2340
+ ta
2341
+ at
2342
+ >>> ds = Dseq("tagaaptapgtatg")
2343
+ >>> ds
2344
+ Dseq(-14)
2345
+ tagaagtaggtatg
2346
+ atctt at catac
2347
+ >>> cutsites = ds.get_ss_meltsites(2)
2348
+ >>> cutsites
2349
+ ([], [(6, 8)])
2350
+ """
2351
+
2352
+ regex = regex_ss_melt_factory(length)
2353
+
2354
+ if self.circular:
2355
+ spacer = length
2356
+ cutfrom = self._data[-length:] + self._data + self._data[:length]
2357
+ else:
2358
+ spacer = 0
2359
+ cutfrom = self._data
2360
+
2361
+ watson_cuts = []
2362
+ crick_cuts = []
2363
+
2364
+ for m in regex.finditer(cutfrom):
2365
+
2366
+ if m.lastgroup == "watson":
2367
+ cut1 = m.start() + spacer
2368
+ cut2 = m.end() + spacer
2369
+ watson_cuts.append((cut1, cut2))
2370
+ else:
2371
+ assert m.lastgroup == "crick"
2372
+ cut1 = m.start() + spacer
2373
+ cut2 = m.end() + spacer
2374
+ crick_cuts.append((cut1, cut2))
2375
+
2376
+ return watson_cuts, crick_cuts
2377
+
2378
+ def get_ds_meltsites(self: DseqType, length: int) -> List[CutSiteType]:
2379
+ """
2380
+ Double stranded DNA melt sites
2381
+
2382
+ DNA molecules can fall apart by melting if they have internal single
2383
+ stranded regions. In the example below, the molecule has two gaps
2384
+ on opposite sides, two nucleotides apart, which means that it hangs
2385
+ together by two basepairs.
2386
+
2387
+ This molecule can melt into two separate 8 bp double stranded
2388
+ molecules, each with 3 nt 3' overhangs as depicted below.
2389
+
2390
+ ::
2391
+
2392
+ tagaagta gtatg tagaagta gtatg
2393
+ ||||| || ||||| --> ||||| |||||
2394
+ atctt atccatac atctt atccatac
2395
+
2396
+
2397
+ A list of 2-tuples is returned. Each tuple (`((cut_watson, ovhg), None)`)
2398
+ contains cut position and the overhang value in the same format as
2399
+ returned by the get_cutsites method for restriction enzymes.
2400
+
2401
+ Note that this function deals with melting that results in two double
2402
+ stranded DNA molecules.
2403
+
2404
+ See get_ss_meltsites for melting of single stranded regions from
2405
+ molecules.
2406
+
2407
+ Examples
2408
+ --------
2409
+ >>> from pydna.dseq import Dseq
2410
+ >>> ds = Dseq("tagaaptaqgtatg")
2411
+ >>> ds
2412
+ Dseq(-14)
2413
+ tagaagta gtatg
2414
+ atctt atccatac
2415
+ >>> cutsite = ds.get_ds_meltsites(2)
2416
+ >>> cutsite
2417
+ [((8, 2), None)]
2418
+
2419
+ """
2420
+
2421
+ if length < 1:
2422
+ return tuple()
2423
+
2424
+ regex = regex_ds_melt_factory(length)
2425
+
2426
+ if self.circular:
2427
+ spacer = length
2428
+ cutfrom = self._data[-length:] + self._data + self._data[:length]
2429
+ else:
2430
+ spacer = 0
2431
+ cutfrom = self._data
2432
+
2433
+ cuts = []
2434
+
2435
+ for m in regex.finditer(cutfrom):
2436
+
2437
+ if m.lastgroup == "watson":
2438
+ cut = (m.end() - spacer, m.end() - m.start()), None
2439
+ else:
2440
+ assert m.lastgroup == "crick"
2441
+ cut = (m.start() - spacer, m.start() - m.end()), None
2442
+
2443
+ cuts.append(cut)
2444
+
2445
+ return cuts
2446
+
2447
+ def cast_to_ds_right(self):
2448
+ """
2449
+ NNNN NNNNGATC
2450
+ |||| --> ||||||||
2451
+ NNNNCTAG NNNNCTAG
2452
+
2453
+
2454
+ NNNNGATC NNNNGATC
2455
+ |||| --> ||||||||
2456
+ NNNN NNNNCTAG
2457
+ """
2458
+
2459
+ p = self.get_parts()
2460
+
2461
+ ds_stuffer = (p.sticky_right5 or p.sticky_right3).translate(
2462
+ dscode_to_full_sequence_table
2463
+ )
2464
+
2465
+ result = (p.sticky_left5 or p.sticky_left3) + p.middle + ds_stuffer
2466
+
2467
+ return self.__class__(result, circular=False)
2468
+
2469
+ def cast_to_ds(self):
2470
+ """Sequentially calls cast_to_ds_left and cast_to_ds_right."""
2471
+ return self.cast_to_ds_left().cast_to_ds_right()
2472
+
2473
+ def cast_to_ds_left(self):
2474
+ """
2475
+ GATCNNNN GATCNNNN
2476
+ |||| --> ||||||||
2477
+ NNNN CTAGNNNN
2478
+
2479
+ NNNN GATCNNNN
2480
+ |||| --> ||||||||
2481
+ CTAGNNNN CTAGNNNN
2482
+ """
2483
+
2484
+ p = self.get_parts()
2485
+
2486
+ ds_stuffer = (p.sticky_left5 or p.sticky_left3).translate(
2487
+ dscode_to_full_sequence_table
2488
+ )
2489
+
2490
+ result = ds_stuffer + p.middle + (p.sticky_right5 or p.sticky_right3)
2491
+
2492
+ return self.__class__(result, circular=False)
1678
2493
 
1679
2494
  def get_cut_parameters(
1680
- self, cut: _Union[CutSiteType, None], is_left: bool
1681
- ) -> _Tuple[int, int, int]:
2495
+ self, cut: Union[CutSiteType, None], is_left: bool
2496
+ ) -> Tuple[int, int, int]:
1682
2497
  """For a given cut expressed as ((cut_watson, ovhg), enz), returns
1683
2498
  a tuple (cut_watson, cut_crick, ovhg).
1684
2499
 
@@ -1703,7 +2518,169 @@ class Dseq(_Seq):
1703
2518
  if is_left:
1704
2519
  return *self.left_end_position(), self.ovhg
1705
2520
  # In the right end, the overhang does not matter
1706
- return *self.right_end_position(), self.watson_ovhg()
2521
+ return *self.right_end_position(), self.watson_ovhg
2522
+
2523
+ def melt(self, length):
2524
+ """
2525
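+ Shed single stranded regions (see melt_ss_dna), then split the molecule at its double stranded melt sites (see get_ds_meltsites).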
2526
+
2527
+ Parameters
2528
+ ----------
2529
+ length : TYPE
2530
+ DESCRIPTION.
2531
+
2532
+ Returns
2533
+ -------
2534
+ TYPE
2535
+ DESCRIPTION.
2536
+
2537
+ """
2538
+ if not length or length < 1:
2539
+ return tuple()
2540
+
2541
+ # First we need to get rid of single stranded sequences
2542
+ new, strands = self.melt_ss_dna(length)
2543
+
2544
+ cutsites = new.get_ds_meltsites(length)
2545
+
2546
+ cutsite_pairs = self.get_cutsite_pairs(cutsites)
2547
+
2548
+ result = tuple(new.apply_cut(*cutsite_pair) for cutsite_pair in cutsite_pairs)
2549
+
2550
+ result = tuple([new]) if strands and not result else result
2551
+
2552
+ return tuple(strands) + tuple(result)
2553
+
2554
+ def melt_ss_dna(self, length) -> tuple["Dseq", list["Dseq"]]:
2555
+ """
2556
+ Melt to separate single stranded DNA
2557
+
2558
+ Single stranded DNA molecules shorter or equal to `length` shed from
2559
+ a double stranded DNA molecule without affecting the length of the
2560
+ remaining molecule.
2561
+
2562
+ In the examples below, the middle 2 nt part is released from the
2563
+ molecule.
2564
+
2565
+ ::
2566
+
2567
+ tagaa ta gtatg tagaa gtatg ta
2568
+ ||||| || ||||| --> ||||| ||||| + ||
2569
+ atcttcatccatac atcttcatccatac
2570
+
2571
+ tagaagtaggtatg tagaagtaggtatg
2572
+ ||||| || ||||| --> ||||| ||||| + ||
2573
+ atctt at catac atctt catac at
2574
+
2575
+
2576
+ Examples
2577
+ --------
2578
+ >>> from pydna.dseq import Dseq
2579
+ >>> ds = Dseq("tagaaqtaqgtatg")
2580
+ >>> ds
2581
+ Dseq(-14)
2582
+ tagaa ta gtatg
2583
+ atcttcatccatac
2584
+ >>> new, strands = ds.melt_ss_dna(2)
2585
+ >>> new
2586
+ Dseq(-14)
2587
+ tagaa gtatg
2588
+ atcttcatccatac
2589
+ >>> strands[0]
2590
+ Dseq(-2)
2591
+ ta
2592
+ <BLANKLINE>
2593
+ >>> ds = Dseq("tagaaptapgtatg")
2594
+ >>> ds
2595
+ Dseq(-14)
2596
+ tagaagtaggtatg
2597
+ atctt at catac
2598
+ >>> new, strands = ds.melt_ss_dna(2)
2599
+ >>> new
2600
+ Dseq(-14)
2601
+ tagaagtaggtatg
2602
+ atctt catac
2603
+ >>> strands[0]
2604
+ Dseq(-2)
2605
+ <BLANKLINE>
2606
+ at
2607
+ """
2608
+
2609
+ watsonnicks, cricknicks = self.get_ss_meltsites(length)
2610
+
2611
+ new, strands = self.shed_ss_dna(watsonnicks, cricknicks)
2612
+
2613
+ return new, strands
2614
+
2615
+ def shed_ss_dna(
2616
+ self,
2617
+ watson_cutpairs: list[tuple[int, int]] = None,
2618
+ crick_cutpairs: list[tuple[int, int]] = None,
2619
+ ):
2620
+ """
2621
+ Separate parts of one of the DNA strands
2622
+
2623
+ Examples
2624
+ --------
2625
+ >>> from pydna.dseq import Dseq
2626
+ >>> ds = Dseq("tagaagtaggtatg")
2627
+ >>> ds
2628
+ Dseq(-14)
2629
+ tagaagtaggtatg
2630
+ atcttcatccatac
2631
+ >>> new, strands = ds.shed_ss_dna([(6, 8)],[])
2632
+ >>> new
2633
+ Dseq(-14)
2634
+ tagaag ggtatg
2635
+ atcttcatccatac
2636
+ >>> strands[0]
2637
+ Dseq(-2)
2638
+ ta
2639
+ <BLANKLINE>
2640
+ >>> new, strands = ds.shed_ss_dna([],[(6, 8)])
2641
+ >>> new
2642
+ Dseq(-14)
2643
+ tagaagtaggtatg
2644
+ atcttc ccatac
2645
+ >>> strands[0]
2646
+ Dseq(-2)
2647
+ <BLANKLINE>
2648
+ at
2649
+ >>> ds = Dseq("tagaagtaggtatg")
2650
+ >>> new, (strand1, strand2) = ds.shed_ss_dna([(6, 8), (9, 11)],[])
2651
+ >>> new
2652
+ Dseq(-14)
2653
+ tagaag g atg
2654
+ atcttcatccatac
2655
+ >>> strand1
2656
+ Dseq(-2)
2657
+ ta
2658
+ <BLANKLINE>
2659
+ >>> strand2
2660
+ Dseq(-2)
2661
+ gt
2662
+ <BLANKLINE>
2663
+ """
2664
+
2665
+ watson_cutpairs = watson_cutpairs or list()
2666
+ crick_cutpairs = crick_cutpairs or list()
2667
+ strands = []
2668
+
2669
+ new = bytearray(self._data)
2670
+
2671
+ for x, y in watson_cutpairs:
2672
+ stuffer = new[x:y]
2673
+ ss = Dseq.quick(new[x:y].translate(dscode_to_watson_tail_table))
2674
+ new[x:y] = stuffer.translate(dscode_to_crick_tail_table)
2675
+ strands.append(ss)
2676
+
2677
+ for x, y in crick_cutpairs:
2678
+ stuffer = new[x:y]
2679
+ ss = Dseq.quick(stuffer.translate(dscode_to_crick_tail_table))
2680
+ new[x:y] = stuffer.translate(dscode_to_watson_tail_table)
2681
+ strands.append(ss)
2682
+
2683
+ return Dseq.quick(new), strands
1707
2684
 
1708
2685
  def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq":
1709
2686
  """Extracts a subfragment of the sequence between two cuts.
@@ -1760,25 +2737,22 @@ class Dseq(_Seq):
1760
2737
  GttCTTAA
1761
2738
 
1762
2739
  """
1763
- if _cuts_overlap(left_cut, right_cut, len(self)):
2740
+ if cuts_overlap(left_cut, right_cut, len(self)):
1764
2741
  raise ValueError("Cuts by {} {} overlap.".format(left_cut[1], right_cut[1]))
1765
2742
 
1766
2743
  left_watson, left_crick, ovhg_left = self.get_cut_parameters(left_cut, True)
1767
2744
  right_watson, right_crick, _ = self.get_cut_parameters(right_cut, False)
1768
2745
  return Dseq(
1769
- str(self[left_watson:right_watson]),
1770
- # The line below could be easier to understand as _rc(str(self[left_crick:right_crick])), but it does not preserve the case
1771
- str(
1772
- self.reverse_complement()[
1773
- len(self) - right_crick : len(self) - left_crick
1774
- ]
1775
- ),
2746
+ self[left_watson:right_watson]._data.translate(dscode_to_watson_table),
2747
+ self[left_crick:right_crick]
2748
+ .reverse_complement()
2749
+ ._data.translate(dscode_to_watson_table),
1776
2750
  ovhg=ovhg_left,
1777
2751
  )
1778
2752
 
1779
2753
  def get_cutsite_pairs(
1780
- self, cutsites: _List[CutSiteType]
1781
- ) -> _List[_Tuple[_Union[None, CutSiteType], _Union[None, CutSiteType]]]:
2754
+ self, cutsites: List[CutSiteType]
2755
+ ) -> List[Tuple[Union[None, CutSiteType], Union[None, CutSiteType]]]:
1782
2756
  """Returns pairs of cutsites that render the edges of the resulting fragments.
1783
2757
 
1784
2758
  A fragment produced by restriction is represented by a tuple of length 2 that
@@ -1828,3 +2802,105 @@ class Dseq(_Seq):
1828
2802
  cutsites.append(cutsites[0])
1829
2803
 
1830
2804
  return list(zip(cutsites, cutsites[1:]))
2805
+
2806
+ def get_parts(self):
2807
+ """
2808
+ Returns a DseqParts instance containing the parts (strings) of a dsDNA
2809
+ sequence. DseqParts instance field names:
2810
+
2811
+ ::
2812
+
2813
+ "sticky_left5"
2814
+ |
2815
+ | "sticky_right5"
2816
+ | |
2817
+ --- ---
2818
+ GGGATCC
2819
+ TAGGTCA
2820
+ ----
2821
+ |
2822
+ "middle"
2823
+
2824
+
2825
+
2826
+ "sticky_left3"
2827
+ |
2828
+ | "sticky_right3"
2829
+ | |
2830
+ --- ---
2831
+ ATCCAGT
2832
+ CCCTAGG
2833
+ ----
2834
+ |
2835
+ "middle"
2836
+
2837
+
2838
+
2839
+ "single_watson" (only an upper strand)
2840
+ |
2841
+ -------
2842
+ ATCCAGT
2843
+ |||||||
2844
+
2845
+
2846
+
2847
+ "single_crick" (only a lower strand)
2848
+ |
2849
+ -------
2850
+
2851
+ |||||||
2852
+ CCCTAGG
2853
+
2854
+
2855
+ Up to seven groups (0..6) are captured, but some are mutually exclusive
2856
+ which means that one of them is an empty string:
2857
+
2858
+ 0 or 1, not both, a DNA fragment has either 5' or 3' sticky end.
2859
+
2860
+ 2 or 5 or 6, a DNA molecule has a ds region or is single stranded.
2861
+
2862
+ 3 or 4, not both, either 5' or 3' sticky end.
2863
+
2864
+ Note that internal single stranded regions are not identified and will
2865
+ be contained in the middle part if they are present.
2866
+
2867
+ Examples
2868
+ --------
2869
+ >>> from pydna.dseq import Dseq
2870
+ >>> ds = Dseq("PPPATCFQZ")
2871
+ >>> ds
2872
+ Dseq(-9)
2873
+ GGGATC
2874
+ TAGTCA
2875
+ >>> parts = ds.get_parts()
2876
+ >>> parts
2877
+ DseqParts(sticky_left5='PPP', sticky_left3='', middle='ATC', sticky_right3='', sticky_right5='FQZ', single_watson='', single_crick='')
2878
+ >>> Dseq(parts.sticky_left5)
2879
+ Dseq(-3)
2880
+ GGG
2881
+ <BLANKLINE>
2882
+ >>> Dseq(parts.middle)
2883
+ Dseq(-3)
2884
+ ATC
2885
+ TAG
2886
+ >>> Dseq(parts.sticky_right5)
2887
+ Dseq(-3)
2888
+ <BLANKLINE>
2889
+ TCA
2890
+
2891
+ Parameters
2892
+ ----------
2893
+ datastring : str
2894
+ A string with dscode.
2895
+
2896
+ Returns
2897
+ -------
2898
+ namedtuple
2899
+ Seven string fields describing the DNA molecule.
2900
+ fragment(sticky_left5='', sticky_left3='',
2901
+ middle='',
2902
+ sticky_right3='', sticky_right5='',
2903
+ single_watson='', single_crick='')
2904
+
2905
+ """
2906
+ return get_parts(self._data.decode("ascii"))