pydna 5.5.4__py3-none-any.whl → 5.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydna/dseq.py CHANGED
@@ -1,10 +1,6 @@
1
1
  #!/usr/bin/env python3
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2013-2023 by Björn Johansson. All rights reserved.
5
- # This code is part of the Python-dna distribution and governed by its
6
- # license. Please see the LICENSE.txt file that should have been included
7
- # as part of this package.
8
4
  """Provides the Dseq class for handling double stranded DNA sequences.
9
5
 
10
6
  Dseq is a subclass of :class:`Bio.Seq.Seq`. The Dseq class
@@ -14,87 +10,217 @@ which can hold more meta data.
14
10
  The Dseq class supports the notion of circular and linear DNA topology.
15
11
  """
16
12
 
13
+ import itertools
14
+ import re
15
+ import copy
16
+ import sys
17
+ import math
18
+ import inspect
19
+ from typing import List, Tuple, Union
17
20
 
18
- import copy as _copy
19
- import itertools as _itertools
20
- import re as _re
21
- import sys as _sys
22
- import math as _math
21
+ from Bio.Restriction import RestrictionBatch
22
+ from Bio.Restriction import CommOnly
23
+
24
+ from seguid import ldseguid
25
+ from seguid import cdseguid
26
+
27
+ from pydna.seq import Seq
28
+ from Bio.Seq import _SeqAbstractBaseClass
29
+ from Bio.Data.IUPACData import unambiguous_dna_weights
30
+ from Bio.Data.IUPACData import unambiguous_rna_weights
31
+ from Bio.Data.IUPACData import atom_weights
32
+ from pydna._pretty import pretty_str
33
+ from pydna.utils import rc
34
+ from pydna.utils import flatten
35
+ from pydna.utils import cuts_overlap
36
+
37
+ from pydna.alphabet import basepair_dict
38
+ from pydna.alphabet import dscode_to_watson_table
39
+ from pydna.alphabet import dscode_to_crick_table
40
+ from pydna.alphabet import regex_ds_melt_factory
41
+ from pydna.alphabet import regex_ss_melt_factory
42
+ from pydna.alphabet import dscode_to_full_sequence_table
43
+ from pydna.alphabet import dscode_to_watson_tail_table
44
+ from pydna.alphabet import dscode_to_crick_tail_table
45
+ from pydna.alphabet import complement_table_for_dscode
46
+ from pydna.alphabet import letters_not_in_dscode
47
+ from pydna.alphabet import get_parts
48
+ from pydna.alphabet import representation_tuple
49
+ from pydna.alphabet import dsbreaks
50
+
51
+ from pydna.common_sub_strings import common_sub_strings
52
+ from pydna.types import DseqType, EnzymesType, CutSiteType
53
+
54
+
55
+ # Sequences longer than this get a truncated representation.
56
+ length_limit_for_repr = 30
57
+ placeholder = letters_not_in_dscode[-1]
58
+
59
+
60
+ class CircularBytes(bytes):
61
+ """
62
+ A circular bytes sequence: indexing and slicing wrap around index 0.
63
+ """
23
64
 
24
- from pydna.seq import Seq as _Seq
25
- from Bio.Seq import _translate_str, _SeqAbstractBaseClass
65
+ def __new__(cls, value: bytes | bytearray | memoryview):
66
+ return super().__new__(cls, bytes(value))
67
+
68
+ def __getitem__(self, key):
69
+ n = len(self)
70
+ if n == 0:
71
+ if isinstance(key, slice):
72
+ return self.__class__(b"")
73
+ raise IndexError("CircularBytes index out of range (empty bytes)")
74
+
75
+ if isinstance(key, int):
76
+ return super().__getitem__(key % n)
77
+
78
+ if isinstance(key, slice):
79
+ start, stop, step = key.start, key.stop, key.step
80
+ step = 1 if step is None else step
81
+ if step == 0:
82
+ raise ValueError("slice step cannot be zero")
83
+
84
+ if step > 0:
85
+ start = 0 if start is None else start
86
+ stop = n if stop is None else stop
87
+ while stop <= start:
88
+ stop += n
89
+ rng = range(start, stop, step)
90
+ else:
91
+ start = (n - 1) if start is None else start
92
+ stop = -1 if stop is None else stop
93
+ while stop >= start:
94
+ stop -= n
95
+ rng = range(start, stop, step)
96
+
97
+ limit = n if step % n == 0 else n * 2
98
+ out = bytearray()
99
+ count = 0
100
+ for i in rng:
101
+ out.append(super().__getitem__(i % n))
102
+ count += 1
103
+ if count > limit:
104
+ break
105
+ return self.__class__(bytes(out))
26
106
 
27
- from pydna._pretty import pretty_str as _pretty_str
28
- from seguid import ldseguid as _ldseguid
29
- from seguid import cdseguid as _cdseguid
107
+ return super().__getitem__(key)
30
108
 
31
- from pydna.utils import rc as _rc
32
- from pydna.utils import flatten as _flatten
33
- from pydna.utils import cuts_overlap as _cuts_overlap
109
+ def cutaround(self, start: int, length: int) -> bytes:
110
+ """
111
+ Return a circular slice of given length starting at index `start`.
112
+ Can exceed len(self), wrapping around as needed.
34
113
 
35
- from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
36
- from Bio.Restriction import RestrictionBatch as _RestrictionBatch
37
- from Bio.Restriction import CommOnly
114
+ Examples
115
+ --------
116
+ s = CircularBytes(b"ABCDE")
117
+ assert s.cutaround(3, 7) == b"DEABCDE"
118
+ assert s.cutaround(-1, 4) == b"EABC"
119
+ """
120
+ n = len(self)
121
+ if n == 0 or length <= 0:
122
+ return self.__class__(b"")
123
+
124
+ start %= n
125
+ out = bytearray()
126
+ for i in range(length):
127
+ out.append(self[(start + i) % n])
128
+ return self.__class__(bytes(out))
129
+
130
+ def find(
131
+ self,
132
+ sub: bytes | bytearray | memoryview | str,
133
+ start: int = 0,
134
+ end: int | None = None,
135
+ ) -> int:
136
+ """
137
+ Find a subsequence in the circular sequence, possibly
138
+ wrapping across the origin.
139
+ Returns -1 if not found.
140
+ """
141
+ n = len(self)
142
+ if n == 0:
143
+ return -1
144
+
145
+ end = n if end is None else min(end, n)
146
+ doubled = self + self
147
+ try:
148
+ sub = sub.encode("ascii")
149
+ except AttributeError:
150
+ pass
151
+
152
+ pos = doubled.find(bytes(sub), start, n + len(sub) - 1)
153
+
154
+ if pos == -1 or pos >= n:
155
+ return -1
156
+ return pos
38
157
 
39
158
 
40
- from .types import DseqType, EnzymesType, CutSiteType
159
+ class Dseq(Seq):
160
+ """Dseq describes a double stranded DNA fragment, linear or circular.
41
161
 
42
- from typing import List as _List, Tuple as _Tuple, Union as _Union
162
+ Dseq can be initiated in two ways, using two strings, each representing the
163
+ Watson (upper, sense) strand, the Crick (lower, antisense) strand and an
164
+ optional value describing the stagger between the strands on the left side (ovhg).
43
165
 
166
+ Alternatively, a single string representation using dsIUPAC codes can be used.
167
+ If a single string is used, the letters of that string are interpreted as base
168
+ pairs rather than single bases. For example "A" would indicate the basepair
169
+ "A/T". An expanded IUPAC code is used where the letters PEXI have been assigned
170
+ to GATC on the Watson strand with no pairing base on the Crick strand G/"", A/"",
171
+ T/"" and C/"". The letters QFZJ have been assigned the opposite base pairs with
172
+ an empty Watson strand ""/G, ""/A, ""/T, and ""/C.
173
+
174
+ ::
175
+
176
+ PEXIGATCQFZJ would indicate the linear double-stranded fragment:
177
+
178
+ GATCGATC
179
+ CTAGCTAG
44
180
 
45
- class Dseq(_Seq):
46
- """Dseq holds information for a double stranded DNA fragment.
47
181
 
48
- Dseq also holds information describing the topology of
49
- the DNA fragment (linear or circular).
50
182
 
51
183
  Parameters
52
184
  ----------
53
185
  watson : str
54
- a string representing the watson (sense) DNA strand.
186
+ a string representing the Watson (sense) DNA strand or a basepair
187
+ representation.
55
188
 
56
189
  crick : str, optional
57
- a string representing the crick (antisense) DNA strand.
190
+ a string representing the Crick (antisense) DNA strand.
58
191
 
59
192
  ovhg : int, optional
60
193
  A positive or negative number to describe the stagger between the
61
- watson and crick strands.
194
+ Watson and Crick strands.
62
195
  see below for a detailed explanation.
63
196
 
64
- linear : bool, optional
65
- True indicates that sequence is linear, False that it is circular.
66
-
67
197
  circular : bool, optional
68
198
  True indicates that sequence is circular, False that it is linear.
69
199
 
70
200
 
71
201
  Examples
72
202
  --------
73
- Dseq is a subclass of the Biopython Seq object. It stores two
74
- strings representing the watson (sense) and crick(antisense) strands.
75
- two properties called linear and circular, and a numeric value ovhg
76
- (overhang) describing the stagger for the watson and crick strand
77
- in the 5' end of the fragment.
203
+ Dseq is a subclass of the Biopython Bio.Seq.Seq class. The constructor
204
+ can accept two strings representing the Watson (sense) and Crick(antisense)
205
+ DNA strands. These are interpreted as single stranded DNA. There is a check
206
+ for complementarity between the strands.
78
207
 
79
- The most common usage is probably to create a Dseq object as a
80
- part of a Dseqrecord object (see :class:`pydna.dseqrecord.Dseqrecord`).
81
-
82
- There are three ways of creating a Dseq object directly listed below, but you can also
83
- use the function Dseq.from_full_sequence_and_overhangs() to create a Dseq:
208
+ If the DNA molecule is staggered on the left side, an integer ovhg
209
+ (overhang) must be given, describing the stagger between the Watson and Crick strand
210
+ in the 5' end of the fragment.
84
211
 
85
- Only one argument (string):
212
+ Additionally, the optional boolean parameter circular can be given to indicate if the
213
+ DNA molecule is circular.
86
214
 
87
- >>> from pydna.dseq import Dseq
88
- >>> Dseq("aaa")
89
- Dseq(-3)
90
- aaa
91
- ttt
215
+ The most common usage of the Dseq class is probably not to use it directly, but to
216
+ create it as part of a Dseqrecord object (see :class:`pydna.dseqrecord.Dseqrecord`).
217
+ This works in the same way as for the relationship between the :class:`Bio.Seq.Seq` and
218
+ :class:`Bio.SeqRecord.SeqRecord` classes in Biopython.
92
219
 
93
- The given string will be interpreted as the watson strand of a
94
- blunt, linear double stranded sequence object. The crick strand
95
- is created automatically from the watson strand.
220
+ There are multiple ways of creating a Dseq object directly listed below, but you can also
221
+ use the function Dseq.from_full_sequence_and_overhangs() to create a Dseq:
96
222
 
97
- Two arguments (string, string):
223
+ Two arguments (string, string), no overhang provided:
98
224
 
99
225
  >>> from pydna.dseq import Dseq
100
226
  >>> Dseq("gggaaat","ttt")
@@ -102,16 +228,14 @@ class Dseq(_Seq):
102
228
  gggaaat
103
229
  ttt
104
230
 
105
- If both watson and crick are given, but not ovhg an attempt
106
- will be made to find the best annealing between the strands.
107
- There are limitations to this. For long fragments it is quite
108
- slow. The length of the annealing sequences have to be at least
109
- half the length of the shortest of the strands.
231
+ If Watson and Crick are given, but not ovhg, an attempt will be made to find the best annealing
232
+ between the strands. There are important limitations to this. If there are several ways to
233
+ anneal the strands, this will fail. For long fragments it is quite slow.
110
234
 
111
235
  Three arguments (string, string, ovhg=int):
112
236
 
113
- The ovhg parameter is an integer describing the length of the
114
- crick strand overhang in the 5' end of the molecule.
237
+ The ovhg parameter is an integer describing the length of the Crick strand overhang on the
238
+ left side (the 5' end of Watson strand).
115
239
 
116
240
  The ovhg parameter controls the stagger at the five prime end::
117
241
 
@@ -134,53 +258,51 @@ class Dseq(_Seq):
134
258
 
135
259
  Example of creating Dseq objects with different amounts of stagger:
136
260
 
137
- >>> Dseq(watson="agt", crick="actta", ovhg=-2)
261
+ >>> Dseq(watson="att", crick="acata", ovhg=-2)
138
262
  Dseq(-7)
139
- agt
140
- attca
141
- >>> Dseq(watson="agt",crick="actta",ovhg=-1)
263
+ att
264
+ ataca
265
+ >>> Dseq(watson="ata",crick="acata",ovhg=-1)
142
266
  Dseq(-6)
143
- agt
144
- attca
145
- >>> Dseq(watson="agt",crick="actta",ovhg=0)
267
+ ata
268
+ ataca
269
+ >>> Dseq(watson="taa",crick="actta",ovhg=0)
146
270
  Dseq(-5)
147
- agt
271
+ taa
148
272
  attca
149
- >>> Dseq(watson="agt",crick="actta",ovhg=1)
273
+ >>> Dseq(watson="aag",crick="actta",ovhg=1)
150
274
  Dseq(-5)
151
- agt
275
+ aag
152
276
  attca
153
277
  >>> Dseq(watson="agt",crick="actta",ovhg=2)
154
278
  Dseq(-5)
155
279
  agt
156
280
  attca
157
281
 
158
- If the ovhg parameter is specified a crick strand also
159
- needs to be supplied, otherwise an exception is raised.
282
+ If the ovhg parameter is specified a Crick strand also needs to be supplied, or
283
+ an exception is raised.
160
284
 
161
285
  >>> Dseq(watson="agt", ovhg=2)
162
286
  Traceback (most recent call last):
163
- File "<stdin>", line 1, in <module>
164
- File "/usr/local/lib/python2.7/dist-packages/pydna_/dsdna.py", line 169, in __init__
165
- else:
166
- ValueError: ovhg defined without crick strand!
167
-
287
+ ...
288
+ ValueError: ovhg (overhang) defined without a crick strand.
168
289
 
169
- The shape of the fragment is set by circular = True, False
170
290
 
171
- Note that both ends of the DNA fragment has to be compatible to set
172
- circular = True.
291
+ The shape or topology of the fragment is set by the circular parameter, True or False (default).
173
292
 
174
-
175
- >>> Dseq("aaa","ttt")
293
+ >>> Dseq("aaa", "ttt", ovhg = 0) # A linear sequence by default
176
294
  Dseq(-3)
177
295
  aaa
178
296
  ttt
179
- >>> Dseq("aaa","ttt",ovhg=0)
297
+ >>> Dseq("aaa", "ttt", ovhg = 0, circular = False) # A linear sequence if circular is False
180
298
  Dseq(-3)
181
299
  aaa
182
300
  ttt
183
- >>> Dseq("aaa","ttt",ovhg=1)
301
+ >>> Dseq("aaa", "ttt", ovhg = 0, circular = True) # A circular sequence
302
+ Dseq(o3)
303
+ aaa
304
+ ttt
305
+ >>> Dseq("aaa", "ttt", ovhg=1, circular = False)
184
306
  Dseq(-4)
185
307
  aaa
186
308
  ttt
@@ -210,6 +332,18 @@ class Dseq(_Seq):
210
332
  -4
211
333
  >>>
212
334
 
335
+
336
+ dsIUPAC [#]_ is an extension to the IUPAC alphabet used to describe ss regions:
337
+
338
+ ::
339
+
340
+ aaaGATC GATCccc ad-hoc representations
341
+ CTAGttt gggCTAG
342
+
343
+ QFZJaaaPEXI PEXIcccQFZJ dsIUPAC
344
+
345
+
346
+
213
347
  Coercing to string
214
348
 
215
349
  >>> str(a)
@@ -295,46 +429,76 @@ class Dseq(_Seq):
295
429
 
296
430
  """
297
431
 
298
- trunc = 30
299
-
300
432
  def __init__(
301
433
  self,
302
- watson: _Union[str, bytes],
303
- crick: _Union[str, bytes, None] = None,
434
+ watson: Union[str, bytes],
435
+ crick: Union[str, bytes, None] = None,
304
436
  ovhg=None,
305
437
  circular=False,
306
438
  pos=0,
307
439
  ):
308
- if isinstance(watson, bytes):
309
- watson = watson.decode("ASCII")
310
- if isinstance(crick, bytes):
311
- crick = crick.decode("ASCII")
440
+ if isinstance(watson, (bytes, bytearray)):
441
+ # watson is decoded to a string if needed.
442
+ watson = watson.decode("ascii")
443
+ if isinstance(crick, (bytes, bytearray)):
444
+ # crick is decoded to a string if needed.
445
+ crick = crick.decode("ascii")
312
446
 
313
447
  if crick is None:
314
448
  if ovhg is not None:
315
- raise ValueError("ovhg defined without crick strand!")
316
- crick = _rc(watson)
317
- ovhg = 0
318
- self._data = bytes(watson, encoding="ASCII")
449
+ raise ValueError("ovhg (overhang) defined without a crick strand.")
450
+ """
451
+ Giving only the watson string implies inferring the Crick complementary strand
452
+ from the Watson sequence. The watson string can contain dscode letters wich will
453
+ be interpreted as outlined in the pydna.alphabet module.
454
+
455
+ The _data property must be a byte string for compatibility with
456
+ Biopython Bio.Seq.Seq
457
+ """
458
+ data = watson
459
+ self._data = data.encode("ascii")
319
460
 
320
- else: # crick strand given
321
- if ovhg is None: # ovhg not given
322
- olaps = _common_sub_strings(
461
+ else:
462
+ """
463
+ Crick strand given, ovhg is optional. An important consequence is that the
464
+ watson and crick strands are interpreted as single stranded DNA that is
465
+ supposed to anneal.
466
+
467
+ If ovhg was not given, we try to guess the value below. This will fail
468
+ if there are two or more ways to anneal with equal length of the double
469
+ stranded part.
470
+ """
471
+ if ovhg is None: # ovhg not given, try to guess from sequences
472
+ limit = int(math.log(len(watson)) / math.log(4))
473
+ olaps = common_sub_strings(
323
474
  str(watson).lower(),
324
- str(_rc(crick).lower()),
325
- int(_math.log(len(watson)) / _math.log(4)),
475
+ str(rc(crick).lower()),
476
+ limit,
326
477
  )
478
+
479
+ """No overlaps found, strands do not anneal"""
327
480
  if len(olaps) == 0:
328
481
  raise ValueError(
329
- "Could not anneal the two strands." " Please provide ovhg value"
482
+ "Could not anneal the two strands."
483
+ f" looked for annealing with at least {limit} basepairs"
484
+ " Please provide and overhang value (ovhg parameter)"
330
485
  )
331
486
 
332
- # We extract the positions and length of the first (longest) overlap, since
333
- # common_sub_strings sorts the overlaps by length.
334
- pos_watson, pos_crick, longest_olap_length = olaps[0]
487
+ """
488
+ We extract the positions and length of the first (longest) overlap,
489
+ since common_sub_strings sorts the overlaps by length, longest first.
490
+ """
335
491
 
336
- # We see if there is another overlap of the same length
337
- if any(olap[2] >= longest_olap_length for olap in olaps[1:]):
492
+ (pos_watson, pos_crick, longest_olap_length), *rest = olaps
493
+
494
+ """
495
+ We see if there is another overlap of the same length
496
+ This means that annealing is ambigous. User should provide
497
+ and ovhg value.
498
+ """
499
+ if any(
500
+ olap_length >= longest_olap_length for _, _, olap_length in rest
501
+ ):
338
502
  raise ValueError(
339
503
  "More than one way of annealing the"
340
504
  " strands. Please provide ovhg value"
@@ -342,120 +506,80 @@ class Dseq(_Seq):
342
506
 
343
507
  ovhg = pos_crick - pos_watson
344
508
 
345
- sns = (ovhg * " ") + _pretty_str(watson)
346
- asn = (-ovhg * " ") + _pretty_str(_rc(crick))
347
-
348
- self._data = bytes(
349
- "".join(
350
- [
351
- a.strip() or b.strip()
352
- for a, b in _itertools.zip_longest(sns, asn, fillvalue=" ")
353
- ]
354
- ),
355
- encoding="ASCII",
356
- )
509
+ """
510
+ Pad both strands on left side ovhg spaces
511
+ a negative number gives no padding,
512
+ """
513
+ sense = ovhg * " " + watson
514
+ antisense = -ovhg * " " + crick[::-1]
515
+
516
+ max_len = max(len(sense), len(antisense))
357
517
 
358
- else: # ovhg given
359
- if ovhg == 0:
360
- if len(watson) >= len(crick):
361
- self._data = bytes(watson, encoding="ASCII")
362
- else:
363
- self._data = bytes(
364
- watson + _rc(crick[: len(crick) - len(watson)]),
365
- encoding="ASCII",
366
- )
367
- elif ovhg > 0:
368
- if ovhg + len(watson) > len(crick):
369
- self._data = bytes(
370
- _rc(crick[-ovhg:]) + watson, encoding="ASCII"
371
- )
372
- else:
373
- self._data = bytes(
374
- _rc(crick[-ovhg:])
375
- + watson
376
- + _rc(crick[: len(crick) - ovhg - len(watson)]),
377
- encoding="ASCII",
378
- )
379
- else: # ovhg < 0
380
- if -ovhg + len(crick) > len(watson):
381
- self._data = bytes(
382
- watson + _rc(crick[: -ovhg + len(crick) - len(watson)]),
383
- encoding="ASCII",
384
- )
385
- else:
386
- self._data = bytes(watson, encoding="ASCII")
518
+ """pad both strands on right side to same size."""
519
+ sense = sense.ljust(max_len)
520
+ antisense = antisense.ljust(max_len)
521
+ """both strands padded so that bsepairs align"""
522
+ assert len(sense) == len(antisense)
523
+
524
+ data = []
525
+
526
+ for w, c in zip(sense, antisense):
527
+ try:
528
+ data.append(basepair_dict[w, c])
529
+ except KeyError as err:
530
+ print(f"Base mismatch in representation {err}")
531
+ raise ValueError(f"Base mismatch in representation: {err}")
532
+ data = "".join(data).strip()
533
+ self._data = data.encode("ascii")
387
534
 
388
535
  self.circular = circular
389
- self.watson = _pretty_str(watson)
390
- self.crick = _pretty_str(crick)
391
- self.length = len(self._data)
392
- self.ovhg = ovhg
393
536
  self.pos = pos
394
537
 
538
+ if circular:
539
+ data += data[0:1]
540
+
541
+ dsb = dsbreaks(data)
542
+
543
+ if dsb:
544
+ msg = "".join(dsb)
545
+ raise ValueError(
546
+ f"Molecule is internally split in {len(dsb)} location(s):\n\n{msg}".strip()
547
+ )
548
+
395
549
  @classmethod
396
- def quick(
397
- cls,
398
- watson: str,
399
- crick: str,
400
- ovhg=0,
401
- circular=False,
402
- pos=0,
403
- ):
404
- obj = cls.__new__(cls) # Does not call __init__
405
- obj.watson = _pretty_str(watson)
406
- obj.crick = _pretty_str(crick)
407
- obj.ovhg = ovhg
550
+ def quick(cls, data: bytes, *args, circular=False, pos=0, **kwargs):
551
+ """Fastest way to instantiate an object of the Dseq class.
552
+
553
+ No checks of parameters are made.
554
+ Does not call Bio.Seq.Seq.__init__() which has lots of time consuming checks.
555
+ """
556
+ obj = cls.__new__(cls)
408
557
  obj.circular = circular
409
- obj.length = max(len(watson) + max(0, ovhg), len(crick) + max(0, -ovhg))
410
558
  obj.pos = pos
411
- wb = bytes(watson, encoding="ASCII")
412
- cb = bytes(crick, encoding="ASCII")
413
- obj._data = (
414
- _rc(cb[-max(0, ovhg) or len(cb) :])
415
- + wb
416
- + _rc(cb[: max(0, len(cb) - ovhg - len(wb))])
417
- )
418
- return obj
559
+ obj._data = data
419
560
 
420
- @classmethod
421
- def from_string(
422
- cls,
423
- dna: str,
424
- *args,
425
- # linear=True,
426
- circular=False,
427
- **kwargs,
428
- ):
429
- obj = cls.__new__(cls) # Does not call __init__
430
- obj.watson = _pretty_str(dna)
431
- obj.crick = _pretty_str(_rc(dna))
432
- obj.ovhg = 0
433
- obj.circular = circular
434
- # obj._linear = linear
435
- obj.length = len(dna)
436
- obj.pos = 0
437
- obj._data = bytes(dna, encoding="ASCII")
438
561
  return obj
439
562
 
440
563
  @classmethod
441
564
  def from_representation(cls, dsdna: str, *args, **kwargs):
442
- obj = cls.__new__(cls) # Does not call __init__
443
- w, c, *r = [ln for ln in dsdna.splitlines() if ln]
444
- ovhg = obj.ovhg = len(w) - len(w.lstrip()) - (len(c) - len(c.lstrip()))
445
- watson = obj.watson = _pretty_str(w.strip())
446
- crick = obj.crick = _pretty_str(c.strip()[::-1])
565
+ obj = cls.__new__(cls)
447
566
  obj.circular = False
448
- # obj._linear = True
449
- obj.length = max(len(watson) + max(0, ovhg), len(crick) + max(0, -ovhg))
450
567
  obj.pos = 0
451
- wb = bytes(watson, encoding="ASCII")
452
- cb = bytes(crick, encoding="ASCII")
453
- obj._data = (
454
- _rc(cb[-max(0, ovhg) or len(cb) :])
455
- + wb
456
- + _rc(cb[: max(0, len(cb) - ovhg - len(wb))])
457
- )
458
- return obj
568
+ clean = inspect.cleandoc("\n" + dsdna)
569
+ watson, crick = [
570
+ ln
571
+ for ln in clean.splitlines()
572
+ if ln.strip() and not ln.strip().startswith("Dseq(")
573
+ ]
574
+ ovhgw = len(watson) - len(watson.lstrip())
575
+ ovhgc = -(len(crick) - len(crick.lstrip()))
576
+
577
+ ovhg = ovhgw or ovhgc
578
+
579
+ watson = watson.strip()
580
+ crick = crick.strip()[::-1]
581
+
582
+ return Dseq(watson, crick, ovhg)
459
583
 
460
584
  @classmethod
461
585
  def from_full_sequence_and_overhangs(
@@ -522,111 +646,177 @@ class Dseq(_Seq):
522
646
 
523
647
  return Dseq(watson, crick=crick, ovhg=crick_ovhg)
524
648
 
525
- # @property
526
- # def ovhg(self):
527
- # """The ovhg property. This cannot be set directly, but is a
528
- # consequence of how the watson and crick strands anneal to
529
- # each other"""
530
- # return self._ovhg
531
-
532
- # @property
533
- # def linear(self):
534
- # """The linear property can not be set directly.
535
- # Use an empty slice [:] to create a linear object."""
536
- # return self._linear
537
-
538
- # @property
539
- # def circular(self):
540
- # """The circular property can not be set directly.
541
- # Use :meth:`looped` to create a circular Dseq object"""
542
- # return self._circular
649
+ @property
650
+ def watson(self) -> str:
651
+ """
652
+ The watson (upper) strand of the double stranded fragment 5'-3'.
543
653
 
544
- def mw(self) -> float:
545
- """This method returns the molecular weight of the DNA molecule
546
- in g/mol. The following formula is used::
547
-
548
- MW = (A x 313.2) + (T x 304.2) +
549
- (C x 289.2) + (G x 329.2) +
550
- (N x 308.9) + 79.0
551
- """
552
- nts = (self.watson + self.crick).lower()
553
-
554
- return (
555
- 313.2 * nts.count("a")
556
- + 304.2 * nts.count("t")
557
- + 289.2 * nts.count("c")
558
- + 329.2 * nts.count("g")
559
- + 308.9 * nts.count("n")
560
- + 79.0
561
- )
654
+ Returns
655
+ -------
656
+ TYPE
657
+ DESCRIPTION.
562
658
 
563
- def upper(self: DseqType) -> DseqType:
564
- """Return an upper case copy of the sequence.
659
+ """
660
+ return self._data.decode("ascii").translate(dscode_to_watson_table).strip()
565
661
 
566
- >>> from pydna.dseq import Dseq
567
- >>> my_seq = Dseq("aAa")
568
- >>> my_seq
569
- Dseq(-3)
570
- aAa
571
- tTt
572
- >>> my_seq.upper()
573
- Dseq(-3)
574
- AAA
575
- TTT
662
+ @property
663
+ def crick(self) -> str:
664
+ """
665
+ The crick (lower) strand of the double stranded fragment 5'-3'.
576
666
 
577
667
  Returns
578
668
  -------
579
- Dseq
580
- Dseq object in uppercase
669
+ TYPE
670
+ DESCRIPTION.
581
671
 
582
- See also
583
- --------
584
- pydna.dseq.Dseq.lower
672
+ """
673
+ return self._data.decode("ascii").translate(dscode_to_crick_table).strip()[::-1]
585
674
 
675
+ @property
676
+ def left_ovhg(self) -> int:
586
677
  """
587
- return self.quick(
588
- self.watson.upper(),
589
- self.crick.upper(),
590
- ovhg=self.ovhg,
591
- # linear=self.linear,
592
- circular=self.circular,
593
- pos=self.pos,
594
- )
678
+ The 5' overhang of the lower strand compared to the upper.
595
679
 
596
- def lower(self: DseqType) -> DseqType:
597
- """Return a lower case copy of the sequence.
680
+ See module docstring for more information.
598
681
 
599
- >>> from pydna.dseq import Dseq
600
- >>> my_seq = Dseq("aAa")
601
- >>> my_seq
602
- Dseq(-3)
603
- aAa
604
- tTt
605
- >>> my_seq.lower()
606
- Dseq(-3)
607
- aaa
608
- ttt
682
+ Returns
683
+ -------
684
+ TYPE
685
+ DESCRIPTION.
686
+
687
+ """
688
+ parts = self.get_parts()
689
+ if parts.single_watson or parts.single_crick:
690
+ return None
691
+ return -len(parts.sticky_left5) or len(parts.sticky_left3)
692
+
693
+ ovhg = left_ovhg
694
+
695
+ @property
696
+ def right_ovhg(self) -> int:
697
+ """Overhang at the right side (end)."""
698
+ parts = self.get_parts()
699
+ if parts.single_watson or parts.single_crick:
700
+ return None
701
+ return -len(parts.sticky_right5) or len(parts.sticky_right3)
702
+
703
+ watson_ovhg = right_ovhg
704
+
705
+ def __str__(self) -> str:
706
+ """
707
+ A string representation of the sequence. The returned string
708
+ is the watson strand of a blunt version of the sequence.
709
+
710
+ >>> ds = Dseq.from_representation(
711
+ ... '''
712
+ ... GAATTC
713
+ ... TAA
714
+ ... ''')
715
+
716
+ >>> str(ds)
717
+ 'GAATTC'
718
+ >>> ds = Dseq.from_representation(
719
+ ... '''
720
+ ... ATT
721
+ ... CTTAAG
722
+ ... ''')
723
+
724
+ >>> str(ds)
725
+ 'GAATTC'
609
726
 
610
727
  Returns
611
728
  -------
612
- Dseq
613
- Dseq object in lowercase
729
+ str
730
+ A string representation of the sequence.
614
731
 
615
- See also
732
+ """
733
+ return bytes(self).decode("ascii")
734
+
735
+ to_blunt_string = __str__ # alias of __str__ # TODO: consider removing
736
+
737
+ def __bytes__(self) -> bytes:
738
+ return self._data.translate(dscode_to_full_sequence_table)
739
+
740
+ def mw(self) -> float:
741
+ """The molecular weight of the DNA/RNA molecule in g/mol.
742
+
743
+ The molecular weight data in Biopython Bio.Data.IUPACData
744
+ is used. The DNA is assumed to have a 5'-phosphate as many
745
+ DNA fragments from restriction digestion do:
746
+
747
+ ::
748
+
749
+ P - G-A-T-T-A-C-A - OH
750
+ | | | | | | |
751
+ OH - C-T-A-A-T-G-T - P
752
+
753
+ The molecular weights listed in the unambiguous_dna_weights
754
+ dictionary refers to free monophosphate nucleotides.
755
+ One water molecule is removed for every phosphodiester bond
756
+ formed between nucleotides. For linear molecules, the weight
757
+ of one water molecule is added to account for the terminal
758
+ hydroxyl group and a hydrogen on the 5' terminal phosphate
759
+ group.
760
+
761
+ ::
762
+
763
+ P - G---A---T - OH P - C---A - OH
764
+ | | | | |
765
+ OH - C---T---A---A---T---G---T - P
766
+
767
+ If the DNA is discontinuous, the internal 5'- end is assumed
768
+ to have a phosphate and the 3'- a hydroxyl group:
769
+
770
+
771
+ Examples
616
772
  --------
617
- pydna.dseq.Dseq.upper
618
- """
619
- return self.quick(
620
- self.watson.lower(),
621
- self.crick.lower(),
622
- ovhg=self.ovhg,
623
- # linear=self.linear,
624
- circular=self.circular,
625
- pos=self.pos,
626
- )
773
+ >>> from pydna.dseq import Dseq
774
+ >>> ds_lin_obj = Dseq("GATTACA")
775
+ >>> ds_lin_obj
776
+ Dseq(-7)
777
+ GATTACA
778
+ CTAATGT
779
+ >>> round(ds_lin_obj.mw(), 1)
780
+ 4359.8
781
+ >>> ds_circ_obj = Dseq("GATTACA", circular = True)
782
+ >>> round(ds_circ_obj.mw(), 1)
783
+ 4323.8
784
+ >>> ssobj = Dseq("PEXXEIE")
785
+ >>> ssobj
786
+ Dseq(-7)
787
+ GATTACA
788
+ <BLANKLINE>
789
+ >>> round(ssobj.mw(), 1)
790
+ 2184.4
791
+ >>> ds_lin_obj2 = Dseq("GATZFCA")
792
+ >>> ds_lin_obj2
793
+ Dseq(-7)
794
+ GAT CA
795
+ CTAATGT
796
+ >>> round(ds_lin_obj2.mw(), 1)
797
+ 3724.4
798
+ """
799
+
800
+ h2o = atom_weights["H"] * 2 + atom_weights["O"]
801
+
802
+ mwd = unambiguous_rna_weights | unambiguous_dna_weights | {" ": 0}
803
+
804
+ watsn_weight = sum(mwd[nt] - h2o for nt in self.watson.upper())
805
+ crick_weight = sum(mwd[nt] - h2o for nt in self.crick.upper())
806
+
807
+ watsn_weight += h2o * len(re.findall(r" +", self.watson))
808
+ crick_weight += h2o * len(re.findall(r" +", self.crick))
809
+
810
+ if watsn_weight and not self.circular:
811
+ watsn_weight += h2o
812
+
813
+ if crick_weight and not self.circular:
814
+ crick_weight += h2o
815
+
816
+ return watsn_weight + crick_weight
627
817
 
628
818
  def find(
629
- self, sub: _Union[_SeqAbstractBaseClass, str, bytes], start=0, end=_sys.maxsize
819
+ self, sub: Union[_SeqAbstractBaseClass, str, bytes], start=0, end=sys.maxsize
630
820
  ) -> int:
631
821
  """This method behaves like the python string method of the same name.
632
822
 
@@ -635,6 +825,8 @@ class Dseq(_Seq):
635
825
 
636
826
  Returns -1 if the subsequence is NOT found.
637
827
 
828
+ The search is case sensitive.
829
+
638
830
  Parameters
639
831
  ----------
640
832
 
@@ -650,80 +842,51 @@ class Dseq(_Seq):
650
842
  Examples
651
843
  --------
652
844
  >>> from pydna.dseq import Dseq
653
- >>> seq = Dseq("atcgactgacgtgtt")
845
+ >>> seq = Dseq("agtaagt")
654
846
  >>> seq
655
- Dseq(-15)
656
- atcgactgacgtgtt
657
- tagctgactgcacaa
658
- >>> seq.find("gac")
659
- 3
660
- >>> seq = Dseq(watson="agt",crick="actta",ovhg=-2)
847
+ Dseq(-7)
848
+ agtaagt
849
+ tcattca
850
+ >>> seq.find("taa")
851
+ 2
852
+ >>> seq = Dseq(watson="agta",crick="actta",ovhg=-2)
661
853
  >>> seq
662
854
  Dseq(-7)
663
- agt
855
+ agta
664
856
  attca
665
857
  >>> seq.find("taa")
858
+ -1
859
+ >>> seq = Dseq(watson="agta",crick="actta",ovhg=-2)
860
+ >>> seq
861
+ Dseq(-7)
862
+ agta
863
+ attca
864
+ >>> seq.find("ta")
666
865
  2
667
866
  """
668
-
669
- if not self.circular:
670
- return _Seq.find(self, sub, start, end)
671
-
672
- return (_pretty_str(self) + _pretty_str(self)).find(sub, start, end)
673
-
674
- def __getitem__(self, sl: slice) -> "Dseq":
675
- """Returns a subsequence. This method is used by the slice notation"""
676
-
677
- if not self.circular:
678
- x = len(self.crick) - self.ovhg - len(self.watson)
679
-
680
- sns = (self.ovhg * " " + self.watson + x * " ")[sl]
681
- asn = (-self.ovhg * " " + self.crick[::-1] + -x * " ")[sl]
682
-
683
- ovhg = max(
684
- (len(sns) - len(sns.lstrip()), -len(asn) + len(asn.lstrip())), key=abs
685
- )
686
-
687
- return Dseq(
688
- sns.strip(),
689
- asn[::-1].strip(),
690
- ovhg=ovhg,
691
- # linear=True
692
- )
867
+ if self.circular:
868
+ result = CircularBytes(self._data).find(sub, start, end)
693
869
  else:
694
- sl = slice(sl.start or 0, sl.stop or len(self), sl.step)
695
- if sl.start > len(self) or sl.stop > len(self):
696
- return Dseq("")
697
- if sl.start < sl.stop:
698
- return Dseq(
699
- self.watson[sl],
700
- self.crick[::-1][sl][::-1],
701
- ovhg=0,
702
- # linear=True
703
- )
704
- else:
705
- try:
706
- stp = abs(sl.step)
707
- except TypeError:
708
- stp = 1
709
- start = sl.start
710
- stop = sl.stop
711
-
712
- w = (
713
- self.watson[(start or len(self)) :: stp]
714
- + self.watson[: (stop or 0) : stp]
715
- )
716
- c = (
717
- self.crick[len(self) - stop :: stp]
718
- + self.crick[: len(self) - start : stp]
719
- )
870
+ result = super().find(sub, start, end)
871
+ return result
872
+
873
+ def __contains__(self, sub: [str, bytes]) -> bool:
874
+ return self.find(sub) != -1
720
875
 
721
- return Dseq(w, c, ovhg=0) # , linear=True)
876
+ def __getitem__(self, sl: [slice, int]) -> DseqType:
877
+ if isinstance(sl, int):
878
+ sl = slice(sl, sl + 1, 1)
879
+ sl = slice(sl.start, sl.stop, sl.step)
880
+ if self.circular:
881
+ cb = CircularBytes(self._data)
882
+ return self.quick(cb[sl])
883
+ return super().__getitem__(sl)
722
884
 
723
885
  def __eq__(self, other: DseqType) -> bool:
724
886
  """Compare to another Dseq object OR an object that implements
725
- watson, crick and ovhg properties. This comparison is case
726
- insensitive.
887
+ watson, crick and ovhg properties.
888
+
889
+ This comparison is case insensitive.
727
890
 
728
891
  """
729
892
  try:
@@ -738,85 +901,15 @@ class Dseq(_Seq):
738
901
  same = False
739
902
  return same
740
903
 
741
- def __repr__(self):
742
- """Returns a representation of the sequence, truncated if
743
- longer than 30 bp"""
744
-
745
- if len(self) > Dseq.trunc:
746
- if self.ovhg > 0:
747
- d = self.crick[-self.ovhg :][::-1]
748
- hej = len(d)
749
- if len(d) > 10:
750
- d = "{}..{}".format(d[:4], d[-4:])
751
- a = len(d) * " "
752
-
753
- elif self.ovhg < 0:
754
- a = self.watson[: max(0, -self.ovhg)]
755
- hej = len(a)
756
- if len(a) > 10:
757
- a = "{}..{}".format(a[:4], a[-4:])
758
- d = len(a) * " "
759
- else:
760
- a = ""
761
- d = ""
762
- hej = 0
763
-
764
- x = self.ovhg + len(self.watson) - len(self.crick)
765
-
766
- if x > 0:
767
- c = self.watson[len(self.crick) - self.ovhg :]
768
- y = len(c)
769
- if len(c) > 10:
770
- c = "{}..{}".format(c[:4], c[-4:])
771
- f = len(c) * " "
772
- elif x < 0:
773
- f = self.crick[:-x][::-1]
774
- y = len(f)
775
- if len(f) > 10:
776
- f = "{}..{}".format(f[:4], f[-4:])
777
- c = len(f) * " "
778
- else:
779
- c = ""
780
- f = ""
781
- y = 0
782
-
783
- L = len(self) - hej - y
784
- x1 = -min(0, self.ovhg)
785
- x2 = x1 + L
786
- x3 = -min(0, x)
787
- x4 = x3 + L
788
-
789
- b = self.watson[x1:x2]
790
- e = self.crick[x3:x4][::-1]
791
-
792
- if len(b) > 10:
793
- b = "{}..{}".format(b[:4], b[-4:])
794
- e = "{}..{}".format(e[:4], e[-4:])
795
-
796
- return _pretty_str(
797
- "{klass}({top}{size})\n" "{a}{b}{c}\n" "{d}{e}{f}"
798
- ).format(
799
- klass=self.__class__.__name__,
800
- top={False: "-", True: "o"}[self.circular],
801
- size=len(self),
802
- a=a,
803
- b=b,
804
- c=c,
805
- d=d,
806
- e=e,
807
- f=f,
808
- )
904
+ def __repr__(self, lim: int = length_limit_for_repr) -> pretty_str:
809
905
 
810
- else:
811
- return _pretty_str(
812
- "{}({}{})\n{}\n{}".format(
813
- self.__class__.__name__,
814
- {False: "-", True: "o"}[self.circular],
815
- len(self),
816
- self.ovhg * " " + self.watson,
817
- -self.ovhg * " " + self.crick[::-1],
818
- )
819
- )
906
+ header = f"{self.__class__.__name__}({({False: '-', True: 'o'}[self.circular])}{len(self)})"
907
+
908
+ w, c = representation_tuple(
909
+ self._data.decode("ascii"), length_limit_for_repr=length_limit_for_repr
910
+ )
911
+
912
+ return pretty_str(header + "\n" + w + "\n" + c)
820
913
 
821
914
  def reverse_complement(self) -> "Dseq":
822
915
  """Dseq object where watson and crick have switched places.
@@ -839,22 +932,29 @@ class Dseq(_Seq):
839
932
  >>>
840
933
 
841
934
  """
842
- return Dseq.quick(
843
- self.crick,
844
- self.watson,
845
- ovhg=len(self.watson) - len(self.crick) + self.ovhg,
846
- circular=self.circular,
847
- )
935
+ return Dseq.quick(rc(self._data), circular=self.circular)
848
936
 
849
937
  rc = reverse_complement # alias for reverse_complement
850
938
 
851
939
  def shifted(self: DseqType, shift: int) -> DseqType:
852
- """Shifted version of a circular Dseq object."""
940
+ """
941
+ Shifted copy of a circular Dseq object.
942
+
943
+ >>> ds = Dseq("TAAG", circular = True)
944
+ >>> ds.shifted(1) # First bp moved to right side:
945
+ Dseq(o4)
946
+ AAGT
947
+ TTCA
948
+ >>> ds.shifted(-1) # Last bp moved to left side:
949
+ Dseq(o4)
950
+ GTAA
951
+ CATT
952
+ """
853
953
  if not self.circular:
854
954
  raise TypeError("DNA is not circular.")
855
955
  shift = shift % len(self)
856
956
  if not shift:
857
- return _copy.deepcopy(self)
957
+ return copy.deepcopy(self)
858
958
  else:
859
959
  return (self[shift:] + self[:shift]).looped()
860
960
 
@@ -876,19 +976,30 @@ class Dseq(_Seq):
876
976
  Dseq(o8)
877
977
  catcgatc
878
978
  gtagctag
879
- >>> a.T4("t")
979
+ >>> b = Dseq("iatcgatj")
980
+ >>> b
880
981
  Dseq(-8)
881
982
  catcgat
882
983
  tagctag
883
- >>> a.T4("t").looped()
984
+ >>> b.looped()
985
+ Dseq(o7)
986
+ catcgat
987
+ gtagcta
988
+ >>> c = Dseq("jatcgati")
989
+ >>> c
990
+ Dseq(-8)
991
+ atcgatc
992
+ gtagcta
993
+ >>> c.looped()
884
994
  Dseq(o7)
885
995
  catcgat
886
996
  gtagcta
887
- >>> a.T4("a")
997
+ >>> d = Dseq("ietcgazj")
998
+ >>> d
888
999
  Dseq(-8)
889
1000
  catcga
890
1001
  agctag
891
- >>> a.T4("a").looped()
1002
+ >>> d.looped()
892
1003
  Traceback (most recent call last):
893
1004
  File "<stdin>", line 1, in <module>
894
1005
  File "/usr/local/lib/python2.7/dist-packages/pydna/dsdna.py", line 357, in looped
@@ -899,116 +1010,116 @@ class Dseq(_Seq):
899
1010
 
900
1011
  """
901
1012
  if self.circular:
902
- return _copy.deepcopy(self)
1013
+ return copy.deepcopy(self)
1014
+
903
1015
  type5, sticky5 = self.five_prime_end()
904
1016
  type3, sticky3 = self.three_prime_end()
905
- if type5 == type3 and str(sticky5) == str(_rc(sticky3)):
906
- nseq = self.__class__.quick(
907
- self.watson,
908
- self.crick[-self.ovhg :] + self.crick[: -self.ovhg],
909
- ovhg=0,
910
- # linear=False,
911
- circular=True,
912
- )
913
- # assert len(nseq.crick) == len(nseq.watson)
914
- return nseq
915
- else:
916
- raise TypeError(
917
- "DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!"
918
- )
919
1017
 
920
- def tolinear(self: DseqType) -> DseqType: # pragma: no cover
921
- """Returns a blunt, linear copy of a circular Dseq object. This can
922
- only be done if the Dseq object is circular, otherwise a
923
- TypeError is raised.
1018
+ err = TypeError(
1019
+ "DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!"
1020
+ )
924
1021
 
925
- This method is deprecated, use slicing instead. See example below.
1022
+ if type5 != type3:
1023
+ raise err
926
1024
 
927
- Examples
928
- --------
1025
+ try:
1026
+ # Test if sticky ends are compatible
1027
+ self + self
1028
+ except TypeError:
1029
+ raise err
929
1030
 
930
- >>> from pydna.dseq import Dseq
931
- >>> a=Dseq("catcgatc", circular=True)
932
- >>> a
933
- Dseq(o8)
934
- catcgatc
935
- gtagctag
936
- >>> a[:]
937
- Dseq(-8)
938
- catcgatc
939
- gtagctag
940
- >>>
1031
+ new = self.cast_to_ds_left()[: len(self) - len(sticky3)]
941
1032
 
942
- """
943
- import warnings as _warnings
944
- from pydna import _PydnaDeprecationWarning
1033
+ new.circular = True
1034
+ return new
945
1035
 
946
- _warnings.warn(
947
- "tolinear method is obsolete; "
948
- "please use obj[:] "
949
- "instead of obj.tolinear().",
950
- _PydnaDeprecationWarning,
951
- )
952
- if not self.circular:
953
- raise TypeError("DNA is not circular.\n")
954
- selfcopy = _copy.deepcopy(self)
955
- selfcopy.circular = False
956
- return selfcopy # self.__class__(self.watson, linear=True)
1036
+ def five_prime_end(self) -> Tuple[str, str]:
1037
+ """Returns a 2-tuple of strings describing the structure of the 5' end of
1038
+ the DNA fragment.
1039
+
1040
+ The tuple contains (type, sticky) where type is "5'", "3'", "blunt" or "single".
1041
+ sticky is always in lower case and contains the sequence of the
1042
+ protruding end in 5'-3' direction.
1043
+
1044
+ See examples below:
957
1045
 
958
- def five_prime_end(self) -> _Tuple[str, str]:
959
- """Returns a tuple describing the structure of the 5' end of
960
- the DNA fragment
961
1046
 
962
1047
  Examples
963
1048
  --------
964
1049
  >>> from pydna.dseq import Dseq
965
- >>> a=Dseq("aaa", "ttt")
1050
+ >>> a = Dseq("aa", "tttg", ovhg=2)
966
1051
  >>> a
967
- Dseq(-3)
968
- aaa
969
- ttt
1052
+ Dseq(-4)
1053
+ aa
1054
+ gttt
970
1055
  >>> a.five_prime_end()
971
- ('blunt', '')
972
- >>> a=Dseq("aaa", "ttt", ovhg=1)
1056
+ ("3'", 'tg')
1057
+ >>> a = Dseq("caaa", "tt", ovhg=-2)
973
1058
  >>> a
974
1059
  Dseq(-4)
975
- aaa
976
- ttt
1060
+ caaa
1061
+ tt
977
1062
  >>> a.five_prime_end()
978
- ("3'", 't')
979
- >>> a=Dseq("aaa", "ttt", ovhg=-1)
1063
+ ("5'", 'ca')
1064
+ >>> a = Dseq("aa", "tt")
980
1065
  >>> a
981
- Dseq(-4)
982
- aaa
983
- ttt
1066
+ Dseq(-2)
1067
+ aa
1068
+ tt
984
1069
  >>> a.five_prime_end()
985
- ("5'", 'a')
986
- >>>
1070
+ ('blunt', '')
987
1071
 
988
1072
  See also
989
1073
  --------
990
1074
  pydna.dseq.Dseq.three_prime_end
991
1075
 
992
1076
  """
993
- if self.watson and not self.crick:
994
- return "5'", self.watson.lower()
995
- if not self.watson and self.crick:
996
- return "3'", self.crick.lower()
997
- if self.ovhg < 0:
998
- sticky = self.watson[: -self.ovhg].lower()
1077
+
1078
+ # See docstring for function pydna.utils.get_parts for details
1079
+ # on what is contained in parts.
1080
+ parts = self.get_parts()
1081
+
1082
+ sticky5 = parts.sticky_left5.translate(dscode_to_watson_table)
1083
+
1084
+ sticky3 = parts.sticky_left3.translate(dscode_to_crick_table)[::-1]
1085
+
1086
+ single_watson = parts.single_watson.translate(dscode_to_watson_table)
1087
+
1088
+ single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1]
1089
+
1090
+ # The walrus operator returns the value being assigned, so
1091
+ # we can test if it is empty or not.
1092
+ if sticky := single_watson:
1093
+ type_ = "single"
1094
+ elif sticky := single_crick:
1095
+ type_ = "single"
1096
+ elif sticky5 == sticky3 == "":
1097
+ type_, sticky = "blunt", ""
1098
+ elif sticky := sticky5:
999
1099
  type_ = "5'"
1000
- elif self.ovhg > 0:
1001
- sticky = self.crick[-self.ovhg :].lower()
1100
+ elif sticky := sticky3:
1002
1101
  type_ = "3'"
1003
- else:
1004
- sticky = ""
1005
- type_ = "blunt"
1006
- return type_, sticky
1007
1102
 
1008
- def three_prime_end(self) -> _Tuple[str, str]:
1103
+ return type_, sticky.lower()
1104
+
1105
+ def three_prime_end(self) -> Tuple[str, str]:
1009
1106
  """Returns a tuple describing the structure of the 3' end of
1010
1107
  the DNA fragment
1011
1108
 
1109
+ >>> a = Dseq("aa", "gttt", ovhg=0)
1110
+ >>> a
1111
+ Dseq(-4)
1112
+ aa
1113
+ tttg
1114
+ >>> a.three_prime_end()
1115
+ ("5'", 'gt')
1116
+ >>> a = Dseq("aaac", "tt", ovhg=0)
1117
+ >>> a
1118
+ Dseq(-4)
1119
+ aaac
1120
+ tt
1121
+ >>> a.three_prime_end()
1122
+ ("3'", 'ac')
1012
1123
  >>> from pydna.dseq import Dseq
1013
1124
  >>> a=Dseq("aaa", "ttt")
1014
1125
  >>> a
@@ -1017,21 +1128,6 @@ class Dseq(_Seq):
1017
1128
  ttt
1018
1129
  >>> a.three_prime_end()
1019
1130
  ('blunt', '')
1020
- >>> a=Dseq("aaa", "ttt", ovhg=1)
1021
- >>> a
1022
- Dseq(-4)
1023
- aaa
1024
- ttt
1025
- >>> a.three_prime_end()
1026
- ("3'", 'a')
1027
- >>> a=Dseq("aaa", "ttt", ovhg=-1)
1028
- >>> a
1029
- Dseq(-4)
1030
- aaa
1031
- ttt
1032
- >>> a.three_prime_end()
1033
- ("5'", 't')
1034
- >>>
1035
1131
 
1036
1132
  See also
1037
1133
  --------
@@ -1039,42 +1135,73 @@ class Dseq(_Seq):
1039
1135
 
1040
1136
  """
1041
1137
 
1042
- ovhg = len(self.watson) - len(self.crick) + self.ovhg
1138
+ # See docstring for function pydna.utils.get_parts for details
1139
+ # on what is contained in parts.
1140
+ parts = self.get_parts()
1141
+
1142
+ sticky5 = parts.sticky_right5.translate(dscode_to_crick_table)[::-1]
1143
+
1144
+ sticky3 = parts.sticky_right3.translate(dscode_to_watson_table)
1145
+
1146
+ single_watson = parts.single_watson.translate(dscode_to_watson_table)
1147
+
1148
+ single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1]
1043
1149
 
1044
- if ovhg < 0:
1045
- sticky = self.crick[:-ovhg].lower()
1150
+ # The walrus operator returns the value being assigned, so
1151
+ # we can test if it is empty or not.
1152
+ if sticky := single_watson:
1153
+ type_ = "single"
1154
+ elif sticky := single_crick:
1155
+ type_ = "single"
1156
+ elif sticky5 == sticky3 == "":
1157
+ type_, sticky = "blunt", ""
1158
+ elif sticky := sticky5:
1046
1159
  type_ = "5'"
1047
- elif ovhg > 0:
1048
- sticky = self.watson[-ovhg:].lower()
1160
+ elif sticky := sticky3:
1049
1161
  type_ = "3'"
1050
- else:
1051
- sticky = ""
1052
- type_ = "blunt"
1053
- return type_, sticky
1054
1162
 
1055
- def watson_ovhg(self) -> int:
1056
- """Returns the overhang of the watson strand at the three prime."""
1057
- return len(self.watson) - len(self.crick) + self.ovhg
1163
+ return type_, sticky.lower()
1058
1164
 
1059
- def __add__(self: DseqType, other: DseqType) -> DseqType:
1060
- """Simulates ligation between two DNA fragments.
1165
+ def __add__(self: DseqType, other: [DseqType, str, bytes]) -> DseqType:
1166
+ """
1167
+ Adding two Dseq objects together.
1168
+
1169
+ >>> ds = Dseq("a", "t", ovhg=0)
1170
+ >>> ds
1171
+ Dseq(-1)
1172
+ a
1173
+ t
1174
+ >>> ds + ds
1175
+ Dseq(-2)
1176
+ aa
1177
+ tt
1178
+ >>> "g" + ds # adding a string of left side returns a Dseq
1179
+ Dseq(-2)
1180
+ ga
1181
+ ct
1182
+ >>> ds + "c" # adding a string of right side returns a Dseq
1183
+ Dseq(-2)
1184
+ ac
1185
+ tg
1061
1186
 
1062
- Add other Dseq object at the end of the sequence.
1063
- Type error is raised if any of the points below are fulfilled:
1064
1187
 
1065
- * one or more objects are circular
1066
- * if three prime sticky end of self is not the same type
1067
- (5' or 3') as the sticky end of other
1068
- * three prime sticky end of self complementary with five
1069
- prime sticky end of other.
1188
+ Parameters
1189
+ ----------
1190
+ other : [DseqType, str, bytes]
1191
+ Object to be added.
1070
1192
 
1071
- Phosphorylation and dephosphorylation is not considered.
1193
+ Raises
1194
+ ------
1195
+ TypeError
1196
+ Preventing adding to a circular sequence.
1072
1197
 
1073
- DNA is allways presumed to have the necessary 5' phospate
1074
- group necessary for ligation.
1198
+ Returns
1199
+ -------
1200
+ DseqType
1201
+ A new Dseq object.
1075
1202
 
1076
1203
  """
1077
- # test for circular DNA
1204
+
1078
1205
  if self.circular:
1079
1206
  raise TypeError("circular DNA cannot be ligated!")
1080
1207
  try:
@@ -1083,60 +1210,85 @@ class Dseq(_Seq):
1083
1210
  except AttributeError:
1084
1211
  pass
1085
1212
 
1213
+ # If other evaluates to False, return a copy of self.
1214
+ if not other:
1215
+ return copy.deepcopy(self)
1216
+ # If self evaluates to False, return a copy of other.
1217
+ elif not self:
1218
+ return copy.deepcopy(other)
1219
+
1220
+ # get right side end properties for self.
1086
1221
  self_type, self_tail = self.three_prime_end()
1087
- other_type, other_tail = other.five_prime_end()
1088
1222
 
1089
- if self_type == other_type and str(self_tail) == str(_rc(other_tail)):
1090
- answer = Dseq.quick(
1091
- self.watson + other.watson, other.crick + self.crick, self.ovhg
1092
- )
1093
- elif not self:
1094
- answer = _copy.deepcopy(other)
1095
- elif not other:
1096
- answer = _copy.deepcopy(self)
1097
- else:
1098
- raise TypeError("sticky ends not compatible!")
1099
- return answer
1223
+ try:
1224
+ other_type, other_tail = other.five_prime_end()
1225
+ except AttributeError:
1226
+ # if other does not have the expected properties
1227
+ # most likely it is a string that can be cast as
1228
+ # a Dseq.
1229
+ other_type, other_tail = "blunt", ""
1230
+ other = Dseq(other)
1231
+
1232
+ err = TypeError("sticky ends not compatible!")
1233
+
1234
+ # The sticky ends have to be of the same type
1235
+ # or
1236
+ # one or both of them is "single", indicating a single-stranded molecule.
1237
+ if (self_type != other_type) and ("single" not in (self_type, other_type)):
1238
+ raise err
1239
+
1240
+ # tail length has to be equal for two phosphodiester bonds to form
1241
+ if len(self_tail) != len(other_tail):
1242
+ raise err
1243
+
1244
+ # Each basepair is checked against the pydna.alphabet basepair_dict
1245
+ # which contains the permitted base pairings.
1246
+ for w, c in zip(self_tail, other_tail[::-1]):
1247
+ try:
1248
+ basepair_dict[(w, c)]
1249
+ except KeyError:
1250
+ raise err
1251
+
1252
+ return self.__class__(
1253
+ self.watson + other.watson, other.crick + self.crick, self.ovhg
1254
+ )
1100
1255
 
1101
1256
  def __mul__(self: DseqType, number: int) -> DseqType:
1102
1257
  if not isinstance(number, int):
1103
1258
  raise TypeError(
1104
- "TypeError: can't multiply Dseq by non-int of type {}".format(
1105
- type(number)
1106
- )
1259
+ "TypeError: can't multiply Dseq" f" by non-int of type {type(number)}"
1107
1260
  )
1108
- if number <= 0:
1109
- return self.__class__("")
1110
- new = _copy.deepcopy(self)
1111
- for i in range(number - 1):
1112
- new += self
1113
- return new
1261
+ return Dseq("").join(list(itertools.repeat(self, number)))
1114
1262
 
1115
- def _fill_in_five_prime(self: DseqType, nucleotides: str) -> str:
1263
+ def _fill_in_left(self: DseqType, nucleotides: str) -> str:
1116
1264
  stuffer = ""
1117
1265
  type, se = self.five_prime_end()
1118
1266
  if type == "5'":
1119
- for n in _rc(se):
1267
+ for n in rc(se):
1120
1268
  if n in nucleotides:
1121
1269
  stuffer += n
1122
1270
  else:
1123
1271
  break
1124
1272
  return self.crick + stuffer, self.ovhg + len(stuffer)
1125
1273
 
1126
- def _fill_in_three_prime(self: DseqType, nucleotides: str) -> str:
1274
+ def _fill_in_right(self: DseqType, nucleotides: str) -> str:
1127
1275
  stuffer = ""
1128
1276
  type, se = self.three_prime_end()
1129
1277
  if type == "5'":
1130
- for n in _rc(se):
1278
+ for n in rc(se):
1131
1279
  if n in nucleotides:
1132
1280
  stuffer += n
1133
1281
  else:
1134
1282
  break
1135
1283
  return self.watson + stuffer
1136
1284
 
1137
- def fill_in(self, nucleotides: _Union[None, str] = None) -> "Dseq":
1285
+ def fill_in(self, nucleotides: Union[None, str] = None) -> DseqType:
1138
1286
  """Fill in of five prime protruding end with a DNA polymerase
1139
- that has only DNA polymerase activity (such as exo-klenow [#]_)
1287
+ that has only DNA polymerase activity (such as Exo-Klenow [#]_).
1288
+ Exo-Klenow is a modified version of the Klenow fragment of E.
1289
+ coli DNA polymerase I, which has been engineered to lack both
1290
+ 3'-5' proofreading and 5'-3' exonuclease activities.
1291
+
1140
1292
  Any combination of A, G, C or T can be used. Default are all four
1141
1293
  nucleotides together.
1142
1294
 
@@ -1149,15 +1301,6 @@ class Dseq(_Seq):
1149
1301
  --------
1150
1302
 
1151
1303
  >>> from pydna.dseq import Dseq
1152
- >>> a=Dseq("aaa", "ttt")
1153
- >>> a
1154
- Dseq(-3)
1155
- aaa
1156
- ttt
1157
- >>> a.fill_in()
1158
- Dseq(-3)
1159
- aaa
1160
- ttt
1161
1304
  >>> b=Dseq("caaa", "cttt")
1162
1305
  >>> b
1163
1306
  Dseq(-5)
@@ -1184,7 +1327,15 @@ class Dseq(_Seq):
1184
1327
  Dseq(-5)
1185
1328
  aaac
1186
1329
  gttt
1187
- >>>
1330
+ >>> a=Dseq("aaa", "ttt")
1331
+ >>> a
1332
+ Dseq(-3)
1333
+ aaa
1334
+ ttt
1335
+ >>> a.fill_in()
1336
+ Dseq(-3)
1337
+ aaa
1338
+ ttt
1188
1339
 
1189
1340
  References
1190
1341
  ----------
@@ -1195,32 +1346,31 @@ class Dseq(_Seq):
1195
1346
  nucleotides = "GATCRYWSMKHBVDN"
1196
1347
 
1197
1348
  nucleotides = set(nucleotides.lower() + nucleotides.upper())
1198
- crick, ovhg = self._fill_in_five_prime(nucleotides)
1199
- watson = self._fill_in_three_prime(nucleotides)
1349
+ crick, ovhg = self._fill_in_left(nucleotides)
1350
+ watson = self._fill_in_right(nucleotides)
1200
1351
  return Dseq(watson, crick, ovhg)
1201
1352
 
1202
- def transcribe(self) -> _Seq:
1203
- return _Seq(self.watson).transcribe()
1204
-
1205
- def translate(
1206
- self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
1207
- ) -> _Seq:
1208
- return _Seq(
1209
- _translate_str(str(self), table, stop_symbol, to_stop, cds, gap=gap)
1210
- )
1353
+ klenow = fill_in # alias
1211
1354
 
1212
- def mung(self) -> "Dseq":
1355
+ def nibble_to_blunt(self) -> DseqType:
1213
1356
  """
1214
- Simulates treatment a nuclease with 5'-3' and 3'-5' single
1357
+ Simulates treatment with a nuclease that has both 5'-3' and 3'-5' single
1215
1358
  strand specific exonuclease activity (such as mung bean nuclease [#]_)
1216
1359
 
1360
+ Mung bean nuclease is a nuclease enzyme derived from mung bean sprouts
1361
+ that preferentially degrades single-stranded DNA and RNA into
1362
+ 5'-phosphate- and 3'-hydroxyl-containing nucleotides.
1363
+
1364
+ Treatment results in blunt DNA, regardless of whether the protruding end
1365
+ is 5' or 3'.
1366
+
1217
1367
  ::
1218
1368
 
1219
1369
  ggatcc -> gatcc
1220
1370
  ctaggg ctagg
1221
1371
 
1222
- ggatcc -> ggatc
1223
- tcctag cctag
1372
+ ggatcc -> ggatc
1373
+ tcctag cctag
1224
1374
 
1225
1375
  >>> from pydna.dseq import Dseq
1226
1376
  >>> b=Dseq("caaa", "cttt")
@@ -1250,19 +1400,60 @@ class Dseq(_Seq):
1250
1400
 
1251
1401
 
1252
1402
  """
1253
- return Dseq(
1254
- self.watson[
1255
- max(0, -self.ovhg) : min(len(self.watson), len(self.crick) - self.ovhg)
1256
- ]
1257
- )
1403
+ parts = self.get_parts()
1404
+ return self.__class__(parts.middle)
1405
+
1406
+ mung = nibble_to_blunt
1407
+
1408
+ def T4(self, nucleotides=None) -> DseqType:
1409
+ """
1410
+ Fill in 5' protruding ends and nibble 3' protruding ends.
1411
+
1412
+ This is done using a DNA polymerase providing 3'-5' nuclease activity
1413
+ such as T4 DNA polymerase. This can be done in presence of any
1414
+ combination of the four nucleotides A, G, C or T.
1415
+
1416
+ T4 DNA polymerase is widely used to “polish” DNA ends because of its
1417
+ strong 3'-5' exonuclease activity. In the absence of dNTPs, it chews
1418
+ back 3′ overhangs to create blunt ends; in the presence of limiting
1419
+ dNTPs, it can fill in 5′ overhangs; and by carefully controlling
1420
+ reaction time, temperature, and nucleotide supply, you can generate
1421
+ defined recessed or blunt termini.
1422
+
1423
+ Tuning the nucleotide set can facilitate engineering of partial
1424
+ sticky ends. Default are all four nucleotides together.
1425
+
1426
+ ::
1427
+
1428
+ aaagatc-3 aaa 3' ends are always removed.
1429
+ ||| ---> ||| A and T needed or the molecule will
1430
+ 3-ctagttt ttt degrade completely.
1431
+
1432
+
1433
+
1434
+ 5-gatcaaa gatcaaaGATC 5' ends are filled in the
1435
+ ||| ---> ||||||||||| presence of GATC
1436
+ tttctag-5 CTAGtttctag
1437
+
1438
+
1439
+
1440
+ 5-gatcaaa gatcaaaGAT 5' ends are partially filled in the
1441
+ ||| ---> ||||||||| presence of GAT to produce a 1 nt
1442
+ tttctag-5 TAGtttctag 5' overhang
1443
+
1444
+
1445
+
1446
+ 5-gatcaaa gatcaaaGA 5' ends are partially filled in the
1447
+ ||| ---> ||||||| presence of GA to produce a 2 nt
1448
+ tttctag-5 AGtttctag 5' overhang
1449
+
1450
+
1451
+
1452
+ 5-gatcaaa gatcaaaG 5' ends are partially filled in the
1453
+ ||| ---> ||||| presence of G to produce a 3 nt
1454
+ tttctag-5 Gtttctag 5' overhang
1455
+
1258
1456
 
1259
- def T4(self, nucleotides=None) -> "Dseq":
1260
- """Fill in five prime protruding ends and chewing back
1261
- three prime protruding ends by a DNA polymerase providing both
1262
- 5'-3' DNA polymerase activity and 3'-5' nuclease acitivty
1263
- (such as T4 DNA polymerase). This can be done in presence of any
1264
- combination of the four A, G, C or T. Removing one or more nucleotides
1265
- can facilitate engineering of sticky ends. Default are all four nucleotides together.
1266
1457
 
1267
1458
  Parameters
1268
1459
  ----------
@@ -1273,29 +1464,31 @@ class Dseq(_Seq):
1273
1464
  --------
1274
1465
 
1275
1466
  >>> from pydna.dseq import Dseq
1276
- >>> a=Dseq("gatcgatc")
1467
+ >>> a = Dseq.from_representation(
1468
+ ... '''
1469
+ ... gatcaaa
1470
+ ... tttctag
1471
+ ... ''')
1277
1472
  >>> a
1278
- Dseq(-8)
1279
- gatcgatc
1280
- ctagctag
1473
+ Dseq(-11)
1474
+ gatcaaa
1475
+ tttctag
1281
1476
  >>> a.T4()
1282
- Dseq(-8)
1283
- gatcgatc
1284
- ctagctag
1285
- >>> a.T4("t")
1286
- Dseq(-8)
1287
- gatcgat
1288
- tagctag
1289
- >>> a.T4("a")
1290
- Dseq(-8)
1291
- gatcga
1292
- agctag
1293
- >>> a.T4("g")
1294
- Dseq(-8)
1295
- gatcg
1296
- gctag
1297
- >>>
1298
-
1477
+ Dseq(-11)
1478
+ gatcaaagatc
1479
+ ctagtttctag
1480
+ >>> a.T4("GAT")
1481
+ Dseq(-11)
1482
+ gatcaaagat
1483
+ tagtttctag
1484
+ >>> a.T4("GA")
1485
+ Dseq(-11)
1486
+ gatcaaaga
1487
+ agtttctag
1488
+ >>> a.T4("G")
1489
+ Dseq(-11)
1490
+ gatcaaag
1491
+ gtttctag
1299
1492
  """
1300
1493
 
1301
1494
  if not nucleotides:
@@ -1303,7 +1496,7 @@ class Dseq(_Seq):
1303
1496
  nucleotides = set(nucleotides.lower() + nucleotides.upper())
1304
1497
  type, se = self.five_prime_end()
1305
1498
  if type == "5'":
1306
- crick, ovhg = self._fill_in_five_prime(nucleotides)
1499
+ crick, ovhg = self._fill_in_left(nucleotides)
1307
1500
  else:
1308
1501
  if type == "3'":
1309
1502
  ovhg = 0
@@ -1323,7 +1516,7 @@ class Dseq(_Seq):
1323
1516
  watson = self.watson
1324
1517
  type, se = self.three_prime_end()
1325
1518
  if type == "5'":
1326
- watson = self._fill_in_three_prime(nucleotides)
1519
+ watson = self._fill_in_right(nucleotides)
1327
1520
  else:
1328
1521
  if type == "3'":
1329
1522
  watson = self.watson[: -len(se)]
@@ -1337,32 +1530,305 @@ class Dseq(_Seq):
1337
1530
 
1338
1531
  t4 = T4 # alias for the T4 method.
1339
1532
 
1340
- def exo1_front(self: DseqType, n=1) -> DseqType:
1341
- """5'-3' resection at the start (left side) of the molecule."""
1342
- d = _copy.deepcopy(self)
1343
- d.ovhg += n
1344
- d.watson = d.watson[n:]
1345
- return d
1533
+ def nibble_five_prime_left(self: DseqType, n: int = 1) -> DseqType:
1534
+ """
1535
+ 5' => 3' resection at the left side (start) of the molecule.
1536
+
1537
+ The argument n indicate the number of nucleotides that are to be
1538
+ removed. The outcome of this depend on the structure of the molecule.
1539
+ See the two examples below:
1540
+
1541
+ The figure below indicates a recess of length two from a blunt DNA
1542
+ fragment. The resulting DNA fragment has a 3' protruding single strand.
1543
+
1544
+ ::
1545
+
1546
+ gatc tc
1547
+ |||| --> ||
1548
+ ctag ctag
1549
+
1550
+
1551
+ The figure below indicates a recess of length two from a DNA fragment
1552
+ with a 5' sticky end resulting in a blunt sequence.
1553
+
1554
+ ::
1555
+
1556
+ ttgatc gatc
1557
+ |||| --> ||||
1558
+ ctag ctag
1559
+
1560
+
1561
+ >>> from pydna.dseq import Dseq
1562
+ >>> ds = Dseq("gatc")
1563
+ >>> ds
1564
+ Dseq(-4)
1565
+ gatc
1566
+ ctag
1567
+ >>> ds.nibble_five_prime_left(2)
1568
+ Dseq(-4)
1569
+ tc
1570
+ ctag
1571
+ >>> ds.nibble_five_prime_left(3)
1572
+ Dseq(-4)
1573
+ c
1574
+ ctag
1575
+ >>> ds.nibble_five_prime_left(4)
1576
+ Dseq(-4)
1577
+ <BLANKLINE>
1578
+ ctag
1579
+ >>> ds = Dseq.from_representation(
1580
+ ... '''
1581
+ ... GGgatc
1582
+ ... ctag
1583
+ ... ''')
1584
+ >>> ds
1585
+ Dseq(-6)
1586
+ GGgatc
1587
+ ctag
1588
+ >>> ds.nibble_five_prime_left(2)
1589
+ Dseq(-4)
1590
+ gatc
1591
+ ctag
1592
+
1593
+ Parameters
1594
+ ----------
1595
+ n : int, optional
1596
+ The default is 1. This is the number of nucleotides removed.
1597
+
1598
+ Returns
1599
+ -------
1600
+ DseqType
1601
+ DESCRIPTION.
1602
+
1603
+ """
1604
+ n += max(0, self.ovhg or 0)
1605
+ return Dseq(
1606
+ self._data[:n]
1607
+ .translate(dscode_to_crick_table)
1608
+ .translate(complement_table_for_dscode)
1609
+ .translate(dscode_to_crick_tail_table)
1610
+ .lstrip()
1611
+ + self._data[n:]
1612
+ )
1613
+
1614
+ def nibble_five_prime_right(self: DseqType, n: int = 1) -> DseqType:
1615
+ """
1616
+ 5' => 3' resection at the right side (end) of the molecule.
1617
+
1618
+ The argument n indicate the number of nucleotides that are to be
1619
+ removed. The outcome of this depend on the structure of the molecule.
1620
+ See the two examples below:
1621
+
1622
+ The figure below indicates a recess of length two from a blunt DNA
1623
+ fragment. The resulting DNA fragment has a 3' protruding single strand.
1624
+
1625
+ ::
1626
+
1627
+ gatc gatc
1628
+ |||| --> ||
1629
+ ctag ct
1630
+
1631
+ The figure below indicates a recess of length two from a DNA fragment
1632
+ with a 5' sticky end resulting in a blunt sequence.
1633
+
1634
+ ::
1635
+
1636
+ gatc gatc
1637
+ |||| --> ||||
1638
+ ctagtt ctag
1639
+
1640
+
1641
+ >>> from pydna.dseq import Dseq
1642
+ >>> ds = Dseq("gatc")
1643
+ >>> ds
1644
+ Dseq(-4)
1645
+ gatc
1646
+ ctag
1647
+ >>> ds.nibble_five_prime_right(2)
1648
+ Dseq(-4)
1649
+ gatc
1650
+ ct
1651
+ >>> ds.nibble_five_prime_right(3)
1652
+ Dseq(-4)
1653
+ gatc
1654
+ c
1655
+ >>> ds.nibble_five_prime_right(4)
1656
+ Dseq(-4)
1657
+ gatc
1658
+ <BLANKLINE>
1659
+ >>> ds = Dseq.from_representation(
1660
+ ... '''
1661
+ ... gatc
1662
+ ... ctagGG
1663
+ ... ''')
1664
+ >>> ds.nibble_five_prime_right(2)
1665
+ Dseq(-4)
1666
+ gatc
1667
+ ctag
1668
+ """
1669
+ n = len(self) - n
1670
+ ovhg = len(self) if self.right_ovhg is None else self.right_ovhg
1671
+ n -= max(0, ovhg)
1672
+ return Dseq(
1673
+ self._data[:n]
1674
+ + self._data[n:]
1675
+ .translate(dscode_to_watson_table)
1676
+ .translate(dscode_to_watson_tail_table)
1677
+ .lstrip()
1678
+ )
1679
+
1680
+ exo1_front = nibble_five_prime_left # TODO: consider using the new names
1681
+ exo1_end = nibble_five_prime_right # TODO: consider using the new names
1682
+
1683
+ def nibble_three_prime_left(self: DseqType, n=1) -> DseqType:
1684
+ """
1685
+ 3' => 5' resection at the left side (beginning) of the molecule.
1686
+
1687
+ The argument n indicate the number of nucleotides that are to be
1688
+ removed. The outcome of this depend on the structure of the molecule.
1689
+ See the two examples below:
1690
+
1691
+ The figure below indicates a recess of length two from a blunt DNA
1692
+ fragment. The resulting DNA fragment has a 5' protruding single strand.
1693
+
1694
+ ::
1695
+
1696
+ gatc gatc
1697
+ |||| --> ||
1698
+ ctag ag
1699
+
1700
+ The figure below indicates a recess of length two from a DNA fragment
1701
+ with a 3' sticky end resulting in a blunt sequence.
1702
+
1703
+ ::
1704
+
1705
+ gatc gatc
1706
+ |||| --> ||||
1707
+ ttctag ctag
1708
+
1709
+
1710
+ >>> from pydna.dseq import Dseq
1711
+ >>> ds = Dseq("gatc")
1712
+ >>> ds
1713
+ Dseq(-4)
1714
+ gatc
1715
+ ctag
1716
+ >>> ds.nibble_three_prime_left(2)
1717
+ Dseq(-4)
1718
+ gatc
1719
+ ag
1720
+ >>> ds.nibble_three_prime_left(3)
1721
+ Dseq(-4)
1722
+ gatc
1723
+ g
1724
+ >>> ds.nibble_three_prime_left(4)
1725
+ Dseq(-4)
1726
+ gatc
1727
+ <BLANKLINE>
1728
+ >>> ds = Dseq.from_representation(
1729
+ ... '''
1730
+ ... gatc
1731
+ ... CCctag
1732
+ ... ''')
1733
+ >>> ds
1734
+ Dseq(-6)
1735
+ gatc
1736
+ CCctag
1737
+ >>> ds.nibble_three_prime_left(2)
1738
+ Dseq(-4)
1739
+ gatc
1740
+ ctag
1741
+ """
1742
+ ovhg = len(self) if self.ovhg is None else self.ovhg
1743
+ n -= min(0, ovhg)
1744
+ return Dseq(
1745
+ self._data[:n]
1746
+ .translate(dscode_to_watson_table)
1747
+ .translate(dscode_to_watson_tail_table)
1748
+ .lstrip()
1749
+ + self._data[n:]
1750
+ )
1751
+
1752
def nibble_three_prime_right(self: DseqType, n=1) -> DseqType:
    """
    3' => 5' resection at the right side (end) of the molecule.

    The argument n indicates the number of nucleotides that are to be
    removed. The outcome depends on the structure of the molecule.
    Asking for more nucleotides than are available removes all of them.
    See the two examples below:

    The figure below indicates a recess of length two from a blunt DNA
    fragment. The resulting DNA fragment has a 5' protruding single strand.

    ::

        gatc        ga
        ||||  -->   ||
        ctag        ctag

    The figure below indicates a recess of length two from a DNA fragment
    with a 3' sticky end resulting in a blunt sequence.

    ::

        gatctt      gatc
        ||||   -->  ||||
        ctag        ctag


    >>> from pydna.dseq import Dseq
    >>> ds = Dseq("gatc")
    >>> ds
    Dseq(-4)
    gatc
    ctag
    >>> ds.nibble_three_prime_right(2)
    Dseq(-4)
    ga
    ctag
    >>> ds.nibble_three_prime_right(3)
    Dseq(-4)
    g
    ctag
    >>> ds.nibble_three_prime_right(4)
    Dseq(-4)
    <BLANKLINE>
    ctag
    >>> ds = Dseq.from_representation(
    ... '''
    ... gatcCC
    ... ctag
    ... ''')
    >>> ds.nibble_three_prime_right(2)
    Dseq(-4)
    gatc
    ctag
    """
    ovhg = len(self) if self.right_ovhg is None else self.right_ovhg
    # Index where the rewritten tail starts. A negative right overhang
    # moves the split point further left by its size.
    n = len(self) - n + min(0, ovhg)
    # A negative index would wrap around and slice from the wrong end when
    # more nucleotides are requested than the molecule holds.
    n = max(0, n)
    return Dseq(
        self._data[:n]
        + self._data[n:]
        .translate(dscode_to_crick_table)
        .translate(complement_table_for_dscode)
        .translate(dscode_to_crick_tail_table)
        # Positions left without any nucleotide translate to blanks (see the
        # gatcCC example above). In a right-hand tail they sit at the right
        # edge, so they must be trimmed from the right; trimming from the
        # left would keep them and could drop a leading gap instead.
        .rstrip()
    )
1352
1818
 
1353
1819
def no_cutters(
    self, batch: Union[RestrictionBatch, None] = None
) -> RestrictionBatch:
    """Return the enzymes of a RestrictionBatch that do not cut the sequence.

    ``CommOnly`` is searched when no batch is given.
    """
    enzymes = CommOnly if batch is None else batch
    hits = enzymes.search(self)
    return RestrictionBatch([enz for enz, sites in hits.items() if not sites])
1362
1828
 
1363
1829
  def unique_cutters(
1364
- self, batch: _Union[_RestrictionBatch, None] = None
1365
- ) -> _RestrictionBatch:
1830
+ self, batch: Union[RestrictionBatch, None] = None
1831
+ ) -> RestrictionBatch:
1366
1832
  """Enzymes in a RestrictionBatch cutting sequence once."""
1367
1833
  if batch is None:
1368
1834
  batch = CommOnly
@@ -1371,44 +1837,42 @@ class Dseq(_Seq):
1371
1837
  once_cutters = unique_cutters # alias for unique_cutters
1372
1838
 
1373
1839
def twice_cutters(
    self, batch: Union[RestrictionBatch, None] = None
) -> RestrictionBatch:
    """Return the enzymes of a RestrictionBatch that cut the sequence twice.

    ``CommOnly`` is searched when no batch is given.
    """
    return self.n_cutters(n=2, batch=CommOnly if batch is None else batch)
1380
1846
 
1381
1847
def n_cutters(
    self, n=3, batch: Union[RestrictionBatch, None] = None
) -> RestrictionBatch:
    """Return the enzymes of a RestrictionBatch that cut exactly n times.

    ``CommOnly`` is searched when no batch is given.
    """
    enzymes = CommOnly if batch is None else batch
    hits = enzymes.search(self)
    return RestrictionBatch(
        [enz for enz, sites in hits.items() if len(sites) == n]
    )
1390
1856
 
1391
def cutters(self, batch: Union[RestrictionBatch, None] = None) -> RestrictionBatch:
    """Return the enzymes of a RestrictionBatch that cut at least once.

    ``CommOnly`` is searched when no batch is given.
    """
    enzymes = CommOnly if batch is None else batch
    hits = enzymes.search(self)
    return RestrictionBatch([enz for enz, sites in hits.items() if sites])
1400
1864
 
1401
1865
def seguid(self) -> str:
    """SEGUID checksum for the sequence.

    Circular molecules are checksummed with ``cdseguid`` from the
    upper-cased watson and crick strands. Linear molecules are
    checksummed with ``ldseguid``; both strands are first padded with
    ``-`` according to ``ovhg`` and the strand lengths.

    Both topologies use the same alphabet specification, so a molecule
    accepted as linear is not rejected merely because it is circular.

    Returns
    -------
    str
        The checksum string produced by the seguid package.
    """
    alphabet = "{DNA-extended},AU"
    if self.circular:
        cs = cdseguid(self.watson.upper(), self.crick.upper(), alphabet=alphabet)
    else:
        # Pad whichever strand is recessed at each end with "-".
        w = f"{self.ovhg * '-'}{self.watson}{'-' * (-self.ovhg + len(self.crick) - len(self.watson))}".upper()
        c = f"{'-' * (self.ovhg + len(self.watson) - len(self.crick))}{self.crick}{-self.ovhg * '-'}".upper()
        cs = ldseguid(w, c, alphabet=alphabet)
    return cs
1413
1877
 
1414
1878
  def isblunt(self) -> bool:
@@ -1449,29 +1913,113 @@ class Dseq(_Seq):
1449
1913
  >>> a.isblunt()
1450
1914
  False
1451
1915
  """
1452
- return (
1453
- self.ovhg == 0 and len(self.watson) == len(self.crick) and not self.circular
1916
+ parts = self.get_parts()
1917
+
1918
+ return not any(
1919
+ (
1920
+ parts.sticky_right5,
1921
+ parts.sticky_right3,
1922
+ parts.sticky_left3,
1923
+ parts.sticky_left5,
1924
+ self.circular,
1925
+ )
1454
1926
  )
1455
1927
 
1456
- def cas9(self, RNA: str) -> _Tuple[slice, ...]:
1457
- """docstring."""
1458
- bRNA = bytes(RNA, "ASCII")
1459
- slices = []
1460
- cuts = [0]
1461
- for m in _re.finditer(bRNA, self._data):
1462
- cuts.append(m.start() + 17)
1463
- cuts.append(self.length)
1464
- slices = tuple(slice(x, y, 1) for x, y in zip(cuts, cuts[1:]))
1465
- return slices
1466
-
1467
def terminal_transferase(self, nucleotides: str = "a") -> DseqType:
    """
    Append nucleotides to both 3' ends of the molecule.

    Terminal deoxynucleotidyl transferase (TdT) is a template-independent
    DNA polymerase that adds nucleotides to the 3'-OH ends of DNA, typically
    single-stranded or recessed 3' ends. In cloning, it is classically used
    to create homopolymer tails (e.g. poly-dG on a vector and poly-dC on an
    insert) so that fragments can anneal via complementary overhangs
    ("tailing" cloning).

    This activity is also present in some DNA polymerases, such as Taq
    polymerase. This property is used in the popular T/A cloning
    protocol ([#]_).

    ::

         gct         gcta
         |||  -->    |||
         cga        acga



    >>> from pydna.dseq import Dseq
    >>> a = Dseq("gct")
    >>> a
    Dseq(-3)
    gct
    cga
    >>> a.terminal_transferase()
    Dseq(-5)
     gcta
    acga
    >>> a.terminal_transferase("G")
    Dseq(-5)
     gctG
    Gcga

    Parameters
    ----------
    nucleotides : str, optional
        The nucleotides appended to the 3' end of each strand.
        The default is "a".

    Returns
    -------
    DseqType
        A new molecule built from the two extended strands.

    References
    ----------
    .. [#] https://en.wikipedia.org/wiki/TA_cloning

    """
    # The overhang only grows when it is zero or positive.
    # NOTE(review): for a negative ovhg the lower strand is still extended
    # while ovhg is left unchanged - confirm this is the intended alignment.
    growth = len(nucleotides) if self.ovhg >= 0 else 0
    return Dseq(
        self.watson + nucleotides,
        self.crick + nucleotides,
        self.ovhg + growth,
    )
1473
1982
 
1474
- def cut(self: DseqType, *enzymes: EnzymesType) -> _Tuple[DseqType, ...]:
1983
def user(self) -> DseqType:
    """
    Simulate treatment with USER Enzyme.

    USER Enzyme is a mixture of Uracil DNA glycosylase (UDG) and the
    DNA glycosylase-lyase Endonuclease VIII.

    UDG catalyses the excision of a uracil base, forming an abasic
    or apyrimidinic site (AP site). Endonuclease VIII removes the AP
    site, creating a DNA gap.

    ::

        tagaagtaggUat         tagaagtagg at
        |||||||||||||  --->   |||||||||| ||
        atcUtcatccata         atc tcatccata



    >>> from pydna.dseq import Dseq
    >>> a = Dseq("tagaagtaggUat", "atcUtcatccata"[::-1], 0)
    >>> a
    Dseq(-13)
    tagaagtaggUat
    atcutcatccAta
    >>> a.user()
    Dseq(-13)
    tagaagtagg at
    atc tcatccAta


    Returns
    -------
    DseqType
        DNA fragment with uracil bases removed.

    """
    # Byte-for-byte substitution U->Z, u->z, O->E, o->e on the dscode data.
    # Presumably these are the uracil-containing symbols and their gapped
    # counterparts - confirm against pydna.alphabet.
    uracil_to_gap = bytes.maketrans(b"UuOo", b"ZzEe")
    return Dseq(self._data.translate(uracil_to_gap))
2021
+
2022
+ def cut(self: DseqType, *enzymes: EnzymesType) -> Tuple[DseqType, ...]:
1475
2023
  """Returns a list of linear Dseq fragments produced in the digestion.
1476
2024
  If there are no cuts, an empty list is returned.
1477
2025
 
@@ -1522,11 +2070,73 @@ class Dseq(_Seq):
1522
2070
  return tuple(self.apply_cut(*cs) for cs in cutsite_pairs)
1523
2071
 
1524
2072
  def cutsite_is_valid(self, cutsite: CutSiteType) -> bool:
1525
- """Returns False if:
2073
+ """
2074
+ Check is a cutsite is valid.
2075
+
2076
+ A cutsite is a nested 2-tuple with this form:
2077
+
2078
+ ((cut_watson, ovhg), enz), for example ((396, -4), EcoRI)
2079
+
2080
+ The cut_watson (positive integer) is the cut position of the sequence as for example
2081
+ returned by the Bio.Restriction module.
2082
+
2083
+ The ovhg (overhang, positive or negative integer or 0) has the same meaning as
2084
+ for restriction enzymes in the Bio.Restriction module and for
2085
+ pydna.dseq.Dseq objects (see docstring for this module and example below)
2086
+
2087
+ Enzyme can be None.
2088
+
2089
+ ::
2090
+
2091
+ Enzyme overhang
2092
+
2093
+ EcoRI -4 --GAATTC-- --G AATTC--
2094
+ |||||| --> | |
2095
+ --CTTAAG-- --CTTAA G--
2096
+
2097
+ KpnI 4 --GGTACC-- --GGTAC C--
2098
+ |||||| --> | |
2099
+ --CCATGG-- --C CATGG--
2100
+
2101
+ SmaI 0 --CCCGGG-- --CCC GGG--
2102
+ |||||| --> ||| |||
2103
+ --GGGCCC-- --GGG CCC--
2104
+
2105
+
2106
+ >>> from Bio.Restriction import EcoRI, KpnI, SmaI
2107
+ >>> EcoRI.ovhg
2108
+ -4
2109
+ >>> KpnI.ovhg
2110
+ 4
2111
+ >>> SmaI.ovhg
2112
+ 0
2113
+
2114
+ Returns False if:
2115
+
1526
2116
  - Cut positions fall outside the sequence (could be moved to Biopython)
2117
+ TODO: example
2118
+
1527
2119
  - Overhang is not double stranded
2120
+ TODO: example
2121
+
1528
2122
  - Recognition site is not double stranded or is outside the sequence
2123
+ TODO: example
2124
+
1529
2125
  - For enzymes that cut twice, it checks that at least one possibility is valid
2126
+ TODO: example
2127
+
2128
+
2129
+
2130
+ Parameters
2131
+ ----------
2132
+ cutsite : CutSiteType
2133
+ DESCRIPTION.
2134
+
2135
+ Returns
2136
+ -------
2137
+ bool
2138
+ True if cutsite can cut the DNA fragment.
2139
+
1530
2140
  """
1531
2141
 
1532
2142
  assert cutsite is not None, "cutsite is None"
@@ -1536,7 +2146,7 @@ class Dseq(_Seq):
1536
2146
 
1537
2147
  # The overhang is double stranded
1538
2148
  overhang_dseq = self[watson:crick] if ovhg < 0 else self[crick:watson]
1539
- if overhang_dseq.ovhg != 0 or overhang_dseq.watson_ovhg() != 0:
2149
+ if overhang_dseq.ovhg != 0 or overhang_dseq.watson_ovhg != 0:
1540
2150
  return False
1541
2151
 
1542
2152
  # The recognition site is double stranded and within the sequence
@@ -1550,7 +2160,7 @@ class Dseq(_Seq):
1550
2160
  if (
1551
2161
  len(recognition_site) == 0
1552
2162
  or recognition_site.ovhg != 0
1553
- or recognition_site.watson_ovhg() != 0
2163
+ or recognition_site.watson_ovhg != 0
1554
2164
  ):
1555
2165
  if enz is None or enz.scd5 is None:
1556
2166
  return False
@@ -1569,20 +2179,22 @@ class Dseq(_Seq):
1569
2179
  if (
1570
2180
  len(recognition_site) == 0
1571
2181
  or recognition_site.ovhg != 0
1572
- or recognition_site.watson_ovhg() != 0
2182
+ or recognition_site.watson_ovhg != 0
1573
2183
  ):
1574
2184
  return False
1575
2185
 
1576
2186
  return True
1577
2187
 
1578
- def get_cutsites(self: DseqType, *enzymes: EnzymesType) -> _List[CutSiteType]:
2188
+ def get_cutsites(self: DseqType, *enzymes: EnzymesType) -> List[CutSiteType]:
1579
2189
  """Returns a list of cutsites, represented represented as `((cut_watson, ovhg), enz)`:
1580
2190
 
1581
2191
  - `cut_watson` is a positive integer contained in `[0,len(seq))`, where `seq` is the sequence
1582
2192
  that will be cut. It represents the position of the cut on the watson strand, using the full
1583
2193
  sequence as a reference. By "full sequence" I mean the one you would get from `str(Dseq)`.
2194
+
1584
2195
  - `ovhg` is the overhang left after the cut. It has the same meaning as `ovhg` in
1585
2196
  the `Bio.Restriction` enzyme objects, or pydna's `Dseq` property.
2197
+
1586
2198
  - `enz` is the enzyme object. It's not necessary to perform the cut, but can be
1587
2199
  used to keep track of which enzyme was used.
1588
2200
 
@@ -1592,7 +2204,7 @@ class Dseq(_Seq):
1592
2204
  Parameters
1593
2205
  ----------
1594
2206
 
1595
- enzymes : Union[_RestrictionBatch,list[_AbstractCut]]
2207
+ enzymes : Union[RestrictionBatch,list[_AbstractCut]]
1596
2208
 
1597
2209
  Returns
1598
2210
  -------
@@ -1628,11 +2240,11 @@ class Dseq(_Seq):
1628
2240
 
1629
2241
  """
1630
2242
 
1631
- if len(enzymes) == 1 and isinstance(enzymes[0], _RestrictionBatch):
2243
+ if len(enzymes) == 1 and isinstance(enzymes[0], RestrictionBatch):
1632
2244
  # argument is probably a RestrictionBatch
1633
2245
  enzymes = [e for e in enzymes[0]]
1634
2246
 
1635
- enzymes = _flatten(enzymes)
2247
+ enzymes = list(dict.fromkeys(flatten(enzymes))) # remove duplicate enzymes
1636
2248
  out = list()
1637
2249
  for e in enzymes:
1638
2250
  # Positions of the cut on the watson strand. They are 1-based, so we subtract
@@ -1643,7 +2255,7 @@ class Dseq(_Seq):
1643
2255
 
1644
2256
  return sorted([cutsite for cutsite in out if self.cutsite_is_valid(cutsite)])
1645
2257
 
1646
- def left_end_position(self) -> _Tuple[int, int]:
2258
+ def left_end_position(self) -> Tuple[int, int]:
1647
2259
  """
1648
2260
  The index in the full sequence of the watson and crick start positions.
1649
2261
 
@@ -1660,7 +2272,7 @@ class Dseq(_Seq):
1660
2272
  return self.ovhg, 0
1661
2273
  return 0, -self.ovhg
1662
2274
 
1663
- def right_end_position(self) -> _Tuple[int, int]:
2275
+ def right_end_position(self) -> Tuple[int, int]:
1664
2276
  """The index in the full sequence of the watson and crick end positions.
1665
2277
 
1666
2278
  full sequence (str(self)) for all three cases is AAA
@@ -1672,13 +2284,210 @@ class Dseq(_Seq):
1672
2284
  ```
1673
2285
 
1674
2286
  """
1675
- if self.watson_ovhg() < 0:
1676
- return len(self) + self.watson_ovhg(), len(self)
1677
- return len(self), len(self) - self.watson_ovhg()
2287
+ if self.watson_ovhg < 0:
2288
+ return len(self) + self.watson_ovhg, len(self)
2289
+ return len(self), len(self) - self.watson_ovhg
2290
+
2291
def get_ss_meltsites(
    self: DseqType, length: int
) -> tuple[list[tuple[int, int]], list[tuple[int, int]]]:
    """
    Single stranded DNA melt sites

    Two lists of 2-tuples of integers are returned, the first for the
    watson strand and the second for the crick strand. Each tuple
    (`(from, to)`) contains the start and end positions of a single
    stranded region, shorter or equal to `length`.

    In the example below, the middle 2 nt part is released from the
    molecule.

    ::


        tagaa ta gtatg
        ||||| || |||||  -->  [(6,8)], []
        atcttcatccatac

        tagaagtaggtatg
        ||||| || |||||  -->  [], [(6,8)]
        atctt at catac




    The output of this method is used in the `melt_ss_dna` method in order
    to determine the start and end positions of single stranded regions.

    See get_ds_meltsites for melting ds sequences.

    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> ds = Dseq("tagaaqtaqgtatg")
    >>> ds
    Dseq(-14)
    tagaa ta gtatg
    atcttcatccatac
    >>> cutsites = ds.get_ss_meltsites(2)
    >>> cutsites
    ([(6, 8)], [])
    >>> ds[6:8]
    Dseq(-2)
    ta
    at
    >>> ds = Dseq("tagaaptapgtatg")
    >>> ds
    Dseq(-14)
    tagaagtaggtatg
    atctt at catac
    >>> cutsites = ds.get_ss_meltsites(2)
    >>> cutsites
    ([], [(6, 8)])
    """

    regex = regex_ss_melt_factory(length)

    if self.circular:
        # Pad both ends with `length` symbols from the opposite end so that
        # the pattern can see across the origin.
        spacer = length
        cutfrom = self._data[-length:] + self._data + self._data[:length]
    else:
        spacer = 0
        cutfrom = self._data

    watson_cuts = []
    crick_cuts = []

    for m in regex.finditer(cutfrom):
        # The padding is prepended, so a position in `cutfrom` lies `spacer`
        # symbols to the right of the same position in the molecule.
        # NOTE(review): matches that start inside the padding are not
        # filtered out or wrapped here - confirm how callers should treat
        # regions touching the origin of circular molecules.
        span = (m.start() - spacer, m.end() - spacer)
        if m.lastgroup == "watson":
            watson_cuts.append(span)
        else:
            assert m.lastgroup == "crick"
            crick_cuts.append(span)

    return watson_cuts, crick_cuts
2371
+
2372
def get_ds_meltsites(self: DseqType, length: int) -> List[CutSiteType]:
    """
    Double stranded DNA melt sites

    DNA molecules can fall apart by melting if they have internal single
    stranded regions. In the example below, the molecule has two gaps
    on opposite sides, two nucleotides apart, which means that it hangs
    together by two basepairs.

    This molecule can melt into two separate 8 bp double stranded
    molecules, each with 3 nt 3' overhangs as depicted below.

    ::

        tagaagta gtatg        tagaagta      gtatg
        ||||| || |||||  -->   |||||         |||||
        atctt atccatac        atctt      atccatac


    A list of 2-tuples is returned. Each tuple (`((cut_watson, ovhg), None)`)
    contains cut position and the overhang value in the same format as
    returned by the get_cutsites method for restriction enzymes. The list
    is empty when `length` is smaller than one.

    Note that this function deals with melting that results in two double
    stranded DNA molecules.

    See get_ss_meltsites for melting of single stranded regions from
    molecules.

    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> ds = Dseq("tagaaptaqgtatg")
    >>> ds
    Dseq(-14)
    tagaagta gtatg
    atctt atccatac
    >>> cutsite = ds.get_ds_meltsites(2)
    >>> cutsite
    [((8, 2), None)]

    """

    if length < 1:
        # Same container type as the normal return path and the annotation.
        return []

    regex = regex_ds_melt_factory(length)

    if self.circular:
        # Pad both ends with `length` symbols from the opposite end so that
        # the pattern can see across the origin.
        spacer = length
        cutfrom = self._data[-length:] + self._data + self._data[:length]
    else:
        spacer = 0
        cutfrom = self._data

    cuts = []

    for m in regex.finditer(cutfrom):
        if m.lastgroup == "watson":
            # Cut at the end of the match; positive overhang of match length.
            cut = (m.end() - spacer, m.end() - m.start()), None
        else:
            assert m.lastgroup == "crick"
            # Cut at the start of the match; negative overhang of match length.
            cut = (m.start() - spacer, m.start() - m.end()), None
        cuts.append(cut)

    return cuts
2440
+
2441
def cast_to_ds_right(self):
    """
    Make the right end of the molecule double stranded.

    Whichever sticky end is present on the right side (5' or 3') is
    converted to double stranded sequence. The left end is kept as it is.

    ::

        NNNN          NNNNGATC
        ||||     -->  ||||||||
        NNNNCTAG      NNNNCTAG


        NNNNGATC      NNNNGATC
        ||||     -->  ||||||||
        NNNN          NNNNCTAG
    """
    parts = self.get_parts()

    left_end = parts.sticky_left5 or parts.sticky_left3
    right_end = parts.sticky_right5 or parts.sticky_right3
    filled_right = right_end.translate(dscode_to_full_sequence_table)

    return self.__class__(left_end + parts.middle + filled_right, circular=False)
2462
+
2463
def cast_to_ds(self):
    """Sequentially calls cast_to_ds_left and cast_to_ds_right."""
    left_done = self.cast_to_ds_left()
    return left_done.cast_to_ds_right()
2466
+
2467
def cast_to_ds_left(self):
    """
    Make the left end of the molecule double stranded.

    Whichever sticky end is present on the left side (5' or 3') is
    converted to double stranded sequence. The right end is kept as it is.

    ::

        GATCNNNN      GATCNNNN
            ||||  --> ||||||||
            NNNN      CTAGNNNN

            NNNN      GATCNNNN
            ||||  --> ||||||||
        CTAGNNNN      CTAGNNNN
    """
    parts = self.get_parts()

    left_end = parts.sticky_left5 or parts.sticky_left3
    right_end = parts.sticky_right5 or parts.sticky_right3
    filled_left = left_end.translate(dscode_to_full_sequence_table)

    return self.__class__(filled_left + parts.middle + right_end, circular=False)
1678
2487
 
1679
2488
  def get_cut_parameters(
1680
- self, cut: _Union[CutSiteType, None], is_left: bool
1681
- ) -> _Tuple[int, int, int]:
2489
+ self, cut: Union[CutSiteType, None], is_left: bool
2490
+ ) -> Tuple[int, int, int]:
1682
2491
  """For a given cut expressed as ((cut_watson, ovhg), enz), returns
1683
2492
  a tuple (cut_watson, cut_crick, ovhg).
1684
2493
 
@@ -1703,7 +2512,169 @@ class Dseq(_Seq):
1703
2512
  if is_left:
1704
2513
  return *self.left_end_position(), self.ovhg
1705
2514
  # In the right end, the overhang does not matter
1706
- return *self.right_end_position(), self.watson_ovhg()
2515
+ return *self.right_end_position(), self.watson_ovhg
2516
+
2517
def melt(self, length):
    """
    Melt the molecule into the pieces it falls apart into.

    Single stranded regions are shed first with `melt_ss_dna`. The
    remaining molecule is then cut at the positions reported by
    `get_ds_meltsites`, using `get_cutsite_pairs` and `apply_cut`.

    Parameters
    ----------
    length : int
        Passed on to `melt_ss_dna` and `get_ds_meltsites`. A value that
        is falsy or smaller than one gives an empty tuple.

    Returns
    -------
    tuple
        The shed single strands followed by the double stranded
        fragments. If strands were shed but no fragments were produced,
        the remaining molecule takes the place of the fragments.

    """
    if not length or length < 1:
        return ()

    # Single stranded stretches have to go before looking for ds melt sites.
    remainder, shed = self.melt_ss_dna(length)

    meltsites = remainder.get_ds_meltsites(length)
    pairs = self.get_cutsite_pairs(meltsites)
    fragments = tuple(remainder.apply_cut(*pair) for pair in pairs)

    if shed and not fragments:
        fragments = (remainder,)

    return tuple(shed) + tuple(fragments)
2547
+
2548
def melt_ss_dna(self, length) -> tuple["Dseq", list["Dseq"]]:
    """
    Melt off single stranded DNA

    Single stranded stretches shorter than or equal to `length` are shed
    from a double stranded DNA molecule without affecting the length of
    the remaining molecule.

    In the examples below, the middle 2 nt part is released from the
    molecule.

    ::

        tagaa ta gtatg        tagaa    gtatg     ta
        ||||| || |||||  -->   |||||    |||||  +  ||
        atcttcatccatac        atcttcatccatac

        tagaagtaggtatg        tagaagtaggtatg
        ||||| || |||||  -->   |||||    |||||  +  ||
        atctt at catac        atctt    catac     at


    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> ds = Dseq("tagaaqtaqgtatg")
    >>> ds
    Dseq(-14)
    tagaa ta gtatg
    atcttcatccatac
    >>> new, strands = ds.melt_ss_dna(2)
    >>> new
    Dseq(-14)
    tagaa    gtatg
    atcttcatccatac
    >>> strands[0]
    Dseq(-2)
    ta
    <BLANKLINE>
    >>> ds = Dseq("tagaaptapgtatg")
    >>> ds
    Dseq(-14)
    tagaagtaggtatg
    atctt at catac
    >>> new, strands = ds.melt_ss_dna(2)
    >>> new
    Dseq(-14)
    tagaagtaggtatg
    atctt    catac
    >>> strands[0]
    Dseq(-2)
    <BLANKLINE>
    at
    """
    # Locate the stretches on each strand, then hand them to shed_ss_dna.
    watson_spans, crick_spans = self.get_ss_meltsites(length)
    remainder, shed = self.shed_ss_dna(watson_spans, crick_spans)
    return remainder, shed
2608
+
2609
def shed_ss_dna(
    self,
    watson_cutpairs: list[tuple[int, int]] = None,
    crick_cutpairs: list[tuple[int, int]] = None,
):
    """
    Release stretches of one of the DNA strands

    Each `(start, end)` pair names a slice of the molecule. For pairs in
    `watson_cutpairs` the upper strand of that slice is released, for
    pairs in `crick_cutpairs` the lower strand. The remaining molecule and
    a list with the released strands are returned. Watson pairs are
    processed before crick pairs.

    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> ds = Dseq("tagaagtaggtatg")
    >>> ds
    Dseq(-14)
    tagaagtaggtatg
    atcttcatccatac
    >>> new, strands = ds.shed_ss_dna([(6, 8)],[])
    >>> new
    Dseq(-14)
    tagaag  ggtatg
    atcttcatccatac
    >>> strands[0]
    Dseq(-2)
    ta
    <BLANKLINE>
    >>> new, strands = ds.shed_ss_dna([],[(6, 8)])
    >>> new
    Dseq(-14)
    tagaagtaggtatg
    atcttc  ccatac
    >>> strands[0]
    Dseq(-2)
    <BLANKLINE>
    at
    >>> ds = Dseq("tagaagtaggtatg")
    >>> new, (strand1, strand2) = ds.shed_ss_dna([(6, 8), (9, 11)],[])
    >>> new
    Dseq(-14)
    tagaag  g  atg
    atcttcatccatac
    >>> strand1
    Dseq(-2)
    ta
    <BLANKLINE>
    >>> strand2
    Dseq(-2)
    gt
    <BLANKLINE>
    """
    # Per strand: the spans, the table applied to the released copy, and the
    # table applied to what stays behind in the molecule.
    jobs = (
        (
            watson_cutpairs or list(),
            dscode_to_watson_tail_table,
            dscode_to_crick_tail_table,
        ),
        (
            crick_cutpairs or list(),
            dscode_to_crick_tail_table,
            dscode_to_watson_tail_table,
        ),
    )

    shed = []
    # Mutable copy, so slices can be overwritten in place.
    remaining = bytearray(self._data)

    for spans, released_table, kept_table in jobs:
        for start, stop in spans:
            segment = remaining[start:stop]
            shed.append(Dseq.quick(segment.translate(released_table)))
            remaining[start:stop] = segment.translate(kept_table)

    return Dseq.quick(remaining), shed
1707
2678
 
1708
2679
  def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq":
1709
2680
  """Extracts a subfragment of the sequence between two cuts.
@@ -1760,25 +2731,22 @@ class Dseq(_Seq):
1760
2731
  GttCTTAA
1761
2732
 
1762
2733
  """
1763
- if _cuts_overlap(left_cut, right_cut, len(self)):
2734
+ if cuts_overlap(left_cut, right_cut, len(self)):
1764
2735
  raise ValueError("Cuts by {} {} overlap.".format(left_cut[1], right_cut[1]))
1765
2736
 
1766
2737
  left_watson, left_crick, ovhg_left = self.get_cut_parameters(left_cut, True)
1767
2738
  right_watson, right_crick, _ = self.get_cut_parameters(right_cut, False)
1768
2739
  return Dseq(
1769
- str(self[left_watson:right_watson]),
1770
- # The line below could be easier to understand as _rc(str(self[left_crick:right_crick])), but it does not preserve the case
1771
- str(
1772
- self.reverse_complement()[
1773
- len(self) - right_crick : len(self) - left_crick
1774
- ]
1775
- ),
2740
+ self[left_watson:right_watson]._data.translate(dscode_to_watson_table),
2741
+ self[left_crick:right_crick]
2742
+ .reverse_complement()
2743
+ ._data.translate(dscode_to_watson_table),
1776
2744
  ovhg=ovhg_left,
1777
2745
  )
1778
2746
 
1779
2747
  def get_cutsite_pairs(
1780
- self, cutsites: _List[CutSiteType]
1781
- ) -> _List[_Tuple[_Union[None, CutSiteType], _Union[None, CutSiteType]]]:
2748
+ self, cutsites: List[CutSiteType]
2749
+ ) -> List[Tuple[Union[None, CutSiteType], Union[None, CutSiteType]]]:
1782
2750
  """Returns pairs of cutsites that render the edges of the resulting fragments.
1783
2751
 
1784
2752
  A fragment produced by restriction is represented by a tuple of length 2 that
@@ -1828,3 +2796,105 @@ class Dseq(_Seq):
1828
2796
  cutsites.append(cutsites[0])
1829
2797
 
1830
2798
  return list(zip(cutsites, cutsites[1:]))
2799
+
2800
def get_parts(self):
    """
    Split the molecule into its named parts.

    A DseqParts instance is returned, holding the parts (strings) of the
    dsDNA sequence. DseqParts instance field names:

    ::

         "sticky_left5"
         |
         |      "sticky_right5"
         |      |
        ---    ---
        GGGATCC
           TAGGTCA
           ----
             |
           "middle"



         "sticky_left3"
         |
         |      "sticky_right3"
         |      |
        ---    ---
           ATCCAGT
        CCCTAGG
           ----
             |
           "middle"



           "single_watson" (only an upper strand)
           |
        -------
        ATCCAGT
        |||||||



           "single_crick" (only a lower strand)
           |
        -------

        |||||||
        CCCTAGG


    Up to seven groups (0..6) are captured, but some are mutually exclusive
    which means that one of them is an empty string:

    0 or 1, not both, a DNA fragment has either 5' or 3' sticky end.

    2 or 5 or 6, a DNA molecule has a ds region or is single stranded.

    3 or 4, not both, either 5' or 3' sticky end.

    Note that internal single stranded regions are not identified and will
    be contained in the middle part if they are present.

    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> ds = Dseq("PPPATCFQZ")
    >>> ds
    Dseq(-9)
    GGGATC
       TAGTCA
    >>> parts = ds.get_parts()
    >>> parts
    DseqParts(sticky_left5='PPP', sticky_left3='', middle='ATC', sticky_right3='', sticky_right5='FQZ', single_watson='', single_crick='')
    >>> Dseq(parts.sticky_left5)
    Dseq(-3)
    GGG
    <BLANKLINE>
    >>> Dseq(parts.middle)
    Dseq(-3)
    ATC
    TAG
    >>> Dseq(parts.sticky_right5)
    Dseq(-3)
    <BLANKLINE>
    TCA

    Returns
    -------
    namedtuple
        Seven string fields describing the DNA molecule.
        DseqParts(sticky_left5='', sticky_left3='',
                  middle='',
                  sticky_right3='', sticky_right5='',
                  single_watson='', single_crick='')

    """
    # The module-level get_parts function works on a str, so decode the data.
    dscode = self._data.decode("ascii")
    return get_parts(dscode)