pydna 5.5.3__py3-none-any.whl → 5.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydna/__init__.py +24 -193
- pydna/_pretty.py +8 -8
- pydna/_thermodynamic_data.py +3 -3
- pydna/alphabet.py +995 -0
- pydna/amplicon.py +19 -24
- pydna/amplify.py +75 -95
- pydna/assembly.py +64 -81
- pydna/assembly2.py +650 -405
- pydna/codon.py +4 -4
- pydna/common_sub_strings.py +6 -8
- pydna/contig.py +203 -10
- pydna/design.py +176 -60
- pydna/download.py +6 -15
- pydna/dseq.py +1794 -718
- pydna/dseqrecord.py +220 -171
- pydna/gateway.py +6 -6
- pydna/gel.py +5 -5
- pydna/genbank.py +43 -46
- pydna/genbankfixer.py +89 -92
- pydna/ladders.py +11 -12
- pydna/oligonucleotide_hybridization.py +124 -0
- pydna/opencloning_models.py +680 -0
- pydna/parsers.py +45 -32
- pydna/primer.py +4 -4
- pydna/primer_screen.py +833 -0
- pydna/readers.py +14 -9
- pydna/seq.py +137 -47
- pydna/seqrecord.py +54 -62
- pydna/sequence_picker.py +2 -5
- pydna/sequence_regex.py +6 -6
- pydna/tm.py +17 -17
- pydna/types.py +21 -18
- pydna/utils.py +97 -75
- {pydna-5.5.3.dist-info → pydna-5.5.5.dist-info}/METADATA +14 -46
- pydna-5.5.5.dist-info/RECORD +43 -0
- {pydna-5.5.3.dist-info → pydna-5.5.5.dist-info}/WHEEL +1 -1
- pydna/conftest.py +0 -42
- pydna/genbankfile.py +0 -42
- pydna/genbankrecord.py +0 -168
- pydna/goldengate.py +0 -45
- pydna/ligate.py +0 -62
- pydna/user_cloning.py +0 -29
- pydna-5.5.3.dist-info/RECORD +0 -45
- {pydna-5.5.3.dist-info → pydna-5.5.5.dist-info/licenses}/LICENSE.txt +0 -0
pydna/assembly2.py
CHANGED
|
@@ -4,29 +4,29 @@ Improved implementation of the assembly module. To see a list of issues with the
|
|
|
4
4
|
see [issues tagged with fixed-with-new-assembly-model](https://github.com/pydna-group/pydna/issues?q=is%3Aissue%20state%3Aopen%20label%3Afixed-with-new-assembly-model)
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
import networkx as
|
|
8
|
-
import itertools
|
|
7
|
+
import networkx as nx
|
|
8
|
+
import itertools
|
|
9
9
|
from Bio.SeqFeature import SimpleLocation, Location
|
|
10
|
-
|
|
10
|
+
|
|
11
11
|
from Bio.Restriction.Restriction import RestrictionBatch
|
|
12
12
|
import regex
|
|
13
13
|
import copy
|
|
14
14
|
|
|
15
15
|
from pydna.utils import (
|
|
16
|
-
shift_location
|
|
16
|
+
shift_location,
|
|
17
17
|
flatten,
|
|
18
|
-
location_boundaries
|
|
19
|
-
locations_overlap
|
|
18
|
+
location_boundaries,
|
|
19
|
+
locations_overlap,
|
|
20
20
|
sum_is_sticky,
|
|
21
21
|
limit_iterator,
|
|
22
22
|
create_location,
|
|
23
23
|
)
|
|
24
|
-
from pydna._pretty import pretty_str as
|
|
24
|
+
from pydna._pretty import pretty_str as ps
|
|
25
25
|
from pydna.common_sub_strings import common_sub_strings as common_sub_strings_str
|
|
26
|
-
from pydna.dseqrecord import Dseqrecord
|
|
27
|
-
from pydna.dseq import Dseq
|
|
28
|
-
from pydna.primer import Primer
|
|
29
|
-
from pydna.seqrecord import SeqRecord
|
|
26
|
+
from pydna.dseqrecord import Dseqrecord
|
|
27
|
+
from pydna.dseq import Dseq
|
|
28
|
+
from pydna.primer import Primer
|
|
29
|
+
from pydna.seqrecord import SeqRecord
|
|
30
30
|
from pydna.types import (
|
|
31
31
|
CutSiteType,
|
|
32
32
|
# TODO: allow user to enforce multi-site
|
|
@@ -38,11 +38,29 @@ from pydna.types import (
|
|
|
38
38
|
)
|
|
39
39
|
from pydna.gateway import gateway_overlap, find_gateway_sites
|
|
40
40
|
from pydna.cre_lox import cre_loxP_overlap
|
|
41
|
+
from pydna.alphabet import anneal_strands
|
|
42
|
+
|
|
43
|
+
from typing import TYPE_CHECKING, Callable, Literal
|
|
44
|
+
from pydna.opencloning_models import (
|
|
45
|
+
AssemblySource,
|
|
46
|
+
RestrictionAndLigationSource,
|
|
47
|
+
GibsonAssemblySource,
|
|
48
|
+
InFusionSource,
|
|
49
|
+
OverlapExtensionPCRLigationSource,
|
|
50
|
+
InVivoAssemblySource,
|
|
51
|
+
LigationSource,
|
|
52
|
+
GatewaySource,
|
|
53
|
+
HomologousRecombinationSource,
|
|
54
|
+
CreLoxRecombinationSource,
|
|
55
|
+
PCRSource,
|
|
56
|
+
SourceInput,
|
|
57
|
+
CRISPRSource,
|
|
58
|
+
)
|
|
59
|
+
from pydna.crispr import cas9
|
|
60
|
+
import warnings
|
|
41
61
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
if TYPE_CHECKING:
|
|
45
|
-
from Bio.Restriction import AbstractCut as _AbstractCut
|
|
62
|
+
if TYPE_CHECKING: # pragma: no cover
|
|
63
|
+
from Bio.Restriction import AbstractCut
|
|
46
64
|
|
|
47
65
|
|
|
48
66
|
def gather_overlapping_locations(
|
|
@@ -54,45 +72,52 @@ def gather_overlapping_locations(
|
|
|
54
72
|
the output will be [(loc1, loc2), (loc3,)].
|
|
55
73
|
"""
|
|
56
74
|
# Make a graph with all the locations as nodes
|
|
57
|
-
G =
|
|
75
|
+
G = nx.Graph()
|
|
58
76
|
for i, loc in enumerate(locs):
|
|
59
77
|
G.add_node(i, location=loc)
|
|
60
78
|
|
|
61
79
|
# Add edges between nodes that overlap
|
|
62
80
|
for i in range(len(locs)):
|
|
63
81
|
for j in range(i + 1, len(locs)):
|
|
64
|
-
if
|
|
82
|
+
if locations_overlap(locs[i], locs[j], fragment_length):
|
|
65
83
|
G.add_edge(i, j)
|
|
66
84
|
|
|
67
85
|
# Get groups of overlapping locations
|
|
68
86
|
groups = list()
|
|
69
|
-
for loc_set in
|
|
87
|
+
for loc_set in nx.connected_components(G):
|
|
70
88
|
groups.append(tuple(locs[i] for i in loc_set))
|
|
71
89
|
|
|
72
90
|
# Sort by location of the first element in each group (does not matter which since they are overlapping)
|
|
73
|
-
groups.sort(key=lambda x:
|
|
91
|
+
groups.sort(key=lambda x: location_boundaries(x[0])[0])
|
|
74
92
|
|
|
75
93
|
return groups
|
|
76
94
|
|
|
77
95
|
|
|
78
96
|
def ends_from_cutsite(
|
|
79
|
-
cutsite: CutSiteType, seq:
|
|
97
|
+
cutsite: CutSiteType, seq: Dseq
|
|
80
98
|
) -> tuple[tuple[str, str], tuple[str, str]]:
|
|
81
99
|
"""Get the sticky or blunt ends created by a restriction enzyme cut.
|
|
82
100
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
101
|
+
Parameters
|
|
102
|
+
----------
|
|
103
|
+
cutsite : CutSiteType
|
|
104
|
+
A tuple ((cut_watson, ovhg), enzyme) describing where the cut occurs
|
|
105
|
+
seq : Dseq
|
|
106
|
+
The DNA sequence being cut
|
|
86
107
|
|
|
87
|
-
Raises
|
|
88
|
-
|
|
108
|
+
Raises
|
|
109
|
+
------
|
|
110
|
+
ValueError
|
|
111
|
+
If cutsite is None
|
|
89
112
|
|
|
90
|
-
Returns
|
|
91
|
-
|
|
113
|
+
Returns
|
|
114
|
+
-------
|
|
115
|
+
tuple[tuple[str, str], tuple[str, str]]
|
|
116
|
+
A tuple of two tuples, each containing the type of end ('5\'', '3\'', or 'blunt')
|
|
92
117
|
and the sequence of the overhang. The first tuple is for the left end, second for the right end.
|
|
93
118
|
|
|
94
119
|
>>> from Bio.Restriction import NotI
|
|
95
|
-
>>> x =
|
|
120
|
+
>>> x = Dseq("ctcgGCGGCCGCcagcggccg")
|
|
96
121
|
>>> x.get_cutsites(NotI)
|
|
97
122
|
[((6, -4), NotI)]
|
|
98
123
|
>>> ends_from_cutsite(x.get_cutsites(NotI)[0], x)
|
|
@@ -119,8 +144,8 @@ def ends_from_cutsite(
|
|
|
119
144
|
|
|
120
145
|
|
|
121
146
|
def restriction_ligation_overlap(
|
|
122
|
-
seqx:
|
|
123
|
-
seqy:
|
|
147
|
+
seqx: Dseqrecord,
|
|
148
|
+
seqy: Dseqrecord,
|
|
124
149
|
enzymes=RestrictionBatch,
|
|
125
150
|
partial=False,
|
|
126
151
|
allow_blunt=False,
|
|
@@ -129,14 +154,23 @@ def restriction_ligation_overlap(
|
|
|
129
154
|
|
|
130
155
|
Like in sticky and gibson, the order matters (see example below of partial overlap)
|
|
131
156
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
157
|
+
Parameters
|
|
158
|
+
----------
|
|
159
|
+
seqx : Dseqrecord
|
|
160
|
+
The first sequence
|
|
161
|
+
seqy : Dseqrecord
|
|
162
|
+
The second sequence
|
|
163
|
+
enzymes : RestrictionBatch
|
|
164
|
+
The enzymes to use
|
|
165
|
+
partial : bool
|
|
166
|
+
Whether to allow partial overlaps
|
|
167
|
+
allow_blunt : bool
|
|
168
|
+
Whether to allow blunt ends
|
|
169
|
+
|
|
170
|
+
Returns
|
|
171
|
+
-------
|
|
172
|
+
list[SequenceOverlap]
|
|
173
|
+
A list of overlaps between the two sequences
|
|
140
174
|
|
|
141
175
|
>>> from pydna.dseqrecord import Dseqrecord
|
|
142
176
|
>>> from pydna.assembly2 import restriction_ligation_overlap
|
|
@@ -178,7 +212,7 @@ def restriction_ligation_overlap(
|
|
|
178
212
|
# if not seqy.circular:
|
|
179
213
|
# cuts_y.append(((0, 0), None))
|
|
180
214
|
matches = list()
|
|
181
|
-
for cut_x, cut_y in
|
|
215
|
+
for cut_x, cut_y in itertools.product(cuts_x, cuts_y):
|
|
182
216
|
# A blunt end
|
|
183
217
|
if allow_blunt and cut_x[0][1] == cut_y[0][1] == 0:
|
|
184
218
|
matches.append((cut_x[0][0], cut_y[0][0], 0))
|
|
@@ -222,7 +256,7 @@ def combine_algorithms(*algorithms: AssemblyAlgorithmType) -> AssemblyAlgorithmT
|
|
|
222
256
|
|
|
223
257
|
|
|
224
258
|
def blunt_overlap(
|
|
225
|
-
seqx:
|
|
259
|
+
seqx: Dseqrecord, seqy: Dseqrecord, limit=None
|
|
226
260
|
) -> list[SequenceOverlap]:
|
|
227
261
|
"""
|
|
228
262
|
Assembly algorithm to find blunt overlaps. Used for blunt ligation.
|
|
@@ -230,13 +264,19 @@ def blunt_overlap(
|
|
|
230
264
|
It basically returns [(len(seqx), 0, 0)] if the right end of seqx is blunt and the
|
|
231
265
|
left end of seqy is blunt (compatible with blunt ligation). Otherwise, it returns an empty list.
|
|
232
266
|
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
267
|
+
Parameters
|
|
268
|
+
----------
|
|
269
|
+
seqx : Dseqrecord
|
|
270
|
+
The first sequence
|
|
271
|
+
seqy : Dseqrecord
|
|
272
|
+
The second sequence
|
|
273
|
+
limit : int
|
|
274
|
+
There for compatibility, but it is ignored
|
|
237
275
|
|
|
238
|
-
Returns
|
|
239
|
-
|
|
276
|
+
Returns
|
|
277
|
+
-------
|
|
278
|
+
list[SequenceOverlap]
|
|
279
|
+
A list of overlaps between the two sequences
|
|
240
280
|
|
|
241
281
|
>>> from pydna.assembly2 import blunt_overlap
|
|
242
282
|
>>> from pydna.dseqrecord import Dseqrecord
|
|
@@ -254,7 +294,7 @@ def blunt_overlap(
|
|
|
254
294
|
|
|
255
295
|
|
|
256
296
|
def common_sub_strings(
|
|
257
|
-
seqx:
|
|
297
|
+
seqx: Dseqrecord, seqy: Dseqrecord, limit=25
|
|
258
298
|
) -> list[SequenceOverlap]:
|
|
259
299
|
"""
|
|
260
300
|
Assembly algorithm to find common substrings of length == limit. see the docs of
|
|
@@ -317,30 +357,36 @@ def common_sub_strings(
|
|
|
317
357
|
return [r for r in results if r not in shifted_matches]
|
|
318
358
|
|
|
319
359
|
|
|
320
|
-
def gibson_overlap(seqx:
|
|
360
|
+
def gibson_overlap(seqx: Dseqrecord, seqy: Dseqrecord, limit=25):
|
|
321
361
|
"""
|
|
322
362
|
Assembly algorithm to find terminal overlaps (e.g. for Gibson assembly).
|
|
323
363
|
The order matters, we want alignments like:
|
|
324
364
|
|
|
325
|
-
|
|
326
|
-
seqx: oooo------xxxx
|
|
327
|
-
seqy: xxxx------oooo
|
|
328
|
-
Product: oooo------xxxx------oooo
|
|
365
|
+
::
|
|
329
366
|
|
|
330
|
-
|
|
367
|
+
seqx: oooo------xxxx
|
|
368
|
+
seqy: xxxx------oooo
|
|
369
|
+
Product: oooo------xxxx------oooo
|
|
331
370
|
|
|
332
|
-
|
|
333
|
-
seqy: xxxx------oooo
|
|
334
|
-
Product (unwanted): oooo
|
|
335
|
-
```
|
|
371
|
+
Not like:
|
|
336
372
|
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
373
|
+
seqx: oooo------xxxx
|
|
374
|
+
seqy: xxxx------oooo
|
|
375
|
+
Product (unwanted): oooo
|
|
376
|
+
|
|
377
|
+
Parameters
|
|
378
|
+
----------
|
|
379
|
+
seqx : Dseqrecord
|
|
380
|
+
The first sequence
|
|
381
|
+
seqy : Dseqrecord
|
|
382
|
+
The second sequence
|
|
383
|
+
limit : int
|
|
384
|
+
Minimum length of the overlap
|
|
341
385
|
|
|
342
|
-
Returns
|
|
343
|
-
|
|
386
|
+
Returns
|
|
387
|
+
-------
|
|
388
|
+
list[SequenceOverlap]
|
|
389
|
+
A list of overlaps between the two sequences
|
|
344
390
|
|
|
345
391
|
>>> from pydna.dseqrecord import Dseqrecord
|
|
346
392
|
>>> from pydna.assembly2 import gibson_overlap
|
|
@@ -357,9 +403,9 @@ def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
|
|
|
357
403
|
# This is only relevant for linear fragments, so we don't need to worry about
|
|
358
404
|
# shifting locations for circular fragments.
|
|
359
405
|
trim_x_left = -seqx.seq.ovhg if seqx.seq.ovhg < 0 else 0
|
|
360
|
-
trim_x_right = seqx.seq.watson_ovhg
|
|
406
|
+
trim_x_right = seqx.seq.watson_ovhg if seqx.seq.watson_ovhg < 0 else None
|
|
361
407
|
trim_y_left = -seqy.seq.ovhg if seqy.seq.ovhg < 0 else 0
|
|
362
|
-
trim_y_right = seqy.seq.watson_ovhg
|
|
408
|
+
trim_y_right = seqy.seq.watson_ovhg if seqy.seq.watson_ovhg < 0 else None
|
|
363
409
|
|
|
364
410
|
stringx = str(seqx.seq[trim_x_left:trim_x_right]).upper()
|
|
365
411
|
stringy = str(seqy.seq[trim_y_left:trim_y_right]).upper()
|
|
@@ -377,20 +423,26 @@ def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
|
|
|
377
423
|
return [tuple(m) for m in matches]
|
|
378
424
|
|
|
379
425
|
|
|
380
|
-
def sticky_end_sub_strings(seqx:
|
|
426
|
+
def sticky_end_sub_strings(seqx: Dseqrecord, seqy: Dseqrecord, limit: bool = False):
|
|
381
427
|
"""
|
|
382
428
|
Assembly algorithm for ligation of sticky ends.
|
|
383
429
|
|
|
384
430
|
For now, if limit is 0 / False (default), only full overlaps are considered.
|
|
385
431
|
Otherwise, partial overlaps are also returned.
|
|
386
432
|
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
433
|
+
Parameters
|
|
434
|
+
----------
|
|
435
|
+
seqx : Dseqrecord
|
|
436
|
+
The first sequence
|
|
437
|
+
seqy : Dseqrecord
|
|
438
|
+
The second sequence
|
|
439
|
+
limit : bool
|
|
440
|
+
Whether to allow partial overlaps
|
|
391
441
|
|
|
392
|
-
Returns
|
|
393
|
-
|
|
442
|
+
Returns
|
|
443
|
+
-------
|
|
444
|
+
list[SequenceOverlap]
|
|
445
|
+
A list of overlaps between the two sequences
|
|
394
446
|
|
|
395
447
|
|
|
396
448
|
Ligation of fully overlapping sticky ends, note how the order matters
|
|
@@ -415,6 +467,7 @@ def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit: bool = F
|
|
|
415
467
|
[(4, 0, 2)]
|
|
416
468
|
|
|
417
469
|
"""
|
|
470
|
+
|
|
418
471
|
overlap = sum_is_sticky(
|
|
419
472
|
seqx.seq.three_prime_end(), seqy.seq.five_prime_end(), limit
|
|
420
473
|
)
|
|
@@ -424,7 +477,7 @@ def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit: bool = F
|
|
|
424
477
|
|
|
425
478
|
|
|
426
479
|
def zip_match_leftwards(
|
|
427
|
-
seqx:
|
|
480
|
+
seqx: SeqRecord, seqy: SeqRecord, match: SequenceOverlap
|
|
428
481
|
) -> SequenceOverlap:
|
|
429
482
|
"""
|
|
430
483
|
Starting from the rightmost edge of the match, return a new match encompassing the max
|
|
@@ -432,15 +485,15 @@ def zip_match_leftwards(
|
|
|
432
485
|
than the limit or a shorter match if there are mismatches. This is convenient to maintain
|
|
433
486
|
as many features as possible. It is used in PCR assembly.
|
|
434
487
|
|
|
435
|
-
>>> seq =
|
|
436
|
-
>>> primer =
|
|
488
|
+
>>> seq = Dseqrecord('AAAAACGTCCCGT')
|
|
489
|
+
>>> primer = Dseqrecord('ACGTCCCGT')
|
|
437
490
|
>>> match = (13, 9, 0) # an empty match at the end of each
|
|
438
491
|
>>> zip_match_leftwards(seq, primer, match)
|
|
439
492
|
(4, 0, 9)
|
|
440
493
|
|
|
441
494
|
Works in circular molecules if the match spans the origin:
|
|
442
|
-
>>> seq =
|
|
443
|
-
>>> primer =
|
|
495
|
+
>>> seq = Dseqrecord('TCCCGTAAAAACG', circular=True)
|
|
496
|
+
>>> primer = Dseqrecord('ACGTCCCGT')
|
|
444
497
|
>>> match = (6, 9, 0)
|
|
445
498
|
>>> zip_match_leftwards(seq, primer, match)
|
|
446
499
|
(10, 0, 9)
|
|
@@ -461,11 +514,11 @@ def zip_match_leftwards(
|
|
|
461
514
|
# For those cases we shift by length, then go back
|
|
462
515
|
|
|
463
516
|
end_on_x = match[0] + match[2]
|
|
464
|
-
if isinstance(seqx,
|
|
517
|
+
if isinstance(seqx, Dseqrecord) and seqx.circular and end_on_x <= len(seqx):
|
|
465
518
|
end_on_x += len(seqx)
|
|
466
519
|
|
|
467
520
|
end_on_y = match[1] + match[2]
|
|
468
|
-
if isinstance(seqy,
|
|
521
|
+
if isinstance(seqy, Dseqrecord) and seqy.circular and end_on_y <= len(seqy):
|
|
469
522
|
end_on_y += len(seqy)
|
|
470
523
|
|
|
471
524
|
count = 0
|
|
@@ -482,7 +535,7 @@ def zip_match_leftwards(
|
|
|
482
535
|
|
|
483
536
|
|
|
484
537
|
def zip_match_rightwards(
|
|
485
|
-
seqx:
|
|
538
|
+
seqx: Dseqrecord, seqy: Dseqrecord, match: SequenceOverlap
|
|
486
539
|
) -> SequenceOverlap:
|
|
487
540
|
"""Same as zip_match_leftwards, but towards the right."""
|
|
488
541
|
|
|
@@ -498,19 +551,19 @@ def zip_match_rightwards(
|
|
|
498
551
|
return (start_on_x, start_on_y, count)
|
|
499
552
|
|
|
500
553
|
|
|
501
|
-
def seqrecord2_uppercase_DNA_string(seqr:
|
|
554
|
+
def seqrecord2_uppercase_DNA_string(seqr: SeqRecord) -> str:
|
|
502
555
|
"""
|
|
503
556
|
Transform a Dseqrecord to a sequence string where U is replaced by T, everything is upper case and
|
|
504
557
|
circular sequences are repeated twice. This is used for PCR, to support primers with U's (e.g. for USER cloning).
|
|
505
558
|
"""
|
|
506
559
|
out = str(seqr.seq).upper().replace("U", "T")
|
|
507
|
-
if isinstance(seqr,
|
|
560
|
+
if isinstance(seqr, Dseqrecord) and seqr.circular:
|
|
508
561
|
return out * 2
|
|
509
562
|
return out
|
|
510
563
|
|
|
511
564
|
|
|
512
565
|
def primer_template_overlap(
|
|
513
|
-
seqx:
|
|
566
|
+
seqx: Dseqrecord | Primer, seqy: Dseqrecord | Primer, limit=25, mismatches=0
|
|
514
567
|
) -> list[SequenceOverlap]:
|
|
515
568
|
"""
|
|
516
569
|
Assembly algorithm to find overlaps between a primer and a template. It accepts mismatches.
|
|
@@ -520,14 +573,21 @@ def primer_template_overlap(
|
|
|
520
573
|
If seqx is a template and seqy is a primer, it represents the binding of a reverse primer,
|
|
521
574
|
where the primer has been passed as its reverse complement (see examples).
|
|
522
575
|
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
576
|
+
Parameters
|
|
577
|
+
----------
|
|
578
|
+
seqx : Dseqrecord | Primer
|
|
579
|
+
The primer, or the template when seqy is the primer (reverse primer binding)
|
|
580
|
+
seqy : Dseqrecord | Primer
|
|
581
|
+
The template, or the primer when seqx is the template (reverse primer binding)
|
|
582
|
+
limit : int
|
|
583
|
+
Minimum length of the overlap
|
|
584
|
+
mismatches : int
|
|
585
|
+
Maximum number of mismatches (only substitutions, no deletion or insertion)
|
|
528
586
|
|
|
529
|
-
Returns
|
|
530
|
-
|
|
587
|
+
Returns
|
|
588
|
+
-------
|
|
589
|
+
list[SequenceOverlap]
|
|
590
|
+
A list of overlaps between the primer and the template
|
|
531
591
|
|
|
532
592
|
>>> from pydna.dseqrecord import Dseqrecord
|
|
533
593
|
>>> from pydna.primer import Primer
|
|
@@ -537,7 +597,7 @@ def primer_template_overlap(
|
|
|
537
597
|
>>> primer_template_overlap(primer, template, limit=8, mismatches=0)
|
|
538
598
|
[(0, 2, 8)]
|
|
539
599
|
|
|
540
|
-
This actually represents the binding of the primer
|
|
600
|
+
This actually represents the binding of the primer ``GCTGCTAA`` (reverse complement)
|
|
541
601
|
>>> primer_template_overlap(template, primer, limit=8, mismatches=0)
|
|
542
602
|
[(2, 0, 8)]
|
|
543
603
|
>>> primer_template_overlap(primer, template.reverse_complement(), limit=8, mismatches=0)
|
|
@@ -546,11 +606,11 @@ def primer_template_overlap(
|
|
|
546
606
|
[]
|
|
547
607
|
"""
|
|
548
608
|
|
|
549
|
-
if isinstance(seqx,
|
|
609
|
+
if isinstance(seqx, Primer) and isinstance(seqy, Dseqrecord):
|
|
550
610
|
primer = seqx
|
|
551
611
|
template = seqy
|
|
552
612
|
reverse_primer = False
|
|
553
|
-
elif isinstance(seqx,
|
|
613
|
+
elif isinstance(seqx, Dseqrecord) and isinstance(seqy, Primer):
|
|
554
614
|
primer = seqy
|
|
555
615
|
template = seqx
|
|
556
616
|
reverse_primer = True
|
|
@@ -604,45 +664,8 @@ def primer_template_overlap(
|
|
|
604
664
|
return list(sorted(out))
|
|
605
665
|
|
|
606
666
|
|
|
607
|
-
def fill_left(seq: _Dseq) -> _Dseq:
|
|
608
|
-
"""Fill the left overhang of a sequence with the complementary sequence."""
|
|
609
|
-
new_watson = seq.watson
|
|
610
|
-
new_crick = seq.crick
|
|
611
|
-
|
|
612
|
-
# Watson 5' overhang
|
|
613
|
-
if seq.ovhg < 0:
|
|
614
|
-
new_crick = new_crick + reverse_complement(seq.watson[: -seq.ovhg])
|
|
615
|
-
# Crick 5' overhang
|
|
616
|
-
elif seq.ovhg > 0:
|
|
617
|
-
new_watson = reverse_complement(seq.crick[-seq.ovhg :]) + new_watson
|
|
618
|
-
|
|
619
|
-
return _Dseq(new_watson, new_crick, 0)
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
def fill_right(seq: _Dseq) -> _Dseq:
|
|
623
|
-
"""Fill the right overhang of a sequence with the complementary sequence."""
|
|
624
|
-
new_watson = seq.watson
|
|
625
|
-
new_crick = seq.crick
|
|
626
|
-
|
|
627
|
-
# Watson 3' overhang
|
|
628
|
-
watson_ovhg = seq.watson_ovhg()
|
|
629
|
-
if watson_ovhg < 0:
|
|
630
|
-
new_watson = new_watson + reverse_complement(seq.crick[:-watson_ovhg])
|
|
631
|
-
|
|
632
|
-
# Crick 3' overhang
|
|
633
|
-
elif watson_ovhg > 0:
|
|
634
|
-
new_crick = reverse_complement(seq.watson[-watson_ovhg:]) + new_crick
|
|
635
|
-
|
|
636
|
-
return _Dseq(new_watson, new_crick, seq.ovhg)
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
def fill_dseq(seq: _Dseq) -> _Dseq:
|
|
640
|
-
"""Fill the overhangs of a sequence with the complementary sequence."""
|
|
641
|
-
return fill_left(fill_right(seq))
|
|
642
|
-
|
|
643
|
-
|
|
644
667
|
def reverse_complement_assembly(
|
|
645
|
-
assembly: EdgeRepresentationAssembly, fragments: list[
|
|
668
|
+
assembly: EdgeRepresentationAssembly, fragments: list[Dseqrecord]
|
|
646
669
|
) -> EdgeRepresentationAssembly:
|
|
647
670
|
"""Complement an assembly, i.e. reverse the order of the fragments and the orientation of the overlaps."""
|
|
648
671
|
new_assembly = list()
|
|
@@ -656,7 +679,7 @@ def reverse_complement_assembly(
|
|
|
656
679
|
def filter_linear_subassemblies(
|
|
657
680
|
linear_assemblies: list[EdgeRepresentationAssembly],
|
|
658
681
|
circular_assemblies: list[EdgeRepresentationAssembly],
|
|
659
|
-
fragments: list[
|
|
682
|
+
fragments: list[Dseqrecord],
|
|
660
683
|
) -> list[EdgeRepresentationAssembly]:
|
|
661
684
|
"""Remove linear assemblies which are sub-assemblies of circular assemblies"""
|
|
662
685
|
all_circular_assemblies = circular_assemblies + [
|
|
@@ -702,7 +725,7 @@ def assembly2str(assembly: EdgeRepresentationAssembly) -> str:
|
|
|
702
725
|
('1[8:14]:2[1:7]', '2[10:17]:3[1:8]')
|
|
703
726
|
|
|
704
727
|
The reason for this is that by default, a feature '[8:14]' when present in a tuple
|
|
705
|
-
is printed to the console as
|
|
728
|
+
is printed to the console as ``SimpleLocation(ExactPosition(8), ExactPosition(14), strand=1)`` (very long).
|
|
706
729
|
"""
|
|
707
730
|
return str(tuple(f"{u}{lu}:{v}{lv}" for u, v, lu, lv in assembly))
|
|
708
731
|
|
|
@@ -715,7 +738,7 @@ def assembly2str_tuple(assembly: EdgeRepresentationAssembly) -> str:
|
|
|
715
738
|
|
|
716
739
|
|
|
717
740
|
def assembly_has_mismatches(
|
|
718
|
-
fragments: list[
|
|
741
|
+
fragments: list[Dseqrecord], assembly: EdgeRepresentationAssembly
|
|
719
742
|
) -> bool:
|
|
720
743
|
"""Check if an assembly has mismatches. This should never happen and if so it returns an error."""
|
|
721
744
|
for u, v, loc_u, loc_v in assembly:
|
|
@@ -731,7 +754,7 @@ def assembly_has_mismatches(
|
|
|
731
754
|
|
|
732
755
|
|
|
733
756
|
def assembly_is_circular(
|
|
734
|
-
assembly: EdgeRepresentationAssembly, fragments: list[
|
|
757
|
+
assembly: EdgeRepresentationAssembly, fragments: list[Dseqrecord]
|
|
735
758
|
) -> bool:
|
|
736
759
|
"""
|
|
737
760
|
Based on the topology of the locations of an assembly, determine if it is circular.
|
|
@@ -740,22 +763,22 @@ def assembly_is_circular(
|
|
|
740
763
|
if assembly[0][0] != assembly[-1][1]:
|
|
741
764
|
return False
|
|
742
765
|
elif (
|
|
743
|
-
isinstance(fragments[abs(assembly[0][0]) - 1],
|
|
766
|
+
isinstance(fragments[abs(assembly[0][0]) - 1], Dseqrecord)
|
|
744
767
|
and fragments[abs(assembly[0][0]) - 1].circular
|
|
745
768
|
):
|
|
746
769
|
return True
|
|
747
770
|
else:
|
|
748
771
|
return (
|
|
749
|
-
|
|
750
|
-
>
|
|
772
|
+
location_boundaries(assembly[0][2])[0]
|
|
773
|
+
> location_boundaries(assembly[-1][3])[0]
|
|
751
774
|
)
|
|
752
775
|
|
|
753
776
|
|
|
754
777
|
def assemble(
|
|
755
|
-
fragments: list[
|
|
778
|
+
fragments: list[Dseqrecord],
|
|
756
779
|
assembly: EdgeRepresentationAssembly,
|
|
757
780
|
is_insertion: bool = False,
|
|
758
|
-
) ->
|
|
781
|
+
) -> Dseqrecord:
|
|
759
782
|
"""Generate a Dseqrecord from an assembly and a list of fragments."""
|
|
760
783
|
|
|
761
784
|
if is_insertion:
|
|
@@ -772,14 +795,15 @@ def assemble(
|
|
|
772
795
|
u, v, loc_u, loc_v = asm_edge
|
|
773
796
|
f_u = fragments[u - 1] if u > 0 else fragments[-u - 1].reverse_complement()
|
|
774
797
|
f_v = fragments[v - 1] if v > 0 else fragments[-v - 1].reverse_complement()
|
|
775
|
-
seq_u = str(loc_u.extract(f_u).seq)
|
|
776
|
-
seq_v = str(loc_v.extract(f_v).seq
|
|
777
|
-
if seq_u
|
|
798
|
+
seq_u = str(loc_u.extract(f_u).seq)
|
|
799
|
+
seq_v = str(loc_v.extract(f_v).seq.rc())
|
|
800
|
+
# Test if seq_u and seq_v anneal
|
|
801
|
+
if not anneal_strands(seq_u, seq_v):
|
|
778
802
|
raise ValueError("Mismatch in assembly")
|
|
779
803
|
|
|
780
804
|
# We transform into Dseqrecords (for primers)
|
|
781
805
|
dseqr_fragments = [
|
|
782
|
-
f if isinstance(f,
|
|
806
|
+
f if isinstance(f, Dseqrecord) else Dseqrecord(f) for f in fragments
|
|
783
807
|
]
|
|
784
808
|
subfragments = get_assembly_subfragments(
|
|
785
809
|
dseqr_fragments, subfragment_representation
|
|
@@ -787,49 +811,33 @@ def assemble(
|
|
|
787
811
|
|
|
788
812
|
# Length of the overlaps between consecutive assembly fragments
|
|
789
813
|
fragment_overlaps = [len(e[-1]) for e in assembly]
|
|
814
|
+
out_dseqrecord = subfragments.pop(0)
|
|
790
815
|
|
|
791
|
-
|
|
816
|
+
for fragment, overlap in zip(subfragments, fragment_overlaps):
|
|
817
|
+
out_dseqrecord.seq = out_dseqrecord.seq.cast_to_ds_right()
|
|
818
|
+
out_dseqrecord.seq = out_dseqrecord.seq.exo1_end(overlap)
|
|
819
|
+
fragment.seq = fragment.seq.cast_to_ds_left()
|
|
820
|
+
fragment.seq = fragment.seq.exo1_front(overlap)
|
|
821
|
+
out_dseqrecord += fragment
|
|
792
822
|
|
|
793
|
-
|
|
794
|
-
# Shift the features of the right fragment to the left by `overlap`
|
|
795
|
-
new_features = [
|
|
796
|
-
f._shift(len(out_dseqrecord) - overlap) for f in fragment.features
|
|
797
|
-
]
|
|
798
|
-
# Join the left sequence including the overlap with the right sequence without the overlap
|
|
799
|
-
# we use fill_right / fill_left so that it works for ligation of sticky ends
|
|
800
|
-
out_dseqrecord = _Dseqrecord(
|
|
801
|
-
fill_right(out_dseqrecord.seq) + fill_left(fragment.seq)[overlap:],
|
|
802
|
-
features=out_dseqrecord.features + new_features,
|
|
803
|
-
)
|
|
804
|
-
|
|
805
|
-
# For circular assemblies, close the loop and wrap origin-spanning features
|
|
823
|
+
# For circular assemblies, process the fragment and loop
|
|
806
824
|
if is_circular:
|
|
825
|
+
out_dseqrecord.seq = out_dseqrecord.seq.cast_to_ds_left()
|
|
826
|
+
out_dseqrecord.seq = out_dseqrecord.seq.cast_to_ds_right()
|
|
807
827
|
overlap = fragment_overlaps[-1]
|
|
828
|
+
out_dseqrecord.seq = out_dseqrecord.seq.exo1_front(overlap)
|
|
829
|
+
out_dseqrecord.seq = out_dseqrecord.seq.exo1_end(overlap)
|
|
830
|
+
out_dseqrecord = out_dseqrecord.looped()
|
|
808
831
|
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
# Remove trailing overlap
|
|
814
|
-
out_dseqrecord = _Dseqrecord(
|
|
815
|
-
fill_dseq(out_dseqrecord.seq)[:-overlap],
|
|
816
|
-
features=out_dseqrecord.features,
|
|
817
|
-
circular=True,
|
|
818
|
-
)
|
|
819
|
-
for feature in out_dseqrecord.features:
|
|
820
|
-
start, end = _location_boundaries(feature.location)
|
|
821
|
-
if start >= len(out_dseqrecord) or end > len(out_dseqrecord):
|
|
822
|
-
# Wrap around the origin
|
|
823
|
-
feature.location = _shift_location(
|
|
824
|
-
feature.location, 0, len(out_dseqrecord)
|
|
825
|
-
)
|
|
826
|
-
|
|
832
|
+
out_dseqrecord.source = AssemblySource.from_subfragment_representation(
|
|
833
|
+
subfragment_representation, fragments, is_circular
|
|
834
|
+
)
|
|
827
835
|
return out_dseqrecord
|
|
828
836
|
|
|
829
837
|
|
|
830
838
|
def annotate_primer_binding_sites(
|
|
831
|
-
input_dseqr:
|
|
832
|
-
) ->
|
|
839
|
+
input_dseqr: Dseqrecord, fragments: list[Dseqrecord]
|
|
840
|
+
) -> Dseqrecord:
|
|
833
841
|
"""Annotate the primer binding sites in a Dseqrecord."""
|
|
834
842
|
fwd, _, rvs = fragments
|
|
835
843
|
start_rvs = len(input_dseqr) - len(rvs)
|
|
@@ -909,37 +917,36 @@ def subfragment_representation2edge_representation(
|
|
|
909
917
|
|
|
910
918
|
|
|
911
919
|
def get_assembly_subfragments(
|
|
912
|
-
fragments: list[
|
|
920
|
+
fragments: list[Dseqrecord],
|
|
913
921
|
subfragment_representation: SubFragmentRepresentationAssembly,
|
|
914
|
-
) -> list[
|
|
922
|
+
) -> list[Dseqrecord]:
|
|
915
923
|
"""From the fragment representation returned by edge_representation2subfragment_representation, get the subfragments that are joined together.
|
|
916
924
|
|
|
917
925
|
Subfragments are the slices of the fragments that are joined together
|
|
918
926
|
|
|
919
|
-
For example
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
To reproduce
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
Subfragments: `cccccgtatcgtgt`, `atcgtgtactgtcatattc`
|
|
927
|
+
For example::
|
|
928
|
+
|
|
929
|
+
--A--
|
|
930
|
+
TACGTAAT
|
|
931
|
+
--B--
|
|
932
|
+
TCGTAACGA
|
|
933
|
+
|
|
934
|
+
Gives: TACGTAA / CGTAACGA
|
|
935
|
+
|
|
936
|
+
To reproduce::
|
|
937
|
+
|
|
938
|
+
a = Dseqrecord('TACGTAAT')
|
|
939
|
+
b = Dseqrecord('TCGTAACGA')
|
|
940
|
+
f = Assembly([a, b], limit=5)
|
|
941
|
+
a0 = f.get_linear_assemblies()[0]
|
|
942
|
+
print(assembly2str(a0))
|
|
943
|
+
a0_subfragment_rep = edge_representation2subfragment_representation(a0, False)
|
|
944
|
+
for f in get_assembly_subfragments([a, b], a0_subfragment_rep):
|
|
945
|
+
print(f.seq)
|
|
946
|
+
|
|
947
|
+
# prints TACGTAA and CGTAACGA
|
|
948
|
+
|
|
949
|
+
Subfragments: ``TACGTAA``, ``CGTAACGA``
|
|
943
950
|
"""
|
|
944
951
|
subfragments = list()
|
|
945
952
|
for node, start_location, end_location in subfragment_representation:
|
|
@@ -953,19 +960,26 @@ def get_assembly_subfragments(
|
|
|
953
960
|
|
|
954
961
|
|
|
955
962
|
def extract_subfragment(
|
|
956
|
-
seq:
|
|
957
|
-
) ->
|
|
963
|
+
seq: Dseqrecord, start_location: Location | None, end_location: Location | None
|
|
964
|
+
) -> Dseqrecord:
|
|
958
965
|
"""Extract a subfragment from a sequence for an assembly, given the start and end locations of the subfragment."""
|
|
959
|
-
|
|
960
|
-
|
|
966
|
+
|
|
967
|
+
if seq.circular and (start_location is None or end_location is None):
|
|
968
|
+
raise ValueError(
|
|
969
|
+
"Start and end locations cannot be None for circular sequences"
|
|
970
|
+
)
|
|
971
|
+
# This could be used to have consistent behaviour for circular sequences, where the start is arbitrary. However,
|
|
972
|
+
# they should never get None, so this is not used.
|
|
973
|
+
# if start_location is None:
|
|
974
|
+
# start_location = end_location
|
|
975
|
+
# elif end_location is None:
|
|
976
|
+
# end_location = start_location
|
|
977
|
+
|
|
978
|
+
start = 0 if start_location is None else location_boundaries(start_location)[0]
|
|
979
|
+
end = None if end_location is None else location_boundaries(end_location)[1]
|
|
961
980
|
|
|
962
981
|
# Special case, some of it could be handled by better Dseqrecord slicing in the future
|
|
963
|
-
if (
|
|
964
|
-
seq.circular
|
|
965
|
-
and start_location is not None
|
|
966
|
-
and end_location is not None
|
|
967
|
-
and _locations_overlap(start_location, end_location, len(seq))
|
|
968
|
-
):
|
|
982
|
+
if seq.circular and locations_overlap(start_location, end_location, len(seq)):
|
|
969
983
|
# The overhang is different for origin-spanning features, for instance
|
|
970
984
|
# for a feature join{[12:13], [0:3]} in a sequence of length 13, the overhang
|
|
971
985
|
# is -4, not 9
|
|
@@ -975,7 +989,7 @@ def extract_subfragment(
|
|
|
975
989
|
ovhg = 0
|
|
976
990
|
dummy_cut = ((start, ovhg), None)
|
|
977
991
|
open_seq = seq.apply_cut(dummy_cut, dummy_cut)
|
|
978
|
-
return
|
|
992
|
+
return Dseqrecord(open_seq.seq.cast_to_ds(), features=open_seq.features)
|
|
979
993
|
|
|
980
994
|
return seq[start:end]
|
|
981
995
|
|
|
@@ -1028,33 +1042,38 @@ class Assembly:
|
|
|
1028
1042
|
|
|
1029
1043
|
The assembly contains a directed graph, where nodes represent fragments and
|
|
1030
1044
|
edges represent overlaps between fragments. :
|
|
1045
|
+
|
|
1031
1046
|
- The node keys are integers, representing the index of the fragment in the
|
|
1032
|
-
|
|
1033
|
-
|
|
1047
|
+
input list of fragments. The sign of the node key represents the orientation
|
|
1048
|
+
of the fragment, positive for forward orientation, negative for reverse orientation.
|
|
1034
1049
|
- The edges contain the locations of the overlaps in the fragments. For an edge (u, v, key):
|
|
1035
1050
|
- u and v are the nodes connected by the edge.
|
|
1036
1051
|
- key is a string that represents the location of the overlap. In the format:
|
|
1037
|
-
|
|
1052
|
+
'u[start:end](strand):v[start:end](strand)'.
|
|
1038
1053
|
- Edges have a 'locations' attribute, which is a list of two FeatureLocation objects,
|
|
1039
|
-
|
|
1054
|
+
representing the location of the overlap in the u and v fragment, respectively.
|
|
1040
1055
|
- You can think of an edge as a representation of the join of two fragments.
|
|
1041
1056
|
|
|
1042
1057
|
If fragment 1 and 2 share a subsequence of 6bp, [8:14] in fragment 1 and [1:7] in fragment 2,
|
|
1043
1058
|
there will be 4 edges representing that overlap in the graph, for all possible
|
|
1044
1059
|
orientations of the fragments (see add_edges_from_match for details):
|
|
1045
|
-
|
|
1046
|
-
-
|
|
1047
|
-
-
|
|
1048
|
-
-
|
|
1060
|
+
|
|
1061
|
+
- ``(1, 2, '1[8:14]:2[1:7]')``
|
|
1062
|
+
- ``(2, 1, '2[1:7]:1[8:14]')``
|
|
1063
|
+
- ``(-1, -2, '-1[0:6]:-2[10:16]')``
|
|
1064
|
+
- ``(-2, -1, '-2[10:16]:-1[0:6]')``
|
|
1049
1065
|
|
|
1050
1066
|
An assembly can be thought of as a tuple of graph edges, but instead of representing them with node indexes and keys, we represent them
|
|
1051
1067
|
as u, v, locu, locv, where u and v are the nodes connected by the edge, and locu and locv are the locations of the overlap in the first
|
|
1052
1068
|
and second fragment. Assemblies are then represented as:
|
|
1069
|
+
|
|
1053
1070
|
- Linear: ((1, 2, [8:14], [1:7]), (2, 3, [10:17], [1:8]))
|
|
1054
1071
|
- Circular: ((1, 2, [8:14], [1:7]), (2, 3, [10:17], [1:8]), (3, 1, [12:17], [1:6]))
|
|
1072
|
+
|
|
1055
1073
|
Note that the first and last fragment are the same in a circular assembly.
|
|
1056
1074
|
|
|
1057
1075
|
The following constrains are applied to remove duplicate assemblies:
|
|
1076
|
+
|
|
1058
1077
|
- Circular assemblies: the first subfragment is not reversed, and has the smallest index in the input fragment list.
|
|
1059
1078
|
use_fragment_order is ignored.
|
|
1060
1079
|
- Linear assemblies:
|
|
@@ -1065,7 +1084,7 @@ class Assembly:
|
|
|
1065
1084
|
frags : list
|
|
1066
1085
|
A list of Dseqrecord objects.
|
|
1067
1086
|
limit : int, optional
|
|
1068
|
-
The shortest shared homology to be considered, this is passed as the third argument to the
|
|
1087
|
+
The shortest shared homology to be considered, this is passed as the third argument to the ``algorithm`` function.
|
|
1069
1088
|
For certain algorithms, this might be ignored.
|
|
1070
1089
|
algorithm : function, optional
|
|
1071
1090
|
The algorithm used to determine the shared sequences. It's a function that takes two Dseqrecord objects as inputs,
|
|
@@ -1113,14 +1132,15 @@ class Assembly:
|
|
|
1113
1132
|
|
|
1114
1133
|
def __init__(
|
|
1115
1134
|
self,
|
|
1116
|
-
frags: list[
|
|
1135
|
+
frags: list[Dseqrecord],
|
|
1117
1136
|
limit: int = 25,
|
|
1118
1137
|
algorithm: AssemblyAlgorithmType = common_sub_strings,
|
|
1119
1138
|
use_fragment_order: bool = True,
|
|
1120
1139
|
use_all_fragments: bool = False,
|
|
1121
1140
|
):
|
|
1141
|
+
|
|
1122
1142
|
# TODO: allow for the same fragment to be included more than once?
|
|
1123
|
-
self.G =
|
|
1143
|
+
self.G = nx.MultiDiGraph()
|
|
1124
1144
|
# Add positive and negative nodes for forward and reverse fragments
|
|
1125
1145
|
self.G.add_nodes_from((i + 1, {"seq": f}) for (i, f) in enumerate(frags))
|
|
1126
1146
|
self.G.add_nodes_from(
|
|
@@ -1128,12 +1148,12 @@ class Assembly:
|
|
|
1128
1148
|
)
|
|
1129
1149
|
|
|
1130
1150
|
# Iterate over all possible combinations of fragments
|
|
1131
|
-
fragment_pairs =
|
|
1151
|
+
fragment_pairs = itertools.combinations(
|
|
1132
1152
|
filter(lambda x: x > 0, self.G.nodes), 2
|
|
1133
1153
|
)
|
|
1134
1154
|
for i, j in fragment_pairs:
|
|
1135
1155
|
# All the relative orientations of the fragments in the pair
|
|
1136
|
-
for u, v in
|
|
1156
|
+
for u, v in itertools.product([i, -i], [j, -j]):
|
|
1137
1157
|
u_seq = self.G.nodes[u]["seq"]
|
|
1138
1158
|
v_seq = self.G.nodes[v]["seq"]
|
|
1139
1159
|
matches = algorithm(u_seq, v_seq, limit)
|
|
@@ -1151,7 +1171,7 @@ class Assembly:
|
|
|
1151
1171
|
@classmethod
|
|
1152
1172
|
def assembly_is_valid(
|
|
1153
1173
|
cls,
|
|
1154
|
-
fragments: list[
|
|
1174
|
+
fragments: list[Dseqrecord | Primer],
|
|
1155
1175
|
assembly: EdgeRepresentationAssembly,
|
|
1156
1176
|
is_circular: bool,
|
|
1157
1177
|
use_all_fragments: bool,
|
|
@@ -1167,6 +1187,23 @@ class Assembly:
|
|
|
1167
1187
|
if len(assembly) == 0:
|
|
1168
1188
|
return False
|
|
1169
1189
|
|
|
1190
|
+
# Topology check -> Circular sequences cannot be first or last in a linear assembly.
|
|
1191
|
+
# For example, let's imagine aACGTc (linear) and gACGTc (circular).
|
|
1192
|
+
# It should not be possible to join them into a linear assembly. It's similar if we
|
|
1193
|
+
# think of a restriction-ligation assembly, example: aGAATTCc (linear) and gGAATTCc
|
|
1194
|
+
# (circular).
|
|
1195
|
+
# A linear product can be generated where the circular molecule is cut open, and one end
|
|
1196
|
+
# it joins the linear molecule and on the other it's free, but for now it's not a
|
|
1197
|
+
# relevant product and it's excluded.
|
|
1198
|
+
first_fragment = fragments[abs(assembly[0][0]) - 1]
|
|
1199
|
+
last_fragment = fragments[abs(assembly[-1][1]) - 1]
|
|
1200
|
+
if not is_circular and (
|
|
1201
|
+
isinstance(first_fragment, Dseqrecord)
|
|
1202
|
+
and first_fragment.circular
|
|
1203
|
+
or (isinstance(last_fragment, Dseqrecord) and last_fragment.circular)
|
|
1204
|
+
):
|
|
1205
|
+
return False
|
|
1206
|
+
|
|
1170
1207
|
if use_all_fragments and len(fragments) != len(
|
|
1171
1208
|
set(flatten(map(abs, e[:2]) for e in assembly))
|
|
1172
1209
|
):
|
|
@@ -1204,8 +1241,8 @@ class Assembly:
|
|
|
1204
1241
|
# Incompatible as described in figure above
|
|
1205
1242
|
fragment = fragments[abs(v1) - 1]
|
|
1206
1243
|
if (
|
|
1207
|
-
isinstance(fragment,
|
|
1208
|
-
) and
|
|
1244
|
+
isinstance(fragment, Primer) or not fragment.circular
|
|
1245
|
+
) and location_boundaries(start_location)[1] >= location_boundaries(
|
|
1209
1246
|
end_location
|
|
1210
1247
|
)[
|
|
1211
1248
|
1
|
|
@@ -1229,14 +1266,15 @@ class Assembly:
|
|
|
1229
1266
|
match: SequenceOverlap,
|
|
1230
1267
|
u: int,
|
|
1231
1268
|
v: int,
|
|
1232
|
-
first:
|
|
1233
|
-
secnd:
|
|
1269
|
+
first: Dseqrecord,
|
|
1270
|
+
secnd: Dseqrecord,
|
|
1234
1271
|
):
|
|
1235
|
-
"""Add edges to the graph from a match returned by the
|
|
1272
|
+
"""Add edges to the graph from a match returned by the ``algorithm`` function (see pydna.common_substrings). For
|
|
1236
1273
|
format of edges (see documentation of the Assembly class).
|
|
1237
1274
|
|
|
1238
|
-
Matches are directional, because not all
|
|
1275
|
+
Matches are directional, because not all ``algorithm`` functions return the same match for (u,v) and (v,u). For example,
|
|
1239
1276
|
homologous recombination does but sticky end ligation does not. The function returns two edges:
|
|
1277
|
+
|
|
1240
1278
|
- Fragments in the orientation they were passed, with locations of the match (u, v, loc_u, loc_v)
|
|
1241
1279
|
- Reverse complement of the fragments with inverted order, with flipped locations (-v, -u, flip(loc_v), flip(loc_u))/
|
|
1242
1280
|
|
|
@@ -1248,10 +1286,10 @@ class Assembly:
|
|
|
1248
1286
|
else:
|
|
1249
1287
|
# We use shift_location with 0 to wrap origin-spanning features
|
|
1250
1288
|
locs = [
|
|
1251
|
-
|
|
1289
|
+
shift_location(
|
|
1252
1290
|
SimpleLocation(x_start, x_start + length), 0, len(first)
|
|
1253
1291
|
),
|
|
1254
|
-
|
|
1292
|
+
shift_location(
|
|
1255
1293
|
SimpleLocation(y_start, y_start + length), 0, len(secnd)
|
|
1256
1294
|
),
|
|
1257
1295
|
]
|
|
@@ -1286,7 +1324,7 @@ class Assembly:
|
|
|
1286
1324
|
"""
|
|
1287
1325
|
|
|
1288
1326
|
# Copy the graph since we will add the begin and end mock nodes
|
|
1289
|
-
G =
|
|
1327
|
+
G = nx.MultiDiGraph(self.G)
|
|
1290
1328
|
G.add_nodes_from(["begin", "end"])
|
|
1291
1329
|
|
|
1292
1330
|
if self.use_fragment_order:
|
|
@@ -1324,7 +1362,7 @@ class Assembly:
|
|
|
1324
1362
|
def node_path2assembly_list(
|
|
1325
1363
|
self, cycle: list[int], circular: bool
|
|
1326
1364
|
) -> list[EdgeRepresentationAssembly]:
|
|
1327
|
-
"""Convert a node path in the format [1, 2, 3] (as returned by
|
|
1365
|
+
"""Convert a node path in the format [1, 2, 3] (as returned by networkx.cycles.simple_cycles) to a list of all
|
|
1328
1366
|
possible assemblies.
|
|
1329
1367
|
|
|
1330
1368
|
There may be multiple assemblies for a given node path, if there are several edges connecting two nodes,
|
|
@@ -1338,11 +1376,11 @@ class Assembly:
|
|
|
1338
1376
|
combine.append([(u, v, key) for key in self.G[u][v]])
|
|
1339
1377
|
return [
|
|
1340
1378
|
tuple(map(self.format_assembly_edge, x))
|
|
1341
|
-
for x in
|
|
1379
|
+
for x in itertools.product(*combine)
|
|
1342
1380
|
]
|
|
1343
1381
|
|
|
1344
1382
|
def get_unique_linear_paths(
|
|
1345
|
-
self, G_with_begin_end:
|
|
1383
|
+
self, G_with_begin_end: nx.MultiDiGraph, max_paths=10000
|
|
1346
1384
|
) -> list[list[int]]:
|
|
1347
1385
|
"""Get unique linear paths from the graph, removing those that contain the same node twice."""
|
|
1348
1386
|
# We remove the begin and end nodes, and get all paths without edges
|
|
@@ -1353,8 +1391,8 @@ class Assembly:
|
|
|
1353
1391
|
node_paths = [
|
|
1354
1392
|
x[1:-1]
|
|
1355
1393
|
for x in limit_iterator(
|
|
1356
|
-
|
|
1357
|
-
|
|
1394
|
+
nx.all_simple_paths(
|
|
1395
|
+
nx.DiGraph(G_with_begin_end),
|
|
1358
1396
|
"begin",
|
|
1359
1397
|
"end",
|
|
1360
1398
|
cutoff=(len(self.fragments) + 1),
|
|
@@ -1403,7 +1441,7 @@ class Assembly:
|
|
|
1403
1441
|
sorted_cycles = map(
|
|
1404
1442
|
circular_permutation_min_abs,
|
|
1405
1443
|
limit_iterator(
|
|
1406
|
-
|
|
1444
|
+
nx.cycles.simple_cycles(self.G, length_bound=len(self.fragments)),
|
|
1407
1445
|
10000,
|
|
1408
1446
|
),
|
|
1409
1447
|
)
|
|
@@ -1446,17 +1484,18 @@ class Assembly:
|
|
|
1446
1484
|
Here we check if one of the joins between fragments represents the edges of an insertion assembly
|
|
1447
1485
|
The fragment must be linear, and the join must be as indicated below
|
|
1448
1486
|
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1487
|
+
::
|
|
1488
|
+
|
|
1489
|
+
-------- ------- Fragment 1
|
|
1490
|
+
|| ||
|
|
1491
|
+
xxxxxxxx || Fragment 2
|
|
1492
|
+
|| ||
|
|
1493
|
+
oooooooooo Fragment 3
|
|
1494
|
+
|
|
1456
1495
|
The above example will be [(1, 2, [4:6], [0:2]), (2, 3, [6:8], [0:2]), (3, 1, [8:10], [9:11)])]
|
|
1457
1496
|
|
|
1458
1497
|
These could be returned in any order by simple_cycles, so we sort the edges so that the first
|
|
1459
|
-
and last
|
|
1498
|
+
and last ``u`` and ``v`` match the fragment that gets the insertion (1 in the example above).
|
|
1460
1499
|
"""
|
|
1461
1500
|
edge_pair_index = list()
|
|
1462
1501
|
|
|
@@ -1467,8 +1506,8 @@ class Assembly:
|
|
|
1467
1506
|
fragment = self.fragments[abs(v1) - 1]
|
|
1468
1507
|
# Find the pair of edges that should be last and first ((3, 1, [8:10], [9:11)]), (1, 2, [4:6], [0:2]) in
|
|
1469
1508
|
# the example above. Only one of the pairs of edges should satisfy this condition for the topology to make sense.
|
|
1470
|
-
left_of_insertion =
|
|
1471
|
-
right_of_insertion =
|
|
1509
|
+
left_of_insertion = location_boundaries(start_location)[0]
|
|
1510
|
+
right_of_insertion = location_boundaries(end_location)[0]
|
|
1472
1511
|
if not fragment.circular and (
|
|
1473
1512
|
right_of_insertion >= left_of_insertion
|
|
1474
1513
|
# The below condition is for single-site integration.
|
|
@@ -1480,7 +1519,7 @@ class Assembly:
|
|
|
1480
1519
|
#
|
|
1481
1520
|
# The locations of homology on the genome are [0:10] and [2:12], so not identical
|
|
1482
1521
|
# but they overlap.
|
|
1483
|
-
or
|
|
1522
|
+
or locations_overlap(start_location, end_location, len(fragment))
|
|
1484
1523
|
):
|
|
1485
1524
|
edge_pair_index.append(i)
|
|
1486
1525
|
|
|
@@ -1511,13 +1550,13 @@ class Assembly:
|
|
|
1511
1550
|
fragment1 = self.fragments[abs(f1) - 1]
|
|
1512
1551
|
fragment2 = self.fragments[abs(f2) - 1]
|
|
1513
1552
|
|
|
1514
|
-
if not
|
|
1553
|
+
if not locations_overlap(
|
|
1515
1554
|
loc_f1_1, loc_f1_2, len(fragment1)
|
|
1516
|
-
) or not
|
|
1555
|
+
) or not locations_overlap(loc_f2_2, loc_f2_1, len(fragment2)):
|
|
1517
1556
|
return same_assembly
|
|
1518
1557
|
|
|
1519
1558
|
# Sort to make compatible with insertion assembly
|
|
1520
|
-
if
|
|
1559
|
+
if location_boundaries(loc_f1_1)[0] > location_boundaries(loc_f1_2)[0]:
|
|
1521
1560
|
new_assembly = same_assembly[::-1]
|
|
1522
1561
|
else:
|
|
1523
1562
|
new_assembly = same_assembly[:]
|
|
@@ -1530,10 +1569,10 @@ class Assembly:
|
|
|
1530
1569
|
fragment2 = self.fragments[abs(f2) - 1]
|
|
1531
1570
|
|
|
1532
1571
|
# Extract boundaries
|
|
1533
|
-
f2_1_start, _ =
|
|
1534
|
-
f2_2_start, f2_2_end =
|
|
1535
|
-
f1_1_start, _ =
|
|
1536
|
-
f1_2_start, f1_2_end =
|
|
1572
|
+
f2_1_start, _ = location_boundaries(loc_f2_1)
|
|
1573
|
+
f2_2_start, f2_2_end = location_boundaries(loc_f2_2)
|
|
1574
|
+
f1_1_start, _ = location_boundaries(loc_f1_1)
|
|
1575
|
+
f1_2_start, f1_2_end = location_boundaries(loc_f1_2)
|
|
1537
1576
|
|
|
1538
1577
|
overlap_diff = len(fragment1[f1_1_start:f1_2_end]) - len(
|
|
1539
1578
|
fragment2[f2_1_start:f2_2_end]
|
|
@@ -1573,7 +1612,7 @@ class Assembly:
|
|
|
1573
1612
|
"only_adjacent_edges not implemented for insertion assemblies"
|
|
1574
1613
|
)
|
|
1575
1614
|
|
|
1576
|
-
cycles = limit_iterator(
|
|
1615
|
+
cycles = limit_iterator(nx.cycles.simple_cycles(self.G), 10000)
|
|
1577
1616
|
|
|
1578
1617
|
# We apply constrains already here because sometimes the combinatorial explosion is too large
|
|
1579
1618
|
if self.use_all_fragments:
|
|
@@ -1592,7 +1631,7 @@ class Assembly:
|
|
|
1592
1631
|
)
|
|
1593
1632
|
|
|
1594
1633
|
# We find cycles first
|
|
1595
|
-
iterator = limit_iterator(
|
|
1634
|
+
iterator = limit_iterator(nx.cycles.simple_cycles(self.G), 10000)
|
|
1596
1635
|
assemblies = sum(
|
|
1597
1636
|
map(lambda x: self.node_path2assembly_list(x, True), iterator), []
|
|
1598
1637
|
)
|
|
@@ -1616,29 +1655,27 @@ class Assembly:
|
|
|
1616
1655
|
|
|
1617
1656
|
def assemble_linear(
|
|
1618
1657
|
self, only_adjacent_edges: bool = False, max_assemblies: int = 50
|
|
1619
|
-
) -> list[
|
|
1658
|
+
) -> list[Dseqrecord]:
|
|
1620
1659
|
"""Assemble linear constructs, from assemblies returned by self.get_linear_assemblies."""
|
|
1621
1660
|
assemblies = self.get_linear_assemblies(only_adjacent_edges, max_assemblies)
|
|
1622
1661
|
return [assemble(self.fragments, a) for a in assemblies]
|
|
1623
1662
|
|
|
1624
1663
|
def assemble_circular(
|
|
1625
1664
|
self, only_adjacent_edges: bool = False, max_assemblies: int = 50
|
|
1626
|
-
) -> list[
|
|
1665
|
+
) -> list[Dseqrecord]:
|
|
1627
1666
|
"""Assemble circular constructs, from assemblies returned by self.get_circular_assemblies."""
|
|
1628
1667
|
assemblies = self.get_circular_assemblies(only_adjacent_edges, max_assemblies)
|
|
1629
1668
|
return [assemble(self.fragments, a) for a in assemblies]
|
|
1630
1669
|
|
|
1631
|
-
def assemble_insertion(
|
|
1632
|
-
self, only_adjacent_edges: bool = False
|
|
1633
|
-
) -> list[_Dseqrecord]:
|
|
1670
|
+
def assemble_insertion(self, only_adjacent_edges: bool = False) -> list[Dseqrecord]:
|
|
1634
1671
|
"""Assemble insertion constructs, from assemblies returned by self.get_insertion_assemblies."""
|
|
1635
1672
|
assemblies = self.get_insertion_assemblies(only_adjacent_edges)
|
|
1636
1673
|
return [assemble(self.fragments, a, is_insertion=True) for a in assemblies]
|
|
1637
1674
|
|
|
1638
1675
|
def get_locations_on_fragments(self) -> dict[int, dict[str, list[Location]]]:
|
|
1639
1676
|
"""Get a dictionary where the keys are the nodes in the graph, and the values are dictionaries with keys
|
|
1640
|
-
|
|
1641
|
-
and right side. The values in
|
|
1677
|
+
``left``, ``right``, containing (for each fragment) the locations where the fragment is joined to another fragment on its left
|
|
1678
|
+
and right side. The values in ``left`` and ``right`` are often the same, except in restriction-ligation with partial overlap enabled,
|
|
1642
1679
|
where we can end up with a situation like this:
|
|
1643
1680
|
|
|
1644
1681
|
GGTCTCCCCAATT and aGGTCTCCAACCAA as fragments
|
|
@@ -1651,13 +1688,14 @@ class Assembly:
|
|
|
1651
1688
|
aGGTCTCCxxCCAATT
|
|
1652
1689
|
tCCAGAGGTTGGxxAA
|
|
1653
1690
|
|
|
1654
|
-
Would return
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1660
|
-
|
|
1691
|
+
Would return::
|
|
1692
|
+
|
|
1693
|
+
{
|
|
1694
|
+
1: {'left': [7:9], 'right': [9:11]},
|
|
1695
|
+
2: {'left': [8:10], 'right': [10:12]},
|
|
1696
|
+
-1: {'left': [2:4], 'right': [4:6]},
|
|
1697
|
+
-2: {'left': [2:4], 'right': [4:6]}
|
|
1698
|
+
}
|
|
1661
1699
|
|
|
1662
1700
|
"""
|
|
1663
1701
|
|
|
@@ -1671,10 +1709,10 @@ class Assembly:
|
|
|
1671
1709
|
if edge_location not in this_dict[key]:
|
|
1672
1710
|
this_dict[key].append(edge_location)
|
|
1673
1711
|
this_dict["left"] = sorted(
|
|
1674
|
-
this_dict["left"], key=lambda x:
|
|
1712
|
+
this_dict["left"], key=lambda x: location_boundaries(x)[0]
|
|
1675
1713
|
)
|
|
1676
1714
|
this_dict["right"] = sorted(
|
|
1677
|
-
this_dict["right"], key=lambda x:
|
|
1715
|
+
this_dict["right"], key=lambda x: location_boundaries(x)[0]
|
|
1678
1716
|
)
|
|
1679
1717
|
locations_on_fragments[node] = this_dict
|
|
1680
1718
|
|
|
@@ -1686,10 +1724,10 @@ class Assembly:
|
|
|
1686
1724
|
and prevent including partially digested fragments. For example, imagine the following fragment being an input for a digestion
|
|
1687
1725
|
and ligation assembly, where the enzyme cuts at the sites indicated by the vertical lines:
|
|
1688
1726
|
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
|
|
1727
|
+
::
|
|
1728
|
+
|
|
1729
|
+
x y z
|
|
1730
|
+
-------|-------|-------|---------
|
|
1693
1731
|
|
|
1694
1732
|
We would only want assemblies that contain subfragments start-x, x-y, y-z, z-end, and not start-x, y-end, for instance.
|
|
1695
1733
|
The latter would indicate that the fragment was partially digested.
|
|
@@ -1721,7 +1759,7 @@ class Assembly:
|
|
|
1721
1759
|
|
|
1722
1760
|
pairs = list()
|
|
1723
1761
|
for pair in zip(left, right):
|
|
1724
|
-
pairs += list(
|
|
1762
|
+
pairs += list(itertools.product(*pair))
|
|
1725
1763
|
allowed_location_pairs[node] = pairs
|
|
1726
1764
|
|
|
1727
1765
|
fragment_assembly = edge_representation2subfragment_representation(
|
|
@@ -1734,7 +1772,7 @@ class Assembly:
|
|
|
1734
1772
|
|
|
1735
1773
|
def __repr__(self):
|
|
1736
1774
|
# https://pyformat.info
|
|
1737
|
-
return
|
|
1775
|
+
return ps(
|
|
1738
1776
|
"Assembly\n"
|
|
1739
1777
|
"fragments..: {sequences}\n"
|
|
1740
1778
|
"limit(bp)..: {limit}\n"
|
|
@@ -1750,12 +1788,12 @@ class Assembly:
|
|
|
1750
1788
|
|
|
1751
1789
|
class PCRAssembly(Assembly):
|
|
1752
1790
|
"""
|
|
1753
|
-
An assembly that represents a PCR, where
|
|
1754
|
-
It always uses the
|
|
1791
|
+
An assembly that represents a PCR, where ``fragments`` is a list of primer, template, primer (in that order).
|
|
1792
|
+
It always uses the ``primer_template_overlap`` algorithm and accepts the ``mismatches`` argument to indicate
|
|
1755
1793
|
the number of mismatches allowed in the overlap. Only supports substitution mismatches, not indels.
|
|
1756
1794
|
"""
|
|
1757
1795
|
|
|
1758
|
-
def __init__(self, frags: list[
|
|
1796
|
+
def __init__(self, frags: list[Dseqrecord | Primer], limit=25, mismatches=0):
|
|
1759
1797
|
|
|
1760
1798
|
value_error = ValueError(
|
|
1761
1799
|
"PCRAssembly assembly must be initialised with a list/tuple of primer, template, primer"
|
|
@@ -1765,15 +1803,15 @@ class PCRAssembly(Assembly):
|
|
|
1765
1803
|
|
|
1766
1804
|
# Validate the inputs: should be a series of primer, template, primer
|
|
1767
1805
|
wrong_fragment_class = (
|
|
1768
|
-
not isinstance(frags[0],
|
|
1769
|
-
isinstance(frags[1],
|
|
1770
|
-
not isinstance(frags[2],
|
|
1806
|
+
not isinstance(frags[0], Primer),
|
|
1807
|
+
isinstance(frags[1], Primer),
|
|
1808
|
+
not isinstance(frags[2], Primer),
|
|
1771
1809
|
)
|
|
1772
1810
|
if any(wrong_fragment_class):
|
|
1773
1811
|
raise value_error
|
|
1774
1812
|
|
|
1775
1813
|
# TODO: allow for the same fragment to be included more than once?
|
|
1776
|
-
self.G =
|
|
1814
|
+
self.G = nx.MultiDiGraph()
|
|
1777
1815
|
# Add positive and negative nodes for forward and reverse fragments
|
|
1778
1816
|
self.G.add_nodes_from((i + 1, {"seq": f}) for (i, f) in enumerate(frags))
|
|
1779
1817
|
self.G.add_nodes_from(
|
|
@@ -1786,8 +1824,8 @@ class PCRAssembly(Assembly):
|
|
|
1786
1824
|
# primer, template, primer
|
|
1787
1825
|
p1, t, p2 = (i + 1, i + 2, i + 3)
|
|
1788
1826
|
primer_ids += [p1, p2]
|
|
1789
|
-
pairs += list(
|
|
1790
|
-
pairs += list(
|
|
1827
|
+
pairs += list(itertools.product([p1, p2], [t, -t]))
|
|
1828
|
+
pairs += list(itertools.product([t, -t], [-p1, -p2]))
|
|
1791
1829
|
|
|
1792
1830
|
for u, v in pairs:
|
|
1793
1831
|
u_seq = self.G.nodes[u]["seq"]
|
|
@@ -1826,20 +1864,33 @@ class PCRAssembly(Assembly):
|
|
|
1826
1864
|
"get_insertion_assemblies not implemented for PCR assemblies"
|
|
1827
1865
|
)
|
|
1828
1866
|
|
|
1867
|
+
def assemble_linear(
|
|
1868
|
+
self, only_adjacent_edges: bool = False, max_assemblies: int = 50
|
|
1869
|
+
) -> list[Dseqrecord]:
|
|
1870
|
+
"""
|
|
1871
|
+
Overrides the parent method to ensure that the 5' of the crick strand of the product matches the
|
|
1872
|
+
sequence of the reverse primer. This is important when using primers with dUTP (for USER cloning).
|
|
1873
|
+
"""
|
|
1874
|
+
results = super().assemble_linear(only_adjacent_edges, max_assemblies)
|
|
1875
|
+
for result in results:
|
|
1876
|
+
rp = self.fragments[2]
|
|
1877
|
+
result.seq = result.seq[: -len(rp)] + Dseq(str(rp.seq.rc()))
|
|
1878
|
+
return results
|
|
1879
|
+
|
|
1829
1880
|
|
|
1830
1881
|
class SingleFragmentAssembly(Assembly):
|
|
1831
1882
|
"""
|
|
1832
1883
|
An assembly that represents the circularisation or splicing of a single fragment.
|
|
1833
1884
|
"""
|
|
1834
1885
|
|
|
1835
|
-
def __init__(self, frags: [
|
|
1886
|
+
def __init__(self, frags: [Dseqrecord], limit=25, algorithm=common_sub_strings):
|
|
1836
1887
|
|
|
1837
1888
|
if len(frags) != 1:
|
|
1838
1889
|
raise ValueError(
|
|
1839
1890
|
"SingleFragmentAssembly assembly must be initialised with a single fragment"
|
|
1840
1891
|
)
|
|
1841
1892
|
# TODO: allow for the same fragment to be included more than once?
|
|
1842
|
-
self.G =
|
|
1893
|
+
self.G = nx.MultiDiGraph()
|
|
1843
1894
|
frag = frags[0]
|
|
1844
1895
|
# Add positive and negative nodes for forward and reverse fragments
|
|
1845
1896
|
self.G.add_node(1, seq=frag)
|
|
@@ -1890,8 +1941,8 @@ class SingleFragmentAssembly(Assembly):
|
|
|
1890
1941
|
if x[0][2] == x[0][3]:
|
|
1891
1942
|
return False
|
|
1892
1943
|
# We don't want to get overlap only (e.g. GAATTCcatGAATTC giving GAATTC)
|
|
1893
|
-
left_start, _ =
|
|
1894
|
-
_, right_end =
|
|
1944
|
+
left_start, _ = location_boundaries(x[0][2])
|
|
1945
|
+
_, right_end = location_boundaries(x[0][3])
|
|
1895
1946
|
if left_start == 0 and right_end == len(self.fragments[0]):
|
|
1896
1947
|
return False
|
|
1897
1948
|
return True
|
|
@@ -1914,18 +1965,19 @@ class SingleFragmentAssembly(Assembly):
|
|
|
1914
1965
|
|
|
1915
1966
|
|
|
1916
1967
|
def common_function_assembly_products(
|
|
1917
|
-
frags: list[
|
|
1968
|
+
frags: list[Dseqrecord],
|
|
1918
1969
|
limit: int | None,
|
|
1919
1970
|
algorithm: Callable,
|
|
1920
1971
|
circular_only: bool,
|
|
1921
1972
|
filter_results_function: Callable | None = None,
|
|
1922
|
-
|
|
1973
|
+
only_adjacent_edges: bool = False,
|
|
1974
|
+
) -> list[Dseqrecord]:
|
|
1923
1975
|
"""Common function to avoid code duplication. Could be simplified further
|
|
1924
1976
|
once SingleFragmentAssembly and Assembly are merged.
|
|
1925
1977
|
|
|
1926
1978
|
Parameters
|
|
1927
1979
|
----------
|
|
1928
|
-
frags : list[
|
|
1980
|
+
frags : list[Dseqrecord]
|
|
1929
1981
|
List of DNA fragments to assemble
|
|
1930
1982
|
limit : int or None
|
|
1931
1983
|
Minimum overlap length required, or None if not applicable
|
|
@@ -1933,10 +1985,14 @@ def common_function_assembly_products(
|
|
|
1933
1985
|
Function that determines valid overlaps between fragments
|
|
1934
1986
|
circular_only : bool
|
|
1935
1987
|
If True, only return circular assemblies
|
|
1988
|
+
filter_results_function : Callable or None
|
|
1989
|
+
Function that filters the results
|
|
1990
|
+
only_adjacent_edges : bool
|
|
1991
|
+
If True, only return assemblies that use only adjacent edges
|
|
1936
1992
|
|
|
1937
1993
|
Returns
|
|
1938
1994
|
-------
|
|
1939
|
-
list[
|
|
1995
|
+
list[Dseqrecord]
|
|
1940
1996
|
List of assembled DNA molecules
|
|
1941
1997
|
"""
|
|
1942
1998
|
if len(frags) == 1:
|
|
@@ -1945,10 +2001,10 @@ def common_function_assembly_products(
|
|
|
1945
2001
|
asm = Assembly(
|
|
1946
2002
|
frags, limit, algorithm, use_fragment_order=False, use_all_fragments=True
|
|
1947
2003
|
)
|
|
1948
|
-
output_assemblies = asm.get_circular_assemblies()
|
|
2004
|
+
output_assemblies = asm.get_circular_assemblies(only_adjacent_edges)
|
|
1949
2005
|
if not circular_only and len(frags) > 1:
|
|
1950
2006
|
output_assemblies += filter_linear_subassemblies(
|
|
1951
|
-
asm.get_linear_assemblies(), output_assemblies, frags
|
|
2007
|
+
asm.get_linear_assemblies(only_adjacent_edges), output_assemblies, frags
|
|
1952
2008
|
)
|
|
1953
2009
|
if not circular_only and len(frags) == 1:
|
|
1954
2010
|
output_assemblies += asm.get_insertion_assemblies()
|
|
@@ -1959,14 +2015,29 @@ def common_function_assembly_products(
|
|
|
1959
2015
|
return [assemble(frags, a) for a in output_assemblies]
|
|
1960
2016
|
|
|
1961
2017
|
|
|
2018
|
+
def _recast_sources(
|
|
2019
|
+
products: list[Dseqrecord], source_cls, **extra_fields
|
|
2020
|
+
) -> list[Dseqrecord]:
|
|
2021
|
+
"""Recast the `source` of each product to `source_cls` with optional extras.
|
|
2022
|
+
|
|
2023
|
+
This avoids repeating the same for-loop across many assembly functions.
|
|
2024
|
+
"""
|
|
2025
|
+
for prod in products:
|
|
2026
|
+
prod.source = source_cls(
|
|
2027
|
+
**prod.source.to_unserialized_dict(),
|
|
2028
|
+
**extra_fields,
|
|
2029
|
+
)
|
|
2030
|
+
return products
|
|
2031
|
+
|
|
2032
|
+
|
|
1962
2033
|
def gibson_assembly(
|
|
1963
|
-
frags: list[
|
|
1964
|
-
) -> list[
|
|
2034
|
+
frags: list[Dseqrecord], limit: int = 25, circular_only: bool = False
|
|
2035
|
+
) -> list[Dseqrecord]:
|
|
1965
2036
|
"""Returns the products for Gibson assembly.
|
|
1966
2037
|
|
|
1967
2038
|
Parameters
|
|
1968
2039
|
----------
|
|
1969
|
-
frags : list[
|
|
2040
|
+
frags : list[Dseqrecord]
|
|
1970
2041
|
List of DNA fragments to assemble
|
|
1971
2042
|
limit : int, optional
|
|
1972
2043
|
Minimum overlap length required, by default 25
|
|
@@ -1975,23 +2046,25 @@ def gibson_assembly(
|
|
|
1975
2046
|
|
|
1976
2047
|
Returns
|
|
1977
2048
|
-------
|
|
1978
|
-
list[
|
|
2049
|
+
list[Dseqrecord]
|
|
1979
2050
|
List of assembled DNA molecules
|
|
1980
2051
|
"""
|
|
1981
|
-
|
|
2052
|
+
|
|
2053
|
+
products = common_function_assembly_products(
|
|
1982
2054
|
frags, limit, gibson_overlap, circular_only
|
|
1983
2055
|
)
|
|
2056
|
+
return _recast_sources(products, GibsonAssemblySource)
|
|
1984
2057
|
|
|
1985
2058
|
|
|
1986
2059
|
def in_fusion_assembly(
|
|
1987
|
-
frags: list[
|
|
1988
|
-
) -> list[
|
|
2060
|
+
frags: list[Dseqrecord], limit: int = 25, circular_only: bool = False
|
|
2061
|
+
) -> list[Dseqrecord]:
|
|
1989
2062
|
"""Returns the products for in-fusion assembly. This is the same as Gibson
|
|
1990
2063
|
assembly, but with a different name.
|
|
1991
2064
|
|
|
1992
2065
|
Parameters
|
|
1993
2066
|
----------
|
|
1994
|
-
frags : list[
|
|
2067
|
+
frags : list[Dseqrecord]
|
|
1995
2068
|
List of DNA fragments to assemble
|
|
1996
2069
|
limit : int, optional
|
|
1997
2070
|
Minimum overlap length required, by default 25
|
|
@@ -2000,21 +2073,23 @@ def in_fusion_assembly(
|
|
|
2000
2073
|
|
|
2001
2074
|
Returns
|
|
2002
2075
|
-------
|
|
2003
|
-
list[
|
|
2076
|
+
list[Dseqrecord]
|
|
2004
2077
|
List of assembled DNA molecules
|
|
2005
2078
|
"""
|
|
2006
|
-
|
|
2079
|
+
|
|
2080
|
+
products = gibson_assembly(frags, limit)
|
|
2081
|
+
return _recast_sources(products, InFusionSource)
|
|
2007
2082
|
|
|
2008
2083
|
|
|
2009
2084
|
def fusion_pcr_assembly(
|
|
2010
|
-
frags: list[
|
|
2011
|
-
) -> list[
|
|
2085
|
+
frags: list[Dseqrecord], limit: int = 25, circular_only: bool = False
|
|
2086
|
+
) -> list[Dseqrecord]:
|
|
2012
2087
|
"""Returns the products for fusion PCR assembly. This is the same as Gibson
|
|
2013
2088
|
assembly, but with a different name.
|
|
2014
2089
|
|
|
2015
2090
|
Parameters
|
|
2016
2091
|
----------
|
|
2017
|
-
frags : list[
|
|
2092
|
+
frags : list[Dseqrecord]
|
|
2018
2093
|
List of DNA fragments to assemble
|
|
2019
2094
|
limit : int, optional
|
|
2020
2095
|
Minimum overlap length required, by default 25
|
|
@@ -2023,20 +2098,21 @@ def fusion_pcr_assembly(
|
|
|
2023
2098
|
|
|
2024
2099
|
Returns
|
|
2025
2100
|
-------
|
|
2026
|
-
list[
|
|
2101
|
+
list[Dseqrecord]
|
|
2027
2102
|
List of assembled DNA molecules
|
|
2028
2103
|
"""
|
|
2029
|
-
|
|
2104
|
+
products = gibson_assembly(frags, limit)
|
|
2105
|
+
return _recast_sources(products, OverlapExtensionPCRLigationSource)
|
|
2030
2106
|
|
|
2031
2107
|
|
|
2032
2108
|
def in_vivo_assembly(
|
|
2033
|
-
frags: list[
|
|
2034
|
-
) -> list[
|
|
2109
|
+
frags: list[Dseqrecord], limit: int = 25, circular_only: bool = False
|
|
2110
|
+
) -> list[Dseqrecord]:
|
|
2035
2111
|
"""Returns the products for in vivo assembly (IVA), which relies on homologous recombination between the fragments.
|
|
2036
2112
|
|
|
2037
2113
|
Parameters
|
|
2038
2114
|
----------
|
|
2039
|
-
frags : list[
|
|
2115
|
+
frags : list[Dseqrecord]
|
|
2040
2116
|
List of DNA fragments to assemble
|
|
2041
2117
|
limit : int, optional
|
|
2042
2118
|
Minimum overlap length required, by default 25
|
|
@@ -2045,30 +2121,32 @@ def in_vivo_assembly(
|
|
|
2045
2121
|
|
|
2046
2122
|
Returns
|
|
2047
2123
|
-------
|
|
2048
|
-
list[
|
|
2124
|
+
list[Dseqrecord]
|
|
2049
2125
|
List of assembled DNA molecules
|
|
2050
2126
|
"""
|
|
2051
|
-
|
|
2127
|
+
products = common_function_assembly_products(
|
|
2052
2128
|
frags, limit, common_sub_strings, circular_only
|
|
2053
2129
|
)
|
|
2130
|
+
return _recast_sources(products, InVivoAssemblySource)
|
|
2054
2131
|
|
|
2055
2132
|
|
|
2056
2133
|
def restriction_ligation_assembly(
|
|
2057
|
-
frags: list[
|
|
2058
|
-
enzymes: list["
|
|
2134
|
+
frags: list[Dseqrecord],
|
|
2135
|
+
enzymes: list["AbstractCut"],
|
|
2059
2136
|
allow_blunt: bool = True,
|
|
2060
2137
|
circular_only: bool = False,
|
|
2061
|
-
) -> list[
|
|
2138
|
+
) -> list[Dseqrecord]:
|
|
2062
2139
|
"""Returns the products for restriction ligation assembly:
|
|
2063
|
-
|
|
2064
|
-
|
|
2065
|
-
|
|
2140
|
+
|
|
2141
|
+
- Finds cutsites in the fragments
|
|
2142
|
+
- Finds all products that could be assembled by ligating the fragments based on those cutsites
|
|
2143
|
+
- Will NOT return products that combine an existing end with an end generated by the same enzyme (see example below)
|
|
2066
2144
|
|
|
2067
2145
|
Parameters
|
|
2068
2146
|
----------
|
|
2069
|
-
frags : list[
|
|
2147
|
+
frags : list[Dseqrecord]
|
|
2070
2148
|
List of DNA fragments to assemble
|
|
2071
|
-
enzymes : list[
|
|
2149
|
+
enzymes : list[AbstractCut]
|
|
2072
2150
|
List of restriction enzymes to use
|
|
2073
2151
|
allow_blunt : bool, optional
|
|
2074
2152
|
If True, allow blunt end ligations, by default True
|
|
@@ -2077,15 +2155,15 @@ def restriction_ligation_assembly(
|
|
|
2077
2155
|
|
|
2078
2156
|
Returns
|
|
2079
2157
|
-------
|
|
2080
|
-
list[
|
|
2158
|
+
list[Dseqrecord]
|
|
2081
2159
|
List of assembled DNA molecules
|
|
2082
2160
|
|
|
2083
2161
|
Examples
|
|
2084
2162
|
--------
|
|
2085
2163
|
In the example below, we plan to assemble a plasmid from a backbone and an insert, using the EcoRI and SalI enzymes.
|
|
2086
|
-
Note how 2 circular products are returned, one contains the insert (
|
|
2087
|
-
and the desired part of the backbone (
|
|
2088
|
-
reversed insert (
|
|
2164
|
+
Note how 2 circular products are returned, one contains the insert (``acgt``)
|
|
2165
|
+
and the desired part of the backbone (``cccccc``), the other contains the
|
|
2166
|
+
reversed insert (``tgga``) and the cut-out part of the backbone (``aaa``).
|
|
2089
2167
|
|
|
2090
2168
|
>>> from pydna.assembly2 import restriction_ligation_assembly
|
|
2091
2169
|
>>> from pydna.dseqrecord import Dseqrecord
|
|
@@ -2119,28 +2197,33 @@ def restriction_ligation_assembly(
|
|
|
2119
2197
|
TTAAGtttC
|
|
2120
2198
|
"""
|
|
2121
2199
|
|
|
2122
|
-
def
|
|
2200
|
+
def algorithm_fn(x, y, _l):
|
|
2123
2201
|
# By default, we allow blunt ends
|
|
2124
2202
|
return restriction_ligation_overlap(x, y, enzymes, False, allow_blunt)
|
|
2125
2203
|
|
|
2126
|
-
|
|
2204
|
+
products = common_function_assembly_products(
|
|
2205
|
+
frags, None, algorithm_fn, circular_only, only_adjacent_edges=True
|
|
2206
|
+
)
|
|
2207
|
+
return _recast_sources(
|
|
2208
|
+
products, RestrictionAndLigationSource, restriction_enzymes=enzymes
|
|
2209
|
+
)
|
|
2127
2210
|
|
|
2128
2211
|
|
|
2129
2212
|
def golden_gate_assembly(
|
|
2130
|
-
frags: list[
|
|
2131
|
-
enzymes: list["
|
|
2213
|
+
frags: list[Dseqrecord],
|
|
2214
|
+
enzymes: list["AbstractCut"],
|
|
2132
2215
|
allow_blunt: bool = True,
|
|
2133
2216
|
circular_only: bool = False,
|
|
2134
|
-
) -> list[
|
|
2217
|
+
) -> list[Dseqrecord]:
|
|
2135
2218
|
"""Returns the products for Golden Gate assembly. This is the same as
|
|
2136
2219
|
restriction ligation assembly, but with a different name. Check the documentation
|
|
2137
|
-
for
|
|
2220
|
+
for ``restriction_ligation_assembly`` for more details.
|
|
2138
2221
|
|
|
2139
2222
|
Parameters
|
|
2140
2223
|
----------
|
|
2141
|
-
frags : list[
|
|
2224
|
+
frags : list[Dseqrecord]
|
|
2142
2225
|
List of DNA fragments to assemble
|
|
2143
|
-
enzymes : list[
|
|
2226
|
+
enzymes : list[AbstractCut]
|
|
2144
2227
|
List of restriction enzymes to use
|
|
2145
2228
|
allow_blunt : bool, optional
|
|
2146
2229
|
If True, allow blunt end ligations, by default True
|
|
@@ -2149,30 +2232,30 @@ def golden_gate_assembly(
|
|
|
2149
2232
|
|
|
2150
2233
|
Returns
|
|
2151
2234
|
-------
|
|
2152
|
-
list[
|
|
2235
|
+
list[Dseqrecord]
|
|
2153
2236
|
List of assembled DNA molecules
|
|
2154
2237
|
|
|
2155
2238
|
Examples
|
|
2156
2239
|
--------
|
|
2157
|
-
See the example for
|
|
2240
|
+
See the example for ``restriction_ligation_assembly``.
|
|
2158
2241
|
"""
|
|
2159
2242
|
return restriction_ligation_assembly(frags, enzymes, allow_blunt, circular_only)
|
|
2160
2243
|
|
|
2161
2244
|
|
|
2162
2245
|
def ligation_assembly(
|
|
2163
|
-
frags: list[
|
|
2246
|
+
frags: list[Dseqrecord],
|
|
2164
2247
|
allow_blunt: bool = False,
|
|
2165
2248
|
allow_partial_overlap: bool = False,
|
|
2166
2249
|
circular_only: bool = False,
|
|
2167
|
-
) -> list[
|
|
2250
|
+
) -> list[Dseqrecord]:
|
|
2168
2251
|
"""Returns the products for ligation assembly, as inputs pass the fragments (digested if needed) that
|
|
2169
2252
|
will be ligated.
|
|
2170
2253
|
|
|
2171
|
-
For most cases, you probably should use
|
|
2254
|
+
For most cases, you probably should use ``restriction_ligation_assembly`` instead.
|
|
2172
2255
|
|
|
2173
2256
|
Parameters
|
|
2174
2257
|
----------
|
|
2175
|
-
frags : list[
|
|
2258
|
+
frags : list[Dseqrecord]
|
|
2176
2259
|
List of DNA fragments to assemble
|
|
2177
2260
|
allow_blunt : bool, optional
|
|
2178
2261
|
If True, allow blunt end ligations, by default False
|
|
@@ -2183,7 +2266,7 @@ def ligation_assembly(
|
|
|
2183
2266
|
|
|
2184
2267
|
Returns
|
|
2185
2268
|
-------
|
|
2186
|
-
list[
|
|
2269
|
+
list[Dseqrecord]
|
|
2187
2270
|
List of assembled DNA molecules
|
|
2188
2271
|
|
|
2189
2272
|
|
|
@@ -2215,11 +2298,14 @@ def ligation_assembly(
|
|
|
2215
2298
|
return sticky_end_sub_strings(x, y, allow_partial_overlap)
|
|
2216
2299
|
|
|
2217
2300
|
if allow_blunt:
|
|
2218
|
-
|
|
2301
|
+
algorithm_fn = combine_algorithms(sticky_end_algorithm, blunt_overlap)
|
|
2219
2302
|
else:
|
|
2220
|
-
|
|
2303
|
+
algorithm_fn = sticky_end_algorithm
|
|
2221
2304
|
|
|
2222
|
-
|
|
2305
|
+
products = common_function_assembly_products(
|
|
2306
|
+
frags, None, algorithm_fn, circular_only
|
|
2307
|
+
)
|
|
2308
|
+
return _recast_sources(products, LigationSource)
|
|
2223
2309
|
|
|
2224
2310
|
|
|
2225
2311
|
def assembly_is_multi_site(asm: list[EdgeRepresentationAssembly]) -> bool:
|
|
@@ -2235,20 +2321,20 @@ def assembly_is_multi_site(asm: list[EdgeRepresentationAssembly]) -> bool:
|
|
|
2235
2321
|
|
|
2236
2322
|
|
|
2237
2323
|
def gateway_assembly(
|
|
2238
|
-
frags: list[
|
|
2239
|
-
reaction_type:
|
|
2324
|
+
frags: list[Dseqrecord],
|
|
2325
|
+
reaction_type: Literal["BP", "LR"],
|
|
2240
2326
|
greedy: bool = False,
|
|
2241
2327
|
circular_only: bool = False,
|
|
2242
2328
|
multi_site_only: bool = False,
|
|
2243
|
-
) -> list[
|
|
2329
|
+
) -> list[Dseqrecord]:
|
|
2244
2330
|
"""Returns the products for Gateway assembly / Gateway cloning.
|
|
2245
2331
|
|
|
2246
2332
|
Parameters
|
|
2247
2333
|
----------
|
|
2248
|
-
frags : list[
|
|
2334
|
+
frags : list[Dseqrecord]
|
|
2249
2335
|
List of DNA fragments to assemble
|
|
2250
|
-
reaction_type :
|
|
2251
|
-
Type of Gateway reaction
|
|
2336
|
+
reaction_type : Literal['BP', 'LR']
|
|
2337
|
+
Type of Gateway reaction
|
|
2252
2338
|
greedy : bool, optional
|
|
2253
2339
|
If True, use greedy gateway consensus sites, by default False
|
|
2254
2340
|
circular_only : bool, optional
|
|
@@ -2261,7 +2347,7 @@ def gateway_assembly(
|
|
|
2261
2347
|
|
|
2262
2348
|
Returns
|
|
2263
2349
|
-------
|
|
2264
|
-
list[
|
|
2350
|
+
list[Dseqrecord]
|
|
2265
2351
|
List of assembled DNA molecules
|
|
2266
2352
|
|
|
2267
2353
|
|
|
@@ -2288,9 +2374,9 @@ def gateway_assembly(
|
|
|
2288
2374
|
>>> len(products_LR)
|
|
2289
2375
|
2
|
|
2290
2376
|
|
|
2291
|
-
Now let's understand the
|
|
2377
|
+
Now let's understand the ``multi_site_only`` parameter. Let's consider a case where we are swapping fragments
|
|
2292
2378
|
between two plasmids using an LR reaction. Experimentally, we expect to obtain two plasmids, resulting from the
|
|
2293
|
-
swapping between the two att sites. That's what we get if we set
|
|
2379
|
+
swapping between the two att sites. That's what we get if we set ``multi_site_only`` to True.
|
|
2294
2380
|
|
|
2295
2381
|
>>> attL2 = 'aaataatgattttattttgactgatagtgacctgttcgttgcaacaaattgataagcaatgctttcttataatgccaactttgtacaagaaagctg'
|
|
2296
2382
|
>>> attR2 = 'accactttgtacaagaaagctgaacgagaaacgtaaaatgatataaatatcaatatattaaattagattttgcataaaaaacagactacataatactgtaaaacacaacatatccagtcactatg'
|
|
@@ -2300,7 +2386,7 @@ def gateway_assembly(
|
|
|
2300
2386
|
>>> len(products)
|
|
2301
2387
|
2
|
|
2302
2388
|
|
|
2303
|
-
However, if we set
|
|
2389
|
+
However, if we set ``multi_site_only`` to False, we get 4 products, which also include the intermediate products
|
|
2304
2390
|
where the two plasmids are combined into a single one through recombination of a single att site. This is an
|
|
2305
2391
|
intermediate of the reaction, and typically we don't want it:
|
|
2306
2392
|
|
|
@@ -2316,13 +2402,19 @@ def gateway_assembly(
|
|
|
2316
2402
|
f"Invalid reaction type: {reaction_type}, can only be BP or LR"
|
|
2317
2403
|
)
|
|
2318
2404
|
|
|
2319
|
-
def
|
|
2405
|
+
def algorithm_fn(x, y, _l):
|
|
2320
2406
|
return gateway_overlap(x, y, reaction_type, greedy)
|
|
2321
2407
|
|
|
2322
2408
|
filter_results_function = None if not multi_site_only else assembly_is_multi_site
|
|
2323
2409
|
|
|
2324
2410
|
products = common_function_assembly_products(
|
|
2325
|
-
frags, None,
|
|
2411
|
+
frags, None, algorithm_fn, circular_only, filter_results_function
|
|
2412
|
+
)
|
|
2413
|
+
products = _recast_sources(
|
|
2414
|
+
products,
|
|
2415
|
+
GatewaySource,
|
|
2416
|
+
reaction_type=reaction_type,
|
|
2417
|
+
greedy=greedy,
|
|
2326
2418
|
)
|
|
2327
2419
|
|
|
2328
2420
|
if len(products) == 0:
|
|
@@ -2342,13 +2434,13 @@ def gateway_assembly(
|
|
|
2342
2434
|
|
|
2343
2435
|
|
|
2344
2436
|
def common_function_integration_products(
|
|
2345
|
-
frags: list[
|
|
2346
|
-
) -> list[
|
|
2437
|
+
frags: list[Dseqrecord], limit: int | None, algorithm: Callable
|
|
2438
|
+
) -> list[Dseqrecord]:
|
|
2347
2439
|
"""Common function to avoid code duplication for integration products.
|
|
2348
2440
|
|
|
2349
2441
|
Parameters
|
|
2350
2442
|
----------
|
|
2351
|
-
frags : list[
|
|
2443
|
+
frags : list[Dseqrecord]
|
|
2352
2444
|
List of DNA fragments to integrate
|
|
2353
2445
|
limit : int or None
|
|
2354
2446
|
Minimum overlap length required, or None if not applicable
|
|
@@ -2357,7 +2449,7 @@ def common_function_integration_products(
|
|
|
2357
2449
|
|
|
2358
2450
|
Returns
|
|
2359
2451
|
-------
|
|
2360
|
-
list[
|
|
2452
|
+
list[Dseqrecord]
|
|
2361
2453
|
List of integrated DNA molecules
|
|
2362
2454
|
"""
|
|
2363
2455
|
if len(frags) == 1:
|
|
@@ -2378,27 +2470,27 @@ def common_function_integration_products(
|
|
|
2378
2470
|
|
|
2379
2471
|
|
|
2380
2472
|
def common_handle_insertion_fragments(
|
|
2381
|
-
genome:
|
|
2382
|
-
) -> list[
|
|
2473
|
+
genome: Dseqrecord, inserts: list[Dseqrecord]
|
|
2474
|
+
) -> list[Dseqrecord]:
|
|
2383
2475
|
"""Common function to handle / validate insertion fragments.
|
|
2384
2476
|
|
|
2385
2477
|
Parameters
|
|
2386
2478
|
----------
|
|
2387
|
-
genome :
|
|
2479
|
+
genome : Dseqrecord
|
|
2388
2480
|
Target genome sequence
|
|
2389
|
-
inserts : list[
|
|
2481
|
+
inserts : list[Dseqrecord] or Dseqrecord
|
|
2390
2482
|
DNA fragment(s) to insert
|
|
2391
2483
|
|
|
2392
2484
|
Returns
|
|
2393
2485
|
-------
|
|
2394
|
-
list[
|
|
2486
|
+
list[Dseqrecord]
|
|
2395
2487
|
List containing genome and insert fragments
|
|
2396
2488
|
"""
|
|
2397
|
-
if not isinstance(genome,
|
|
2489
|
+
if not isinstance(genome, Dseqrecord):
|
|
2398
2490
|
raise ValueError("Genome must be a Dseqrecord object")
|
|
2399
2491
|
|
|
2400
2492
|
if not isinstance(inserts, list) or not all(
|
|
2401
|
-
isinstance(f,
|
|
2493
|
+
isinstance(f, Dseqrecord) for f in inserts
|
|
2402
2494
|
):
|
|
2403
2495
|
raise ValueError("Inserts must be a list of Dseqrecord objects")
|
|
2404
2496
|
|
|
@@ -2409,13 +2501,13 @@ def common_handle_insertion_fragments(
|
|
|
2409
2501
|
|
|
2410
2502
|
|
|
2411
2503
|
def common_function_excision_products(
|
|
2412
|
-
genome:
|
|
2413
|
-
) -> list[
|
|
2504
|
+
genome: Dseqrecord, limit: int | None, algorithm: Callable
|
|
2505
|
+
) -> list[Dseqrecord]:
|
|
2414
2506
|
"""Common function to avoid code duplication for excision products.
|
|
2415
2507
|
|
|
2416
2508
|
Parameters
|
|
2417
2509
|
----------
|
|
2418
|
-
genome :
|
|
2510
|
+
genome : Dseqrecord
|
|
2419
2511
|
Target genome sequence
|
|
2420
2512
|
limit : int or None
|
|
2421
2513
|
Minimum overlap length required, or None if not applicable
|
|
@@ -2424,7 +2516,7 @@ def common_function_excision_products(
|
|
|
2424
2516
|
|
|
2425
2517
|
Returns
|
|
2426
2518
|
-------
|
|
2427
|
-
list[
|
|
2519
|
+
list[Dseqrecord]
|
|
2428
2520
|
List of excised DNA molecules
|
|
2429
2521
|
"""
|
|
2430
2522
|
asm = SingleFragmentAssembly([genome], limit, algorithm)
|
|
@@ -2432,25 +2524,25 @@ def common_function_excision_products(
|
|
|
2432
2524
|
|
|
2433
2525
|
|
|
2434
2526
|
def homologous_recombination_integration(
|
|
2435
|
-
genome:
|
|
2436
|
-
inserts: list[
|
|
2527
|
+
genome: Dseqrecord,
|
|
2528
|
+
inserts: list[Dseqrecord],
|
|
2437
2529
|
limit: int = 40,
|
|
2438
|
-
) -> list[
|
|
2530
|
+
) -> list[Dseqrecord]:
|
|
2439
2531
|
"""Returns the products resulting from the integration of an insert (or inserts joined
|
|
2440
2532
|
through in vivo recombination) into the genome through homologous recombination.
|
|
2441
2533
|
|
|
2442
2534
|
Parameters
|
|
2443
2535
|
----------
|
|
2444
|
-
genome :
|
|
2536
|
+
genome : Dseqrecord
|
|
2445
2537
|
Target genome sequence
|
|
2446
|
-
inserts : list[
|
|
2538
|
+
inserts : list[Dseqrecord]
|
|
2447
2539
|
DNA fragment(s) to insert
|
|
2448
2540
|
limit : int, optional
|
|
2449
2541
|
Minimum homology length required, by default 40
|
|
2450
2542
|
|
|
2451
2543
|
Returns
|
|
2452
2544
|
-------
|
|
2453
|
-
list[
|
|
2545
|
+
list[Dseqrecord]
|
|
2454
2546
|
List of integrated DNA molecules
|
|
2455
2547
|
|
|
2456
2548
|
|
|
@@ -2479,25 +2571,28 @@ def homologous_recombination_integration(
|
|
|
2479
2571
|
"""
|
|
2480
2572
|
fragments = common_handle_insertion_fragments(genome, inserts)
|
|
2481
2573
|
|
|
2482
|
-
|
|
2574
|
+
products = common_function_integration_products(
|
|
2575
|
+
fragments, limit, common_sub_strings
|
|
2576
|
+
)
|
|
2577
|
+
return _recast_sources(products, HomologousRecombinationSource)
|
|
2483
2578
|
|
|
2484
2579
|
|
|
2485
2580
|
def homologous_recombination_excision(
|
|
2486
|
-
genome:
|
|
2487
|
-
) -> list[
|
|
2581
|
+
genome: Dseqrecord, limit: int = 40
|
|
2582
|
+
) -> list[Dseqrecord]:
|
|
2488
2583
|
"""Returns the products resulting from the excision of a fragment from the genome through
|
|
2489
2584
|
homologous recombination.
|
|
2490
2585
|
|
|
2491
2586
|
Parameters
|
|
2492
2587
|
----------
|
|
2493
|
-
genome :
|
|
2588
|
+
genome : Dseqrecord
|
|
2494
2589
|
Target genome sequence
|
|
2495
2590
|
limit : int, optional
|
|
2496
2591
|
Minimum homology length required, by default 40
|
|
2497
2592
|
|
|
2498
2593
|
Returns
|
|
2499
2594
|
-------
|
|
2500
|
-
list[
|
|
2595
|
+
list[Dseqrecord]
|
|
2501
2596
|
List containing excised plasmid and remaining genome sequence
|
|
2502
2597
|
|
|
2503
2598
|
Examples
|
|
@@ -2515,27 +2610,28 @@ def homologous_recombination_excision(
|
|
|
2515
2610
|
>>> products
|
|
2516
2611
|
[Dseqrecord(o25), Dseqrecord(-32)]
|
|
2517
2612
|
"""
|
|
2518
|
-
|
|
2613
|
+
products = common_function_excision_products(genome, limit, common_sub_strings)
|
|
2614
|
+
return _recast_sources(products, HomologousRecombinationSource)
|
|
2519
2615
|
|
|
2520
2616
|
|
|
2521
2617
|
def cre_lox_integration(
|
|
2522
|
-
genome:
|
|
2523
|
-
) -> list[
|
|
2618
|
+
genome: Dseqrecord, inserts: list[Dseqrecord]
|
|
2619
|
+
) -> list[Dseqrecord]:
|
|
2524
2620
|
"""Returns the products resulting from the integration of an insert (or inserts joined
|
|
2525
2621
|
through cre-lox recombination among them) into the genome through cre-lox integration.
|
|
2526
2622
|
|
|
2527
|
-
Also works with lox66 and lox71 (see
|
|
2623
|
+
Also works with lox66 and lox71 (see ``pydna.cre_lox`` for more details).
|
|
2528
2624
|
|
|
2529
2625
|
Parameters
|
|
2530
2626
|
----------
|
|
2531
|
-
genome :
|
|
2627
|
+
genome : Dseqrecord
|
|
2532
2628
|
Target genome sequence
|
|
2533
|
-
inserts : list[
|
|
2629
|
+
inserts : list[Dseqrecord] or Dseqrecord
|
|
2534
2630
|
DNA fragment(s) to insert
|
|
2535
2631
|
|
|
2536
2632
|
Returns
|
|
2537
2633
|
-------
|
|
2538
|
-
list[
|
|
2634
|
+
list[Dseqrecord]
|
|
2539
2635
|
List of integrated DNA molecules
|
|
2540
2636
|
|
|
2541
2637
|
Examples
|
|
@@ -2574,20 +2670,21 @@ def cre_lox_integration(
|
|
|
2574
2670
|
|
|
2575
2671
|
"""
|
|
2576
2672
|
fragments = common_handle_insertion_fragments(genome, inserts)
|
|
2577
|
-
|
|
2673
|
+
products = common_function_integration_products(fragments, None, cre_loxP_overlap)
|
|
2674
|
+
return _recast_sources(products, CreLoxRecombinationSource)
|
|
2578
2675
|
|
|
2579
2676
|
|
|
2580
|
-
def cre_lox_excision(genome:
|
|
2677
|
+
def cre_lox_excision(genome: Dseqrecord) -> list[Dseqrecord]:
|
|
2581
2678
|
"""Returns the products for CRE-lox excision.
|
|
2582
2679
|
|
|
2583
2680
|
Parameters
|
|
2584
2681
|
----------
|
|
2585
|
-
genome :
|
|
2682
|
+
genome : Dseqrecord
|
|
2586
2683
|
Target genome sequence
|
|
2587
2684
|
|
|
2588
2685
|
Returns
|
|
2589
2686
|
-------
|
|
2590
|
-
list[
|
|
2687
|
+
list[Dseqrecord]
|
|
2591
2688
|
List containing excised plasmid and remaining genome sequence
|
|
2592
2689
|
|
|
2593
2690
|
Examples
|
|
@@ -2624,4 +2721,152 @@ def cre_lox_excision(genome: _Dseqrecord) -> list[_Dseqrecord]:
|
|
|
2624
2721
|
>>> res2
|
|
2625
2722
|
[Dseqrecord(o39), Dseqrecord(-45)]
|
|
2626
2723
|
"""
|
|
2627
|
-
|
|
2724
|
+
products = common_function_excision_products(genome, None, cre_loxP_overlap)
|
|
2725
|
+
return _recast_sources(products, CreLoxRecombinationSource)
|
|
2726
|
+
|
|
2727
|
+
|
|
2728
|
+
def crispr_integration(
|
|
2729
|
+
genome: Dseqrecord,
|
|
2730
|
+
inserts: list[Dseqrecord],
|
|
2731
|
+
guides: list[Primer],
|
|
2732
|
+
limit: int = 40,
|
|
2733
|
+
) -> list[Dseqrecord]:
|
|
2734
|
+
"""
|
|
2735
|
+
Returns the products for CRISPR integration.
|
|
2736
|
+
|
|
2737
|
+
Parameters
|
|
2738
|
+
----------
|
|
2739
|
+
genome : Dseqrecord
|
|
2740
|
+
Target genome sequence
|
|
2741
|
+
inserts : list[Dseqrecord]
|
|
2742
|
+
DNA fragment(s) to insert
|
|
2743
|
+
guides : list[Primer]
|
|
2744
|
+
List of guide RNAs as Primer objects. This may change in the future.
|
|
2745
|
+
limit : int, optional
|
|
2746
|
+
Minimum overlap length required, by default 40
|
|
2747
|
+
|
|
2748
|
+
Returns
|
|
2749
|
+
-------
|
|
2750
|
+
list[Dseqrecord]
|
|
2751
|
+
List of integrated DNA molecules
|
|
2752
|
+
|
|
2753
|
+
Examples
|
|
2754
|
+
--------
|
|
2755
|
+
|
|
2756
|
+
>>> from pydna.dseqrecord import Dseqrecord
|
|
2757
|
+
>>> from pydna.assembly2 import crispr_integration
|
|
2758
|
+
>>> from pydna.primer import Primer
|
|
2759
|
+
>>> genome = Dseqrecord("aaccggttcaatgcaaacagtaatgatggatgacattcaaagcac", name="genome")
|
|
2760
|
+
>>> insert = Dseqrecord("aaccggttAAAAAAAAAttcaaagcac", name="insert")
|
|
2761
|
+
>>> guide = Primer("ttcaatgcaaacagtaatga", name="guide")
|
|
2762
|
+
>>> product, *_ = crispr_integration(genome, [insert], [guide], 8)
|
|
2763
|
+
>>> product
|
|
2764
|
+
Dseqrecord(-27)
|
|
2765
|
+
|
|
2766
|
+
"""
|
|
2767
|
+
if len(guides) == 0:
|
|
2768
|
+
raise ValueError("At least one guide RNA is required for CRISPR integration")
|
|
2769
|
+
|
|
2770
|
+
# Get all the possible products from the homologous recombination integration
|
|
2771
|
+
products = homologous_recombination_integration(genome, inserts, limit)
|
|
2772
|
+
|
|
2773
|
+
# Verify that the guides cut in the region that will be repaired
|
|
2774
|
+
|
|
2775
|
+
# First we collect the positions where the guides cut
|
|
2776
|
+
guide_cuts = []
|
|
2777
|
+
for guide in guides:
|
|
2778
|
+
enzyme = cas9(str(guide.seq))
|
|
2779
|
+
possible_cuts = genome.seq.get_cutsites(enzyme)
|
|
2780
|
+
if len(possible_cuts) == 0:
|
|
2781
|
+
raise ValueError(
|
|
2782
|
+
f"Could not find Cas9 cutsite in the target sequence using the guide: {guide.name}"
|
|
2783
|
+
)
|
|
2784
|
+
# Keep only the position of the cut
|
|
2785
|
+
possible_cuts = [cut[0] for (cut, _) in possible_cuts]
|
|
2786
|
+
guide_cuts.append(possible_cuts)
|
|
2787
|
+
|
|
2788
|
+
# Then, we check it the possible homologous recombination products contain the cuts
|
|
2789
|
+
# from the guides inside the repair region.
|
|
2790
|
+
# We also add the used guides to each product. This is very important!
|
|
2791
|
+
valid_products = []
|
|
2792
|
+
for i, product in enumerate(products):
|
|
2793
|
+
# The second element of product.source.input is conventionally the insert/repair fragment
|
|
2794
|
+
# The other two (first and third) are the two bits of the genome
|
|
2795
|
+
repair_start = location_boundaries(product.source.input[0].right_location)[0]
|
|
2796
|
+
# Here we do +1 because the position of the cut marks the boundary (e.g. 0:10, 10:20 if a cut is at pos 10)
|
|
2797
|
+
repair_end = location_boundaries(product.source.input[2].left_location)[1] + 1
|
|
2798
|
+
repair_location = create_location(repair_start, repair_end, len(genome))
|
|
2799
|
+
some_cuts_inside_repair = []
|
|
2800
|
+
all_cuts_inside_repair = []
|
|
2801
|
+
for cut_group in guide_cuts:
|
|
2802
|
+
cuts_in_repair = [cut for cut in cut_group if cut in repair_location]
|
|
2803
|
+
some_cuts_inside_repair.append(len(cuts_in_repair) != 0)
|
|
2804
|
+
all_cuts_inside_repair.append(len(cuts_in_repair) == len(cut_group))
|
|
2805
|
+
|
|
2806
|
+
if all(some_cuts_inside_repair):
|
|
2807
|
+
used_guides = [g for i, g in enumerate(guides) if all_cuts_inside_repair[i]]
|
|
2808
|
+
# Add the used guides to the product <----- VERY IMPORTANT!
|
|
2809
|
+
product.source.input.extend([SourceInput(sequence=g) for g in used_guides])
|
|
2810
|
+
valid_products.append(product)
|
|
2811
|
+
|
|
2812
|
+
if not all(all_cuts_inside_repair):
|
|
2813
|
+
raise ValueError(
|
|
2814
|
+
"Some guides cut outside the repair region, please check the guides"
|
|
2815
|
+
)
|
|
2816
|
+
|
|
2817
|
+
if len(valid_products) != len(products):
|
|
2818
|
+
warnings.warn(
|
|
2819
|
+
"Some recombination products were discarded because they had off-target cuts",
|
|
2820
|
+
category=UserWarning,
|
|
2821
|
+
stacklevel=2,
|
|
2822
|
+
)
|
|
2823
|
+
|
|
2824
|
+
return _recast_sources(valid_products, CRISPRSource)
|
|
2825
|
+
|
|
2826
|
+
|
|
2827
|
+
def pcr_assembly(
|
|
2828
|
+
template: Dseqrecord,
|
|
2829
|
+
fwd_primer: Primer,
|
|
2830
|
+
rvs_primer: Primer,
|
|
2831
|
+
add_primer_features: bool = False,
|
|
2832
|
+
limit: int = 14,
|
|
2833
|
+
mismatches: int = 0,
|
|
2834
|
+
) -> list[Dseqrecord]:
|
|
2835
|
+
"""Returns the products for PCR assembly.
|
|
2836
|
+
|
|
2837
|
+
Parameters
|
|
2838
|
+
----------
|
|
2839
|
+
template : Dseqrecord
|
|
2840
|
+
Template sequence
|
|
2841
|
+
fwd_primer : Primer
|
|
2842
|
+
Forward primer
|
|
2843
|
+
rvs_primer : Primer
|
|
2844
|
+
Reverse primer
|
|
2845
|
+
add_primer_features : bool, optional
|
|
2846
|
+
If True, add primer features to the product, by default False
|
|
2847
|
+
limit : int, optional
|
|
2848
|
+
Minimum overlap length required, by default 14
|
|
2849
|
+
mismatches : int, optional
|
|
2850
|
+
Maximum number of mismatches, by default 0
|
|
2851
|
+
|
|
2852
|
+
Returns
|
|
2853
|
+
-------
|
|
2854
|
+
list[Dseqrecord]
|
|
2855
|
+
List of assembled DNA molecules
|
|
2856
|
+
"""
|
|
2857
|
+
|
|
2858
|
+
minimal_annealing = limit + mismatches
|
|
2859
|
+
fragments = [fwd_primer, template, rvs_primer]
|
|
2860
|
+
asm = PCRAssembly(
|
|
2861
|
+
fragments,
|
|
2862
|
+
limit=minimal_annealing,
|
|
2863
|
+
mismatches=mismatches,
|
|
2864
|
+
)
|
|
2865
|
+
products = asm.assemble_linear()
|
|
2866
|
+
# If both primers are the same, remove duplicates
|
|
2867
|
+
if str(fwd_primer.seq).upper() == str(rvs_primer.seq).upper():
|
|
2868
|
+
products = [p for p in products if not p.source.input[1].reverse_complemented]
|
|
2869
|
+
if add_primer_features:
|
|
2870
|
+
products = [annotate_primer_binding_sites(prod, fragments) for prod in products]
|
|
2871
|
+
|
|
2872
|
+
return _recast_sources(products, PCRSource, add_primer_features=add_primer_features)
|