pydna 5.5.4__py3-none-any.whl → 5.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydna/__init__.py +30 -195
- pydna/_pretty.py +8 -8
- pydna/_thermodynamic_data.py +3 -3
- pydna/all.py +1 -12
- pydna/alphabet.py +995 -0
- pydna/amplicon.py +19 -24
- pydna/amplify.py +75 -95
- pydna/assembly.py +64 -81
- pydna/assembly2.py +375 -310
- pydna/codon.py +4 -4
- pydna/common_sub_strings.py +6 -8
- pydna/contig.py +203 -10
- pydna/design.py +176 -60
- pydna/dseq.py +1788 -718
- pydna/dseqrecord.py +197 -179
- pydna/gateway.py +6 -6
- pydna/gel.py +5 -5
- pydna/genbank.py +43 -46
- pydna/genbankfixer.py +89 -92
- pydna/ladders.py +11 -12
- pydna/oligonucleotide_hybridization.py +124 -0
- pydna/opencloning_models.py +187 -60
- pydna/parsers.py +45 -32
- pydna/primer.py +4 -4
- pydna/primer_screen.py +833 -0
- pydna/readers.py +14 -9
- pydna/seq.py +137 -47
- pydna/seqrecord.py +54 -62
- pydna/sequence_picker.py +2 -5
- pydna/sequence_regex.py +6 -6
- pydna/tm.py +17 -17
- pydna/types.py +19 -19
- pydna/utils.py +97 -75
- {pydna-5.5.4.dist-info → pydna-5.5.6.dist-info}/METADATA +8 -8
- pydna-5.5.6.dist-info/RECORD +42 -0
- {pydna-5.5.4.dist-info → pydna-5.5.6.dist-info}/WHEEL +1 -1
- pydna/conftest.py +0 -42
- pydna/download.py +0 -32
- pydna/genbankfile.py +0 -42
- pydna/genbankrecord.py +0 -168
- pydna/goldengate.py +0 -45
- pydna/ligate.py +0 -62
- pydna/user_cloning.py +0 -29
- pydna-5.5.4.dist-info/RECORD +0 -46
- {pydna-5.5.4.dist-info → pydna-5.5.6.dist-info}/licenses/LICENSE.txt +0 -0
pydna/assembly2.py
CHANGED
|
@@ -4,29 +4,29 @@ Improved implementation of the assembly module. To see a list of issues with the
|
|
|
4
4
|
see [issues tagged with fixed-with-new-assembly-model](https://github.com/pydna-group/pydna/issues?q=is%3Aissue%20state%3Aopen%20label%3Afixed-with-new-assembly-model)
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
import networkx as
|
|
8
|
-
import itertools
|
|
7
|
+
import networkx as nx
|
|
8
|
+
import itertools
|
|
9
9
|
from Bio.SeqFeature import SimpleLocation, Location
|
|
10
|
-
|
|
10
|
+
|
|
11
11
|
from Bio.Restriction.Restriction import RestrictionBatch
|
|
12
12
|
import regex
|
|
13
13
|
import copy
|
|
14
14
|
|
|
15
15
|
from pydna.utils import (
|
|
16
|
-
shift_location
|
|
16
|
+
shift_location,
|
|
17
17
|
flatten,
|
|
18
|
-
location_boundaries
|
|
19
|
-
locations_overlap
|
|
18
|
+
location_boundaries,
|
|
19
|
+
locations_overlap,
|
|
20
20
|
sum_is_sticky,
|
|
21
21
|
limit_iterator,
|
|
22
22
|
create_location,
|
|
23
23
|
)
|
|
24
|
-
from pydna._pretty import pretty_str as
|
|
24
|
+
from pydna._pretty import pretty_str as ps
|
|
25
25
|
from pydna.common_sub_strings import common_sub_strings as common_sub_strings_str
|
|
26
|
-
from pydna.dseqrecord import Dseqrecord
|
|
27
|
-
from pydna.dseq import Dseq
|
|
28
|
-
from pydna.primer import Primer
|
|
29
|
-
from pydna.seqrecord import SeqRecord
|
|
26
|
+
from pydna.dseqrecord import Dseqrecord
|
|
27
|
+
from pydna.dseq import Dseq
|
|
28
|
+
from pydna.primer import Primer
|
|
29
|
+
from pydna.seqrecord import SeqRecord
|
|
30
30
|
from pydna.types import (
|
|
31
31
|
CutSiteType,
|
|
32
32
|
# TODO: allow user to enforce multi-site
|
|
@@ -38,6 +38,7 @@ from pydna.types import (
|
|
|
38
38
|
)
|
|
39
39
|
from pydna.gateway import gateway_overlap, find_gateway_sites
|
|
40
40
|
from pydna.cre_lox import cre_loxP_overlap
|
|
41
|
+
from pydna.alphabet import anneal_strands
|
|
41
42
|
|
|
42
43
|
from typing import TYPE_CHECKING, Callable, Literal
|
|
43
44
|
from pydna.opencloning_models import (
|
|
@@ -59,7 +60,7 @@ from pydna.crispr import cas9
|
|
|
59
60
|
import warnings
|
|
60
61
|
|
|
61
62
|
if TYPE_CHECKING: # pragma: no cover
|
|
62
|
-
from Bio.Restriction import AbstractCut
|
|
63
|
+
from Bio.Restriction import AbstractCut
|
|
63
64
|
|
|
64
65
|
|
|
65
66
|
def gather_overlapping_locations(
|
|
@@ -71,29 +72,29 @@ def gather_overlapping_locations(
|
|
|
71
72
|
the output will be [(loc1, loc2), (loc3,)].
|
|
72
73
|
"""
|
|
73
74
|
# Make a graph with all the locations as nodes
|
|
74
|
-
G =
|
|
75
|
+
G = nx.Graph()
|
|
75
76
|
for i, loc in enumerate(locs):
|
|
76
77
|
G.add_node(i, location=loc)
|
|
77
78
|
|
|
78
79
|
# Add edges between nodes that overlap
|
|
79
80
|
for i in range(len(locs)):
|
|
80
81
|
for j in range(i + 1, len(locs)):
|
|
81
|
-
if
|
|
82
|
+
if locations_overlap(locs[i], locs[j], fragment_length):
|
|
82
83
|
G.add_edge(i, j)
|
|
83
84
|
|
|
84
85
|
# Get groups of overlapping locations
|
|
85
86
|
groups = list()
|
|
86
|
-
for loc_set in
|
|
87
|
+
for loc_set in nx.connected_components(G):
|
|
87
88
|
groups.append(tuple(locs[i] for i in loc_set))
|
|
88
89
|
|
|
89
90
|
# Sort by location of the first element in each group (does not matter which since they are overlapping)
|
|
90
|
-
groups.sort(key=lambda x:
|
|
91
|
+
groups.sort(key=lambda x: location_boundaries(x[0])[0])
|
|
91
92
|
|
|
92
93
|
return groups
|
|
93
94
|
|
|
94
95
|
|
|
95
96
|
def ends_from_cutsite(
|
|
96
|
-
cutsite: CutSiteType, seq:
|
|
97
|
+
cutsite: CutSiteType, seq: Dseq
|
|
97
98
|
) -> tuple[tuple[str, str], tuple[str, str]]:
|
|
98
99
|
"""Get the sticky or blunt ends created by a restriction enzyme cut.
|
|
99
100
|
|
|
@@ -116,7 +117,7 @@ def ends_from_cutsite(
|
|
|
116
117
|
and the sequence of the overhang. The first tuple is for the left end, second for the right end.
|
|
117
118
|
|
|
118
119
|
>>> from Bio.Restriction import NotI
|
|
119
|
-
>>> x =
|
|
120
|
+
>>> x = Dseq("ctcgGCGGCCGCcagcggccg")
|
|
120
121
|
>>> x.get_cutsites(NotI)
|
|
121
122
|
[((6, -4), NotI)]
|
|
122
123
|
>>> ends_from_cutsite(x.get_cutsites(NotI)[0], x)
|
|
@@ -143,8 +144,8 @@ def ends_from_cutsite(
|
|
|
143
144
|
|
|
144
145
|
|
|
145
146
|
def restriction_ligation_overlap(
|
|
146
|
-
seqx:
|
|
147
|
-
seqy:
|
|
147
|
+
seqx: Dseqrecord,
|
|
148
|
+
seqy: Dseqrecord,
|
|
148
149
|
enzymes=RestrictionBatch,
|
|
149
150
|
partial=False,
|
|
150
151
|
allow_blunt=False,
|
|
@@ -155,9 +156,9 @@ def restriction_ligation_overlap(
|
|
|
155
156
|
|
|
156
157
|
Parameters
|
|
157
158
|
----------
|
|
158
|
-
seqx :
|
|
159
|
+
seqx : Dseqrecord
|
|
159
160
|
The first sequence
|
|
160
|
-
seqy :
|
|
161
|
+
seqy : Dseqrecord
|
|
161
162
|
The second sequence
|
|
162
163
|
enzymes : RestrictionBatch
|
|
163
164
|
The enzymes to use
|
|
@@ -211,7 +212,7 @@ def restriction_ligation_overlap(
|
|
|
211
212
|
# if not seqy.circular:
|
|
212
213
|
# cuts_y.append(((0, 0), None))
|
|
213
214
|
matches = list()
|
|
214
|
-
for cut_x, cut_y in
|
|
215
|
+
for cut_x, cut_y in itertools.product(cuts_x, cuts_y):
|
|
215
216
|
# A blunt end
|
|
216
217
|
if allow_blunt and cut_x[0][1] == cut_y[0][1] == 0:
|
|
217
218
|
matches.append((cut_x[0][0], cut_y[0][0], 0))
|
|
@@ -255,7 +256,7 @@ def combine_algorithms(*algorithms: AssemblyAlgorithmType) -> AssemblyAlgorithmT
|
|
|
255
256
|
|
|
256
257
|
|
|
257
258
|
def blunt_overlap(
|
|
258
|
-
seqx:
|
|
259
|
+
seqx: Dseqrecord, seqy: Dseqrecord, limit=None
|
|
259
260
|
) -> list[SequenceOverlap]:
|
|
260
261
|
"""
|
|
261
262
|
Assembly algorithm to find blunt overlaps. Used for blunt ligation.
|
|
@@ -265,9 +266,9 @@ def blunt_overlap(
|
|
|
265
266
|
|
|
266
267
|
Parameters
|
|
267
268
|
----------
|
|
268
|
-
seqx :
|
|
269
|
+
seqx : Dseqrecord
|
|
269
270
|
The first sequence
|
|
270
|
-
seqy :
|
|
271
|
+
seqy : Dseqrecord
|
|
271
272
|
The second sequence
|
|
272
273
|
limit : int
|
|
273
274
|
There for compatibility, but it is ignored
|
|
@@ -293,7 +294,7 @@ def blunt_overlap(
|
|
|
293
294
|
|
|
294
295
|
|
|
295
296
|
def common_sub_strings(
|
|
296
|
-
seqx:
|
|
297
|
+
seqx: Dseqrecord, seqy: Dseqrecord, limit=25
|
|
297
298
|
) -> list[SequenceOverlap]:
|
|
298
299
|
"""
|
|
299
300
|
Assembly algorithm to find common substrings of length == limit. See the docs of
|
|
@@ -356,7 +357,18 @@ def common_sub_strings(
|
|
|
356
357
|
return [r for r in results if r not in shifted_matches]
|
|
357
358
|
|
|
358
359
|
|
|
359
|
-
def
|
|
360
|
+
def _get_trim_end_info(
|
|
361
|
+
end_info: tuple[str, str], trim_ends: str, is_five_prime: bool
|
|
362
|
+
) -> int | None:
|
|
363
|
+
"""Utility function to get the trim information for terminal_overlap."""
|
|
364
|
+
if end_info[0] == trim_ends:
|
|
365
|
+
return len(end_info[1]) if is_five_prime else len(end_info[1]) * -1
|
|
366
|
+
return 0 if is_five_prime else None
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def terminal_overlap(
|
|
370
|
+
seqx: Dseqrecord, seqy: Dseqrecord, limit=25, trim_ends: None | str = None
|
|
371
|
+
):
|
|
360
372
|
"""
|
|
361
373
|
Assembly algorithm to find terminal overlaps (e.g. for Gibson assembly).
|
|
362
374
|
The order matters, we want alignments like:
|
|
@@ -375,12 +387,15 @@ def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
|
|
|
375
387
|
|
|
376
388
|
Parameters
|
|
377
389
|
----------
|
|
378
|
-
seqx :
|
|
390
|
+
seqx : Dseqrecord
|
|
379
391
|
The first sequence
|
|
380
|
-
seqy :
|
|
392
|
+
seqy : Dseqrecord
|
|
381
393
|
The second sequence
|
|
382
394
|
limit : int
|
|
383
395
|
Minimum length of the overlap
|
|
396
|
+
trim_ends : str
|
|
397
|
+
The ends to trim, either "5'" or "3'"
|
|
398
|
+
If None, no trimming is done
|
|
384
399
|
|
|
385
400
|
Returns
|
|
386
401
|
-------
|
|
@@ -388,32 +403,64 @@ def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
|
|
|
388
403
|
A list of overlaps between the two sequences
|
|
389
404
|
|
|
390
405
|
>>> from pydna.dseqrecord import Dseqrecord
|
|
391
|
-
>>> from pydna.assembly2 import
|
|
406
|
+
>>> from pydna.assembly2 import terminal_overlap
|
|
392
407
|
>>> x = Dseqrecord("ttactaAAAAAA")
|
|
393
408
|
>>> y = Dseqrecord("AAAAAAcgcacg")
|
|
394
|
-
>>>
|
|
409
|
+
>>> terminal_overlap(x, y, limit=5)
|
|
395
410
|
[(6, 0, 6), (7, 0, 5)]
|
|
396
|
-
>>>
|
|
411
|
+
>>> terminal_overlap(y, x, limit=5)
|
|
412
|
+
[]
|
|
413
|
+
|
|
414
|
+
Trimming the ends:
|
|
415
|
+
>>> from pydna.dseq import Dseq
|
|
416
|
+
>>> from pydna.dseqrecord import Dseqrecord
|
|
417
|
+
>>> from pydna.assembly2 import terminal_overlap
|
|
418
|
+
>>> x = Dseqrecord(Dseq.from_full_sequence_and_overhangs("aaaACGT", 0, 3))
|
|
419
|
+
>>> y = Dseqrecord(Dseq.from_full_sequence_and_overhangs("ACGTccc", 3, 0))
|
|
420
|
+
>>> terminal_overlap(x, y, limit=4)
|
|
421
|
+
[(3, 0, 4)]
|
|
422
|
+
>>> terminal_overlap(x, y, limit=4, trim_ends="5'")
|
|
423
|
+
[(3, 0, 4)]
|
|
424
|
+
>>> terminal_overlap(x, y, limit=4, trim_ends="3'")
|
|
397
425
|
[]
|
|
398
426
|
"""
|
|
399
427
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
428
|
+
if trim_ends is not None and trim_ends not in ["5'", "3'"]:
|
|
429
|
+
raise ValueError("trim_ends must be '5' or '3'")
|
|
430
|
+
|
|
431
|
+
if trim_ends is None:
|
|
432
|
+
trim_x_left, trim_x_right, trim_y_left, trim_y_right = (0, None, 0, None)
|
|
433
|
+
stringx = str(seqx.seq).upper()
|
|
434
|
+
stringy = str(seqy.seq).upper()
|
|
435
|
+
else:
|
|
436
|
+
trim_x_right = _get_trim_end_info(
|
|
437
|
+
seqx.seq.three_prime_end(), trim_ends, is_five_prime=False
|
|
438
|
+
)
|
|
439
|
+
trim_y_left = _get_trim_end_info(
|
|
440
|
+
seqy.seq.five_prime_end(), trim_ends, is_five_prime=True
|
|
441
|
+
)
|
|
442
|
+
|
|
443
|
+
# I actually don't think these two are needed, since only the terminal
|
|
444
|
+
# join between x_right and y_left is tested, but maybe there is some edge-case
|
|
445
|
+
# that I am missing, so keeping them just in case.
|
|
446
|
+
trim_x_left = _get_trim_end_info(
|
|
447
|
+
seqx.seq.five_prime_end(), trim_ends, is_five_prime=True
|
|
448
|
+
)
|
|
449
|
+
trim_y_right = _get_trim_end_info(
|
|
450
|
+
seqy.seq.three_prime_end(), trim_ends, is_five_prime=False
|
|
451
|
+
)
|
|
452
|
+
|
|
453
|
+
stringx = str(seqx.seq[trim_x_left:trim_x_right]).upper()
|
|
454
|
+
stringy = str(seqy.seq[trim_y_left:trim_y_right]).upper()
|
|
455
|
+
|
|
411
456
|
# We have to convert to list because we need to modify the matches
|
|
412
457
|
matches = [
|
|
413
458
|
list(m)
|
|
414
459
|
for m in common_sub_strings_str(stringx, stringy, limit)
|
|
415
460
|
if (m[1] == 0 and m[0] + m[2] == len(stringx))
|
|
416
461
|
]
|
|
462
|
+
|
|
463
|
+
# Shift the matches if the left end has been trimmed
|
|
417
464
|
for match in matches:
|
|
418
465
|
match[0] += trim_x_left
|
|
419
466
|
match[1] += trim_y_left
|
|
@@ -422,7 +469,32 @@ def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
|
|
|
422
469
|
return [tuple(m) for m in matches]
|
|
423
470
|
|
|
424
471
|
|
|
425
|
-
def
|
|
472
|
+
def gibson_overlap(seqx: Dseqrecord, seqy: Dseqrecord, limit=25):
|
|
473
|
+
"""
|
|
474
|
+
Assembly algorithm to find terminal overlaps for Gibson assembly.
|
|
475
|
+
It is a wrapper around terminal_overlap with trim_ends="5'".
|
|
476
|
+
"""
|
|
477
|
+
|
|
478
|
+
return terminal_overlap(seqx, seqy, limit, trim_ends="5'")
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
def in_fusion_overlap(seqx: Dseqrecord, seqy: Dseqrecord, limit=25):
|
|
482
|
+
"""
|
|
483
|
+
Assembly algorithm to find terminal overlaps for in-fusion assembly.
|
|
484
|
+
It is a wrapper around terminal_overlap with trim_ends="3'".
|
|
485
|
+
"""
|
|
486
|
+
return terminal_overlap(seqx, seqy, limit, trim_ends="3'")
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
def pcr_fusion_overlap(seqx: Dseqrecord, seqy: Dseqrecord, limit=25):
|
|
490
|
+
"""
|
|
491
|
+
Assembly algorithm to find terminal overlaps for PCR fusion assembly.
|
|
492
|
+
It is a wrapper around terminal_overlap with trim_ends=None.
|
|
493
|
+
"""
|
|
494
|
+
return terminal_overlap(seqx, seqy, limit, trim_ends=None)
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def sticky_end_sub_strings(seqx: Dseqrecord, seqy: Dseqrecord, limit: bool = False):
|
|
426
498
|
"""
|
|
427
499
|
Assembly algorithm for ligation of sticky ends.
|
|
428
500
|
|
|
@@ -431,9 +503,9 @@ def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit: bool = F
|
|
|
431
503
|
|
|
432
504
|
Parameters
|
|
433
505
|
----------
|
|
434
|
-
seqx :
|
|
506
|
+
seqx : Dseqrecord
|
|
435
507
|
The first sequence
|
|
436
|
-
seqy :
|
|
508
|
+
seqy : Dseqrecord
|
|
437
509
|
The second sequence
|
|
438
510
|
limit : bool
|
|
439
511
|
Whether to allow partial overlaps
|
|
@@ -466,6 +538,7 @@ def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit: bool = F
|
|
|
466
538
|
[(4, 0, 2)]
|
|
467
539
|
|
|
468
540
|
"""
|
|
541
|
+
|
|
469
542
|
overlap = sum_is_sticky(
|
|
470
543
|
seqx.seq.three_prime_end(), seqy.seq.five_prime_end(), limit
|
|
471
544
|
)
|
|
@@ -475,7 +548,7 @@ def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit: bool = F
|
|
|
475
548
|
|
|
476
549
|
|
|
477
550
|
def zip_match_leftwards(
|
|
478
|
-
seqx:
|
|
551
|
+
seqx: SeqRecord, seqy: SeqRecord, match: SequenceOverlap
|
|
479
552
|
) -> SequenceOverlap:
|
|
480
553
|
"""
|
|
481
554
|
Starting from the rightmost edge of the match, return a new match encompassing the max
|
|
@@ -483,15 +556,15 @@ def zip_match_leftwards(
|
|
|
483
556
|
than the limit or a shorter match if there are mismatches. This is convenient to maintain
|
|
484
557
|
as many features as possible. It is used in PCR assembly.
|
|
485
558
|
|
|
486
|
-
>>> seq =
|
|
487
|
-
>>> primer =
|
|
559
|
+
>>> seq = Dseqrecord('AAAAACGTCCCGT')
|
|
560
|
+
>>> primer = Dseqrecord('ACGTCCCGT')
|
|
488
561
|
>>> match = (13, 9, 0) # an empty match at the end of each
|
|
489
562
|
>>> zip_match_leftwards(seq, primer, match)
|
|
490
563
|
(4, 0, 9)
|
|
491
564
|
|
|
492
565
|
Works in circular molecules if the match spans the origin:
|
|
493
|
-
>>> seq =
|
|
494
|
-
>>> primer =
|
|
566
|
+
>>> seq = Dseqrecord('TCCCGTAAAAACG', circular=True)
|
|
567
|
+
>>> primer = Dseqrecord('ACGTCCCGT')
|
|
495
568
|
>>> match = (6, 9, 0)
|
|
496
569
|
>>> zip_match_leftwards(seq, primer, match)
|
|
497
570
|
(10, 0, 9)
|
|
@@ -512,11 +585,11 @@ def zip_match_leftwards(
|
|
|
512
585
|
# For those cases we shift by length, then go back
|
|
513
586
|
|
|
514
587
|
end_on_x = match[0] + match[2]
|
|
515
|
-
if isinstance(seqx,
|
|
588
|
+
if isinstance(seqx, Dseqrecord) and seqx.circular and end_on_x <= len(seqx):
|
|
516
589
|
end_on_x += len(seqx)
|
|
517
590
|
|
|
518
591
|
end_on_y = match[1] + match[2]
|
|
519
|
-
if isinstance(seqy,
|
|
592
|
+
if isinstance(seqy, Dseqrecord) and seqy.circular and end_on_y <= len(seqy):
|
|
520
593
|
end_on_y += len(seqy)
|
|
521
594
|
|
|
522
595
|
count = 0
|
|
@@ -533,7 +606,7 @@ def zip_match_leftwards(
|
|
|
533
606
|
|
|
534
607
|
|
|
535
608
|
def zip_match_rightwards(
|
|
536
|
-
seqx:
|
|
609
|
+
seqx: Dseqrecord, seqy: Dseqrecord, match: SequenceOverlap
|
|
537
610
|
) -> SequenceOverlap:
|
|
538
611
|
"""Same as zip_match_leftwards, but towards the right."""
|
|
539
612
|
|
|
@@ -549,19 +622,19 @@ def zip_match_rightwards(
|
|
|
549
622
|
return (start_on_x, start_on_y, count)
|
|
550
623
|
|
|
551
624
|
|
|
552
|
-
def seqrecord2_uppercase_DNA_string(seqr:
|
|
625
|
+
def seqrecord2_uppercase_DNA_string(seqr: SeqRecord) -> str:
|
|
553
626
|
"""
|
|
554
627
|
Transform a Dseqrecord to a sequence string where U is replaced by T, everything is upper case and
|
|
555
628
|
circular sequences are repeated twice. This is used for PCR, to support primers with U's (e.g. for USER cloning).
|
|
556
629
|
"""
|
|
557
630
|
out = str(seqr.seq).upper().replace("U", "T")
|
|
558
|
-
if isinstance(seqr,
|
|
631
|
+
if isinstance(seqr, Dseqrecord) and seqr.circular:
|
|
559
632
|
return out * 2
|
|
560
633
|
return out
|
|
561
634
|
|
|
562
635
|
|
|
563
636
|
def primer_template_overlap(
|
|
564
|
-
seqx:
|
|
637
|
+
seqx: Dseqrecord | Primer, seqy: Dseqrecord | Primer, limit=25, mismatches=0
|
|
565
638
|
) -> list[SequenceOverlap]:
|
|
566
639
|
"""
|
|
567
640
|
Assembly algorithm to find overlaps between a primer and a template. It accepts mismatches.
|
|
@@ -573,9 +646,9 @@ def primer_template_overlap(
|
|
|
573
646
|
|
|
574
647
|
Parameters
|
|
575
648
|
----------
|
|
576
|
-
seqx :
|
|
649
|
+
seqx : Dseqrecord | Primer
|
|
577
650
|
The primer
|
|
578
|
-
seqy :
|
|
651
|
+
seqy : Dseqrecord | Primer
|
|
579
652
|
The template
|
|
580
653
|
limit : int
|
|
581
654
|
Minimum length of the overlap
|
|
@@ -604,11 +677,11 @@ def primer_template_overlap(
|
|
|
604
677
|
[]
|
|
605
678
|
"""
|
|
606
679
|
|
|
607
|
-
if isinstance(seqx,
|
|
680
|
+
if isinstance(seqx, Primer) and isinstance(seqy, Dseqrecord):
|
|
608
681
|
primer = seqx
|
|
609
682
|
template = seqy
|
|
610
683
|
reverse_primer = False
|
|
611
|
-
elif isinstance(seqx,
|
|
684
|
+
elif isinstance(seqx, Dseqrecord) and isinstance(seqy, Primer):
|
|
612
685
|
primer = seqy
|
|
613
686
|
template = seqx
|
|
614
687
|
reverse_primer = True
|
|
@@ -662,45 +735,8 @@ def primer_template_overlap(
|
|
|
662
735
|
return list(sorted(out))
|
|
663
736
|
|
|
664
737
|
|
|
665
|
-
def fill_left(seq: _Dseq) -> _Dseq:
|
|
666
|
-
"""Fill the left overhang of a sequence with the complementary sequence."""
|
|
667
|
-
new_watson = seq.watson
|
|
668
|
-
new_crick = seq.crick
|
|
669
|
-
|
|
670
|
-
# Watson 5' overhang
|
|
671
|
-
if seq.ovhg < 0:
|
|
672
|
-
new_crick = new_crick + reverse_complement(seq.watson[: -seq.ovhg])
|
|
673
|
-
# Crick 5' overhang
|
|
674
|
-
elif seq.ovhg > 0:
|
|
675
|
-
new_watson = reverse_complement(seq.crick[-seq.ovhg :]) + new_watson
|
|
676
|
-
|
|
677
|
-
return _Dseq(new_watson, new_crick, 0)
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
def fill_right(seq: _Dseq) -> _Dseq:
|
|
681
|
-
"""Fill the right overhang of a sequence with the complementary sequence."""
|
|
682
|
-
new_watson = seq.watson
|
|
683
|
-
new_crick = seq.crick
|
|
684
|
-
|
|
685
|
-
# Watson 3' overhang
|
|
686
|
-
watson_ovhg = seq.watson_ovhg()
|
|
687
|
-
if watson_ovhg < 0:
|
|
688
|
-
new_watson = new_watson + reverse_complement(seq.crick[:-watson_ovhg])
|
|
689
|
-
|
|
690
|
-
# Crick 3' overhang
|
|
691
|
-
elif watson_ovhg > 0:
|
|
692
|
-
new_crick = reverse_complement(seq.watson[-watson_ovhg:]) + new_crick
|
|
693
|
-
|
|
694
|
-
return _Dseq(new_watson, new_crick, seq.ovhg)
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
def fill_dseq(seq: _Dseq) -> _Dseq:
|
|
698
|
-
"""Fill the overhangs of a sequence with the complementary sequence."""
|
|
699
|
-
return fill_left(fill_right(seq))
|
|
700
|
-
|
|
701
|
-
|
|
702
738
|
def reverse_complement_assembly(
|
|
703
|
-
assembly: EdgeRepresentationAssembly, fragments: list[
|
|
739
|
+
assembly: EdgeRepresentationAssembly, fragments: list[Dseqrecord]
|
|
704
740
|
) -> EdgeRepresentationAssembly:
|
|
705
741
|
"""Complement an assembly, i.e. reverse the order of the fragments and the orientation of the overlaps."""
|
|
706
742
|
new_assembly = list()
|
|
@@ -714,7 +750,7 @@ def reverse_complement_assembly(
|
|
|
714
750
|
def filter_linear_subassemblies(
|
|
715
751
|
linear_assemblies: list[EdgeRepresentationAssembly],
|
|
716
752
|
circular_assemblies: list[EdgeRepresentationAssembly],
|
|
717
|
-
fragments: list[
|
|
753
|
+
fragments: list[Dseqrecord],
|
|
718
754
|
) -> list[EdgeRepresentationAssembly]:
|
|
719
755
|
"""Remove linear assemblies which are sub-assemblies of circular assemblies"""
|
|
720
756
|
all_circular_assemblies = circular_assemblies + [
|
|
@@ -773,7 +809,7 @@ def assembly2str_tuple(assembly: EdgeRepresentationAssembly) -> str:
|
|
|
773
809
|
|
|
774
810
|
|
|
775
811
|
def assembly_has_mismatches(
|
|
776
|
-
fragments: list[
|
|
812
|
+
fragments: list[Dseqrecord], assembly: EdgeRepresentationAssembly
|
|
777
813
|
) -> bool:
|
|
778
814
|
"""Check if an assembly has mismatches. This should never happen and if so it returns an error."""
|
|
779
815
|
for u, v, loc_u, loc_v in assembly:
|
|
@@ -789,7 +825,7 @@ def assembly_has_mismatches(
|
|
|
789
825
|
|
|
790
826
|
|
|
791
827
|
def assembly_is_circular(
|
|
792
|
-
assembly: EdgeRepresentationAssembly, fragments: list[
|
|
828
|
+
assembly: EdgeRepresentationAssembly, fragments: list[Dseqrecord]
|
|
793
829
|
) -> bool:
|
|
794
830
|
"""
|
|
795
831
|
Based on the topology of the locations of an assembly, determine if it is circular.
|
|
@@ -798,22 +834,22 @@ def assembly_is_circular(
|
|
|
798
834
|
if assembly[0][0] != assembly[-1][1]:
|
|
799
835
|
return False
|
|
800
836
|
elif (
|
|
801
|
-
isinstance(fragments[abs(assembly[0][0]) - 1],
|
|
837
|
+
isinstance(fragments[abs(assembly[0][0]) - 1], Dseqrecord)
|
|
802
838
|
and fragments[abs(assembly[0][0]) - 1].circular
|
|
803
839
|
):
|
|
804
840
|
return True
|
|
805
841
|
else:
|
|
806
842
|
return (
|
|
807
|
-
|
|
808
|
-
>
|
|
843
|
+
location_boundaries(assembly[0][2])[0]
|
|
844
|
+
> location_boundaries(assembly[-1][3])[0]
|
|
809
845
|
)
|
|
810
846
|
|
|
811
847
|
|
|
812
848
|
def assemble(
|
|
813
|
-
fragments: list[
|
|
849
|
+
fragments: list[Dseqrecord],
|
|
814
850
|
assembly: EdgeRepresentationAssembly,
|
|
815
851
|
is_insertion: bool = False,
|
|
816
|
-
) ->
|
|
852
|
+
) -> Dseqrecord:
|
|
817
853
|
"""Generate a Dseqrecord from an assembly and a list of fragments."""
|
|
818
854
|
|
|
819
855
|
if is_insertion:
|
|
@@ -830,14 +866,15 @@ def assemble(
|
|
|
830
866
|
u, v, loc_u, loc_v = asm_edge
|
|
831
867
|
f_u = fragments[u - 1] if u > 0 else fragments[-u - 1].reverse_complement()
|
|
832
868
|
f_v = fragments[v - 1] if v > 0 else fragments[-v - 1].reverse_complement()
|
|
833
|
-
seq_u = str(loc_u.extract(f_u).seq)
|
|
834
|
-
seq_v = str(loc_v.extract(f_v).seq
|
|
835
|
-
if seq_u
|
|
869
|
+
seq_u = str(loc_u.extract(f_u).seq)
|
|
870
|
+
seq_v = str(loc_v.extract(f_v).seq.rc())
|
|
871
|
+
# Test if seq_u and seq_v anneal
|
|
872
|
+
if not anneal_strands(seq_u, seq_v):
|
|
836
873
|
raise ValueError("Mismatch in assembly")
|
|
837
874
|
|
|
838
875
|
# We transform into Dseqrecords (for primers)
|
|
839
876
|
dseqr_fragments = [
|
|
840
|
-
f if isinstance(f,
|
|
877
|
+
f if isinstance(f, Dseqrecord) else Dseqrecord(f) for f in fragments
|
|
841
878
|
]
|
|
842
879
|
subfragments = get_assembly_subfragments(
|
|
843
880
|
dseqr_fragments, subfragment_representation
|
|
@@ -845,42 +882,23 @@ def assemble(
|
|
|
845
882
|
|
|
846
883
|
# Length of the overlaps between consecutive assembly fragments
|
|
847
884
|
fragment_overlaps = [len(e[-1]) for e in assembly]
|
|
885
|
+
out_dseqrecord = subfragments.pop(0)
|
|
848
886
|
|
|
849
|
-
|
|
887
|
+
for fragment, overlap in zip(subfragments, fragment_overlaps):
|
|
888
|
+
out_dseqrecord.seq = out_dseqrecord.seq.cast_to_ds_right()
|
|
889
|
+
out_dseqrecord.seq = out_dseqrecord.seq.exo1_end(overlap)
|
|
890
|
+
fragment.seq = fragment.seq.cast_to_ds_left()
|
|
891
|
+
fragment.seq = fragment.seq.exo1_front(overlap)
|
|
892
|
+
out_dseqrecord += fragment
|
|
850
893
|
|
|
851
|
-
|
|
852
|
-
# Shift the features of the right fragment to the left by ``overlap``
|
|
853
|
-
new_features = [
|
|
854
|
-
f._shift(len(out_dseqrecord) - overlap) for f in fragment.features
|
|
855
|
-
]
|
|
856
|
-
# Join the left sequence including the overlap with the right sequence without the overlap
|
|
857
|
-
# we use fill_right / fill_left so that it works for ligation of sticky ends
|
|
858
|
-
out_dseqrecord = _Dseqrecord(
|
|
859
|
-
fill_right(out_dseqrecord.seq) + fill_left(fragment.seq)[overlap:],
|
|
860
|
-
features=out_dseqrecord.features + new_features,
|
|
861
|
-
)
|
|
862
|
-
|
|
863
|
-
# For circular assemblies, close the loop and wrap origin-spanning features
|
|
894
|
+
# For circular assemblies, process the fragment and loop
|
|
864
895
|
if is_circular:
|
|
896
|
+
out_dseqrecord.seq = out_dseqrecord.seq.cast_to_ds_left()
|
|
897
|
+
out_dseqrecord.seq = out_dseqrecord.seq.cast_to_ds_right()
|
|
865
898
|
overlap = fragment_overlaps[-1]
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
out_dseqrecord = out_dseqrecord.looped()
|
|
870
|
-
else:
|
|
871
|
-
# Remove trailing overlap
|
|
872
|
-
out_dseqrecord = _Dseqrecord(
|
|
873
|
-
fill_dseq(out_dseqrecord.seq)[:-overlap],
|
|
874
|
-
features=out_dseqrecord.features,
|
|
875
|
-
circular=True,
|
|
876
|
-
)
|
|
877
|
-
for feature in out_dseqrecord.features:
|
|
878
|
-
start, end = _location_boundaries(feature.location)
|
|
879
|
-
if start >= len(out_dseqrecord) or end > len(out_dseqrecord):
|
|
880
|
-
# Wrap around the origin
|
|
881
|
-
feature.location = _shift_location(
|
|
882
|
-
feature.location, 0, len(out_dseqrecord)
|
|
883
|
-
)
|
|
899
|
+
out_dseqrecord.seq = out_dseqrecord.seq.exo1_front(overlap)
|
|
900
|
+
out_dseqrecord.seq = out_dseqrecord.seq.exo1_end(overlap)
|
|
901
|
+
out_dseqrecord = out_dseqrecord.looped()
|
|
884
902
|
|
|
885
903
|
out_dseqrecord.source = AssemblySource.from_subfragment_representation(
|
|
886
904
|
subfragment_representation, fragments, is_circular
|
|
@@ -889,8 +907,8 @@ def assemble(
|
|
|
889
907
|
|
|
890
908
|
|
|
891
909
|
def annotate_primer_binding_sites(
|
|
892
|
-
input_dseqr:
|
|
893
|
-
) ->
|
|
910
|
+
input_dseqr: Dseqrecord, fragments: list[Dseqrecord]
|
|
911
|
+
) -> Dseqrecord:
|
|
894
912
|
"""Annotate the primer binding sites in a Dseqrecord."""
|
|
895
913
|
fwd, _, rvs = fragments
|
|
896
914
|
start_rvs = len(input_dseqr) - len(rvs)
|
|
@@ -970,9 +988,9 @@ def subfragment_representation2edge_representation(
|
|
|
970
988
|
|
|
971
989
|
|
|
972
990
|
def get_assembly_subfragments(
|
|
973
|
-
fragments: list[
|
|
991
|
+
fragments: list[Dseqrecord],
|
|
974
992
|
subfragment_representation: SubFragmentRepresentationAssembly,
|
|
975
|
-
) -> list[
|
|
993
|
+
) -> list[Dseqrecord]:
|
|
976
994
|
"""From the fragment representation returned by edge_representation2subfragment_representation, get the subfragments that are joined together.
|
|
977
995
|
|
|
978
996
|
Subfragments are the slices of the fragments that are joined together
|
|
@@ -1013,19 +1031,26 @@ def get_assembly_subfragments(
|
|
|
1013
1031
|
|
|
1014
1032
|
|
|
1015
1033
|
def extract_subfragment(
|
|
1016
|
-
seq:
|
|
1017
|
-
) ->
|
|
1034
|
+
seq: Dseqrecord, start_location: Location | None, end_location: Location | None
|
|
1035
|
+
) -> Dseqrecord:
|
|
1018
1036
|
"""Extract a subfragment from a sequence for an assembly, given the start and end locations of the subfragment."""
|
|
1019
|
-
|
|
1020
|
-
|
|
1037
|
+
|
|
1038
|
+
if seq.circular and (start_location is None or end_location is None):
|
|
1039
|
+
raise ValueError(
|
|
1040
|
+
"Start and end locations cannot be None for circular sequences"
|
|
1041
|
+
)
|
|
1042
|
+
# This could be used to have consistent behaviour for circular sequences, where the start is arbitrary. However,
|
|
1043
|
+
# they should never get None, so this is not used.
|
|
1044
|
+
# if start_location is None:
|
|
1045
|
+
# start_location = end_location
|
|
1046
|
+
# elif end_location is None:
|
|
1047
|
+
# end_location = start_location
|
|
1048
|
+
|
|
1049
|
+
start = 0 if start_location is None else location_boundaries(start_location)[0]
|
|
1050
|
+
end = None if end_location is None else location_boundaries(end_location)[1]
|
|
1021
1051
|
|
|
1022
1052
|
# Special case, some of it could be handled by better Dseqrecord slicing in the future
|
|
1023
|
-
if (
|
|
1024
|
-
seq.circular
|
|
1025
|
-
and start_location is not None
|
|
1026
|
-
and end_location is not None
|
|
1027
|
-
and _locations_overlap(start_location, end_location, len(seq))
|
|
1028
|
-
):
|
|
1053
|
+
if seq.circular and locations_overlap(start_location, end_location, len(seq)):
|
|
1029
1054
|
# The overhang is different for origin-spanning features, for instance
|
|
1030
1055
|
# for a feature join{[12:13], [0:3]} in a sequence of length 13, the overhang
|
|
1031
1056
|
# is -4, not 9
|
|
@@ -1035,7 +1060,7 @@ def extract_subfragment(
|
|
|
1035
1060
|
ovhg = 0
|
|
1036
1061
|
dummy_cut = ((start, ovhg), None)
|
|
1037
1062
|
open_seq = seq.apply_cut(dummy_cut, dummy_cut)
|
|
1038
|
-
return
|
|
1063
|
+
return Dseqrecord(open_seq.seq.cast_to_ds(), features=open_seq.features)
|
|
1039
1064
|
|
|
1040
1065
|
return seq[start:end]
|
|
1041
1066
|
|
|
@@ -1178,14 +1203,15 @@ class Assembly:
|
|
|
1178
1203
|
|
|
1179
1204
|
def __init__(
|
|
1180
1205
|
self,
|
|
1181
|
-
frags: list[
|
|
1206
|
+
frags: list[Dseqrecord],
|
|
1182
1207
|
limit: int = 25,
|
|
1183
1208
|
algorithm: AssemblyAlgorithmType = common_sub_strings,
|
|
1184
1209
|
use_fragment_order: bool = True,
|
|
1185
1210
|
use_all_fragments: bool = False,
|
|
1186
1211
|
):
|
|
1212
|
+
|
|
1187
1213
|
# TODO: allow for the same fragment to be included more than once?
|
|
1188
|
-
self.G =
|
|
1214
|
+
self.G = nx.MultiDiGraph()
|
|
1189
1215
|
# Add positive and negative nodes for forward and reverse fragments
|
|
1190
1216
|
self.G.add_nodes_from((i + 1, {"seq": f}) for (i, f) in enumerate(frags))
|
|
1191
1217
|
self.G.add_nodes_from(
|
|
@@ -1193,12 +1219,12 @@ class Assembly:
|
|
|
1193
1219
|
)
|
|
1194
1220
|
|
|
1195
1221
|
# Iterate over all possible combinations of fragments
|
|
1196
|
-
fragment_pairs =
|
|
1222
|
+
fragment_pairs = itertools.combinations(
|
|
1197
1223
|
filter(lambda x: x > 0, self.G.nodes), 2
|
|
1198
1224
|
)
|
|
1199
1225
|
for i, j in fragment_pairs:
|
|
1200
1226
|
# All the relative orientations of the fragments in the pair
|
|
1201
|
-
for u, v in
|
|
1227
|
+
for u, v in itertools.product([i, -i], [j, -j]):
|
|
1202
1228
|
u_seq = self.G.nodes[u]["seq"]
|
|
1203
1229
|
v_seq = self.G.nodes[v]["seq"]
|
|
1204
1230
|
matches = algorithm(u_seq, v_seq, limit)
|
|
@@ -1216,7 +1242,7 @@ class Assembly:
|
|
|
1216
1242
|
@classmethod
|
|
1217
1243
|
def assembly_is_valid(
|
|
1218
1244
|
cls,
|
|
1219
|
-
fragments: list[
|
|
1245
|
+
fragments: list[Dseqrecord | Primer],
|
|
1220
1246
|
assembly: EdgeRepresentationAssembly,
|
|
1221
1247
|
is_circular: bool,
|
|
1222
1248
|
use_all_fragments: bool,
|
|
@@ -1232,6 +1258,23 @@ class Assembly:
|
|
|
1232
1258
|
if len(assembly) == 0:
|
|
1233
1259
|
return False
|
|
1234
1260
|
|
|
1261
|
+
# Topology check -> Circular sequences cannot be first or last in a linear assembly.
|
|
1262
|
+
# For example, let's imagine aACGTc (linear) and gACGTc (circular).
|
|
1263
|
+
# It should not be possible to join them into a linear assembly. It's similar if we
|
|
1264
|
+
# think of a restriction-ligation assembly, example: aGAATTCc (linear) and gGAATTCc
|
|
1265
|
+
# (circular).
|
|
1266
|
+
# A linear product can be generated where the circular molecule is cut open, and one end
|
|
1267
|
+
# it joins the linear molecule and on the other it's free, but for now it's not a
|
|
1268
|
+
# relevant product and it's excluded.
|
|
1269
|
+
first_fragment = fragments[abs(assembly[0][0]) - 1]
|
|
1270
|
+
last_fragment = fragments[abs(assembly[-1][1]) - 1]
|
|
1271
|
+
if not is_circular and (
|
|
1272
|
+
isinstance(first_fragment, Dseqrecord)
|
|
1273
|
+
and first_fragment.circular
|
|
1274
|
+
or (isinstance(last_fragment, Dseqrecord) and last_fragment.circular)
|
|
1275
|
+
):
|
|
1276
|
+
return False
|
|
1277
|
+
|
|
1235
1278
|
if use_all_fragments and len(fragments) != len(
|
|
1236
1279
|
set(flatten(map(abs, e[:2]) for e in assembly))
|
|
1237
1280
|
):
|
|
@@ -1269,8 +1312,8 @@ class Assembly:
|
|
|
1269
1312
|
# Incompatible as described in figure above
|
|
1270
1313
|
fragment = fragments[abs(v1) - 1]
|
|
1271
1314
|
if (
|
|
1272
|
-
isinstance(fragment,
|
|
1273
|
-
) and
|
|
1315
|
+
isinstance(fragment, Primer) or not fragment.circular
|
|
1316
|
+
) and location_boundaries(start_location)[1] >= location_boundaries(
|
|
1274
1317
|
end_location
|
|
1275
1318
|
)[
|
|
1276
1319
|
1
|
|
@@ -1294,8 +1337,8 @@ class Assembly:
|
|
|
1294
1337
|
match: SequenceOverlap,
|
|
1295
1338
|
u: int,
|
|
1296
1339
|
v: int,
|
|
1297
|
-
first:
|
|
1298
|
-
secnd:
|
|
1340
|
+
first: Dseqrecord,
|
|
1341
|
+
secnd: Dseqrecord,
|
|
1299
1342
|
):
|
|
1300
1343
|
"""Add edges to the graph from a match returned by the ``algorithm`` function (see pydna.common_substrings). For
|
|
1301
1344
|
format of edges (see documentation of the Assembly class).
|
|
@@ -1314,10 +1357,10 @@ class Assembly:
|
|
|
1314
1357
|
else:
|
|
1315
1358
|
# We use shift_location with 0 to wrap origin-spanning features
|
|
1316
1359
|
locs = [
|
|
1317
|
-
|
|
1360
|
+
shift_location(
|
|
1318
1361
|
SimpleLocation(x_start, x_start + length), 0, len(first)
|
|
1319
1362
|
),
|
|
1320
|
-
|
|
1363
|
+
shift_location(
|
|
1321
1364
|
SimpleLocation(y_start, y_start + length), 0, len(secnd)
|
|
1322
1365
|
),
|
|
1323
1366
|
]
|
|
@@ -1352,7 +1395,7 @@ class Assembly:
|
|
|
1352
1395
|
"""
|
|
1353
1396
|
|
|
1354
1397
|
# Copy the graph since we will add the begin and end mock nodes
|
|
1355
|
-
G =
|
|
1398
|
+
G = nx.MultiDiGraph(self.G)
|
|
1356
1399
|
G.add_nodes_from(["begin", "end"])
|
|
1357
1400
|
|
|
1358
1401
|
if self.use_fragment_order:
|
|
@@ -1390,7 +1433,7 @@ class Assembly:
|
|
|
1390
1433
|
def node_path2assembly_list(
|
|
1391
1434
|
self, cycle: list[int], circular: bool
|
|
1392
1435
|
) -> list[EdgeRepresentationAssembly]:
|
|
1393
|
-
"""Convert a node path in the format [1, 2, 3] (as returned by
|
|
1436
|
+
"""Convert a node path in the format [1, 2, 3] (as returned by networkx.cycles.simple_cycles) to a list of all
|
|
1394
1437
|
possible assemblies.
|
|
1395
1438
|
|
|
1396
1439
|
There may be multiple assemblies for a given node path, if there are several edges connecting two nodes,
|
|
@@ -1404,11 +1447,11 @@ class Assembly:
|
|
|
1404
1447
|
combine.append([(u, v, key) for key in self.G[u][v]])
|
|
1405
1448
|
return [
|
|
1406
1449
|
tuple(map(self.format_assembly_edge, x))
|
|
1407
|
-
for x in
|
|
1450
|
+
for x in itertools.product(*combine)
|
|
1408
1451
|
]
|
|
1409
1452
|
|
|
1410
1453
|
def get_unique_linear_paths(
|
|
1411
|
-
self, G_with_begin_end:
|
|
1454
|
+
self, G_with_begin_end: nx.MultiDiGraph, max_paths=10000
|
|
1412
1455
|
) -> list[list[int]]:
|
|
1413
1456
|
"""Get unique linear paths from the graph, removing those that contain the same node twice."""
|
|
1414
1457
|
# We remove the begin and end nodes, and get all paths without edges
|
|
@@ -1419,8 +1462,8 @@ class Assembly:
|
|
|
1419
1462
|
node_paths = [
|
|
1420
1463
|
x[1:-1]
|
|
1421
1464
|
for x in limit_iterator(
|
|
1422
|
-
|
|
1423
|
-
|
|
1465
|
+
nx.all_simple_paths(
|
|
1466
|
+
nx.DiGraph(G_with_begin_end),
|
|
1424
1467
|
"begin",
|
|
1425
1468
|
"end",
|
|
1426
1469
|
cutoff=(len(self.fragments) + 1),
|
|
@@ -1469,7 +1512,7 @@ class Assembly:
|
|
|
1469
1512
|
sorted_cycles = map(
|
|
1470
1513
|
circular_permutation_min_abs,
|
|
1471
1514
|
limit_iterator(
|
|
1472
|
-
|
|
1515
|
+
nx.cycles.simple_cycles(self.G, length_bound=len(self.fragments)),
|
|
1473
1516
|
10000,
|
|
1474
1517
|
),
|
|
1475
1518
|
)
|
|
@@ -1534,8 +1577,8 @@ class Assembly:
|
|
|
1534
1577
|
fragment = self.fragments[abs(v1) - 1]
|
|
1535
1578
|
# Find the pair of edges that should be last and first ((3, 1, [8:10], [9:11)]), (1, 2, [4:6], [0:2]) in
|
|
1536
1579
|
# the example above. Only one of the pairs of edges should satisfy this condition for the topology to make sense.
|
|
1537
|
-
left_of_insertion =
|
|
1538
|
-
right_of_insertion =
|
|
1580
|
+
left_of_insertion = location_boundaries(start_location)[0]
|
|
1581
|
+
right_of_insertion = location_boundaries(end_location)[0]
|
|
1539
1582
|
if not fragment.circular and (
|
|
1540
1583
|
right_of_insertion >= left_of_insertion
|
|
1541
1584
|
# The below condition is for single-site integration.
|
|
@@ -1547,7 +1590,7 @@ class Assembly:
|
|
|
1547
1590
|
#
|
|
1548
1591
|
# The locations of homology on the genome are [0:10] and [2:12], so not identical
|
|
1549
1592
|
# but they overlap.
|
|
1550
|
-
or
|
|
1593
|
+
or locations_overlap(start_location, end_location, len(fragment))
|
|
1551
1594
|
):
|
|
1552
1595
|
edge_pair_index.append(i)
|
|
1553
1596
|
|
|
@@ -1578,13 +1621,13 @@ class Assembly:
|
|
|
1578
1621
|
fragment1 = self.fragments[abs(f1) - 1]
|
|
1579
1622
|
fragment2 = self.fragments[abs(f2) - 1]
|
|
1580
1623
|
|
|
1581
|
-
if not
|
|
1624
|
+
if not locations_overlap(
|
|
1582
1625
|
loc_f1_1, loc_f1_2, len(fragment1)
|
|
1583
|
-
) or not
|
|
1626
|
+
) or not locations_overlap(loc_f2_2, loc_f2_1, len(fragment2)):
|
|
1584
1627
|
return same_assembly
|
|
1585
1628
|
|
|
1586
1629
|
# Sort to make compatible with insertion assembly
|
|
1587
|
-
if
|
|
1630
|
+
if location_boundaries(loc_f1_1)[0] > location_boundaries(loc_f1_2)[0]:
|
|
1588
1631
|
new_assembly = same_assembly[::-1]
|
|
1589
1632
|
else:
|
|
1590
1633
|
new_assembly = same_assembly[:]
|
|
@@ -1597,17 +1640,18 @@ class Assembly:
|
|
|
1597
1640
|
fragment2 = self.fragments[abs(f2) - 1]
|
|
1598
1641
|
|
|
1599
1642
|
# Extract boundaries
|
|
1600
|
-
f2_1_start, _ =
|
|
1601
|
-
f2_2_start, f2_2_end =
|
|
1602
|
-
f1_1_start, _ =
|
|
1603
|
-
f1_2_start, f1_2_end =
|
|
1643
|
+
f2_1_start, _ = location_boundaries(loc_f2_1)
|
|
1644
|
+
f2_2_start, f2_2_end = location_boundaries(loc_f2_2)
|
|
1645
|
+
f1_1_start, _ = location_boundaries(loc_f1_1)
|
|
1646
|
+
f1_2_start, f1_2_end = location_boundaries(loc_f1_2)
|
|
1604
1647
|
|
|
1605
1648
|
overlap_diff = len(fragment1[f1_1_start:f1_2_end]) - len(
|
|
1606
1649
|
fragment2[f2_1_start:f2_2_end]
|
|
1607
1650
|
)
|
|
1608
1651
|
|
|
1609
|
-
|
|
1610
|
-
|
|
1652
|
+
# Safeguard
|
|
1653
|
+
if overlap_diff == 0: # pragma: no cover
|
|
1654
|
+
raise AssertionError("Overlap is 0")
|
|
1611
1655
|
|
|
1612
1656
|
if overlap_diff > 0:
|
|
1613
1657
|
new_loc_f1_1 = create_location(
|
|
@@ -1640,7 +1684,7 @@ class Assembly:
|
|
|
1640
1684
|
"only_adjacent_edges not implemented for insertion assemblies"
|
|
1641
1685
|
)
|
|
1642
1686
|
|
|
1643
|
-
cycles = limit_iterator(
|
|
1687
|
+
cycles = limit_iterator(nx.cycles.simple_cycles(self.G), 10000)
|
|
1644
1688
|
|
|
1645
1689
|
# We apply constrains already here because sometimes the combinatorial explosion is too large
|
|
1646
1690
|
if self.use_all_fragments:
|
|
@@ -1659,7 +1703,7 @@ class Assembly:
|
|
|
1659
1703
|
)
|
|
1660
1704
|
|
|
1661
1705
|
# We find cycles first
|
|
1662
|
-
iterator = limit_iterator(
|
|
1706
|
+
iterator = limit_iterator(nx.cycles.simple_cycles(self.G), 10000)
|
|
1663
1707
|
assemblies = sum(
|
|
1664
1708
|
map(lambda x: self.node_path2assembly_list(x, True), iterator), []
|
|
1665
1709
|
)
|
|
@@ -1683,21 +1727,19 @@ class Assembly:
|
|
|
1683
1727
|
|
|
1684
1728
|
def assemble_linear(
|
|
1685
1729
|
self, only_adjacent_edges: bool = False, max_assemblies: int = 50
|
|
1686
|
-
) -> list[
|
|
1730
|
+
) -> list[Dseqrecord]:
|
|
1687
1731
|
"""Assemble linear constructs, from assemblies returned by self.get_linear_assemblies."""
|
|
1688
1732
|
assemblies = self.get_linear_assemblies(only_adjacent_edges, max_assemblies)
|
|
1689
1733
|
return [assemble(self.fragments, a) for a in assemblies]
|
|
1690
1734
|
|
|
1691
1735
|
def assemble_circular(
|
|
1692
1736
|
self, only_adjacent_edges: bool = False, max_assemblies: int = 50
|
|
1693
|
-
) -> list[
|
|
1737
|
+
) -> list[Dseqrecord]:
|
|
1694
1738
|
"""Assemble circular constructs, from assemblies returned by self.get_circular_assemblies."""
|
|
1695
1739
|
assemblies = self.get_circular_assemblies(only_adjacent_edges, max_assemblies)
|
|
1696
1740
|
return [assemble(self.fragments, a) for a in assemblies]
|
|
1697
1741
|
|
|
1698
|
-
def assemble_insertion(
|
|
1699
|
-
self, only_adjacent_edges: bool = False
|
|
1700
|
-
) -> list[_Dseqrecord]:
|
|
1742
|
+
def assemble_insertion(self, only_adjacent_edges: bool = False) -> list[Dseqrecord]:
|
|
1701
1743
|
"""Assemble insertion constructs, from assemblies returned by self.get_insertion_assemblies."""
|
|
1702
1744
|
assemblies = self.get_insertion_assemblies(only_adjacent_edges)
|
|
1703
1745
|
return [assemble(self.fragments, a, is_insertion=True) for a in assemblies]
|
|
@@ -1739,10 +1781,10 @@ class Assembly:
|
|
|
1739
1781
|
if edge_location not in this_dict[key]:
|
|
1740
1782
|
this_dict[key].append(edge_location)
|
|
1741
1783
|
this_dict["left"] = sorted(
|
|
1742
|
-
this_dict["left"], key=lambda x:
|
|
1784
|
+
this_dict["left"], key=lambda x: location_boundaries(x)[0]
|
|
1743
1785
|
)
|
|
1744
1786
|
this_dict["right"] = sorted(
|
|
1745
|
-
this_dict["right"], key=lambda x:
|
|
1787
|
+
this_dict["right"], key=lambda x: location_boundaries(x)[0]
|
|
1746
1788
|
)
|
|
1747
1789
|
locations_on_fragments[node] = this_dict
|
|
1748
1790
|
|
|
@@ -1789,7 +1831,7 @@ class Assembly:
|
|
|
1789
1831
|
|
|
1790
1832
|
pairs = list()
|
|
1791
1833
|
for pair in zip(left, right):
|
|
1792
|
-
pairs += list(
|
|
1834
|
+
pairs += list(itertools.product(*pair))
|
|
1793
1835
|
allowed_location_pairs[node] = pairs
|
|
1794
1836
|
|
|
1795
1837
|
fragment_assembly = edge_representation2subfragment_representation(
|
|
@@ -1802,7 +1844,7 @@ class Assembly:
|
|
|
1802
1844
|
|
|
1803
1845
|
def __repr__(self):
|
|
1804
1846
|
# https://pyformat.info
|
|
1805
|
-
return
|
|
1847
|
+
return ps(
|
|
1806
1848
|
"Assembly\n"
|
|
1807
1849
|
"fragments..: {sequences}\n"
|
|
1808
1850
|
"limit(bp)..: {limit}\n"
|
|
@@ -1823,7 +1865,7 @@ class PCRAssembly(Assembly):
|
|
|
1823
1865
|
the number of mismatches allowed in the overlap. Only supports substitution mismatches, not indels.
|
|
1824
1866
|
"""
|
|
1825
1867
|
|
|
1826
|
-
def __init__(self, frags: list[
|
|
1868
|
+
def __init__(self, frags: list[Dseqrecord | Primer], limit=25, mismatches=0):
|
|
1827
1869
|
|
|
1828
1870
|
value_error = ValueError(
|
|
1829
1871
|
"PCRAssembly assembly must be initialised with a list/tuple of primer, template, primer"
|
|
@@ -1833,15 +1875,15 @@ class PCRAssembly(Assembly):
|
|
|
1833
1875
|
|
|
1834
1876
|
# Validate the inputs: should be a series of primer, template, primer
|
|
1835
1877
|
wrong_fragment_class = (
|
|
1836
|
-
not isinstance(frags[0],
|
|
1837
|
-
isinstance(frags[1],
|
|
1838
|
-
not isinstance(frags[2],
|
|
1878
|
+
not isinstance(frags[0], Primer),
|
|
1879
|
+
isinstance(frags[1], Primer),
|
|
1880
|
+
not isinstance(frags[2], Primer),
|
|
1839
1881
|
)
|
|
1840
1882
|
if any(wrong_fragment_class):
|
|
1841
1883
|
raise value_error
|
|
1842
1884
|
|
|
1843
1885
|
# TODO: allow for the same fragment to be included more than once?
|
|
1844
|
-
self.G =
|
|
1886
|
+
self.G = nx.MultiDiGraph()
|
|
1845
1887
|
# Add positive and negative nodes for forward and reverse fragments
|
|
1846
1888
|
self.G.add_nodes_from((i + 1, {"seq": f}) for (i, f) in enumerate(frags))
|
|
1847
1889
|
self.G.add_nodes_from(
|
|
@@ -1854,8 +1896,8 @@ class PCRAssembly(Assembly):
|
|
|
1854
1896
|
# primer, template, primer
|
|
1855
1897
|
p1, t, p2 = (i + 1, i + 2, i + 3)
|
|
1856
1898
|
primer_ids += [p1, p2]
|
|
1857
|
-
pairs += list(
|
|
1858
|
-
pairs += list(
|
|
1899
|
+
pairs += list(itertools.product([p1, p2], [t, -t]))
|
|
1900
|
+
pairs += list(itertools.product([t, -t], [-p1, -p2]))
|
|
1859
1901
|
|
|
1860
1902
|
for u, v in pairs:
|
|
1861
1903
|
u_seq = self.G.nodes[u]["seq"]
|
|
@@ -1894,20 +1936,33 @@ class PCRAssembly(Assembly):
|
|
|
1894
1936
|
"get_insertion_assemblies not implemented for PCR assemblies"
|
|
1895
1937
|
)
|
|
1896
1938
|
|
|
1939
|
+
def assemble_linear(
|
|
1940
|
+
self, only_adjacent_edges: bool = False, max_assemblies: int = 50
|
|
1941
|
+
) -> list[Dseqrecord]:
|
|
1942
|
+
"""
|
|
1943
|
+
Overrides the parent method to ensure that the 5' of the crick strand of the product matches the
|
|
1944
|
+
sequence of the reverse primer. This is important when using primers with dUTP (for USER cloning).
|
|
1945
|
+
"""
|
|
1946
|
+
results = super().assemble_linear(only_adjacent_edges, max_assemblies)
|
|
1947
|
+
for result in results:
|
|
1948
|
+
rp = self.fragments[2]
|
|
1949
|
+
result.seq = result.seq[: -len(rp)] + Dseq(str(rp.seq.rc()))
|
|
1950
|
+
return results
|
|
1951
|
+
|
|
1897
1952
|
|
|
1898
1953
|
class SingleFragmentAssembly(Assembly):
|
|
1899
1954
|
"""
|
|
1900
1955
|
An assembly that represents the circularisation or splicing of a single fragment.
|
|
1901
1956
|
"""
|
|
1902
1957
|
|
|
1903
|
-
def __init__(self, frags: [
|
|
1958
|
+
def __init__(self, frags: [Dseqrecord], limit=25, algorithm=common_sub_strings):
|
|
1904
1959
|
|
|
1905
1960
|
if len(frags) != 1:
|
|
1906
1961
|
raise ValueError(
|
|
1907
1962
|
"SingleFragmentAssembly assembly must be initialised with a single fragment"
|
|
1908
1963
|
)
|
|
1909
1964
|
# TODO: allow for the same fragment to be included more than once?
|
|
1910
|
-
self.G =
|
|
1965
|
+
self.G = nx.MultiDiGraph()
|
|
1911
1966
|
frag = frags[0]
|
|
1912
1967
|
# Add positive and negative nodes for forward and reverse fragments
|
|
1913
1968
|
self.G.add_node(1, seq=frag)
|
|
@@ -1958,8 +2013,8 @@ class SingleFragmentAssembly(Assembly):
|
|
|
1958
2013
|
if x[0][2] == x[0][3]:
|
|
1959
2014
|
return False
|
|
1960
2015
|
# We don't want to get overlap only (e.g. GAATTCcatGAATTC giving GAATTC)
|
|
1961
|
-
left_start, _ =
|
|
1962
|
-
_, right_end =
|
|
2016
|
+
left_start, _ = location_boundaries(x[0][2])
|
|
2017
|
+
_, right_end = location_boundaries(x[0][3])
|
|
1963
2018
|
if left_start == 0 and right_end == len(self.fragments[0]):
|
|
1964
2019
|
return False
|
|
1965
2020
|
return True
|
|
@@ -1982,18 +2037,19 @@ class SingleFragmentAssembly(Assembly):
|
|
|
1982
2037
|
|
|
1983
2038
|
|
|
1984
2039
|
def common_function_assembly_products(
|
|
1985
|
-
frags: list[
|
|
2040
|
+
frags: list[Dseqrecord],
|
|
1986
2041
|
limit: int | None,
|
|
1987
2042
|
algorithm: Callable,
|
|
1988
2043
|
circular_only: bool,
|
|
1989
2044
|
filter_results_function: Callable | None = None,
|
|
1990
|
-
|
|
2045
|
+
only_adjacent_edges: bool = False,
|
|
2046
|
+
) -> list[Dseqrecord]:
|
|
1991
2047
|
"""Common function to avoid code duplication. Could be simplified further
|
|
1992
2048
|
once SingleFragmentAssembly and Assembly are merged.
|
|
1993
2049
|
|
|
1994
2050
|
Parameters
|
|
1995
2051
|
----------
|
|
1996
|
-
frags : list[
|
|
2052
|
+
frags : list[Dseqrecord]
|
|
1997
2053
|
List of DNA fragments to assemble
|
|
1998
2054
|
limit : int or None
|
|
1999
2055
|
Minimum overlap length required, or None if not applicable
|
|
@@ -2001,10 +2057,14 @@ def common_function_assembly_products(
|
|
|
2001
2057
|
Function that determines valid overlaps between fragments
|
|
2002
2058
|
circular_only : bool
|
|
2003
2059
|
If True, only return circular assemblies
|
|
2060
|
+
filter_results_function : Callable or None
|
|
2061
|
+
Function that filters the results
|
|
2062
|
+
only_adjacent_edges : bool
|
|
2063
|
+
If True, only return assemblies that use only adjacent edges
|
|
2004
2064
|
|
|
2005
2065
|
Returns
|
|
2006
2066
|
-------
|
|
2007
|
-
list[
|
|
2067
|
+
list[Dseqrecord]
|
|
2008
2068
|
List of assembled DNA molecules
|
|
2009
2069
|
"""
|
|
2010
2070
|
if len(frags) == 1:
|
|
@@ -2013,10 +2073,10 @@ def common_function_assembly_products(
|
|
|
2013
2073
|
asm = Assembly(
|
|
2014
2074
|
frags, limit, algorithm, use_fragment_order=False, use_all_fragments=True
|
|
2015
2075
|
)
|
|
2016
|
-
output_assemblies = asm.get_circular_assemblies()
|
|
2076
|
+
output_assemblies = asm.get_circular_assemblies(only_adjacent_edges)
|
|
2017
2077
|
if not circular_only and len(frags) > 1:
|
|
2018
2078
|
output_assemblies += filter_linear_subassemblies(
|
|
2019
|
-
asm.get_linear_assemblies(), output_assemblies, frags
|
|
2079
|
+
asm.get_linear_assemblies(only_adjacent_edges), output_assemblies, frags
|
|
2020
2080
|
)
|
|
2021
2081
|
if not circular_only and len(frags) == 1:
|
|
2022
2082
|
output_assemblies += asm.get_insertion_assemblies()
|
|
@@ -2028,28 +2088,28 @@ def common_function_assembly_products(
|
|
|
2028
2088
|
|
|
2029
2089
|
|
|
2030
2090
|
def _recast_sources(
|
|
2031
|
-
products: list[
|
|
2032
|
-
) -> list[
|
|
2091
|
+
products: list[Dseqrecord], source_cls, **extra_fields
|
|
2092
|
+
) -> list[Dseqrecord]:
|
|
2033
2093
|
"""Recast the `source` of each product to `source_cls` with optional extras.
|
|
2034
2094
|
|
|
2035
2095
|
This avoids repeating the same for-loop across many assembly functions.
|
|
2036
2096
|
"""
|
|
2037
2097
|
for prod in products:
|
|
2038
2098
|
prod.source = source_cls(
|
|
2039
|
-
**prod.source.
|
|
2099
|
+
**prod.source.to_unserialized_dict(),
|
|
2040
2100
|
**extra_fields,
|
|
2041
2101
|
)
|
|
2042
2102
|
return products
|
|
2043
2103
|
|
|
2044
2104
|
|
|
2045
2105
|
def gibson_assembly(
|
|
2046
|
-
frags: list[
|
|
2047
|
-
) -> list[
|
|
2106
|
+
frags: list[Dseqrecord], limit: int = 25, circular_only: bool = False
|
|
2107
|
+
) -> list[Dseqrecord]:
|
|
2048
2108
|
"""Returns the products for Gibson assembly.
|
|
2049
2109
|
|
|
2050
2110
|
Parameters
|
|
2051
2111
|
----------
|
|
2052
|
-
frags : list[
|
|
2112
|
+
frags : list[Dseqrecord]
|
|
2053
2113
|
List of DNA fragments to assemble
|
|
2054
2114
|
limit : int, optional
|
|
2055
2115
|
Minimum overlap length required, by default 25
|
|
@@ -2058,7 +2118,7 @@ def gibson_assembly(
|
|
|
2058
2118
|
|
|
2059
2119
|
Returns
|
|
2060
2120
|
-------
|
|
2061
|
-
list[
|
|
2121
|
+
list[Dseqrecord]
|
|
2062
2122
|
List of assembled DNA molecules
|
|
2063
2123
|
"""
|
|
2064
2124
|
|
|
@@ -2069,14 +2129,14 @@ def gibson_assembly(
|
|
|
2069
2129
|
|
|
2070
2130
|
|
|
2071
2131
|
def in_fusion_assembly(
|
|
2072
|
-
frags: list[
|
|
2073
|
-
) -> list[
|
|
2132
|
+
frags: list[Dseqrecord], limit: int = 25, circular_only: bool = False
|
|
2133
|
+
) -> list[Dseqrecord]:
|
|
2074
2134
|
"""Returns the products for in-fusion assembly. This is the same as Gibson
|
|
2075
2135
|
assembly, but with a different name.
|
|
2076
2136
|
|
|
2077
2137
|
Parameters
|
|
2078
2138
|
----------
|
|
2079
|
-
frags : list[
|
|
2139
|
+
frags : list[Dseqrecord]
|
|
2080
2140
|
List of DNA fragments to assemble
|
|
2081
2141
|
limit : int, optional
|
|
2082
2142
|
Minimum overlap length required, by default 25
|
|
@@ -2085,23 +2145,25 @@ def in_fusion_assembly(
|
|
|
2085
2145
|
|
|
2086
2146
|
Returns
|
|
2087
2147
|
-------
|
|
2088
|
-
list[
|
|
2148
|
+
list[Dseqrecord]
|
|
2089
2149
|
List of assembled DNA molecules
|
|
2090
2150
|
"""
|
|
2091
2151
|
|
|
2092
|
-
products =
|
|
2152
|
+
products = common_function_assembly_products(
|
|
2153
|
+
frags, limit, in_fusion_overlap, circular_only
|
|
2154
|
+
)
|
|
2093
2155
|
return _recast_sources(products, InFusionSource)
|
|
2094
2156
|
|
|
2095
2157
|
|
|
2096
2158
|
def fusion_pcr_assembly(
|
|
2097
|
-
frags: list[
|
|
2098
|
-
) -> list[
|
|
2159
|
+
frags: list[Dseqrecord], limit: int = 25, circular_only: bool = False
|
|
2160
|
+
) -> list[Dseqrecord]:
|
|
2099
2161
|
"""Returns the products for fusion PCR assembly. This is the same as Gibson
|
|
2100
2162
|
assembly, but with a different name.
|
|
2101
2163
|
|
|
2102
2164
|
Parameters
|
|
2103
2165
|
----------
|
|
2104
|
-
frags : list[
|
|
2166
|
+
frags : list[Dseqrecord]
|
|
2105
2167
|
List of DNA fragments to assemble
|
|
2106
2168
|
limit : int, optional
|
|
2107
2169
|
Minimum overlap length required, by default 25
|
|
@@ -2110,21 +2172,23 @@ def fusion_pcr_assembly(
|
|
|
2110
2172
|
|
|
2111
2173
|
Returns
|
|
2112
2174
|
-------
|
|
2113
|
-
list[
|
|
2175
|
+
list[Dseqrecord]
|
|
2114
2176
|
List of assembled DNA molecules
|
|
2115
2177
|
"""
|
|
2116
|
-
products =
|
|
2178
|
+
products = common_function_assembly_products(
|
|
2179
|
+
frags, limit, pcr_fusion_overlap, circular_only
|
|
2180
|
+
)
|
|
2117
2181
|
return _recast_sources(products, OverlapExtensionPCRLigationSource)
|
|
2118
2182
|
|
|
2119
2183
|
|
|
2120
2184
|
def in_vivo_assembly(
|
|
2121
|
-
frags: list[
|
|
2122
|
-
) -> list[
|
|
2185
|
+
frags: list[Dseqrecord], limit: int = 25, circular_only: bool = False
|
|
2186
|
+
) -> list[Dseqrecord]:
|
|
2123
2187
|
"""Returns the products for in vivo assembly (IVA), which relies on homologous recombination between the fragments.
|
|
2124
2188
|
|
|
2125
2189
|
Parameters
|
|
2126
2190
|
----------
|
|
2127
|
-
frags : list[
|
|
2191
|
+
frags : list[Dseqrecord]
|
|
2128
2192
|
List of DNA fragments to assemble
|
|
2129
2193
|
limit : int, optional
|
|
2130
2194
|
Minimum overlap length required, by default 25
|
|
@@ -2133,7 +2197,7 @@ def in_vivo_assembly(
|
|
|
2133
2197
|
|
|
2134
2198
|
Returns
|
|
2135
2199
|
-------
|
|
2136
|
-
list[
|
|
2200
|
+
list[Dseqrecord]
|
|
2137
2201
|
List of assembled DNA molecules
|
|
2138
2202
|
"""
|
|
2139
2203
|
products = common_function_assembly_products(
|
|
@@ -2143,11 +2207,11 @@ def in_vivo_assembly(
|
|
|
2143
2207
|
|
|
2144
2208
|
|
|
2145
2209
|
def restriction_ligation_assembly(
|
|
2146
|
-
frags: list[
|
|
2147
|
-
enzymes: list["
|
|
2210
|
+
frags: list[Dseqrecord],
|
|
2211
|
+
enzymes: list["AbstractCut"],
|
|
2148
2212
|
allow_blunt: bool = True,
|
|
2149
2213
|
circular_only: bool = False,
|
|
2150
|
-
) -> list[
|
|
2214
|
+
) -> list[Dseqrecord]:
|
|
2151
2215
|
"""Returns the products for restriction ligation assembly:
|
|
2152
2216
|
|
|
2153
2217
|
- Finds cutsites in the fragments
|
|
@@ -2156,9 +2220,9 @@ def restriction_ligation_assembly(
|
|
|
2156
2220
|
|
|
2157
2221
|
Parameters
|
|
2158
2222
|
----------
|
|
2159
|
-
frags : list[
|
|
2223
|
+
frags : list[Dseqrecord]
|
|
2160
2224
|
List of DNA fragments to assemble
|
|
2161
|
-
enzymes : list[
|
|
2225
|
+
enzymes : list[AbstractCut]
|
|
2162
2226
|
List of restriction enzymes to use
|
|
2163
2227
|
allow_blunt : bool, optional
|
|
2164
2228
|
If True, allow blunt end ligations, by default True
|
|
@@ -2167,7 +2231,7 @@ def restriction_ligation_assembly(
|
|
|
2167
2231
|
|
|
2168
2232
|
Returns
|
|
2169
2233
|
-------
|
|
2170
|
-
list[
|
|
2234
|
+
list[Dseqrecord]
|
|
2171
2235
|
List of assembled DNA molecules
|
|
2172
2236
|
|
|
2173
2237
|
Examples
|
|
@@ -2214,7 +2278,7 @@ def restriction_ligation_assembly(
|
|
|
2214
2278
|
return restriction_ligation_overlap(x, y, enzymes, False, allow_blunt)
|
|
2215
2279
|
|
|
2216
2280
|
products = common_function_assembly_products(
|
|
2217
|
-
frags, None, algorithm_fn, circular_only
|
|
2281
|
+
frags, None, algorithm_fn, circular_only, only_adjacent_edges=True
|
|
2218
2282
|
)
|
|
2219
2283
|
return _recast_sources(
|
|
2220
2284
|
products, RestrictionAndLigationSource, restriction_enzymes=enzymes
|
|
@@ -2222,20 +2286,20 @@ def restriction_ligation_assembly(
|
|
|
2222
2286
|
|
|
2223
2287
|
|
|
2224
2288
|
def golden_gate_assembly(
|
|
2225
|
-
frags: list[
|
|
2226
|
-
enzymes: list["
|
|
2289
|
+
frags: list[Dseqrecord],
|
|
2290
|
+
enzymes: list["AbstractCut"],
|
|
2227
2291
|
allow_blunt: bool = True,
|
|
2228
2292
|
circular_only: bool = False,
|
|
2229
|
-
) -> list[
|
|
2293
|
+
) -> list[Dseqrecord]:
|
|
2230
2294
|
"""Returns the products for Golden Gate assembly. This is the same as
|
|
2231
2295
|
restriction ligation assembly, but with a different name. Check the documentation
|
|
2232
2296
|
for ``restriction_ligation_assembly`` for more details.
|
|
2233
2297
|
|
|
2234
2298
|
Parameters
|
|
2235
2299
|
----------
|
|
2236
|
-
frags : list[
|
|
2300
|
+
frags : list[Dseqrecord]
|
|
2237
2301
|
List of DNA fragments to assemble
|
|
2238
|
-
enzymes : list[
|
|
2302
|
+
enzymes : list[AbstractCut]
|
|
2239
2303
|
List of restriction enzymes to use
|
|
2240
2304
|
allow_blunt : bool, optional
|
|
2241
2305
|
If True, allow blunt end ligations, by default True
|
|
@@ -2244,7 +2308,7 @@ def golden_gate_assembly(
|
|
|
2244
2308
|
|
|
2245
2309
|
Returns
|
|
2246
2310
|
-------
|
|
2247
|
-
list[
|
|
2311
|
+
list[Dseqrecord]
|
|
2248
2312
|
List of assembled DNA molecules
|
|
2249
2313
|
|
|
2250
2314
|
Examples
|
|
@@ -2255,11 +2319,11 @@ def golden_gate_assembly(
|
|
|
2255
2319
|
|
|
2256
2320
|
|
|
2257
2321
|
def ligation_assembly(
|
|
2258
|
-
frags: list[
|
|
2322
|
+
frags: list[Dseqrecord],
|
|
2259
2323
|
allow_blunt: bool = False,
|
|
2260
2324
|
allow_partial_overlap: bool = False,
|
|
2261
2325
|
circular_only: bool = False,
|
|
2262
|
-
) -> list[
|
|
2326
|
+
) -> list[Dseqrecord]:
|
|
2263
2327
|
"""Returns the products for ligation assembly, as inputs pass the fragments (digested if needed) that
|
|
2264
2328
|
will be ligated.
|
|
2265
2329
|
|
|
@@ -2267,7 +2331,7 @@ def ligation_assembly(
|
|
|
2267
2331
|
|
|
2268
2332
|
Parameters
|
|
2269
2333
|
----------
|
|
2270
|
-
frags : list[
|
|
2334
|
+
frags : list[Dseqrecord]
|
|
2271
2335
|
List of DNA fragments to assemble
|
|
2272
2336
|
allow_blunt : bool, optional
|
|
2273
2337
|
If True, allow blunt end ligations, by default False
|
|
@@ -2278,7 +2342,7 @@ def ligation_assembly(
|
|
|
2278
2342
|
|
|
2279
2343
|
Returns
|
|
2280
2344
|
-------
|
|
2281
|
-
list[
|
|
2345
|
+
list[Dseqrecord]
|
|
2282
2346
|
List of assembled DNA molecules
|
|
2283
2347
|
|
|
2284
2348
|
|
|
@@ -2333,17 +2397,17 @@ def assembly_is_multi_site(asm: list[EdgeRepresentationAssembly]) -> bool:
|
|
|
2333
2397
|
|
|
2334
2398
|
|
|
2335
2399
|
def gateway_assembly(
|
|
2336
|
-
frags: list[
|
|
2400
|
+
frags: list[Dseqrecord],
|
|
2337
2401
|
reaction_type: Literal["BP", "LR"],
|
|
2338
2402
|
greedy: bool = False,
|
|
2339
2403
|
circular_only: bool = False,
|
|
2340
2404
|
multi_site_only: bool = False,
|
|
2341
|
-
) -> list[
|
|
2405
|
+
) -> list[Dseqrecord]:
|
|
2342
2406
|
"""Returns the products for Gateway assembly / Gateway cloning.
|
|
2343
2407
|
|
|
2344
2408
|
Parameters
|
|
2345
2409
|
----------
|
|
2346
|
-
frags : list[
|
|
2410
|
+
frags : list[Dseqrecord]
|
|
2347
2411
|
List of DNA fragments to assemble
|
|
2348
2412
|
reaction_type : Literal['BP', 'LR']
|
|
2349
2413
|
Type of Gateway reaction
|
|
@@ -2359,7 +2423,7 @@ def gateway_assembly(
|
|
|
2359
2423
|
|
|
2360
2424
|
Returns
|
|
2361
2425
|
-------
|
|
2362
|
-
list[
|
|
2426
|
+
list[Dseqrecord]
|
|
2363
2427
|
List of assembled DNA molecules
|
|
2364
2428
|
|
|
2365
2429
|
|
|
@@ -2446,13 +2510,13 @@ def gateway_assembly(
|
|
|
2446
2510
|
|
|
2447
2511
|
|
|
2448
2512
|
def common_function_integration_products(
|
|
2449
|
-
frags: list[
|
|
2450
|
-
) -> list[
|
|
2513
|
+
frags: list[Dseqrecord], limit: int | None, algorithm: Callable
|
|
2514
|
+
) -> list[Dseqrecord]:
|
|
2451
2515
|
"""Common function to avoid code duplication for integration products.
|
|
2452
2516
|
|
|
2453
2517
|
Parameters
|
|
2454
2518
|
----------
|
|
2455
|
-
frags : list[
|
|
2519
|
+
frags : list[Dseqrecord]
|
|
2456
2520
|
List of DNA fragments to integrate
|
|
2457
2521
|
limit : int or None
|
|
2458
2522
|
Minimum overlap length required, or None if not applicable
|
|
@@ -2461,7 +2525,7 @@ def common_function_integration_products(
|
|
|
2461
2525
|
|
|
2462
2526
|
Returns
|
|
2463
2527
|
-------
|
|
2464
|
-
list[
|
|
2528
|
+
list[Dseqrecord]
|
|
2465
2529
|
List of integrated DNA molecules
|
|
2466
2530
|
"""
|
|
2467
2531
|
if len(frags) == 1:
|
|
@@ -2482,27 +2546,27 @@ def common_function_integration_products(
|
|
|
2482
2546
|
|
|
2483
2547
|
|
|
2484
2548
|
def common_handle_insertion_fragments(
|
|
2485
|
-
genome:
|
|
2486
|
-
) -> list[
|
|
2549
|
+
genome: Dseqrecord, inserts: list[Dseqrecord]
|
|
2550
|
+
) -> list[Dseqrecord]:
|
|
2487
2551
|
"""Common function to handle / validate insertion fragments.
|
|
2488
2552
|
|
|
2489
2553
|
Parameters
|
|
2490
2554
|
----------
|
|
2491
|
-
genome :
|
|
2555
|
+
genome : Dseqrecord
|
|
2492
2556
|
Target genome sequence
|
|
2493
|
-
inserts : list[
|
|
2557
|
+
inserts : list[Dseqrecord] or Dseqrecord
|
|
2494
2558
|
DNA fragment(s) to insert
|
|
2495
2559
|
|
|
2496
2560
|
Returns
|
|
2497
2561
|
-------
|
|
2498
|
-
list[
|
|
2562
|
+
list[Dseqrecord]
|
|
2499
2563
|
List containing genome and insert fragments
|
|
2500
2564
|
"""
|
|
2501
|
-
if not isinstance(genome,
|
|
2565
|
+
if not isinstance(genome, Dseqrecord):
|
|
2502
2566
|
raise ValueError("Genome must be a Dseqrecord object")
|
|
2503
2567
|
|
|
2504
2568
|
if not isinstance(inserts, list) or not all(
|
|
2505
|
-
isinstance(f,
|
|
2569
|
+
isinstance(f, Dseqrecord) for f in inserts
|
|
2506
2570
|
):
|
|
2507
2571
|
raise ValueError("Inserts must be a list of Dseqrecord objects")
|
|
2508
2572
|
|
|
@@ -2513,13 +2577,13 @@ def common_handle_insertion_fragments(
|
|
|
2513
2577
|
|
|
2514
2578
|
|
|
2515
2579
|
def common_function_excision_products(
|
|
2516
|
-
genome:
|
|
2517
|
-
) -> list[
|
|
2580
|
+
genome: Dseqrecord, limit: int | None, algorithm: Callable
|
|
2581
|
+
) -> list[Dseqrecord]:
|
|
2518
2582
|
"""Common function to avoid code duplication for excision products.
|
|
2519
2583
|
|
|
2520
2584
|
Parameters
|
|
2521
2585
|
----------
|
|
2522
|
-
genome :
|
|
2586
|
+
genome : Dseqrecord
|
|
2523
2587
|
Target genome sequence
|
|
2524
2588
|
limit : int or None
|
|
2525
2589
|
Minimum overlap length required, or None if not applicable
|
|
@@ -2528,7 +2592,7 @@ def common_function_excision_products(
|
|
|
2528
2592
|
|
|
2529
2593
|
Returns
|
|
2530
2594
|
-------
|
|
2531
|
-
list[
|
|
2595
|
+
list[Dseqrecord]
|
|
2532
2596
|
List of excised DNA molecules
|
|
2533
2597
|
"""
|
|
2534
2598
|
asm = SingleFragmentAssembly([genome], limit, algorithm)
|
|
@@ -2536,25 +2600,25 @@ def common_function_excision_products(
|
|
|
2536
2600
|
|
|
2537
2601
|
|
|
2538
2602
|
def homologous_recombination_integration(
|
|
2539
|
-
genome:
|
|
2540
|
-
inserts: list[
|
|
2603
|
+
genome: Dseqrecord,
|
|
2604
|
+
inserts: list[Dseqrecord],
|
|
2541
2605
|
limit: int = 40,
|
|
2542
|
-
) -> list[
|
|
2606
|
+
) -> list[Dseqrecord]:
|
|
2543
2607
|
"""Returns the products resulting from the integration of an insert (or inserts joined
|
|
2544
2608
|
through in vivo recombination) into the genome through homologous recombination.
|
|
2545
2609
|
|
|
2546
2610
|
Parameters
|
|
2547
2611
|
----------
|
|
2548
|
-
genome :
|
|
2612
|
+
genome : Dseqrecord
|
|
2549
2613
|
Target genome sequence
|
|
2550
|
-
inserts : list[
|
|
2614
|
+
inserts : list[Dseqrecord]
|
|
2551
2615
|
DNA fragment(s) to insert
|
|
2552
2616
|
limit : int, optional
|
|
2553
2617
|
Minimum homology length required, by default 40
|
|
2554
2618
|
|
|
2555
2619
|
Returns
|
|
2556
2620
|
-------
|
|
2557
|
-
list[
|
|
2621
|
+
list[Dseqrecord]
|
|
2558
2622
|
List of integrated DNA molecules
|
|
2559
2623
|
|
|
2560
2624
|
|
|
@@ -2590,21 +2654,21 @@ def homologous_recombination_integration(
|
|
|
2590
2654
|
|
|
2591
2655
|
|
|
2592
2656
|
def homologous_recombination_excision(
|
|
2593
|
-
genome:
|
|
2594
|
-
) -> list[
|
|
2657
|
+
genome: Dseqrecord, limit: int = 40
|
|
2658
|
+
) -> list[Dseqrecord]:
|
|
2595
2659
|
"""Returns the products resulting from the excision of a fragment from the genome through
|
|
2596
2660
|
homologous recombination.
|
|
2597
2661
|
|
|
2598
2662
|
Parameters
|
|
2599
2663
|
----------
|
|
2600
|
-
genome :
|
|
2664
|
+
genome : Dseqrecord
|
|
2601
2665
|
Target genome sequence
|
|
2602
2666
|
limit : int, optional
|
|
2603
2667
|
Minimum homology length required, by default 40
|
|
2604
2668
|
|
|
2605
2669
|
Returns
|
|
2606
2670
|
-------
|
|
2607
|
-
list[
|
|
2671
|
+
list[Dseqrecord]
|
|
2608
2672
|
List containing excised plasmid and remaining genome sequence
|
|
2609
2673
|
|
|
2610
2674
|
Examples
|
|
@@ -2627,8 +2691,8 @@ def homologous_recombination_excision(
|
|
|
2627
2691
|
|
|
2628
2692
|
|
|
2629
2693
|
def cre_lox_integration(
|
|
2630
|
-
genome:
|
|
2631
|
-
) -> list[
|
|
2694
|
+
genome: Dseqrecord, inserts: list[Dseqrecord]
|
|
2695
|
+
) -> list[Dseqrecord]:
|
|
2632
2696
|
"""Returns the products resulting from the integration of an insert (or inserts joined
|
|
2633
2697
|
through cre-lox recombination among them) into the genome through cre-lox integration.
|
|
2634
2698
|
|
|
@@ -2636,14 +2700,14 @@ def cre_lox_integration(
|
|
|
2636
2700
|
|
|
2637
2701
|
Parameters
|
|
2638
2702
|
----------
|
|
2639
|
-
genome :
|
|
2703
|
+
genome : Dseqrecord
|
|
2640
2704
|
Target genome sequence
|
|
2641
|
-
inserts : list[
|
|
2705
|
+
inserts : list[Dseqrecord] or Dseqrecord
|
|
2642
2706
|
DNA fragment(s) to insert
|
|
2643
2707
|
|
|
2644
2708
|
Returns
|
|
2645
2709
|
-------
|
|
2646
|
-
list[
|
|
2710
|
+
list[Dseqrecord]
|
|
2647
2711
|
List of integrated DNA molecules
|
|
2648
2712
|
|
|
2649
2713
|
Examples
|
|
@@ -2686,17 +2750,17 @@ def cre_lox_integration(
|
|
|
2686
2750
|
return _recast_sources(products, CreLoxRecombinationSource)
|
|
2687
2751
|
|
|
2688
2752
|
|
|
2689
|
-
def cre_lox_excision(genome:
|
|
2753
|
+
def cre_lox_excision(genome: Dseqrecord) -> list[Dseqrecord]:
|
|
2690
2754
|
"""Returns the products for CRE-lox excision.
|
|
2691
2755
|
|
|
2692
2756
|
Parameters
|
|
2693
2757
|
----------
|
|
2694
|
-
genome :
|
|
2758
|
+
genome : Dseqrecord
|
|
2695
2759
|
Target genome sequence
|
|
2696
2760
|
|
|
2697
2761
|
Returns
|
|
2698
2762
|
-------
|
|
2699
|
-
list[
|
|
2763
|
+
list[Dseqrecord]
|
|
2700
2764
|
List containing excised plasmid and remaining genome sequence
|
|
2701
2765
|
|
|
2702
2766
|
Examples
|
|
@@ -2738,28 +2802,28 @@ def cre_lox_excision(genome: _Dseqrecord) -> list[_Dseqrecord]:
|
|
|
2738
2802
|
|
|
2739
2803
|
|
|
2740
2804
|
def crispr_integration(
|
|
2741
|
-
genome:
|
|
2742
|
-
inserts: list[
|
|
2743
|
-
guides: list[
|
|
2805
|
+
genome: Dseqrecord,
|
|
2806
|
+
inserts: list[Dseqrecord],
|
|
2807
|
+
guides: list[Primer],
|
|
2744
2808
|
limit: int = 40,
|
|
2745
|
-
) -> list[
|
|
2809
|
+
) -> list[Dseqrecord]:
|
|
2746
2810
|
"""
|
|
2747
2811
|
Returns the products for CRISPR integration.
|
|
2748
2812
|
|
|
2749
2813
|
Parameters
|
|
2750
2814
|
----------
|
|
2751
|
-
genome :
|
|
2815
|
+
genome : Dseqrecord
|
|
2752
2816
|
Target genome sequence
|
|
2753
|
-
inserts : list[
|
|
2817
|
+
inserts : list[Dseqrecord]
|
|
2754
2818
|
DNA fragment(s) to insert
|
|
2755
|
-
guides : list[
|
|
2819
|
+
guides : list[Primer]
|
|
2756
2820
|
List of guide RNAs as Primer objects. This may change in the future.
|
|
2757
2821
|
limit : int, optional
|
|
2758
2822
|
Minimum overlap length required, by default 40
|
|
2759
2823
|
|
|
2760
2824
|
Returns
|
|
2761
2825
|
-------
|
|
2762
|
-
list[
|
|
2826
|
+
list[Dseqrecord]
|
|
2763
2827
|
List of integrated DNA molecules
|
|
2764
2828
|
|
|
2765
2829
|
Examples
|
|
@@ -2804,8 +2868,9 @@ def crispr_integration(
|
|
|
2804
2868
|
for i, product in enumerate(products):
|
|
2805
2869
|
# The second element of product.source.input is conventionally the insert/repair fragment
|
|
2806
2870
|
# The other two (first and third) are the two bits of the genome
|
|
2807
|
-
repair_start =
|
|
2808
|
-
|
|
2871
|
+
repair_start = location_boundaries(product.source.input[0].right_location)[0]
|
|
2872
|
+
# Here we do +1 because the position of the cut marks the boundary (e.g. 0:10, 10:20 if a cut is at pos 10)
|
|
2873
|
+
repair_end = location_boundaries(product.source.input[2].left_location)[1] + 1
|
|
2809
2874
|
repair_location = create_location(repair_start, repair_end, len(genome))
|
|
2810
2875
|
some_cuts_inside_repair = []
|
|
2811
2876
|
all_cuts_inside_repair = []
|
|
@@ -2836,22 +2901,22 @@ def crispr_integration(
|
|
|
2836
2901
|
|
|
2837
2902
|
|
|
2838
2903
|
def pcr_assembly(
|
|
2839
|
-
template:
|
|
2840
|
-
fwd_primer:
|
|
2841
|
-
rvs_primer:
|
|
2904
|
+
template: Dseqrecord,
|
|
2905
|
+
fwd_primer: Primer,
|
|
2906
|
+
rvs_primer: Primer,
|
|
2842
2907
|
add_primer_features: bool = False,
|
|
2843
2908
|
limit: int = 14,
|
|
2844
2909
|
mismatches: int = 0,
|
|
2845
|
-
) -> list[
|
|
2910
|
+
) -> list[Dseqrecord]:
|
|
2846
2911
|
"""Returns the products for PCR assembly.
|
|
2847
2912
|
|
|
2848
2913
|
Parameters
|
|
2849
2914
|
----------
|
|
2850
|
-
template :
|
|
2915
|
+
template : Dseqrecord
|
|
2851
2916
|
Template sequence
|
|
2852
|
-
fwd_primer :
|
|
2917
|
+
fwd_primer : Primer
|
|
2853
2918
|
Forward primer
|
|
2854
|
-
rvs_primer :
|
|
2919
|
+
rvs_primer : Primer
|
|
2855
2920
|
Reverse primer
|
|
2856
2921
|
add_primer_features : bool, optional
|
|
2857
2922
|
If True, add primer features to the product, by default False
|
|
@@ -2862,7 +2927,7 @@ def pcr_assembly(
|
|
|
2862
2927
|
|
|
2863
2928
|
Returns
|
|
2864
2929
|
-------
|
|
2865
|
-
list[
|
|
2930
|
+
list[Dseqrecord]
|
|
2866
2931
|
List of assembled DNA molecules
|
|
2867
2932
|
"""
|
|
2868
2933
|
|