pydna 5.5.2__py3-none-any.whl → 5.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydna/__init__.py +1 -1
- pydna/assembly2.py +731 -6
- pydna/cre_lox.py +130 -0
- pydna/gateway.py +154 -152
- pydna/parsers.py +23 -0
- pydna/seqrecord.py +1 -1
- pydna/sequence_regex.py +44 -0
- {pydna-5.5.2.dist-info → pydna-5.5.3.dist-info}/METADATA +7 -17
- {pydna-5.5.2.dist-info → pydna-5.5.3.dist-info}/RECORD +11 -9
- {pydna-5.5.2.dist-info → pydna-5.5.3.dist-info}/LICENSE.txt +0 -0
- {pydna-5.5.2.dist-info → pydna-5.5.3.dist-info}/WHEEL +0 -0
pydna/assembly2.py
CHANGED
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
|
-
"""
|
|
2
|
+
"""
|
|
3
|
+
Improved implementation of the assembly module. To see a list of issues with the previous implementation,
|
|
4
|
+
see [issues tagged with fixed-with-new-assembly-model](https://github.com/pydna-group/pydna/issues?q=is%3Aissue%20state%3Aopen%20label%3Afixed-with-new-assembly-model)
|
|
5
|
+
"""
|
|
3
6
|
|
|
4
7
|
import networkx as _nx
|
|
5
8
|
import itertools as _itertools
|
|
@@ -26,12 +29,20 @@ from pydna.primer import Primer as _Primer
|
|
|
26
29
|
from pydna.seqrecord import SeqRecord as _SeqRecord
|
|
27
30
|
from pydna.types import (
|
|
28
31
|
CutSiteType,
|
|
32
|
+
# TODO: allow user to enforce multi-site
|
|
29
33
|
EdgeRepresentationAssembly,
|
|
30
34
|
SubFragmentRepresentationAssembly,
|
|
31
35
|
AssemblyAlgorithmType,
|
|
32
36
|
SequenceOverlap,
|
|
33
37
|
AssemblyEdgeType,
|
|
34
38
|
)
|
|
39
|
+
from pydna.gateway import gateway_overlap, find_gateway_sites
|
|
40
|
+
from pydna.cre_lox import cre_loxP_overlap
|
|
41
|
+
|
|
42
|
+
from typing import TYPE_CHECKING, Callable
|
|
43
|
+
|
|
44
|
+
if TYPE_CHECKING:
|
|
45
|
+
from Bio.Restriction import AbstractCut as _AbstractCut
|
|
35
46
|
|
|
36
47
|
|
|
37
48
|
def gather_overlapping_locations(
|
|
@@ -366,7 +377,7 @@ def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
|
|
|
366
377
|
return [tuple(m) for m in matches]
|
|
367
378
|
|
|
368
379
|
|
|
369
|
-
def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=
|
|
380
|
+
def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit: bool = False):
|
|
370
381
|
"""
|
|
371
382
|
Assembly algorithm for ligation of sticky ends.
|
|
372
383
|
|
|
@@ -376,7 +387,7 @@ def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=0):
|
|
|
376
387
|
Args:
|
|
377
388
|
seqx (_Dseqrecord): The first sequence
|
|
378
389
|
seqy (_Dseqrecord): The second sequence
|
|
379
|
-
limit (
|
|
390
|
+
limit (bool): Whether to allow partial overlaps
|
|
380
391
|
|
|
381
392
|
Returns:
|
|
382
393
|
list[SequenceOverlap]: A list of overlaps between the two sequences
|
|
@@ -389,16 +400,16 @@ def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=0):
|
|
|
389
400
|
>>> from pydna.assembly2 import sticky_end_sub_strings
|
|
390
401
|
>>> x = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 0, 3))
|
|
391
402
|
>>> y = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 3, 0))
|
|
392
|
-
>>> sticky_end_sub_strings(x, y, limit=
|
|
403
|
+
>>> sticky_end_sub_strings(x, y, limit=False)
|
|
393
404
|
[(3, 0, 3)]
|
|
394
|
-
>>> sticky_end_sub_strings(y, x, limit=
|
|
405
|
+
>>> sticky_end_sub_strings(y, x, limit=False)
|
|
395
406
|
[]
|
|
396
407
|
|
|
397
408
|
Ligation of partially overlapping sticky ends, specified with limit=True
|
|
398
409
|
|
|
399
410
|
>>> x = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 0, 2))
|
|
400
411
|
>>> y = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 3, 0))
|
|
401
|
-
>>> sticky_end_sub_strings(x, y, limit=
|
|
412
|
+
>>> sticky_end_sub_strings(x, y, limit=False)
|
|
402
413
|
[]
|
|
403
414
|
>>> sticky_end_sub_strings(x, y, limit=True)
|
|
404
415
|
[(4, 0, 2)]
|
|
@@ -1900,3 +1911,717 @@ class SingleFragmentAssembly(Assembly):
|
|
|
1900
1911
|
|
|
1901
1912
|
def get_linear_assemblies(self):
    """Reject requests for linear assemblies.

    Raises
    ------
    NotImplementedError
        Always; this assembly type does not provide linear assemblies.
    """
    raise NotImplementedError("Linear assembly does not make sense")
|
|
1914
|
+
|
|
1915
|
+
|
|
1916
|
+
def common_function_assembly_products(
    frags: list[_Dseqrecord],
    limit: int | None,
    algorithm: Callable,
    circular_only: bool,
    filter_results_function: Callable | None = None,
) -> list[_Dseqrecord]:
    """Shared driver behind the assembly helper functions of this module.

    A ``SingleFragmentAssembly`` is built when exactly one fragment is given,
    an ``Assembly`` (``use_fragment_order=False``, ``use_all_fragments=True``)
    otherwise. Candidate assemblies are collected, optionally filtered, and
    each survivor is converted to a sequence with ``assemble``. Could be
    simplified further once SingleFragmentAssembly and Assembly are merged.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int or None
        Minimum overlap length required, or None if not applicable
    algorithm : Callable
        Function that determines valid overlaps between fragments
    circular_only : bool
        If True, only return circular assemblies
    filter_results_function : Callable or None, optional
        Predicate applied to every candidate assembly; candidates for which
        it returns a falsy value are dropped. Skipped when falsy (default None).

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    fragment_count = len(frags)

    if fragment_count == 1:
        asm = SingleFragmentAssembly(frags, limit, algorithm)
    else:
        asm = Assembly(
            frags, limit, algorithm, use_fragment_order=False, use_all_fragments=True
        )

    # Circular candidates are always collected; the rest depends on the flags.
    candidates = asm.get_circular_assemblies()

    if not circular_only:
        if fragment_count > 1:
            # Several fragments: add the linear assemblies that pass
            # filter_linear_subassemblies (which receives the circular ones).
            candidates += filter_linear_subassemblies(
                asm.get_linear_assemblies(), candidates, frags
            )
        elif fragment_count == 1:
            # Single fragment: add the insertion assemblies instead.
            candidates += asm.get_insertion_assemblies()

    if filter_results_function:
        candidates = list(filter(filter_results_function, candidates))

    return [assemble(frags, candidate) for candidate in candidates]
|
|
1960
|
+
|
|
1961
|
+
|
|
1962
|
+
def gibson_assembly(
    frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
) -> list[_Dseqrecord]:
    """Compute the products of a Gibson assembly.

    Delegates to ``common_function_assembly_products`` using
    ``gibson_overlap`` as the overlap-finding algorithm.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int, optional
        Minimum overlap length required, by default 25
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    return common_function_assembly_products(
        frags=frags,
        limit=limit,
        algorithm=gibson_overlap,
        circular_only=circular_only,
    )
|
|
1984
|
+
|
|
1985
|
+
|
|
1986
|
+
def in_fusion_assembly(
    frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
) -> list[_Dseqrecord]:
    """Returns the products for in-fusion assembly. This is the same as Gibson
    assembly, but with a different name.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int, optional
        Minimum overlap length required, by default 25
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    # circular_only must be forwarded; without it the caller's request for
    # circular-only products would be silently ignored.
    return gibson_assembly(frags, limit, circular_only)
|
|
2007
|
+
|
|
2008
|
+
|
|
2009
|
+
def fusion_pcr_assembly(
    frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
) -> list[_Dseqrecord]:
    """Returns the products for fusion PCR assembly. This is the same as Gibson
    assembly, but with a different name.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int, optional
        Minimum overlap length required, by default 25
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    # circular_only must be forwarded; without it the caller's request for
    # circular-only products would be silently ignored.
    return gibson_assembly(frags, limit, circular_only)
|
|
2030
|
+
|
|
2031
|
+
|
|
2032
|
+
def in_vivo_assembly(
    frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
) -> list[_Dseqrecord]:
    """Compute the products of in vivo assembly (IVA), which relies on
    homologous recombination between the fragments.

    Delegates to ``common_function_assembly_products`` using
    ``common_sub_strings`` as the overlap-finding algorithm.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int, optional
        Minimum overlap length required, by default 25
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    return common_function_assembly_products(
        frags=frags,
        limit=limit,
        algorithm=common_sub_strings,
        circular_only=circular_only,
    )
|
|
2054
|
+
|
|
2055
|
+
|
|
2056
|
+
def restriction_ligation_assembly(
    frags: list[_Dseqrecord],
    enzymes: list["_AbstractCut"],
    allow_blunt: bool = True,
    circular_only: bool = False,
) -> list[_Dseqrecord]:
    """Compute the products of a restriction-ligation assembly:

    * Finds cutsites in the fragments
    * Finds all products that could be assembled by ligating the fragments based on those cutsites
    * Will NOT return products that combine an existing end with an end generated by the same enzyme (see example below)

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    enzymes : list[_AbstractCut]
        List of restriction enzymes to use
    allow_blunt : bool, optional
        If True, allow blunt end ligations, by default True
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules

    Examples
    --------
    In the example below, we plan to assemble a plasmid from a backbone and an insert, using the EcoRI and SalI enzymes.
    Note how 2 circular products are returned: one contains the insert (`aggt`)
    and the desired part of the backbone (`cccccc`), the other contains the
    insert in the opposite orientation (`acct` on the top strand, `tgga` on the
    bottom strand) and the cut-out part of the backbone (`aaa`).

    >>> from pydna.assembly2 import restriction_ligation_assembly
    >>> from pydna.dseqrecord import Dseqrecord
    >>> from Bio.Restriction import EcoRI, SalI
    >>> backbone = Dseqrecord("cccGAATTCaaaGTCGACccc", circular=True)
    >>> insert = Dseqrecord("ggGAATTCaggtGTCGACgg")
    >>> products = restriction_ligation_assembly([backbone, insert], [EcoRI, SalI], circular_only=True)
    >>> products[0].seq
    Dseq(o22)
    TCGACccccccGAATTCaggtG
    AGCTGggggggCTTAAGtccaC
    >>> products[1].seq
    Dseq(o19)
    AATTCaaaGTCGACacctG
    TTAAGtttCAGCTGtggaC

    Note that passing a pre-cut fragment will not work.

    >>> restriction_products = insert.cut([EcoRI, SalI])
    >>> cut_insert = restriction_products[1]
    >>> restriction_ligation_assembly([backbone, cut_insert], [EcoRI, SalI], circular_only=True)
    []

    It also works with a single fragment, for circularization:

    >>> seq = Dseqrecord("GAATTCaaaGAATTC")
    >>> products = restriction_ligation_assembly([seq], [EcoRI])
    >>> products[0].seq
    Dseq(o9)
    AATTCaaaG
    TTAAGtttC
    """

    def _overlaps_for_enzymes(seqx, seqy, _limit):
        # The third argument is ignored. Blunt ends follow the caller's
        # `allow_blunt` flag (True by default).
        # NOTE(review): the literal False is presumably the partial-overlap
        # switch of restriction_ligation_overlap -- confirm against its signature.
        return restriction_ligation_overlap(seqx, seqy, enzymes, False, allow_blunt)

    return common_function_assembly_products(
        frags, None, _overlaps_for_enzymes, circular_only
    )
|
|
2127
|
+
|
|
2128
|
+
|
|
2129
|
+
def golden_gate_assembly(
    frags: list[_Dseqrecord],
    enzymes: list["_AbstractCut"],
    allow_blunt: bool = True,
    circular_only: bool = False,
) -> list[_Dseqrecord]:
    """Compute the products of a Golden Gate assembly.

    This is an alias of `restriction_ligation_assembly`: every argument is
    forwarded unchanged. Check the documentation of
    `restriction_ligation_assembly` for more details.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    enzymes : list[_AbstractCut]
        List of restriction enzymes to use
    allow_blunt : bool, optional
        If True, allow blunt end ligations, by default True
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules

    Examples
    --------
    See the example for `restriction_ligation_assembly`.
    """
    return restriction_ligation_assembly(
        frags,
        enzymes,
        allow_blunt=allow_blunt,
        circular_only=circular_only,
    )
|
|
2160
|
+
|
|
2161
|
+
|
|
2162
|
+
def ligation_assembly(
    frags: list[_Dseqrecord],
    allow_blunt: bool = False,
    allow_partial_overlap: bool = False,
    circular_only: bool = False,
) -> list[_Dseqrecord]:
    """Compute the products of a ligation. As inputs, pass the fragments
    (digested if needed) that will be ligated.

    For most cases, you probably should use `restriction_ligation_assembly` instead.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    allow_blunt : bool, optional
        If True, allow blunt end ligations, by default False
    allow_partial_overlap : bool, optional
        If True, allow partial overlaps between sticky ends, by default False
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules


    Examples
    --------
    In the example below, we plan to assemble a plasmid from a backbone and an insert,
    using the EcoRI enzyme. The insert and insertion site in the backbone are flanked by
    EcoRI sites, so there are two possible products depending on the orientation of the insert.

    >>> from pydna.assembly2 import ligation_assembly
    >>> from pydna.dseqrecord import Dseqrecord
    >>> from Bio.Restriction import EcoRI
    >>> backbone = Dseqrecord("cccGAATTCaaaGAATTCccc", circular=True)
    >>> backbone_cut = backbone.cut(EcoRI)[1]
    >>> insert = Dseqrecord("ggGAATTCaggtGAATTCgg")
    >>> insert_cut = insert.cut(EcoRI)[1]
    >>> products = ligation_assembly([backbone_cut, insert_cut])
    >>> products[0].seq
    Dseq(o22)
    AATTCccccccGAATTCaggtG
    TTAAGggggggCTTAAGtccaC
    >>> products[1].seq
    Dseq(o22)
    AATTCccccccGAATTCacctG
    TTAAGggggggCTTAAGtggaC
    """

    def _sticky_overlaps(seqx, seqy, _limit):
        # The third argument is ignored; partial overlaps follow the
        # caller's `allow_partial_overlap` flag.
        return sticky_end_sub_strings(seqx, seqy, allow_partial_overlap)

    # With blunt ligation enabled, the sticky-end finder is combined with
    # blunt_overlap; otherwise it is used on its own.
    overlap_finder = (
        combine_algorithms(_sticky_overlaps, blunt_overlap)
        if allow_blunt
        else _sticky_overlaps
    )

    return common_function_assembly_products(
        frags, None, overlap_finder, circular_only
    )
|
|
2223
|
+
|
|
2224
|
+
|
|
2225
|
+
def assembly_is_multi_site(asm: list[EdgeRepresentationAssembly]) -> bool:
    """Tell whether an assembly is a multi-site assembly.

    Assemblies with fewer than two edges are never multi-site. Otherwise the
    assembly is converted to its sub-fragment representation and is considered
    multi-site only when, for every sub-fragment, the items at index 1 and
    index 2 differ.

    Parameters
    ----------
    asm : list[EdgeRepresentationAssembly]
        Assembly in edge representation

    Returns
    -------
    bool
        True if the assembly is a multi-site assembly, False otherwise
    """
    if len(asm) < 2:
        return False

    first_edge, last_edge = asm[0], asm[-1]
    # NOTE(review): this compares the second item of the first edge with the
    # first item of the last edge. That looks like it only detects cycles made
    # of exactly two edges -- confirm whether
    # `first_edge[0] == last_edge[1]` was intended.
    is_cycle = first_edge[1] == last_edge[0]

    for piece in edge_representation2subfragment_representation(asm, is_cycle):
        if piece[1] != piece[2]:
            continue
        # Both items are equal for this sub-fragment: not a multi-site assembly.
        return False
    return True
|
|
2235
|
+
|
|
2236
|
+
|
|
2237
|
+
def gateway_assembly(
    frags: list[_Dseqrecord],
    reaction_type: str,
    greedy: bool = False,
    circular_only: bool = False,
    multi_site_only: bool = False,
) -> list[_Dseqrecord]:
    """Compute the products of a Gateway assembly / Gateway cloning reaction.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    reaction_type : str
        Type of Gateway reaction, either 'BP' or 'LR'
    greedy : bool, optional
        If True, use greedy gateway consensus sites, by default False
    circular_only : bool, optional
        If True, only return circular assemblies, by default False
    multi_site_only : bool, optional
        If True, only return products where 2 sites recombined. Even if input sequences
        contain multiple att sites (typically 2), a product could be generated where only one
        site recombines. That's typically not what you want, so you can set this to True to
        only return products where both att sites recombined.

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules

    Raises
    ------
    ValueError
        If `reaction_type` is not 'BP' or 'LR', or if no product could be
        generated (the message lists the sites found in each fragment).

    Examples
    --------

    Below an example with dummy Gateway sequences, composed with minimal sequences and the consensus
    att sites.

    >>> from pydna.assembly2 import gateway_assembly
    >>> from pydna.dseqrecord import Dseqrecord
    >>> attB1 = "ACAACTTTGTACAAAAAAGCAGAAG"
    >>> attP1 = "AAAATAATGATTTTATTTGACTGATAGTGACCTGTTCGTTGCAACAAATTGATGAGCAATGCTTTTTTATAATGCCAACTTTGTACAAAAAAGCTGAACGAGAAGCGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATCCAGTCACTATGAATCAACTACTTAGATGGTATTAGTGACCTGTA"
    >>> attR1 = "ACAACTTTGTACAAAAAAGCTGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATGCAGTCACTATG"
    >>> attL1 = "CAAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAAATTGATAAGCAATGCTTTCTTATAATGCCAACTTTGTACAAAAAAGCAGGCT"
    >>> seq1 = Dseqrecord("aaa" + attB1 + "ccc")
    >>> seq2 = Dseqrecord("aaa" + attP1 + "ccc")
    >>> seq3 = Dseqrecord("aaa" + attR1 + "ccc")
    >>> seq4 = Dseqrecord("aaa" + attL1 + "ccc")
    >>> products_BP = gateway_assembly([seq1, seq2], "BP")
    >>> products_LR = gateway_assembly([seq3, seq4], "LR")
    >>> len(products_BP)
    2
    >>> len(products_LR)
    2

    Now let's understand the `multi_site_only` parameter. Let's consider a case where we are swapping fragments
    between two plasmids using an LR reaction. Experimentally, we expect to obtain two plasmids, resulting from the
    swapping between the two att sites. That's what we get if we set `multi_site_only` to True.

    >>> attL2 = 'aaataatgattttattttgactgatagtgacctgttcgttgcaacaaattgataagcaatgctttcttataatgccaactttgtacaagaaagctg'
    >>> attR2 = 'accactttgtacaagaaagctgaacgagaaacgtaaaatgatataaatatcaatatattaaattagattttgcataaaaaacagactacataatactgtaaaacacaacatatccagtcactatg'
    >>> insert = Dseqrecord("cccccc" + attL1 + "ccc" + attL2 + "cccccc", circular=True)
    >>> backbone = Dseqrecord("ttttt" + attR1 + "aaa" + attR2, circular=True)
    >>> products = gateway_assembly([insert, backbone], "LR", multi_site_only=True)
    >>> len(products)
    2

    However, if we set `multi_site_only` to False, we get 4 products, which also include the intermediate products
    where the two plasmids are combined into a single one through recombination of a single att site. This is an
    intermediate of the reaction, and typically we don't want it:

    >>> products = gateway_assembly([insert, backbone], "LR", multi_site_only=False)
    >>> print([len(p) for p in products])
    [469, 237, 232, 469]


    """
    if reaction_type not in ("BP", "LR"):
        raise ValueError(
            f"Invalid reaction type: {reaction_type}, can only be BP or LR"
        )

    def _att_site_overlaps(seqx, seqy, _limit):
        # The third argument is ignored; gateway_overlap does not receive it.
        return gateway_overlap(seqx, seqy, reaction_type, greedy)

    # Only filter candidate assemblies when the caller asked for multi-site products.
    result_filter = assembly_is_multi_site if multi_site_only else None

    products = common_function_assembly_products(
        frags, None, _att_site_overlaps, circular_only, result_filter
    )
    if products:
        return products

    # Nothing could be assembled: report the sites found in every fragment
    # so the caller can see why the inputs are incompatible.
    formatted_strings = []
    for i, frag in enumerate(frags):
        sites = list(find_gateway_sites(frag, greedy).keys())
        formatted_strings.append(f'fragment {i + 1}: {", ".join(sites)}')

    raise ValueError(
        f"Inputs are not compatible for {reaction_type} reaction.\n\n"
        + "\n".join(formatted_strings)
    )
|
|
2342
|
+
|
|
2343
|
+
|
|
2344
|
+
def common_function_integration_products(
    frags: list[_Dseqrecord], limit: int | None, algorithm: Callable
) -> list[_Dseqrecord]:
    """Common function to avoid code duplication for integration products.

    The first element of `frags` is treated as the genome; the remaining
    elements are the fragments to integrate.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to integrate
    limit : int or None
        Minimum overlap length required, or None if not applicable
    algorithm : Callable
        Function that determines valid overlaps between fragments

    Returns
    -------
    list[_Dseqrecord]
        List of integrated DNA molecules

    Raises
    ------
    ValueError
        If the genome (first fragment) is circular.
    """
    # Validate the input before building the assembly object, so that an
    # invalid genome is rejected without doing any assembly work first.
    if frags[0].circular:
        raise ValueError(
            "Genome must be linear for integration assembly, use in vivo assembly instead"
        )

    if len(frags) == 1:
        asm = SingleFragmentAssembly(frags, limit, algorithm)
    else:
        asm = Assembly(
            frags, limit, algorithm, use_fragment_order=False, use_all_fragments=True
        )

    # We only want insertions in the genome (first fragment): keep the
    # assemblies whose first edge starts at fragment 1.
    output_assemblies = [a for a in asm.get_insertion_assemblies() if a[0][0] == 1]

    # The third argument marks each assembly as an insertion for `assemble`.
    return [assemble(frags, a, True) for a in output_assemblies]
|
|
2378
|
+
|
|
2379
|
+
|
|
2380
|
+
def common_handle_insertion_fragments(
    genome: _Dseqrecord, inserts: list[_Dseqrecord]
) -> list[_Dseqrecord]:
    """Validate a genome and its insert fragments and merge them in one list.

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence
    inserts : list[_Dseqrecord]
        DNA fragment(s) to insert; must be a non-empty list (a bare
        Dseqrecord is rejected)

    Returns
    -------
    list[_Dseqrecord]
        List containing genome and insert fragments, genome first

    Raises
    ------
    ValueError
        If `genome` is not a Dseqrecord, or if `inserts` is not a non-empty
        list made only of Dseqrecord objects.
    """
    if not isinstance(genome, _Dseqrecord):
        raise ValueError("Genome must be a Dseqrecord object")

    inserts_are_records = isinstance(inserts, list) and all(
        isinstance(f, _Dseqrecord) for f in inserts
    )
    if not inserts_are_records:
        raise ValueError("Inserts must be a list of Dseqrecord objects")

    # An empty list passes the type check above, so it is rejected separately.
    if not inserts:
        raise ValueError("Inserts must be a non-empty list of Dseqrecord objects")

    return [genome, *inserts]
|
|
2409
|
+
|
|
2410
|
+
|
|
2411
|
+
def common_function_excision_products(
    genome: _Dseqrecord, limit: int | None, algorithm: Callable
) -> list[_Dseqrecord]:
    """Shared driver behind the excision helper functions of this module.

    Builds a ``SingleFragmentAssembly`` containing only `genome` and returns
    its circular products followed by its insertion products.

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence
    limit : int or None
        Minimum overlap length required, or None if not applicable
    algorithm : Callable
        Function that determines valid overlaps between fragments

    Returns
    -------
    list[_Dseqrecord]
        List of excised DNA molecules
    """
    single_fragment_asm = SingleFragmentAssembly([genome], limit, algorithm)
    circular_products = single_fragment_asm.assemble_circular()
    insertion_products = single_fragment_asm.assemble_insertion()
    return circular_products + insertion_products
|
|
2432
|
+
|
|
2433
|
+
|
|
2434
|
+
def homologous_recombination_integration(
    genome: _Dseqrecord,
    inserts: list[_Dseqrecord],
    limit: int = 40,
) -> list[_Dseqrecord]:
    """Compute the products of integrating an insert (or inserts joined
    through in vivo recombination) into the genome through homologous recombination.

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence
    inserts : list[_Dseqrecord]
        DNA fragment(s) to insert
    limit : int, optional
        Minimum homology length required, by default 40

    Returns
    -------
    list[_Dseqrecord]
        List of integrated DNA molecules


    Examples
    --------

    Below an example with a single insert.

    >>> from pydna.assembly2 import homologous_recombination_integration
    >>> from pydna.dseqrecord import Dseqrecord
    >>> homology = "AAGTCCGTTCGTTTTACCTG"
    >>> genome = Dseqrecord(f"aaaaaa{homology}ccccc{homology}aaaaaa")
    >>> insert = Dseqrecord(f"{homology}gggg{homology}")
    >>> products = homologous_recombination_integration(genome, [insert], 20)
    >>> str(products[0].seq)
    'aaaaaaAAGTCCGTTCGTTTTACCTGggggAAGTCCGTTCGTTTTACCTGaaaaaa'

    Below an example with two inserts joined through homology.

    >>> homology2 = "ATTACAGCATGGGAAGAAAGA"
    >>> insert_1 = Dseqrecord(f"{homology}gggg{homology2}")
    >>> insert_2 = Dseqrecord(f"{homology2}cccc{homology}")
    >>> products = homologous_recombination_integration(genome, [insert_1, insert_2], 20)
    >>> str(products[0].seq)
    'aaaaaaAAGTCCGTTCGTTTTACCTGggggATTACAGCATGGGAAGAAAGAccccAAGTCCGTTCGTTTTACCTGaaaaaa'
    """
    # Validation of the inputs happens in common_handle_insertion_fragments,
    # which also places the genome first in the returned list.
    return common_function_integration_products(
        frags=common_handle_insertion_fragments(genome, inserts),
        limit=limit,
        algorithm=common_sub_strings,
    )
|
|
2483
|
+
|
|
2484
|
+
|
|
2485
|
+
def homologous_recombination_excision(
    genome: _Dseqrecord, limit: int = 40
) -> list[_Dseqrecord]:
    """Compute the products of excising a fragment from the genome through
    homologous recombination.

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence
    limit : int, optional
        Minimum homology length required, by default 40

    Returns
    -------
    list[_Dseqrecord]
        List containing excised plasmid and remaining genome sequence

    Examples
    --------

    Example of a homologous recombination event, where a plasmid is excised from the
    genome (circular sequence of 25 bp), and that part is removed from the genome,
    leaving a shorter linear sequence (32 bp).

    >>> from pydna.assembly2 import homologous_recombination_excision
    >>> from pydna.dseqrecord import Dseqrecord
    >>> homology = "AAGTCCGTTCGTTTTACCTG"
    >>> genome = Dseqrecord(f"aaaaaa{homology}ccccc{homology}aaaaaa")
    >>> products = homologous_recombination_excision(genome, 20)
    >>> products
    [Dseqrecord(o25), Dseqrecord(-32)]
    """
    return common_function_excision_products(
        genome=genome,
        limit=limit,
        algorithm=common_sub_strings,
    )
|
|
2519
|
+
|
|
2520
|
+
|
|
2521
|
+
def cre_lox_integration(
    genome: _Dseqrecord, inserts: list[_Dseqrecord]
) -> list[_Dseqrecord]:
    """Compute the products of integrating an insert (or inserts joined
    through cre-lox recombination among them) into the genome through cre-lox integration.

    Also works with lox66 and lox71 (see `pydna.cre_lox` for more details).

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence
    inserts : list[_Dseqrecord]
        DNA fragment(s) to insert, given as a list

    Returns
    -------
    list[_Dseqrecord]
        List of integrated DNA molecules

    Examples
    --------

    Below an example of reversible integration and excision.

    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.assembly2 import cre_lox_integration, cre_lox_excision
    >>> from pydna.cre_lox import LOXP_SEQUENCE
    >>> a = Dseqrecord(f"cccccc{LOXP_SEQUENCE}aaaaa")
    >>> b = Dseqrecord(f"{LOXP_SEQUENCE}bbbbb", circular=True)
    >>> [a, b]
    [Dseqrecord(-45), Dseqrecord(o39)]
    >>> res = cre_lox_integration(a, [b])
    >>> res
    [Dseqrecord(-84)]
    >>> res2 = cre_lox_excision(res[0])
    >>> res2
    [Dseqrecord(o39), Dseqrecord(-45)]

    Below an example with lox66 and lox71 (irreversible integration).
    Here, the result of excision is still returned because there is a low
    probability of it happening, but it's considered a rare event.

    >>> lox66 = 'ATAACTTCGTATAGCATACATTATACGAACGGTA'
    >>> lox71 = 'TACCGTTCGTATAGCATACATTATACGAAGTTAT'
    >>> a = Dseqrecord(f"cccccc{lox66}aaaaa")
    >>> b = Dseqrecord(f"{lox71}bbbbb", circular=True)
    >>> res = cre_lox_integration(a, [b])
    >>> res
    [Dseqrecord(-84)]
    >>> res2 = cre_lox_excision(res[0])
    >>> res2
    [Dseqrecord(o39), Dseqrecord(-45)]

    """
    # No minimum overlap length applies here, hence limit=None.
    return common_function_integration_products(
        frags=common_handle_insertion_fragments(genome, inserts),
        limit=None,
        algorithm=cre_loxP_overlap,
    )
|
|
2578
|
+
|
|
2579
|
+
|
|
2580
|
+
def cre_lox_excision(genome: _Dseqrecord) -> list[_Dseqrecord]:
    """Compute the products of a CRE-lox excision.

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence

    Returns
    -------
    list[_Dseqrecord]
        List containing excised plasmid and remaining genome sequence

    Examples
    --------

    Below an example of reversible integration and excision.

    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.assembly2 import cre_lox_integration, cre_lox_excision
    >>> from pydna.cre_lox import LOXP_SEQUENCE
    >>> a = Dseqrecord(f"cccccc{LOXP_SEQUENCE}aaaaa")
    >>> b = Dseqrecord(f"{LOXP_SEQUENCE}bbbbb", circular=True)
    >>> [a, b]
    [Dseqrecord(-45), Dseqrecord(o39)]
    >>> res = cre_lox_integration(a, [b])
    >>> res
    [Dseqrecord(-84)]
    >>> res2 = cre_lox_excision(res[0])
    >>> res2
    [Dseqrecord(o39), Dseqrecord(-45)]

    Below an example with lox66 and lox71 (irreversible integration).
    Here, the result of excision is still returned because there is a low
    probability of it happening, but it's considered a rare event.

    >>> lox66 = 'ATAACTTCGTATAGCATACATTATACGAACGGTA'
    >>> lox71 = 'TACCGTTCGTATAGCATACATTATACGAAGTTAT'
    >>> a = Dseqrecord(f"cccccc{lox66}aaaaa")
    >>> b = Dseqrecord(f"{lox71}bbbbb", circular=True)
    >>> res = cre_lox_integration(a, [b])
    >>> res
    [Dseqrecord(-84)]
    >>> res2 = cre_lox_excision(res[0])
    >>> res2
    [Dseqrecord(o39), Dseqrecord(-45)]
    """
    # No minimum overlap length applies here, hence limit=None.
    return common_function_excision_products(
        genome=genome,
        limit=None,
        algorithm=cre_loxP_overlap,
    )
|