pydna 5.5.2__py3-none-any.whl → 5.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydna/assembly2.py CHANGED
@@ -1,5 +1,8 @@
1
1
  # -*- coding: utf-8 -*-
2
- """Slightly different assembly implementation"""
2
+ """
3
+ Improved implementation of the assembly module. To see a list of issues with the previous implementation,
4
+ see [issues tagged with fixed-with-new-assembly-model](https://github.com/pydna-group/pydna/issues?q=is%3Aissue%20state%3Aopen%20label%3Afixed-with-new-assembly-model)
5
+ """
3
6
 
4
7
  import networkx as _nx
5
8
  import itertools as _itertools
@@ -26,12 +29,20 @@ from pydna.primer import Primer as _Primer
26
29
  from pydna.seqrecord import SeqRecord as _SeqRecord
27
30
  from pydna.types import (
28
31
  CutSiteType,
32
+ # TODO: allow user to enforce multi-site
29
33
  EdgeRepresentationAssembly,
30
34
  SubFragmentRepresentationAssembly,
31
35
  AssemblyAlgorithmType,
32
36
  SequenceOverlap,
33
37
  AssemblyEdgeType,
34
38
  )
39
+ from pydna.gateway import gateway_overlap, find_gateway_sites
40
+ from pydna.cre_lox import cre_loxP_overlap
41
+
42
+ from typing import TYPE_CHECKING, Callable
43
+
44
+ if TYPE_CHECKING:
45
+ from Bio.Restriction import AbstractCut as _AbstractCut
35
46
 
36
47
 
37
48
  def gather_overlapping_locations(
@@ -366,7 +377,7 @@ def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
366
377
  return [tuple(m) for m in matches]
367
378
 
368
379
 
369
- def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=0):
380
+ def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit: bool = False):
370
381
  """
371
382
  Assembly algorithm for ligation of sticky ends.
372
383
 
@@ -376,7 +387,7 @@ def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=0):
376
387
  Args:
377
388
  seqx (_Dseqrecord): The first sequence
378
389
  seqy (_Dseqrecord): The second sequence
379
- limit (int): Minimum length of the overlap
390
+ limit (bool): Whether to allow partial overlaps
380
391
 
381
392
  Returns:
382
393
  list[SequenceOverlap]: A list of overlaps between the two sequences
@@ -389,16 +400,16 @@ def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=0):
389
400
  >>> from pydna.assembly2 import sticky_end_sub_strings
390
401
  >>> x = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 0, 3))
391
402
  >>> y = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 3, 0))
392
- >>> sticky_end_sub_strings(x, y, limit=0)
403
+ >>> sticky_end_sub_strings(x, y, limit=False)
393
404
  [(3, 0, 3)]
394
- >>> sticky_end_sub_strings(y, x, limit=0)
405
+ >>> sticky_end_sub_strings(y, x, limit=False)
395
406
  []
396
407
 
397
408
  Ligation of partially overlapping sticky ends, specified with limit=True
398
409
 
399
410
  >>> x = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 0, 2))
400
411
  >>> y = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 3, 0))
401
- >>> sticky_end_sub_strings(x, y, limit=0)
412
+ >>> sticky_end_sub_strings(x, y, limit=False)
402
413
  []
403
414
  >>> sticky_end_sub_strings(x, y, limit=True)
404
415
  [(4, 0, 2)]
@@ -1900,3 +1911,717 @@ class SingleFragmentAssembly(Assembly):
1900
1911
 
1901
1912
  def get_linear_assemblies(self):
1902
1913
  raise NotImplementedError("Linear assembly does not make sense")
1914
+
1915
+
1916
def common_function_assembly_products(
    frags: list[_Dseqrecord],
    limit: int | None,
    algorithm: Callable,
    circular_only: bool,
    filter_results_function: Callable | None = None,
) -> list[_Dseqrecord]:
    """Shared driver used by the ``*_assembly`` functions to avoid duplication.

    A ``SingleFragmentAssembly`` is built when exactly one fragment is passed,
    an ``Assembly`` (``use_fragment_order=False``, ``use_all_fragments=True``)
    otherwise. Circular assemblies are always collected; unless
    ``circular_only`` is set, insertion assemblies (single fragment) or the
    filtered linear assemblies (several fragments) are appended. Could be
    simplified further once SingleFragmentAssembly and Assembly are merged.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int or None
        Minimum overlap length required, or None if not applicable
    algorithm : Callable
        Function that determines valid overlaps between fragments
    circular_only : bool
        If True, only return circular assemblies
    filter_results_function : Callable or None, optional
        Predicate called with each candidate assembly; only candidates for
        which it returns a truthy value are kept. No filtering is applied
        when it is None (the default).

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    single_fragment = len(frags) == 1
    if single_fragment:
        asm = SingleFragmentAssembly(frags, limit, algorithm)
    else:
        asm = Assembly(
            frags, limit, algorithm, use_fragment_order=False, use_all_fragments=True
        )

    candidates = asm.get_circular_assemblies()
    if not circular_only:
        if single_fragment:
            candidates += asm.get_insertion_assemblies()
        elif len(frags) > 1:
            # The circular candidates are passed along so the helper can
            # decide which linear assemblies to keep.
            candidates += filter_linear_subassemblies(
                asm.get_linear_assemblies(), candidates, frags
            )

    if filter_results_function:
        candidates = list(filter(filter_results_function, candidates))

    return [assemble(frags, candidate) for candidate in candidates]
1960
+
1961
+
1962
def gibson_assembly(
    frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
) -> list[_Dseqrecord]:
    """Compute the products of a Gibson assembly.

    Thin wrapper around ``common_function_assembly_products`` that uses
    ``gibson_overlap`` as the overlap-finding algorithm.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int, optional
        Minimum overlap length required, by default 25
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    return common_function_assembly_products(
        frags=frags,
        limit=limit,
        algorithm=gibson_overlap,
        circular_only=circular_only,
    )
1984
+
1985
+
1986
def in_fusion_assembly(
    frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
) -> list[_Dseqrecord]:
    """Returns the products for in-fusion assembly. This is the same as Gibson
    assembly, but with a different name.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int, optional
        Minimum overlap length required, by default 25
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    # Forward circular_only as well; it used to be dropped here, so callers
    # asking for circular products only still received linear ones.
    return gibson_assembly(frags, limit, circular_only)
2007
+
2008
+
2009
def fusion_pcr_assembly(
    frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
) -> list[_Dseqrecord]:
    """Returns the products for fusion PCR assembly. This is the same as Gibson
    assembly, but with a different name.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int, optional
        Minimum overlap length required, by default 25
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    # Forward circular_only as well; it used to be dropped here, so callers
    # asking for circular products only still received linear ones.
    return gibson_assembly(frags, limit, circular_only)
2030
+
2031
+
2032
def in_vivo_assembly(
    frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
) -> list[_Dseqrecord]:
    """Compute the products of in vivo assembly (IVA).

    IVA relies on homologous recombination between the fragments; overlaps
    are found with ``common_sub_strings``.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int, optional
        Minimum overlap length required, by default 25
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    return common_function_assembly_products(
        frags=frags,
        limit=limit,
        algorithm=common_sub_strings,
        circular_only=circular_only,
    )
2054
+
2055
+
2056
def restriction_ligation_assembly(
    frags: list[_Dseqrecord],
    enzymes: list["_AbstractCut"],
    allow_blunt: bool = True,
    circular_only: bool = False,
) -> list[_Dseqrecord]:
    """Compute the products of a restriction-ligation assembly:
    * Finds cutsites in the fragments
    * Finds all products that could be assembled by ligating the fragments based on those cutsites
    * Will NOT return products that combine an existing end with an end generated by the same enzyme (see example below)

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    enzymes : list[_AbstractCut]
        List of restriction enzymes to use
    allow_blunt : bool, optional
        If True, allow blunt end ligations, by default True
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules

    Examples
    --------
    In the example below, we plan to assemble a plasmid from a backbone and an insert, using the EcoRI and SalI enzymes.
    Note how 2 circular products are returned, one contains the insert (`aggt`)
    and the desired part of the backbone (`cccccc`), the other contains the
    reverse complement of the insert (`acct`) and the cut-out part of the backbone (`aaa`).

    >>> from pydna.assembly2 import restriction_ligation_assembly
    >>> from pydna.dseqrecord import Dseqrecord
    >>> from Bio.Restriction import EcoRI, SalI
    >>> backbone = Dseqrecord("cccGAATTCaaaGTCGACccc", circular=True)
    >>> insert = Dseqrecord("ggGAATTCaggtGTCGACgg")
    >>> products = restriction_ligation_assembly([backbone, insert], [EcoRI, SalI], circular_only=True)
    >>> products[0].seq
    Dseq(o22)
    TCGACccccccGAATTCaggtG
    AGCTGggggggCTTAAGtccaC
    >>> products[1].seq
    Dseq(o19)
    AATTCaaaGTCGACacctG
    TTAAGtttCAGCTGtggaC

    Note that passing a pre-cut fragment will not work.

    >>> restriction_products = insert.cut([EcoRI, SalI])
    >>> cut_insert = restriction_products[1]
    >>> restriction_ligation_assembly([backbone, cut_insert], [EcoRI, SalI], circular_only=True)
    []

    It also works with a single fragment, for circularization:

    >>> seq = Dseqrecord("GAATTCaaaGAATTC")
    >>> products =restriction_ligation_assembly([seq], [EcoRI])
    >>> products[0].seq
    Dseq(o9)
    AATTCaaaG
    TTAAGtttC
    """

    def algo(x, y, _l):
        # The limit argument (_l) is not used by this algorithm. The fourth
        # positional argument is fixed to False (presumably the "partial
        # overlap" switch -- confirm against restriction_ligation_overlap);
        # blunt ligation follows the caller's allow_blunt (True by default).
        return restriction_ligation_overlap(x, y, enzymes, False, allow_blunt)

    return common_function_assembly_products(
        frags=frags, limit=None, algorithm=algo, circular_only=circular_only
    )
2127
+
2128
+
2129
def golden_gate_assembly(
    frags: list[_Dseqrecord],
    enzymes: list["_AbstractCut"],
    allow_blunt: bool = True,
    circular_only: bool = False,
) -> list[_Dseqrecord]:
    """Compute the products of a Golden Gate assembly.

    This is an alias of `restriction_ligation_assembly`: all arguments are
    forwarded unchanged. Check the documentation of that function for more
    details.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    enzymes : list[_AbstractCut]
        List of restriction enzymes to use
    allow_blunt : bool, optional
        If True, allow blunt end ligations, by default True
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules

    Examples
    --------
    See the example for `restriction_ligation_assembly`.
    """
    return restriction_ligation_assembly(
        frags=frags,
        enzymes=enzymes,
        allow_blunt=allow_blunt,
        circular_only=circular_only,
    )
2160
+
2161
+
2162
def ligation_assembly(
    frags: list[_Dseqrecord],
    allow_blunt: bool = False,
    allow_partial_overlap: bool = False,
    circular_only: bool = False,
) -> list[_Dseqrecord]:
    """Compute the products of ligating the given fragments.

    The fragments are ligated as passed, so they must already be digested if
    a digestion is needed.

    For most cases, you probably should use `restriction_ligation_assembly` instead.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    allow_blunt : bool, optional
        If True, allow blunt end ligations, by default False
    allow_partial_overlap : bool, optional
        If True, allow partial overlaps between sticky ends, by default False
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules


    Examples
    --------
    In the example below, we plan to assemble a plasmid from a backbone and an insert,
    using the EcoRI enzyme. The insert and insertion site in the backbone are flanked by
    EcoRI sites, so there are two possible products depending on the orientation of the insert.

    >>> from pydna.assembly2 import ligation_assembly
    >>> from pydna.dseqrecord import Dseqrecord
    >>> from Bio.Restriction import EcoRI
    >>> backbone = Dseqrecord("cccGAATTCaaaGAATTCccc", circular=True)
    >>> backbone_cut = backbone.cut(EcoRI)[1]
    >>> insert = Dseqrecord("ggGAATTCaggtGAATTCgg")
    >>> insert_cut = insert.cut(EcoRI)[1]
    >>> products = ligation_assembly([backbone_cut, insert_cut])
    >>> products[0].seq
    Dseq(o22)
    AATTCccccccGAATTCaggtG
    TTAAGggggggCTTAAGtccaC
    >>> products[1].seq
    Dseq(o22)
    AATTCccccccGAATTCacctG
    TTAAGggggggCTTAAGtggaC
    """

    def sticky_end_algorithm(x, y, _l):
        # The limit argument (_l) is replaced by the partial-overlap switch.
        return sticky_end_sub_strings(x, y, allow_partial_overlap)

    # Blunt-end joining is added on top of sticky-end joining only on request.
    algo = (
        combine_algorithms(sticky_end_algorithm, blunt_overlap)
        if allow_blunt
        else sticky_end_algorithm
    )

    return common_function_assembly_products(
        frags=frags, limit=None, algorithm=algo, circular_only=circular_only
    )
2223
+
2224
+
2225
def assembly_is_multi_site(asm: list[EdgeRepresentationAssembly]) -> bool:
    """Returns True if the assembly is a multi-site assembly, False otherwise.

    An assembly with fewer than two edges is never multi-site. Otherwise the
    assembly is converted to its subfragment representation and is considered
    multi-site only when, for every subfragment, the entries at index 1 and 2
    differ (presumably the two join locations on that fragment -- confirm
    against ``edge_representation2subfragment_representation``).

    Parameters
    ----------
    asm : list[EdgeRepresentationAssembly]
        Assembly in edge representation, i.e. a sequence of edge tuples whose
        first two entries are taken to be the source and target fragment.

    Returns
    -------
    bool
        True if every subfragment is joined through two different sites.
    """

    if len(asm) < 2:
        return False

    # A chain of edges closes into a cycle when the last edge ends on the
    # fragment the first edge starts from. The previous test compared the
    # first edge's target with the last edge's source, which holds for every
    # two-edge chain (cyclic or not) and fails for longer cycles.
    is_cycle = asm[0][0] == asm[-1][1]
    asm2 = edge_representation2subfragment_representation(asm, is_cycle)

    return all(f[1] != f[2] for f in asm2)
2235
+
2236
+
2237
def gateway_assembly(
    frags: list[_Dseqrecord],
    reaction_type: str,
    greedy: bool = False,
    circular_only: bool = False,
    multi_site_only: bool = False,
) -> list[_Dseqrecord]:
    """Compute the products of a Gateway assembly / Gateway cloning reaction.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    reaction_type : str
        Type of Gateway reaction, either 'BP' or 'LR'
    greedy : bool, optional
        If True, use greedy gateway consensus sites, by default False
    circular_only : bool, optional
        If True, only return circular assemblies, by default False
    multi_site_only : bool, optional
        If True, only return products where 2 sites recombined. Even if input sequences
        contain multiple att sites (typically 2), a product could be generated where only one
        site recombines. That's typically not what you want, so you can set this to True to
        only return products where both att sites recombined.

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules

    Raises
    ------
    ValueError
        If ``reaction_type`` is neither 'BP' nor 'LR', or if no product is
        obtained (the message lists the sites found in each fragment).


    Examples
    --------

    Below an example with dummy Gateway sequences, composed with minimal sequences and the consensus
    att sites.

    >>> from pydna.assembly2 import gateway_assembly
    >>> from pydna.dseqrecord import Dseqrecord
    >>> attB1 = "ACAACTTTGTACAAAAAAGCAGAAG"
    >>> attP1 = "AAAATAATGATTTTATTTGACTGATAGTGACCTGTTCGTTGCAACAAATTGATGAGCAATGCTTTTTTATAATGCCAACTTTGTACAAAAAAGCTGAACGAGAAGCGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATCCAGTCACTATGAATCAACTACTTAGATGGTATTAGTGACCTGTA"
    >>> attR1 = "ACAACTTTGTACAAAAAAGCTGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATGCAGTCACTATG"
    >>> attL1 = "CAAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAAATTGATAAGCAATGCTTTCTTATAATGCCAACTTTGTACAAAAAAGCAGGCT"
    >>> seq1 = Dseqrecord("aaa" + attB1 + "ccc")
    >>> seq2 = Dseqrecord("aaa" + attP1 + "ccc")
    >>> seq3 = Dseqrecord("aaa" + attR1 + "ccc")
    >>> seq4 = Dseqrecord("aaa" + attL1 + "ccc")
    >>> products_BP = gateway_assembly([seq1, seq2], "BP")
    >>> products_LR = gateway_assembly([seq3, seq4], "LR")
    >>> len(products_BP)
    2
    >>> len(products_LR)
    2

    Now let's understand the `multi_site_only` parameter. Let's consider a case where we are swapping fragments
    between two plasmids using an LR reaction. Experimentally, we expect to obtain two plasmids, resulting from the
    swapping between the two att sites. That's what we get if we set `multi_site_only` to True.

    >>> attL2 = 'aaataatgattttattttgactgatagtgacctgttcgttgcaacaaattgataagcaatgctttcttataatgccaactttgtacaagaaagctg'
    >>> attR2 = 'accactttgtacaagaaagctgaacgagaaacgtaaaatgatataaatatcaatatattaaattagattttgcataaaaaacagactacataatactgtaaaacacaacatatccagtcactatg'
    >>> insert = Dseqrecord("cccccc" + attL1 + "ccc" + attL2 + "cccccc", circular=True)
    >>> backbone = Dseqrecord("ttttt" + attR1 + "aaa" + attR2, circular=True)
    >>> products = gateway_assembly([insert, backbone], "LR", multi_site_only=True)
    >>> len(products)
    2

    However, if we set `multi_site_only` to False, we get 4 products, which also include the intermediate products
    where the two plasmids are combined into a single one through recombination of a single att site. This is an
    intermediate of the reaction, and typically we don't want it:

    >>> products = gateway_assembly([insert, backbone], "LR", multi_site_only=False)
    >>> print([len(p) for p in products])
    [469, 237, 232, 469]


    """

    if reaction_type not in ("BP", "LR"):
        raise ValueError(
            f"Invalid reaction type: {reaction_type}, can only be BP or LR"
        )

    def algo(x, y, _l):
        # The limit argument (_l) is not used for Gateway overlaps.
        return gateway_overlap(x, y, reaction_type, greedy)

    filter_results_function = assembly_is_multi_site if multi_site_only else None

    products = common_function_assembly_products(
        frags, None, algo, circular_only, filter_results_function
    )
    if products:
        return products

    # Nothing was produced: report which sites were found in each fragment so
    # the user can see why the inputs are incompatible.
    sites_in_fragments = [
        list(find_gateway_sites(frag, greedy).keys()) for frag in frags
    ]
    formatted_strings = [
        f'fragment {i + 1}: {", ".join(sites)}'
        for i, sites in enumerate(sites_in_fragments)
    ]
    raise ValueError(
        f"Inputs are not compatible for {reaction_type} reaction.\n\n"
        + "\n".join(formatted_strings),
    )
2342
+
2343
+
2344
def common_function_integration_products(
    frags: list[_Dseqrecord], limit: int | None, algorithm: Callable
) -> list[_Dseqrecord]:
    """Common function to avoid code duplication for integration products.

    The first element of ``frags`` is treated as the genome and must be
    linear; the remaining elements are the inserts.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to integrate
    limit : int or None
        Minimum overlap length required, or None if not applicable
    algorithm : Callable
        Function that determines valid overlaps between fragments

    Returns
    -------
    list[_Dseqrecord]
        List of integrated DNA molecules

    Raises
    ------
    ValueError
        If the genome (first fragment) is circular.
    """
    # Validate before building the assembly object, so that an invalid genome
    # is rejected without first computing the overlaps between fragments.
    if frags[0].circular:
        raise ValueError(
            "Genome must be linear for integration assembly, use in vivo assembly instead"
        )

    if len(frags) == 1:
        asm = SingleFragmentAssembly(frags, limit, algorithm)
    else:
        asm = Assembly(
            frags, limit, algorithm, use_fragment_order=False, use_all_fragments=True
        )

    # We only want insertions in the genome (first fragment): keep assemblies
    # whose first edge starts at fragment 1.
    output_assemblies = [a for a in asm.get_insertion_assemblies() if a[0][0] == 1]
    # The third argument presumably marks the assembly as an insertion --
    # confirm against ``assemble``.
    return [assemble(frags, a, True) for a in output_assemblies]
2378
+
2379
+
2380
def common_handle_insertion_fragments(
    genome: _Dseqrecord, inserts: list[_Dseqrecord]
) -> list[_Dseqrecord]:
    """Validate the genome / inserts and merge them into one fragment list.

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence
    inserts : list[_Dseqrecord]
        DNA fragment(s) to insert. Must be a non-empty list, a bare
        Dseqrecord is rejected.

    Returns
    -------
    list[_Dseqrecord]
        List containing the genome followed by the insert fragments

    Raises
    ------
    ValueError
        If ``genome`` is not a Dseqrecord, or if ``inserts`` is not a
        non-empty list made only of Dseqrecord objects.
    """
    if not isinstance(genome, _Dseqrecord):
        raise ValueError("Genome must be a Dseqrecord object")

    inserts_are_valid = isinstance(inserts, list) and all(
        isinstance(insert, _Dseqrecord) for insert in inserts
    )
    if not inserts_are_valid:
        raise ValueError("Inserts must be a list of Dseqrecord objects")

    if not inserts:
        raise ValueError("Inserts must be a non-empty list of Dseqrecord objects")

    return [genome, *inserts]
2409
+
2410
+
2411
def common_function_excision_products(
    genome: _Dseqrecord, limit: int | None, algorithm: Callable
) -> list[_Dseqrecord]:
    """Shared implementation of the ``*_excision`` functions.

    Builds a ``SingleFragmentAssembly`` from the genome alone and returns its
    circular products followed by its insertion products.

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence
    limit : int or None
        Minimum overlap length required, or None if not applicable
    algorithm : Callable
        Function that determines valid overlaps between fragments

    Returns
    -------
    list[_Dseqrecord]
        List of excised DNA molecules
    """
    asm = SingleFragmentAssembly([genome], limit, algorithm)
    circular_products = asm.assemble_circular()
    insertion_products = asm.assemble_insertion()
    return circular_products + insertion_products
2432
+
2433
+
2434
def homologous_recombination_integration(
    genome: _Dseqrecord,
    inserts: list[_Dseqrecord],
    limit: int = 40,
) -> list[_Dseqrecord]:
    """Compute the products of integrating an insert (or several inserts joined
    through in vivo recombination) into the genome by homologous recombination.

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence
    inserts : list[_Dseqrecord]
        DNA fragment(s) to insert
    limit : int, optional
        Minimum homology length required, by default 40

    Returns
    -------
    list[_Dseqrecord]
        List of integrated DNA molecules


    Examples
    --------

    Below an example with a single insert.

    >>> from pydna.assembly2 import homologous_recombination_integration
    >>> from pydna.dseqrecord import Dseqrecord
    >>> homology = "AAGTCCGTTCGTTTTACCTG"
    >>> genome = Dseqrecord(f"aaaaaa{homology}ccccc{homology}aaaaaa")
    >>> insert = Dseqrecord(f"{homology}gggg{homology}")
    >>> products = homologous_recombination_integration(genome, [insert], 20)
    >>> str(products[0].seq)
    'aaaaaaAAGTCCGTTCGTTTTACCTGggggAAGTCCGTTCGTTTTACCTGaaaaaa'

    Below an example with two inserts joined through homology.

    >>> homology2 = "ATTACAGCATGGGAAGAAAGA"
    >>> insert_1 = Dseqrecord(f"{homology}gggg{homology2}")
    >>> insert_2 = Dseqrecord(f"{homology2}cccc{homology}")
    >>> products = homologous_recombination_integration(genome, [insert_1, insert_2], 20)
    >>> str(products[0].seq)
    'aaaaaaAAGTCCGTTCGTTTTACCTGggggATTACAGCATGGGAAGAAAGAccccAAGTCCGTTCGTTTTACCTGaaaaaa'
    """
    # Validation happens in the helper, which also puts the genome first.
    fragments = common_handle_insertion_fragments(genome=genome, inserts=inserts)

    return common_function_integration_products(
        frags=fragments, limit=limit, algorithm=common_sub_strings
    )
2483
+
2484
+
2485
def homologous_recombination_excision(
    genome: _Dseqrecord, limit: int = 40
) -> list[_Dseqrecord]:
    """Compute the products of excising a fragment from the genome through
    homologous recombination.

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence
    limit : int, optional
        Minimum homology length required, by default 40

    Returns
    -------
    list[_Dseqrecord]
        List containing excised plasmid and remaining genome sequence

    Examples
    --------

    Example of a homologous recombination event, where a plasmid is excised from the
    genome (circular sequence of 25 bp), and that part is removed from the genome,
    leaving a shorter linear sequence (32 bp).

    >>> from pydna.assembly2 import homologous_recombination_excision
    >>> from pydna.dseqrecord import Dseqrecord
    >>> homology = "AAGTCCGTTCGTTTTACCTG"
    >>> genome = Dseqrecord(f"aaaaaa{homology}ccccc{homology}aaaaaa")
    >>> products = homologous_recombination_excision(genome, 20)
    >>> products
    [Dseqrecord(o25), Dseqrecord(-32)]
    """
    return common_function_excision_products(
        genome=genome, limit=limit, algorithm=common_sub_strings
    )
2519
+
2520
+
2521
def cre_lox_integration(
    genome: _Dseqrecord, inserts: list[_Dseqrecord]
) -> list[_Dseqrecord]:
    """Compute the products of integrating an insert (or several inserts joined
    through cre-lox recombination among them) into the genome through cre-lox
    integration.

    Also works with lox66 and lox71 (see `pydna.cre_lox` for more details).

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence
    inserts : list[_Dseqrecord]
        DNA fragment(s) to insert, always passed as a list

    Returns
    -------
    list[_Dseqrecord]
        List of integrated DNA molecules

    Examples
    --------

    Below an example of reversible integration and excision.

    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.assembly2 import cre_lox_integration, cre_lox_excision
    >>> from pydna.cre_lox import LOXP_SEQUENCE
    >>> a = Dseqrecord(f"cccccc{LOXP_SEQUENCE}aaaaa")
    >>> b = Dseqrecord(f"{LOXP_SEQUENCE}bbbbb", circular=True)
    >>> [a, b]
    [Dseqrecord(-45), Dseqrecord(o39)]
    >>> res = cre_lox_integration(a, [b])
    >>> res
    [Dseqrecord(-84)]
    >>> res2 = cre_lox_excision(res[0])
    >>> res2
    [Dseqrecord(o39), Dseqrecord(-45)]

    Below an example with lox66 and lox71 (irreversible integration).
    Here, the result of excision is still returned because there is a low
    probability of it happening, but it's considered a rare event.

    >>> lox66 = 'ATAACTTCGTATAGCATACATTATACGAACGGTA'
    >>> lox71 = 'TACCGTTCGTATAGCATACATTATACGAAGTTAT'
    >>> a = Dseqrecord(f"cccccc{lox66}aaaaa")
    >>> b = Dseqrecord(f"{lox71}bbbbb", circular=True)
    >>> res = cre_lox_integration(a, [b])
    >>> res
    [Dseqrecord(-84)]
    >>> res2 = cre_lox_excision(res[0])
    >>> res2
    [Dseqrecord(o39), Dseqrecord(-45)]

    """
    # Validation happens in the helper, which also puts the genome first.
    fragments = common_handle_insertion_fragments(genome=genome, inserts=inserts)
    # No minimum overlap length applies to lox sites, hence limit=None.
    return common_function_integration_products(
        frags=fragments, limit=None, algorithm=cre_loxP_overlap
    )
2578
+
2579
+
2580
def cre_lox_excision(genome: _Dseqrecord) -> list[_Dseqrecord]:
    """Compute the products of a CRE-lox excision.

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence

    Returns
    -------
    list[_Dseqrecord]
        List containing excised plasmid and remaining genome sequence

    Examples
    --------

    Below an example of reversible integration and excision.

    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.assembly2 import cre_lox_integration, cre_lox_excision
    >>> from pydna.cre_lox import LOXP_SEQUENCE
    >>> a = Dseqrecord(f"cccccc{LOXP_SEQUENCE}aaaaa")
    >>> b = Dseqrecord(f"{LOXP_SEQUENCE}bbbbb", circular=True)
    >>> [a, b]
    [Dseqrecord(-45), Dseqrecord(o39)]
    >>> res = cre_lox_integration(a, [b])
    >>> res
    [Dseqrecord(-84)]
    >>> res2 = cre_lox_excision(res[0])
    >>> res2
    [Dseqrecord(o39), Dseqrecord(-45)]

    Below an example with lox66 and lox71 (irreversible integration).
    Here, the result of excision is still returned because there is a low
    probability of it happening, but it's considered a rare event.

    >>> lox66 = 'ATAACTTCGTATAGCATACATTATACGAACGGTA'
    >>> lox71 = 'TACCGTTCGTATAGCATACATTATACGAAGTTAT'
    >>> a = Dseqrecord(f"cccccc{lox66}aaaaa")
    >>> b = Dseqrecord(f"{lox71}bbbbb", circular=True)
    >>> res = cre_lox_integration(a, [b])
    >>> res
    [Dseqrecord(-84)]
    >>> res2 = cre_lox_excision(res[0])
    >>> res2
    [Dseqrecord(o39), Dseqrecord(-45)]
    """
    # No minimum overlap length applies to lox sites, hence limit=None.
    return common_function_excision_products(
        genome=genome, limit=None, algorithm=cre_loxP_overlap
    )