pydna 5.5.3__py3-none-any.whl → 5.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydna/assembly2.py CHANGED
@@ -39,9 +39,26 @@ from pydna.types import (
39
39
  from pydna.gateway import gateway_overlap, find_gateway_sites
40
40
  from pydna.cre_lox import cre_loxP_overlap
41
41
 
42
- from typing import TYPE_CHECKING, Callable
42
+ from typing import TYPE_CHECKING, Callable, Literal
43
+ from pydna.opencloning_models import (
44
+ AssemblySource,
45
+ RestrictionAndLigationSource,
46
+ GibsonAssemblySource,
47
+ InFusionSource,
48
+ OverlapExtensionPCRLigationSource,
49
+ InVivoAssemblySource,
50
+ LigationSource,
51
+ GatewaySource,
52
+ HomologousRecombinationSource,
53
+ CreLoxRecombinationSource,
54
+ PCRSource,
55
+ SourceInput,
56
+ CRISPRSource,
57
+ )
58
+ from pydna.crispr import cas9
59
+ import warnings
43
60
 
44
- if TYPE_CHECKING:
61
+ if TYPE_CHECKING: # pragma: no cover
45
62
  from Bio.Restriction import AbstractCut as _AbstractCut
46
63
 
47
64
 
@@ -80,15 +97,22 @@ def ends_from_cutsite(
80
97
  ) -> tuple[tuple[str, str], tuple[str, str]]:
81
98
  """Get the sticky or blunt ends created by a restriction enzyme cut.
82
99
 
83
- Args:
84
- cutsite (CutSiteType): A tuple ((cut_watson, ovhg), enzyme) describing where the cut occurs
85
- seq (_Dseq): The DNA sequence being cut
100
+ Parameters
101
+ ----------
102
+ cutsite : CutSiteType
103
+ A tuple ((cut_watson, ovhg), enzyme) describing where the cut occurs
104
+ seq : _Dseq
105
+ The DNA sequence being cut
86
106
 
87
- Raises:
88
- ValueError: If cutsite is None
107
+ Raises
108
+ ------
109
+ ValueError
110
+ If cutsite is None
89
111
 
90
- Returns:
91
- tuple[tuple[str, str], tuple[str, str]]: A tuple of two tuples, each containing the type of end ('5\'', '3\'', or 'blunt')
112
+ Returns
113
+ -------
114
+ tuple[tuple[str, str], tuple[str, str]]
115
+ A tuple of two tuples, each containing the type of end ('5\'', '3\'', or 'blunt')
92
116
  and the sequence of the overhang. The first tuple is for the left end, second for the right end.
93
117
 
94
118
  >>> from Bio.Restriction import NotI
@@ -129,14 +153,23 @@ def restriction_ligation_overlap(
129
153
 
130
154
  Like in sticky and gibson, the order matters (see example below of partial overlap)
131
155
 
132
- Args:
133
- seqx (_Dseqrecord): The first sequence
134
- seqy (_Dseqrecord): The second sequence
135
- enzymes (RestrictionBatch): The enzymes to use
136
- partial (bool): Whether to allow partial overlaps
137
- allow_blunt (bool): Whether to allow blunt ends
138
- Returns:
139
- list[SequenceOverlap]: A list of overlaps between the two sequences
156
+ Parameters
157
+ ----------
158
+ seqx : _Dseqrecord
159
+ The first sequence
160
+ seqy : _Dseqrecord
161
+ The second sequence
162
+ enzymes : RestrictionBatch
163
+ The enzymes to use
164
+ partial : bool
165
+ Whether to allow partial overlaps
166
+ allow_blunt : bool
167
+ Whether to allow blunt ends
168
+
169
+ Returns
170
+ -------
171
+ list[SequenceOverlap]
172
+ A list of overlaps between the two sequences
140
173
 
141
174
  >>> from pydna.dseqrecord import Dseqrecord
142
175
  >>> from pydna.assembly2 import restriction_ligation_overlap
@@ -230,13 +263,19 @@ def blunt_overlap(
230
263
  It basically returns [(len(seqx), 0, 0)] if the right end of seqx is blunt and the
231
264
  left end of seqy is blunt (compatible with blunt ligation). Otherwise, it returns an empty list.
232
265
 
233
- Args:
234
- seqx (_Dseqrecord): The first sequence
235
- seqy (_Dseqrecord): The second sequence
236
- limit (int): There for compatibility, but it is ignored
266
+ Parameters
267
+ ----------
268
+ seqx : _Dseqrecord
269
+ The first sequence
270
+ seqy : _Dseqrecord
271
+ The second sequence
272
+ limit : int
273
+ There for compatibility, but it is ignored
237
274
 
238
- Returns:
239
- list[SequenceOverlap]: A list of overlaps between the two sequences
275
+ Returns
276
+ -------
277
+ list[SequenceOverlap]
278
+ A list of overlaps between the two sequences
240
279
 
241
280
  >>> from pydna.assembly2 import blunt_overlap
242
281
  >>> from pydna.dseqrecord import Dseqrecord
@@ -322,25 +361,31 @@ def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
322
361
  Assembly algorithm to find terminal overlaps (e.g. for Gibson assembly).
323
362
  The order matters, we want alignments like:
324
363
 
325
- ```
326
- seqx: oooo------xxxx
327
- seqy: xxxx------oooo
328
- Product: oooo------xxxx------oooo
364
+ ::
329
365
 
330
- Not like:
366
+ seqx: oooo------xxxx
367
+ seqy: xxxx------oooo
368
+ Product: oooo------xxxx------oooo
331
369
 
332
- seqx: oooo------xxxx
333
- seqy: xxxx------oooo
334
- Product (unwanted): oooo
335
- ```
370
+ Not like:
336
371
 
337
- Args:
338
- seqx (_Dseqrecord): The first sequence
339
- seqy (_Dseqrecord): The second sequence
340
- limit (int): Minimum length of the overlap
372
+ seqx: oooo------xxxx
373
+ seqy: xxxx------oooo
374
+ Product (unwanted): oooo
341
375
 
342
- Returns:
343
- list[SequenceOverlap]: A list of overlaps between the two sequences
376
+ Parameters
377
+ ----------
378
+ seqx : _Dseqrecord
379
+ The first sequence
380
+ seqy : _Dseqrecord
381
+ The second sequence
382
+ limit : int
383
+ Minimum length of the overlap
384
+
385
+ Returns
386
+ -------
387
+ list[SequenceOverlap]
388
+ A list of overlaps between the two sequences
344
389
 
345
390
  >>> from pydna.dseqrecord import Dseqrecord
346
391
  >>> from pydna.assembly2 import gibson_overlap
@@ -384,13 +429,19 @@ def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit: bool = F
384
429
  For now, if limit is 0 / False (default), only full overlaps are considered.
385
430
  Otherwise, partial overlaps are also returned.
386
431
 
387
- Args:
388
- seqx (_Dseqrecord): The first sequence
389
- seqy (_Dseqrecord): The second sequence
390
- limit (bool): Whether to allow partial overlaps
432
+ Parameters
433
+ ----------
434
+ seqx : _Dseqrecord
435
+ The first sequence
436
+ seqy : _Dseqrecord
437
+ The second sequence
438
+ limit : bool
439
+ Whether to allow partial overlaps
391
440
 
392
- Returns:
393
- list[SequenceOverlap]: A list of overlaps between the two sequences
441
+ Returns
442
+ -------
443
+ list[SequenceOverlap]
444
+ A list of overlaps between the two sequences
394
445
 
395
446
 
396
447
  Ligation of fully overlapping sticky ends, note how the order matters
@@ -520,14 +571,21 @@ def primer_template_overlap(
520
571
  If seqx is a template and seqy is a primer, it represents the binding of a reverse primer,
521
572
  where the primer has been passed as its reverse complement (see examples).
522
573
 
523
- Args:
524
- seqx (_Dseqrecord | _Primer): The primer
525
- seqy (_Dseqrecord | _Primer): The template
526
- limit (int): Minimum length of the overlap
527
- mismatches (int): Maximum number of mismatches (only substitutions, no deletion or insertion)
574
+ Parameters
575
+ ----------
576
+ seqx : _Dseqrecord | _Primer
577
+ The primer
578
+ seqy : _Dseqrecord | _Primer
579
+ The template
580
+ limit : int
581
+ Minimum length of the overlap
582
+ mismatches : int
583
+ Maximum number of mismatches (only substitutions, no deletion or insertion)
528
584
 
529
- Returns:
530
- list[SequenceOverlap]: A list of overlaps between the primer and the template
585
+ Returns
586
+ -------
587
+ list[SequenceOverlap]
588
+ A list of overlaps between the primer and the template
531
589
 
532
590
  >>> from pydna.dseqrecord import Dseqrecord
533
591
  >>> from pydna.primer import Primer
@@ -537,7 +595,7 @@ def primer_template_overlap(
537
595
  >>> primer_template_overlap(primer, template, limit=8, mismatches=0)
538
596
  [(0, 2, 8)]
539
597
 
540
- This actually represents the binding of the primer `GCTGCTAA` (reverse complement)
598
+ This actually represents the binding of the primer ``GCTGCTAA`` (reverse complement)
541
599
  >>> primer_template_overlap(template, primer, limit=8, mismatches=0)
542
600
  [(2, 0, 8)]
543
601
  >>> primer_template_overlap(primer, template.reverse_complement(), limit=8, mismatches=0)
@@ -702,7 +760,7 @@ def assembly2str(assembly: EdgeRepresentationAssembly) -> str:
702
760
  ('1[8:14]:2[1:7]', '2[10:17]:3[1:8]')
703
761
 
704
762
  The reason for this is that by default, a feature '[8:14]' when present in a tuple
705
- is printed to the console as `SimpleLocation(ExactPosition(8), ExactPosition(14), strand=1)` (very long).
763
+ is printed to the console as ``SimpleLocation(ExactPosition(8), ExactPosition(14), strand=1)`` (very long).
706
764
  """
707
765
  return str(tuple(f"{u}{lu}:{v}{lv}" for u, v, lu, lv in assembly))
708
766
 
@@ -791,7 +849,7 @@ def assemble(
791
849
  out_dseqrecord = _Dseqrecord(subfragments[0])
792
850
 
793
851
  for fragment, overlap in zip(subfragments[1:], fragment_overlaps):
794
- # Shift the features of the right fragment to the left by `overlap`
852
+ # Shift the features of the right fragment to the left by ``overlap``
795
853
  new_features = [
796
854
  f._shift(len(out_dseqrecord) - overlap) for f in fragment.features
797
855
  ]
@@ -808,22 +866,25 @@ def assemble(
808
866
 
809
867
  # Special case for blunt circularisation
810
868
  if overlap == 0:
811
- return out_dseqrecord.looped()
812
-
813
- # Remove trailing overlap
814
- out_dseqrecord = _Dseqrecord(
815
- fill_dseq(out_dseqrecord.seq)[:-overlap],
816
- features=out_dseqrecord.features,
817
- circular=True,
818
- )
819
- for feature in out_dseqrecord.features:
820
- start, end = _location_boundaries(feature.location)
821
- if start >= len(out_dseqrecord) or end > len(out_dseqrecord):
822
- # Wrap around the origin
823
- feature.location = _shift_location(
824
- feature.location, 0, len(out_dseqrecord)
825
- )
826
-
869
+ out_dseqrecord = out_dseqrecord.looped()
870
+ else:
871
+ # Remove trailing overlap
872
+ out_dseqrecord = _Dseqrecord(
873
+ fill_dseq(out_dseqrecord.seq)[:-overlap],
874
+ features=out_dseqrecord.features,
875
+ circular=True,
876
+ )
877
+ for feature in out_dseqrecord.features:
878
+ start, end = _location_boundaries(feature.location)
879
+ if start >= len(out_dseqrecord) or end > len(out_dseqrecord):
880
+ # Wrap around the origin
881
+ feature.location = _shift_location(
882
+ feature.location, 0, len(out_dseqrecord)
883
+ )
884
+
885
+ out_dseqrecord.source = AssemblySource.from_subfragment_representation(
886
+ subfragment_representation, fragments, is_circular
887
+ )
827
888
  return out_dseqrecord
828
889
 
829
890
 
@@ -916,30 +977,29 @@ def get_assembly_subfragments(
916
977
 
917
978
  Subfragments are the slices of the fragments that are joined together
918
979
 
919
- For example:
920
- ```
921
- --A--
922
- TACGTAAT
923
- --B--
924
- TCGTAACGA
925
-
926
- Gives: TACGTAA / CGTAACGA
927
- ```
928
- To reproduce:
929
- ```
930
- a = Dseqrecord('TACGTAAT')
931
- b = Dseqrecord('TCGTAACGA')
932
- f = Assembly([a, b], limit=5)
933
- a0 = f.get_linear_assemblies()[0]
934
- print(assembly2str(a0))
935
- a0_subfragment_rep =edge_representation2subfragment_representation(a0, False)
936
- for f in get_assembly_subfragments([a, b], a0_subfragment_rep):
937
- print(f.seq)
938
-
939
- # prints TACGTAA and CGTAACGA
940
- ```
941
-
942
- Subfragments: `cccccgtatcgtgt`, `atcgtgtactgtcatattc`
980
+ For example::
981
+
982
+ --A--
983
+ TACGTAAT
984
+ --B--
985
+ TCGTAACGA
986
+
987
+ Gives: TACGTAA / CGTAACGA
988
+
989
+ To reproduce::
990
+
991
+ a = Dseqrecord('TACGTAAT')
992
+ b = Dseqrecord('TCGTAACGA')
993
+ f = Assembly([a, b], limit=5)
994
+ a0 = f.get_linear_assemblies()[0]
995
+ print(assembly2str(a0))
996
+ a0_subfragment_rep = edge_representation2subfragment_representation(a0, False)
997
+ for f in get_assembly_subfragments([a, b], a0_subfragment_rep):
998
+ print(f.seq)
999
+
1000
+ # prints TACGTAA and CGTAACGA
1001
+
1002
+ Subfragments: ``cccccgtatcgtgt``, ``atcgtgtactgtcatattc``
943
1003
  """
944
1004
  subfragments = list()
945
1005
  for node, start_location, end_location in subfragment_representation:
@@ -1028,33 +1088,38 @@ class Assembly:
1028
1088
 
1029
1089
  The assembly contains a directed graph, where nodes represent fragments and
1030
1090
  edges represent overlaps between fragments:
1091
+
1031
1092
  - The node keys are integers, representing the index of the fragment in the
1032
- input list of fragments. The sign of the node key represents the orientation
1033
- of the fragment, positive for forward orientation, negative for reverse orientation.
1093
+ input list of fragments. The sign of the node key represents the orientation
1094
+ of the fragment, positive for forward orientation, negative for reverse orientation.
1034
1095
  - The edges contain the locations of the overlaps in the fragments. For an edge (u, v, key):
1035
1096
  - u and v are the nodes connected by the edge.
1036
1097
  - key is a string that represents the location of the overlap. In the format:
1037
- 'u[start:end](strand):v[start:end](strand)'.
1098
+ 'u[start:end](strand):v[start:end](strand)'.
1038
1099
  - Edges have a 'locations' attribute, which is a list of two FeatureLocation objects,
1039
- representing the location of the overlap in the u and v fragment, respectively.
1100
+ representing the location of the overlap in the u and v fragment, respectively.
1040
1101
  - You can think of an edge as a representation of the join of two fragments.
1041
1102
 
1042
1103
  If fragment 1 and 2 share a subsequence of 6bp, [8:14] in fragment 1 and [1:7] in fragment 2,
1043
1104
  there will be 4 edges representing that overlap in the graph, for all possible
1044
1105
  orientations of the fragments (see add_edges_from_match for details):
1045
- - `(1, 2, '1[8:14]:2[1:7]')`
1046
- - `(2, 1, '2[1:7]:1[8:14]')`
1047
- - `(-1, -2, '-1[0:6]:-2[10:16]')`
1048
- - `(-2, -1, '-2[10:16]:-1[0:6]')`
1106
+
1107
+ - ``(1, 2, '1[8:14]:2[1:7]')``
1108
+ - ``(2, 1, '2[1:7]:1[8:14]')``
1109
+ - ``(-1, -2, '-1[0:6]:-2[10:16]')``
1110
+ - ``(-2, -1, '-2[10:16]:-1[0:6]')``
1049
1111
 
1050
1112
  An assembly can be thought of as a tuple of graph edges, but instead of representing them with node indexes and keys, we represent them
1051
1113
  as u, v, locu, locv, where u and v are the nodes connected by the edge, and locu and locv are the locations of the overlap in the first
1052
1114
  and second fragment. Assemblies are then represented as:
1115
+
1053
1116
  - Linear: ((1, 2, [8:14], [1:7]), (2, 3, [10:17], [1:8]))
1054
1117
  - Circular: ((1, 2, [8:14], [1:7]), (2, 3, [10:17], [1:8]), (3, 1, [12:17], [1:6]))
1118
+
1055
1119
  Note that the first and last fragment are the same in a circular assembly.
1056
1120
 
1057
1121
  The following constraints are applied to remove duplicate assemblies:
1122
+
1058
1123
  - Circular assemblies: the first subfragment is not reversed, and has the smallest index in the input fragment list.
1059
1124
  use_fragment_order is ignored.
1060
1125
  - Linear assemblies:
@@ -1065,7 +1130,7 @@ class Assembly:
1065
1130
  frags : list
1066
1131
  A list of Dseqrecord objects.
1067
1132
  limit : int, optional
1068
- The shortest shared homology to be considered, this is passed as the third argument to the `algorithm` function.
1133
+ The shortest shared homology to be considered, this is passed as the third argument to the ``algorithm`` function.
1069
1134
  For certain algorithms, this might be ignored.
1070
1135
  algorithm : function, optional
1071
1136
  The algorithm used to determine the shared sequences. It's a function that takes two Dseqrecord objects as inputs,
@@ -1232,11 +1297,12 @@ class Assembly:
1232
1297
  first: _Dseqrecord,
1233
1298
  secnd: _Dseqrecord,
1234
1299
  ):
1235
- """Add edges to the graph from a match returned by the `algorithm` function (see pydna.common_substrings). For
1300
+ """Add edges to the graph from a match returned by the ``algorithm`` function (see pydna.common_substrings). For
1236
1301
  the format of edges, see the documentation of the Assembly class.
1237
1302
 
1238
- Matches are directional, because not all `algorithm` functions return the same match for (u,v) and (v,u). For example,
1303
+ Matches are directional, because not all ``algorithm`` functions return the same match for (u,v) and (v,u). For example,
1239
1304
  homologous recombination does but sticky end ligation does not. The function returns two edges:
1305
+
1240
1306
  - Fragments in the orientation they were passed, with locations of the match (u, v, loc_u, loc_v)
1241
1307
  - Reverse complement of the fragments with inverted order, with flipped locations (-v, -u, flip(loc_v), flip(loc_u)).
1242
1308
 
@@ -1446,17 +1512,18 @@ class Assembly:
1446
1512
  Here we check if one of the joins between fragments represents the edges of an insertion assembly
1447
1513
  The fragment must be linear, and the join must be as indicated below
1448
1514
 
1449
- ```
1450
- -------- ------- Fragment 1
1451
- || ||
1452
- xxxxxxxx || Fragment 2
1453
- || ||
1454
- oooooooooo Fragment 3
1455
- ```
1515
+ ::
1516
+
1517
+ -------- ------- Fragment 1
1518
+ || ||
1519
+ xxxxxxxx || Fragment 2
1520
+ || ||
1521
+ oooooooooo Fragment 3
1522
+
1456
1523
  The above example will be [(1, 2, [4:6], [0:2]), (2, 3, [6:8], [0:2]), (3, 1, [8:10], [9:11])]
1457
1524
 
1458
1525
  These could be returned in any order by simple_cycles, so we sort the edges so that the first
1459
- and last `u` and `v` match the fragment that gets the insertion (1 in the example above).
1526
+ and last ``u`` and ``v`` match the fragment that gets the insertion (1 in the example above).
1460
1527
  """
1461
1528
  edge_pair_index = list()
1462
1529
 
@@ -1637,8 +1704,8 @@ class Assembly:
1637
1704
 
1638
1705
  def get_locations_on_fragments(self) -> dict[int, dict[str, list[Location]]]:
1639
1706
  """Get a dictionary where the keys are the nodes in the graph, and the values are dictionaries with keys
1640
- `left`, `right`, containing (for each fragment) the locations where the fragment is joined to another fragment on its left
1641
- and right side. The values in `left` and `right` are often the same, except in restriction-ligation with partial overlap enabled,
1707
+ ``left``, ``right``, containing (for each fragment) the locations where the fragment is joined to another fragment on its left
1708
+ and right side. The values in ``left`` and ``right`` are often the same, except in restriction-ligation with partial overlap enabled,
1642
1709
  where we can end up with a situation like this:
1643
1710
 
1644
1711
  GGTCTCCCCAATT and aGGTCTCCAACCAA as fragments
@@ -1651,13 +1718,14 @@ class Assembly:
1651
1718
  aGGTCTCCxxCCAATT
1652
1719
  tCCAGAGGTTGGxxAA
1653
1720
 
1654
- Would return
1655
- {
1656
- 1: {'left': [7:9], 'right': [9:11]},
1657
- 2: {'left': [8:10], 'right': [10:12]},
1658
- -1: {'left': [2:4], 'right': [4:6]},
1659
- -2: {'left': [2:4], 'right': [4:6]}
1660
- }
1721
+ Would return::
1722
+
1723
+ {
1724
+ 1: {'left': [7:9], 'right': [9:11]},
1725
+ 2: {'left': [8:10], 'right': [10:12]},
1726
+ -1: {'left': [2:4], 'right': [4:6]},
1727
+ -2: {'left': [2:4], 'right': [4:6]}
1728
+ }
1661
1729
 
1662
1730
  """
1663
1731
 
@@ -1686,10 +1754,10 @@ class Assembly:
1686
1754
  and prevent including partially digested fragments. For example, imagine the following fragment being an input for a digestion
1687
1755
  and ligation assembly, where the enzyme cuts at the sites indicated by the vertical lines:
1688
1756
 
1689
- ```
1690
- x y z
1691
- -------|-------|-------|---------
1692
- ```
1757
+ ::
1758
+
1759
+ x y z
1760
+ -------|-------|-------|---------
1693
1761
 
1694
1762
  We would only want assemblies that contain subfragments start-x, x-y, y-z, z-end, and not start-x, y-end, for instance.
1695
1763
  The latter would indicate that the fragment was partially digested.
@@ -1750,8 +1818,8 @@ class Assembly:
1750
1818
 
1751
1819
  class PCRAssembly(Assembly):
1752
1820
  """
1753
- An assembly that represents a PCR, where `fragments` is a list of primer, template, primer (in that order).
1754
- It always uses the `primer_template_overlap` algorithm and accepts the `mismatches` argument to indicate
1821
+ An assembly that represents a PCR, where ``fragments`` is a list of primer, template, primer (in that order).
1822
+ It always uses the ``primer_template_overlap`` algorithm and accepts the ``mismatches`` argument to indicate
1755
1823
  the number of mismatches allowed in the overlap. Only supports substitution mismatches, not indels.
1756
1824
  """
1757
1825
 
@@ -1959,6 +2027,21 @@ def common_function_assembly_products(
1959
2027
  return [assemble(frags, a) for a in output_assemblies]
1960
2028
 
1961
2029
 
2030
+ def _recast_sources(
2031
+ products: list[_Dseqrecord], source_cls, **extra_fields
2032
+ ) -> list[_Dseqrecord]:
2033
+ """Recast the ``source`` of each product to ``source_cls`` with optional extras.
2034
+
2035
+ This avoids repeating the same for-loop across many assembly functions.
2036
+ """
2037
+ for prod in products:
2038
+ prod.source = source_cls(
2039
+ **prod.source.model_dump(),
2040
+ **extra_fields,
2041
+ )
2042
+ return products
2043
+
2044
+
1962
2045
  def gibson_assembly(
1963
2046
  frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
1964
2047
  ) -> list[_Dseqrecord]:
@@ -1978,9 +2061,11 @@ def gibson_assembly(
1978
2061
  list[_Dseqrecord]
1979
2062
  List of assembled DNA molecules
1980
2063
  """
1981
- return common_function_assembly_products(
2064
+
2065
+ products = common_function_assembly_products(
1982
2066
  frags, limit, gibson_overlap, circular_only
1983
2067
  )
2068
+ return _recast_sources(products, GibsonAssemblySource)
1984
2069
 
1985
2070
 
1986
2071
  def in_fusion_assembly(
@@ -2003,7 +2088,9 @@ def in_fusion_assembly(
2003
2088
  list[_Dseqrecord]
2004
2089
  List of assembled DNA molecules
2005
2090
  """
2006
- return gibson_assembly(frags, limit)
2091
+
2092
+ products = gibson_assembly(frags, limit)
2093
+ return _recast_sources(products, InFusionSource)
2007
2094
 
2008
2095
 
2009
2096
  def fusion_pcr_assembly(
@@ -2026,7 +2113,8 @@ def fusion_pcr_assembly(
2026
2113
  list[_Dseqrecord]
2027
2114
  List of assembled DNA molecules
2028
2115
  """
2029
- return gibson_assembly(frags, limit)
2116
+ products = gibson_assembly(frags, limit)
2117
+ return _recast_sources(products, OverlapExtensionPCRLigationSource)
2030
2118
 
2031
2119
 
2032
2120
  def in_vivo_assembly(
@@ -2048,9 +2136,10 @@ def in_vivo_assembly(
2048
2136
  list[_Dseqrecord]
2049
2137
  List of assembled DNA molecules
2050
2138
  """
2051
- return common_function_assembly_products(
2139
+ products = common_function_assembly_products(
2052
2140
  frags, limit, common_sub_strings, circular_only
2053
2141
  )
2142
+ return _recast_sources(products, InVivoAssemblySource)
2054
2143
 
2055
2144
 
2056
2145
  def restriction_ligation_assembly(
@@ -2060,9 +2149,10 @@ def restriction_ligation_assembly(
2060
2149
  circular_only: bool = False,
2061
2150
  ) -> list[_Dseqrecord]:
2062
2151
  """Returns the products for restriction ligation assembly:
2063
- * Finds cutsites in the fragments
2064
- * Finds all products that could be assembled by ligating the fragments based on those cutsites
2065
- * Will NOT return products that combine an existing end with an end generated by the same enzyme (see example below)
2152
+
2153
+ - Finds cutsites in the fragments
2154
+ - Finds all products that could be assembled by ligating the fragments based on those cutsites
2155
+ - Will NOT return products that combine an existing end with an end generated by the same enzyme (see example below)
2066
2156
 
2067
2157
  Parameters
2068
2158
  ----------
@@ -2083,9 +2173,9 @@ def restriction_ligation_assembly(
2083
2173
  Examples
2084
2174
  --------
2085
2175
  In the example below, we plan to assemble a plasmid from a backbone and an insert, using the EcoRI and SalI enzymes.
2086
- Note how 2 circular products are returned, one contains the insert (`acgt`)
2087
- and the desired part of the backbone (`cccccc`), the other contains the
2088
- reversed insert (`tgga`) and the cut-out part of the backbone (`aaa`).
2176
+ Note how 2 circular products are returned, one contains the insert (``acgt``)
2177
+ and the desired part of the backbone (``cccccc``), the other contains the
2178
+ reversed insert (``tgga``) and the cut-out part of the backbone (``aaa``).
2089
2179
 
2090
2180
  >>> from pydna.assembly2 import restriction_ligation_assembly
2091
2181
  >>> from pydna.dseqrecord import Dseqrecord
@@ -2119,11 +2209,16 @@ def restriction_ligation_assembly(
2119
2209
  TTAAGtttC
2120
2210
  """
2121
2211
 
2122
- def algo(x, y, _l):
2212
+ def algorithm_fn(x, y, _l):
2123
2213
  # By default, we allow blunt ends
2124
2214
  return restriction_ligation_overlap(x, y, enzymes, False, allow_blunt)
2125
2215
 
2126
- return common_function_assembly_products(frags, None, algo, circular_only)
2216
+ products = common_function_assembly_products(
2217
+ frags, None, algorithm_fn, circular_only
2218
+ )
2219
+ return _recast_sources(
2220
+ products, RestrictionAndLigationSource, restriction_enzymes=enzymes
2221
+ )
2127
2222
 
2128
2223
 
2129
2224
  def golden_gate_assembly(
@@ -2134,7 +2229,7 @@ def golden_gate_assembly(
2134
2229
  ) -> list[_Dseqrecord]:
2135
2230
  """Returns the products for Golden Gate assembly. This is the same as
2136
2231
  restriction ligation assembly, but with a different name. Check the documentation
2137
- for `restriction_ligation_assembly` for more details.
2232
+ for ``restriction_ligation_assembly`` for more details.
2138
2233
 
2139
2234
  Parameters
2140
2235
  ----------
@@ -2154,7 +2249,7 @@ def golden_gate_assembly(
2154
2249
 
2155
2250
  Examples
2156
2251
  --------
2157
- See the example for `restriction_ligation_assembly`.
2252
+ See the example for ``restriction_ligation_assembly``.
2158
2253
  """
2159
2254
  return restriction_ligation_assembly(frags, enzymes, allow_blunt, circular_only)
2160
2255
 
@@ -2168,7 +2263,7 @@ def ligation_assembly(
2168
2263
  """Returns the products for ligation assembly, as inputs pass the fragments (digested if needed) that
2169
2264
  will be ligated.
2170
2265
 
2171
- For most cases, you probably should use `restriction_ligation_assembly` instead.
2266
+ For most cases, you probably should use ``restriction_ligation_assembly`` instead.
2172
2267
 
2173
2268
  Parameters
2174
2269
  ----------
@@ -2215,11 +2310,14 @@ def ligation_assembly(
2215
2310
  return sticky_end_sub_strings(x, y, allow_partial_overlap)
2216
2311
 
2217
2312
  if allow_blunt:
2218
- algo = combine_algorithms(sticky_end_algorithm, blunt_overlap)
2313
+ algorithm_fn = combine_algorithms(sticky_end_algorithm, blunt_overlap)
2219
2314
  else:
2220
- algo = sticky_end_algorithm
2315
+ algorithm_fn = sticky_end_algorithm
2221
2316
 
2222
- return common_function_assembly_products(frags, None, algo, circular_only)
2317
+ products = common_function_assembly_products(
2318
+ frags, None, algorithm_fn, circular_only
2319
+ )
2320
+ return _recast_sources(products, LigationSource)
2223
2321
 
2224
2322
 
2225
2323
  def assembly_is_multi_site(asm: list[EdgeRepresentationAssembly]) -> bool:
@@ -2236,7 +2334,7 @@ def assembly_is_multi_site(asm: list[EdgeRepresentationAssembly]) -> bool:
2236
2334
 
2237
2335
  def gateway_assembly(
2238
2336
  frags: list[_Dseqrecord],
2239
- reaction_type: str,
2337
+ reaction_type: Literal["BP", "LR"],
2240
2338
  greedy: bool = False,
2241
2339
  circular_only: bool = False,
2242
2340
  multi_site_only: bool = False,
@@ -2247,8 +2345,8 @@ def gateway_assembly(
2247
2345
  ----------
2248
2346
  frags : list[_Dseqrecord]
2249
2347
  List of DNA fragments to assemble
2250
- reaction_type : str
2251
- Type of Gateway reaction, either 'BP' or 'LR'
2348
+ reaction_type : Literal['BP', 'LR']
2349
+ Type of Gateway reaction
2252
2350
  greedy : bool, optional
2253
2351
  If True, use greedy gateway consensus sites, by default False
2254
2352
  circular_only : bool, optional
@@ -2288,9 +2386,9 @@ def gateway_assembly(
2288
2386
  >>> len(products_LR)
2289
2387
  2
2290
2388
 
2291
- Now let's understand the `multi_site_only` parameter. Let's consider a case where we are swapping fragments
2389
+ Now let's understand the ``multi_site_only`` parameter. Let's consider a case where we are swapping fragments
2292
2390
  between two plasmids using an LR reaction. Experimentally, we expect to obtain two plasmids, resulting from the
2293
- swapping between the two att sites. That's what we get if we set `multi_site_only` to True.
2391
+ swapping between the two att sites. That's what we get if we set ``multi_site_only`` to True.
2294
2392
 
2295
2393
  >>> attL2 = 'aaataatgattttattttgactgatagtgacctgttcgttgcaacaaattgataagcaatgctttcttataatgccaactttgtacaagaaagctg'
2296
2394
  >>> attR2 = 'accactttgtacaagaaagctgaacgagaaacgtaaaatgatataaatatcaatatattaaattagattttgcataaaaaacagactacataatactgtaaaacacaacatatccagtcactatg'
@@ -2300,7 +2398,7 @@ def gateway_assembly(
2300
2398
  >>> len(products)
2301
2399
  2
2302
2400
 
2303
- However, if we set `multi_site_only` to False, we get 4 products, which also include the intermediate products
2401
+ However, if we set ``multi_site_only`` to False, we get 4 products, which also include the intermediate products
2304
2402
  where the two plasmids are combined into a single one through recombination of a single att site. This is an
2305
2403
  intermediate of the reaction, and typically we don't want it:
2306
2404
 
@@ -2316,13 +2414,19 @@ def gateway_assembly(
2316
2414
  f"Invalid reaction type: {reaction_type}, can only be BP or LR"
2317
2415
  )
2318
2416
 
2319
- def algo(x, y, _l):
2417
+ def algorithm_fn(x, y, _l):
2320
2418
  return gateway_overlap(x, y, reaction_type, greedy)
2321
2419
 
2322
2420
  filter_results_function = None if not multi_site_only else assembly_is_multi_site
2323
2421
 
2324
2422
  products = common_function_assembly_products(
2325
- frags, None, algo, circular_only, filter_results_function
2423
+ frags, None, algorithm_fn, circular_only, filter_results_function
2424
+ )
2425
+ products = _recast_sources(
2426
+ products,
2427
+ GatewaySource,
2428
+ reaction_type=reaction_type,
2429
+ greedy=greedy,
2326
2430
  )
2327
2431
 
2328
2432
  if len(products) == 0:
@@ -2479,7 +2583,10 @@ def homologous_recombination_integration(
2479
2583
  """
2480
2584
  fragments = common_handle_insertion_fragments(genome, inserts)
2481
2585
 
2482
- return common_function_integration_products(fragments, limit, common_sub_strings)
2586
+ products = common_function_integration_products(
2587
+ fragments, limit, common_sub_strings
2588
+ )
2589
+ return _recast_sources(products, HomologousRecombinationSource)
2483
2590
 
2484
2591
 
2485
2592
  def homologous_recombination_excision(
@@ -2515,7 +2622,8 @@ def homologous_recombination_excision(
2515
2622
  >>> products
2516
2623
  [Dseqrecord(o25), Dseqrecord(-32)]
2517
2624
  """
2518
- return common_function_excision_products(genome, limit, common_sub_strings)
2625
+ products = common_function_excision_products(genome, limit, common_sub_strings)
2626
+ return _recast_sources(products, HomologousRecombinationSource)
2519
2627
 
2520
2628
 
2521
2629
  def cre_lox_integration(
@@ -2524,7 +2632,7 @@ def cre_lox_integration(
2524
2632
  """Returns the products resulting from the integration of an insert (or inserts joined
2525
2633
  through cre-lox recombination among them) into the genome through cre-lox integration.
2526
2634
 
2527
- Also works with lox66 and lox71 (see `pydna.cre_lox` for more details).
2635
+ Also works with lox66 and lox71 (see ``pydna.cre_lox`` for more details).
2528
2636
 
2529
2637
  Parameters
2530
2638
  ----------
@@ -2574,7 +2682,8 @@ def cre_lox_integration(
2574
2682
 
2575
2683
  """
2576
2684
  fragments = common_handle_insertion_fragments(genome, inserts)
2577
- return common_function_integration_products(fragments, None, cre_loxP_overlap)
2685
+ products = common_function_integration_products(fragments, None, cre_loxP_overlap)
2686
+ return _recast_sources(products, CreLoxRecombinationSource)
2578
2687
 
2579
2688
 
2580
2689
  def cre_lox_excision(genome: _Dseqrecord) -> list[_Dseqrecord]:
@@ -2624,4 +2733,151 @@ def cre_lox_excision(genome: _Dseqrecord) -> list[_Dseqrecord]:
2624
2733
  >>> res2
2625
2734
  [Dseqrecord(o39), Dseqrecord(-45)]
2626
2735
  """
2627
- return common_function_excision_products(genome, None, cre_loxP_overlap)
2736
+ products = common_function_excision_products(genome, None, cre_loxP_overlap)
2737
+ return _recast_sources(products, CreLoxRecombinationSource)
2738
+
2739
+
2740
+ def crispr_integration(
2741
+ genome: _Dseqrecord,
2742
+ inserts: list[_Dseqrecord],
2743
+ guides: list[_Primer],
2744
+ limit: int = 40,
2745
+ ) -> list[_Dseqrecord]:
2746
+ """
2747
+ Returns the homologous recombination integration products whose repair region contains a Cas9 cut from every guide.
2748
+
2749
+ Parameters
2750
+ ----------
2751
+ genome : _Dseqrecord
2752
+ Target genome sequence
2753
+ inserts : list[_Dseqrecord]
2754
+ DNA fragment(s) to insert
2755
+ guides : list[_Primer]
2756
+ List of guide RNAs as Primer objects. This may change in the future.
2757
+ limit : int, optional
2758
+ Minimum overlap length required, by default 40
2759
+
2760
+ Returns
2761
+ -------
2762
+ list[_Dseqrecord]
2763
+ List of integrated DNA molecules
2764
+
2765
+ Examples
2766
+ --------
2767
+
2768
+ >>> from pydna.dseqrecord import Dseqrecord
2769
+ >>> from pydna.assembly2 import crispr_integration
2770
+ >>> from pydna.primer import Primer
2771
+ >>> genome = Dseqrecord("aaccggttcaatgcaaacagtaatgatggatgacattcaaagcac", name="genome")
2772
+ >>> insert = Dseqrecord("aaccggttAAAAAAAAAttcaaagcac", name="insert")
2773
+ >>> guide = Primer("ttcaatgcaaacagtaatga", name="guide")
2774
+ >>> product, *_ = crispr_integration(genome, [insert], [guide], 8)
2775
+ >>> product
2776
+ Dseqrecord(-27)
2777
+
2778
+ """
2779
+ if len(guides) == 0:
2780
+ raise ValueError("At least one guide RNA is required for CRISPR integration")
2781
+
2782
+ # Get all the possible products from the homologous recombination integration
2783
+ products = homologous_recombination_integration(genome, inserts, limit)
2784
+
2785
+ # Verify that the guides cut in the region that will be repaired
2786
+
2787
+ # First we collect the positions where the guides cut
2788
+ guide_cuts = []
2789
+ for guide in guides:
2790
+ enzyme = cas9(str(guide.seq))
2791
+ possible_cuts = genome.seq.get_cutsites(enzyme)
2792
+ if len(possible_cuts) == 0:
2793
+ raise ValueError(
2794
+ f"Could not find Cas9 cutsite in the target sequence using the guide: {guide.name}"
2795
+ )
2796
+ # Keep only the position of the cut
2797
+ possible_cuts = [cut[0] for (cut, _) in possible_cuts]
2798
+ guide_cuts.append(possible_cuts)
2799
+
2800
+ # Then, we check if the possible homologous recombination products contain the cuts
2801
+ # from the guides inside the repair region.
2802
+ # We also add the used guides to each product. This is very important!
2803
+ valid_products = []
2804
+ for i, product in enumerate(products):
2805
+ # The second element of product.source.input is conventionally the insert/repair fragment
2806
+ # The other two (first and third) are the two bits of the genome
2807
+ repair_start = _location_boundaries(product.source.input[0].right_location)[0]
2808
+ repair_end = _location_boundaries(product.source.input[2].left_location)[1]
2809
+ repair_location = create_location(repair_start, repair_end, len(genome))
2810
+ some_cuts_inside_repair = []
2811
+ all_cuts_inside_repair = []
2812
+ for cut_group in guide_cuts:
2813
+ cuts_in_repair = [cut for cut in cut_group if cut in repair_location]
2814
+ some_cuts_inside_repair.append(len(cuts_in_repair) != 0)
2815
+ all_cuts_inside_repair.append(len(cuts_in_repair) == len(cut_group))
2816
+
2817
+ if all(some_cuts_inside_repair):
2818
+ used_guides = [g for i, g in enumerate(guides) if all_cuts_inside_repair[i]]
2819
+ # Add the used guides to the product <----- VERY IMPORTANT!
2820
+ product.source.input.extend([SourceInput(sequence=g) for g in used_guides])
2821
+ valid_products.append(product)
2822
+
2823
+ if not all(all_cuts_inside_repair):
2824
+ raise ValueError(
2825
+ "Some guides cut outside the repair region, please check the guides"
2826
+ )
2827
+
2828
+ if len(valid_products) != len(products):
2829
+ warnings.warn(
2830
+ "Some recombination products were discarded because they had off-target cuts",
2831
+ category=UserWarning,
2832
+ stacklevel=2,
2833
+ )
2834
+
2835
+ return _recast_sources(valid_products, CRISPRSource)
2836
+
2837
+
2838
def pcr_assembly(
    template: _Dseqrecord,
    fwd_primer: _Primer,
    rvs_primer: _Primer,
    add_primer_features: bool = False,
    limit: int = 14,
    mismatches: int = 0,
) -> list[_Dseqrecord]:
    """Return the linear products of a PCR on ``template`` with two primers.

    The fragments are handed to ``PCRAssembly`` in the order
    forward primer, template, reverse primer, and the linear assemblies
    it reports are returned with their sources recast to ``PCRSource``.

    Parameters
    ----------
    template : _Dseqrecord
        Template sequence
    fwd_primer : _Primer
        Forward primer
    rvs_primer : _Primer
        Reverse primer
    add_primer_features : bool, optional
        If True, each product is passed through
        ``annotate_primer_binding_sites``, by default False
    limit : int, optional
        Minimum overlap length required, by default 14
    mismatches : int, optional
        Maximum number of mismatches, by default 0. It is also added to
        ``limit`` to obtain the annealing length given to ``PCRAssembly``.

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    pcr_inputs = [fwd_primer, template, rvs_primer]
    amplicons = PCRAssembly(
        pcr_inputs,
        limit=limit + mismatches,
        mismatches=mismatches,
    ).assemble_linear()

    # Identical primers (case-insensitive) produce mirrored duplicates;
    # keep only products whose second source input is not flagged as
    # reverse-complemented.
    same_primer = str(fwd_primer.seq).upper() == str(rvs_primer.seq).upper()
    if same_primer:
        kept = []
        for amplicon in amplicons:
            if amplicon.source.input[1].reverse_complemented:
                continue
            kept.append(amplicon)
        amplicons = kept

    if add_primer_features:
        annotated = []
        for amplicon in amplicons:
            annotated.append(annotate_primer_binding_sites(amplicon, pcr_inputs))
        amplicons = annotated

    return _recast_sources(
        amplicons, PCRSource, add_primer_features=add_primer_features
    )