pydna 5.5.3__py3-none-any.whl → 5.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydna/assembly2.py CHANGED
@@ -4,29 +4,29 @@ Improved implementation of the assembly module. To see a list of issues with the
4
4
  see [issues tagged with fixed-with-new-assembly-model](https://github.com/pydna-group/pydna/issues?q=is%3Aissue%20state%3Aopen%20label%3Afixed-with-new-assembly-model)
5
5
  """
6
6
 
7
- import networkx as _nx
8
- import itertools as _itertools
7
+ import networkx as nx
8
+ import itertools
9
9
  from Bio.SeqFeature import SimpleLocation, Location
10
- from Bio.Seq import reverse_complement
10
+
11
11
  from Bio.Restriction.Restriction import RestrictionBatch
12
12
  import regex
13
13
  import copy
14
14
 
15
15
  from pydna.utils import (
16
- shift_location as _shift_location,
16
+ shift_location,
17
17
  flatten,
18
- location_boundaries as _location_boundaries,
19
- locations_overlap as _locations_overlap,
18
+ location_boundaries,
19
+ locations_overlap,
20
20
  sum_is_sticky,
21
21
  limit_iterator,
22
22
  create_location,
23
23
  )
24
- from pydna._pretty import pretty_str as _pretty_str
24
+ from pydna._pretty import pretty_str as ps
25
25
  from pydna.common_sub_strings import common_sub_strings as common_sub_strings_str
26
- from pydna.dseqrecord import Dseqrecord as _Dseqrecord
27
- from pydna.dseq import Dseq as _Dseq
28
- from pydna.primer import Primer as _Primer
29
- from pydna.seqrecord import SeqRecord as _SeqRecord
26
+ from pydna.dseqrecord import Dseqrecord
27
+ from pydna.dseq import Dseq
28
+ from pydna.primer import Primer
29
+ from pydna.seqrecord import SeqRecord
30
30
  from pydna.types import (
31
31
  CutSiteType,
32
32
  # TODO: allow user to enforce multi-site
@@ -38,11 +38,29 @@ from pydna.types import (
38
38
  )
39
39
  from pydna.gateway import gateway_overlap, find_gateway_sites
40
40
  from pydna.cre_lox import cre_loxP_overlap
41
+ from pydna.alphabet import anneal_strands
42
+
43
+ from typing import TYPE_CHECKING, Callable, Literal
44
+ from pydna.opencloning_models import (
45
+ AssemblySource,
46
+ RestrictionAndLigationSource,
47
+ GibsonAssemblySource,
48
+ InFusionSource,
49
+ OverlapExtensionPCRLigationSource,
50
+ InVivoAssemblySource,
51
+ LigationSource,
52
+ GatewaySource,
53
+ HomologousRecombinationSource,
54
+ CreLoxRecombinationSource,
55
+ PCRSource,
56
+ SourceInput,
57
+ CRISPRSource,
58
+ )
59
+ from pydna.crispr import cas9
60
+ import warnings
41
61
 
42
- from typing import TYPE_CHECKING, Callable
43
-
44
- if TYPE_CHECKING:
45
- from Bio.Restriction import AbstractCut as _AbstractCut
62
+ if TYPE_CHECKING: # pragma: no cover
63
+ from Bio.Restriction import AbstractCut
46
64
 
47
65
 
48
66
  def gather_overlapping_locations(
@@ -54,45 +72,52 @@ def gather_overlapping_locations(
54
72
  the output will be [(loc1, loc2), (loc3,)].
55
73
  """
56
74
  # Make a graph with all the locations as nodes
57
- G = _nx.Graph()
75
+ G = nx.Graph()
58
76
  for i, loc in enumerate(locs):
59
77
  G.add_node(i, location=loc)
60
78
 
61
79
  # Add edges between nodes that overlap
62
80
  for i in range(len(locs)):
63
81
  for j in range(i + 1, len(locs)):
64
- if _locations_overlap(locs[i], locs[j], fragment_length):
82
+ if locations_overlap(locs[i], locs[j], fragment_length):
65
83
  G.add_edge(i, j)
66
84
 
67
85
  # Get groups of overlapping locations
68
86
  groups = list()
69
- for loc_set in _nx.connected_components(G):
87
+ for loc_set in nx.connected_components(G):
70
88
  groups.append(tuple(locs[i] for i in loc_set))
71
89
 
72
90
  # Sort by location of the first element in each group (does not matter which since they are overlapping)
73
- groups.sort(key=lambda x: _location_boundaries(x[0])[0])
91
+ groups.sort(key=lambda x: location_boundaries(x[0])[0])
74
92
 
75
93
  return groups
76
94
 
77
95
 
78
96
  def ends_from_cutsite(
79
- cutsite: CutSiteType, seq: _Dseq
97
+ cutsite: CutSiteType, seq: Dseq
80
98
  ) -> tuple[tuple[str, str], tuple[str, str]]:
81
99
  """Get the sticky or blunt ends created by a restriction enzyme cut.
82
100
 
83
- Args:
84
- cutsite (CutSiteType): A tuple ((cut_watson, ovhg), enzyme) describing where the cut occurs
85
- seq (_Dseq): The DNA sequence being cut
101
+ Parameters
102
+ ----------
103
+ cutsite : CutSiteType
104
+ A tuple ((cut_watson, ovhg), enzyme) describing where the cut occurs
105
+ seq : Dseq
106
+ The DNA sequence being cut
86
107
 
87
- Raises:
88
- ValueError: If cutsite is None
108
+ Raises
109
+ ------
110
+ ValueError
111
+ If cutsite is None
89
112
 
90
- Returns:
91
- tuple[tuple[str, str], tuple[str, str]]: A tuple of two tuples, each containing the type of end ('5\'', '3\'', or 'blunt')
113
+ Returns
114
+ -------
115
+ tuple[tuple[str, str], tuple[str, str]]
116
+ A tuple of two tuples, each containing the type of end ('5\'', '3\'', or 'blunt')
92
117
  and the sequence of the overhang. The first tuple is for the left end, second for the right end.
93
118
 
94
119
  >>> from Bio.Restriction import NotI
95
- >>> x = _Dseq("ctcgGCGGCCGCcagcggccg")
120
+ >>> x = Dseq("ctcgGCGGCCGCcagcggccg")
96
121
  >>> x.get_cutsites(NotI)
97
122
  [((6, -4), NotI)]
98
123
  >>> ends_from_cutsite(x.get_cutsites(NotI)[0], x)
@@ -119,8 +144,8 @@ def ends_from_cutsite(
119
144
 
120
145
 
121
146
  def restriction_ligation_overlap(
122
- seqx: _Dseqrecord,
123
- seqy: _Dseqrecord,
147
+ seqx: Dseqrecord,
148
+ seqy: Dseqrecord,
124
149
  enzymes=RestrictionBatch,
125
150
  partial=False,
126
151
  allow_blunt=False,
@@ -129,14 +154,23 @@ def restriction_ligation_overlap(
129
154
 
130
155
  Like in sticky and gibson, the order matters (see example below of partial overlap)
131
156
 
132
- Args:
133
- seqx (_Dseqrecord): The first sequence
134
- seqy (_Dseqrecord): The second sequence
135
- enzymes (RestrictionBatch): The enzymes to use
136
- partial (bool): Whether to allow partial overlaps
137
- allow_blunt (bool): Whether to allow blunt ends
138
- Returns:
139
- list[SequenceOverlap]: A list of overlaps between the two sequences
157
+ Parameters
158
+ ----------
159
+ seqx : Dseqrecord
160
+ The first sequence
161
+ seqy : Dseqrecord
162
+ The second sequence
163
+ enzymes : RestrictionBatch
164
+ The enzymes to use
165
+ partial : bool
166
+ Whether to allow partial overlaps
167
+ allow_blunt : bool
168
+ Whether to allow blunt ends
169
+
170
+ Returns
171
+ -------
172
+ list[SequenceOverlap]
173
+ A list of overlaps between the two sequences
140
174
 
141
175
  >>> from pydna.dseqrecord import Dseqrecord
142
176
  >>> from pydna.assembly2 import restriction_ligation_overlap
@@ -178,7 +212,7 @@ def restriction_ligation_overlap(
178
212
  # if not seqy.circular:
179
213
  # cuts_y.append(((0, 0), None))
180
214
  matches = list()
181
- for cut_x, cut_y in _itertools.product(cuts_x, cuts_y):
215
+ for cut_x, cut_y in itertools.product(cuts_x, cuts_y):
182
216
  # A blunt end
183
217
  if allow_blunt and cut_x[0][1] == cut_y[0][1] == 0:
184
218
  matches.append((cut_x[0][0], cut_y[0][0], 0))
@@ -222,7 +256,7 @@ def combine_algorithms(*algorithms: AssemblyAlgorithmType) -> AssemblyAlgorithmT
222
256
 
223
257
 
224
258
  def blunt_overlap(
225
- seqx: _Dseqrecord, seqy: _Dseqrecord, limit=None
259
+ seqx: Dseqrecord, seqy: Dseqrecord, limit=None
226
260
  ) -> list[SequenceOverlap]:
227
261
  """
228
262
  Assembly algorithm to find blunt overlaps. Used for blunt ligation.
@@ -230,13 +264,19 @@ def blunt_overlap(
230
264
  It basically returns [(len(seqx), 0, 0)] if the right end of seqx is blunt and the
231
265
  left end of seqy is blunt (compatible with blunt ligation). Otherwise, it returns an empty list.
232
266
 
233
- Args:
234
- seqx (_Dseqrecord): The first sequence
235
- seqy (_Dseqrecord): The second sequence
236
- limit (int): There for compatibility, but it is ignored
267
+ Parameters
268
+ ----------
269
+ seqx : Dseqrecord
270
+ The first sequence
271
+ seqy : Dseqrecord
272
+ The second sequence
273
+ limit : int
274
+ There for compatibility, but it is ignored
237
275
 
238
- Returns:
239
- list[SequenceOverlap]: A list of overlaps between the two sequences
276
+ Returns
277
+ -------
278
+ list[SequenceOverlap]
279
+ A list of overlaps between the two sequences
240
280
 
241
281
  >>> from pydna.assembly2 import blunt_overlap
242
282
  >>> from pydna.dseqrecord import Dseqrecord
@@ -254,7 +294,7 @@ def blunt_overlap(
254
294
 
255
295
 
256
296
  def common_sub_strings(
257
- seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25
297
+ seqx: Dseqrecord, seqy: Dseqrecord, limit=25
258
298
  ) -> list[SequenceOverlap]:
259
299
  """
260
300
  Assembly algorithm to find common substrings of length == limit. see the docs of
@@ -317,30 +357,36 @@ def common_sub_strings(
317
357
  return [r for r in results if r not in shifted_matches]
318
358
 
319
359
 
320
- def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
360
+ def gibson_overlap(seqx: Dseqrecord, seqy: Dseqrecord, limit=25):
321
361
  """
322
362
  Assembly algorithm to find terminal overlaps (e.g. for Gibson assembly).
323
363
  The order matters, we want alignments like:
324
364
 
325
- ```
326
- seqx: oooo------xxxx
327
- seqy: xxxx------oooo
328
- Product: oooo------xxxx------oooo
365
+ ::
329
366
 
330
- Not like:
367
+ seqx: oooo------xxxx
368
+ seqy: xxxx------oooo
369
+ Product: oooo------xxxx------oooo
331
370
 
332
- seqx: oooo------xxxx
333
- seqy: xxxx------oooo
334
- Product (unwanted): oooo
335
- ```
371
+ Not like:
336
372
 
337
- Args:
338
- seqx (_Dseqrecord): The first sequence
339
- seqy (_Dseqrecord): The second sequence
340
- limit (int): Minimum length of the overlap
373
+ seqx: oooo------xxxx
374
+ seqy: xxxx------oooo
375
+ Product (unwanted): oooo
376
+
377
+ Parameters
378
+ ----------
379
+ seqx : Dseqrecord
380
+ The first sequence
381
+ seqy : Dseqrecord
382
+ The second sequence
383
+ limit : int
384
+ Minimum length of the overlap
341
385
 
342
- Returns:
343
- list[SequenceOverlap]: A list of overlaps between the two sequences
386
+ Returns
387
+ -------
388
+ list[SequenceOverlap]
389
+ A list of overlaps between the two sequences
344
390
 
345
391
  >>> from pydna.dseqrecord import Dseqrecord
346
392
  >>> from pydna.assembly2 import gibson_overlap
@@ -357,9 +403,9 @@ def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
357
403
  # This is only relevant for linear fragments, so we don't need to worry about
358
404
  # shifting locations for circular fragments.
359
405
  trim_x_left = -seqx.seq.ovhg if seqx.seq.ovhg < 0 else 0
360
- trim_x_right = seqx.seq.watson_ovhg() if seqx.seq.watson_ovhg() < 0 else None
406
+ trim_x_right = seqx.seq.watson_ovhg if seqx.seq.watson_ovhg < 0 else None
361
407
  trim_y_left = -seqy.seq.ovhg if seqy.seq.ovhg < 0 else 0
362
- trim_y_right = seqy.seq.watson_ovhg() if seqy.seq.watson_ovhg() < 0 else None
408
+ trim_y_right = seqy.seq.watson_ovhg if seqy.seq.watson_ovhg < 0 else None
363
409
 
364
410
  stringx = str(seqx.seq[trim_x_left:trim_x_right]).upper()
365
411
  stringy = str(seqy.seq[trim_y_left:trim_y_right]).upper()
@@ -377,20 +423,26 @@ def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
377
423
  return [tuple(m) for m in matches]
378
424
 
379
425
 
380
- def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit: bool = False):
426
+ def sticky_end_sub_strings(seqx: Dseqrecord, seqy: Dseqrecord, limit: bool = False):
381
427
  """
382
428
  Assembly algorithm for ligation of sticky ends.
383
429
 
384
430
  For now, if limit is 0 / False (default), only full overlaps are considered.
385
431
  Otherwise, partial overlaps are also returned.
386
432
 
387
- Args:
388
- seqx (_Dseqrecord): The first sequence
389
- seqy (_Dseqrecord): The second sequence
390
- limit (bool): Whether to allow partial overlaps
433
+ Parameters
434
+ ----------
435
+ seqx : Dseqrecord
436
+ The first sequence
437
+ seqy : Dseqrecord
438
+ The second sequence
439
+ limit : bool
440
+ Whether to allow partial overlaps
391
441
 
392
- Returns:
393
- list[SequenceOverlap]: A list of overlaps between the two sequences
442
+ Returns
443
+ -------
444
+ list[SequenceOverlap]
445
+ A list of overlaps between the two sequences
394
446
 
395
447
 
396
448
  Ligation of fully overlapping sticky ends, note how the order matters
@@ -415,6 +467,7 @@ def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit: bool = F
415
467
  [(4, 0, 2)]
416
468
 
417
469
  """
470
+
418
471
  overlap = sum_is_sticky(
419
472
  seqx.seq.three_prime_end(), seqy.seq.five_prime_end(), limit
420
473
  )
@@ -424,7 +477,7 @@ def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit: bool = F
424
477
 
425
478
 
426
479
  def zip_match_leftwards(
427
- seqx: _SeqRecord, seqy: _SeqRecord, match: SequenceOverlap
480
+ seqx: SeqRecord, seqy: SeqRecord, match: SequenceOverlap
428
481
  ) -> SequenceOverlap:
429
482
  """
430
483
  Starting from the rightmost edge of the match, return a new match encompassing the max
@@ -432,15 +485,15 @@ def zip_match_leftwards(
432
485
  than the limit or a shorter match if there are mismatches. This is convenient to maintain
433
486
  as many features as possible. It is used in PCR assembly.
434
487
 
435
- >>> seq = _Dseqrecord('AAAAACGTCCCGT')
436
- >>> primer = _Dseqrecord('ACGTCCCGT')
488
+ >>> seq = Dseqrecord('AAAAACGTCCCGT')
489
+ >>> primer = Dseqrecord('ACGTCCCGT')
437
490
  >>> match = (13, 9, 0) # an empty match at the end of each
438
491
  >>> zip_match_leftwards(seq, primer, match)
439
492
  (4, 0, 9)
440
493
 
441
494
  Works in circular molecules if the match spans the origin:
442
- >>> seq = _Dseqrecord('TCCCGTAAAAACG', circular=True)
443
- >>> primer = _Dseqrecord('ACGTCCCGT')
495
+ >>> seq = Dseqrecord('TCCCGTAAAAACG', circular=True)
496
+ >>> primer = Dseqrecord('ACGTCCCGT')
444
497
  >>> match = (6, 9, 0)
445
498
  >>> zip_match_leftwards(seq, primer, match)
446
499
  (10, 0, 9)
@@ -461,11 +514,11 @@ def zip_match_leftwards(
461
514
  # For those cases we shift by length, then go back
462
515
 
463
516
  end_on_x = match[0] + match[2]
464
- if isinstance(seqx, _Dseqrecord) and seqx.circular and end_on_x <= len(seqx):
517
+ if isinstance(seqx, Dseqrecord) and seqx.circular and end_on_x <= len(seqx):
465
518
  end_on_x += len(seqx)
466
519
 
467
520
  end_on_y = match[1] + match[2]
468
- if isinstance(seqy, _Dseqrecord) and seqy.circular and end_on_y <= len(seqy):
521
+ if isinstance(seqy, Dseqrecord) and seqy.circular and end_on_y <= len(seqy):
469
522
  end_on_y += len(seqy)
470
523
 
471
524
  count = 0
@@ -482,7 +535,7 @@ def zip_match_leftwards(
482
535
 
483
536
 
484
537
  def zip_match_rightwards(
485
- seqx: _Dseqrecord, seqy: _Dseqrecord, match: SequenceOverlap
538
+ seqx: Dseqrecord, seqy: Dseqrecord, match: SequenceOverlap
486
539
  ) -> SequenceOverlap:
487
540
  """Same as zip_match_leftwards, but towards the right."""
488
541
 
@@ -498,19 +551,19 @@ def zip_match_rightwards(
498
551
  return (start_on_x, start_on_y, count)
499
552
 
500
553
 
501
- def seqrecord2_uppercase_DNA_string(seqr: _SeqRecord) -> str:
554
+ def seqrecord2_uppercase_DNA_string(seqr: SeqRecord) -> str:
502
555
  """
503
556
  Transform a Dseqrecord to a sequence string where U is replaced by T, everything is upper case and
504
557
  circular sequences are doubled (concatenated with themselves). This is used for PCR, to support primers with U's (e.g. for USER cloning).
505
558
  """
506
559
  out = str(seqr.seq).upper().replace("U", "T")
507
- if isinstance(seqr, _Dseqrecord) and seqr.circular:
560
+ if isinstance(seqr, Dseqrecord) and seqr.circular:
508
561
  return out * 2
509
562
  return out
510
563
 
511
564
 
512
565
  def primer_template_overlap(
513
- seqx: _Dseqrecord | _Primer, seqy: _Dseqrecord | _Primer, limit=25, mismatches=0
566
+ seqx: Dseqrecord | Primer, seqy: Dseqrecord | Primer, limit=25, mismatches=0
514
567
  ) -> list[SequenceOverlap]:
515
568
  """
516
569
  Assembly algorithm to find overlaps between a primer and a template. It accepts mismatches.
@@ -520,14 +573,21 @@ def primer_template_overlap(
520
573
  If seqx is a template and seqy is a primer, it represents the binding of a reverse primer,
521
574
  where the primer has been passed as its reverse complement (see examples).
522
575
 
523
- Args:
524
- seqx (_Dseqrecord | _Primer): The primer
525
- seqy (_Dseqrecord | _Primer): The template
526
- limit (int): Minimum length of the overlap
527
- mismatches (int): Maximum number of mismatches (only substitutions, no deletion or insertion)
576
+ Parameters
577
+ ----------
578
+ seqx : Dseqrecord | Primer
579
+ The primer
580
+ seqy : Dseqrecord | Primer
581
+ The template
582
+ limit : int
583
+ Minimum length of the overlap
584
+ mismatches : int
585
+ Maximum number of mismatches (only substitutions, no deletion or insertion)
528
586
 
529
- Returns:
530
- list[SequenceOverlap]: A list of overlaps between the primer and the template
587
+ Returns
588
+ -------
589
+ list[SequenceOverlap]
590
+ A list of overlaps between the primer and the template
531
591
 
532
592
  >>> from pydna.dseqrecord import Dseqrecord
533
593
  >>> from pydna.primer import Primer
@@ -537,7 +597,7 @@ def primer_template_overlap(
537
597
  >>> primer_template_overlap(primer, template, limit=8, mismatches=0)
538
598
  [(0, 2, 8)]
539
599
 
540
- This actually represents the binding of the primer `GCTGCTAA` (reverse complement)
600
+ This actually represents the binding of the primer ``GCTGCTAA`` (reverse complement)
541
601
  >>> primer_template_overlap(template, primer, limit=8, mismatches=0)
542
602
  [(2, 0, 8)]
543
603
  >>> primer_template_overlap(primer, template.reverse_complement(), limit=8, mismatches=0)
@@ -546,11 +606,11 @@ def primer_template_overlap(
546
606
  []
547
607
  """
548
608
 
549
- if isinstance(seqx, _Primer) and isinstance(seqy, _Dseqrecord):
609
+ if isinstance(seqx, Primer) and isinstance(seqy, Dseqrecord):
550
610
  primer = seqx
551
611
  template = seqy
552
612
  reverse_primer = False
553
- elif isinstance(seqx, _Dseqrecord) and isinstance(seqy, _Primer):
613
+ elif isinstance(seqx, Dseqrecord) and isinstance(seqy, Primer):
554
614
  primer = seqy
555
615
  template = seqx
556
616
  reverse_primer = True
@@ -604,45 +664,8 @@ def primer_template_overlap(
604
664
  return list(sorted(out))
605
665
 
606
666
 
607
- def fill_left(seq: _Dseq) -> _Dseq:
608
- """Fill the left overhang of a sequence with the complementary sequence."""
609
- new_watson = seq.watson
610
- new_crick = seq.crick
611
-
612
- # Watson 5' overhang
613
- if seq.ovhg < 0:
614
- new_crick = new_crick + reverse_complement(seq.watson[: -seq.ovhg])
615
- # Crick 5' overhang
616
- elif seq.ovhg > 0:
617
- new_watson = reverse_complement(seq.crick[-seq.ovhg :]) + new_watson
618
-
619
- return _Dseq(new_watson, new_crick, 0)
620
-
621
-
622
- def fill_right(seq: _Dseq) -> _Dseq:
623
- """Fill the right overhang of a sequence with the complementary sequence."""
624
- new_watson = seq.watson
625
- new_crick = seq.crick
626
-
627
- # Watson 3' overhang
628
- watson_ovhg = seq.watson_ovhg()
629
- if watson_ovhg < 0:
630
- new_watson = new_watson + reverse_complement(seq.crick[:-watson_ovhg])
631
-
632
- # Crick 3' overhang
633
- elif watson_ovhg > 0:
634
- new_crick = reverse_complement(seq.watson[-watson_ovhg:]) + new_crick
635
-
636
- return _Dseq(new_watson, new_crick, seq.ovhg)
637
-
638
-
639
- def fill_dseq(seq: _Dseq) -> _Dseq:
640
- """Fill the overhangs of a sequence with the complementary sequence."""
641
- return fill_left(fill_right(seq))
642
-
643
-
644
667
  def reverse_complement_assembly(
645
- assembly: EdgeRepresentationAssembly, fragments: list[_Dseqrecord]
668
+ assembly: EdgeRepresentationAssembly, fragments: list[Dseqrecord]
646
669
  ) -> EdgeRepresentationAssembly:
647
670
  """Reverse-complement an assembly, i.e. reverse the order of the fragments and the orientation of the overlaps."""
648
671
  new_assembly = list()
@@ -656,7 +679,7 @@ def reverse_complement_assembly(
656
679
  def filter_linear_subassemblies(
657
680
  linear_assemblies: list[EdgeRepresentationAssembly],
658
681
  circular_assemblies: list[EdgeRepresentationAssembly],
659
- fragments: list[_Dseqrecord],
682
+ fragments: list[Dseqrecord],
660
683
  ) -> list[EdgeRepresentationAssembly]:
661
684
  """Remove linear assemblies which are sub-assemblies of circular assemblies"""
662
685
  all_circular_assemblies = circular_assemblies + [
@@ -702,7 +725,7 @@ def assembly2str(assembly: EdgeRepresentationAssembly) -> str:
702
725
  ('1[8:14]:2[1:7]', '2[10:17]:3[1:8]')
703
726
 
704
727
  The reason for this is that by default, a feature '[8:14]' when present in a tuple
705
- is printed to the console as `SimpleLocation(ExactPosition(8), ExactPosition(14), strand=1)` (very long).
728
+ is printed to the console as ``SimpleLocation(ExactPosition(8), ExactPosition(14), strand=1)`` (very long).
706
729
  """
707
730
  return str(tuple(f"{u}{lu}:{v}{lv}" for u, v, lu, lv in assembly))
708
731
 
@@ -715,7 +738,7 @@ def assembly2str_tuple(assembly: EdgeRepresentationAssembly) -> str:
715
738
 
716
739
 
717
740
  def assembly_has_mismatches(
718
- fragments: list[_Dseqrecord], assembly: EdgeRepresentationAssembly
741
+ fragments: list[Dseqrecord], assembly: EdgeRepresentationAssembly
719
742
  ) -> bool:
720
743
  """Check if an assembly has mismatches. This should never happen and if so it returns an error."""
721
744
  for u, v, loc_u, loc_v in assembly:
@@ -731,7 +754,7 @@ def assembly_has_mismatches(
731
754
 
732
755
 
733
756
  def assembly_is_circular(
734
- assembly: EdgeRepresentationAssembly, fragments: list[_Dseqrecord]
757
+ assembly: EdgeRepresentationAssembly, fragments: list[Dseqrecord]
735
758
  ) -> bool:
736
759
  """
737
760
  Based on the topology of the locations of an assembly, determine if it is circular.
@@ -740,22 +763,22 @@ def assembly_is_circular(
740
763
  if assembly[0][0] != assembly[-1][1]:
741
764
  return False
742
765
  elif (
743
- isinstance(fragments[abs(assembly[0][0]) - 1], _Dseqrecord)
766
+ isinstance(fragments[abs(assembly[0][0]) - 1], Dseqrecord)
744
767
  and fragments[abs(assembly[0][0]) - 1].circular
745
768
  ):
746
769
  return True
747
770
  else:
748
771
  return (
749
- _location_boundaries(assembly[0][2])[0]
750
- > _location_boundaries(assembly[-1][3])[0]
772
+ location_boundaries(assembly[0][2])[0]
773
+ > location_boundaries(assembly[-1][3])[0]
751
774
  )
752
775
 
753
776
 
754
777
  def assemble(
755
- fragments: list[_Dseqrecord],
778
+ fragments: list[Dseqrecord],
756
779
  assembly: EdgeRepresentationAssembly,
757
780
  is_insertion: bool = False,
758
- ) -> _Dseqrecord:
781
+ ) -> Dseqrecord:
759
782
  """Generate a Dseqrecord from an assembly and a list of fragments."""
760
783
 
761
784
  if is_insertion:
@@ -772,14 +795,15 @@ def assemble(
772
795
  u, v, loc_u, loc_v = asm_edge
773
796
  f_u = fragments[u - 1] if u > 0 else fragments[-u - 1].reverse_complement()
774
797
  f_v = fragments[v - 1] if v > 0 else fragments[-v - 1].reverse_complement()
775
- seq_u = str(loc_u.extract(f_u).seq).upper()
776
- seq_v = str(loc_v.extract(f_v).seq).upper()
777
- if seq_u != seq_v:
798
+ seq_u = str(loc_u.extract(f_u).seq)
799
+ seq_v = str(loc_v.extract(f_v).seq.rc())
800
+ # Test if seq_u and seq_v anneal
801
+ if not anneal_strands(seq_u, seq_v):
778
802
  raise ValueError("Mismatch in assembly")
779
803
 
780
804
  # We transform into Dseqrecords (for primers)
781
805
  dseqr_fragments = [
782
- f if isinstance(f, _Dseqrecord) else _Dseqrecord(f) for f in fragments
806
+ f if isinstance(f, Dseqrecord) else Dseqrecord(f) for f in fragments
783
807
  ]
784
808
  subfragments = get_assembly_subfragments(
785
809
  dseqr_fragments, subfragment_representation
@@ -787,49 +811,33 @@ def assemble(
787
811
 
788
812
  # Length of the overlaps between consecutive assembly fragments
789
813
  fragment_overlaps = [len(e[-1]) for e in assembly]
814
+ out_dseqrecord = subfragments.pop(0)
790
815
 
791
- out_dseqrecord = _Dseqrecord(subfragments[0])
816
+ for fragment, overlap in zip(subfragments, fragment_overlaps):
817
+ out_dseqrecord.seq = out_dseqrecord.seq.cast_to_ds_right()
818
+ out_dseqrecord.seq = out_dseqrecord.seq.exo1_end(overlap)
819
+ fragment.seq = fragment.seq.cast_to_ds_left()
820
+ fragment.seq = fragment.seq.exo1_front(overlap)
821
+ out_dseqrecord += fragment
792
822
 
793
- for fragment, overlap in zip(subfragments[1:], fragment_overlaps):
794
- # Shift the features of the right fragment to the left by `overlap`
795
- new_features = [
796
- f._shift(len(out_dseqrecord) - overlap) for f in fragment.features
797
- ]
798
- # Join the left sequence including the overlap with the right sequence without the overlap
799
- # we use fill_right / fill_left so that it works for ligation of sticky ends
800
- out_dseqrecord = _Dseqrecord(
801
- fill_right(out_dseqrecord.seq) + fill_left(fragment.seq)[overlap:],
802
- features=out_dseqrecord.features + new_features,
803
- )
804
-
805
- # For circular assemblies, close the loop and wrap origin-spanning features
823
+ # For circular assemblies, process the fragment and loop
806
824
  if is_circular:
825
+ out_dseqrecord.seq = out_dseqrecord.seq.cast_to_ds_left()
826
+ out_dseqrecord.seq = out_dseqrecord.seq.cast_to_ds_right()
807
827
  overlap = fragment_overlaps[-1]
828
+ out_dseqrecord.seq = out_dseqrecord.seq.exo1_front(overlap)
829
+ out_dseqrecord.seq = out_dseqrecord.seq.exo1_end(overlap)
830
+ out_dseqrecord = out_dseqrecord.looped()
808
831
 
809
- # Special case for blunt circularisation
810
- if overlap == 0:
811
- return out_dseqrecord.looped()
812
-
813
- # Remove trailing overlap
814
- out_dseqrecord = _Dseqrecord(
815
- fill_dseq(out_dseqrecord.seq)[:-overlap],
816
- features=out_dseqrecord.features,
817
- circular=True,
818
- )
819
- for feature in out_dseqrecord.features:
820
- start, end = _location_boundaries(feature.location)
821
- if start >= len(out_dseqrecord) or end > len(out_dseqrecord):
822
- # Wrap around the origin
823
- feature.location = _shift_location(
824
- feature.location, 0, len(out_dseqrecord)
825
- )
826
-
832
+ out_dseqrecord.source = AssemblySource.from_subfragment_representation(
833
+ subfragment_representation, fragments, is_circular
834
+ )
827
835
  return out_dseqrecord
828
836
 
829
837
 
830
838
  def annotate_primer_binding_sites(
831
- input_dseqr: _Dseqrecord, fragments: list[_Dseqrecord]
832
- ) -> _Dseqrecord:
839
+ input_dseqr: Dseqrecord, fragments: list[Dseqrecord]
840
+ ) -> Dseqrecord:
833
841
  """Annotate the primer binding sites in a Dseqrecord."""
834
842
  fwd, _, rvs = fragments
835
843
  start_rvs = len(input_dseqr) - len(rvs)
@@ -909,37 +917,36 @@ def subfragment_representation2edge_representation(
909
917
 
910
918
 
911
919
  def get_assembly_subfragments(
912
- fragments: list[_Dseqrecord],
920
+ fragments: list[Dseqrecord],
913
921
  subfragment_representation: SubFragmentRepresentationAssembly,
914
- ) -> list[_Dseqrecord]:
922
+ ) -> list[Dseqrecord]:
915
923
  """From the fragment representation returned by edge_representation2subfragment_representation, get the subfragments that are joined together.
916
924
 
917
925
  Subfragments are the slices of the fragments that are joined together
918
926
 
919
- For example:
920
- ```
921
- --A--
922
- TACGTAAT
923
- --B--
924
- TCGTAACGA
925
-
926
- Gives: TACGTAA / CGTAACGA
927
- ```
928
- To reproduce:
929
- ```
930
- a = Dseqrecord('TACGTAAT')
931
- b = Dseqrecord('TCGTAACGA')
932
- f = Assembly([a, b], limit=5)
933
- a0 = f.get_linear_assemblies()[0]
934
- print(assembly2str(a0))
935
- a0_subfragment_rep =edge_representation2subfragment_representation(a0, False)
936
- for f in get_assembly_subfragments([a, b], a0_subfragment_rep):
937
- print(f.seq)
938
-
939
- # prints TACGTAA and CGTAACGA
940
- ```
941
-
942
- Subfragments: `cccccgtatcgtgt`, `atcgtgtactgtcatattc`
927
+ For example::
928
+
929
+ --A--
930
+ TACGTAAT
931
+ --B--
932
+ TCGTAACGA
933
+
934
+ Gives: TACGTAA / CGTAACGA
935
+
936
+ To reproduce::
937
+
938
+ a = Dseqrecord('TACGTAAT')
939
+ b = Dseqrecord('TCGTAACGA')
940
+ f = Assembly([a, b], limit=5)
941
+ a0 = f.get_linear_assemblies()[0]
942
+ print(assembly2str(a0))
943
+ a0_subfragment_rep =edge_representation2subfragment_representation(a0, False)
944
+ for f in get_assembly_subfragments([a, b], a0_subfragment_rep):
945
+ print(f.seq)
946
+
947
+ # prints TACGTAA and CGTAACGA
948
+
949
+ Subfragments: ``cccccgtatcgtgt``, ``atcgtgtactgtcatattc``
943
950
  """
944
951
  subfragments = list()
945
952
  for node, start_location, end_location in subfragment_representation:
@@ -953,19 +960,26 @@ def get_assembly_subfragments(
953
960
 
954
961
 
955
962
  def extract_subfragment(
956
- seq: _Dseqrecord, start_location: Location, end_location: Location
957
- ) -> _Dseqrecord:
963
+ seq: Dseqrecord, start_location: Location | None, end_location: Location | None
964
+ ) -> Dseqrecord:
958
965
  """Extract a subfragment from a sequence for an assembly, given the start and end locations of the subfragment."""
959
- start = 0 if start_location is None else _location_boundaries(start_location)[0]
960
- end = None if end_location is None else _location_boundaries(end_location)[1]
966
+
967
+ if seq.circular and (start_location is None or end_location is None):
968
+ raise ValueError(
969
+ "Start and end locations cannot be None for circular sequences"
970
+ )
971
+ # This could be used to have consistent behaviour for circular sequences, where the start is arbitrary. However,
972
+ # they should never get None, so this is not used.
973
+ # if start_location is None:
974
+ # start_location = end_location
975
+ # elif end_location is None:
976
+ # end_location = start_location
977
+
978
+ start = 0 if start_location is None else location_boundaries(start_location)[0]
979
+ end = None if end_location is None else location_boundaries(end_location)[1]
961
980
 
962
981
  # Special case, some of it could be handled by better Dseqrecord slicing in the future
963
- if (
964
- seq.circular
965
- and start_location is not None
966
- and end_location is not None
967
- and _locations_overlap(start_location, end_location, len(seq))
968
- ):
982
+ if seq.circular and locations_overlap(start_location, end_location, len(seq)):
969
983
  # The overhang is different for origin-spanning features, for instance
970
984
  # for a feature join{[12:13], [0:3]} in a sequence of length 13, the overhang
971
985
  # is -4, not 9
@@ -975,7 +989,7 @@ def extract_subfragment(
975
989
  ovhg = 0
976
990
  dummy_cut = ((start, ovhg), None)
977
991
  open_seq = seq.apply_cut(dummy_cut, dummy_cut)
978
- return _Dseqrecord(fill_dseq(open_seq.seq), features=open_seq.features)
992
+ return Dseqrecord(open_seq.seq.cast_to_ds(), features=open_seq.features)
979
993
 
980
994
  return seq[start:end]
981
995
 
@@ -1028,33 +1042,38 @@ class Assembly:
1028
1042
 
1029
1043
  The assembly contains a directed graph, where nodes represent fragments and
1030
1044
  edges represent overlaps between fragments:
1045
+
1031
1046
  - The node keys are integers, representing the index of the fragment in the
1032
- input list of fragments. The sign of the node key represents the orientation
1033
- of the fragment, positive for forward orientation, negative for reverse orientation.
1047
+ input list of fragments. The sign of the node key represents the orientation
1048
+ of the fragment, positive for forward orientation, negative for reverse orientation.
1034
1049
  - The edges contain the locations of the overlaps in the fragments. For an edge (u, v, key):
1035
1050
  - u and v are the nodes connected by the edge.
1036
1051
  - key is a string that represents the location of the overlap. In the format:
1037
- 'u[start:end](strand):v[start:end](strand)'.
1052
+ 'u[start:end](strand):v[start:end](strand)'.
1038
1053
  - Edges have a 'locations' attribute, which is a list of two FeatureLocation objects,
1039
- representing the location of the overlap in the u and v fragment, respectively.
1054
+ representing the location of the overlap in the u and v fragment, respectively.
1040
1055
  - You can think of an edge as a representation of the join of two fragments.
1041
1056
 
1042
1057
  If fragment 1 and 2 share a subsequence of 6bp, [8:14] in fragment 1 and [1:7] in fragment 2,
1043
1058
  there will be 4 edges representing that overlap in the graph, for all possible
1044
1059
  orientations of the fragments (see add_edges_from_match for details):
1045
- - `(1, 2, '1[8:14]:2[1:7]')`
1046
- - `(2, 1, '2[1:7]:1[8:14]')`
1047
- - `(-1, -2, '-1[0:6]:-2[10:16]')`
1048
- - `(-2, -1, '-2[10:16]:-1[0:6]')`
1060
+
1061
+ - ``(1, 2, '1[8:14]:2[1:7]')``
1062
+ - ``(2, 1, '2[1:7]:1[8:14]')``
1063
+ - ``(-1, -2, '-1[0:6]:-2[10:16]')``
1064
+ - ``(-2, -1, '-2[10:16]:-1[0:6]')``
1049
1065
 
1050
1066
  An assembly can be thought of as a tuple of graph edges, but instead of representing them with node indexes and keys, we represent them
1051
1067
  as u, v, locu, locv, where u and v are the nodes connected by the edge, and locu and locv are the locations of the overlap in the first
1052
1068
  and second fragment. Assemblies are then represented as:
1069
+
1053
1070
  - Linear: ((1, 2, [8:14], [1:7]), (2, 3, [10:17], [1:8]))
1054
1071
  - Circular: ((1, 2, [8:14], [1:7]), (2, 3, [10:17], [1:8]), (3, 1, [12:17], [1:6]))
1072
+
1055
1073
  Note that the first and last fragment are the same in a circular assembly.
1056
1074
 
1057
1075
  The following constraints are applied to remove duplicate assemblies:
1076
+
1058
1077
  - Circular assemblies: the first subfragment is not reversed, and has the smallest index in the input fragment list.
1059
1078
  use_fragment_order is ignored.
1060
1079
  - Linear assemblies:
@@ -1065,7 +1084,7 @@ class Assembly:
1065
1084
  frags : list
1066
1085
  A list of Dseqrecord objects.
1067
1086
  limit : int, optional
1068
- The shortest shared homology to be considered, this is passed as the third argument to the `algorithm` function.
1087
+ The shortest shared homology to be considered, this is passed as the third argument to the ``algorithm`` function.
1069
1088
  For certain algorithms, this might be ignored.
1070
1089
  algorithm : function, optional
1071
1090
  The algorithm used to determine the shared sequences. It's a function that takes two Dseqrecord objects as inputs,
@@ -1113,14 +1132,15 @@ class Assembly:
1113
1132
 
1114
1133
  def __init__(
1115
1134
  self,
1116
- frags: list[_Dseqrecord],
1135
+ frags: list[Dseqrecord],
1117
1136
  limit: int = 25,
1118
1137
  algorithm: AssemblyAlgorithmType = common_sub_strings,
1119
1138
  use_fragment_order: bool = True,
1120
1139
  use_all_fragments: bool = False,
1121
1140
  ):
1141
+
1122
1142
  # TODO: allow for the same fragment to be included more than once?
1123
- self.G = _nx.MultiDiGraph()
1143
+ self.G = nx.MultiDiGraph()
1124
1144
  # Add positive and negative nodes for forward and reverse fragments
1125
1145
  self.G.add_nodes_from((i + 1, {"seq": f}) for (i, f) in enumerate(frags))
1126
1146
  self.G.add_nodes_from(
@@ -1128,12 +1148,12 @@ class Assembly:
1128
1148
  )
1129
1149
 
1130
1150
  # Iterate over all possible combinations of fragments
1131
- fragment_pairs = _itertools.combinations(
1151
+ fragment_pairs = itertools.combinations(
1132
1152
  filter(lambda x: x > 0, self.G.nodes), 2
1133
1153
  )
1134
1154
  for i, j in fragment_pairs:
1135
1155
  # All the relative orientations of the fragments in the pair
1136
- for u, v in _itertools.product([i, -i], [j, -j]):
1156
+ for u, v in itertools.product([i, -i], [j, -j]):
1137
1157
  u_seq = self.G.nodes[u]["seq"]
1138
1158
  v_seq = self.G.nodes[v]["seq"]
1139
1159
  matches = algorithm(u_seq, v_seq, limit)
@@ -1151,7 +1171,7 @@ class Assembly:
1151
1171
  @classmethod
1152
1172
  def assembly_is_valid(
1153
1173
  cls,
1154
- fragments: list[_Dseqrecord | _Primer],
1174
+ fragments: list[Dseqrecord | Primer],
1155
1175
  assembly: EdgeRepresentationAssembly,
1156
1176
  is_circular: bool,
1157
1177
  use_all_fragments: bool,
@@ -1167,6 +1187,23 @@ class Assembly:
1167
1187
  if len(assembly) == 0:
1168
1188
  return False
1169
1189
 
1190
+ # Topology check -> Circular sequences cannot be first or last in a linear assembly.
1191
+ # For example, let's imagine aACGTc (linear) and gACGTc (circular).
1192
+ # It should not be possible to join them into a linear assembly. It's similar if we
1193
+ # think of a restriction-ligation assembly, example: aGAATTCc (linear) and gGAATTCc
1194
+ # (circular).
1195
+ # A linear product can be generated where the circular molecule is cut open, and on one end
1196
+ # it joins the linear molecule and on the other it's free, but for now it's not a
1197
+ # relevant product and it's excluded.
1198
+ first_fragment = fragments[abs(assembly[0][0]) - 1]
1199
+ last_fragment = fragments[abs(assembly[-1][1]) - 1]
1200
+ if not is_circular and (
1201
+ isinstance(first_fragment, Dseqrecord)
1202
+ and first_fragment.circular
1203
+ or (isinstance(last_fragment, Dseqrecord) and last_fragment.circular)
1204
+ ):
1205
+ return False
1206
+
1170
1207
  if use_all_fragments and len(fragments) != len(
1171
1208
  set(flatten(map(abs, e[:2]) for e in assembly))
1172
1209
  ):
@@ -1204,8 +1241,8 @@ class Assembly:
1204
1241
  # Incompatible as described in figure above
1205
1242
  fragment = fragments[abs(v1) - 1]
1206
1243
  if (
1207
- isinstance(fragment, _Primer) or not fragment.circular
1208
- ) and _location_boundaries(start_location)[1] >= _location_boundaries(
1244
+ isinstance(fragment, Primer) or not fragment.circular
1245
+ ) and location_boundaries(start_location)[1] >= location_boundaries(
1209
1246
  end_location
1210
1247
  )[
1211
1248
  1
@@ -1229,14 +1266,15 @@ class Assembly:
1229
1266
  match: SequenceOverlap,
1230
1267
  u: int,
1231
1268
  v: int,
1232
- first: _Dseqrecord,
1233
- secnd: _Dseqrecord,
1269
+ first: Dseqrecord,
1270
+ secnd: Dseqrecord,
1234
1271
  ):
1235
- """Add edges to the graph from a match returned by the `algorithm` function (see pydna.common_substrings). For
1272
+ """Add edges to the graph from a match returned by the ``algorithm`` function (see pydna.common_substrings). For
1236
1273
  format of edges (see documentation of the Assembly class).
1237
1274
 
1238
- Matches are directional, because not all `algorithm` functions return the same match for (u,v) and (v,u). For example,
1275
+ Matches are directional, because not all ``algorithm`` functions return the same match for (u,v) and (v,u). For example,
1239
1276
  homologous recombination does but sticky end ligation does not. The function returns two edges:
1277
+
1240
1278
  - Fragments in the orientation they were passed, with locations of the match (u, v, loc_u, loc_v)
1241
1279
  - Reverse complement of the fragments with inverted order, with flipped locations (-v, -u, flip(loc_v), flip(loc_u)).
1242
1280
 
@@ -1248,10 +1286,10 @@ class Assembly:
1248
1286
  else:
1249
1287
  # We use shift_location with 0 to wrap origin-spanning features
1250
1288
  locs = [
1251
- _shift_location(
1289
+ shift_location(
1252
1290
  SimpleLocation(x_start, x_start + length), 0, len(first)
1253
1291
  ),
1254
- _shift_location(
1292
+ shift_location(
1255
1293
  SimpleLocation(y_start, y_start + length), 0, len(secnd)
1256
1294
  ),
1257
1295
  ]
@@ -1286,7 +1324,7 @@ class Assembly:
1286
1324
  """
1287
1325
 
1288
1326
  # Copy the graph since we will add the begin and end mock nodes
1289
- G = _nx.MultiDiGraph(self.G)
1327
+ G = nx.MultiDiGraph(self.G)
1290
1328
  G.add_nodes_from(["begin", "end"])
1291
1329
 
1292
1330
  if self.use_fragment_order:
@@ -1324,7 +1362,7 @@ class Assembly:
1324
1362
  def node_path2assembly_list(
1325
1363
  self, cycle: list[int], circular: bool
1326
1364
  ) -> list[EdgeRepresentationAssembly]:
1327
- """Convert a node path in the format [1, 2, 3] (as returned by _nx.cycles.simple_cycles) to a list of all
1365
+ """Convert a node path in the format [1, 2, 3] (as returned by networkx.cycles.simple_cycles) to a list of all
1328
1366
  possible assemblies.
1329
1367
 
1330
1368
  There may be multiple assemblies for a given node path, if there are several edges connecting two nodes,
@@ -1338,11 +1376,11 @@ class Assembly:
1338
1376
  combine.append([(u, v, key) for key in self.G[u][v]])
1339
1377
  return [
1340
1378
  tuple(map(self.format_assembly_edge, x))
1341
- for x in _itertools.product(*combine)
1379
+ for x in itertools.product(*combine)
1342
1380
  ]
1343
1381
 
1344
1382
  def get_unique_linear_paths(
1345
- self, G_with_begin_end: _nx.MultiDiGraph, max_paths=10000
1383
+ self, G_with_begin_end: nx.MultiDiGraph, max_paths=10000
1346
1384
  ) -> list[list[int]]:
1347
1385
  """Get unique linear paths from the graph, removing those that contain the same node twice."""
1348
1386
  # We remove the begin and end nodes, and get all paths without edges
@@ -1353,8 +1391,8 @@ class Assembly:
1353
1391
  node_paths = [
1354
1392
  x[1:-1]
1355
1393
  for x in limit_iterator(
1356
- _nx.all_simple_paths(
1357
- _nx.DiGraph(G_with_begin_end),
1394
+ nx.all_simple_paths(
1395
+ nx.DiGraph(G_with_begin_end),
1358
1396
  "begin",
1359
1397
  "end",
1360
1398
  cutoff=(len(self.fragments) + 1),
@@ -1403,7 +1441,7 @@ class Assembly:
1403
1441
  sorted_cycles = map(
1404
1442
  circular_permutation_min_abs,
1405
1443
  limit_iterator(
1406
- _nx.cycles.simple_cycles(self.G, length_bound=len(self.fragments)),
1444
+ nx.cycles.simple_cycles(self.G, length_bound=len(self.fragments)),
1407
1445
  10000,
1408
1446
  ),
1409
1447
  )
@@ -1446,17 +1484,18 @@ class Assembly:
1446
1484
  Here we check if one of the joins between fragments represents the edges of an insertion assembly
1447
1485
  The fragment must be linear, and the join must be as indicated below
1448
1486
 
1449
- ```
1450
- -------- ------- Fragment 1
1451
- || ||
1452
- xxxxxxxx || Fragment 2
1453
- || ||
1454
- oooooooooo Fragment 3
1455
- ```
1487
+ ::
1488
+
1489
+ -------- ------- Fragment 1
1490
+ || ||
1491
+ xxxxxxxx || Fragment 2
1492
+ || ||
1493
+ oooooooooo Fragment 3
1494
+
1456
1495
  The above example will be [(1, 2, [4:6], [0:2]), (2, 3, [6:8], [0:2]), (3, 1, [8:10], [9:11])]
1457
1496
 
1458
1497
  These could be returned in any order by simple_cycles, so we sort the edges so that the first
1459
- and last `u` and `v` match the fragment that gets the insertion (1 in the example above).
1498
+ and last ``u`` and ``v`` match the fragment that gets the insertion (1 in the example above).
1460
1499
  """
1461
1500
  edge_pair_index = list()
1462
1501
 
@@ -1467,8 +1506,8 @@ class Assembly:
1467
1506
  fragment = self.fragments[abs(v1) - 1]
1468
1507
  # Find the pair of edges that should be last and first ((3, 1, [8:10], [9:11]), (1, 2, [4:6], [0:2])) in
1469
1508
  # the example above. Only one of the pairs of edges should satisfy this condition for the topology to make sense.
1470
- left_of_insertion = _location_boundaries(start_location)[0]
1471
- right_of_insertion = _location_boundaries(end_location)[0]
1509
+ left_of_insertion = location_boundaries(start_location)[0]
1510
+ right_of_insertion = location_boundaries(end_location)[0]
1472
1511
  if not fragment.circular and (
1473
1512
  right_of_insertion >= left_of_insertion
1474
1513
  # The below condition is for single-site integration.
@@ -1480,7 +1519,7 @@ class Assembly:
1480
1519
  #
1481
1520
  # The locations of homology on the genome are [0:10] and [2:12], so not identical
1482
1521
  # but they overlap.
1483
- or _locations_overlap(start_location, end_location, len(fragment))
1522
+ or locations_overlap(start_location, end_location, len(fragment))
1484
1523
  ):
1485
1524
  edge_pair_index.append(i)
1486
1525
 
@@ -1511,13 +1550,13 @@ class Assembly:
1511
1550
  fragment1 = self.fragments[abs(f1) - 1]
1512
1551
  fragment2 = self.fragments[abs(f2) - 1]
1513
1552
 
1514
- if not _locations_overlap(
1553
+ if not locations_overlap(
1515
1554
  loc_f1_1, loc_f1_2, len(fragment1)
1516
- ) or not _locations_overlap(loc_f2_2, loc_f2_1, len(fragment2)):
1555
+ ) or not locations_overlap(loc_f2_2, loc_f2_1, len(fragment2)):
1517
1556
  return same_assembly
1518
1557
 
1519
1558
  # Sort to make compatible with insertion assembly
1520
- if _location_boundaries(loc_f1_1)[0] > _location_boundaries(loc_f1_2)[0]:
1559
+ if location_boundaries(loc_f1_1)[0] > location_boundaries(loc_f1_2)[0]:
1521
1560
  new_assembly = same_assembly[::-1]
1522
1561
  else:
1523
1562
  new_assembly = same_assembly[:]
@@ -1530,10 +1569,10 @@ class Assembly:
1530
1569
  fragment2 = self.fragments[abs(f2) - 1]
1531
1570
 
1532
1571
  # Extract boundaries
1533
- f2_1_start, _ = _location_boundaries(loc_f2_1)
1534
- f2_2_start, f2_2_end = _location_boundaries(loc_f2_2)
1535
- f1_1_start, _ = _location_boundaries(loc_f1_1)
1536
- f1_2_start, f1_2_end = _location_boundaries(loc_f1_2)
1572
+ f2_1_start, _ = location_boundaries(loc_f2_1)
1573
+ f2_2_start, f2_2_end = location_boundaries(loc_f2_2)
1574
+ f1_1_start, _ = location_boundaries(loc_f1_1)
1575
+ f1_2_start, f1_2_end = location_boundaries(loc_f1_2)
1537
1576
 
1538
1577
  overlap_diff = len(fragment1[f1_1_start:f1_2_end]) - len(
1539
1578
  fragment2[f2_1_start:f2_2_end]
@@ -1573,7 +1612,7 @@ class Assembly:
1573
1612
  "only_adjacent_edges not implemented for insertion assemblies"
1574
1613
  )
1575
1614
 
1576
- cycles = limit_iterator(_nx.cycles.simple_cycles(self.G), 10000)
1615
+ cycles = limit_iterator(nx.cycles.simple_cycles(self.G), 10000)
1577
1616
 
1578
1617
  # We apply constraints already here because sometimes the combinatorial explosion is too large
1579
1618
  if self.use_all_fragments:
@@ -1592,7 +1631,7 @@ class Assembly:
1592
1631
  )
1593
1632
 
1594
1633
  # We find cycles first
1595
- iterator = limit_iterator(_nx.cycles.simple_cycles(self.G), 10000)
1634
+ iterator = limit_iterator(nx.cycles.simple_cycles(self.G), 10000)
1596
1635
  assemblies = sum(
1597
1636
  map(lambda x: self.node_path2assembly_list(x, True), iterator), []
1598
1637
  )
@@ -1616,29 +1655,27 @@ class Assembly:
1616
1655
 
1617
1656
  def assemble_linear(
1618
1657
  self, only_adjacent_edges: bool = False, max_assemblies: int = 50
1619
- ) -> list[_Dseqrecord]:
1658
+ ) -> list[Dseqrecord]:
1620
1659
  """Assemble linear constructs, from assemblies returned by self.get_linear_assemblies."""
1621
1660
  assemblies = self.get_linear_assemblies(only_adjacent_edges, max_assemblies)
1622
1661
  return [assemble(self.fragments, a) for a in assemblies]
1623
1662
 
1624
1663
  def assemble_circular(
1625
1664
  self, only_adjacent_edges: bool = False, max_assemblies: int = 50
1626
- ) -> list[_Dseqrecord]:
1665
+ ) -> list[Dseqrecord]:
1627
1666
  """Assemble circular constructs, from assemblies returned by self.get_circular_assemblies."""
1628
1667
  assemblies = self.get_circular_assemblies(only_adjacent_edges, max_assemblies)
1629
1668
  return [assemble(self.fragments, a) for a in assemblies]
1630
1669
 
1631
- def assemble_insertion(
1632
- self, only_adjacent_edges: bool = False
1633
- ) -> list[_Dseqrecord]:
1670
+ def assemble_insertion(self, only_adjacent_edges: bool = False) -> list[Dseqrecord]:
1634
1671
  """Assemble insertion constructs, from assemblies returned by self.get_insertion_assemblies."""
1635
1672
  assemblies = self.get_insertion_assemblies(only_adjacent_edges)
1636
1673
  return [assemble(self.fragments, a, is_insertion=True) for a in assemblies]
1637
1674
 
1638
1675
  def get_locations_on_fragments(self) -> dict[int, dict[str, list[Location]]]:
1639
1676
  """Get a dictionary where the keys are the nodes in the graph, and the values are dictionaries with keys
1640
- `left`, `right`, containing (for each fragment) the locations where the fragment is joined to another fragment on its left
1641
- and right side. The values in `left` and `right` are often the same, except in restriction-ligation with partial overlap enabled,
1677
+ ``left``, ``right``, containing (for each fragment) the locations where the fragment is joined to another fragment on its left
1678
+ and right side. The values in ``left`` and ``right`` are often the same, except in restriction-ligation with partial overlap enabled,
1642
1679
  where we can end up with a situation like this:
1643
1680
 
1644
1681
  GGTCTCCCCAATT and aGGTCTCCAACCAA as fragments
@@ -1651,13 +1688,14 @@ class Assembly:
1651
1688
  aGGTCTCCxxCCAATT
1652
1689
  tCCAGAGGTTGGxxAA
1653
1690
 
1654
- Would return
1655
- {
1656
- 1: {'left': [7:9], 'right': [9:11]},
1657
- 2: {'left': [8:10], 'right': [10:12]},
1658
- -1: {'left': [2:4], 'right': [4:6]},
1659
- -2: {'left': [2:4], 'right': [4:6]}
1660
- }
1691
+ Would return::
1692
+
1693
+ {
1694
+ 1: {'left': [7:9], 'right': [9:11]},
1695
+ 2: {'left': [8:10], 'right': [10:12]},
1696
+ -1: {'left': [2:4], 'right': [4:6]},
1697
+ -2: {'left': [2:4], 'right': [4:6]}
1698
+ }
1661
1699
 
1662
1700
  """
1663
1701
 
@@ -1671,10 +1709,10 @@ class Assembly:
1671
1709
  if edge_location not in this_dict[key]:
1672
1710
  this_dict[key].append(edge_location)
1673
1711
  this_dict["left"] = sorted(
1674
- this_dict["left"], key=lambda x: _location_boundaries(x)[0]
1712
+ this_dict["left"], key=lambda x: location_boundaries(x)[0]
1675
1713
  )
1676
1714
  this_dict["right"] = sorted(
1677
- this_dict["right"], key=lambda x: _location_boundaries(x)[0]
1715
+ this_dict["right"], key=lambda x: location_boundaries(x)[0]
1678
1716
  )
1679
1717
  locations_on_fragments[node] = this_dict
1680
1718
 
@@ -1686,10 +1724,10 @@ class Assembly:
1686
1724
  and prevent including partially digested fragments. For example, imagine the following fragment being an input for a digestion
1687
1725
  and ligation assembly, where the enzyme cuts at the sites indicated by the vertical lines:
1688
1726
 
1689
- ```
1690
- x y z
1691
- -------|-------|-------|---------
1692
- ```
1727
+ ::
1728
+
1729
+ x y z
1730
+ -------|-------|-------|---------
1693
1731
 
1694
1732
  We would only want assemblies that contain subfragments start-x, x-y, y-z, z-end, and not start-x, y-end, for instance.
1695
1733
  The latter would indicate that the fragment was partially digested.
@@ -1721,7 +1759,7 @@ class Assembly:
1721
1759
 
1722
1760
  pairs = list()
1723
1761
  for pair in zip(left, right):
1724
- pairs += list(_itertools.product(*pair))
1762
+ pairs += list(itertools.product(*pair))
1725
1763
  allowed_location_pairs[node] = pairs
1726
1764
 
1727
1765
  fragment_assembly = edge_representation2subfragment_representation(
@@ -1734,7 +1772,7 @@ class Assembly:
1734
1772
 
1735
1773
  def __repr__(self):
1736
1774
  # https://pyformat.info
1737
- return _pretty_str(
1775
+ return ps(
1738
1776
  "Assembly\n"
1739
1777
  "fragments..: {sequences}\n"
1740
1778
  "limit(bp)..: {limit}\n"
@@ -1750,12 +1788,12 @@ class Assembly:
1750
1788
 
1751
1789
  class PCRAssembly(Assembly):
1752
1790
  """
1753
- An assembly that represents a PCR, where `fragments` is a list of primer, template, primer (in that order).
1754
- It always uses the `primer_template_overlap` algorithm and accepts the `mismatches` argument to indicate
1791
+ An assembly that represents a PCR, where ``fragments`` is a list of primer, template, primer (in that order).
1792
+ It always uses the ``primer_template_overlap`` algorithm and accepts the ``mismatches`` argument to indicate
1755
1793
  the number of mismatches allowed in the overlap. Only supports substitution mismatches, not indels.
1756
1794
  """
1757
1795
 
1758
- def __init__(self, frags: list[_Dseqrecord | _Primer], limit=25, mismatches=0):
1796
+ def __init__(self, frags: list[Dseqrecord | Primer], limit=25, mismatches=0):
1759
1797
 
1760
1798
  value_error = ValueError(
1761
1799
  "PCRAssembly assembly must be initialised with a list/tuple of primer, template, primer"
@@ -1765,15 +1803,15 @@ class PCRAssembly(Assembly):
1765
1803
 
1766
1804
  # Validate the inputs: should be a series of primer, template, primer
1767
1805
  wrong_fragment_class = (
1768
- not isinstance(frags[0], _Primer),
1769
- isinstance(frags[1], _Primer),
1770
- not isinstance(frags[2], _Primer),
1806
+ not isinstance(frags[0], Primer),
1807
+ isinstance(frags[1], Primer),
1808
+ not isinstance(frags[2], Primer),
1771
1809
  )
1772
1810
  if any(wrong_fragment_class):
1773
1811
  raise value_error
1774
1812
 
1775
1813
  # TODO: allow for the same fragment to be included more than once?
1776
- self.G = _nx.MultiDiGraph()
1814
+ self.G = nx.MultiDiGraph()
1777
1815
  # Add positive and negative nodes for forward and reverse fragments
1778
1816
  self.G.add_nodes_from((i + 1, {"seq": f}) for (i, f) in enumerate(frags))
1779
1817
  self.G.add_nodes_from(
@@ -1786,8 +1824,8 @@ class PCRAssembly(Assembly):
1786
1824
  # primer, template, primer
1787
1825
  p1, t, p2 = (i + 1, i + 2, i + 3)
1788
1826
  primer_ids += [p1, p2]
1789
- pairs += list(_itertools.product([p1, p2], [t, -t]))
1790
- pairs += list(_itertools.product([t, -t], [-p1, -p2]))
1827
+ pairs += list(itertools.product([p1, p2], [t, -t]))
1828
+ pairs += list(itertools.product([t, -t], [-p1, -p2]))
1791
1829
 
1792
1830
  for u, v in pairs:
1793
1831
  u_seq = self.G.nodes[u]["seq"]
@@ -1826,20 +1864,33 @@ class PCRAssembly(Assembly):
1826
1864
  "get_insertion_assemblies not implemented for PCR assemblies"
1827
1865
  )
1828
1866
 
1867
+ def assemble_linear(
1868
+ self, only_adjacent_edges: bool = False, max_assemblies: int = 50
1869
+ ) -> list[Dseqrecord]:
1870
+ """
1871
+ Overrides the parent method to ensure that the 5' of the crick strand of the product matches the
1872
+ sequence of the reverse primer. This is important when using primers with dUTP (for USER cloning).
1873
+ """
1874
+ results = super().assemble_linear(only_adjacent_edges, max_assemblies)
1875
+ for result in results:
1876
+ rp = self.fragments[2]
1877
+ result.seq = result.seq[: -len(rp)] + Dseq(str(rp.seq.rc()))
1878
+ return results
1879
+
1829
1880
 
1830
1881
  class SingleFragmentAssembly(Assembly):
1831
1882
  """
1832
1883
  An assembly that represents the circularisation or splicing of a single fragment.
1833
1884
  """
1834
1885
 
1835
- def __init__(self, frags: [_Dseqrecord], limit=25, algorithm=common_sub_strings):
1886
+ def __init__(self, frags: [Dseqrecord], limit=25, algorithm=common_sub_strings):
1836
1887
 
1837
1888
  if len(frags) != 1:
1838
1889
  raise ValueError(
1839
1890
  "SingleFragmentAssembly assembly must be initialised with a single fragment"
1840
1891
  )
1841
1892
  # TODO: allow for the same fragment to be included more than once?
1842
- self.G = _nx.MultiDiGraph()
1893
+ self.G = nx.MultiDiGraph()
1843
1894
  frag = frags[0]
1844
1895
  # Add positive and negative nodes for forward and reverse fragments
1845
1896
  self.G.add_node(1, seq=frag)
@@ -1890,8 +1941,8 @@ class SingleFragmentAssembly(Assembly):
1890
1941
  if x[0][2] == x[0][3]:
1891
1942
  return False
1892
1943
  # We don't want to get overlap only (e.g. GAATTCcatGAATTC giving GAATTC)
1893
- left_start, _ = _location_boundaries(x[0][2])
1894
- _, right_end = _location_boundaries(x[0][3])
1944
+ left_start, _ = location_boundaries(x[0][2])
1945
+ _, right_end = location_boundaries(x[0][3])
1895
1946
  if left_start == 0 and right_end == len(self.fragments[0]):
1896
1947
  return False
1897
1948
  return True
@@ -1914,18 +1965,19 @@ class SingleFragmentAssembly(Assembly):
1914
1965
 
1915
1966
 
1916
1967
  def common_function_assembly_products(
1917
- frags: list[_Dseqrecord],
1968
+ frags: list[Dseqrecord],
1918
1969
  limit: int | None,
1919
1970
  algorithm: Callable,
1920
1971
  circular_only: bool,
1921
1972
  filter_results_function: Callable | None = None,
1922
- ) -> list[_Dseqrecord]:
1973
+ only_adjacent_edges: bool = False,
1974
+ ) -> list[Dseqrecord]:
1923
1975
  """Common function to avoid code duplication. Could be simplified further
1924
1976
  once SingleFragmentAssembly and Assembly are merged.
1925
1977
 
1926
1978
  Parameters
1927
1979
  ----------
1928
- frags : list[_Dseqrecord]
1980
+ frags : list[Dseqrecord]
1929
1981
  List of DNA fragments to assemble
1930
1982
  limit : int or None
1931
1983
  Minimum overlap length required, or None if not applicable
@@ -1933,10 +1985,14 @@ def common_function_assembly_products(
1933
1985
  Function that determines valid overlaps between fragments
1934
1986
  circular_only : bool
1935
1987
  If True, only return circular assemblies
1988
+ filter_results_function : Callable or None
1989
+ Function that filters the results
1990
+ only_adjacent_edges : bool
1991
+ If True, only return assemblies that use only adjacent edges
1936
1992
 
1937
1993
  Returns
1938
1994
  -------
1939
- list[_Dseqrecord]
1995
+ list[Dseqrecord]
1940
1996
  List of assembled DNA molecules
1941
1997
  """
1942
1998
  if len(frags) == 1:
@@ -1945,10 +2001,10 @@ def common_function_assembly_products(
1945
2001
  asm = Assembly(
1946
2002
  frags, limit, algorithm, use_fragment_order=False, use_all_fragments=True
1947
2003
  )
1948
- output_assemblies = asm.get_circular_assemblies()
2004
+ output_assemblies = asm.get_circular_assemblies(only_adjacent_edges)
1949
2005
  if not circular_only and len(frags) > 1:
1950
2006
  output_assemblies += filter_linear_subassemblies(
1951
- asm.get_linear_assemblies(), output_assemblies, frags
2007
+ asm.get_linear_assemblies(only_adjacent_edges), output_assemblies, frags
1952
2008
  )
1953
2009
  if not circular_only and len(frags) == 1:
1954
2010
  output_assemblies += asm.get_insertion_assemblies()
@@ -1959,14 +2015,29 @@ def common_function_assembly_products(
1959
2015
  return [assemble(frags, a) for a in output_assemblies]
1960
2016
 
1961
2017
 
2018
+ def _recast_sources(
2019
+ products: list[Dseqrecord], source_cls, **extra_fields
2020
+ ) -> list[Dseqrecord]:
2021
+ """Recast the `source` of each product to `source_cls` with optional extras.
2022
+
2023
+ This avoids repeating the same for-loop across many assembly functions.
2024
+ """
2025
+ for prod in products:
2026
+ prod.source = source_cls(
2027
+ **prod.source.to_unserialized_dict(),
2028
+ **extra_fields,
2029
+ )
2030
+ return products
2031
+
2032
+
1962
2033
  def gibson_assembly(
1963
- frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
1964
- ) -> list[_Dseqrecord]:
2034
+ frags: list[Dseqrecord], limit: int = 25, circular_only: bool = False
2035
+ ) -> list[Dseqrecord]:
1965
2036
  """Returns the products for Gibson assembly.
1966
2037
 
1967
2038
  Parameters
1968
2039
  ----------
1969
- frags : list[_Dseqrecord]
2040
+ frags : list[Dseqrecord]
1970
2041
  List of DNA fragments to assemble
1971
2042
  limit : int, optional
1972
2043
  Minimum overlap length required, by default 25
@@ -1975,23 +2046,25 @@ def gibson_assembly(
1975
2046
 
1976
2047
  Returns
1977
2048
  -------
1978
- list[_Dseqrecord]
2049
+ list[Dseqrecord]
1979
2050
  List of assembled DNA molecules
1980
2051
  """
1981
- return common_function_assembly_products(
2052
+
2053
+ products = common_function_assembly_products(
1982
2054
  frags, limit, gibson_overlap, circular_only
1983
2055
  )
2056
+ return _recast_sources(products, GibsonAssemblySource)
1984
2057
 
1985
2058
 
1986
2059
  def in_fusion_assembly(
1987
- frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
1988
- ) -> list[_Dseqrecord]:
2060
+ frags: list[Dseqrecord], limit: int = 25, circular_only: bool = False
2061
+ ) -> list[Dseqrecord]:
1989
2062
  """Returns the products for in-fusion assembly. This is the same as Gibson
1990
2063
  assembly, but with a different name.
1991
2064
 
1992
2065
  Parameters
1993
2066
  ----------
1994
- frags : list[_Dseqrecord]
2067
+ frags : list[Dseqrecord]
1995
2068
  List of DNA fragments to assemble
1996
2069
  limit : int, optional
1997
2070
  Minimum overlap length required, by default 25
@@ -2000,21 +2073,23 @@ def in_fusion_assembly(
2000
2073
 
2001
2074
  Returns
2002
2075
  -------
2003
- list[_Dseqrecord]
2076
+ list[Dseqrecord]
2004
2077
  List of assembled DNA molecules
2005
2078
  """
2006
- return gibson_assembly(frags, limit)
2079
+
2080
+ products = gibson_assembly(frags, limit)
2081
+ return _recast_sources(products, InFusionSource)
2007
2082
 
2008
2083
 
2009
2084
  def fusion_pcr_assembly(
2010
- frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
2011
- ) -> list[_Dseqrecord]:
2085
+ frags: list[Dseqrecord], limit: int = 25, circular_only: bool = False
2086
+ ) -> list[Dseqrecord]:
2012
2087
  """Returns the products for fusion PCR assembly. This is the same as Gibson
2013
2088
  assembly, but with a different name.
2014
2089
 
2015
2090
  Parameters
2016
2091
  ----------
2017
- frags : list[_Dseqrecord]
2092
+ frags : list[Dseqrecord]
2018
2093
  List of DNA fragments to assemble
2019
2094
  limit : int, optional
2020
2095
  Minimum overlap length required, by default 25
@@ -2023,20 +2098,21 @@ def fusion_pcr_assembly(
2023
2098
 
2024
2099
  Returns
2025
2100
  -------
2026
- list[_Dseqrecord]
2101
+ list[Dseqrecord]
2027
2102
  List of assembled DNA molecules
2028
2103
  """
2029
- return gibson_assembly(frags, limit)
2104
+ products = gibson_assembly(frags, limit)
2105
+ return _recast_sources(products, OverlapExtensionPCRLigationSource)
2030
2106
 
2031
2107
 
2032
2108
  def in_vivo_assembly(
2033
- frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
2034
- ) -> list[_Dseqrecord]:
2109
+ frags: list[Dseqrecord], limit: int = 25, circular_only: bool = False
2110
+ ) -> list[Dseqrecord]:
2035
2111
  """Returns the products for in vivo assembly (IVA), which relies on homologous recombination between the fragments.
2036
2112
 
2037
2113
  Parameters
2038
2114
  ----------
2039
- frags : list[_Dseqrecord]
2115
+ frags : list[Dseqrecord]
2040
2116
  List of DNA fragments to assemble
2041
2117
  limit : int, optional
2042
2118
  Minimum overlap length required, by default 25
@@ -2045,30 +2121,32 @@ def in_vivo_assembly(
2045
2121
 
2046
2122
  Returns
2047
2123
  -------
2048
- list[_Dseqrecord]
2124
+ list[Dseqrecord]
2049
2125
  List of assembled DNA molecules
2050
2126
  """
2051
- return common_function_assembly_products(
2127
+ products = common_function_assembly_products(
2052
2128
  frags, limit, common_sub_strings, circular_only
2053
2129
  )
2130
+ return _recast_sources(products, InVivoAssemblySource)
2054
2131
 
2055
2132
 
2056
2133
  def restriction_ligation_assembly(
2057
- frags: list[_Dseqrecord],
2058
- enzymes: list["_AbstractCut"],
2134
+ frags: list[Dseqrecord],
2135
+ enzymes: list["AbstractCut"],
2059
2136
  allow_blunt: bool = True,
2060
2137
  circular_only: bool = False,
2061
- ) -> list[_Dseqrecord]:
2138
+ ) -> list[Dseqrecord]:
2062
2139
  """Returns the products for restriction ligation assembly:
2063
- * Finds cutsites in the fragments
2064
- * Finds all products that could be assembled by ligating the fragments based on those cutsites
2065
- * Will NOT return products that combine an existing end with an end generated by the same enzyme (see example below)
2140
+
2141
+ - Finds cutsites in the fragments
2142
+ - Finds all products that could be assembled by ligating the fragments based on those cutsites
2143
+ - Will NOT return products that combine an existing end with an end generated by the same enzyme (see example below)
2066
2144
 
2067
2145
  Parameters
2068
2146
  ----------
2069
- frags : list[_Dseqrecord]
2147
+ frags : list[Dseqrecord]
2070
2148
  List of DNA fragments to assemble
2071
- enzymes : list[_AbstractCut]
2149
+ enzymes : list[AbstractCut]
2072
2150
  List of restriction enzymes to use
2073
2151
  allow_blunt : bool, optional
2074
2152
  If True, allow blunt end ligations, by default True
@@ -2077,15 +2155,15 @@ def restriction_ligation_assembly(
2077
2155
 
2078
2156
  Returns
2079
2157
  -------
2080
- list[_Dseqrecord]
2158
+ list[Dseqrecord]
2081
2159
  List of assembled DNA molecules
2082
2160
 
2083
2161
  Examples
2084
2162
  --------
2085
2163
  In the example below, we plan to assemble a plasmid from a backbone and an insert, using the EcoRI and SalI enzymes.
2086
- Note how 2 circular products are returned, one contains the insert (`acgt`)
2087
- and the desired part of the backbone (`cccccc`), the other contains the
2088
- reversed insert (`tgga`) and the cut-out part of the backbone (`aaa`).
2164
+ Note how 2 circular products are returned, one contains the insert (``acgt``)
2165
+ and the desired part of the backbone (``cccccc``), the other contains the
2166
+ reversed insert (``tgga``) and the cut-out part of the backbone (``aaa``).
2089
2167
 
2090
2168
  >>> from pydna.assembly2 import restriction_ligation_assembly
2091
2169
  >>> from pydna.dseqrecord import Dseqrecord
@@ -2119,28 +2197,33 @@ def restriction_ligation_assembly(
2119
2197
  TTAAGtttC
2120
2198
  """
2121
2199
 
2122
- def algo(x, y, _l):
2200
+ def algorithm_fn(x, y, _l):
2123
2201
  # By default, we allow blunt ends
2124
2202
  return restriction_ligation_overlap(x, y, enzymes, False, allow_blunt)
2125
2203
 
2126
- return common_function_assembly_products(frags, None, algo, circular_only)
2204
+ products = common_function_assembly_products(
2205
+ frags, None, algorithm_fn, circular_only, only_adjacent_edges=True
2206
+ )
2207
+ return _recast_sources(
2208
+ products, RestrictionAndLigationSource, restriction_enzymes=enzymes
2209
+ )
2127
2210
 
2128
2211
 
2129
2212
  def golden_gate_assembly(
2130
- frags: list[_Dseqrecord],
2131
- enzymes: list["_AbstractCut"],
2213
+ frags: list[Dseqrecord],
2214
+ enzymes: list["AbstractCut"],
2132
2215
  allow_blunt: bool = True,
2133
2216
  circular_only: bool = False,
2134
- ) -> list[_Dseqrecord]:
2217
+ ) -> list[Dseqrecord]:
2135
2218
  """Returns the products for Golden Gate assembly. This is the same as
2136
2219
  restriction ligation assembly, but with a different name. Check the documentation
2137
- for `restriction_ligation_assembly` for more details.
2220
+ for ``restriction_ligation_assembly`` for more details.
2138
2221
 
2139
2222
  Parameters
2140
2223
  ----------
2141
- frags : list[_Dseqrecord]
2224
+ frags : list[Dseqrecord]
2142
2225
  List of DNA fragments to assemble
2143
- enzymes : list[_AbstractCut]
2226
+ enzymes : list[AbstractCut]
2144
2227
  List of restriction enzymes to use
2145
2228
  allow_blunt : bool, optional
2146
2229
  If True, allow blunt end ligations, by default True
@@ -2149,30 +2232,30 @@ def golden_gate_assembly(
2149
2232
 
2150
2233
  Returns
2151
2234
  -------
2152
- list[_Dseqrecord]
2235
+ list[Dseqrecord]
2153
2236
  List of assembled DNA molecules
2154
2237
 
2155
2238
  Examples
2156
2239
  --------
2157
- See the example for `restriction_ligation_assembly`.
2240
+ See the example for ``restriction_ligation_assembly``.
2158
2241
  """
2159
2242
  return restriction_ligation_assembly(frags, enzymes, allow_blunt, circular_only)
2160
2243
 
2161
2244
 
2162
2245
  def ligation_assembly(
2163
- frags: list[_Dseqrecord],
2246
+ frags: list[Dseqrecord],
2164
2247
  allow_blunt: bool = False,
2165
2248
  allow_partial_overlap: bool = False,
2166
2249
  circular_only: bool = False,
2167
- ) -> list[_Dseqrecord]:
2250
+ ) -> list[Dseqrecord]:
2168
2251
  """Returns the products for ligation assembly, as inputs pass the fragments (digested if needed) that
2169
2252
  will be ligated.
2170
2253
 
2171
- For most cases, you probably should use `restriction_ligation_assembly` instead.
2254
+ For most cases, you probably should use ``restriction_ligation_assembly`` instead.
2172
2255
 
2173
2256
  Parameters
2174
2257
  ----------
2175
- frags : list[_Dseqrecord]
2258
+ frags : list[Dseqrecord]
2176
2259
  List of DNA fragments to assemble
2177
2260
  allow_blunt : bool, optional
2178
2261
  If True, allow blunt end ligations, by default False
@@ -2183,7 +2266,7 @@ def ligation_assembly(
2183
2266
 
2184
2267
  Returns
2185
2268
  -------
2186
- list[_Dseqrecord]
2269
+ list[Dseqrecord]
2187
2270
  List of assembled DNA molecules
2188
2271
 
2189
2272
 
@@ -2215,11 +2298,14 @@ def ligation_assembly(
2215
2298
  return sticky_end_sub_strings(x, y, allow_partial_overlap)
2216
2299
 
2217
2300
  if allow_blunt:
2218
- algo = combine_algorithms(sticky_end_algorithm, blunt_overlap)
2301
+ algorithm_fn = combine_algorithms(sticky_end_algorithm, blunt_overlap)
2219
2302
  else:
2220
- algo = sticky_end_algorithm
2303
+ algorithm_fn = sticky_end_algorithm
2221
2304
 
2222
- return common_function_assembly_products(frags, None, algo, circular_only)
2305
+ products = common_function_assembly_products(
2306
+ frags, None, algorithm_fn, circular_only
2307
+ )
2308
+ return _recast_sources(products, LigationSource)
2223
2309
 
2224
2310
 
2225
2311
  def assembly_is_multi_site(asm: list[EdgeRepresentationAssembly]) -> bool:
@@ -2235,20 +2321,20 @@ def assembly_is_multi_site(asm: list[EdgeRepresentationAssembly]) -> bool:
2235
2321
 
2236
2322
 
2237
2323
  def gateway_assembly(
2238
- frags: list[_Dseqrecord],
2239
- reaction_type: str,
2324
+ frags: list[Dseqrecord],
2325
+ reaction_type: Literal["BP", "LR"],
2240
2326
  greedy: bool = False,
2241
2327
  circular_only: bool = False,
2242
2328
  multi_site_only: bool = False,
2243
- ) -> list[_Dseqrecord]:
2329
+ ) -> list[Dseqrecord]:
2244
2330
  """Returns the products for Gateway assembly / Gateway cloning.
2245
2331
 
2246
2332
  Parameters
2247
2333
  ----------
2248
- frags : list[_Dseqrecord]
2334
+ frags : list[Dseqrecord]
2249
2335
  List of DNA fragments to assemble
2250
- reaction_type : str
2251
- Type of Gateway reaction, either 'BP' or 'LR'
2336
+ reaction_type : Literal['BP', 'LR']
2337
+ Type of Gateway reaction
2252
2338
  greedy : bool, optional
2253
2339
  If True, use greedy gateway consensus sites, by default False
2254
2340
  circular_only : bool, optional
@@ -2261,7 +2347,7 @@ def gateway_assembly(
2261
2347
 
2262
2348
  Returns
2263
2349
  -------
2264
- list[_Dseqrecord]
2350
+ list[Dseqrecord]
2265
2351
  List of assembled DNA molecules
2266
2352
 
2267
2353
 
@@ -2288,9 +2374,9 @@ def gateway_assembly(
2288
2374
  >>> len(products_LR)
2289
2375
  2
2290
2376
 
2291
- Now let's understand the `multi_site_only` parameter. Let's consider a case where we are swapping fragments
2377
+ Now let's understand the ``multi_site_only`` parameter. Let's consider a case where we are swapping fragments
2292
2378
  between two plasmids using an LR reaction. Experimentally, we expect to obtain two plasmids, resulting from the
2293
- swapping between the two att sites. That's what we get if we set `multi_site_only` to True.
2379
+ swapping between the two att sites. That's what we get if we set ``multi_site_only`` to True.
2294
2380
 
2295
2381
  >>> attL2 = 'aaataatgattttattttgactgatagtgacctgttcgttgcaacaaattgataagcaatgctttcttataatgccaactttgtacaagaaagctg'
2296
2382
  >>> attR2 = 'accactttgtacaagaaagctgaacgagaaacgtaaaatgatataaatatcaatatattaaattagattttgcataaaaaacagactacataatactgtaaaacacaacatatccagtcactatg'
@@ -2300,7 +2386,7 @@ def gateway_assembly(
2300
2386
  >>> len(products)
2301
2387
  2
2302
2388
 
2303
- However, if we set `multi_site_only` to False, we get 4 products, which also include the intermediate products
2389
+ However, if we set ``multi_site_only`` to False, we get 4 products, which also include the intermediate products
2304
2390
  where the two plasmids are combined into a single one through recombination of a single att site. This is an
2305
2391
  intermediate of the reaction, and typically we don't want it:
2306
2392
 
@@ -2316,13 +2402,19 @@ def gateway_assembly(
2316
2402
  f"Invalid reaction type: {reaction_type}, can only be BP or LR"
2317
2403
  )
2318
2404
 
2319
- def algo(x, y, _l):
2405
+ def algorithm_fn(x, y, _l):
2320
2406
  return gateway_overlap(x, y, reaction_type, greedy)
2321
2407
 
2322
2408
  filter_results_function = None if not multi_site_only else assembly_is_multi_site
2323
2409
 
2324
2410
  products = common_function_assembly_products(
2325
- frags, None, algo, circular_only, filter_results_function
2411
+ frags, None, algorithm_fn, circular_only, filter_results_function
2412
+ )
2413
+ products = _recast_sources(
2414
+ products,
2415
+ GatewaySource,
2416
+ reaction_type=reaction_type,
2417
+ greedy=greedy,
2326
2418
  )
2327
2419
 
2328
2420
  if len(products) == 0:
@@ -2342,13 +2434,13 @@ def gateway_assembly(
2342
2434
 
2343
2435
 
2344
2436
  def common_function_integration_products(
2345
- frags: list[_Dseqrecord], limit: int | None, algorithm: Callable
2346
- ) -> list[_Dseqrecord]:
2437
+ frags: list[Dseqrecord], limit: int | None, algorithm: Callable
2438
+ ) -> list[Dseqrecord]:
2347
2439
  """Common function to avoid code duplication for integration products.
2348
2440
 
2349
2441
  Parameters
2350
2442
  ----------
2351
- frags : list[_Dseqrecord]
2443
+ frags : list[Dseqrecord]
2352
2444
  List of DNA fragments to integrate
2353
2445
  limit : int or None
2354
2446
  Minimum overlap length required, or None if not applicable
@@ -2357,7 +2449,7 @@ def common_function_integration_products(
2357
2449
 
2358
2450
  Returns
2359
2451
  -------
2360
- list[_Dseqrecord]
2452
+ list[Dseqrecord]
2361
2453
  List of integrated DNA molecules
2362
2454
  """
2363
2455
  if len(frags) == 1:
@@ -2378,27 +2470,27 @@ def common_function_integration_products(
2378
2470
 
2379
2471
 
2380
2472
  def common_handle_insertion_fragments(
2381
- genome: _Dseqrecord, inserts: list[_Dseqrecord]
2382
- ) -> list[_Dseqrecord]:
2473
+ genome: Dseqrecord, inserts: list[Dseqrecord]
2474
+ ) -> list[Dseqrecord]:
2383
2475
  """Common function to handle / validate insertion fragments.
2384
2476
 
2385
2477
  Parameters
2386
2478
  ----------
2387
- genome : _Dseqrecord
2479
+ genome : Dseqrecord
2388
2480
  Target genome sequence
2389
- inserts : list[_Dseqrecord] or _Dseqrecord
2481
+ inserts : list[Dseqrecord]
2390
2482
  DNA fragment(s) to insert
2391
2483
 
2392
2484
  Returns
2393
2485
  -------
2394
- list[_Dseqrecord]
2486
+ list[Dseqrecord]
2395
2487
  List containing genome and insert fragments
2396
2488
  """
2397
- if not isinstance(genome, _Dseqrecord):
2489
+ if not isinstance(genome, Dseqrecord):
2398
2490
  raise ValueError("Genome must be a Dseqrecord object")
2399
2491
 
2400
2492
  if not isinstance(inserts, list) or not all(
2401
- isinstance(f, _Dseqrecord) for f in inserts
2493
+ isinstance(f, Dseqrecord) for f in inserts
2402
2494
  ):
2403
2495
  raise ValueError("Inserts must be a list of Dseqrecord objects")
2404
2496
 
@@ -2409,13 +2501,13 @@ def common_handle_insertion_fragments(
2409
2501
 
2410
2502
 
2411
2503
  def common_function_excision_products(
2412
- genome: _Dseqrecord, limit: int | None, algorithm: Callable
2413
- ) -> list[_Dseqrecord]:
2504
+ genome: Dseqrecord, limit: int | None, algorithm: Callable
2505
+ ) -> list[Dseqrecord]:
2414
2506
  """Common function to avoid code duplication for excision products.
2415
2507
 
2416
2508
  Parameters
2417
2509
  ----------
2418
- genome : _Dseqrecord
2510
+ genome : Dseqrecord
2419
2511
  Target genome sequence
2420
2512
  limit : int or None
2421
2513
  Minimum overlap length required, or None if not applicable
@@ -2424,7 +2516,7 @@ def common_function_excision_products(
2424
2516
 
2425
2517
  Returns
2426
2518
  -------
2427
- list[_Dseqrecord]
2519
+ list[Dseqrecord]
2428
2520
  List of excised DNA molecules
2429
2521
  """
2430
2522
  asm = SingleFragmentAssembly([genome], limit, algorithm)
@@ -2432,25 +2524,25 @@ def common_function_excision_products(
2432
2524
 
2433
2525
 
2434
2526
  def homologous_recombination_integration(
2435
- genome: _Dseqrecord,
2436
- inserts: list[_Dseqrecord],
2527
+ genome: Dseqrecord,
2528
+ inserts: list[Dseqrecord],
2437
2529
  limit: int = 40,
2438
- ) -> list[_Dseqrecord]:
2530
+ ) -> list[Dseqrecord]:
2439
2531
  """Returns the products resulting from the integration of an insert (or inserts joined
2440
2532
  through in vivo recombination) into the genome through homologous recombination.
2441
2533
 
2442
2534
  Parameters
2443
2535
  ----------
2444
- genome : _Dseqrecord
2536
+ genome : Dseqrecord
2445
2537
  Target genome sequence
2446
- inserts : list[_Dseqrecord]
2538
+ inserts : list[Dseqrecord]
2447
2539
  DNA fragment(s) to insert
2448
2540
  limit : int, optional
2449
2541
  Minimum homology length required, by default 40
2450
2542
 
2451
2543
  Returns
2452
2544
  -------
2453
- list[_Dseqrecord]
2545
+ list[Dseqrecord]
2454
2546
  List of integrated DNA molecules
2455
2547
 
2456
2548
 
@@ -2479,25 +2571,28 @@ def homologous_recombination_integration(
2479
2571
  """
2480
2572
  fragments = common_handle_insertion_fragments(genome, inserts)
2481
2573
 
2482
- return common_function_integration_products(fragments, limit, common_sub_strings)
2574
+ products = common_function_integration_products(
2575
+ fragments, limit, common_sub_strings
2576
+ )
2577
+ return _recast_sources(products, HomologousRecombinationSource)
2483
2578
 
2484
2579
 
2485
2580
  def homologous_recombination_excision(
2486
- genome: _Dseqrecord, limit: int = 40
2487
- ) -> list[_Dseqrecord]:
2581
+ genome: Dseqrecord, limit: int = 40
2582
+ ) -> list[Dseqrecord]:
2488
2583
  """Returns the products resulting from the excision of a fragment from the genome through
2489
2584
  homologous recombination.
2490
2585
 
2491
2586
  Parameters
2492
2587
  ----------
2493
- genome : _Dseqrecord
2588
+ genome : Dseqrecord
2494
2589
  Target genome sequence
2495
2590
  limit : int, optional
2496
2591
  Minimum homology length required, by default 40
2497
2592
 
2498
2593
  Returns
2499
2594
  -------
2500
- list[_Dseqrecord]
2595
+ list[Dseqrecord]
2501
2596
  List containing excised plasmid and remaining genome sequence
2502
2597
 
2503
2598
  Examples
@@ -2515,27 +2610,28 @@ def homologous_recombination_excision(
2515
2610
  >>> products
2516
2611
  [Dseqrecord(o25), Dseqrecord(-32)]
2517
2612
  """
2518
- return common_function_excision_products(genome, limit, common_sub_strings)
2613
+ products = common_function_excision_products(genome, limit, common_sub_strings)
2614
+ return _recast_sources(products, HomologousRecombinationSource)
2519
2615
 
2520
2616
 
2521
2617
  def cre_lox_integration(
2522
- genome: _Dseqrecord, inserts: list[_Dseqrecord]
2523
- ) -> list[_Dseqrecord]:
2618
+ genome: Dseqrecord, inserts: list[Dseqrecord]
2619
+ ) -> list[Dseqrecord]:
2524
2620
  """Returns the products resulting from the integration of an insert (or inserts joined
2525
2621
  through cre-lox recombination among them) into the genome through cre-lox integration.
2526
2622
 
2527
- Also works with lox66 and lox71 (see `pydna.cre_lox` for more details).
2623
+ Also works with lox66 and lox71 (see ``pydna.cre_lox`` for more details).
2528
2624
 
2529
2625
  Parameters
2530
2626
  ----------
2531
- genome : _Dseqrecord
2627
+ genome : Dseqrecord
2532
2628
  Target genome sequence
2533
- inserts : list[_Dseqrecord] or _Dseqrecord
2629
+ inserts : list[Dseqrecord]
2534
2630
  DNA fragment(s) to insert
2535
2631
 
2536
2632
  Returns
2537
2633
  -------
2538
- list[_Dseqrecord]
2634
+ list[Dseqrecord]
2539
2635
  List of integrated DNA molecules
2540
2636
 
2541
2637
  Examples
@@ -2574,20 +2670,21 @@ def cre_lox_integration(
2574
2670
 
2575
2671
  """
2576
2672
  fragments = common_handle_insertion_fragments(genome, inserts)
2577
- return common_function_integration_products(fragments, None, cre_loxP_overlap)
2673
+ products = common_function_integration_products(fragments, None, cre_loxP_overlap)
2674
+ return _recast_sources(products, CreLoxRecombinationSource)
2578
2675
 
2579
2676
 
2580
- def cre_lox_excision(genome: _Dseqrecord) -> list[_Dseqrecord]:
2677
+ def cre_lox_excision(genome: Dseqrecord) -> list[Dseqrecord]:
2581
2678
  """Returns the products for CRE-lox excision.
2582
2679
 
2583
2680
  Parameters
2584
2681
  ----------
2585
- genome : _Dseqrecord
2682
+ genome : Dseqrecord
2586
2683
  Target genome sequence
2587
2684
 
2588
2685
  Returns
2589
2686
  -------
2590
- list[_Dseqrecord]
2687
+ list[Dseqrecord]
2591
2688
  List containing excised plasmid and remaining genome sequence
2592
2689
 
2593
2690
  Examples
@@ -2624,4 +2721,152 @@ def cre_lox_excision(genome: _Dseqrecord) -> list[_Dseqrecord]:
2624
2721
  >>> res2
2625
2722
  [Dseqrecord(o39), Dseqrecord(-45)]
2626
2723
  """
2627
- return common_function_excision_products(genome, None, cre_loxP_overlap)
2724
+ products = common_function_excision_products(genome, None, cre_loxP_overlap)
2725
+ return _recast_sources(products, CreLoxRecombinationSource)
2726
+
2727
+
2728
+ def crispr_integration(
2729
+ genome: Dseqrecord,
2730
+ inserts: list[Dseqrecord],
2731
+ guides: list[Primer],
2732
+ limit: int = 40,
2733
+ ) -> list[Dseqrecord]:
2734
+ """
2735
+ Returns the products for CRISPR integration.
2736
+
2737
+ Parameters
2738
+ ----------
2739
+ genome : Dseqrecord
2740
+ Target genome sequence
2741
+ inserts : list[Dseqrecord]
2742
+ DNA fragment(s) to insert
2743
+ guides : list[Primer]
2744
+ List of guide RNAs as Primer objects. This may change in the future.
2745
+ limit : int, optional
2746
+ Minimum overlap length required, by default 40
2747
+
2748
+ Returns
2749
+ -------
2750
+ list[Dseqrecord]
2751
+ List of integrated DNA molecules
2752
+
2753
+ Examples
2754
+ --------
2755
+
2756
+ >>> from pydna.dseqrecord import Dseqrecord
2757
+ >>> from pydna.assembly2 import crispr_integration
2758
+ >>> from pydna.primer import Primer
2759
+ >>> genome = Dseqrecord("aaccggttcaatgcaaacagtaatgatggatgacattcaaagcac", name="genome")
2760
+ >>> insert = Dseqrecord("aaccggttAAAAAAAAAttcaaagcac", name="insert")
2761
+ >>> guide = Primer("ttcaatgcaaacagtaatga", name="guide")
2762
+ >>> product, *_ = crispr_integration(genome, [insert], [guide], 8)
2763
+ >>> product
2764
+ Dseqrecord(-27)
2765
+
2766
+ """
2767
+ if len(guides) == 0:
2768
+ raise ValueError("At least one guide RNA is required for CRISPR integration")
2769
+
2770
+ # Get all the possible products from the homologous recombination integration
2771
+ products = homologous_recombination_integration(genome, inserts, limit)
2772
+
2773
+ # Verify that the guides cut in the region that will be repaired
2774
+
2775
+ # First we collect the positions where the guides cut
2776
+ guide_cuts = []
2777
+ for guide in guides:
2778
+ enzyme = cas9(str(guide.seq))
2779
+ possible_cuts = genome.seq.get_cutsites(enzyme)
2780
+ if len(possible_cuts) == 0:
2781
+ raise ValueError(
2782
+ f"Could not find Cas9 cutsite in the target sequence using the guide: {guide.name}"
2783
+ )
2784
+ # Keep only the position of the cut
2785
+ possible_cuts = [cut[0] for (cut, _) in possible_cuts]
2786
+ guide_cuts.append(possible_cuts)
2787
+
2788
+ # Then, we check if the possible homologous recombination products contain the cuts
2789
+ # from the guides inside the repair region.
2790
+ # We also add the used guides to each product. This is very important!
2791
+ valid_products = []
2792
+ for i, product in enumerate(products):
2793
+ # The second element of product.source.input is conventionally the insert/repair fragment
2794
+ # The other two (first and third) are the two bits of the genome
2795
+ repair_start = location_boundaries(product.source.input[0].right_location)[0]
2796
+ # Here we do +1 because the position of the cut marks the boundary (e.g. 0:10, 10:20 if a cut is at pos 10)
2797
+ repair_end = location_boundaries(product.source.input[2].left_location)[1] + 1
2798
+ repair_location = create_location(repair_start, repair_end, len(genome))
2799
+ some_cuts_inside_repair = []
2800
+ all_cuts_inside_repair = []
2801
+ for cut_group in guide_cuts:
2802
+ cuts_in_repair = [cut for cut in cut_group if cut in repair_location]
2803
+ some_cuts_inside_repair.append(len(cuts_in_repair) != 0)
2804
+ all_cuts_inside_repair.append(len(cuts_in_repair) == len(cut_group))
2805
+
2806
+ if all(some_cuts_inside_repair):
2807
+ used_guides = [g for i, g in enumerate(guides) if all_cuts_inside_repair[i]]
2808
+ # Add the used guides to the product <----- VERY IMPORTANT!
2809
+ product.source.input.extend([SourceInput(sequence=g) for g in used_guides])
2810
+ valid_products.append(product)
2811
+
2812
+ if not all(all_cuts_inside_repair):
2813
+ raise ValueError(
2814
+ "Some guides cut outside the repair region, please check the guides"
2815
+ )
2816
+
2817
+ if len(valid_products) != len(products):
2818
+ warnings.warn(
2819
+ "Some recombination products were discarded because they had off-target cuts",
2820
+ category=UserWarning,
2821
+ stacklevel=2,
2822
+ )
2823
+
2824
+ return _recast_sources(valid_products, CRISPRSource)
2825
+
2826
+
2827
+ def pcr_assembly(
2828
+ template: Dseqrecord,
2829
+ fwd_primer: Primer,
2830
+ rvs_primer: Primer,
2831
+ add_primer_features: bool = False,
2832
+ limit: int = 14,
2833
+ mismatches: int = 0,
2834
+ ) -> list[Dseqrecord]:
2835
+ """Returns the products for PCR assembly.
2836
+
2837
+ Parameters
2838
+ ----------
2839
+ template : Dseqrecord
2840
+ Template sequence
2841
+ fwd_primer : Primer
2842
+ Forward primer
2843
+ rvs_primer : Primer
2844
+ Reverse primer
2845
+ add_primer_features : bool, optional
2846
+ If True, add primer features to the product, by default False
2847
+ limit : int, optional
2848
+ Minimum overlap length required, by default 14
2849
+ mismatches : int, optional
2850
+ Maximum number of mismatches, by default 0
2851
+
2852
+ Returns
2853
+ -------
2854
+ list[Dseqrecord]
2855
+ List of assembled DNA molecules
2856
+ """
2857
+
2858
+ minimal_annealing = limit + mismatches
2859
+ fragments = [fwd_primer, template, rvs_primer]
2860
+ asm = PCRAssembly(
2861
+ fragments,
2862
+ limit=minimal_annealing,
2863
+ mismatches=mismatches,
2864
+ )
2865
+ products = asm.assemble_linear()
2866
+ # If both primers are the same, remove duplicates
2867
+ if str(fwd_primer.seq).upper() == str(rvs_primer.seq).upper():
2868
+ products = [p for p in products if not p.source.input[1].reverse_complemented]
2869
+ if add_primer_features:
2870
+ products = [annotate_primer_binding_sites(prod, fragments) for prod in products]
2871
+
2872
+ return _recast_sources(products, PCRSource, add_primer_features=add_primer_features)