pydna 5.5.1__py3-none-any.whl → 5.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydna/assembly2.py ADDED
@@ -0,0 +1,2627 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Improved implementation of the assembly module. To see a list of issues with the previous implementation,
4
+ see [issues tagged with fixed-with-new-assembly-model](https://github.com/pydna-group/pydna/issues?q=is%3Aissue%20state%3Aopen%20label%3Afixed-with-new-assembly-model)
5
+ """
6
+
7
+ import networkx as _nx
8
+ import itertools as _itertools
9
+ from Bio.SeqFeature import SimpleLocation, Location
10
+ from Bio.Seq import reverse_complement
11
+ from Bio.Restriction.Restriction import RestrictionBatch
12
+ import regex
13
+ import copy
14
+
15
+ from pydna.utils import (
16
+ shift_location as _shift_location,
17
+ flatten,
18
+ location_boundaries as _location_boundaries,
19
+ locations_overlap as _locations_overlap,
20
+ sum_is_sticky,
21
+ limit_iterator,
22
+ create_location,
23
+ )
24
+ from pydna._pretty import pretty_str as _pretty_str
25
+ from pydna.common_sub_strings import common_sub_strings as common_sub_strings_str
26
+ from pydna.dseqrecord import Dseqrecord as _Dseqrecord
27
+ from pydna.dseq import Dseq as _Dseq
28
+ from pydna.primer import Primer as _Primer
29
+ from pydna.seqrecord import SeqRecord as _SeqRecord
30
+ from pydna.types import (
31
+ CutSiteType,
32
+ # TODO: allow user to enforce multi-site
33
+ EdgeRepresentationAssembly,
34
+ SubFragmentRepresentationAssembly,
35
+ AssemblyAlgorithmType,
36
+ SequenceOverlap,
37
+ AssemblyEdgeType,
38
+ )
39
+ from pydna.gateway import gateway_overlap, find_gateway_sites
40
+ from pydna.cre_lox import cre_loxP_overlap
41
+
42
+ from typing import TYPE_CHECKING, Callable
43
+
44
+ if TYPE_CHECKING:
45
+ from Bio.Restriction import AbstractCut as _AbstractCut
46
+
47
+
48
def gather_overlapping_locations(
    locs: list[Location], fragment_length: int
) -> list[tuple[Location, ...]]:
    """
    Group locations that overlap, directly or through a chain of overlaps.

    Every location becomes a node of an undirected graph and an edge joins each
    pair for which `_locations_overlap` is true; the connected components of that
    graph are the groups. For example, if locs = [loc1, loc2, loc3], and only
    loc1 and loc2 overlap, the output will be [(loc1, loc2), (loc3,)].

    Args:
        locs (list[Location]): The locations to group
        fragment_length (int): Passed unchanged to `_locations_overlap`

    Returns:
        list[tuple[Location, ...]]: One tuple per group, sorted by the first
            boundary of the first location in each group
    """
    overlap_graph = _nx.Graph()
    for index, location in enumerate(locs):
        overlap_graph.add_node(index, location=location)

    # Test every unordered pair once
    for first, second in _itertools.combinations(range(len(locs)), 2):
        if _locations_overlap(locs[first], locs[second], fragment_length):
            overlap_graph.add_edge(first, second)

    groups = [
        tuple(locs[index] for index in component)
        for component in _nx.connected_components(overlap_graph)
    ]

    # Any member could serve as the sort key, since members of a group overlap
    groups.sort(key=lambda group: _location_boundaries(group[0])[0])
    return groups
76
+
77
+
78
def ends_from_cutsite(
    cutsite: CutSiteType, seq: _Dseq
) -> tuple[tuple[str, str], tuple[str, str]]:
    """Describe the two ends produced when a restriction enzyme cuts a sequence.

    Args:
        cutsite (CutSiteType): A tuple ((cut_watson, ovhg), enzyme) describing where the cut occurs
        seq (_Dseq): The DNA sequence being cut

    Raises:
        ValueError: If cutsite is None

    Returns:
        tuple[tuple[str, str], tuple[str, str]]: Two (end type, overhang sequence) pairs, where the
            end type is '5\'', '3\'' or 'blunt' and the overhang sequence is lower case.
            The first pair is for the left end, the second for the right end.

    >>> from Bio.Restriction import NotI
    >>> x = _Dseq("ctcgGCGGCCGCcagcggccg")
    >>> x.get_cutsites(NotI)
    [((6, -4), NotI)]
    >>> ends_from_cutsite(x.get_cutsites(NotI)[0], x)
    (("5'", 'ggcc'), ("5'", 'ggcc'))
    """
    if cutsite is None:
        raise ValueError("None is not supported")

    cut_watson, cut_crick, ovhg = seq.get_cut_parameters(cutsite, is_left=None)

    if ovhg < 0:
        # TODO check the edge in circular
        overhang = seq[cut_watson:cut_crick]
        left_end = ("5'", str(overhang.reverse_complement()).lower())
        right_end = ("5'", str(overhang).lower())
        return (left_end, right_end)

    if ovhg > 0:
        overhang = seq[cut_crick:cut_watson]
        left_end = ("3'", str(overhang).lower())
        right_end = ("3'", str(overhang.reverse_complement()).lower())
        return (left_end, right_end)

    # ovhg == 0: both ends are blunt and carry no overhang sequence
    return ("blunt", ""), ("blunt", "")
119
+
120
+
121
def restriction_ligation_overlap(
    seqx: _Dseqrecord,
    seqy: _Dseqrecord,
    enzymes=RestrictionBatch,
    partial=False,
    allow_blunt=False,
) -> list[SequenceOverlap]:
    """Assembly algorithm to find overlaps that would result from restriction and ligation.

    Every cut site found in seqx is paired with every cut site found in seqy. A pair
    produces a match when the ends created by the two cuts are compatible according
    to `sum_is_sticky`, or when both cuts are blunt and `allow_blunt` is set.

    Like in sticky and gibson, the order matters (see example below of partial overlap)

    Args:
        seqx (_Dseqrecord): The first sequence
        seqy (_Dseqrecord): The second sequence
        enzymes (RestrictionBatch): The enzymes to use. It is unpacked with `*enzymes`,
            so any iterable of enzymes works (the examples below pass lists).
        partial (bool): Whether to allow partial overlaps
        allow_blunt (bool): Whether to allow blunt ends
    Returns:
        list[SequenceOverlap]: A list of overlaps between the two sequences, as
            (position on seqx, position on seqy, overlap length) tuples

    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.assembly2 import restriction_ligation_overlap
    >>> from Bio.Restriction import EcoRI, RgaI, DrdI, EcoRV
    >>> x = Dseqrecord("ccGAATTCaa")
    >>> y = Dseqrecord("aaaaGAATTCgg")
    >>> restriction_ligation_overlap(x, y, [EcoRI])
    [(3, 5, 4)]
    >>> restriction_ligation_overlap(y, x, [EcoRI])
    [(5, 3, 4)]

    Partial overlap, note how it is not symmetric

    >>> x = Dseqrecord("GACTAAAGGGTC")
    >>> y = Dseqrecord("AAGCGATCGCAAGCGATCGCAA")
    >>> restriction_ligation_overlap(x, y, [RgaI, DrdI], partial=True)
    [(6, 5, 1), (6, 15, 1)]
    >>> restriction_ligation_overlap(y, x, [RgaI, DrdI], partial=True)
    []

    Blunt overlap, returns length of the overlap 0

    >>> x = Dseqrecord("aaGATATCcc")
    >>> y = Dseqrecord("ttttGATATCaa")
    >>> restriction_ligation_overlap(x, y, [EcoRV], allow_blunt=True)
    [(5, 7, 0)]
    >>> restriction_ligation_overlap(y, x, [EcoRV], allow_blunt=True)
    [(7, 5, 0)]

    """
    # NOTE(review): the default value of `enzymes` is the RestrictionBatch class
    # itself rather than an instance, so relying on the default presumably fails
    # at the `*enzymes` unpacking below — confirm whether a type annotation was intended.
    cuts_x = seqx.seq.get_cutsites(*enzymes)
    cuts_y = seqy.seq.get_cutsites(*enzymes)
    # If blunt ends are allowed, something similar to this could be done to allow
    # joining with linear sequence ends, but for now it messes up with the only_adjacent_edges
    # case
    # if allow_blunt:
    #     if not seqx.circular:
    #         cuts_x.append(((len(seqx), 0), None))
    #     if not seqy.circular:
    #         cuts_y.append(((0, 0), None))
    matches = list()
    for cut_x, cut_y in _itertools.product(cuts_x, cuts_y):
        # A blunt end: both cuts have an overhang of 0, so the match has length 0
        # and is located at the two cut positions
        if allow_blunt and cut_x[0][1] == cut_y[0][1] == 0:
            matches.append((cut_x[0][0], cut_y[0][0], 0))
            continue

        # Otherwise, test overhangs: the first end reported for the cut in seqx
        # against the second end reported for the cut in seqy
        overlap = sum_is_sticky(
            ends_from_cutsite(cut_x, seqx.seq)[0],
            ends_from_cutsite(cut_y, seqy.seq)[1],
            partial,
        )
        # A falsy overlap means the ends are not compatible
        if not overlap:
            continue
        x_watson, x_crick, x_ovhg = seqx.seq.get_cut_parameters(cut_x, is_left=False)
        y_watson, y_crick, y_ovhg = seqy.seq.get_cut_parameters(cut_y, is_left=True)
        # Positions where the overlap would start for full overlap
        left_x = x_watson if x_ovhg < 0 else x_crick
        left_y = y_watson if y_ovhg < 0 else y_crick

        # Correct for partial overlaps: only the last `overlap` bases of the
        # overhang on seqx take part in the match
        left_x += abs(x_ovhg) - overlap

        matches.append((left_x, left_y, overlap))
    return matches
206
+
207
+
208
def combine_algorithms(*algorithms: AssemblyAlgorithmType) -> AssemblyAlgorithmType:
    """
    Build a single assembly algorithm out of several.

    The returned function calls each algorithm in the order given, with the same
    (seqx, seqy, limit) arguments, and returns all their matches concatenated.

    This can be used for example in a ligation where you want to allow both sticky and blunt end ligation.

    Args:
        *algorithms (AssemblyAlgorithmType): The algorithms to combine

    Returns:
        AssemblyAlgorithmType: A function taking (seqx, seqy, limit) and returning a list of matches
    """

    def combined(seqx, seqy, limit):
        return [
            match
            for algorithm in algorithms
            for match in algorithm(seqx, seqy, limit)
        ]

    return combined
222
+
223
+
224
def blunt_overlap(
    seqx: _Dseqrecord, seqy: _Dseqrecord, limit=None
) -> list[SequenceOverlap]:
    """
    Assembly algorithm for blunt ligation.

    When the three prime end of seqx and the five prime end of seqy are both
    blunt, the zero-length match [(len(seqx), 0, 0)] is returned. Otherwise,
    an empty list is returned.

    Args:
        seqx (_Dseqrecord): The first sequence
        seqy (_Dseqrecord): The second sequence
        limit (int): There for compatibility, but it is ignored

    Returns:
        list[SequenceOverlap]: A list of overlaps between the two sequences

    >>> from pydna.assembly2 import blunt_overlap
    >>> from pydna.dseqrecord import Dseqrecord
    >>> x = Dseqrecord("AAAAAA")
    >>> y = Dseqrecord("TTTTTT")
    >>> blunt_overlap(x, y)
    [(6, 0, 0)]
    """
    end_type_x = seqx.seq.three_prime_end()[0]
    if end_type_x != "blunt":
        return []

    end_type_y = seqy.seq.five_prime_end()[0]
    if end_type_y != "blunt":
        return []

    return [(len(seqx), 0, 0)]
254
+
255
+
256
def common_sub_strings(
    seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25
) -> list[SequenceOverlap]:
    """
    Assembly algorithm to find common substrings of at least `limit` bases. See the docs of
    the function common_sub_strings_str for more details. It is case insensitive.

    Circular sequences are searched on a doubled copy of their sequence so that
    matches spanning the origin are found; the results are then cleaned up so that
    each match starts within the sequence and is not longer than either sequence.

    Args:
        seqx (_Dseqrecord): The first sequence
        seqy (_Dseqrecord): The second sequence
        limit (int): Passed to common_sub_strings_str as the length threshold

    Returns:
        list[SequenceOverlap]: A list of (start on seqx, start on seqy, length) matches

    >>> from pydna.dseqrecord import Dseqrecord
    >>> x = Dseqrecord("TAAAAAAT")
    >>> y = Dseqrecord("CCaAaAaACC")
    >>> common_sub_strings(x, y, limit=5)
    [(1, 2, 6), (1, 3, 5), (2, 2, 5)]
    """
    # Upper-case both sequences so that the comparison is case insensitive
    query_seqx = str(seqx.seq).upper()
    query_seqy = str(seqy.seq).upper()
    # Double circular sequences so that substrings spanning the origin are found
    if seqx.circular:
        query_seqx = query_seqx * 2
    if seqy.circular:
        query_seqy = query_seqy * 2
    results = common_sub_strings_str(query_seqx, query_seqy, limit)

    # Linear vs. linear needs no post-processing
    if not seqx.circular and not seqy.circular:
        return results

    # Remove matches that start on the second copy of the sequence
    if seqx.circular:
        results = [r for r in results if r[0] < len(seqx)]
    if seqy.circular:
        results = [r for r in results if r[1] < len(seqy)]

    # Trim lengths that span more than the sequence
    # (this condition is always true here, because of the early return above)
    if seqx.circular or seqy.circular:
        max_match_length = min(len(seqx), len(seqy))
        results = [(r[0], r[1], min(r[2], max_match_length)) for r in results]

    # Edge case where the sequences are identical: both have the same length and
    # a match covers that whole length, so only that match is returned
    if len(seqx.seq) == len(seqy.seq):
        full_match = next((r for r in results if r[2] == len(seqx.seq)), None)
        if full_match is not None:
            return [full_match]

    # Remove duplicate matches, see example below
    # Let's imagine the following two sequences, where either seqy or both are circular
    # seqx: 01234
    # seqy: 123450, circular
    #
    # common_sub_strings would return [(0, 5, 5), (1, 0, 4)]
    # Actually, (1, 0, 4) is a subset of (0, 5, 5), the part
    # that does not span the origin. To remove matches like this,
    # We find matches where the origin is spanned in one of the sequences
    # only, and then remove the subset of that match that does not span the origin.
    shifted_matches = set()
    for x, y, length in results:
        x_span_origin = seqx.circular and x + length > len(seqx)
        y_span_origin = seqy.circular and y + length > len(seqy)
        if x_span_origin and not y_span_origin:
            # Part of the match located after the origin of seqx
            shift = len(seqx) - x
            shifted_matches.add((0, y + shift, length - shift))
        elif not x_span_origin and y_span_origin:
            # Part of the match located after the origin of seqy
            shift = len(seqy) - y
            shifted_matches.add((x + shift, 0, length - shift))
    return [r for r in results if r not in shifted_matches]
318
+
319
+
320
def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
    """
    Assembly algorithm to find terminal overlaps (e.g. for Gibson assembly).

    Only matches that end at the right end of seqx and start at the left end of
    seqy are kept (both ends taken after trimming, see the comment in the code).
    The order matters, we want alignments like:

    ```
    seqx:    oooo------xxxx
    seqy:              xxxx------oooo
    Product: oooo------xxxx------oooo

    Not like:

    seqx:              oooo------xxxx
    seqy:    xxxx------oooo
    Product (unwanted): oooo
    ```

    Args:
        seqx (_Dseqrecord): The first sequence
        seqy (_Dseqrecord): The second sequence
        limit (int): Minimum length of the overlap

    Returns:
        list[SequenceOverlap]: A list of overlaps between the two sequences

    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.assembly2 import gibson_overlap
    >>> x = Dseqrecord("ttactaAAAAAA")
    >>> y = Dseqrecord("AAAAAAcgcacg")
    >>> gibson_overlap(x, y, limit=5)
    [(6, 0, 6), (7, 0, 5)]
    >>> gibson_overlap(y, x, limit=5)
    []
    """

    def _trim_bounds(dseq):
        # Slice bounds that drop the bases selected by a negative `ovhg` (left)
        # and a negative `watson_ovhg()` (right); nothing is dropped otherwise.
        left = -dseq.ovhg if dseq.ovhg < 0 else 0
        right_ovhg = dseq.watson_ovhg()
        right = right_ovhg if right_ovhg < 0 else None
        return left, right

    # Because Gibson enzymes remove 5' overhangs, they are left out of the search
    # for homology, and the match positions are shifted back afterwards.
    # This is only relevant for linear fragments, so there is no need to worry
    # about shifting locations for circular fragments.
    trim_x_left, trim_x_right = _trim_bounds(seqx.seq)
    trim_y_left, trim_y_right = _trim_bounds(seqy.seq)

    stringx = str(seqx.seq[trim_x_left:trim_x_right]).upper()
    stringy = str(seqy.seq[trim_y_left:trim_y_right]).upper()

    return [
        (start_x + trim_x_left, start_y + trim_y_left, length)
        for start_x, start_y, length in common_sub_strings_str(stringx, stringy, limit)
        if start_y == 0 and start_x + length == len(stringx)
    ]
378
+
379
+
380
def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit: bool = False):
    """
    Assembly algorithm for ligation of sticky ends.

    The three prime end of seqx is compared with the five prime end of seqy
    using `sum_is_sticky`. For now, if limit 0 / False (default) only full
    overlaps are considered. Otherwise, partial overlaps are also returned.

    Args:
        seqx (_Dseqrecord): The first sequence
        seqy (_Dseqrecord): The second sequence
        limit (bool): Whether to allow partial overlaps

    Returns:
        list[SequenceOverlap]: A list of overlaps between the two sequences


    Ligation of fully overlapping sticky ends, note how the order matters

    >>> from pydna.dseq import Dseq
    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.assembly2 import sticky_end_sub_strings
    >>> x = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 0, 3))
    >>> y = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 3, 0))
    >>> sticky_end_sub_strings(x, y, limit=False)
    [(3, 0, 3)]
    >>> sticky_end_sub_strings(y, x, limit=False)
    []

    Ligation of partially overlapping sticky ends, specified with limit=True

    >>> x = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 0, 2))
    >>> y = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 3, 0))
    >>> sticky_end_sub_strings(x, y, limit=False)
    []
    >>> sticky_end_sub_strings(x, y, limit=True)
    [(4, 0, 2)]

    """
    right_end_of_x = seqx.seq.three_prime_end()
    left_end_of_y = seqy.seq.five_prime_end()
    overlap = sum_is_sticky(right_end_of_x, left_end_of_y, limit)

    # A falsy overlap means the two ends cannot be ligated
    if not overlap:
        return []

    # The match covers the last `overlap` bases of seqx and the first of seqy
    return [(len(seqx) - overlap, 0, overlap)]
424
+
425
+
426
def zip_match_leftwards(
    seqx: _SeqRecord, seqy: _SeqRecord, match: SequenceOverlap
) -> SequenceOverlap:
    """
    Starting from the rightmost edge of the match, return a new match encompassing the max
    number of bases. This can be used to return a longer match if a primer aligns for longer
    than the limit or a shorter match if there are mismatches. This is convenient to maintain
    as many features as possible. It is used in PCR assembly.

    The comparison is done on the strings returned by seqrecord2_uppercase_DNA_string.

    Args:
        seqx (_SeqRecord): The first sequence
        seqy (_SeqRecord): The second sequence
        match (SequenceOverlap): (start on seqx, start on seqy, length); only the
            end of the match (start + length) on each sequence is used

    Returns:
        SequenceOverlap: (start on seqx, start on seqy, number of matching bases)

    >>> seq = _Dseqrecord('AAAAACGTCCCGT')
    >>> primer = _Dseqrecord('ACGTCCCGT')
    >>> match = (13, 9, 0) # an empty match at the end of each
    >>> zip_match_leftwards(seq, primer, match)
    (4, 0, 9)

    Works in circular molecules if the match spans the origin:
    >>> seq = _Dseqrecord('TCCCGTAAAAACG', circular=True)
    >>> primer = _Dseqrecord('ACGTCCCGT')
    >>> match = (6, 9, 0)
    >>> zip_match_leftwards(seq, primer, match)
    (10, 0, 9)

    """

    query_x = seqrecord2_uppercase_DNA_string(seqx)
    query_y = seqrecord2_uppercase_DNA_string(seqy)

    # In circular sequences, the match may go beyond the left-most edge of the sequence if it spans
    # the origin:
    # Primer:          ACGTCCCGT
    #                  |||||||||
    # Circular seq:    ACGTCCCGT  -> Equivalent to Dseqrecord('CCCGTACGT', circular=True)
    #                      ^
    #                    Origin
    # We would start from the last T and move leftwards, but we would stop at the origin
    # For those cases we shift by length, then go back

    # End of the match on x, moved to the second copy of the doubled string
    # when x is circular
    end_on_x = match[0] + match[2]
    if isinstance(seqx, _Dseqrecord) and seqx.circular and end_on_x <= len(seqx):
        end_on_x += len(seqx)

    # Same for y
    end_on_y = match[1] + match[2]
    if isinstance(seqy, _Dseqrecord) and seqy.circular and end_on_y <= len(seqy):
        end_on_y += len(seqy)

    # Walk leftwards from both ends, counting bases until the first mismatch
    # or until one of the strings runs out
    count = 0
    for x, y in zip(reversed(query_x[:end_on_x]), reversed(query_y[:end_on_y])):
        if x != y:
            break
        count += 1

    # Shift back by length if needed
    start_on_x = (end_on_x - count) % len(seqx)
    start_on_y = (end_on_y - count) % len(seqy)

    return (start_on_x, start_on_y, count)
482
+
483
+
484
def zip_match_rightwards(
    seqx: _Dseqrecord, seqy: _Dseqrecord, match: SequenceOverlap
) -> SequenceOverlap:
    """
    Same as zip_match_leftwards, but towards the right.

    Starting from the start positions of the match, count how many consecutive
    bases are equal in both sequences. The length of the input match is ignored.

    Args:
        seqx (_Dseqrecord): The first sequence
        seqy (_Dseqrecord): The second sequence
        match (SequenceOverlap): (start on seqx, start on seqy, length)

    Returns:
        SequenceOverlap: (start on seqx, start on seqy, number of matching bases)
    """

    query_x = seqrecord2_uppercase_DNA_string(seqx)
    query_y = seqrecord2_uppercase_DNA_string(seqy)

    start_on_x, start_on_y, _ = match
    paired_bases = zip(query_x[start_on_x:], query_y[start_on_y:])

    # Length of the leading run of identical base pairs
    matching_run = _itertools.takewhile(lambda pair: pair[0] == pair[1], paired_bases)
    count = sum(1 for _pair in matching_run)

    return (start_on_x, start_on_y, count)
499
+
500
+
501
def seqrecord2_uppercase_DNA_string(seqr: _SeqRecord) -> str:
    """
    Return the sequence of a record as an upper case string with U replaced by T.

    For circular Dseqrecords the string is repeated twice. This is used for PCR,
    to support primers with U's (e.g. for USER cloning).

    Args:
        seqr (_SeqRecord): The record to convert

    Returns:
        str: The upper case DNA string
    """
    dna = str(seqr.seq).upper().replace("U", "T")
    is_circular = isinstance(seqr, _Dseqrecord) and seqr.circular
    return dna * 2 if is_circular else dna
510
+
511
+
512
def primer_template_overlap(
    seqx: _Dseqrecord | _Primer, seqy: _Dseqrecord | _Primer, limit=25, mismatches=0
) -> list[SequenceOverlap]:
    """
    Assembly algorithm to find overlaps between a primer and a template. It accepts mismatches.
    When there are mismatches, it only returns the common part between the primer and the template.

    If seqx is a primer and seqy is a template, it represents the binding of a forward primer.
    If seqx is a template and seqy is a primer, it represents the binding of a reverse primer,
    where the primer has been passed as its reverse complement (see examples).

    Args:
        seqx (_Dseqrecord | _Primer): The primer (forward binding) or the template (reverse binding)
        seqy (_Dseqrecord | _Primer): The template (forward binding) or the primer (reverse binding)
        limit (int): Minimum length of the overlap
        mismatches (int): Maximum number of mismatches (only substitutions, no deletion or insertion)

    Raises:
        ValueError: If the arguments are not one _Primer and one _Dseqrecord

    Returns:
        list[SequenceOverlap]: A sorted list of overlaps between the primer and the template

    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.primer import Primer
    >>> from pydna.assembly2 import primer_template_overlap
    >>> template = Dseqrecord("AATTAGCAGCGATCGAGT", circular=True)
    >>> primer = Primer("TTAGCAGC")
    >>> primer_template_overlap(primer, template, limit=8, mismatches=0)
    [(0, 2, 8)]

    This actually represents the binding of the primer `GCTGCTAA` (reverse complement)
    >>> primer_template_overlap(template, primer, limit=8, mismatches=0)
    [(2, 0, 8)]
    >>> primer_template_overlap(primer, template.reverse_complement(), limit=8, mismatches=0)
    []
    >>> primer_template_overlap(primer.reverse_complement(), template, limit=8, mismatches=0)
    []
    """

    # Work out which argument is the primer; the argument order encodes the
    # direction of the primer
    if isinstance(seqx, _Primer) and isinstance(seqy, _Dseqrecord):
        primer = seqx
        template = seqy
        reverse_primer = False
    elif isinstance(seqx, _Dseqrecord) and isinstance(seqy, _Primer):
        primer = seqy
        template = seqx
        reverse_primer = True
    else:
        raise ValueError(
            "One of the sequences must be a primer and the other a Dseqrecord"
        )

    # A primer shorter than the limit cannot produce a match of that length
    if len(primer) < limit:
        return []

    # The search uses `limit` bases of the primer: the first ones for a
    # reverse primer, the last ones for a forward primer
    subject = seqrecord2_uppercase_DNA_string(template)
    query = (
        seqrecord2_uppercase_DNA_string(primer[:limit])
        if reverse_primer
        else seqrecord2_uppercase_DNA_string(primer[-limit:])
    )

    # Fuzzy search allowing up to `mismatches` substitutions, scanning forwards...
    re_matches = list(
        regex.finditer(
            "(" + query + "){s<=" + str(mismatches) + "}", subject, overlapped=True
        )
    )
    # ...and with the (?r) flag; matches found by both searches collapse in the
    # `out` set below
    # NOTE(review): the second search presumably catches fuzzy matches that the
    # forward scan skips — confirm against the regex module documentation
    re_matches += list(
        regex.finditer(
            "(?r)(" + query + "){s<=" + str(mismatches) + "}", subject, overlapped=True
        )
    )

    out = set()
    for re_match in re_matches:

        start, end = re_match.span()

        # For circular sequences the same match is returned twice unless it falls
        # on the origin, we eliminate duplicates here
        if start >= len(template):
            continue

        # This extends match beyond the limit if the primer aligns more than that
        # and reduces the match if the primer has mismatches
        if reverse_primer:
            # Match in the same format as other assembly algorithms
            starting_match = (start, 0, end - start)
            out.add(zip_match_rightwards(template, primer, starting_match))
        else:
            # Match in the same format as other assembly algorithms
            starting_match = (len(primer) - limit, start, end - start)
            out.add(zip_match_leftwards(primer, template, starting_match))

    return list(sorted(out))
605
+
606
+
607
def fill_left(seq: _Dseq) -> _Dseq:
    """
    Fill the left overhang of a sequence with the complementary sequence.

    Args:
        seq (_Dseq): The sequence to fill

    Returns:
        _Dseq: A new sequence built with ovhg=0; the input is not modified
    """
    left_ovhg = seq.ovhg
    watson, crick = seq.watson, seq.crick

    if left_ovhg < 0:
        # The first -ovhg bases of watson are unpaired: append their
        # reverse complement to crick
        crick = crick + reverse_complement(seq.watson[:-left_ovhg])
    elif left_ovhg > 0:
        # The last ovhg bases of crick are unpaired: prepend their
        # reverse complement to watson
        watson = reverse_complement(seq.crick[-left_ovhg:]) + watson

    return _Dseq(watson, crick, 0)
620
+
621
+
622
def fill_right(seq: _Dseq) -> _Dseq:
    """
    Fill the right overhang of a sequence with the complementary sequence.

    Args:
        seq (_Dseq): The sequence to fill

    Returns:
        _Dseq: A new sequence with the same left overhang (ovhg) as the input;
            the input is not modified
    """
    right_ovhg = seq.watson_ovhg()
    watson, crick = seq.watson, seq.crick

    if right_ovhg < 0:
        # The first -watson_ovhg bases of crick are unpaired: append their
        # reverse complement to watson
        watson = watson + reverse_complement(seq.crick[:-right_ovhg])
    elif right_ovhg > 0:
        # The last watson_ovhg bases of watson are unpaired: prepend their
        # reverse complement to crick
        crick = reverse_complement(seq.watson[-right_ovhg:]) + crick

    return _Dseq(watson, crick, seq.ovhg)
637
+
638
+
639
def fill_dseq(seq: _Dseq) -> _Dseq:
    """
    Fill the overhangs of a sequence with the complementary sequence.

    The right overhang is filled first, then the left one.

    Args:
        seq (_Dseq): The sequence to fill

    Returns:
        _Dseq: The sequence with both overhangs filled
    """
    right_filled = fill_right(seq)
    return fill_left(right_filled)
642
+
643
+
644
def reverse_complement_assembly(
    assembly: EdgeRepresentationAssembly, fragments: list[_Dseqrecord]
) -> EdgeRepresentationAssembly:
    """
    Complement an assembly, i.e. reverse the order of the fragments and the orientation of the overlaps.

    Each edge (u, v, loc_u, loc_v) becomes (-v, -u, flipped loc_v, flipped loc_u),
    where each location is flipped over the length of its own fragment, and the
    order of the edges is reversed.

    Args:
        assembly (EdgeRepresentationAssembly): The assembly to complement
        fragments (list[_Dseqrecord]): The fragments, indexed by abs(id) - 1

    Returns:
        EdgeRepresentationAssembly: The complemented assembly, as a list of edges
    """
    flipped_edges = []
    for u, v, loc_u, loc_v in assembly:
        length_u = len(fragments[abs(u) - 1])
        length_v = len(fragments[abs(v) - 1])
        flipped_edges.append(
            (-v, -u, loc_v._flip(length_v), loc_u._flip(length_u))
        )
    flipped_edges.reverse()
    return flipped_edges
654
+
655
+
656
def filter_linear_subassemblies(
    linear_assemblies: list[EdgeRepresentationAssembly],
    circular_assemblies: list[EdgeRepresentationAssembly],
    fragments: list[_Dseqrecord],
) -> list[EdgeRepresentationAssembly]:
    """
    Remove linear assemblies which are sub-assemblies of circular assemblies.

    Each linear assembly is tested with `is_sublist` against every circular
    assembly and against the reverse complement of every circular assembly.

    Args:
        linear_assemblies (list[EdgeRepresentationAssembly]): The candidates to filter
        circular_assemblies (list[EdgeRepresentationAssembly]): The circular assemblies
        fragments (list[_Dseqrecord]): The fragments, needed to reverse complement assemblies

    Returns:
        list[EdgeRepresentationAssembly]: The linear assemblies that were kept, in their original order
    """
    both_orientations = circular_assemblies + [
        reverse_complement_assembly(circular, fragments)
        for circular in circular_assemblies
    ]

    kept = []
    for linear in linear_assemblies:
        if any(is_sublist(linear, circular, True) for circular in both_orientations):
            continue
        kept.append(linear)

    # The reverse complement of each linear assembly is not tested against the
    # circular ones; the original author did not think it was necessary.
    return kept
673
+
674
+
675
def remove_subassemblies(
    assemblies: list[EdgeRepresentationAssembly],
) -> list[EdgeRepresentationAssembly]:
    """Filter out subassemblies, i.e. assemblies that are contained within another assembly.

    For example:
        [(1, 2, '1[8:14]:2[1:7]'), (2, 3, '2[10:17]:3[1:8]')]
        [(1, 2, '1[8:14]:2[1:7]')]
    The second one is a subassembly of the first one.

    Args:
        assemblies (list[EdgeRepresentationAssembly]): The assemblies to filter

    Returns:
        list[EdgeRepresentationAssembly]: The assemblies that are not a sublist
            (according to `is_sublist`) of an assembly kept before them, longest first
    """
    kept = []

    # Longest first, so that an assembly is always visited before its subassemblies
    for candidate in sorted(assemblies, key=len, reverse=True):
        if any(is_sublist(candidate, longer) for longer in kept):
            continue
        kept.append(candidate)

    return kept
696
+
697
+
698
def assembly2str(assembly: EdgeRepresentationAssembly) -> str:
    """Convert an assembly to a compact string representation, for example:
    ((1, 2, [8:14], [1:7]),(2, 3, [10:17], [1:8]))
    becomes:
    ('1[8:14]:2[1:7]', '2[10:17]:3[1:8]')

    The reason for this is that by default, a feature '[8:14]' when present in a tuple
    is printed to the console as `SimpleLocation(ExactPosition(8), ExactPosition(14), strand=1)` (very long).

    Args:
        assembly (EdgeRepresentationAssembly): The assembly to convert

    Returns:
        str: The string form of a tuple with one 'u<loc_u>:v<loc_v>' string per edge
    """
    edge_strings = []
    for u, v, loc_u, loc_v in assembly:
        edge_strings.append(f"{u}{loc_u}:{v}{loc_v}")
    return str(tuple(edge_strings))
708
+
709
+
710
def assembly2str_tuple(assembly: EdgeRepresentationAssembly) -> str:
    """Convert an assembly to a string representation that keeps the tuple structure, like
    ((1, 2, [8:14], [1:7]),(2, 3, [10:17], [1:8]))

    Each edge keeps u and v as they are, and its two locations are converted
    with `str` before the whole tuple is converted to a string.

    Args:
        assembly (EdgeRepresentationAssembly): The assembly to convert

    Returns:
        str: The string form of a tuple of (u, v, str(loc_u), str(loc_v)) tuples
    """
    edges = []
    for u, v, loc_u, loc_v in assembly:
        edges.append((u, v, str(loc_u), str(loc_v)))
    return str(tuple(edges))
715
+
716
+
717
def assembly_has_mismatches(
    fragments: list[_Dseqrecord], assembly: EdgeRepresentationAssembly
) -> bool:
    """
    Check if an assembly has mismatches, which should never happen.

    For every edge, the sequence extracted at the location on the first fragment
    is compared (case insensitive) with the one extracted at the location on the
    second fragment. Negative fragment ids refer to the reverse complement of
    the fragment.

    Args:
        fragments (list[_Dseqrecord]): The fragments, indexed by abs(id) - 1
        assembly (EdgeRepresentationAssembly): The assembly to check

    Returns:
        bool: True if the two sequences differ for at least one edge, False otherwise
    """

    def _oriented(fragment_id):
        if fragment_id > 0:
            return fragments[fragment_id - 1]
        return fragments[-fragment_id - 1].reverse_complement()

    for u, v, loc_u, loc_v in assembly:
        seq_u, seq_v = _oriented(u), _oriented(v)
        # TODO: Check issue where extraction failed, and whether it would give problems here
        overlap_on_u = str(loc_u.extract(seq_u).seq).upper()
        overlap_on_v = str(loc_v.extract(seq_v).seq).upper()
        if overlap_on_u != overlap_on_v:
            return True
    return False
731
+
732
+
733
def assembly_is_circular(
    assembly: EdgeRepresentationAssembly, fragments: list[_Dseqrecord]
) -> bool:
    """
    Based on the topology of the locations of an assembly, determine if it is circular.
    This does not work for insertion assemblies, that's why assemble takes the optional argument is_insertion.

    Args:
        assembly (EdgeRepresentationAssembly): The assembly to check
        fragments (list[_Dseqrecord]): The fragments, indexed by abs(id) - 1

    Returns:
        bool: False if the assembly does not end on the fragment it starts with.
            Otherwise True if that fragment is a circular Dseqrecord, and else
            whether the location of the first edge on it starts after the
            location of the last edge on it.
    """
    first_edge, last_edge = assembly[0], assembly[-1]

    # The assembly must come back to the fragment it started from
    if first_edge[0] != last_edge[1]:
        return False

    first_fragment = fragments[abs(first_edge[0]) - 1]
    if isinstance(first_fragment, _Dseqrecord) and first_fragment.circular:
        return True

    start_of_outgoing = _location_boundaries(first_edge[2])[0]
    start_of_incoming = _location_boundaries(last_edge[3])[0]
    return start_of_outgoing > start_of_incoming
752
+
753
+
754
def assemble(
    fragments: list[_Dseqrecord],
    assembly: EdgeRepresentationAssembly,
    is_insertion: bool = False,
) -> _Dseqrecord:
    """Generate a Dseqrecord from an assembly and a list of fragments.

    Args:
        fragments (list[_Dseqrecord]): The fragments, indexed by abs(id) - 1. Items
            that are not Dseqrecords (e.g. primers) are converted to Dseqrecords.
        assembly (EdgeRepresentationAssembly): The assembly, as (u, v, loc_u, loc_v) edges
        is_insertion (bool): If True the product is treated as linear, without
            checking the topology of the assembly

    Raises:
        ValueError: If the sequences at the two locations of an edge differ

    Returns:
        _Dseqrecord: The assembled sequence, with the features of the fragments
    """

    # Insertion assemblies are linear even though assembly_is_circular would
    # not be able to tell
    if is_insertion:
        is_circular = False
    else:
        is_circular = assembly_is_circular(assembly, fragments)

    subfragment_representation = edge_representation2subfragment_representation(
        assembly, is_circular
    )

    # Sanity check: both locations of each edge must contain the same sequence
    # (case insensitive). Negative ids refer to the reverse complement.
    for asm_edge in assembly:
        u, v, loc_u, loc_v = asm_edge
        f_u = fragments[u - 1] if u > 0 else fragments[-u - 1].reverse_complement()
        f_v = fragments[v - 1] if v > 0 else fragments[-v - 1].reverse_complement()
        seq_u = str(loc_u.extract(f_u).seq).upper()
        seq_v = str(loc_v.extract(f_v).seq).upper()
        if seq_u != seq_v:
            raise ValueError("Mismatch in assembly")

    # We transform into Dseqrecords (for primers)
    dseqr_fragments = [
        f if isinstance(f, _Dseqrecord) else _Dseqrecord(f) for f in fragments
    ]
    subfragments = get_assembly_subfragments(
        dseqr_fragments, subfragment_representation
    )

    # Length of the overlaps between consecutive assembly fragments
    # (taken from the location on the second fragment of each edge)
    fragment_overlaps = [len(e[-1]) for e in assembly]

    # Start from the first subfragment and append the others one by one
    out_dseqrecord = _Dseqrecord(subfragments[0])

    for fragment, overlap in zip(subfragments[1:], fragment_overlaps):
        # Shift the features of the right fragment to the left by `overlap`
        new_features = [
            f._shift(len(out_dseqrecord) - overlap) for f in fragment.features
        ]
        # Join the left sequence including the overlap with the right sequence without the overlap
        # we use fill_right / fill_left so that it works for ligation of sticky ends
        out_dseqrecord = _Dseqrecord(
            fill_right(out_dseqrecord.seq) + fill_left(fragment.seq)[overlap:],
            features=out_dseqrecord.features + new_features,
        )

    # For circular assemblies, close the loop and wrap origin-spanning features
    if is_circular:
        # The last edge joins the end of the product with its beginning
        overlap = fragment_overlaps[-1]

        # Special case for blunt circularisation
        if overlap == 0:
            return out_dseqrecord.looped()

        # Remove trailing overlap
        out_dseqrecord = _Dseqrecord(
            fill_dseq(out_dseqrecord.seq)[:-overlap],
            features=out_dseqrecord.features,
            circular=True,
        )
        for feature in out_dseqrecord.features:
            start, end = _location_boundaries(feature.location)
            # Features reaching into the removed overlap now lie beyond the end
            # of the sequence
            if start >= len(out_dseqrecord) or end > len(out_dseqrecord):
                # Wrap around the origin
                feature.location = _shift_location(
                    feature.location, 0, len(out_dseqrecord)
                )

    return out_dseqrecord
828
+
829
+
830
def annotate_primer_binding_sites(
    input_dseqr: _Dseqrecord, fragments: list[_Dseqrecord]
) -> _Dseqrecord:
    """
    Annotate the primer binding sites in a Dseqrecord.

    A "primer_bind" feature as long as the forward primer is added at the start
    of the sequence (strand 1) and one as long as the reverse primer at the end
    (strand -1). The input is not modified, a deep copy is annotated instead.

    Args:
        input_dseqr (_Dseqrecord): The sequence to annotate
        fragments (list[_Dseqrecord]): Exactly three items: forward primer,
            an item that is ignored, and reverse primer

    Returns:
        _Dseqrecord: An annotated copy of the input
    """
    fwd, _, rvs = fragments
    start_rvs = len(input_dseqr) - len(rvs)

    annotated = copy.deepcopy(input_dseqr)

    def _add_primer_bind(primer, start, end, strand):
        annotated.add_feature(
            x=start,
            y=end,
            type_="primer_bind",
            strand=strand,
            label=[primer.name],
            note=["sequence: " + str(primer.seq)],
        )

    _add_primer_bind(fwd, 0, len(fwd), 1)
    _add_primer_bind(rvs, start_rvs, len(annotated), -1)
    return annotated
855
+
856
+
857
def edge_representation2subfragment_representation(
    assembly: EdgeRepresentationAssembly, is_circular: bool
) -> SubFragmentRepresentationAssembly:
    """Convert an assembly from edge representation to subfragment representation.

    Edge representation lists joins as (fragment 1, fragment 2, right edge on 1, left edge on 2):
        a = [(1, 2, 'loc1a', 'loc2a'), (2, 3, 'loc2b', 'loc3b'), (3, 1, 'loc3c', 'loc1c')]
    Subfragment representation lists fragments as (fragment, left edge, right edge):
        b = [(1, 'loc1c', 'loc1a'), (2, 'loc2a', 'loc2b'), (3, 'loc3b', 'loc3c')]

    For linear assemblies the left edge of the first fragment and the right edge
    of the last fragment are ``None``.
    """
    if is_circular:
        # The last edge closes the loop, so it provides the left edge of the first fragment
        padded = [*assembly[-1:], *assembly]
    else:
        # Mock edges with no locations stand in for the open ends
        opening = (None, assembly[0][0], None, None)
        closing = (assembly[-1][1], None, None, None)
        padded = [opening, *assembly, closing]

    # Each fragment is the target of one edge and the source of the next one
    return tuple(
        (node, left_location, right_location)
        for (_, node, _, left_location), (_, _, right_location, _) in zip(
            padded, padded[1:]
        )
    )
881
+
882
+
883
def subfragment_representation2edge_representation(
    assembly: SubFragmentRepresentationAssembly, is_circular: bool
) -> EdgeRepresentationAssembly:
    """Convert an assembly from subfragment representation to edge representation.

    Subfragment representation lists fragments as (fragment, left edge, right edge):
        a = [(1, 'loc1c', 'loc1a'), (2, 'loc2a', 'loc2b'), (3, 'loc3b', 'loc3c')]
    Edge representation lists joins as (fragment 1, fragment 2, right edge on 1, left edge on 2):
        b = [(1, 2, 'loc1a', 'loc2a'), (2, 3, 'loc2b', 'loc3b'), (3, 1, 'loc3c', 'loc1c')]

    The closing edge (last fragment back to the first) is only produced when
    ``is_circular`` is True. An empty assembly yields an empty tuple.
    """
    # Nothing to join; also avoids indexing into an empty circular assembly
    if not assembly:
        return ()

    # One edge per pair of consecutive subfragments
    edge_representation = [
        (frag, next_frag, right, next_left)
        for (frag, _, right), (next_frag, next_left, _) in zip(assembly, assembly[1:])
    ]

    if is_circular:
        # Add the edge from the last fragment back to the first
        frag_last, _, right_last = assembly[-1]
        frag_first, left_first, _ = assembly[0]
        edge_representation.append((frag_last, frag_first, right_last, left_first))

    return tuple(edge_representation)
909
+
910
+
911
def get_assembly_subfragments(
    fragments: list[_Dseqrecord],
    subfragment_representation: SubFragmentRepresentationAssembly,
) -> list[_Dseqrecord]:
    """Return the slices of the input fragments that get joined in an assembly.

    Takes the subfragment representation produced by
    edge_representation2subfragment_representation. Each entry is
    (node, left location, right location); positive nodes index ``fragments``
    (1-based), other nodes refer to the reverse complement of the fragment.

    For example:
    ```
      --A--
    TACGTAAT
      --B--
     TCGTAACGA

    Gives: TACGTAA / CGTAACGA
    ```
    To reproduce:
    ```
    a = Dseqrecord('TACGTAAT')
    b = Dseqrecord('TCGTAACGA')
    f = Assembly([a, b], limit=5)
    a0 = f.get_linear_assemblies()[0]
    print(assembly2str(a0))
    a0_subfragment_rep = edge_representation2subfragment_representation(a0, False)
    for f in get_assembly_subfragments([a, b], a0_subfragment_rep):
        print(f.seq)

    # prints TACGTAA and CGTAACGA
    ```
    """

    def oriented_fragment(node):
        # The sign of the node encodes the orientation of the fragment
        if node > 0:
            return fragments[node - 1]
        return fragments[-node - 1].reverse_complement()

    return [
        extract_subfragment(oriented_fragment(node), left_location, right_location)
        for node, left_location, right_location in subfragment_representation
    ]
953
+
954
+
955
def extract_subfragment(
    seq: _Dseqrecord, start_location: Location, end_location: Location
) -> _Dseqrecord:
    """Extract the part of ``seq`` used in an assembly, from the start of
    ``start_location`` to the end of ``end_location``.

    A ``None`` location means the subfragment extends to that end of the sequence.
    """
    start, end = 0, None
    if start_location is not None:
        start = _location_boundaries(start_location)[0]
    if end_location is not None:
        end = _location_boundaries(end_location)[1]

    # Special case, some of it could be handled by better Dseqrecord slicing in the future
    overlapping_on_circular = (
        seq.circular
        and start_location is not None
        and end_location is not None
        and _locations_overlap(start_location, end_location, len(seq))
    )
    if not overlapping_on_circular:
        return seq[start:end]

    # The overhang is different for origin-spanning features, for instance
    # for a feature join{[12:13], [0:3]} in a sequence of length 13, the overhang
    # is -4, not 9
    if end > start:
        ovhg = start - end
    else:
        ovhg = start - end - len(seq)
    # edge case: an overhang as long as the whole sequence is treated as no overhang
    if abs(ovhg) == len(seq):
        ovhg = 0

    # Open the circular sequence by applying the same cut on both sides
    dummy_cut = ((start, ovhg), None)
    opened = seq.apply_cut(dummy_cut, dummy_cut)
    return _Dseqrecord(fill_dseq(opened.seq), features=opened.features)
981
+
982
+
983
def is_sublist(sublist: list, my_list: list, my_list_is_cyclic: bool = False) -> bool:
    """Tell whether ``sublist`` appears as a contiguous run inside ``my_list``.

    When ``my_list_is_cyclic`` is True, ``my_list`` is concatenated with itself
    before searching, so runs that wrap around its end are also found.

    Examples
    --------
    >>> is_sublist([1, 2], [1, 2, 3], False)
    True
    >>> is_sublist([1, 2], [1, 3, 2], False)
    False

    # See the case here for cyclic lists
    >>> is_sublist([3, 1], [1, 2, 3], False)
    False
    >>> is_sublist([3, 1], [1, 2, 3], True)
    True
    """
    haystack = my_list + my_list if my_list_is_cyclic else my_list
    window = len(sublist)
    # Compare as lists, just in case tuples were passed
    target = list(sublist)
    return any(
        list(haystack[offset : offset + window]) == target
        for offset in range(len(haystack) - window + 1)
    )
1007
+
1008
+
1009
def circular_permutation_min_abs(lst: list) -> list:
    """Returns the circular permutation of lst with the smallest absolute value first.

    If several elements share the smallest absolute value, the first of them
    becomes the start. An empty input returns an empty copy instead of raising.

    Examples
    --------
    >>> circular_permutation_min_abs([1, 2, 3])
    [1, 2, 3]
    >>> circular_permutation_min_abs([3, 1, 2])
    [1, 2, 3]
    >>> circular_permutation_min_abs([])
    []
    """
    # min() raises ValueError on an empty sequence, so handle that case explicitly
    if not lst:
        return lst[:]
    min_abs_index = min(range(len(lst)), key=lambda i: abs(lst[i]))
    return lst[min_abs_index:] + lst[:min_abs_index]
1021
+
1022
+
1023
+ class Assembly:
1024
+ """Assembly of a list of DNA fragments into linear or circular constructs.
1025
+ Accepts a list of Dseqrecords (source fragments) to
1026
+ initiate an Assembly object. Several methods are available for analysis
1027
+ of overlapping sequences, graph construction and assembly.
1028
+
1029
+ The assembly contains a directed graph, where nodes represent fragments and
1030
+ edges represent overlaps between fragments:
1031
+ - The node keys are integers, representing the index of the fragment in the
1032
+ input list of fragments. The sign of the node key represents the orientation
1033
+ of the fragment, positive for forward orientation, negative for reverse orientation.
1034
+ - The edges contain the locations of the overlaps in the fragments. For an edge (u, v, key):
1035
+ - u and v are the nodes connected by the edge.
1036
+ - key is a string that represents the location of the overlap. In the format:
1037
+ 'u[start:end](strand):v[start:end](strand)'.
1038
+ - Edges have a 'locations' attribute, which is a list of two FeatureLocation objects,
1039
+ representing the location of the overlap in the u and v fragment, respectively.
1040
+ - You can think of an edge as a representation of the join of two fragments.
1041
+
1042
+ If fragment 1 and 2 share a subsequence of 6bp, [8:14] in fragment 1 and [1:7] in fragment 2,
1043
+ there will be 4 edges representing that overlap in the graph, for all possible
1044
+ orientations of the fragments (see add_edges_from_match for details):
1045
+ - `(1, 2, '1[8:14]:2[1:7]')`
1046
+ - `(2, 1, '2[1:7]:1[8:14]')`
1047
+ - `(-1, -2, '-1[0:6]:-2[10:16]')`
1048
+ - `(-2, -1, '-2[10:16]:-1[0:6]')`
1049
+
1050
+ An assembly can be thought of as a tuple of graph edges, but instead of representing them with node indexes and keys, we represent them
1051
+ as u, v, locu, locv, where u and v are the nodes connected by the edge, and locu and locv are the locations of the overlap in the first
1052
+ and second fragment. Assemblies are then represented as:
1053
+ - Linear: ((1, 2, [8:14], [1:7]), (2, 3, [10:17], [1:8]))
1054
+ - Circular: ((1, 2, [8:14], [1:7]), (2, 3, [10:17], [1:8]), (3, 1, [12:17], [1:6]))
1055
+ Note that the first and last fragment are the same in a circular assembly.
1056
+
1057
+ The following constraints are applied to remove duplicate assemblies:
1058
+ - Circular assemblies: the first subfragment is not reversed, and has the smallest index in the input fragment list.
1059
+ use_fragment_order is ignored.
1060
+ - Linear assemblies:
1061
+ - Using uid (see add_edges_from_match) to identify unique edges.
1062
+
1063
+ Parameters
1064
+ ----------
1065
+ frags : list
1066
+ A list of Dseqrecord objects.
1067
+ limit : int, optional
1068
+ The shortest shared homology to be considered, this is passed as the third argument to the `algorithm` function.
1069
+ For certain algorithms, this might be ignored.
1070
+ algorithm : function, optional
1071
+ The algorithm used to determine the shared sequences. It's a function that takes two Dseqrecord objects as inputs,
1072
+ and will get passed the third argument (limit), that may or may not be used. It must return a list of overlaps
1073
+ (see common_sub_strings for an example).
1074
+ use_fragment_order : bool, optional
1075
+ It's set to True by default to reproduce legacy pydna behaviour: only assemblies that start with the first fragment and end with the last are considered.
1076
+ You should set it to False.
1077
+ use_all_fragments : bool, optional
1078
+ Constrain the assembly to use all fragments.
1079
+
1080
+
1081
+ Examples
1082
+ --------
1083
+
1084
+ from assembly2 import Assembly, assembly2str
1085
+ from pydna.dseqrecord import Dseqrecord
1086
+
1087
+ example_fragments = (
1088
+ Dseqrecord('AacgatCAtgctcc', name='a'),
1089
+ Dseqrecord('TtgctccTAAattctgc', name='b'),
1090
+ Dseqrecord('CattctgcGAGGacgatG', name='c'),
1091
+ )
1092
+
1093
+ asm = Assembly(example_fragments, limit=5, use_fragment_order=False)
1094
+ print('Linear ===============')
1095
+ for assembly in asm.get_linear_assemblies():
1096
+ print(' ', assembly2str(assembly))
1097
+ print('Circular =============')
1098
+ for assembly in asm.get_circular_assemblies():
1099
+ print(' ', assembly2str(assembly))
1100
+
1101
+ # Prints
1102
+ Linear ===============
1103
+ ('1[8:14]:2[1:7]', '2[10:17]:3[1:8]')
1104
+ ('2[10:17]:3[1:8]', '3[12:17]:1[1:6]')
1105
+ ('3[12:17]:1[1:6]', '1[8:14]:2[1:7]')
1106
+ ('1[1:6]:3[12:17]',)
1107
+ ('2[1:7]:1[8:14]',)
1108
+ ('3[1:8]:2[10:17]',)
1109
+ Circular =============
1110
+ ('1[8:14]:2[1:7]', '2[10:17]:3[1:8]', '3[12:17]:1[1:6]')
1111
+
1112
+ """
1113
+
1114
+ def __init__(
1115
+ self,
1116
+ frags: list[_Dseqrecord],
1117
+ limit: int = 25,
1118
+ algorithm: AssemblyAlgorithmType = common_sub_strings,
1119
+ use_fragment_order: bool = True,
1120
+ use_all_fragments: bool = False,
1121
+ ):
1122
+ # TODO: allow for the same fragment to be included more than once?
1123
+ self.G = _nx.MultiDiGraph()
1124
+ # Add positive and negative nodes for forward and reverse fragments
1125
+ self.G.add_nodes_from((i + 1, {"seq": f}) for (i, f) in enumerate(frags))
1126
+ self.G.add_nodes_from(
1127
+ (-(i + 1), {"seq": f.reverse_complement()}) for (i, f) in enumerate(frags)
1128
+ )
1129
+
1130
+ # Iterate over all possible combinations of fragments
1131
+ fragment_pairs = _itertools.combinations(
1132
+ filter(lambda x: x > 0, self.G.nodes), 2
1133
+ )
1134
+ for i, j in fragment_pairs:
1135
+ # All the relative orientations of the fragments in the pair
1136
+ for u, v in _itertools.product([i, -i], [j, -j]):
1137
+ u_seq = self.G.nodes[u]["seq"]
1138
+ v_seq = self.G.nodes[v]["seq"]
1139
+ matches = algorithm(u_seq, v_seq, limit)
1140
+ for match in matches:
1141
+ self.add_edges_from_match(match, u, v, u_seq, v_seq)
1142
+
1143
+ self.fragments = frags
1144
+ self.limit = limit
1145
+ self.algorithm = algorithm
1146
+ self.use_fragment_order = use_fragment_order
1147
+ self.use_all_fragments = use_all_fragments
1148
+
1149
+ return
1150
+
1151
+ @classmethod
1152
+ def assembly_is_valid(
1153
+ cls,
1154
+ fragments: list[_Dseqrecord | _Primer],
1155
+ assembly: EdgeRepresentationAssembly,
1156
+ is_circular: bool,
1157
+ use_all_fragments: bool,
1158
+ is_insertion: bool = False,
1159
+ ) -> bool:
1160
+ """
1161
+ Returns True if the assembly is valid, False otherwise. See function comments for conditions tested.
1162
+ """
1163
+ if is_circular is None:
1164
+ return False
1165
+
1166
+ # Linear assemblies may get begin-1-end, begin-2-end, these are removed here.
1167
+ if len(assembly) == 0:
1168
+ return False
1169
+
1170
+ if use_all_fragments and len(fragments) != len(
1171
+ set(flatten(map(abs, e[:2]) for e in assembly))
1172
+ ):
1173
+ return False
1174
+
1175
+ # Here we check whether subsequent pairs of fragments are compatible, for instance:
1176
+ # Compatible (overlap of 1 and 2 occurs before overlap of 2 and 3):
1177
+ # (1,2,[2:9],[0:7]), (2,3,[12:19],[0:7])
1178
+ # -- A --
1179
+ # 1 gtatcgtgt -- B --
1180
+ # 2 atcgtgtactgtcatattc
1181
+ # 3 catattcaa
1182
+ # Incompatible (overlap of 1 and 2 occurs after overlap of 2 and 3):
1183
+ # (1,2,[2:9],[13:20]), (2,3,[0:7],[0:7])
1184
+ # -- A --
1185
+ # 1 -- B -- gtatcgtgt
1186
+ # 2 catattcccccccatcgtgtactgt
1187
+ # 3 catattcaa
1188
+ # Redundant: overlap of 1 and 2 ends at the same spot as overlap of 2 and 3
1189
+ # (1,2,[2:9],[1:8]), (2,3,[0:8],[0:8])
1190
+ # -- A --
1191
+ # gtatcgtgt
1192
+ # catcgtgtactgtcatattc
1193
+ # catcgtgtactgtcatattc
1194
+ # -- B ---
1195
+ if is_circular:
1196
+ # In a circular assembly, first and last fragment must be the same
1197
+ if assembly[0][0] != assembly[-1][1]:
1198
+ return False
1199
+ edge_pairs = zip(assembly, assembly[1:] + assembly[:1])
1200
+ else:
1201
+ edge_pairs = zip(assembly, assembly[1:])
1202
+
1203
+ for (_u1, v1, _, start_location), (_u2, _v2, end_location, _) in edge_pairs:
1204
+ # Incompatible as described in figure above
1205
+ fragment = fragments[abs(v1) - 1]
1206
+ if (
1207
+ isinstance(fragment, _Primer) or not fragment.circular
1208
+ ) and _location_boundaries(start_location)[1] >= _location_boundaries(
1209
+ end_location
1210
+ )[
1211
+ 1
1212
+ ]:
1213
+ return False
1214
+
1215
+ # Fragments are used only once
1216
+ nodes_used = [
1217
+ f[0]
1218
+ for f in edge_representation2subfragment_representation(
1219
+ assembly, is_circular or is_insertion
1220
+ )
1221
+ ]
1222
+ if len(nodes_used) != len(set(map(abs, nodes_used))):
1223
+ return False
1224
+
1225
+ return True
1226
+
1227
+ def add_edges_from_match(
1228
+ self,
1229
+ match: SequenceOverlap,
1230
+ u: int,
1231
+ v: int,
1232
+ first: _Dseqrecord,
1233
+ secnd: _Dseqrecord,
1234
+ ):
1235
+ """Add edges to the graph from a match returned by the `algorithm` function (see pydna.common_substrings). For
1236
+ format of edges (see documentation of the Assembly class).
1237
+
1238
+ Matches are directional, because not all `algorithm` functions return the same match for (u,v) and (v,u). For example,
1239
+ homologous recombination does but sticky end ligation does not. The function returns two edges:
1240
+ - Fragments in the orientation they were passed, with locations of the match (u, v, loc_u, loc_v)
1241
+ - Reverse complement of the fragments with inverted order, with flipped locations (-v, -u, flip(loc_v), flip(loc_u))/
1242
+
1243
+ """
1244
+ x_start, y_start, length = match
1245
+ if length == 0:
1246
+ # Edge case, blunt ligation
1247
+ locs = [SimpleLocation(x_start, x_start), SimpleLocation(y_start, y_start)]
1248
+ else:
1249
+ # We use shift_location with 0 to wrap origin-spanning features
1250
+ locs = [
1251
+ _shift_location(
1252
+ SimpleLocation(x_start, x_start + length), 0, len(first)
1253
+ ),
1254
+ _shift_location(
1255
+ SimpleLocation(y_start, y_start + length), 0, len(secnd)
1256
+ ),
1257
+ ]
1258
+
1259
+ # Flip the locations to get the reverse complement
1260
+ rc_locs = [locs[0]._flip(len(first)), locs[1]._flip(len(secnd))]
1261
+
1262
+ # Unique id that identifies the edge in either orientation
1263
+ uid = f"{u}{locs[0]}:{v}{locs[1]}"
1264
+
1265
+ combinations = (
1266
+ (u, v, locs),
1267
+ (-v, -u, rc_locs[::-1]),
1268
+ )
1269
+
1270
+ for u, v, l in combinations:
1271
+ self.G.add_edge(u, v, f"{u}{l[0]}:{v}{l[1]}", locations=l, uid=uid)
1272
+
1273
+ def format_assembly_edge(
1274
+ self, graph_edge: tuple[int, int, str]
1275
+ ) -> AssemblyEdgeType:
1276
+ """Go from the (u, v, key) to the (u, v, locu, locv) format."""
1277
+ u, v, key = graph_edge
1278
+ locu, locv = self.G.get_edge_data(u, v, key)["locations"]
1279
+ return u, v, locu, locv
1280
+
1281
+ def get_linear_assemblies(
1282
+ self, only_adjacent_edges: bool = False, max_assemblies: int = 50
1283
+ ) -> list[EdgeRepresentationAssembly]:
1284
+ """Get linear assemblies, applying the constrains described in __init__, ensuring that paths represent
1285
+ real assemblies (see assembly_is_valid). Subassemblies are removed (see remove_subassemblies).
1286
+ """
1287
+
1288
+ # Copy the graph since we will add the begin and end mock nodes
1289
+ G = _nx.MultiDiGraph(self.G)
1290
+ G.add_nodes_from(["begin", "end"])
1291
+
1292
+ if self.use_fragment_order:
1293
+ # Path must start with the first fragment and end with the last
1294
+ G.add_edge("begin", 1)
1295
+ G.add_edge("begin", -1)
1296
+ G.add_edge(len(self.fragments), "end")
1297
+ G.add_edge(-len(self.fragments), "end")
1298
+ else:
1299
+ for node in filter(lambda x: type(x) is int, G.nodes):
1300
+ G.add_edge("begin", node)
1301
+ G.add_edge(node, "end")
1302
+
1303
+ unique_linear_paths = self.get_unique_linear_paths(G)
1304
+ possible_assemblies = self.get_possible_assembly_number(unique_linear_paths)
1305
+ if possible_assemblies > max_assemblies:
1306
+ raise ValueError(
1307
+ f"Too many assemblies ({possible_assemblies} pre-validation) to assemble"
1308
+ )
1309
+
1310
+ assemblies = sum(
1311
+ map(lambda x: self.node_path2assembly_list(x, False), unique_linear_paths),
1312
+ [],
1313
+ )
1314
+
1315
+ out = [
1316
+ a
1317
+ for a in assemblies
1318
+ if self.assembly_is_valid(self.fragments, a, False, self.use_all_fragments)
1319
+ ]
1320
+ if only_adjacent_edges:
1321
+ out = [a for a in out if self.assembly_uses_only_adjacent_edges(a, False)]
1322
+ return remove_subassemblies(out)
1323
+
1324
+ def node_path2assembly_list(
1325
+ self, cycle: list[int], circular: bool
1326
+ ) -> list[EdgeRepresentationAssembly]:
1327
+ """Convert a node path in the format [1, 2, 3] (as returned by _nx.cycles.simple_cycles) to a list of all
1328
+ possible assemblies.
1329
+
1330
+ There may be multiple assemblies for a given node path, if there are several edges connecting two nodes,
1331
+ for example two overlaps between 1 and 2, and single overlap between 2 and 3 should return 3 assemblies.
1332
+ """
1333
+ combine = list()
1334
+ pairing = (
1335
+ zip(cycle, cycle[1:] + cycle[:1]) if circular else zip(cycle, cycle[1:])
1336
+ )
1337
+ for u, v in pairing:
1338
+ combine.append([(u, v, key) for key in self.G[u][v]])
1339
+ return [
1340
+ tuple(map(self.format_assembly_edge, x))
1341
+ for x in _itertools.product(*combine)
1342
+ ]
1343
+
1344
+ def get_unique_linear_paths(
1345
+ self, G_with_begin_end: _nx.MultiDiGraph, max_paths=10000
1346
+ ) -> list[list[int]]:
1347
+ """Get unique linear paths from the graph, removing those that contain the same node twice."""
1348
+ # We remove the begin and end nodes, and get all paths without edges
1349
+ # e.g. we will get [1, 2, 3] only once, even if multiple edges connect
1350
+ # 1 and 2 or 2 and 3, by converting to DiGraph.
1351
+
1352
+ # Cutoff has a different meaning of what one would expect, see https://github.com/networkx/networkx/issues/2762
1353
+ node_paths = [
1354
+ x[1:-1]
1355
+ for x in limit_iterator(
1356
+ _nx.all_simple_paths(
1357
+ _nx.DiGraph(G_with_begin_end),
1358
+ "begin",
1359
+ "end",
1360
+ cutoff=(len(self.fragments) + 1),
1361
+ ),
1362
+ max_paths,
1363
+ )
1364
+ ]
1365
+
1366
+ # Remove those that contain the same node twice
1367
+ node_paths = [x for x in node_paths if len(x) == len(set(map(abs, x)))]
1368
+
1369
+ if self.use_all_fragments:
1370
+ node_paths = [x for x in node_paths if len(x) == len(self.fragments)]
1371
+
1372
+ # For each path, we check if there are reverse complement duplicates
1373
+ # See: https://github.com/manulera/OpenCloning_backend/issues/160
1374
+ unique_node_paths = list()
1375
+ for p in node_paths:
1376
+ if [-x for x in p[::-1]] not in unique_node_paths:
1377
+ unique_node_paths.append(p)
1378
+
1379
+ return unique_node_paths
1380
+
1381
+ def get_possible_assembly_number(self, paths: list[list[int]]) -> int:
1382
+ """
1383
+ Get the number of possible assemblies from a list of node paths. Basically, for each path
1384
+ passed as a list of integers / nodes, we calculate the number of paths possible connecting
1385
+ the nodes in that order, given the graph (all the edges connecting them).
1386
+ """
1387
+ possibilities = 0
1388
+ for path in paths:
1389
+ this_path = 1
1390
+ for u, v in zip(path, path[1:]):
1391
+ if v in self.G[u]:
1392
+ this_path *= len(self.G[u][v])
1393
+ possibilities += this_path
1394
+ return possibilities
1395
+
1396
+ def get_circular_assemblies(
1397
+ self, only_adjacent_edges: bool = False, max_assemblies: int = 50
1398
+ ) -> list[EdgeRepresentationAssembly]:
1399
+ """Get circular assemblies, applying the constrains described in __init__, ensuring that paths represent
1400
+ real assemblies (see assembly_is_valid)."""
1401
+ # The constrain of circular sequence is that the first node is the fragment with the smallest index in its initial orientation,
1402
+ # this is ensured by the circular_permutation_min_abs function + the filter below
1403
+ sorted_cycles = map(
1404
+ circular_permutation_min_abs,
1405
+ limit_iterator(
1406
+ _nx.cycles.simple_cycles(self.G, length_bound=len(self.fragments)),
1407
+ 10000,
1408
+ ),
1409
+ )
1410
+ sorted_cycles = filter(lambda x: x[0] > 0, sorted_cycles)
1411
+ # cycles.simple_cycles returns lists [1,2,3] not assemblies, see self.cycle2circular_assemblies
1412
+
1413
+ # We apply constrains already here because sometimes the combinatorial explosion is too large
1414
+ if self.use_all_fragments:
1415
+ sorted_cycles = [c for c in sorted_cycles if len(c) == len(self.fragments)]
1416
+
1417
+ # Remove cycles with duplicates
1418
+ sorted_cycles = [c for c in sorted_cycles if len(c) == len(set(map(abs, c)))]
1419
+ possible_assembly_number = self.get_possible_assembly_number(
1420
+ [c + c[:1] for c in sorted_cycles]
1421
+ )
1422
+ if possible_assembly_number > max_assemblies:
1423
+ raise ValueError(
1424
+ f"Too many assemblies ({possible_assembly_number} pre-validation) to assemble"
1425
+ )
1426
+
1427
+ assemblies = sum(
1428
+ map(lambda x: self.node_path2assembly_list(x, True), sorted_cycles), []
1429
+ )
1430
+
1431
+ out = [
1432
+ a
1433
+ for a in assemblies
1434
+ if self.assembly_is_valid(self.fragments, a, True, self.use_all_fragments)
1435
+ ]
1436
+ if only_adjacent_edges:
1437
+ out = [a for a in out if self.assembly_uses_only_adjacent_edges(a, True)]
1438
+ return out
1439
+
1440
+ def format_insertion_assembly(
1441
+ self, assembly: EdgeRepresentationAssembly
1442
+ ) -> EdgeRepresentationAssembly | None:
1443
+ """Sorts the fragment representing a cycle so that they represent an insertion assembly if possible,
1444
+ else returns None.
1445
+
1446
+ Here we check if one of the joins between fragments represents the edges of an insertion assembly
1447
+ The fragment must be linear, and the join must be as indicated below
1448
+
1449
+ ```
1450
+ -------- ------- Fragment 1
1451
+ || ||
1452
+ xxxxxxxx || Fragment 2
1453
+ || ||
1454
+ oooooooooo Fragment 3
1455
+ ```
1456
+ The above example will be [(1, 2, [4:6], [0:2]), (2, 3, [6:8], [0:2]), (3, 1, [8:10], [9:11)])]
1457
+
1458
+ These could be returned in any order by simple_cycles, so we sort the edges so that the first
1459
+ and last `u` and `v` match the fragment that gets the insertion (1 in the example above).
1460
+ """
1461
+ edge_pair_index = list()
1462
+
1463
+ # Pair edges with one another
1464
+ for i, ((_u1, v1, _, end_location), (_u2, _v2, start_location, _)) in enumerate(
1465
+ zip(assembly, assembly[1:] + assembly[:1])
1466
+ ):
1467
+ fragment = self.fragments[abs(v1) - 1]
1468
+ # Find the pair of edges that should be last and first ((3, 1, [8:10], [9:11)]), (1, 2, [4:6], [0:2]) in
1469
+ # the example above. Only one of the pairs of edges should satisfy this condition for the topology to make sense.
1470
+ left_of_insertion = _location_boundaries(start_location)[0]
1471
+ right_of_insertion = _location_boundaries(end_location)[0]
1472
+ if not fragment.circular and (
1473
+ right_of_insertion >= left_of_insertion
1474
+ # The below condition is for single-site integration.
1475
+ # The reason to use locations_overlap instead of equality is because the location might extend
1476
+ # left of right. For example, let's take ACCGGTTT as homology arm for an integration:
1477
+ #
1478
+ # insert aaACCGGTTTccACCGGTTTtt
1479
+ # genome aaACCGGTTTtt
1480
+ #
1481
+ # The locations of homology on the genome are [0:10] and [2:12], so not identical
1482
+ # but they overlap.
1483
+ or _locations_overlap(start_location, end_location, len(fragment))
1484
+ ):
1485
+ edge_pair_index.append(i)
1486
+
1487
+ if len(edge_pair_index) != 1:
1488
+ return None
1489
+
1490
+ shift_by = (edge_pair_index[0] + 1) % len(assembly)
1491
+ return assembly[shift_by:] + assembly[:shift_by]
1492
+
1493
+ def format_insertion_assembly_edge_case(
1494
+ self, assembly: EdgeRepresentationAssembly
1495
+ ) -> EdgeRepresentationAssembly:
1496
+ """
1497
+ Edge case from https://github.com/manulera/OpenCloning_backend/issues/329
1498
+ """
1499
+ same_assembly = assembly[:]
1500
+
1501
+ if len(assembly) != 2:
1502
+ return same_assembly
1503
+ ((f1, f2, loc_f1_1, loc_f2_1), (_f2, _f1, loc_f2_2, loc_f1_2)) = assembly
1504
+
1505
+ if f1 != _f1 or _f2 != f2:
1506
+ return same_assembly
1507
+
1508
+ if loc_f2_1 == loc_f2_2 or loc_f1_2 == loc_f1_1:
1509
+ return same_assembly
1510
+
1511
+ fragment1 = self.fragments[abs(f1) - 1]
1512
+ fragment2 = self.fragments[abs(f2) - 1]
1513
+
1514
+ if not _locations_overlap(
1515
+ loc_f1_1, loc_f1_2, len(fragment1)
1516
+ ) or not _locations_overlap(loc_f2_2, loc_f2_1, len(fragment2)):
1517
+ return same_assembly
1518
+
1519
+ # Sort to make compatible with insertion assembly
1520
+ if _location_boundaries(loc_f1_1)[0] > _location_boundaries(loc_f1_2)[0]:
1521
+ new_assembly = same_assembly[::-1]
1522
+ else:
1523
+ new_assembly = same_assembly[:]
1524
+
1525
+ ((f1, f2, loc_f1_1, loc_f2_1), (_f2, _f1, loc_f2_2, loc_f1_2)) = new_assembly
1526
+
1527
+ fragment1 = self.fragments[abs(f1) - 1]
1528
+ if fragment1.circular:
1529
+ return same_assembly
1530
+ fragment2 = self.fragments[abs(f2) - 1]
1531
+
1532
+ # Extract boundaries
1533
+ f2_1_start, _ = _location_boundaries(loc_f2_1)
1534
+ f2_2_start, f2_2_end = _location_boundaries(loc_f2_2)
1535
+ f1_1_start, _ = _location_boundaries(loc_f1_1)
1536
+ f1_2_start, f1_2_end = _location_boundaries(loc_f1_2)
1537
+
1538
+ overlap_diff = len(fragment1[f1_1_start:f1_2_end]) - len(
1539
+ fragment2[f2_1_start:f2_2_end]
1540
+ )
1541
+
1542
+ if overlap_diff == 0:
1543
+ assert False, "Overlap is 0"
1544
+
1545
+ if overlap_diff > 0:
1546
+ new_loc_f1_1 = create_location(
1547
+ f1_1_start, f1_2_start - overlap_diff, len(fragment1)
1548
+ )
1549
+ new_loc_f2_1 = create_location(f2_1_start, f2_2_start, len(fragment2))
1550
+ else:
1551
+ new_loc_f2_1 = create_location(
1552
+ f2_1_start, f2_2_start + overlap_diff, len(fragment2)
1553
+ )
1554
+ new_loc_f1_1 = create_location(f1_1_start, f1_2_start, len(fragment1))
1555
+
1556
+ new_assembly = [
1557
+ (f1, f2, new_loc_f1_1, new_loc_f2_1),
1558
+ new_assembly[1],
1559
+ ]
1560
+
1561
+ return new_assembly
1562
+
1563
+ def get_insertion_assemblies(
1564
+ self, only_adjacent_edges: bool = False, max_assemblies: int = 50
1565
+ ) -> list[EdgeRepresentationAssembly]:
1566
+ """Assemblies that represent the insertion of a fragment or series of fragment inside a linear construct. For instance,
1567
+ digesting CCCCGAATTCCCCGAATTC with EcoRI and inserting the fragment with two overhangs into the EcoRI site of AAAGAATTCAAA.
1568
+ This is not so much meant for the use-case of linear fragments that represent actual linear fragments, but for linear
1569
+ fragments that represent a genome region. This can then be used to simulate homologous recombination.
1570
+ """
1571
+ if only_adjacent_edges:
1572
+ raise NotImplementedError(
1573
+ "only_adjacent_edges not implemented for insertion assemblies"
1574
+ )
1575
+
1576
+ cycles = limit_iterator(_nx.cycles.simple_cycles(self.G), 10000)
1577
+
1578
+ # We apply constrains already here because sometimes the combinatorial explosion is too large
1579
+ if self.use_all_fragments:
1580
+ cycles = [c for c in cycles if len(c) == len(self.fragments)]
1581
+
1582
+ # Remove cycles with duplicates
1583
+ cycles = [c for c in cycles if len(c) == len(set(map(abs, c)))]
1584
+
1585
+ possible_assembly_number = self.get_possible_assembly_number(
1586
+ [c + c[:1] for c in cycles]
1587
+ )
1588
+
1589
+ if possible_assembly_number > max_assemblies:
1590
+ raise ValueError(
1591
+ f"Too many assemblies ({possible_assembly_number} pre-validation) to assemble"
1592
+ )
1593
+
1594
+ # We find cycles first
1595
+ iterator = limit_iterator(_nx.cycles.simple_cycles(self.G), 10000)
1596
+ assemblies = sum(
1597
+ map(lambda x: self.node_path2assembly_list(x, True), iterator), []
1598
+ )
1599
+ # We format the edge case
1600
+ assemblies = [self.format_insertion_assembly_edge_case(a) for a in assemblies]
1601
+ # We select those that contain exactly only one suitable edge
1602
+ assemblies = [
1603
+ b
1604
+ for a in assemblies
1605
+ if (b := self.format_insertion_assembly(a)) is not None
1606
+ ]
1607
+ # First fragment should be in the + orientation
1608
+ assemblies = list(filter(lambda x: x[0][0] > 0, assemblies))
1609
+ return [
1610
+ a
1611
+ for a in assemblies
1612
+ if self.assembly_is_valid(
1613
+ self.fragments, a, False, self.use_all_fragments, is_insertion=True
1614
+ )
1615
+ ]
1616
+
1617
+ def assemble_linear(
1618
+ self, only_adjacent_edges: bool = False, max_assemblies: int = 50
1619
+ ) -> list[_Dseqrecord]:
1620
+ """Assemble linear constructs, from assemblies returned by self.get_linear_assemblies."""
1621
+ assemblies = self.get_linear_assemblies(only_adjacent_edges, max_assemblies)
1622
+ return [assemble(self.fragments, a) for a in assemblies]
1623
+
1624
+ def assemble_circular(
1625
+ self, only_adjacent_edges: bool = False, max_assemblies: int = 50
1626
+ ) -> list[_Dseqrecord]:
1627
+ """Assemble circular constructs, from assemblies returned by self.get_circular_assemblies."""
1628
+ assemblies = self.get_circular_assemblies(only_adjacent_edges, max_assemblies)
1629
+ return [assemble(self.fragments, a) for a in assemblies]
1630
+
1631
+ def assemble_insertion(
1632
+ self, only_adjacent_edges: bool = False
1633
+ ) -> list[_Dseqrecord]:
1634
+ """Assemble insertion constructs, from assemblies returned by self.get_insertion_assemblies."""
1635
+ assemblies = self.get_insertion_assemblies(only_adjacent_edges)
1636
+ return [assemble(self.fragments, a, is_insertion=True) for a in assemblies]
1637
+
1638
+ def get_locations_on_fragments(self) -> dict[int, dict[str, list[Location]]]:
1639
+ """Get a dictionary where the keys are the nodes in the graph, and the values are dictionaries with keys
1640
+ `left`, `right`, containing (for each fragment) the locations where the fragment is joined to another fragment on its left
1641
+ and right side. The values in `left` and `right` are often the same, except in restriction-ligation with partial overlap enabled,
1642
+ where we can end up with a situation like this:
1643
+
1644
+ GGTCTCCCCAATT and aGGTCTCCAACCAA as fragments
1645
+
1646
+ # Partial overlap in assembly 1[9:11]:2[8:10]
1647
+ GGTCTCCxxAACCAA
1648
+ CCAGAGGGGTTxxTT
1649
+
1650
+ # Partial overlap in 2[10:12]:1[7:9]
1651
+ aGGTCTCCxxCCAATT
1652
+ tCCAGAGGTTGGxxAA
1653
+
1654
+ Would return
1655
+ {
1656
+ 1: {'left': [7:9], 'right': [9:11]},
1657
+ 2: {'left': [8:10], 'right': [10:12]},
1658
+ -1: {'left': [2:4], 'right': [4:6]},
1659
+ -2: {'left': [2:4], 'right': [4:6]}
1660
+ }
1661
+
1662
+ """
1663
+
1664
+ locations_on_fragments = dict()
1665
+ for node in self.G.nodes:
1666
+ this_dict = {"left": list(), "right": list()}
1667
+ for edge in self.G.edges(data=True):
1668
+ for i, key in enumerate(["right", "left"]):
1669
+ if edge[i] == node:
1670
+ edge_location = edge[2]["locations"][i]
1671
+ if edge_location not in this_dict[key]:
1672
+ this_dict[key].append(edge_location)
1673
+ this_dict["left"] = sorted(
1674
+ this_dict["left"], key=lambda x: _location_boundaries(x)[0]
1675
+ )
1676
+ this_dict["right"] = sorted(
1677
+ this_dict["right"], key=lambda x: _location_boundaries(x)[0]
1678
+ )
1679
+ locations_on_fragments[node] = this_dict
1680
+
1681
+ return locations_on_fragments
1682
+
1683
+ def assembly_uses_only_adjacent_edges(self, assembly, is_circular: bool) -> bool:
1684
+ """
1685
+ Check whether only adjacent edges within each fragment are used in the assembly. This is useful to check if a cut and ligate assembly is valid,
1686
+ and prevent including partially digested fragments. For example, imagine the following fragment being an input for a digestion
1687
+ and ligation assembly, where the enzyme cuts at the sites indicated by the vertical lines:
1688
+
1689
+ ```
1690
+ x y z
1691
+ -------|-------|-------|---------
1692
+ ```
1693
+
1694
+ We would only want assemblies that contain subfragments start-x, x-y, y-z, z-end, and not start-x, y-end, for instance.
1695
+ The latter would indicate that the fragment was partially digested.
1696
+ """
1697
+
1698
+ locations_on_fragments = self.get_locations_on_fragments()
1699
+ for node in locations_on_fragments:
1700
+ fragment_len = len(self.fragments[abs(node) - 1])
1701
+ for side in ["left", "right"]:
1702
+ locations_on_fragments[node][side] = gather_overlapping_locations(
1703
+ locations_on_fragments[node][side], fragment_len
1704
+ )
1705
+
1706
+ allowed_location_pairs = dict()
1707
+ for node in locations_on_fragments:
1708
+ if not is_circular:
1709
+ # We add the existing ends of the fragment
1710
+ left = [(None,)] + locations_on_fragments[node]["left"]
1711
+ right = locations_on_fragments[node]["right"] + [(None,)]
1712
+
1713
+ else:
1714
+ # For circular assemblies, we add the first location at the end
1715
+ # to allow for the last edge to be used
1716
+ left = locations_on_fragments[node]["left"]
1717
+ right = (
1718
+ locations_on_fragments[node]["right"][1:]
1719
+ + locations_on_fragments[node]["right"][:1]
1720
+ )
1721
+
1722
+ pairs = list()
1723
+ for pair in zip(left, right):
1724
+ pairs += list(_itertools.product(*pair))
1725
+ allowed_location_pairs[node] = pairs
1726
+
1727
+ fragment_assembly = edge_representation2subfragment_representation(
1728
+ assembly, is_circular
1729
+ )
1730
+ for node, start_location, end_location in fragment_assembly:
1731
+ if (start_location, end_location) not in allowed_location_pairs[node]:
1732
+ return False
1733
+ return True
1734
+
1735
+ def __repr__(self):
1736
+ # https://pyformat.info
1737
+ return _pretty_str(
1738
+ "Assembly\n"
1739
+ "fragments..: {sequences}\n"
1740
+ "limit(bp)..: {limit}\n"
1741
+ "G.nodes....: {nodes}\n"
1742
+ "algorithm..: {al}".format(
1743
+ sequences=" ".join("{}bp".format(len(x)) for x in self.fragments),
1744
+ limit=self.limit,
1745
+ nodes=self.G.order(),
1746
+ al=self.algorithm.__name__,
1747
+ )
1748
+ )
1749
+
1750
+
1751
class PCRAssembly(Assembly):
    """
    An assembly that represents a PCR, where `fragments` is a list of primer, template, primer (in that order).
    It always uses the `primer_template_overlap` algorithm and accepts the `mismatches` argument to indicate
    the number of mismatches allowed in the overlap. Only supports substitution mismatches, not indels.
    """

    def __init__(self, frags: list[_Dseqrecord | _Primer], limit=25, mismatches=0):
        """Build the assembly graph for a primer, template, primer triplet.

        The base-class initialiser is not called; the graph and all attributes
        are set here.

        Parameters
        ----------
        frags : list[_Dseqrecord | _Primer]
            Exactly three fragments: primer, template, primer (in that order)
        limit : int, optional
            Passed as third argument to `primer_template_overlap`, by default 25
        mismatches : int, optional
            Passed as fourth argument to `primer_template_overlap`, by default 0

        Raises
        ------
        ValueError
            If `frags` does not have exactly three elements, or if they are not
            a primer, a non-primer and a primer (in that order)
        """

        value_error = ValueError(
            "PCRAssembly assembly must be initialised with a list/tuple of primer, template, primer"
        )
        if len(frags) != 3:
            raise value_error

        # Validate the inputs: should be a series of primer, template, primer
        wrong_fragment_class = (
            not isinstance(frags[0], _Primer),
            isinstance(frags[1], _Primer),
            not isinstance(frags[2], _Primer),
        )
        if any(wrong_fragment_class):
            raise value_error

        # TODO: allow for the same fragment to be included more than once?
        self.G = _nx.MultiDiGraph()
        # Add positive and negative nodes for forward and reverse fragments
        self.G.add_nodes_from((i + 1, {"seq": f}) for (i, f) in enumerate(frags))
        self.G.add_nodes_from(
            (-(i + 1), {"seq": f.reverse_complement()}) for (i, f) in enumerate(frags)
        )

        # Node pairs between which overlaps are searched: forward primers
        # against both template orientations, and both template orientations
        # against the reverse-complemented primers.
        pairs = list()
        for i in range(0, len(frags), 3):
            # primer, template, primer
            p1, t, p2 = (i + 1, i + 2, i + 3)
            pairs += list(_itertools.product([p1, p2], [t, -t]))
            pairs += list(_itertools.product([t, -t], [-p1, -p2]))

        for u, v in pairs:
            u_seq = self.G.nodes[u]["seq"]
            v_seq = self.G.nodes[v]["seq"]
            matches = primer_template_overlap(u_seq, v_seq, limit, mismatches)
            for match in matches:
                self.add_edges_from_match(match, u, v, u_seq, v_seq)

        # These two are constrained
        self.use_fragment_order = False
        self.use_all_fragments = True

        self.fragments = frags
        self.limit = limit
        self.algorithm = primer_template_overlap

    def get_linear_assemblies(
        self, only_adjacent_edges: bool = False, max_assemblies: int = 50
    ) -> list[EdgeRepresentationAssembly]:
        """Return the linear assemblies found by the parent class.

        Raises
        ------
        NotImplementedError
            If `only_adjacent_edges` is True
        """
        if only_adjacent_edges:
            raise NotImplementedError(
                "only_adjacent_edges not implemented for PCR assemblies"
            )

        return super().get_linear_assemblies(max_assemblies=max_assemblies)

    def get_circular_assemblies(
        self, only_adjacent_edges: bool = False, max_assemblies: int = 50
    ):
        """Not supported for PCR; always raises NotImplementedError.

        `max_assemblies` is accepted so that the inherited `assemble_circular`,
        which passes it positionally, reaches this NotImplementedError instead
        of failing with a TypeError.
        """
        raise NotImplementedError(
            "get_circular_assemblies not implemented for PCR assemblies"
        )

    def get_insertion_assemblies(
        self, only_adjacent_edges: bool = False, max_assemblies: int = 50
    ):
        """Not supported for PCR; always raises NotImplementedError.

        Accepts the same arguments as `get_circular_assemblies` above.
        """
        raise NotImplementedError(
            "get_insertion_assemblies not implemented for PCR assemblies"
        )
1828
+
1829
+
1830
class SingleFragmentAssembly(Assembly):
    """
    An assembly that represents the circularisation or splicing of a single fragment.
    """

    def __init__(
        self, frags: list[_Dseqrecord], limit=25, algorithm=common_sub_strings
    ):
        """Build the assembly graph of a fragment against itself.

        The base-class initialiser is not called; the graph and all attributes
        are set here.

        Parameters
        ----------
        frags : list[_Dseqrecord]
            List containing exactly one fragment
        limit : int, optional
            Passed as third argument to `algorithm`, by default 25
        algorithm : Callable, optional
            Called as `algorithm(frag, frag, limit)` to find the matches of the
            fragment with itself, by default `common_sub_strings`

        Raises
        ------
        ValueError
            If `frags` does not contain exactly one fragment
        """

        if len(frags) != 1:
            raise ValueError(
                "SingleFragmentAssembly assembly must be initialised with a single fragment"
            )
        # TODO: allow for the same fragment to be included more than once?
        self.G = _nx.MultiDiGraph()
        frag = frags[0]
        # Only the forward node is added explicitly
        self.G.add_node(1, seq=frag)

        matches = algorithm(frag, frag, limit)
        for match in matches:
            self.add_edges_from_match(match, 1, 1, frag, frag)

        # To avoid duplicated outputs
        self.G.remove_edges_from([(-1, -1)])

        # These two are constrained
        self.use_fragment_order = True
        self.use_all_fragments = True

        self.fragments = frags
        self.limit = limit
        self.algorithm = algorithm

    def get_circular_assemblies(
        self, only_adjacent_edges: bool = False, max_assemblies: int = 50
    ) -> list[EdgeRepresentationAssembly]:
        """Return the valid circular assemblies whose first edge joins two different locations."""
        candidates = super().get_circular_assemblies(
            only_adjacent_edges, max_assemblies
        )
        return [
            a
            for a in candidates
            # We don't want the same location twice
            if a[0][2] != a[0][3]
            and self.assembly_is_valid(self.fragments, a, True, self.use_all_fragments)
        ]

    def get_insertion_assemblies(
        self, only_adjacent_edges: bool = False, max_assemblies: int = 50
    ) -> list[EdgeRepresentationAssembly]:
        """This could be renamed splicing assembly, but the essence is similar

        Raises
        ------
        NotImplementedError
            If `only_adjacent_edges` is True
        """

        if only_adjacent_edges:
            raise NotImplementedError(
                "only_adjacent_edges not implemented for insertion assemblies"
            )

        def splicing_assembly_filter(x):
            # We don't want the same location twice
            if x[0][2] == x[0][3]:
                return False
            # We don't want to get overlap only (e.g. GAATTCcatGAATTC giving GAATTC)
            left_start, _ = _location_boundaries(x[0][2])
            _, right_end = _location_boundaries(x[0][3])
            if left_start == 0 and right_end == len(self.fragments[0]):
                return False
            return True

        # Drop the assemblies rejected by the splicing filter, then validate
        assemblies = filter(
            splicing_assembly_filter,
            super().get_insertion_assemblies(max_assemblies=max_assemblies),
        )
        return [
            a
            for a in assemblies
            if self.assembly_is_valid(
                self.fragments, a, False, self.use_all_fragments, is_insertion=True
            )
        ]

    def get_linear_assemblies(
        self, only_adjacent_edges: bool = False, max_assemblies: int = 50
    ):
        """Not supported for a single fragment; always raises NotImplementedError.

        The arguments are accepted so that the inherited `assemble_linear`,
        which passes them positionally, reaches this NotImplementedError
        instead of failing with a TypeError.
        """
        raise NotImplementedError("Linear assembly does not make sense")
1914
+
1915
+
1916
def common_function_assembly_products(
    frags: list[_Dseqrecord],
    limit: int | None,
    algorithm: Callable,
    circular_only: bool,
    filter_results_function: Callable | None = None,
) -> list[_Dseqrecord]:
    """Shared implementation of the assembly functions. Could be simplified
    further once SingleFragmentAssembly and Assembly are merged.

    Circular assemblies are always computed. Unless `circular_only` is True,
    linear assemblies (several fragments) or insertion assemblies (a single
    fragment) are added to them.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int or None
        Minimum overlap length required, or None if not applicable
    algorithm : Callable
        Function that determines valid overlaps between fragments
    circular_only : bool
        If True, only return circular assemblies
    filter_results_function : Callable or None, optional
        If given, it is called with each assembly, and only the assemblies for
        which it returns a true value are assembled, by default None

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    fragment_count = len(frags)
    if fragment_count == 1:
        asm = SingleFragmentAssembly(frags, limit, algorithm)
    else:
        asm = Assembly(
            frags, limit, algorithm, use_fragment_order=False, use_all_fragments=True
        )

    output_assemblies = asm.get_circular_assemblies()
    if not circular_only:
        if fragment_count > 1:
            output_assemblies += filter_linear_subassemblies(
                asm.get_linear_assemblies(), output_assemblies, frags
            )
        elif fragment_count == 1:
            output_assemblies += asm.get_insertion_assemblies()

    if filter_results_function:
        output_assemblies = list(filter(filter_results_function, output_assemblies))

    return [assemble(frags, edge_assembly) for edge_assembly in output_assemblies]
1960
+
1961
+
1962
def gibson_assembly(
    frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
) -> list[_Dseqrecord]:
    """Returns the products for Gibson assembly, using `gibson_overlap` to
    find the overlaps between fragments.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int, optional
        Minimum overlap length required, by default 25
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    return common_function_assembly_products(
        frags=frags,
        limit=limit,
        algorithm=gibson_overlap,
        circular_only=circular_only,
    )
1984
+
1985
+
1986
def in_fusion_assembly(
    frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
) -> list[_Dseqrecord]:
    """Returns the products for in-fusion assembly. This is the same as Gibson
    assembly, but with a different name.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int, optional
        Minimum overlap length required, by default 25
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    # circular_only must be forwarded, otherwise the documented argument is
    # silently ignored and linear products are always returned.
    return gibson_assembly(frags, limit, circular_only)
2007
+
2008
+
2009
def fusion_pcr_assembly(
    frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
) -> list[_Dseqrecord]:
    """Returns the products for fusion PCR assembly. This is the same as Gibson
    assembly, but with a different name.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int, optional
        Minimum overlap length required, by default 25
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    # circular_only must be forwarded, otherwise the documented argument is
    # silently ignored and linear products are always returned.
    return gibson_assembly(frags, limit, circular_only)
2030
+
2031
+
2032
def in_vivo_assembly(
    frags: list[_Dseqrecord], limit: int = 25, circular_only: bool = False
) -> list[_Dseqrecord]:
    """Returns the products for in vivo assembly (IVA), which relies on homologous
    recombination between the fragments. Overlaps are found with `common_sub_strings`.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    limit : int, optional
        Minimum overlap length required, by default 25
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules
    """
    return common_function_assembly_products(
        frags=frags,
        limit=limit,
        algorithm=common_sub_strings,
        circular_only=circular_only,
    )
2054
+
2055
+
2056
def restriction_ligation_assembly(
    frags: list[_Dseqrecord],
    enzymes: list["_AbstractCut"],
    allow_blunt: bool = True,
    circular_only: bool = False,
) -> list[_Dseqrecord]:
    """Returns the products for restriction ligation assembly:
    * Finds cutsites in the fragments
    * Finds all products that could be assembled by ligating the fragments based on those cutsites
    * Will NOT return products that combine an existing end with an end generated by the same enzyme (see example below)

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    enzymes : list[_AbstractCut]
        List of restriction enzymes to use
    allow_blunt : bool, optional
        If True, allow blunt end ligations, by default True
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules

    Examples
    --------
    In the example below, we plan to assemble a plasmid from a backbone and an insert, using the EcoRI and SalI enzymes.
    Note how 2 circular products are returned, one contains the insert (`acgt`)
    and the desired part of the backbone (`cccccc`), the other contains the
    reversed insert (`tgga`) and the cut-out part of the backbone (`aaa`).

    >>> from pydna.assembly2 import restriction_ligation_assembly
    >>> from pydna.dseqrecord import Dseqrecord
    >>> from Bio.Restriction import EcoRI, SalI
    >>> backbone = Dseqrecord("cccGAATTCaaaGTCGACccc", circular=True)
    >>> insert = Dseqrecord("ggGAATTCaggtGTCGACgg")
    >>> products = restriction_ligation_assembly([backbone, insert], [EcoRI, SalI], circular_only=True)
    >>> products[0].seq
    Dseq(o22)
    TCGACccccccGAATTCaggtG
    AGCTGggggggCTTAAGtccaC
    >>> products[1].seq
    Dseq(o19)
    AATTCaaaGTCGACacctG
    TTAAGtttCAGCTGtggaC

    Note that passing a pre-cut fragment will not work.

    >>> restriction_products = insert.cut([EcoRI, SalI])
    >>> cut_insert = restriction_products[1]
    >>> restriction_ligation_assembly([backbone, cut_insert], [EcoRI, SalI], circular_only=True)
    []

    It also works with a single fragment, for circularization:

    >>> seq = Dseqrecord("GAATTCaaaGAATTC")
    >>> products =restriction_ligation_assembly([seq], [EcoRI])
    >>> products[0].seq
    Dseq(o9)
    AATTCaaaG
    TTAAGtttC
    """

    def algo(x, y, _l):
        # The length limit `_l` is not used by this algorithm; `enzymes` and
        # `allow_blunt` are taken from the enclosing call.
        return restriction_ligation_overlap(x, y, enzymes, False, allow_blunt)

    return common_function_assembly_products(
        frags=frags, limit=None, algorithm=algo, circular_only=circular_only
    )
2127
+
2128
+
2129
def golden_gate_assembly(
    frags: list[_Dseqrecord],
    enzymes: list["_AbstractCut"],
    allow_blunt: bool = True,
    circular_only: bool = False,
) -> list[_Dseqrecord]:
    """Returns the products for Golden Gate assembly. This is an alias of
    `restriction_ligation_assembly`: all arguments are forwarded to it unchanged.
    Check the documentation of that function for more details.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    enzymes : list[_AbstractCut]
        List of restriction enzymes to use
    allow_blunt : bool, optional
        If True, allow blunt end ligations, by default True
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules

    Examples
    --------
    See the example for `restriction_ligation_assembly`.
    """
    return restriction_ligation_assembly(
        frags=frags,
        enzymes=enzymes,
        allow_blunt=allow_blunt,
        circular_only=circular_only,
    )
2160
+
2161
+
2162
def ligation_assembly(
    frags: list[_Dseqrecord],
    allow_blunt: bool = False,
    allow_partial_overlap: bool = False,
    circular_only: bool = False,
) -> list[_Dseqrecord]:
    """Returns the products for ligation assembly, as inputs pass the fragments (digested if needed) that
    will be ligated.

    For most cases, you probably should use `restriction_ligation_assembly` instead.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    allow_blunt : bool, optional
        If True, allow blunt end ligations, by default False
    allow_partial_overlap : bool, optional
        If True, allow partial overlaps between sticky ends, by default False
    circular_only : bool, optional
        If True, only return circular assemblies, by default False

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules


    Examples
    --------
    In the example below, we plan to assemble a plasmid from a backbone and an insert,
    using the EcoRI enzyme. The insert and insertion site in the backbone are flanked by
    EcoRI sites, so there are two possible products depending on the orientation of the insert.

    >>> from pydna.assembly2 import ligation_assembly
    >>> from pydna.dseqrecord import Dseqrecord
    >>> from Bio.Restriction import EcoRI
    >>> backbone = Dseqrecord("cccGAATTCaaaGAATTCccc", circular=True)
    >>> backbone_cut = backbone.cut(EcoRI)[1]
    >>> insert = Dseqrecord("ggGAATTCaggtGAATTCgg")
    >>> insert_cut = insert.cut(EcoRI)[1]
    >>> products = ligation_assembly([backbone_cut, insert_cut])
    >>> products[0].seq
    Dseq(o22)
    AATTCccccccGAATTCaggtG
    TTAAGggggggCTTAAGtccaC
    >>> products[1].seq
    Dseq(o22)
    AATTCccccccGAATTCacctG
    TTAAGggggggCTTAAGtggaC
    """

    def sticky_end_algorithm(x, y, _l):
        # The length limit `_l` is not used by this algorithm
        return sticky_end_sub_strings(x, y, allow_partial_overlap)

    # Blunt-end matching is combined with sticky-end matching only on request
    algo = (
        combine_algorithms(sticky_end_algorithm, blunt_overlap)
        if allow_blunt
        else sticky_end_algorithm
    )

    return common_function_assembly_products(
        frags=frags, limit=None, algorithm=algo, circular_only=circular_only
    )
2223
+
2224
+
2225
def assembly_is_multi_site(asm: EdgeRepresentationAssembly) -> bool:
    """Returns True if the assembly is a multi-site assembly, False otherwise.

    An assembly is multi-site when it has at least two edges and, for every
    fragment in its subfragment representation, the location where it is
    joined on one side differs from the location where it is joined on the
    other side.

    Parameters
    ----------
    asm : EdgeRepresentationAssembly
        A single assembly in edge representation, i.e. a sequence of edges
        whose first two elements are the joined nodes

    Returns
    -------
    bool
        True if the assembly is multi-site, False otherwise
    """

    if len(asm) < 2:
        return False

    # The path of edges closes into a cycle when the source node of the first
    # edge is the target node of the last edge. Comparing asm[0][1] with
    # asm[-1][0] instead is always true for two consecutive edges, and false
    # for cycles of three or more edges.
    is_cycle = asm[0][0] == asm[-1][1]
    asm2 = edge_representation2subfragment_representation(asm, is_cycle)

    return all(f[1] != f[2] for f in asm2)
2235
+
2236
+
2237
def gateway_assembly(
    frags: list[_Dseqrecord],
    reaction_type: str,
    greedy: bool = False,
    circular_only: bool = False,
    multi_site_only: bool = False,
) -> list[_Dseqrecord]:
    """Returns the products for Gateway assembly / Gateway cloning.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to assemble
    reaction_type : str
        Type of Gateway reaction, either 'BP' or 'LR'
    greedy : bool, optional
        If True, use greedy gateway consensus sites, by default False
    circular_only : bool, optional
        If True, only return circular assemblies, by default False
    multi_site_only : bool, optional
        If True, only return products where 2 sites recombined. Even if input sequences
        contain multiple att sites (typically 2), a product could be generated where only one
        site recombines. That's typically not what you want, so you can set this to True to
        only return products where both att sites recombined.

    Returns
    -------
    list[_Dseqrecord]
        List of assembled DNA molecules

    Raises
    ------
    ValueError
        If `reaction_type` is not 'BP' or 'LR', or if no product is found. In the
        latter case the message lists the sites found in each fragment.


    Examples
    --------

    Below an example with dummy Gateway sequences, composed with minimal sequences and the consensus
    att sites.

    >>> from pydna.assembly2 import gateway_assembly
    >>> from pydna.dseqrecord import Dseqrecord
    >>> attB1 = "ACAACTTTGTACAAAAAAGCAGAAG"
    >>> attP1 = "AAAATAATGATTTTATTTGACTGATAGTGACCTGTTCGTTGCAACAAATTGATGAGCAATGCTTTTTTATAATGCCAACTTTGTACAAAAAAGCTGAACGAGAAGCGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATCCAGTCACTATGAATCAACTACTTAGATGGTATTAGTGACCTGTA"
    >>> attR1 = "ACAACTTTGTACAAAAAAGCTGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATGCAGTCACTATG"
    >>> attL1 = "CAAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAAATTGATAAGCAATGCTTTCTTATAATGCCAACTTTGTACAAAAAAGCAGGCT"
    >>> seq1 = Dseqrecord("aaa" + attB1 + "ccc")
    >>> seq2 = Dseqrecord("aaa" + attP1 + "ccc")
    >>> seq3 = Dseqrecord("aaa" + attR1 + "ccc")
    >>> seq4 = Dseqrecord("aaa" + attL1 + "ccc")
    >>> products_BP = gateway_assembly([seq1, seq2], "BP")
    >>> products_LR = gateway_assembly([seq3, seq4], "LR")
    >>> len(products_BP)
    2
    >>> len(products_LR)
    2

    Now let's understand the `multi_site_only` parameter. Let's consider a case where we are swapping fragments
    between two plasmids using an LR reaction. Experimentally, we expect to obtain two plasmids, resulting from the
    swapping between the two att sites. That's what we get if we set `multi_site_only` to True.

    >>> attL2 = 'aaataatgattttattttgactgatagtgacctgttcgttgcaacaaattgataagcaatgctttcttataatgccaactttgtacaagaaagctg'
    >>> attR2 = 'accactttgtacaagaaagctgaacgagaaacgtaaaatgatataaatatcaatatattaaattagattttgcataaaaaacagactacataatactgtaaaacacaacatatccagtcactatg'
    >>> insert = Dseqrecord("cccccc" + attL1 + "ccc" + attL2 + "cccccc", circular=True)
    >>> backbone = Dseqrecord("ttttt" + attR1 + "aaa" + attR2, circular=True)
    >>> products = gateway_assembly([insert, backbone], "LR", multi_site_only=True)
    >>> len(products)
    2

    However, if we set `multi_site_only` to False, we get 4 products, which also include the intermediate products
    where the two plasmids are combined into a single one through recombination of a single att site. This is an
    intermediate of the reaction, and typically we don't want it:

    >>> products = gateway_assembly([insert, backbone], "LR", multi_site_only=False)
    >>> print([len(p) for p in products])
    [469, 237, 232, 469]


    """

    if reaction_type not in ("BP", "LR"):
        raise ValueError(
            f"Invalid reaction type: {reaction_type}, can only be BP or LR"
        )

    def algo(x, y, _l):
        # The length limit `_l` is not used by this algorithm
        return gateway_overlap(x, y, reaction_type, greedy)

    products = common_function_assembly_products(
        frags,
        None,
        algo,
        circular_only,
        assembly_is_multi_site if multi_site_only else None,
    )
    if products:
        return products

    # No product: report the sites found in each fragment in the error message
    formatted_strings = []
    for fragment_number, frag in enumerate(frags, start=1):
        sites = list(find_gateway_sites(frag, greedy).keys())
        formatted_strings.append(f'fragment {fragment_number}: {", ".join(sites)}')

    raise ValueError(
        f"Inputs are not compatible for {reaction_type} reaction.\n\n"
        + "\n".join(formatted_strings)
    )
2342
+
2343
+
2344
def common_function_integration_products(
    frags: list[_Dseqrecord], limit: int | None, algorithm: Callable
) -> list[_Dseqrecord]:
    """Shared implementation of the integration functions.

    Parameters
    ----------
    frags : list[_Dseqrecord]
        List of DNA fragments to integrate, the first one being the genome
    limit : int or None
        Minimum overlap length required, or None if not applicable
    algorithm : Callable
        Function that determines valid overlaps between fragments

    Returns
    -------
    list[_Dseqrecord]
        List of integrated DNA molecules

    Raises
    ------
    ValueError
        If the first fragment (the genome) is circular
    """
    asm = (
        SingleFragmentAssembly(frags, limit, algorithm)
        if len(frags) == 1
        else Assembly(
            frags, limit, algorithm, use_fragment_order=False, use_all_fragments=True
        )
    )

    genome = frags[0]
    if genome.circular:
        raise ValueError(
            "Genome must be linear for integration assembly, use in vivo assembly instead"
        )

    # We only want insertions in the genome (first fragment), i.e. assemblies
    # whose first edge starts at node 1
    return [
        assemble(frags, edge_assembly, True)
        for edge_assembly in asm.get_insertion_assemblies()
        if edge_assembly[0][0] == 1
    ]
2378
+
2379
+
2380
def common_handle_insertion_fragments(
    genome: _Dseqrecord, inserts: list[_Dseqrecord]
) -> list[_Dseqrecord]:
    """Validate the genome and the inserts, and return them as a single list.

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence
    inserts : list[_Dseqrecord]
        DNA fragment(s) to insert, must be a non-empty list

    Returns
    -------
    list[_Dseqrecord]
        List containing genome and insert fragments

    Raises
    ------
    ValueError
        If `genome` is not a Dseqrecord, or `inserts` is not a non-empty list
        of Dseqrecord objects
    """
    if not isinstance(genome, _Dseqrecord):
        raise ValueError("Genome must be a Dseqrecord object")

    inserts_are_dseqrecords = isinstance(inserts, list) and all(
        isinstance(insert, _Dseqrecord) for insert in inserts
    )
    if not inserts_are_dseqrecords:
        raise ValueError("Inserts must be a list of Dseqrecord objects")

    if not inserts:
        raise ValueError("Inserts must be a non-empty list of Dseqrecord objects")

    return [genome, *inserts]
2409
+
2410
+
2411
def common_function_excision_products(
    genome: _Dseqrecord, limit: int | None, algorithm: Callable
) -> list[_Dseqrecord]:
    """Shared implementation of the excision functions.

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence
    limit : int or None
        Minimum overlap length required, or None if not applicable
    algorithm : Callable
        Function that determines valid overlaps between fragments

    Returns
    -------
    list[_Dseqrecord]
        The circular products of the single-fragment assembly, followed by
        its insertion products
    """
    asm = SingleFragmentAssembly([genome], limit, algorithm)
    circular_products = asm.assemble_circular()
    insertion_products = asm.assemble_insertion()
    return circular_products + insertion_products
2432
+
2433
+
2434
def homologous_recombination_integration(
    genome: _Dseqrecord,
    inserts: list[_Dseqrecord],
    limit: int = 40,
) -> list[_Dseqrecord]:
    """Returns the products resulting from the integration of an insert (or inserts joined
    through in vivo recombination) into the genome through homologous recombination.

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence
    inserts : list[_Dseqrecord]
        DNA fragment(s) to insert
    limit : int, optional
        Minimum homology length required, by default 40

    Returns
    -------
    list[_Dseqrecord]
        List of integrated DNA molecules


    Examples
    --------

    Below an example with a single insert.

    >>> from pydna.assembly2 import homologous_recombination_integration
    >>> from pydna.dseqrecord import Dseqrecord
    >>> homology = "AAGTCCGTTCGTTTTACCTG"
    >>> genome = Dseqrecord(f"aaaaaa{homology}ccccc{homology}aaaaaa")
    >>> insert = Dseqrecord(f"{homology}gggg{homology}")
    >>> products = homologous_recombination_integration(genome, [insert], 20)
    >>> str(products[0].seq)
    'aaaaaaAAGTCCGTTCGTTTTACCTGggggAAGTCCGTTCGTTTTACCTGaaaaaa'

    Below an example with two inserts joined through homology.

    >>> homology2 = "ATTACAGCATGGGAAGAAAGA"
    >>> insert_1 = Dseqrecord(f"{homology}gggg{homology2}")
    >>> insert_2 = Dseqrecord(f"{homology2}cccc{homology}")
    >>> products = homologous_recombination_integration(genome, [insert_1, insert_2], 20)
    >>> str(products[0].seq)
    'aaaaaaAAGTCCGTTCGTTTTACCTGggggATTACAGCATGGGAAGAAAGAccccAAGTCCGTTCGTTTTACCTGaaaaaa'
    """
    # The inputs are validated and merged into [genome, *inserts] first
    return common_function_integration_products(
        common_handle_insertion_fragments(genome, inserts),
        limit,
        common_sub_strings,
    )
2483
+
2484
+
2485
def homologous_recombination_excision(
    genome: _Dseqrecord, limit: int = 40
) -> list[_Dseqrecord]:
    """Returns the products resulting from the excision of a fragment from the genome through
    homologous recombination. Homologies are found with `common_sub_strings`.

    Parameters
    ----------
    genome : _Dseqrecord
        Target genome sequence
    limit : int, optional
        Minimum homology length required, by default 40

    Returns
    -------
    list[_Dseqrecord]
        List containing excised plasmid and remaining genome sequence

    Examples
    --------

    Example of a homologous recombination event, where a plasmid is excised from the
    genome (circular sequence of 25 bp), and that part is removed from the genome,
    leaving a shorter linear sequence (32 bp).

    >>> from pydna.assembly2 import homologous_recombination_excision
    >>> from pydna.dseqrecord import Dseqrecord
    >>> homology = "AAGTCCGTTCGTTTTACCTG"
    >>> genome = Dseqrecord(f"aaaaaa{homology}ccccc{homology}aaaaaa")
    >>> products = homologous_recombination_excision(genome, 20)
    >>> products
    [Dseqrecord(o25), Dseqrecord(-32)]
    """
    return common_function_excision_products(
        genome=genome, limit=limit, algorithm=common_sub_strings
    )
2519
+
2520
+
2521
def cre_lox_integration(
    genome: _Dseqrecord, inserts: list[_Dseqrecord]
) -> list[_Dseqrecord]:
    """Compute the products of integrating an insert into the genome through
    cre-lox recombination.

    Several inserts may be given, in which case they can also be joined among
    themselves through cre-lox recombination before integrating.

    lox66 and lox71 sites are supported as well (see `pydna.cre_lox` for more
    details).

    Parameters
    ----------
    genome : _Dseqrecord
        Genome sequence receiving the insert(s)
    inserts : list[_Dseqrecord]
        DNA fragment(s) to insert

    Returns
    -------
    list[_Dseqrecord]
        The integrated DNA molecules

    Examples
    --------

    Reversible integration followed by excision.

    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.assembly2 import cre_lox_integration, cre_lox_excision
    >>> from pydna.cre_lox import LOXP_SEQUENCE
    >>> a = Dseqrecord(f"cccccc{LOXP_SEQUENCE}aaaaa")
    >>> b = Dseqrecord(f"{LOXP_SEQUENCE}bbbbb", circular=True)
    >>> [a, b]
    [Dseqrecord(-45), Dseqrecord(o39)]
    >>> res = cre_lox_integration(a, [b])
    >>> res
    [Dseqrecord(-84)]
    >>> res2 = cre_lox_excision(res[0])
    >>> res2
    [Dseqrecord(o39), Dseqrecord(-45)]

    The same with lox66 and lox71 (irreversible integration). The excision
    products are still returned: excision has a low probability of happening,
    but it is considered a rare event rather than an impossible one.

    >>> lox66 = 'ATAACTTCGTATAGCATACATTATACGAACGGTA'
    >>> lox71 = 'TACCGTTCGTATAGCATACATTATACGAAGTTAT'
    >>> a = Dseqrecord(f"cccccc{lox66}aaaaa")
    >>> b = Dseqrecord(f"{lox71}bbbbb", circular=True)
    >>> res = cre_lox_integration(a, [b])
    >>> res
    [Dseqrecord(-84)]
    >>> res2 = cre_lox_excision(res[0])
    >>> res2
    [Dseqrecord(o39), Dseqrecord(-45)]

    """
    # No length limit applies here, so None is passed in its place.
    return common_function_integration_products(
        common_handle_insertion_fragments(genome, inserts),
        None,
        cre_loxP_overlap,
    )
2578
+
2579
+
2580
def cre_lox_excision(genome: _Dseqrecord) -> list[_Dseqrecord]:
    """Compute the products of CRE-lox excision on a genome.

    The call is forwarded to ``common_function_excision_products`` with
    ``cre_loxP_overlap`` as the overlap function and ``None`` in place of a
    length limit.

    Parameters
    ----------
    genome : _Dseqrecord
        Genome sequence from which a fragment is excised

    Returns
    -------
    list[_Dseqrecord]
        The excised plasmid and the genome sequence that remains

    Examples
    --------

    Reversible integration followed by excision.

    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.assembly2 import cre_lox_integration, cre_lox_excision
    >>> from pydna.cre_lox import LOXP_SEQUENCE
    >>> a = Dseqrecord(f"cccccc{LOXP_SEQUENCE}aaaaa")
    >>> b = Dseqrecord(f"{LOXP_SEQUENCE}bbbbb", circular=True)
    >>> [a, b]
    [Dseqrecord(-45), Dseqrecord(o39)]
    >>> res = cre_lox_integration(a, [b])
    >>> res
    [Dseqrecord(-84)]
    >>> res2 = cre_lox_excision(res[0])
    >>> res2
    [Dseqrecord(o39), Dseqrecord(-45)]

    The same with lox66 and lox71 (irreversible integration). The excision
    products are still returned: excision has a low probability of happening,
    but it is considered a rare event rather than an impossible one.

    >>> lox66 = 'ATAACTTCGTATAGCATACATTATACGAACGGTA'
    >>> lox71 = 'TACCGTTCGTATAGCATACATTATACGAAGTTAT'
    >>> a = Dseqrecord(f"cccccc{lox66}aaaaa")
    >>> b = Dseqrecord(f"{lox71}bbbbb", circular=True)
    >>> res = cre_lox_integration(a, [b])
    >>> res
    [Dseqrecord(-84)]
    >>> res2 = cre_lox_excision(res[0])
    >>> res2
    [Dseqrecord(o39), Dseqrecord(-45)]
    """
    excision_products = common_function_excision_products(
        genome, None, cre_loxP_overlap
    )
    return excision_products