pydna 5.5.1__py3-none-any.whl → 5.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydna/assembly2.py ADDED
@@ -0,0 +1,1902 @@
1
+ # -*- coding: utf-8 -*-
2
+ """Slightly different assembly implementation"""
3
+
4
+ import networkx as _nx
5
+ import itertools as _itertools
6
+ from Bio.SeqFeature import SimpleLocation, Location
7
+ from Bio.Seq import reverse_complement
8
+ from Bio.Restriction.Restriction import RestrictionBatch
9
+ import regex
10
+ import copy
11
+
12
+ from pydna.utils import (
13
+ shift_location as _shift_location,
14
+ flatten,
15
+ location_boundaries as _location_boundaries,
16
+ locations_overlap as _locations_overlap,
17
+ sum_is_sticky,
18
+ limit_iterator,
19
+ create_location,
20
+ )
21
+ from pydna._pretty import pretty_str as _pretty_str
22
+ from pydna.common_sub_strings import common_sub_strings as common_sub_strings_str
23
+ from pydna.dseqrecord import Dseqrecord as _Dseqrecord
24
+ from pydna.dseq import Dseq as _Dseq
25
+ from pydna.primer import Primer as _Primer
26
+ from pydna.seqrecord import SeqRecord as _SeqRecord
27
+ from pydna.types import (
28
+ CutSiteType,
29
+ EdgeRepresentationAssembly,
30
+ SubFragmentRepresentationAssembly,
31
+ AssemblyAlgorithmType,
32
+ SequenceOverlap,
33
+ AssemblyEdgeType,
34
+ )
35
+
36
+
37
def gather_overlapping_locations(
    locs: list[Location], fragment_length: int
) -> list[tuple[Location, ...]]:
    """
    Group locations into tuples of mutually connected (overlapping) locations.

    Two locations end up in the same tuple when they overlap directly or are
    linked through a chain of overlapping locations (the groups are the
    connected components of the overlap graph). For example, if
    locs = [loc1, loc2, loc3], and only loc1 and loc2 overlap, the output
    will be [(loc1, loc2), (loc3,)].

    Args:
        locs (list[Location]): The locations to group
        fragment_length (int): Passed to `_locations_overlap` when testing each pair

    Returns:
        list[tuple[Location, ...]]: The groups, sorted by the start boundary of
        the first location of each group.
    """
    # One node per location, identified by its index in `locs`
    overlap_graph = _nx.Graph()
    for index, location in enumerate(locs):
        overlap_graph.add_node(index, location=location)

    # Connect every pair of locations that overlap
    for first, second in _itertools.combinations(range(len(locs)), 2):
        if _locations_overlap(locs[first], locs[second], fragment_length):
            overlap_graph.add_edge(first, second)

    # Each connected component is a group of overlapping locations
    groups = [
        tuple(locs[index] for index in component)
        for component in _nx.connected_components(overlap_graph)
    ]

    # Sort by the start of the first member of each group (the original author
    # notes that which member is used does not matter, since they overlap)
    groups.sort(key=lambda group: _location_boundaries(group[0])[0])
    return groups
65
+
66
+
67
def ends_from_cutsite(
    cutsite: CutSiteType, seq: _Dseq
) -> tuple[tuple[str, str], tuple[str, str]]:
    """Return the sticky or blunt ends that a restriction enzyme cut creates.

    Args:
        cutsite (CutSiteType): A tuple ((cut_watson, ovhg), enzyme) describing where the cut occurs
        seq (_Dseq): The DNA sequence being cut

    Raises:
        ValueError: If cutsite is None

    Returns:
        tuple[tuple[str, str], tuple[str, str]]: Two (end_type, overhang_sequence) tuples,
        where end_type is "5'", "3'" or "blunt" and the overhang sequence is lower case.
        The first tuple is for the left end, the second for the right end.

    >>> from Bio.Restriction import NotI
    >>> x = _Dseq("ctcgGCGGCCGCcagcggccg")
    >>> x.get_cutsites(NotI)
    [((6, -4), NotI)]
    >>> ends_from_cutsite(x.get_cutsites(NotI)[0], x)
    (("5'", 'ggcc'), ("5'", 'ggcc'))
    """
    if cutsite is None:
        raise ValueError("None is not supported")

    cut_watson, cut_crick, ovhg = seq.get_cut_parameters(cutsite, is_left=None)

    if ovhg == 0:
        return ("blunt", ""), ("blunt", "")

    if ovhg < 0:
        # TODO check the edge in circular
        overhang = seq[cut_watson:cut_crick]
        return (
            ("5'", str(overhang.reverse_complement()).lower()),
            ("5'", str(overhang).lower()),
        )

    # ovhg > 0
    overhang = seq[cut_crick:cut_watson]
    return (
        ("3'", str(overhang).lower()),
        ("3'", str(overhang.reverse_complement()).lower()),
    )
108
+
109
+
110
def restriction_ligation_overlap(
    seqx: _Dseqrecord,
    seqy: _Dseqrecord,
    enzymes=RestrictionBatch,
    partial=False,
    allow_blunt=False,
) -> list[SequenceOverlap]:
    """Assembly algorithm to find overlaps that would result from restriction and ligation.

    Every cut site found in seqx is paired with every cut site found in seqy, and a
    match is reported when the ends produced by both cuts are compatible.
    Like in sticky and gibson, the order matters (see example below of partial overlap)

    Args:
        seqx (_Dseqrecord): The first sequence
        seqy (_Dseqrecord): The second sequence
        enzymes (RestrictionBatch): The enzymes to use
        partial (bool): Whether to allow partial overlaps
        allow_blunt (bool): Whether to allow blunt ends
    Returns:
        list[SequenceOverlap]: A list of overlaps between the two sequences

    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.assembly2 import restriction_ligation_overlap
    >>> from Bio.Restriction import EcoRI, RgaI, DrdI, EcoRV
    >>> x = Dseqrecord("ccGAATTCaa")
    >>> y = Dseqrecord("aaaaGAATTCgg")
    >>> restriction_ligation_overlap(x, y, [EcoRI])
    [(3, 5, 4)]
    >>> restriction_ligation_overlap(y, x, [EcoRI])
    [(5, 3, 4)]

    Partial overlap, note how it is not symmetric

    >>> x = Dseqrecord("GACTAAAGGGTC")
    >>> y = Dseqrecord("AAGCGATCGCAAGCGATCGCAA")
    >>> restriction_ligation_overlap(x, y, [RgaI, DrdI], partial=True)
    [(6, 5, 1), (6, 15, 1)]
    >>> restriction_ligation_overlap(y, x, [RgaI, DrdI], partial=True)
    []

    Blunt overlap, returns length of the overlap 0

    >>> x = Dseqrecord("aaGATATCcc")
    >>> y = Dseqrecord("ttttGATATCaa")
    >>> restriction_ligation_overlap(x, y, [EcoRV], allow_blunt=True)
    [(5, 7, 0)]
    >>> restriction_ligation_overlap(y, x, [EcoRV], allow_blunt=True)
    [(7, 5, 0)]

    """
    # NOTE(review): the default value of `enzymes` is the RestrictionBatch class
    # itself (not an instance), which presumably cannot be unpacked with `*`;
    # callers appear to be expected to always pass the enzymes explicitly - confirm.
    x_cutsites = seqx.seq.get_cutsites(*enzymes)
    y_cutsites = seqy.seq.get_cutsites(*enzymes)

    # If blunt ends are allowed, something similar to this could be done to allow
    # joining with linear sequence ends, but for now it messes up with the
    # only_adjacent_edges case
    # if allow_blunt:
    #     if not seqx.circular:
    #         cuts_x.append(((len(seqx), 0), None))
    #     if not seqy.circular:
    #         cuts_y.append(((0, 0), None))

    found = []
    for cut_x, cut_y in _itertools.product(x_cutsites, y_cutsites):
        # Two blunt cuts: report a zero-length overlap at the cut positions
        if allow_blunt and cut_x[0][1] == cut_y[0][1] == 0:
            found.append((cut_x[0][0], cut_y[0][0], 0))
            continue

        # Otherwise, test the overhangs: left end of the cut in seqx against
        # the right end of the cut in seqy
        end_x = ends_from_cutsite(cut_x, seqx.seq)[0]
        end_y = ends_from_cutsite(cut_y, seqy.seq)[1]
        overlap = sum_is_sticky(end_x, end_y, partial)
        if overlap:
            x_watson, x_crick, x_ovhg = seqx.seq.get_cut_parameters(
                cut_x, is_left=False
            )
            y_watson, y_crick, y_ovhg = seqy.seq.get_cut_parameters(
                cut_y, is_left=True
            )

            # Positions where the overlap would start for full overlap
            start_x = x_watson if x_ovhg < 0 else x_crick
            start_y = y_watson if y_ovhg < 0 else y_crick

            # Correct for partial overlaps
            start_x += abs(x_ovhg) - overlap

            found.append((start_x, start_y, overlap))
    return found
195
+
196
+
197
def combine_algorithms(*algorithms: AssemblyAlgorithmType) -> AssemblyAlgorithmType:
    """
    Build a single assembly algorithm out of several ones.

    The returned function calls every algorithm in the order given, with the same
    (seqx, seqy, limit) arguments, and returns the concatenation of all their matches.

    This can be used for example in a ligation where you want to allow both sticky and blunt end ligation.
    """

    def combined(seqx, seqy, limit):
        return [
            match
            for algorithm in algorithms
            for match in algorithm(seqx, seqy, limit)
        ]

    return combined
211
+
212
+
213
def blunt_overlap(
    seqx: _Dseqrecord, seqy: _Dseqrecord, limit=None
) -> list[SequenceOverlap]:
    """
    Assembly algorithm to find blunt overlaps. Used for blunt ligation.

    Returns [(len(seqx), 0, 0)] when the three prime end of seqx and the five prime
    end of seqy are both reported as "blunt" (compatible with blunt ligation),
    and an empty list otherwise.

    Args:
        seqx (_Dseqrecord): The first sequence
        seqy (_Dseqrecord): The second sequence
        limit (int): There for compatibility, but it is ignored

    Returns:
        list[SequenceOverlap]: A list of overlaps between the two sequences

    >>> from pydna.assembly2 import blunt_overlap
    >>> from pydna.dseqrecord import Dseqrecord
    >>> x = Dseqrecord("AAAAAA")
    >>> y = Dseqrecord("TTTTTT")
    >>> blunt_overlap(x, y)
    [(6, 0, 0)]
    """
    if seqx.seq.three_prime_end()[0] != "blunt":
        return []
    if seqy.seq.five_prime_end()[0] != "blunt":
        return []
    return [(len(seqx), 0, 0)]
243
+
244
+
245
def common_sub_strings(
    seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25
) -> list[SequenceOverlap]:
    """
    Assembly algorithm to find common substrings of length at least `limit`
    (see the example below). See the docs of the function common_sub_strings_str
    for more details. It is case insensitive.

    Circular sequences are searched on a doubled copy of their sequence, so that
    matches spanning the origin are found; redundant matches produced by the
    doubling are then removed.

    >>> from pydna.dseqrecord import Dseqrecord
    >>> x = Dseqrecord("TAAAAAAT")
    >>> y = Dseqrecord("CCaAaAaACC")
    >>> common_sub_strings(x, y, limit=5)
    [(1, 2, 6), (1, 3, 5), (2, 2, 5)]
    """
    text_x = str(seqx.seq).upper()
    text_y = str(seqy.seq).upper()
    if seqx.circular:
        text_x = text_x * 2
    if seqy.circular:
        text_y = text_y * 2
    hits = common_sub_strings_str(text_x, text_y, limit)

    # Nothing else to do for two linear sequences
    if not (seqx.circular or seqy.circular):
        return hits

    # Remove matches that start on the second copy of the sequence
    if seqx.circular:
        hits = [hit for hit in hits if hit[0] < len(seqx)]
    if seqy.circular:
        hits = [hit for hit in hits if hit[1] < len(seqy)]

    # Trim lengths that span more than the sequence (at least one sequence is
    # circular when we reach this point)
    longest_allowed = min(len(seqx), len(seqy))
    hits = [(hit[0], hit[1], min(hit[2], longest_allowed)) for hit in hits]

    # Edge case where the sequences are identical: return the first full-length match
    if len(seqx.seq) == len(seqy.seq):
        for hit in hits:
            if hit[2] == len(seqx.seq):
                return [hit]

    # Remove duplicate matches, see example below
    # Let's imagine the following two sequences, where either seqy or both are circular
    # seqx: 01234
    # seqy: 123450, circular
    #
    # common_sub_strings would return [(0, 5, 5), (1, 0, 4)]
    # Actually, (1, 0, 4) is a subset of (0, 5, 5), the part
    # that does not span the origin. To remove matches like this,
    # we find matches where the origin is spanned in one of the sequences
    # only, and then remove the subset of that match that does not span the origin.
    redundant = set()
    for start_x, start_y, length in hits:
        x_spans_origin = seqx.circular and start_x + length > len(seqx)
        y_spans_origin = seqy.circular and start_y + length > len(seqy)
        if x_spans_origin and not y_spans_origin:
            shift = len(seqx) - start_x
            redundant.add((0, start_y + shift, length - shift))
        elif not x_spans_origin and y_spans_origin:
            shift = len(seqy) - start_y
            redundant.add((start_x + shift, 0, length - shift))
    return [hit for hit in hits if hit not in redundant]
307
+
308
+
309
def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
    """
    Assembly algorithm to find terminal overlaps (e.g. for Gibson assembly).

    Only matches that start at the beginning of seqy and finish at the end of
    seqx are kept (after trimming, see below). The order matters, we want
    alignments like:

    ```
    seqx:    oooo------xxxx
    seqy:              xxxx------oooo
    Product: oooo------xxxx------oooo

    Not like:

    seqx:               oooo------xxxx
    seqy:     xxxx------oooo
    Product (unwanted): oooo
    ```

    Args:
        seqx (_Dseqrecord): The first sequence
        seqy (_Dseqrecord): The second sequence
        limit (int): Minimum length of the overlap

    Returns:
        list[SequenceOverlap]: A list of overlaps between the two sequences

    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.assembly2 import gibson_overlap
    >>> x = Dseqrecord("ttactaAAAAAA")
    >>> y = Dseqrecord("AAAAAAcgcacg")
    >>> gibson_overlap(x, y, limit=5)
    [(6, 0, 6), (7, 0, 5)]
    >>> gibson_overlap(y, x, limit=5)
    []
    """

    # Because Gibson enzymes remove 5' overhangs, we remove them from the sequence
    # when looking for homology, then we shift the location of the second fragment accordingly.
    # This is only relevant for linear fragments, so we don't need to worry about
    # shifting locations for circular fragments.
    x_left_ovhg = seqx.seq.ovhg
    x_right_ovhg = seqx.seq.watson_ovhg()
    y_left_ovhg = seqy.seq.ovhg
    y_right_ovhg = seqy.seq.watson_ovhg()

    trim_x_left = -x_left_ovhg if x_left_ovhg < 0 else 0
    trim_x_right = x_right_ovhg if x_right_ovhg < 0 else None
    trim_y_left = -y_left_ovhg if y_left_ovhg < 0 else 0
    trim_y_right = y_right_ovhg if y_right_ovhg < 0 else None

    stringx = str(seqx.seq[trim_x_left:trim_x_right]).upper()
    stringy = str(seqy.seq[trim_y_left:trim_y_right]).upper()

    # Keep terminal matches only, and express their coordinates on the
    # untrimmed sequences
    return [
        (start_x + trim_x_left, start_y + trim_y_left, length)
        for start_x, start_y, length in common_sub_strings_str(stringx, stringy, limit)
        if start_y == 0 and start_x + length == len(stringx)
    ]
367
+
368
+
369
def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=0):
    """
    Assembly algorithm for ligation of sticky ends.

    The three prime end of seqx is tested against the five prime end of seqy
    using `sum_is_sticky`, to which `limit` is forwarded.
    For now, if limit 0 / False (default) only full overlaps are considered.
    Otherwise, partial overlaps are also returned.

    Args:
        seqx (_Dseqrecord): The first sequence
        seqy (_Dseqrecord): The second sequence
        limit (int): Minimum length of the overlap

    Returns:
        list[SequenceOverlap]: A list of overlaps between the two sequences


    Ligation of fully overlapping sticky ends, note how the order matters

    >>> from pydna.dseq import Dseq
    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.assembly2 import sticky_end_sub_strings
    >>> x = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 0, 3))
    >>> y = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 3, 0))
    >>> sticky_end_sub_strings(x, y, limit=0)
    [(3, 0, 3)]
    >>> sticky_end_sub_strings(y, x, limit=0)
    []

    Ligation of partially overlapping sticky ends, specified with limit=True

    >>> x = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 0, 2))
    >>> y = Dseqrecord(Dseq.from_full_sequence_and_overhangs("AAAAAA", 3, 0))
    >>> sticky_end_sub_strings(x, y, limit=0)
    []
    >>> sticky_end_sub_strings(x, y, limit=True)
    [(4, 0, 2)]

    """
    right_end_of_x = seqx.seq.three_prime_end()
    left_end_of_y = seqy.seq.five_prime_end()
    overlap = sum_is_sticky(right_end_of_x, left_end_of_y, limit)
    if not overlap:
        return []
    # The overlap sits at the very end of seqx and the very start of seqy
    return [(len(seqx) - overlap, 0, overlap)]
413
+
414
+
415
def zip_match_leftwards(
    seqx: _SeqRecord, seqy: _SeqRecord, match: SequenceOverlap
) -> SequenceOverlap:
    """
    Starting from the rightmost edge of the match, return a new match encompassing the max
    number of bases. This can be used to return a longer match if a primer aligns for longer
    than the limit or a shorter match if there are mismatches. This is convenient to maintain
    as many features as possible. It is used in PCR assembly.

    >>> seq = _Dseqrecord('AAAAACGTCCCGT')
    >>> primer = _Dseqrecord('ACGTCCCGT')
    >>> match = (13, 9, 0) # an empty match at the end of each
    >>> zip_match_leftwards(seq, primer, match)
    (4, 0, 9)

    Works in circular molecules if the match spans the origin:
    >>> seq = _Dseqrecord('TCCCGTAAAAACG', circular=True)
    >>> primer = _Dseqrecord('ACGTCCCGT')
    >>> match = (6, 9, 0)
    >>> zip_match_leftwards(seq, primer, match)
    (10, 0, 9)

    """

    def _right_edge(record, start, length):
        # In circular sequences, the match may go beyond the left-most edge of
        # the sequence if it spans the origin. Walking leftwards from the right
        # edge would then stop at the origin. The query string of a circular
        # record contains the sequence twice, so for those cases we move the
        # right edge onto the second copy, and shift back at the end.
        edge = start + length
        if isinstance(record, _Dseqrecord) and record.circular and edge <= len(record):
            edge += len(record)
        return edge

    query_x = seqrecord2_uppercase_DNA_string(seqx)
    query_y = seqrecord2_uppercase_DNA_string(seqy)

    start_x, start_y, length = match
    end_on_x = _right_edge(seqx, start_x, length)
    end_on_y = _right_edge(seqy, start_y, length)

    # Count identical bases, walking leftwards from the right edge
    paired_bases = zip(reversed(query_x[:end_on_x]), reversed(query_y[:end_on_y]))
    count = sum(
        1 for _ in _itertools.takewhile(lambda pair: pair[0] == pair[1], paired_bases)
    )

    # Shift back by length if needed
    return ((end_on_x - count) % len(seqx), (end_on_y - count) % len(seqy), count)
471
+
472
+
473
def zip_match_rightwards(
    seqx: _Dseqrecord, seqy: _Dseqrecord, match: SequenceOverlap
) -> SequenceOverlap:
    """Same as zip_match_leftwards, but towards the right.

    Starting from the start positions of the match, count how many consecutive
    bases are identical in both sequences. The length of the input match is
    ignored, and the start positions are returned unchanged.
    """
    query_x = seqrecord2_uppercase_DNA_string(seqx)
    query_y = seqrecord2_uppercase_DNA_string(seqy)

    start_on_x, start_on_y, _ = match
    paired_bases = zip(query_x[start_on_x:], query_y[start_on_y:])
    count = sum(
        1 for _ in _itertools.takewhile(lambda pair: pair[0] == pair[1], paired_bases)
    )
    return (start_on_x, start_on_y, count)
488
+
489
+
490
def seqrecord2_uppercase_DNA_string(seqr: _SeqRecord) -> str:
    """
    Return the sequence of a record as an upper case string where U is replaced by T.
    If the record is a circular Dseqrecord, the string contains the sequence twice.
    This is used for PCR, to support primers with U's (e.g. for USER cloning).
    """
    dna = str(seqr.seq).upper().replace("U", "T")
    is_circular = isinstance(seqr, _Dseqrecord) and seqr.circular
    return dna * 2 if is_circular else dna
499
+
500
+
501
def primer_template_overlap(
    seqx: _Dseqrecord | _Primer, seqy: _Dseqrecord | _Primer, limit=25, mismatches=0
) -> list[SequenceOverlap]:
    """
    Assembly algorithm to find overlaps between a primer and a template. It accepts mismatches.
    When there are mismatches, it only returns the common part between the primer and the template.

    If seqx is a primer and seqy is a template, it represents the binding of a forward primer.
    If seqx is a template and seqy is a primer, it represents the binding of a reverse primer,
    where the primer has been passed as its reverse complement (see examples).

    Args:
        seqx (_Dseqrecord | _Primer): The primer (forward binding) or the template (reverse binding)
        seqy (_Dseqrecord | _Primer): The template (forward binding) or the primer (reverse binding)
        limit (int): Minimum length of the overlap
        mismatches (int): Maximum number of mismatches (only substitutions, no deletion or insertion)

    Raises:
        ValueError: Unless one of the sequences is a primer and the other a Dseqrecord

    Returns:
        list[SequenceOverlap]: A sorted list of overlaps between the primer and the template

    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.primer import Primer
    >>> from pydna.assembly2 import primer_template_overlap
    >>> template = Dseqrecord("AATTAGCAGCGATCGAGT", circular=True)
    >>> primer = Primer("TTAGCAGC")
    >>> primer_template_overlap(primer, template, limit=8, mismatches=0)
    [(0, 2, 8)]

    This actually represents the binding of the primer `GCTGCTAA` (reverse complement)
    >>> primer_template_overlap(template, primer, limit=8, mismatches=0)
    [(2, 0, 8)]
    >>> primer_template_overlap(primer, template.reverse_complement(), limit=8, mismatches=0)
    []
    >>> primer_template_overlap(primer.reverse_complement(), template, limit=8, mismatches=0)
    []
    """

    if isinstance(seqx, _Primer) and isinstance(seqy, _Dseqrecord):
        primer, template, reverse_primer = seqx, seqy, False
    elif isinstance(seqx, _Dseqrecord) and isinstance(seqy, _Primer):
        primer, template, reverse_primer = seqy, seqx, True
    else:
        raise ValueError(
            "One of the sequences must be a primer and the other a Dseqrecord"
        )

    # A primer shorter than the minimal overlap cannot bind
    if len(primer) < limit:
        return []

    subject = seqrecord2_uppercase_DNA_string(template)
    # The first `limit` bases of a reverse primer (given as reverse complement)
    # or the last `limit` bases of a forward primer are searched in the template
    anchor = primer[:limit] if reverse_primer else primer[-limit:]
    query = seqrecord2_uppercase_DNA_string(anchor)

    # Fuzzy pattern allowing up to `mismatches` substitutions; the template is
    # searched both forwards and backwards ("(?r)" flag)
    pattern = f"({query}){{s<={mismatches}}}"
    re_matches = list(regex.finditer(pattern, subject, overlapped=True))
    re_matches += list(regex.finditer("(?r)" + pattern, subject, overlapped=True))

    out = set()
    for re_match in re_matches:
        start, end = re_match.span()

        # For circular sequences the same match is returned twice unless it falls
        # on the origin, we eliminate duplicates here
        if start >= len(template):
            continue

        # The calls below extend the match beyond the limit if the primer aligns
        # more than that, and reduce the match if the primer has mismatches.
        # The starting matches are in the same format as other assembly algorithms.
        if reverse_primer:
            starting_match = (start, 0, end - start)
            out.add(zip_match_rightwards(template, primer, starting_match))
        else:
            starting_match = (len(primer) - limit, start, end - start)
            out.add(zip_match_leftwards(primer, template, starting_match))

    return sorted(out)
594
+
595
+
596
def fill_left(seq: _Dseq) -> _Dseq:
    """Fill the left overhang of a sequence with the complementary sequence.

    The returned Dseq is built with an overhang value of 0 on the left side.
    """
    left_ovhg = seq.ovhg
    watson = seq.watson
    crick = seq.crick

    if left_ovhg < 0:
        # Watson 5' overhang: extend the crick strand
        crick = crick + reverse_complement(watson[:-left_ovhg])
    elif left_ovhg > 0:
        # Crick 5' overhang: extend the watson strand
        watson = reverse_complement(crick[-left_ovhg:]) + watson

    return _Dseq(watson, crick, 0)
609
+
610
+
611
def fill_right(seq: _Dseq) -> _Dseq:
    """Fill the right overhang of a sequence with the complementary sequence.

    The left overhang (`seq.ovhg`) is kept as it is.
    """
    right_ovhg = seq.watson_ovhg()
    watson = seq.watson
    crick = seq.crick

    if right_ovhg < 0:
        # Watson 3' overhang (label kept from the original code): extend the watson strand
        watson = watson + reverse_complement(crick[:-right_ovhg])
    elif right_ovhg > 0:
        # Crick 3' overhang (label kept from the original code): extend the crick strand
        crick = reverse_complement(watson[-right_ovhg:]) + crick

    return _Dseq(watson, crick, seq.ovhg)
626
+
627
+
628
def fill_dseq(seq: _Dseq) -> _Dseq:
    """Fill both overhangs of a sequence with the complementary sequence.

    The right overhang is filled first, then the left one.
    """
    right_filled = fill_right(seq)
    return fill_left(right_filled)
631
+
632
+
633
def reverse_complement_assembly(
    assembly: EdgeRepresentationAssembly, fragments: list[_Dseqrecord]
) -> EdgeRepresentationAssembly:
    """Complement an assembly, i.e. reverse the order of the fragments and the orientation of the overlaps.

    Each edge (u, v, loc_u, loc_v) becomes (-v, -u, flipped loc_v, flipped loc_u),
    where locations are flipped with respect to the length of their fragment, and
    the order of the edges is reversed. Fragment ids are 1-based indexes in `fragments`.
    """
    flipped_edges = []
    for u, v, loc_u, loc_v in assembly:
        len_u = len(fragments[abs(u) - 1])
        len_v = len(fragments[abs(v) - 1])
        flipped_edges.append((-v, -u, loc_v._flip(len_v), loc_u._flip(len_u)))
    flipped_edges.reverse()
    return flipped_edges
643
+
644
+
645
def filter_linear_subassemblies(
    linear_assemblies: list[EdgeRepresentationAssembly],
    circular_assemblies: list[EdgeRepresentationAssembly],
    fragments: list[_Dseqrecord],
) -> list[EdgeRepresentationAssembly]:
    """Remove linear assemblies which are sub-assemblies of circular assemblies.

    Linear assemblies are tested against both the circular assemblies and their
    reverse complements.
    """
    references = circular_assemblies + [
        reverse_complement_assembly(circular, fragments)
        for circular in circular_assemblies
    ]

    kept = []
    for linear in linear_assemblies:
        if any(is_sublist(linear, reference, True) for reference in references):
            continue
        kept.append(linear)

    # The original author notes that also testing the reverse complement of each
    # linear assembly against the circular ones should not be necessary.
    return kept
662
+
663
+
664
def remove_subassemblies(
    assemblies: list[EdgeRepresentationAssembly],
) -> list[EdgeRepresentationAssembly]:
    """Filter out subassemblies, i.e. assemblies that are contained within another assembly.

    For example:
        [(1, 2, '1[8:14]:2[1:7]'), (2, 3, '2[10:17]:3[1:8]')]
        [(1, 2, '1[8:14]:2[1:7]')]
    The second one is a subassembly of the first one.

    The returned list is ordered from the longest to the shortest assembly.
    """
    kept = []
    # Longest first, so that an assembly is always compared to the longer ones
    for candidate in sorted(assemblies, key=len, reverse=True):
        if any(is_sublist(candidate, larger) for larger in kept):
            continue
        kept.append(candidate)
    return kept
685
+
686
+
687
def assembly2str(assembly: EdgeRepresentationAssembly) -> str:
    """Convert an assembly to a compact string representation, for example:
    ((1, 2, [8:14], [1:7]),(2, 3, [10:17], [1:8]))
    becomes:
    ('1[8:14]:2[1:7]', '2[10:17]:3[1:8]')

    The reason for this is that by default, a feature '[8:14]' when present in a tuple
    is printed to the console as `SimpleLocation(ExactPosition(8), ExactPosition(14), strand=1)` (very long).
    """
    edge_strings = [
        "{}{}:{}{}".format(u, loc_u, v, loc_v) for u, v, loc_u, loc_v in assembly
    ]
    return str(tuple(edge_strings))
697
+
698
+
699
def assembly2str_tuple(assembly: EdgeRepresentationAssembly) -> str:
    """Convert an assembly to a string representation where the fragment ids are
    kept as they are and the locations are converted to strings, like
    ((1, 2, [8:14], [1:7]),(2, 3, [10:17], [1:8]))
    """
    edges = []
    for u, v, loc_u, loc_v in assembly:
        edges.append((u, v, str(loc_u), str(loc_v)))
    return str(tuple(edges))
704
+
705
+
706
def assembly_has_mismatches(
    fragments: list[_Dseqrecord], assembly: EdgeRepresentationAssembly
) -> bool:
    """Check if an assembly has mismatches, i.e. if for any edge the sequences
    extracted from both fragments at the overlap locations differ (case insensitive).

    The original author notes that this should never happen.

    Args:
        fragments (list[_Dseqrecord]): The fragments; fragment ids in the assembly are
            1-based indexes in this list, negative ids mean the reverse complement
        assembly (EdgeRepresentationAssembly): The assembly to check

    Returns:
        bool: True if at least one edge has a mismatch, False otherwise
    """

    def _oriented(fragment_id):
        if fragment_id > 0:
            return fragments[fragment_id - 1]
        return fragments[-fragment_id - 1].reverse_complement()

    for u, v, loc_u, loc_v in assembly:
        seq_u = _oriented(u)
        seq_v = _oriented(v)
        # TODO: Check issue where extraction failed, and whether it would give problems here
        overlap_on_u = str(loc_u.extract(seq_u).seq).upper()
        overlap_on_v = str(loc_v.extract(seq_v).seq).upper()
        if overlap_on_u != overlap_on_v:
            return True
    return False
720
+
721
+
722
def assembly_is_circular(
    assembly: EdgeRepresentationAssembly, fragments: list[_Dseqrecord]
) -> bool:
    """
    Based on the topology of the locations of an assembly, determine if it is circular.
    This does not work for insertion assemblies, that's why assemble takes the optional argument is_insertion.

    The assembly can only be circular if it starts and ends on the same fragment.
    In that case, it is circular if that fragment is a circular Dseqrecord, or if
    the location used on the first edge starts after the location used on the last edge.
    """
    first_id = assembly[0][0]
    if first_id != assembly[-1][1]:
        return False

    first_fragment = fragments[abs(first_id) - 1]
    if isinstance(first_fragment, _Dseqrecord) and first_fragment.circular:
        return True

    leaving_start = _location_boundaries(assembly[0][2])[0]
    entering_start = _location_boundaries(assembly[-1][3])[0]
    return leaving_start > entering_start
741
+
742
+
743
def assemble(
    fragments: list[_Dseqrecord],
    assembly: EdgeRepresentationAssembly,
    is_insertion: bool = False,
) -> _Dseqrecord:
    """Generate a Dseqrecord from an assembly and a list of fragments.

    Args:
        fragments (list[_Dseqrecord]): The fragments referenced by the assembly
            (elements that are not Dseqrecords, e.g. primers, are converted)
        assembly (EdgeRepresentationAssembly): The assembly in edge representation
        is_insertion (bool): If True, the product is treated as linear without
            checking the topology of the assembly

    Raises:
        ValueError: If the overlapping sequences of any edge differ

    Returns:
        _Dseqrecord: The assembled sequence, with the features of the fragments
    """
    is_circular = False if is_insertion else assembly_is_circular(assembly, fragments)

    subfragment_representation = edge_representation2subfragment_representation(
        assembly, is_circular
    )

    # Sanity check: the sequences at both sides of each edge must be the same
    if assembly_has_mismatches(fragments, assembly):
        raise ValueError("Mismatch in assembly")

    # We transform into Dseqrecords (for primers)
    dseqr_fragments = [
        f if isinstance(f, _Dseqrecord) else _Dseqrecord(f) for f in fragments
    ]
    subfragments = get_assembly_subfragments(
        dseqr_fragments, subfragment_representation
    )

    # Length of the overlaps between consecutive assembly fragments
    fragment_overlaps = [len(edge[-1]) for edge in assembly]

    product = _Dseqrecord(subfragments[0])
    for fragment, overlap in zip(subfragments[1:], fragment_overlaps):
        # Shift the features of the right fragment to the left by `overlap`
        offset = len(product) - overlap
        shifted_features = [feature._shift(offset) for feature in fragment.features]
        # Join the left sequence including the overlap with the right sequence without the overlap
        # we use fill_right / fill_left so that it works for ligation of sticky ends
        joined_seq = fill_right(product.seq) + fill_left(fragment.seq)[overlap:]
        product = _Dseqrecord(
            joined_seq,
            features=product.features + shifted_features,
        )

    if not is_circular:
        return product

    # For circular assemblies, close the loop and wrap origin-spanning features
    overlap = fragment_overlaps[-1]

    # Special case for blunt circularisation
    if overlap == 0:
        return product.looped()

    # Remove trailing overlap
    product = _Dseqrecord(
        fill_dseq(product.seq)[:-overlap],
        features=product.features,
        circular=True,
    )
    product_length = len(product)
    for feature in product.features:
        start, end = _location_boundaries(feature.location)
        if start >= product_length or end > product_length:
            # Wrap around the origin
            feature.location = _shift_location(feature.location, 0, product_length)

    return product
817
+
818
+
819
def annotate_primer_binding_sites(
    input_dseqr: _Dseqrecord, fragments: list[_Dseqrecord]
) -> _Dseqrecord:
    """Annotate the primer binding sites in a Dseqrecord.

    `fragments` must contain exactly three elements, the first and the last one
    being used as the forward and reverse primers. The input is not modified:
    a deep copy is returned with two extra "primer_bind" features, one covering
    the first len(fwd) positions (strand 1) and one covering the last len(rvs)
    positions (strand -1).
    """
    fwd, _, rvs = fragments
    start_rvs = len(input_dseqr) - len(rvs)

    annotated = copy.deepcopy(input_dseqr)

    def _add_primer_bind(primer, start, end, strand):
        annotated.add_feature(
            x=start,
            y=end,
            type_="primer_bind",
            strand=strand,
            label=[primer.name],
            note=["sequence: " + str(primer.seq)],
        )

    _add_primer_bind(fwd, 0, len(fwd), 1)
    _add_primer_bind(rvs, start_rvs, len(annotated), -1)
    return annotated
844
+
845
+
846
def edge_representation2subfragment_representation(
    assembly: EdgeRepresentationAssembly, is_circular: bool
) -> SubFragmentRepresentationAssembly:
    """
    Turn this kind of edge representation fragment 1, fragment 2, right edge on 1, left edge on 2
    a = [(1, 2, 'loc1a', 'loc2a'), (2, 3, 'loc2b', 'loc3b'), (3, 1, 'loc3c', 'loc1c')]
    Into this: fragment 1, left edge on 1, right edge on 1
    b = [(1, 'loc1c', 'loc1a'), (2, 'loc2a', 'loc2b'), (3, 'loc3b', 'loc3c')]

    For linear assemblies, the left edge of the first fragment and the right edge
    of the last fragment are None. The result is returned as a tuple.
    """
    edges = list(assembly)
    if is_circular:
        # The last edge also provides the left edge of the first fragment
        padded = list(assembly[-1:]) + edges
    else:
        # Dummy edges at both ends, carrying no locations
        leading = (None, assembly[0][0], None, None)
        trailing = (assembly[-1][1], None, None, None)
        padded = [leading] + edges + [trailing]

    # Each pair of consecutive edges describes one fragment: the edge entering it
    # gives the start location and the edge leaving it gives the end location
    return tuple(
        (fragment, start_location, end_location)
        for (_, fragment, _, start_location), (_, _, end_location, _) in zip(
            padded, padded[1:]
        )
    )
870
+
871
+
872
def subfragment_representation2edge_representation(
    assembly: SubFragmentRepresentationAssembly, is_circular: bool
) -> EdgeRepresentationAssembly:
    """
    Convert an assembly from subfragment representation to edge representation.

    Subfragment representation: (fragment, left edge on it, right edge on it)
        a = [(1, 'loc1c', 'loc1a'), (2, 'loc2a', 'loc2b'), (3, 'loc3b', 'loc3c')]
    Edge representation: (fragment 1, fragment 2, right edge on 1, left edge on 2)
        b = [(1, 2, 'loc1a', 'loc2a'), (2, 3, 'loc2b', 'loc3b'), (3, 1, 'loc3c', 'loc1c')]

    For circular assemblies an extra edge from the last fragment back to the
    first one is appended. An empty assembly yields an empty tuple.
    """
    # Nothing to join; also avoids indexing an empty sequence below
    if not assembly:
        return ()

    # Consecutive fragments are joined by one edge each
    joined = list(zip(assembly, assembly[1:]))
    if is_circular:
        # Close the loop: last fragment back to the first
        joined.append((assembly[-1], assembly[0]))

    return tuple(
        (frag1, frag2, right1, left2)
        for (frag1, _, right1), (frag2, left2, _) in joined
    )
898
+
899
+
900
def get_assembly_subfragments(
    fragments: list[_Dseqrecord],
    subfragment_representation: SubFragmentRepresentationAssembly,
) -> list[_Dseqrecord]:
    """Return the slices of the input fragments that take part in an assembly.

    ``subfragment_representation`` is the output of
    edge_representation2subfragment_representation: one
    ``(node, left location, right location)`` entry per fragment. A positive
    node selects ``fragments[node - 1]``; a negative node selects the reverse
    complement of ``fragments[-node - 1]``. Each selected sequence is then cut
    with extract_subfragment.

    For example, with an overlap ``CGTAA`` shared by::

        TACGTAAT
        TCGTAACGA

    the subfragments are ``TACGTAA`` and ``CGTAACGA``.

    To reproduce:
    ```
    a = Dseqrecord('TACGTAAT')
    b = Dseqrecord('TCGTAACGA')
    f = Assembly([a, b], limit=5)
    a0 = f.get_linear_assemblies()[0]
    print(assembly2str(a0))
    a0_subfragment_rep = edge_representation2subfragment_representation(a0, False)
    for f in get_assembly_subfragments([a, b], a0_subfragment_rep):
        print(f.seq)

    # prints TACGTAA and CGTAACGA
    ```
    """

    def oriented(node):
        # The sign of the node encodes the orientation of the fragment
        if node > 0:
            return fragments[node - 1]
        return fragments[-node - 1].reverse_complement()

    return [
        extract_subfragment(oriented(node), left_location, right_location)
        for node, left_location, right_location in subfragment_representation
    ]
942
+
943
+
944
def extract_subfragment(
    seq: _Dseqrecord, start_location: Location, end_location: Location
) -> _Dseqrecord:
    """Cut out the part of ``seq`` that is used in an assembly.

    The subfragment runs from the start of ``start_location`` to the end of
    ``end_location``. A ``None`` location means "up to that end of the
    sequence". When ``seq`` is circular and both locations overlap, the
    sequence is opened with ``apply_cut`` instead of being sliced.
    """
    has_start = start_location is not None
    has_end = end_location is not None
    start = _location_boundaries(start_location)[0] if has_start else 0
    end = _location_boundaries(end_location)[1] if has_end else None

    # Special case, some of it could be handled by better Dseqrecord slicing in the future
    overlapping_on_circle = (
        seq.circular
        and has_start
        and has_end
        and _locations_overlap(start_location, end_location, len(seq))
    )
    if not overlapping_on_circle:
        return seq[start:end]

    # The overhang is different for origin-spanning features, for instance
    # for a feature join{[12:13], [0:3]} in a sequence of length 13, the overhang
    # is -4, not 9
    if end > start:
        ovhg = start - end
    else:
        ovhg = start - end - len(seq)

    # Edge case: an overhang as long as the whole sequence is treated as none
    if abs(ovhg) == len(seq):
        ovhg = 0

    # The same cut is passed on both sides to open the circular sequence
    cut = ((start, ovhg), None)
    opened = seq.apply_cut(cut, cut)
    return _Dseqrecord(fill_dseq(opened.seq), features=opened.features)
970
+
971
+
972
def is_sublist(sublist: list, my_list: list, my_list_is_cyclic: bool = False) -> bool:
    """Tell whether ``sublist`` appears as a contiguous run inside ``my_list``.

    When ``my_list_is_cyclic`` is True, runs that wrap around the end of
    ``my_list`` are accepted as well.

    Examples
    --------
    >>> is_sublist([1, 2], [1, 2, 3], False)
    True
    >>> is_sublist([1, 2], [1, 3, 2], False)
    False

    # See the case here for cyclic lists
    >>> is_sublist([3, 1], [1, 2, 3], False)
    False
    >>> is_sublist([3, 1], [1, 2, 3], True)
    True
    """
    width = len(sublist)
    # Doubling the list makes wrap-around runs contiguous
    haystack = my_list + my_list if my_list_is_cyclic else my_list
    # Normalise once so that tuples and lists compare equal
    needle = list(sublist)
    window_starts = range(len(haystack) - width + 1)
    return any(list(haystack[pos : pos + width]) == needle for pos in window_starts)
996
+
997
+
998
def circular_permutation_min_abs(lst: list) -> list:
    """Returns the circular permutation of lst with the smallest absolute value first.

    If several elements share the smallest absolute value, the first of them
    becomes the new start. An empty input is returned as an empty copy
    instead of raising.

    Examples
    --------
    >>> circular_permutation_min_abs([1, 2, 3])
    [1, 2, 3]
    >>> circular_permutation_min_abs([3, 1, 2])
    [1, 2, 3]
    >>> circular_permutation_min_abs([])
    []
    """
    # min() on an empty range raises ValueError, so handle that case up front
    if not lst:
        return lst[:]

    min_abs_index = min(range(len(lst)), key=lambda i: abs(lst[i]))
    # Rotate so the chosen element comes first, keeping the cyclic order
    return lst[min_abs_index:] + lst[:min_abs_index]
1010
+
1011
+
1012
+ class Assembly:
1013
+ """Assembly of a list of DNA fragments into linear or circular constructs.
1014
+ Accepts a list of Dseqrecords (source fragments) to
1015
+ initiate an Assembly object. Several methods are available for analysis
1016
+ of overlapping sequences, graph construction and assembly.
1017
+
1018
+ The assembly contains a directed graph, where nodes represent fragments and
1019
+ edges represent overlaps between fragments:
1020
+ - The node keys are integers, representing the index of the fragment in the
1021
+ input list of fragments. The sign of the node key represents the orientation
1022
+ of the fragment, positive for forward orientation, negative for reverse orientation.
1023
+ - The edges contain the locations of the overlaps in the fragments. For an edge (u, v, key):
1024
+ - u and v are the nodes connected by the edge.
1025
+ - key is a string that represents the location of the overlap. In the format:
1026
+ 'u[start:end](strand):v[start:end](strand)'.
1027
+ - Edges have a 'locations' attribute, which is a list of two FeatureLocation objects,
1028
+ representing the location of the overlap in the u and v fragment, respectively.
1029
+ - You can think of an edge as a representation of the join of two fragments.
1030
+
1031
+ If fragment 1 and 2 share a subsequence of 6bp, [8:14] in fragment 1 and [1:7] in fragment 2,
1032
+ there will be 4 edges representing that overlap in the graph, for all possible
1033
+ orientations of the fragments (see add_edges_from_match for details):
1034
+ - `(1, 2, '1[8:14]:2[1:7]')`
1035
+ - `(2, 1, '2[1:7]:1[8:14]')`
1036
+ - `(-1, -2, '-1[0:6]:-2[10:16]')`
1037
+ - `(-2, -1, '-2[10:16]:-1[0:6]')`
1038
+
1039
+ An assembly can be thought of as a tuple of graph edges, but instead of representing them with node indexes and keys, we represent them
1040
+ as u, v, locu, locv, where u and v are the nodes connected by the edge, and locu and locv are the locations of the overlap in the first
1041
+ and second fragment. Assemblies are then represented as:
1042
+ - Linear: ((1, 2, [8:14], [1:7]), (2, 3, [10:17], [1:8]))
1043
+ - Circular: ((1, 2, [8:14], [1:7]), (2, 3, [10:17], [1:8]), (3, 1, [12:17], [1:6]))
1044
+ Note that the first and last fragment are the same in a circular assembly.
1045
+
1046
+ The following constraints are applied to remove duplicate assemblies:
1047
+ - Circular assemblies: the first subfragment is not reversed, and has the smallest index in the input fragment list.
1048
+ use_fragment_order is ignored.
1049
+ - Linear assemblies:
1050
+ - Using uid (see add_edges_from_match) to identify unique edges.
1051
+
1052
+ Parameters
1053
+ ----------
1054
+ frags : list
1055
+ A list of Dseqrecord objects.
1056
+ limit : int, optional
1057
+ The shortest shared homology to be considered, this is passed as the third argument to the `algorithm` function.
1058
+ For certain algorithms, this might be ignored.
1059
+ algorithm : function, optional
1060
+ The algorithm used to determine the shared sequences. It's a function that takes two Dseqrecord objects as inputs,
1061
+ and will get passed the third argument (limit), that may or may not be used. It must return a list of overlaps
1062
+ (see common_sub_strings for an example).
1063
+ use_fragment_order : bool, optional
1064
+ It's set to True by default to reproduce legacy pydna behaviour: only assemblies that start with the first fragment and end with the last are considered.
1065
+ You should set it to False.
1066
+ use_all_fragments : bool, optional
1067
+ Constrain the assembly to use all fragments.
1068
+
1069
+
1070
+ Examples
1071
+ --------
1072
+
1073
+ from assembly2 import Assembly, assembly2str
1074
+ from pydna.dseqrecord import Dseqrecord
1075
+
1076
+ example_fragments = (
1077
+ Dseqrecord('AacgatCAtgctcc', name='a'),
1078
+ Dseqrecord('TtgctccTAAattctgc', name='b'),
1079
+ Dseqrecord('CattctgcGAGGacgatG', name='c'),
1080
+ )
1081
+
1082
+ asm = Assembly(example_fragments, limit=5, use_fragment_order=False)
1083
+ print('Linear ===============')
1084
+ for assembly in asm.get_linear_assemblies():
1085
+ print(' ', assembly2str(assembly))
1086
+ print('Circular =============')
1087
+ for assembly in asm.get_circular_assemblies():
1088
+ print(' ', assembly2str(assembly))
1089
+
1090
+ # Prints
1091
+ Linear ===============
1092
+ ('1[8:14]:2[1:7]', '2[10:17]:3[1:8]')
1093
+ ('2[10:17]:3[1:8]', '3[12:17]:1[1:6]')
1094
+ ('3[12:17]:1[1:6]', '1[8:14]:2[1:7]')
1095
+ ('1[1:6]:3[12:17]',)
1096
+ ('2[1:7]:1[8:14]',)
1097
+ ('3[1:8]:2[10:17]',)
1098
+ Circular =============
1099
+ ('1[8:14]:2[1:7]', '2[10:17]:3[1:8]', '3[12:17]:1[1:6]')
1100
+
1101
+ """
1102
+
1103
def __init__(
    self,
    frags: list[_Dseqrecord],
    limit: int = 25,
    algorithm: AssemblyAlgorithmType = common_sub_strings,
    use_fragment_order: bool = True,
    use_all_fragments: bool = False,
):
    """Build the assembly graph from ``frags``.

    Every fragment gets a positive node (as given) and a negative node
    (reverse complement). ``algorithm`` is then called for every pair of
    distinct fragments in all four relative orientations, and each match it
    returns is stored in the graph through add_edges_from_match.
    """
    self.fragments = frags
    self.limit = limit
    self.algorithm = algorithm
    self.use_fragment_order = use_fragment_order
    self.use_all_fragments = use_all_fragments

    # TODO: allow for the same fragment to be included more than once?
    self.G = _nx.MultiDiGraph()
    # Add positive and negative nodes for forward and reverse fragments
    self.G.add_nodes_from(
        (index, {"seq": frag}) for index, frag in enumerate(frags, start=1)
    )
    self.G.add_nodes_from(
        (-index, {"seq": frag.reverse_complement()})
        for index, frag in enumerate(frags, start=1)
    )

    # Iterate over all possible combinations of fragments
    forward_nodes = [node for node in self.G.nodes if node > 0]
    for i, j in _itertools.combinations(forward_nodes, 2):
        # All the relative orientations of the fragments in the pair
        for u, v in _itertools.product([i, -i], [j, -j]):
            u_seq = self.G.nodes[u]["seq"]
            v_seq = self.G.nodes[v]["seq"]
            for match in algorithm(u_seq, v_seq, limit):
                self.add_edges_from_match(match, u, v, u_seq, v_seq)
1139
+
1140
@classmethod
def assembly_is_valid(
    cls,
    fragments: list[_Dseqrecord | _Primer],
    assembly: EdgeRepresentationAssembly,
    is_circular: bool,
    use_all_fragments: bool,
    is_insertion: bool = False,
) -> bool:
    """
    Returns True if the assembly is valid, False otherwise.

    The checks are applied in this order, returning False at the first failure:

    1. ``is_circular`` must not be None and ``assembly`` must not be empty.
    2. If ``use_all_fragments`` is set, the number of distinct fragments
       (ignoring orientation) in the edges must equal ``len(fragments)``.
    3. For circular assemblies, the first node of the first edge must equal
       the last node of the last edge.
    4. On every fragment that is a Primer or is not circular, the location
       where it is entered must end strictly before the location where it is
       left (see the figures in the body).
    5. No fragment may be used more than once, in either orientation.

    ``is_insertion`` only affects check 5: the subfragments are then derived
    as if the assembly were circular.
    """
    if is_circular is None:
        return False

    # Linear assemblies may get begin-1-end, begin-2-end, these are removed here.
    if len(assembly) == 0:
        return False

    # abs() drops the orientation, so node 1 and node -1 count as the same fragment
    if use_all_fragments and len(fragments) != len(
        set(flatten(map(abs, e[:2]) for e in assembly))
    ):
        return False

    # Here we check whether subsequent pairs of fragments are compatible, for instance:
    # Compatible (overlap of 1 and 2 occurs before overlap of 2 and 3):
    # (1,2,[2:9],[0:7]), (2,3,[12:19],[0:7])
    #      -- A --
    # 1  gtatcgtgt     -- B --
    # 2    atcgtgtactgtcatattc
    # 3                catattcaa
    # Incompatible (overlap of 1 and 2 occurs after overlap of 2 and 3):
    # (1,2,[2:9],[13:20]), (2,3,[0:7],[0:7])
    #                  -- A --
    # 1  -- B --     gtatcgtgt
    # 2  catattcccccccatcgtgtactgt
    # 3  catattcaa
    # Redundant: overlap of 1 and 2 ends at the same spot as overlap of 2 and 3
    # (1,2,[2:9],[1:8]), (2,3,[0:8],[0:8])
    #     -- A --
    #   gtatcgtgt
    #    catcgtgtactgtcatattc
    #    catcgtgtactgtcatattc
    #    -- B ---
    if is_circular:
        # In a circular assembly, first and last fragment must be the same
        if assembly[0][0] != assembly[-1][1]:
            return False
        # Pair every edge with the next one, wrapping around at the end
        edge_pairs = zip(assembly, assembly[1:] + assembly[:1])
    else:
        edge_pairs = zip(assembly, assembly[1:])

    # In each pair, the first edge enters fragment v1 at start_location and the
    # second edge leaves that same fragment at end_location
    for (_u1, v1, _, start_location), (_u2, _v2, end_location, _) in edge_pairs:
        # Incompatible as described in figure above
        fragment = fragments[abs(v1) - 1]
        # The ordering test is skipped for circular fragments; ">=" also
        # rejects the redundant case where both overlaps end at the same spot
        if (
            isinstance(fragment, _Primer) or not fragment.circular
        ) and _location_boundaries(start_location)[1] >= _location_boundaries(
            end_location
        )[
            1
        ]:
            return False

    # Fragments are used only once
    nodes_used = [
        f[0]
        for f in edge_representation2subfragment_representation(
            assembly, is_circular or is_insertion
        )
    ]
    if len(nodes_used) != len(set(map(abs, nodes_used))):
        return False

    return True
1215
+
1216
def add_edges_from_match(
    self,
    match: SequenceOverlap,
    u: int,
    v: int,
    first: _Dseqrecord,
    secnd: _Dseqrecord,
):
    """Store a match returned by the `algorithm` function as edges of the graph.

    ``match`` unpacks into (start on ``first``, start on ``secnd``, length).
    Two edges are added for it (see the Assembly class documentation for the
    edge format):

    - ``(u, v)`` with the locations of the match on ``first`` and ``secnd``.
    - ``(-v, -u)`` with both locations flipped and swapped, i.e. the same
      join seen on the reverse complements.

    Both edges carry the same ``uid`` attribute, built from the forward
    orientation, so they can be recognised as the same join.
    """
    start_u, start_v, length = match
    len_u, len_v = len(first), len(secnd)

    if length != 0:
        # We use shift_location with 0 to wrap origin-spanning features
        loc_u = _shift_location(SimpleLocation(start_u, start_u + length), 0, len_u)
        loc_v = _shift_location(SimpleLocation(start_v, start_v + length), 0, len_v)
    else:
        # Edge case, blunt ligation: zero-length locations
        loc_u = SimpleLocation(start_u, start_u)
        loc_v = SimpleLocation(start_v, start_v)

    forward_locs = [loc_u, loc_v]
    # Flip the locations to get the reverse complement, then swap their order
    flipped_u = loc_u._flip(len_u)
    flipped_v = loc_v._flip(len_v)
    reverse_locs = [flipped_v, flipped_u]

    # Unique id that identifies the edge in either orientation
    shared_uid = f"{u}{loc_u}:{v}{loc_v}"

    for source, target, locations in ((u, v, forward_locs), (-v, -u, reverse_locs)):
        edge_key = f"{source}{locations[0]}:{target}{locations[1]}"
        self.G.add_edge(source, target, edge_key, locations=locations, uid=shared_uid)
1261
+
1262
def format_assembly_edge(
    self, graph_edge: tuple[int, int, str]
) -> AssemblyEdgeType:
    """Turn a graph edge (u, v, key) into the assembly edge (u, v, locu, locv).

    The two locations are read from the ``locations`` attribute stored on
    that edge of ``self.G``.
    """
    source, target, edge_key = graph_edge
    edge_data = self.G.get_edge_data(source, target, edge_key)
    loc_source, loc_target = edge_data["locations"]
    return source, target, loc_source, loc_target
1269
+
1270
def get_linear_assemblies(
    self, only_adjacent_edges: bool = False, max_assemblies: int = 50
) -> list[EdgeRepresentationAssembly]:
    """Return the valid linear assemblies found in the graph.

    Node paths between two mock nodes ("begin" and "end") are enumerated,
    expanded into assemblies, filtered with assembly_is_valid and, if
    ``only_adjacent_edges`` is set, with assembly_uses_only_adjacent_edges.
    The result is finally passed through remove_subassemblies.

    Raises
    ------
    ValueError
        If the number of candidate assemblies (before validation) exceeds
        ``max_assemblies``.
    """
    # Work on a copy of the graph since the begin and end mock nodes are added
    graph = _nx.MultiDiGraph(self.G)
    graph.add_nodes_from(["begin", "end"])

    if self.use_fragment_order:
        # Path must start with the first fragment and end with the last
        last = len(self.fragments)
        graph.add_edge("begin", 1)
        graph.add_edge("begin", -1)
        graph.add_edge(last, "end")
        graph.add_edge(-last, "end")
    else:
        # Any fragment node may start or end a path ("begin"/"end" are strings)
        fragment_nodes = [node for node in graph.nodes if type(node) is int]
        for node in fragment_nodes:
            graph.add_edge("begin", node)
            graph.add_edge(node, "end")

    node_paths = self.get_unique_linear_paths(graph)
    candidate_count = self.get_possible_assembly_number(node_paths)
    if candidate_count > max_assemblies:
        raise ValueError(
            f"Too many assemblies ({candidate_count} pre-validation) to assemble"
        )

    candidates = []
    for node_path in node_paths:
        candidates.extend(self.node_path2assembly_list(node_path, False))

    valid = []
    for candidate in candidates:
        if self.assembly_is_valid(
            self.fragments, candidate, False, self.use_all_fragments
        ):
            valid.append(candidate)

    if only_adjacent_edges:
        valid = [
            a for a in valid if self.assembly_uses_only_adjacent_edges(a, False)
        ]
    return remove_subassemblies(valid)
1312
+
1313
def node_path2assembly_list(
    self, cycle: list[int], circular: bool
) -> list[EdgeRepresentationAssembly]:
    """Expand a node path such as [1, 2, 3] into every assembly that follows it.

    Two consecutive nodes may be connected by several edges, so one node path
    can give several assemblies: the result contains one assembly per
    combination of edges. For example, two overlaps between 1 and 2 and a
    single overlap between 2 and 3 give 2 assemblies for the path [1, 2, 3].
    If ``circular`` is True, the last node is also joined to the first one.
    """
    following = cycle[1:] + cycle[:1] if circular else cycle[1:]

    # For each step of the path, all the graph edges that could be used
    edge_choices = [
        [(u, v, key) for key in self.G[u][v]] for u, v in zip(cycle, following)
    ]

    return [
        tuple(self.format_assembly_edge(edge) for edge in chosen_edges)
        for chosen_edges in _itertools.product(*edge_choices)
    ]
1332
+
1333
def get_unique_linear_paths(
    self, G_with_begin_end: _nx.MultiDiGraph, max_paths=10000
) -> list[list[int]]:
    """Return the node paths from "begin" to "end", without the mock nodes.

    Paths using the same fragment twice (in any orientation) are dropped.
    If ``self.use_all_fragments`` is set, only paths through every fragment
    are kept. A path is also dropped when its reverse complement (nodes
    reversed and negated) has already been kept.
    """
    # Converting to DiGraph collapses parallel edges, so [1, 2, 3] is returned
    # only once even if multiple edges connect 1 and 2 or 2 and 3.
    simple_graph = _nx.DiGraph(G_with_begin_end)
    n_fragments = len(self.fragments)

    # Cutoff has a different meaning of what one would expect, see https://github.com/networkx/networkx/issues/2762
    all_paths = _nx.all_simple_paths(
        simple_graph,
        "begin",
        "end",
        cutoff=(n_fragments + 1),
    )

    unique_node_paths = []
    for full_path in limit_iterator(all_paths, max_paths):
        # Strip the begin and end mock nodes
        path = full_path[1:-1]

        # Remove those that contain the same node twice
        if len(path) != len(set(map(abs, path))):
            continue

        if self.use_all_fragments and len(path) != n_fragments:
            continue

        # For each path, we check if there are reverse complement duplicates
        # See: https://github.com/manulera/OpenCloning_backend/issues/160
        reverse_complement_path = [-node for node in reversed(path)]
        if reverse_complement_path not in unique_node_paths:
            unique_node_paths.append(path)

    return unique_node_paths
1369
+
1370
def get_possible_assembly_number(self, paths: list[list[int]]) -> int:
    """
    Count how many assemblies the given node paths can expand into.

    For each path (a list of nodes), the count is the product of the number
    of edges in ``self.G`` between each pair of consecutive nodes. Pairs that
    are not connected are skipped rather than counted as zero. The counts of
    all paths are added up.
    """
    graph = self.G
    total = 0
    for node_path in paths:
        combinations_in_path = 1
        for source, target in zip(node_path, node_path[1:]):
            if target not in graph[source]:
                continue
            combinations_in_path *= len(graph[source][target])
        total += combinations_in_path
    return total
1384
+
1385
def get_circular_assemblies(
    self, only_adjacent_edges: bool = False, max_assemblies: int = 50
) -> list[EdgeRepresentationAssembly]:
    """Return the valid circular assemblies found in the graph.

    Simple cycles of the graph are rotated so that the node with the smallest
    absolute value comes first, and only cycles starting with a positive node
    are kept. The remaining cycles are expanded into assemblies and filtered
    with assembly_is_valid and, if ``only_adjacent_edges`` is set, with
    assembly_uses_only_adjacent_edges.

    Raises
    ------
    ValueError
        If the number of candidate assemblies (before validation) exceeds
        ``max_assemblies``.
    """
    n_fragments = len(self.fragments)
    raw_cycles = limit_iterator(
        _nx.cycles.simple_cycles(self.G, length_bound=n_fragments),
        10000,
    )

    # simple_cycles returns node lists such as [1, 2, 3], not assemblies.
    # We apply constrains already here because sometimes the combinatorial explosion is too large
    cycles = []
    for raw_cycle in raw_cycles:
        cycle = circular_permutation_min_abs(raw_cycle)
        # The first node must be in its initial (forward) orientation
        if not cycle[0] > 0:
            continue
        if self.use_all_fragments and len(cycle) != n_fragments:
            continue
        # Remove cycles with duplicates
        if len(cycle) != len(set(map(abs, cycle))):
            continue
        cycles.append(cycle)

    # Close each cycle so that the edge last -> first is counted as well
    possible_assembly_number = self.get_possible_assembly_number(
        [cycle + cycle[:1] for cycle in cycles]
    )
    if possible_assembly_number > max_assemblies:
        raise ValueError(
            f"Too many assemblies ({possible_assembly_number} pre-validation) to assemble"
        )

    candidates = []
    for cycle in cycles:
        candidates.extend(self.node_path2assembly_list(cycle, True))

    valid = [
        candidate
        for candidate in candidates
        if self.assembly_is_valid(
            self.fragments, candidate, True, self.use_all_fragments
        )
    ]
    if only_adjacent_edges:
        valid = [a for a in valid if self.assembly_uses_only_adjacent_edges(a, True)]
    return valid
1428
+
1429
def format_insertion_assembly(
    self, assembly: EdgeRepresentationAssembly
) -> EdgeRepresentationAssembly | None:
    """Sorts the edges of a cycle so that they represent an insertion assembly if possible,
    else returns None.

    Here we check if one of the joins between fragments represents the edges of an insertion assembly.
    The fragment must be linear, and the join must be as indicated below

    ```
    --------         -------           Fragment 1
        ||            ||
        xxxxxxxx      ||               Fragment 2
              ||      ||
              oooooooooo               Fragment 3
    ```
    The above example will be [(1, 2, [4:6], [0:2]), (2, 3, [6:8], [0:2]), (3, 1, [8:10], [9:11])]

    These could be returned in any order by simple_cycles, so we sort the edges so that the first
    and last `u` and `v` match the fragment that gets the insertion (1 in the example above).

    Returns None unless exactly one pair of consecutive edges qualifies.
    """
    # Indexes of the edges whose target fragment could receive the insertion
    edge_pair_index = list()

    # Pair edges with one another (each edge with the next one, wrapping around)
    for i, ((_u1, v1, _, end_location), (_u2, _v2, start_location, _)) in enumerate(
        zip(assembly, assembly[1:] + assembly[:1])
    ):
        # v1 is the fragment entered by the first edge of the pair and left by the second
        fragment = self.fragments[abs(v1) - 1]
        # Find the pair of edges that should be last and first: (3, 1, [8:10], [9:11]), (1, 2, [4:6], [0:2]) in
        # the example above. Only one of the pairs of edges should satisfy this condition for the topology to make sense.
        left_of_insertion = _location_boundaries(start_location)[0]
        right_of_insertion = _location_boundaries(end_location)[0]
        if not fragment.circular and (
            right_of_insertion >= left_of_insertion
            # The below condition is for single-site integration.
            # The reason to use locations_overlap instead of equality is because the location might extend
            # left of right. For example, let's take ACCGGTTT as homology arm for an integration:
            #
            # insert aaACCGGTTTccACCGGTTTtt
            # genome aaACCGGTTTtt
            #
            # The locations of homology on the genome are [0:10] and [2:12], so not identical
            # but they overlap.
            or _locations_overlap(start_location, end_location, len(fragment))
        ):
            edge_pair_index.append(i)

    if len(edge_pair_index) != 1:
        return None

    # Rotate so that the second edge of the selected pair comes first; the
    # first edge of the pair then ends up last
    shift_by = (edge_pair_index[0] + 1) % len(assembly)
    return assembly[shift_by:] + assembly[:shift_by]
1481
+
1482
def format_insertion_assembly_edge_case(
    self, assembly: EdgeRepresentationAssembly
) -> EdgeRepresentationAssembly:
    """
    Edge case from https://github.com/manulera/OpenCloning_backend/issues/329

    Only assemblies made of exactly two edges that connect the same two
    fragments back and forth (f1 -> f2, f2 -> f1) are considered, and only
    when the two locations differ on both fragments, overlap on both
    fragments, and the fragment that comes first after sorting is not
    circular. In every other case a copy of ``assembly`` is returned
    unchanged.

    When the edge case applies, the edges are ordered so that the one whose
    location on f1 starts first comes first, and the locations of that first
    edge are replaced by new ones built with create_location. The result is
    then a list of two edges.

    Raises
    ------
    AssertionError
        If the two overlapping regions have the same length, which is not
        expected to happen here.
    """
    same_assembly = assembly[:]

    if len(assembly) != 2:
        return same_assembly
    ((f1, f2, loc_f1_1, loc_f2_1), (_f2, _f1, loc_f2_2, loc_f1_2)) = assembly

    # Both edges must connect the same pair of fragments
    if f1 != _f1 or _f2 != f2:
        return same_assembly

    # The locations must differ on both fragments
    if loc_f2_1 == loc_f2_2 or loc_f1_2 == loc_f1_1:
        return same_assembly

    fragment1 = self.fragments[abs(f1) - 1]
    fragment2 = self.fragments[abs(f2) - 1]

    if not _locations_overlap(
        loc_f1_1, loc_f1_2, len(fragment1)
    ) or not _locations_overlap(loc_f2_2, loc_f2_1, len(fragment2)):
        return same_assembly

    # Sort to make compatible with insertion assembly
    if _location_boundaries(loc_f1_1)[0] > _location_boundaries(loc_f1_2)[0]:
        new_assembly = same_assembly[::-1]
    else:
        new_assembly = same_assembly[:]

    # Unpack again, since sorting may have swapped the roles of the fragments
    ((f1, f2, loc_f1_1, loc_f2_1), (_f2, _f1, loc_f2_2, loc_f1_2)) = new_assembly

    fragment1 = self.fragments[abs(f1) - 1]
    if fragment1.circular:
        return same_assembly
    fragment2 = self.fragments[abs(f2) - 1]

    # Extract boundaries
    f2_1_start, _ = _location_boundaries(loc_f2_1)
    f2_2_start, f2_2_end = _location_boundaries(loc_f2_2)
    f1_1_start, _ = _location_boundaries(loc_f1_1)
    f1_2_start, f1_2_end = _location_boundaries(loc_f1_2)

    # Difference in length between the regions spanned by both locations on each fragment
    overlap_diff = len(fragment1[f1_1_start:f1_2_end]) - len(
        fragment2[f2_1_start:f2_2_end]
    )

    if overlap_diff == 0:
        # Raised explicitly: a bare `assert False` is removed under `python -O`
        raise AssertionError("Overlap is 0")

    if overlap_diff > 0:
        new_loc_f1_1 = create_location(
            f1_1_start, f1_2_start - overlap_diff, len(fragment1)
        )
        new_loc_f2_1 = create_location(f2_1_start, f2_2_start, len(fragment2))
    else:
        new_loc_f2_1 = create_location(
            f2_1_start, f2_2_start + overlap_diff, len(fragment2)
        )
        new_loc_f1_1 = create_location(f1_1_start, f1_2_start, len(fragment1))

    # Only the first edge gets new locations, the second one is kept as is
    new_assembly = [
        (f1, f2, new_loc_f1_1, new_loc_f2_1),
        new_assembly[1],
    ]

    return new_assembly
1551
+
1552
def get_insertion_assemblies(
    self, only_adjacent_edges: bool = False, max_assemblies: int = 50
) -> list[EdgeRepresentationAssembly]:
    """Assemblies that represent the insertion of a fragment or series of fragment inside a linear construct. For instance,
    digesting CCCCGAATTCCCCGAATTC with EcoRI and inserting the fragment with two overhangs into the EcoRI site of AAAGAATTCAAA.
    This is not so much meant for the use-case of linear fragments that represent actual linear fragments, but for linear
    fragments that represent a genome region. This can then be used to simulate homologous recombination.

    The cycles of the graph are filtered first (all fragments used if
    ``self.use_all_fragments`` is set, no fragment used twice), and only the
    remaining cycles are expanded into assemblies. assembly_is_valid rejects
    the discarded cycles as well, so filtering early only avoids needless
    work.

    Raises
    ------
    NotImplementedError
        If ``only_adjacent_edges`` is True.
    ValueError
        If the number of candidate assemblies (before validation) exceeds
        ``max_assemblies``.
    """
    if only_adjacent_edges:
        raise NotImplementedError(
            "only_adjacent_edges not implemented for insertion assemblies"
        )

    # We find cycles first
    cycles = limit_iterator(_nx.cycles.simple_cycles(self.G), 10000)

    # We apply constrains already here because sometimes the combinatorial explosion is too large
    if self.use_all_fragments:
        cycles = [c for c in cycles if len(c) == len(self.fragments)]

    # Remove cycles with duplicates
    cycles = [c for c in cycles if len(c) == len(set(map(abs, c)))]

    possible_assembly_number = self.get_possible_assembly_number(
        [c + c[:1] for c in cycles]
    )

    if possible_assembly_number > max_assemblies:
        raise ValueError(
            f"Too many assemblies ({possible_assembly_number} pre-validation) to assemble"
        )

    # Expand only the cycles that passed the filters above, instead of
    # enumerating every simple cycle of the graph a second time
    assemblies = [
        assembly
        for cycle in cycles
        for assembly in self.node_path2assembly_list(cycle, True)
    ]
    # We format the edge case
    assemblies = [self.format_insertion_assembly_edge_case(a) for a in assemblies]
    # We select those that contain exactly only one suitable edge
    assemblies = [
        b
        for a in assemblies
        if (b := self.format_insertion_assembly(a)) is not None
    ]
    # First fragment should be in the + orientation
    assemblies = [a for a in assemblies if a[0][0] > 0]
    return [
        a
        for a in assemblies
        if self.assembly_is_valid(
            self.fragments, a, False, self.use_all_fragments, is_insertion=True
        )
    ]
1605
+
1606
def assemble_linear(
    self, only_adjacent_edges: bool = False, max_assemblies: int = 50
) -> list[_Dseqrecord]:
    """Build one linear construct per assembly returned by self.get_linear_assemblies.

    Both arguments are passed on to self.get_linear_assemblies unchanged.
    """
    products = []
    for assembly in self.get_linear_assemblies(only_adjacent_edges, max_assemblies):
        products.append(assemble(self.fragments, assembly))
    return products
1612
+
1613
def assemble_circular(
    self, only_adjacent_edges: bool = False, max_assemblies: int = 50
) -> list[_Dseqrecord]:
    """Build one circular construct per assembly returned by self.get_circular_assemblies.

    Both arguments are passed on to self.get_circular_assemblies unchanged.
    """
    products = []
    for assembly in self.get_circular_assemblies(only_adjacent_edges, max_assemblies):
        products.append(assemble(self.fragments, assembly))
    return products
1619
+
1620
def assemble_insertion(
    self, only_adjacent_edges: bool = False, max_assemblies: int = 50
) -> list[_Dseqrecord]:
    """Assemble insertion constructs, from assemblies returned by self.get_insertion_assemblies.

    Parameters
    ----------
    only_adjacent_edges : bool, optional
        Passed on to self.get_insertion_assemblies.
    max_assemblies : int, optional
        Passed on to self.get_insertion_assemblies, like in assemble_linear
        and assemble_circular. The default matches the default of
        get_insertion_assemblies, so existing calls behave as before.
    """
    assemblies = self.get_insertion_assemblies(only_adjacent_edges, max_assemblies)
    return [assemble(self.fragments, a, is_insertion=True) for a in assemblies]
1626
+
1627
def get_locations_on_fragments(self) -> dict[int, dict[str, list[Location]]]:
    """Collect, for every node of the graph, the locations where it is joined to other fragments.

    The result maps each node to a dictionary with the keys `left` and
    `right`. `right` holds the locations on the node taken from edges that
    start at it, `left` the locations taken from edges that end at it. Each
    list has no repeated locations and is sorted by the start of the location.

    The values in `left` and `right` are often the same, except in restriction-ligation with partial overlap enabled,
    where we can end up with a situation like this:

    GGTCTCCCCAATT and aGGTCTCCAACCAA as fragments

    # Partial overlap in assembly 1[9:11]:2[8:10]
    GGTCTCCxxAACCAA
    CCAGAGGGGTTxxTT

    # Partial overlap in 2[10:12]:1[7:9]
    aGGTCTCCxxCCAATT
    tCCAGAGGTTGGxxAA

    Would return
    {
        1: {'left': [7:9], 'right': [9:11]},
        2: {'left': [8:10], 'right': [10:12]},
        -1: {'left': [2:4], 'right': [4:6]},
        -2: {'left': [2:4], 'right': [4:6]}
    }

    """

    def start_of(location):
        return _location_boundaries(location)[0]

    locations_on_fragments = {}
    for node in self.G.nodes:
        left_locations = []
        right_locations = []
        for source, target, attributes in self.G.edges(data=True):
            # An edge leaving the node joins it on its right side
            if source == node:
                location = attributes["locations"][0]
                if location not in right_locations:
                    right_locations.append(location)
            # An edge entering the node joins it on its left side
            if target == node:
                location = attributes["locations"][1]
                if location not in left_locations:
                    left_locations.append(location)

        locations_on_fragments[node] = {
            "left": sorted(left_locations, key=start_of),
            "right": sorted(right_locations, key=start_of),
        }

    return locations_on_fragments
1671
+
1672
+ def assembly_uses_only_adjacent_edges(self, assembly, is_circular: bool) -> bool:
1673
+ """
1674
+ Check whether only adjacent edges within each fragment are used in the assembly. This is useful to check if a cut and ligate assembly is valid,
1675
+ and prevent including partially digested fragments. For example, imagine the following fragment being an input for a digestion
1676
+ and ligation assembly, where the enzyme cuts at the sites indicated by the vertical lines:
1677
+
1678
+ ```
1679
+ x y z
1680
+ -------|-------|-------|---------
1681
+ ```
1682
+
1683
+ We would only want assemblies that contain subfragments start-x, x-y, y-z, z-end, and not start-x, y-end, for instance.
1684
+ The latter would indicate that the fragment was partially digested.
1685
+ """
1686
+
1687
+ locations_on_fragments = self.get_locations_on_fragments()
1688
+ for node in locations_on_fragments:
1689
+ fragment_len = len(self.fragments[abs(node) - 1])
1690
+ for side in ["left", "right"]:
1691
+ locations_on_fragments[node][side] = gather_overlapping_locations(
1692
+ locations_on_fragments[node][side], fragment_len
1693
+ )
1694
+
1695
+ allowed_location_pairs = dict()
1696
+ for node in locations_on_fragments:
1697
+ if not is_circular:
1698
+ # We add the existing ends of the fragment
1699
+ left = [(None,)] + locations_on_fragments[node]["left"]
1700
+ right = locations_on_fragments[node]["right"] + [(None,)]
1701
+
1702
+ else:
1703
+ # For circular assemblies, we add the first location at the end
1704
+ # to allow for the last edge to be used
1705
+ left = locations_on_fragments[node]["left"]
1706
+ right = (
1707
+ locations_on_fragments[node]["right"][1:]
1708
+ + locations_on_fragments[node]["right"][:1]
1709
+ )
1710
+
1711
+ pairs = list()
1712
+ for pair in zip(left, right):
1713
+ pairs += list(_itertools.product(*pair))
1714
+ allowed_location_pairs[node] = pairs
1715
+
1716
+ fragment_assembly = edge_representation2subfragment_representation(
1717
+ assembly, is_circular
1718
+ )
1719
+ for node, start_location, end_location in fragment_assembly:
1720
+ if (start_location, end_location) not in allowed_location_pairs[node]:
1721
+ return False
1722
+ return True
1723
+
1724
+ def __repr__(self):
1725
+ # https://pyformat.info
1726
+ return _pretty_str(
1727
+ "Assembly\n"
1728
+ "fragments..: {sequences}\n"
1729
+ "limit(bp)..: {limit}\n"
1730
+ "G.nodes....: {nodes}\n"
1731
+ "algorithm..: {al}".format(
1732
+ sequences=" ".join("{}bp".format(len(x)) for x in self.fragments),
1733
+ limit=self.limit,
1734
+ nodes=self.G.order(),
1735
+ al=self.algorithm.__name__,
1736
+ )
1737
+ )
1738
+
1739
+
1740
class PCRAssembly(Assembly):
    """
    An assembly that represents a PCR, where `fragments` is a list of primer, template, primer (in that order).
    It always uses the `primer_template_overlap` algorithm and accepts the `mismatches` argument to indicate
    the number of mismatches allowed in the overlap. Only supports substitution mismatches, not indels.
    """

    def __init__(self, frags: list[_Dseqrecord | _Primer], limit=25, mismatches=0):
        """Build the assembly graph for a single primer / template / primer triplet.

        :param frags: exactly three items: primer, template, primer.
        :param limit: forwarded to `primer_template_overlap` as its third argument.
        :param mismatches: forwarded to `primer_template_overlap` as its fourth argument.
        :raises ValueError: if `frags` does not have three items, or if they are not
            Primer, non-Primer, Primer in that order.
        """

        value_error = ValueError(
            "PCRAssembly assembly must be initialised with a list/tuple of primer, template, primer"
        )
        if len(frags) != 3:
            raise value_error

        # Validate the inputs: should be a series of primer, template, primer
        wrong_fragment_class = (
            not isinstance(frags[0], _Primer),
            isinstance(frags[1], _Primer),
            not isinstance(frags[2], _Primer),
        )
        if any(wrong_fragment_class):
            raise value_error

        # TODO: allow for the same fragment to be included more than once?
        self.G = _nx.MultiDiGraph()
        # Add positive and negative nodes for forward and reverse fragments
        self.G.add_nodes_from((i + 1, {"seq": f}) for (i, f) in enumerate(frags))
        self.G.add_nodes_from(
            (-(i + 1), {"seq": f.reverse_complement()}) for (i, f) in enumerate(frags)
        )

        # Node ids of primer, template, primer. Exactly three fragments are
        # guaranteed by the check above.
        p1, t, p2 = 1, 2, 3
        # Candidate joins: a primer followed by the template in either orientation,
        # and the template in either orientation followed by a reverse-complemented primer.
        pairs = list(_itertools.product([p1, p2], [t, -t]))
        pairs += list(_itertools.product([t, -t], [-p1, -p2]))

        for u, v in pairs:
            u_seq = self.G.nodes[u]["seq"]
            v_seq = self.G.nodes[v]["seq"]
            matches = primer_template_overlap(u_seq, v_seq, limit, mismatches)
            for match in matches:
                self.add_edges_from_match(match, u, v, u_seq, v_seq)

        # These two are constrained
        self.use_fragment_order = False
        self.use_all_fragments = True

        self.fragments = frags
        self.limit = limit
        self.algorithm = primer_template_overlap

    def get_linear_assemblies(
        self, only_adjacent_edges: bool = False, max_assemblies: int = 50
    ) -> list[EdgeRepresentationAssembly]:
        """Return the linear assemblies found by the parent class.

        :raises NotImplementedError: if `only_adjacent_edges` is True.
        """
        if only_adjacent_edges:
            raise NotImplementedError(
                "only_adjacent_edges not implemented for PCR assemblies"
            )

        return super().get_linear_assemblies(max_assemblies=max_assemblies)

    def get_circular_assemblies(
        self, only_adjacent_edges: bool = False, max_assemblies: int = 50
    ):
        """Not supported for PCR; always raises NotImplementedError.

        `max_assemblies` is accepted (and ignored) so that callers passing both
        arguments, such as `assemble_circular`, get the NotImplementedError
        rather than a TypeError about the number of arguments.
        """
        raise NotImplementedError(
            "get_circular_assemblies not implemented for PCR assemblies"
        )

    def get_insertion_assemblies(
        self, only_adjacent_edges: bool = False, max_assemblies: int = 50
    ):
        """Not supported for PCR; always raises NotImplementedError.

        `max_assemblies` is accepted (and ignored) so the signature matches the
        other `get_*_assemblies` methods.
        """
        raise NotImplementedError(
            "get_insertion_assemblies not implemented for PCR assemblies"
        )
1817
+
1818
+
1819
class SingleFragmentAssembly(Assembly):
    """
    An assembly that represents the circularisation or splicing of a single fragment.
    """

    def __init__(
        self, frags: list[_Dseqrecord], limit=25, algorithm=common_sub_strings
    ):
        """Build the assembly graph by matching the single fragment against itself.

        :param frags: a list containing exactly one fragment.
        :param limit: forwarded to `algorithm` as its third argument.
        :param algorithm: called as ``algorithm(frag, frag, limit)`` to find the matches.
        :raises ValueError: if `frags` does not contain exactly one fragment.
        """

        if len(frags) != 1:
            raise ValueError(
                "SingleFragmentAssembly assembly must be initialised with a single fragment"
            )
        # TODO: allow for the same fragment to be included more than once?
        self.G = _nx.MultiDiGraph()
        frag = frags[0]
        # Only the forward node is added explicitly here
        self.G.add_node(1, seq=frag)

        matches = algorithm(frag, frag, limit)
        for match in matches:
            self.add_edges_from_match(match, 1, 1, frag, frag)

        # To avoid duplicated outputs
        # NOTE(review): on a multigraph, a 2-tuple passed to remove_edges_from removes
        # a single edge between the two nodes, not every parallel edge. Confirm this is
        # enough when there are several (-1, -1) edges.
        self.G.remove_edges_from([(-1, -1)])

        # These two are constrained
        self.use_fragment_order = True
        self.use_all_fragments = True

        self.fragments = frags
        self.limit = limit
        self.algorithm = algorithm

    def get_circular_assemblies(
        self, only_adjacent_edges: bool = False, max_assemblies: int = 50
    ) -> list[EdgeRepresentationAssembly]:
        """Return the parent's circular assemblies, minus those whose first edge joins a location to itself."""
        # We don't want the same location twice
        assemblies = filter(
            lambda x: x[0][2] != x[0][3],
            super().get_circular_assemblies(only_adjacent_edges, max_assemblies),
        )
        return [
            a
            for a in assemblies
            if self.assembly_is_valid(self.fragments, a, True, self.use_all_fragments)
        ]

    def get_insertion_assemblies(
        self, only_adjacent_edges: bool = False, max_assemblies: int = 50
    ) -> list[EdgeRepresentationAssembly]:
        """This could be renamed splicing assembly, but the essence is similar

        :raises NotImplementedError: if `only_adjacent_edges` is True.
        """

        if only_adjacent_edges:
            raise NotImplementedError(
                "only_adjacent_edges not implemented for insertion assemblies"
            )

        def splicing_assembly_filter(x):
            # We don't want the same location twice
            if x[0][2] == x[0][3]:
                return False
            # We don't want to get overlap only (e.g. GAATTCcatGAATTC giving GAATTC)
            left_start, _ = _location_boundaries(x[0][2])
            _, right_end = _location_boundaries(x[0][3])
            if left_start == 0 and right_end == len(self.fragments[0]):
                return False
            return True

        assemblies = filter(
            splicing_assembly_filter,
            super().get_insertion_assemblies(max_assemblies=max_assemblies),
        )
        return [
            a
            for a in assemblies
            if self.assembly_is_valid(
                self.fragments, a, False, self.use_all_fragments, is_insertion=True
            )
        ]

    def get_linear_assemblies(
        self, only_adjacent_edges: bool = False, max_assemblies: int = 50
    ):
        """Not supported for a single fragment; always raises NotImplementedError.

        The arguments are accepted (and ignored) so that callers passing both of
        them, such as `assemble_linear`, get the NotImplementedError rather than
        a TypeError about the number of arguments.
        """
        raise NotImplementedError("Linear assembly does not make sense")