opencloning 0.3.8__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
opencloning/assembly2.py DELETED
@@ -1,1467 +0,0 @@
1
- """Slightly different assembly implementation"""
2
-
3
- from pydna.utils import (
4
- shift_location as _shift_location,
5
- flatten,
6
- location_boundaries as _location_boundaries,
7
- locations_overlap as _locations_overlap,
8
- )
9
- from pydna._pretty import pretty_str as _pretty_str
10
- from pydna.common_sub_strings import common_sub_strings as common_sub_strings_str
11
- from pydna.dseqrecord import Dseqrecord as _Dseqrecord
12
- from pydna.dseq import Dseq as _Dseq
13
- from pydna.primer import Primer as _Primer
14
- from pydna.seqrecord import SeqRecord as _SeqRecord
15
- import networkx as _nx
16
- import itertools as _itertools
17
- from Bio.SeqFeature import SimpleLocation, Location
18
- from .dna_utils import sum_is_sticky, create_location
19
- from Bio.Seq import reverse_complement
20
- from Bio.Restriction.Restriction import RestrictionBatch, AbstractCut
21
- import regex
22
- import copy
23
-
24
- # Currently unused, commented out because it's not tested
25
- # def primers_clash(assembly, fragments):
26
- # edge_pairs = zip(assembly, assembly[1:])
27
- # for (_u1, _v1, _, start_location), (_u2, _v2, end_location, _) in edge_pairs:
28
- # # Only for primer joins
29
- # if not isinstance(fragments[abs(_v1) - 1], _Dseqrecord):
30
- # continue
31
- # if _locations_overlap(start_location, end_location, len(fragments[abs(_v1) - 1])):
32
- # return True
33
- # return False
34
-
35
-
36
def limit_iterator(iterator, limit):
    """Yield the items of ``iterator``, failing once more than ``limit`` items are produced.

    The first ``limit`` items are yielded unchanged. If the iterable produces
    one more item after that, a ``ValueError`` is raised instead of yielding it.

    Raises
    ------
    ValueError
        If ``iterator`` contains more than ``limit`` items.
    """
    yielded = 0
    for item in iterator:
        # The check happens before yielding, so exactly `limit` items can pass
        if yielded >= limit:
            raise ValueError(f'Too many possible paths (more than {limit})')
        yield item
        yielded += 1
41
-
42
-
43
def gather_overlapping_locations(locs: list[Location], fragment_length: int):
    """Group locations that overlap with each other.

    Returns a list of tuples of locations, where each tuple holds locations that are
    connected through overlaps. For example, if locs = [loc1, loc2, loc3], and only
    loc1 and loc2 overlap, the output will be [(loc1, loc2), (loc3,)].

    ``fragment_length`` is forwarded to ``_locations_overlap`` for every pairwise test.
    """
    # One node per location (identified by its index in `locs`),
    # one edge per pair of overlapping locations
    overlap_graph = _nx.Graph()
    for index, location in enumerate(locs):
        overlap_graph.add_node(index, location=location)

    for first, second in _itertools.combinations(range(len(locs)), 2):
        if _locations_overlap(locs[first], locs[second], fragment_length):
            overlap_graph.add_edge(first, second)

    # Each connected component is a group of overlapping locations
    groups = [
        tuple(locs[index] for index in component) for component in _nx.connected_components(overlap_graph)
    ]

    # Sort by the start of the first location in each group
    # (does not matter which one is used, since they are overlapping)
    groups.sort(key=lambda group: _location_boundaries(group[0])[0])

    return groups
69
-
70
-
71
- # def assembly_checksum(G: _nx.MultiDiGraph, edge_list):
72
- # """Calculate a checksum for an assembly, from a list of edges in the form (u, v, key)."""
73
- # checksum_list = list()
74
- # for edge in edge_list:
75
- # u, v, key = edge
76
- # checksum_list.append(G.get_edge_data(u, v, key)['uid'])
77
-
78
- # return min('-'.join(checksum_list), '-'.join(checksum_list[::-1]))
79
-
80
-
81
def ends_from_cutsite(cutsite: tuple[tuple[int, int], AbstractCut], seq: _Dseq):
    """Return the pair of ends that cutting ``seq`` at ``cutsite`` would generate.

    Each end is a tuple ``(end_type, overhang_sequence)`` where ``end_type`` is
    ``"5'"``, ``"3'"`` or ``'blunt'`` and the overhang sequence is lower case
    (empty for blunt ends). The sign of the overhang returned by
    ``seq.get_cut_parameters`` decides which kind of end is returned.

    Raises
    ------
    ValueError
        If ``cutsite`` is None.
    """
    if cutsite is None:
        raise ValueError('None is not supported')

    cut_watson, cut_crick, ovhg = seq.get_cut_parameters(cutsite, is_left=None)

    if ovhg == 0:
        return ('blunt', ''), ('blunt', '')

    if ovhg < 0:
        # TODO check the edge in circular
        overhang = seq[cut_watson:cut_crick]
        return (
            ("5'", str(overhang.reverse_complement()).lower()),
            ("5'", str(overhang).lower()),
        )

    overhang = seq[cut_crick:cut_watson]
    return (
        ("3'", str(overhang).lower()),
        ("3'", str(overhang.reverse_complement()).lower()),
    )
99
-
100
-
101
def restriction_ligation_overlap(
    seqx: _Dseqrecord, seqy: _Dseqrecord, enzymes=RestrictionBatch, partial=False, allow_blunt=False
):
    """Find overlaps between the ends generated by cutting both sequences with ``enzymes``.

    Like in the sticky-end and Gibson algorithms, the order matters: the end left
    on ``seqx`` is joined to the end left on ``seqy``.

    Returns a list of ``(position_on_x, position_on_y, overlap_length)`` tuples.
    Pairs of blunt cuts are only reported (with length 0) when ``allow_blunt`` is
    set. ``partial`` is forwarded to ``sum_is_sticky``.

    NOTE(review): the default for ``enzymes`` is the ``RestrictionBatch`` class, not an
    instance; callers presumably always pass a batch -- confirm.
    """
    cuts_x = seqx.seq.get_cutsites(*enzymes)
    cuts_y = seqy.seq.get_cutsites(*enzymes)
    # If blunt ends are allowed, the ends of linear sequences could be added here as
    # extra "cuts" -- ((len(seqx), 0), None) for seqx and ((0, 0), None) for seqy --
    # to allow joining with linear sequence ends, but for now it messes up with the
    # only_adjacent_edges case.
    matches = []
    for cut_x, cut_y in _itertools.product(cuts_x, cuts_y):
        # Two blunt cuts
        if allow_blunt and cut_x[0][1] == 0 and cut_y[0][1] == 0:
            matches.append((cut_x[0][0], cut_y[0][0], 0))
            continue

        # Otherwise, test overhangs
        end_on_x = ends_from_cutsite(cut_x, seqx.seq)[0]
        end_on_y = ends_from_cutsite(cut_y, seqy.seq)[1]
        overlap = sum_is_sticky(end_on_x, end_on_y, partial)
        if not overlap:
            continue

        x_watson, x_crick, x_ovhg = seqx.seq.get_cut_parameters(cut_x, is_left=False)
        y_watson, y_crick, y_ovhg = seqy.seq.get_cut_parameters(cut_y, is_left=True)

        # Positions where the overlap would start for full overlap,
        # corrected on x for partial overlaps
        left_x = (x_watson if x_ovhg < 0 else x_crick) + abs(x_ovhg) - overlap
        left_y = y_watson if y_ovhg < 0 else y_crick

        matches.append((left_x, left_y, overlap))
    return matches
137
-
138
-
139
def combine_algorithms(*algorithms):
    """Build a single algorithm that returns the matches of all the given ones.

    The returned function takes ``(seqx, seqy, limit)``, calls every algorithm
    in order with those arguments and returns the concatenation of their matches
    as a list.
    """

    def combined(seqx, seqy, limit):
        return [match for algorithm in algorithms for match in algorithm(seqx, seqy, limit)]

    return combined
149
-
150
-
151
def blunt_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=None):
    """Find blunt overlaps.

    Returns ``[(len(seqx), 0, 0)]`` when the 3' end of ``seqx`` and the 5' end of
    ``seqy`` are both blunt, an empty list otherwise. ``limit`` is not used, it is
    there so that the signature matches the other algorithms.
    """
    if seqx.seq.three_prime_end()[0] != 'blunt':
        return []
    if seqy.seq.five_prime_end()[0] != 'blunt':
        return []
    return [(len(seqx), 0, 0)]
156
-
157
-
158
def common_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
    """Find common substrings between two sequences, supporting circular ones.

    Circular sequences are searched as two concatenated copies so that matches
    spanning the origin are found; the results are then cleaned up so that each
    match is reported once. Matches are ``(start_on_x, start_on_y, length)``
    tuples, as returned by ``common_sub_strings_str``.
    """
    query_seqx = str(seqx.seq).upper()
    query_seqy = str(seqy.seq).upper()
    if seqx.circular:
        query_seqx = query_seqx * 2
    if seqy.circular:
        query_seqy = query_seqy * 2
    results = common_sub_strings_str(query_seqx, query_seqy, limit)

    # Nothing to clean up if both sequences are linear
    if not (seqx.circular or seqy.circular):
        return results

    # Remove matches that start on the second copy of the sequence
    if seqx.circular:
        results = [match for match in results if match[0] < len(seqx)]
    if seqy.circular:
        results = [match for match in results if match[1] < len(seqy)]

    # Trim lengths that span more than the sequence
    # (at least one of the sequences is circular at this point)
    max_match_length = min(len(seqx), len(seqy))
    results = [(match[0], match[1], min(match[2], max_match_length)) for match in results]

    # Edge case where the sequences are identical
    if len(seqx.seq) == len(seqy.seq):
        for match in results:
            if match[2] == len(seqx.seq):
                return [match]

    # Remove duplicate matches, see example below
    # Let's imagine the following two sequences, where either seqy or both are circular
    # seqx: 01234
    # seqy: 123450, circular
    #
    # common_sub_strings would return [(0, 5, 5), (1, 0, 4)]
    # Actually, (1, 0, 4) is a subset of (0, 5, 5), the part
    # that does not span the origin. To remove matches like this,
    # we find matches where the origin is spanned in one of the sequences
    # only, and then remove the subset of that match that does not span the origin.
    redundant = set()
    for start_x, start_y, length in results:
        x_spans_origin = seqx.circular and start_x + length > len(seqx)
        y_spans_origin = seqy.circular and start_y + length > len(seqy)
        if x_spans_origin and not y_spans_origin:
            shift = len(seqx) - start_x
            redundant.add((0, start_y + shift, length - shift))
        elif not x_spans_origin and y_spans_origin:
            shift = len(seqy) - start_y
            redundant.add((start_x + shift, 0, length - shift))
    return [match for match in results if match not in redundant]
208
-
209
-
210
def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
    """Find homology between the end of ``seqx`` and the start of ``seqy``.

    The order matters, we want alignments like:

    oooo------xxxx
              xxxx------oooo
    Product: oooo------xxxx------oooo

    Not like:

              oooo------xxxx
    xxxx------oooo
    Product (unwanted): oooo

    Only matches that start at the beginning of the (trimmed) ``seqy`` and finish at
    the end of the (trimmed) ``seqx`` are returned, as ``(start_on_x, start_on_y, length)``
    tuples in the coordinates of the untrimmed sequences.
    """

    # Because Gibson enzymes remove 5' overhangs, we remove them from the sequence
    # when looking for homology, then we shift the location of the second fragment accordingly.
    # This is only relevant for linear fragments, so we don't need to worry about
    # shifting locations for circular fragments.
    x_right_ovhg = seqx.seq.watson_ovhg()
    y_right_ovhg = seqy.seq.watson_ovhg()
    trim_x_left = max(-seqx.seq.ovhg, 0)
    trim_x_right = x_right_ovhg if x_right_ovhg < 0 else None
    trim_y_left = max(-seqy.seq.ovhg, 0)
    trim_y_right = y_right_ovhg if y_right_ovhg < 0 else None

    stringx = str(seqx.seq[trim_x_left:trim_x_right]).upper()
    stringy = str(seqy.seq[trim_y_left:trim_y_right]).upper()

    matches = []
    for start_x, start_y, length in common_sub_strings_str(stringx, stringy, limit):
        # Keep only matches joining the very end of x with the very start of y
        if start_y == 0 and start_x + length == len(stringx):
            # Shift back to the coordinates of the untrimmed sequences
            matches.append((start_x + trim_x_left, start_y + trim_y_left, length))
    return matches
246
-
247
-
248
def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=0):
    """Find the overlap between the 3' end of ``seqx`` and the 5' end of ``seqy``.

    ``limit`` is passed as the third argument of ``sum_is_sticky``. For now, if
    limit is 0 / False only full overlaps are considered.

    Returns ``[(len(seqx) - overlap, 0, overlap)]`` if the ends are compatible,
    an empty list otherwise.
    """
    overlap = sum_is_sticky(seqx.seq.three_prime_end(), seqy.seq.five_prime_end(), limit)
    return [(len(seqx) - overlap, 0, overlap)] if overlap else []
254
-
255
-
256
def zip_match_leftwards(seqx: _SeqRecord, seqy: _SeqRecord, match: tuple[int, int, int]):
    """Starting from the rightmost edge of the match, return a new match encompassing the max
    number of bases. This can be used to return a longer match if a primer aligns for longer
    than the limit or a shorter match if there are mismatches. This is convenient to maintain
    as many features as possible.

    >>> seq = _Dseqrecord('AAAAACGTCCCGT')
    >>> primer = _Dseqrecord('ACGTCCCGT')
    >>> match = (13, 9, 0) # an empty match at the end of each
    >>> zip_match_leftwards(seq, primer, match)
    (4, 0, 9)

    Works in circular molecules if the match spans the origin:
    >>> seq = _Dseqrecord('TCCCGTAAAAACG', circular=True)
    >>> primer = _Dseqrecord('ACGTCCCGT')
    >>> match = (6, 9, 0)
    >>> zip_match_leftwards(seq, primer, match)
    (10, 0, 9)

    """

    query_x = seqrecord2str_for_alignment(seqx)
    query_y = seqrecord2str_for_alignment(seqy)

    # In circular sequences, the match may go beyond the left-most edge of the sequence if it spans
    # the origin:
    #      Primer:       ACGTCCCGT
    #                    |||||||||
    #      Circular seq: ACGTCCCGT -> Equivalent to Dseqrecord('CCCGTACGT', circular=True)
    #                        ^
    #                        Origin
    # We would start from the last T and move leftwards, but we would stop at the origin.
    # For those cases we shift the end by the sequence length (the query holds two copies
    # of circular sequences), then go back at the end.
    x_start, y_start, length = match

    end_on_x = x_start + length
    if isinstance(seqx, _Dseqrecord) and seqx.circular and end_on_x <= len(seqx):
        end_on_x += len(seqx)

    end_on_y = y_start + length
    if isinstance(seqy, _Dseqrecord) and seqy.circular and end_on_y <= len(seqy):
        end_on_y += len(seqy)

    # Walk leftwards from both ends while the bases are identical
    left_of_x = query_x[:end_on_x]
    left_of_y = query_y[:end_on_y]
    max_count = min(len(left_of_x), len(left_of_y))
    count = 0
    while count < max_count and left_of_x[-1 - count] == left_of_y[-1 - count]:
        count += 1

    # Shift back by length if needed
    return ((end_on_x - count) % len(seqx), (end_on_y - count) % len(seqy), count)
309
-
310
-
311
def zip_match_rightwards(seqx: _Dseqrecord, seqy: _Dseqrecord, match: tuple[int, int, int]):
    """Same as zip_match_leftwards, towards the right.

    Starting from the start positions of ``match`` (its length is ignored), count
    how many consecutive bases are identical in both sequences and return
    ``(start_on_x, start_on_y, count)``.
    """

    query_x = seqrecord2str_for_alignment(seqx)
    query_y = seqrecord2str_for_alignment(seqy)

    start_on_x, start_on_y, _ = match
    right_of_x = query_x[start_on_x:]
    right_of_y = query_y[start_on_y:]

    # Index of the first differing base, or the shared length if there is none
    shared_length = min(len(right_of_x), len(right_of_y))
    count = next((i for i in range(shared_length) if right_of_x[i] != right_of_y[i]), shared_length)
    return (start_on_x, start_on_y, count)
324
-
325
-
326
def seqrecord2str_for_alignment(seqr: _SeqRecord):
    """Return the string used to align ``seqr``: upper case, with U replaced by T,
    and repeated twice if ``seqr`` is a circular Dseqrecord."""
    sequence = str(seqr.seq).upper().replace('U', 'T')
    is_circular = isinstance(seqr, _Dseqrecord) and seqr.circular
    return sequence * 2 if is_circular else sequence
333
-
334
-
335
def alignment_sub_strings(seqx: _Dseqrecord | _Primer, seqy: _Dseqrecord | _Primer, limit=25, mismatches=0):
    """Find where a primer aligns with a template.

    One of the arguments must be a primer and the other a Dseqrecord. If the primer
    comes first, its last ``limit`` bases are searched in the template; if it comes
    second, its first ``limit`` bases are used instead. The search is a fuzzy regex
    built with ``{s<=mismatches}``, and each hit is then extended or shortened with
    zip_match_leftwards / zip_match_rightwards.

    Returns a sorted list of ``(start_on_x, start_on_y, length)`` matches, empty if
    the primer is shorter than ``limit``.

    Raises
    ------
    ValueError
        If the arguments are not one primer and one Dseqrecord.
    """

    if isinstance(seqx, _Primer) and isinstance(seqy, _Dseqrecord):
        primer, template, reverse_primer = seqx, seqy, False
    elif isinstance(seqx, _Dseqrecord) and isinstance(seqy, _Primer):
        primer, template, reverse_primer = seqy, seqx, True
    else:
        raise ValueError('One of the sequences must be a primer and the other a Dseqrecord')

    if len(primer) < limit:
        return []

    subject = seqrecord2str_for_alignment(template)
    anchor = primer[:limit] if reverse_primer else primer[-limit:]
    query = seqrecord2str_for_alignment(anchor)

    # Search forwards and in reverse ((?r) flag of the regex module)
    fuzzy_group = '(' + query + '){s<=' + str(mismatches) + '}'
    re_matches = []
    for pattern in (fuzzy_group, '(?r)' + fuzzy_group):
        re_matches.extend(regex.finditer(pattern, subject, overlapped=True))

    out = set()
    for re_match in re_matches:
        start, end = re_match.span()

        # For circular sequences the same match is returned twice unless it falls
        # on the origin, we eliminate duplicates here
        if start >= len(template):
            continue

        # The starting match is in the same format as other assembly algorithms.
        # Zipping extends the match beyond the limit if the primer aligns more than
        # that, and reduces the match if the primer has mismatches
        if reverse_primer:
            out.add(zip_match_rightwards(template, primer, (start, 0, end - start)))
        else:
            out.add(zip_match_leftwards(primer, template, (len(primer) - limit, start, end - start)))

    return sorted(out)
382
-
383
-
384
def fill_left(seq: _Dseq):
    """Fill the left overhang of a sequence with the complementary sequence.

    Returns a new Dseq built with ``ovhg`` 0; ``seq`` is not modified.
    """
    ovhg = seq.ovhg
    watson, crick = seq.watson, seq.crick

    if ovhg < 0:
        # Watson 5' overhang: extend crick with the complement of the overhang
        crick = crick + reverse_complement(watson[:-ovhg])
    elif ovhg > 0:
        # Crick 5' overhang: prepend the complement of the overhang to watson
        watson = reverse_complement(crick[-ovhg:]) + watson

    return _Dseq(watson, crick, 0)
397
-
398
-
399
def fill_right(seq: _Dseq):
    """Fill the right overhang of a sequence with the complementary sequence.

    Returns a new Dseq with the same left overhang (``seq.ovhg``); ``seq`` is not modified.
    """
    watson, crick = seq.watson, seq.crick
    watson_ovhg = seq.watson_ovhg()

    if watson_ovhg < 0:
        # Watson 3' overhang
        watson = watson + reverse_complement(crick[:-watson_ovhg])
    elif watson_ovhg > 0:
        # Crick 3' overhang
        crick = reverse_complement(watson[-watson_ovhg:]) + crick

    return _Dseq(watson, crick, seq.ovhg)
414
-
415
-
416
def fill_dseq(seq: _Dseq):
    """Fill both overhangs of a sequence: the right one first, then the left one."""
    right_filled = fill_right(seq)
    return fill_left(right_filled)
418
-
419
-
420
def reverse_complement_assembly(
    assembly: list[tuple[int, int, Location, Location]], fragments: list[_Dseqrecord]
) -> list[tuple[int, int, Location, Location]]:
    """Complement an assembly, i.e. reverse the order of the fragments and the orientation of the overlaps.

    Every edge ``(u, v, locu, locv)`` becomes ``(-v, -u, flipped locv, flipped locu)``,
    where each location is flipped using the length of its fragment, and the order of
    the edges is reversed.
    """

    def fragment_length(node):
        # Nodes are 1-based and signed, the sign only encodes the orientation
        return len(fragments[abs(node) - 1])

    flipped_edges = [
        (-v, -u, locv._flip(fragment_length(v)), locu._flip(fragment_length(u))) for u, v, locu, locv in assembly
    ]
    flipped_edges.reverse()
    return flipped_edges
430
-
431
-
432
def filter_linear_subassemblies(linear_assemblies, circular_assemblies, fragments):
    """Remove linear assemblies which are sub-assemblies of circular assemblies.

    A linear assembly is dropped if it is a (cyclic) sublist of any circular
    assembly or of the reverse complement of any circular assembly.
    """
    reversed_circular = [reverse_complement_assembly(c, fragments) for c in circular_assemblies]
    all_circular_assemblies = circular_assemblies + reversed_circular

    filtered_assemblies = []
    for linear in linear_assemblies:
        for circular in all_circular_assemblies:
            if is_sublist(linear, circular, True):
                break
        else:
            # Not contained in any circular assembly
            filtered_assemblies.append(linear)
    # Also checking the reverse complement of each linear assembly against the circular
    # ones is probably not necessary, so it is not done.
    return filtered_assemblies
443
-
444
-
445
def remove_subassemblies(assemblies):
    """Filter out subassemblies, i.e. assemblies that are contained within another assembly.

    For example:
        [(1, 2, '1[8:14]:2[1:7]'), (2, 3, '2[10:17]:3[1:8]')]
        [(1, 2, '1[8:14]:2[1:7]')]
    The second one is a subassembly of the first one.

    The result is ordered from longest to shortest assembly.
    """
    kept = []
    # Longest first, so that any assembly containing the current one has already been kept
    for candidate in sorted(assemblies, key=len, reverse=True):
        for longer in kept:
            if is_sublist(candidate, longer):
                break
        else:
            kept.append(candidate)

    return kept
464
-
465
-
466
def assembly2str(assembly):
    """Convert an assembly to a compact string representation, for example:
    ((1, 2, [8:14], [1:7]),(2, 3, [10:17], [1:8]))
    becomes:
    ('1[8:14]:2[1:7]', '2[10:17]:3[1:8]')

    This is more readable than printing the assembly directly, because a location such as
    '[8:14]' inside a tuple is printed to the console as
    `SimpleLocation(ExactPosition(8), ExactPosition(14), strand=1)` (very long).
    """
    edge_strings = []
    for u, v, loc_u, loc_v in assembly:
        edge_strings.append(f'{u}{loc_u}:{v}{loc_v}')
    return str(tuple(edge_strings))
476
-
477
-
478
def assembly2str_tuple(assembly):
    """Convert an assembly to a string representation that keeps the tuple structure, like
    ((1, 2, [8:14], [1:7]),(2, 3, [10:17], [1:8]))

    Only the locations are converted to strings before printing.
    """
    edges = []
    for u, v, loc_u, loc_v in assembly:
        edges.append((u, v, str(loc_u), str(loc_v)))
    return str(tuple(edges))
483
-
484
-
485
def assembly_has_mismatches(fragments, assembly):
    """Return True if, for any edge of the assembly, the sequences at the overlap
    locations of the two fragments differ (case-insensitive), False otherwise.

    Negative node indexes refer to the reverse complement of the fragment.
    """

    def oriented_fragment(node):
        if node > 0:
            return fragments[node - 1]
        return fragments[-node - 1].reverse_complement()

    for u, v, loc_u, loc_v in assembly:
        seq_u = oriented_fragment(u)
        seq_v = oriented_fragment(v)
        # TODO: Check issue where extraction failed, and whether it would give problems here
        overlap_on_u = str(loc_u.extract(seq_u).seq).upper()
        overlap_on_v = str(loc_v.extract(seq_v).seq).upper()
        if overlap_on_u != overlap_on_v:
            return True
    return False
493
-
494
-
495
def assembly_is_circular(assembly, fragments):
    """Tell whether an assembly is circular.

    An assembly can only be circular if it starts and ends on the same node. In that
    case it is circular if that fragment is itself a circular Dseqrecord, or if the
    overlap of its first edge starts after the overlap of its last edge.

    Note: This does not work for insertion assemblies, that's why assemble takes the optional argument is_insertion.
    """
    first_node = assembly[0][0]
    if first_node != assembly[-1][1]:
        return False

    first_fragment = fragments[abs(first_node) - 1]
    if isinstance(first_fragment, _Dseqrecord) and first_fragment.circular:
        return True

    start_of_first_overlap = _location_boundaries(assembly[0][2])[0]
    start_of_last_overlap = _location_boundaries(assembly[-1][3])[0]
    return start_of_first_overlap > start_of_last_overlap
505
-
506
-
507
def assemble(fragments, assembly, is_insertion=False):
    """Execute an assembly, from the representation returned by get_linear_assemblies or get_circular_assemblies.

    Returns the assembled Dseqrecord, circular if the assembly is circular. Insertion
    assemblies (``is_insertion=True``) are always treated as linear.

    Raises
    ------
    ValueError
        If the sequences at the overlap locations of any edge are not identical.
    """

    is_circular = False if is_insertion else assembly_is_circular(assembly, fragments)

    subfragment_representation = edge_representation2subfragment_representation(assembly, is_circular)

    # Sanity check: both sides of every join must have the same sequence
    for u, v, loc_u, loc_v in assembly:
        f_u = fragments[u - 1] if u > 0 else fragments[-u - 1].reverse_complement()
        f_v = fragments[v - 1] if v > 0 else fragments[-v - 1].reverse_complement()
        overlap_on_u = str(loc_u.extract(f_u).seq).upper()
        overlap_on_v = str(loc_v.extract(f_v).seq).upper()
        if overlap_on_u != overlap_on_v:
            raise ValueError('Mismatch in assembly')

    # We transform into Dseqrecords (for primers)
    dseqr_fragments = [f if isinstance(f, _Dseqrecord) else _Dseqrecord(f) for f in fragments]
    subfragments = get_assembly_subfragments(dseqr_fragments, subfragment_representation)

    # Length of the overlaps between consecutive assembly fragments
    fragment_overlaps = [len(edge[-1]) for edge in assembly]

    out_dseqrecord = _Dseqrecord(subfragments[0])

    for fragment, overlap in zip(subfragments[1:], fragment_overlaps):
        # Shift the features of the right fragment to the left by `overlap`
        feature_offset = len(out_dseqrecord) - overlap
        new_features = [f._shift(feature_offset) for f in fragment.features]
        # Join the left sequence including the overlap with the right sequence without the overlap
        # we use fill_right / fill_left so that it works for ligation of sticky ends
        joined_seq = fill_right(out_dseqrecord.seq) + fill_left(fragment.seq)[overlap:]
        out_dseqrecord = _Dseqrecord(joined_seq, features=out_dseqrecord.features + new_features)

    if not is_circular:
        return out_dseqrecord

    # For circular assemblies, close the loop and wrap origin-spanning features
    overlap = fragment_overlaps[-1]

    # Special case for blunt circularisation
    if overlap == 0:
        return out_dseqrecord.looped()

    # Remove trailing overlap
    out_dseqrecord = _Dseqrecord(
        fill_dseq(out_dseqrecord.seq)[:-overlap], features=out_dseqrecord.features, circular=True
    )
    for feature in out_dseqrecord.features:
        start, end = _location_boundaries(feature.location)
        if start >= len(out_dseqrecord) or end > len(out_dseqrecord):
            # Wrap around the origin
            feature.location = _shift_location(feature.location, 0, len(out_dseqrecord))

    return out_dseqrecord
565
-
566
-
567
def annotate_primer_binding_sites(
    input_dseqr: _Dseqrecord, fragments: list[_Dseqrecord], assembly: list[tuple[int, int, Location, Location]]
) -> _Dseqrecord:
    """Annotate the primer binding sites in a Dseqrecord.

    ``fragments`` must contain three elements: the forward primer, the template and the
    reverse primer. A copy of ``input_dseqr`` is returned with two ``primer_bind``
    features: one spanning the first ``len(fwd)`` bases on the forward strand and one
    spanning the last ``len(rvs)`` bases on the reverse strand. ``assembly`` is not used.
    """
    fwd, _, rvs = fragments
    start_rvs = len(input_dseqr) - len(rvs)

    # Work on a copy so that the input is left untouched
    output_dseqr = copy.deepcopy(input_dseqr)

    def add_binding_site(primer, start, end, strand):
        output_dseqr.add_feature(
            x=start,
            y=end,
            type_='primer_bind',
            strand=strand,
            label=[primer.name],
            note=['sequence: ' + str(primer.seq)],
        )

    add_binding_site(fwd, 0, len(fwd), 1)
    add_binding_site(rvs, start_rvs, len(output_dseqr), -1)
    return output_dseqr
587
-
588
-
589
def edge_representation2subfragment_representation(assembly, is_circular):
    """
    Turn this kind of edge representation: fragment 1, fragment 2, right edge on 1, left edge on 2
    a = [(1, 2, 'loc1a', 'loc2a'), (2, 3, 'loc2b', 'loc3b'), (3, 1, 'loc3c', 'loc1c')]
    Into this: fragment 1, left edge on 1, right edge on 1
    b = [(1, 'loc1c', 'loc1a'), (2, 'loc2a', 'loc2b'), (3, 'loc3b', 'loc3c')]

    In linear assemblies, the left edge of the first fragment and the right edge of
    the last fragment are None. The result is returned as a tuple.
    """
    edges = list(assembly)
    if is_circular:
        # The last edge is also the one entering the first fragment
        padded_edges = list(assembly[-1:]) + edges
    else:
        # Dummy edges entering the first fragment and leaving the last one
        entering_first = (None, assembly[0][0], None, None)
        leaving_last = (assembly[-1][1], None, None, None)
        padded_edges = [entering_first] + edges + [leaving_last]

    # Each fragment sits between the edge that enters it and the edge that leaves it
    return tuple(
        (node, start_location, end_location)
        for (_, node, _, start_location), (_, _, end_location, _) in zip(padded_edges, padded_edges[1:])
    )
607
-
608
-
609
def subfragment_representation2edge_representation(assembly, is_circular):
    """
    Turn this kind of subfragment representation: fragment 1, left edge on 1, right edge on 1
    a = [(1, 'loc1c', 'loc1a'), (2, 'loc2a', 'loc2b'), (3, 'loc3b', 'loc3c')]
    Into this: fragment 1, fragment 2, right edge on 1, left edge on 2
    b = [(1, 2, 'loc1a', 'loc2a'), (2, 3, 'loc2b', 'loc3b'), (3, 1, 'loc3c', 'loc1c')]

    The edge from the last fragment back to the first is only included if
    ``is_circular`` is set. The result is returned as a tuple.
    """
    # One edge per pair of consecutive fragments
    edge_representation = [
        (frag1, frag2, right1, left2)
        for (frag1, _, right1), (frag2, left2, _) in zip(assembly, assembly[1:])
    ]

    if is_circular:
        # Add the edge from the last fragment back to the first
        frag_last, _, right_last = assembly[-1]
        frag_first, left_first, _ = assembly[0]
        edge_representation.append((frag_last, frag_first, right_last, left_first))

    return tuple(edge_representation)
633
-
634
-
635
def get_assembly_subfragments(fragments: list[_Dseqrecord], subfragment_representation):
    """From the fragment representation returned by edge_representation2subfragment_representation, get the subfragments that are joined together.

    Subfragments are the slices of the fragments that are joined together. Negative
    node indexes refer to the reverse complement of the fragment.

    For example:
    ```
      --A--
    TACGTAAT
      --B--
     TCGTAACGA

    Gives: TACGTAA / CGTAACGA
    ```
    To reproduce:
    ```
    a = Dseqrecord('TACGTAAT')
    b = Dseqrecord('TCGTAACGA')
    f = Assembly([a, b], limit=5)
    a0 = f.get_linear_assemblies()[0]
    print(assembly2str(a0))
    a0_subfragment_rep = edge_representation2subfragment_representation(a0, False)
    for f in get_assembly_subfragments([a, b], a0_subfragment_rep):
        print(f.seq)

    # prints TACGTAA and CGTAACGA
    ```
    """

    def oriented_fragment(node):
        if node > 0:
            return fragments[node - 1]
        return fragments[-node - 1].reverse_complement()

    return [
        extract_subfragment(oriented_fragment(node), start_location, end_location)
        for node, start_location, end_location in subfragment_representation
    ]
670
-
671
-
672
def extract_subfragment(seq: _Dseqrecord, start_location: Location, end_location: Location):
    """Extract a subfragment from a sequence, given the start and end locations of the subfragment.

    The subfragment goes from the start of ``start_location`` (or the start of the
    sequence if None) to the end of ``end_location`` (or the end of the sequence if None).
    If ``seq`` is circular and both locations overlap, the sequence is opened with
    ``apply_cut`` and its overhangs are filled instead of slicing.
    """
    start = 0 if start_location is None else _location_boundaries(start_location)[0]
    end = None if end_location is None else _location_boundaries(end_location)[1]

    both_locations_given = start_location is not None and end_location is not None
    needs_opening = (
        seq.circular and both_locations_given and _locations_overlap(start_location, end_location, len(seq))
    )
    if not needs_opening:
        return seq[start:end]

    # Special case, some of it could be handled by better Dseqrecord slicing in the future.
    # The overhang is different for origin-spanning features, for instance
    # for a feature join{[12:13], [0:3]} in a sequence of length 13, the overhang
    # is -4, not 9
    ovhg = start - end if end > start else start - end - len(seq)
    # edge case: an overhang as long as the whole sequence is treated as no overhang
    if abs(ovhg) == len(seq):
        ovhg = 0
    dummy_cut = ((start, ovhg), None)
    open_seq = seq.apply_cut(dummy_cut, dummy_cut)
    return _Dseqrecord(fill_dseq(open_seq.seq), features=open_seq.features)
696
-
697
-
698
def is_sublist(sublist, my_list, my_list_is_cyclic=False):
    """Returns True if sublist is a contiguous sublist of my_list (which can be treated as cyclic), False otherwise.

    Lists and tuples can be mixed, elements are compared after converting to lists.

    Examples
    --------
    >>> is_sublist([1, 2], [1, 2, 3], False)
    True
    >>> is_sublist([1, 2], [1, 3, 2], False)
    False

    # See the case here for cyclic lists
    >>> is_sublist([3, 1], [1, 2, 3], False)
    False
    >>> is_sublist([3, 1], [1, 2, 3], True)
    True
    """
    window = len(sublist)
    # Doubling the list makes the windows that wrap around the end contiguous
    searched = my_list + my_list if my_list_is_cyclic else my_list
    wanted = list(sublist)
    return any(list(searched[start : start + window]) == wanted for start in range(len(searched) - window + 1))
722
-
723
-
724
def circular_permutation_min_abs(lst):
    """Returns the circular permutation of lst with the smallest absolute value first.

    If several elements share the smallest absolute value, the first of them is used.

    Examples
    --------
    >>> circular_permutation_min_abs([1, 2, 3])
    [1, 2, 3]
    >>> circular_permutation_min_abs([3, 1, 2])
    [1, 2, 3]
    """
    pivot, _ = min(enumerate(lst), key=lambda indexed: abs(indexed[1]))
    return lst[pivot:] + lst[:pivot]
736
-
737
-
738
- class Assembly:
739
- """Assembly of a list of linear DNA fragments into linear or circular
740
- constructs. The Assembly is meant to replace the Assembly method as it
741
- is easier to use. Accepts a list of Dseqrecords (source fragments) to
742
- initiate an Assembly object. Several methods are available for analysis
743
- of overlapping sequences, graph construction and assembly.
744
-
745
- The assembly contains a directed graph, where nodes represent fragments and
746
- edges represent overlaps between fragments. :
747
- - The node keys are integers, representing the index of the fragment in the
748
- input list of fragments. The sign of the node key represents the orientation
749
- of the fragment, positive for forward orientation, negative for reverse orientation.
750
- - The edges contain the locations of the overlaps in the fragments. For an edge (u, v, key):
751
- - u and v are the nodes connected by the edge.
752
- - key is a string that represents the location of the overlap. In the format:
753
- 'u[start:end](strand):v[start:end](strand)'.
754
- - Edges have a 'locations' attribute, which is a list of two FeatureLocation objects,
755
- representing the location of the overlap in the u and v fragment, respectively.
756
- - You can think of an edge as a representation of the join of two fragments.
757
-
758
- If fragment 1 and 2 share a subsequence of 6bp, [8:14] in fragment 1 and [1:7] in fragment 2,
759
- there will be 4 edges representing that overlap in the graph, for all possible
760
- orientations of the fragments (see add_edges_from_match for details):
761
- - `(1, 2, '1[8:14]:2[1:7]')`
762
- - `(2, 1, '2[1:7]:1[8:14]')`
763
- - `(-1, -2, '-1[0:6]:-2[10:16]')`
764
- - `(-2, -1, '-2[10:16]:-1[0:6]')`
765
-
766
- An assembly can be thought of as a tuple of graph edges, but instead of representing them with node indexes and keys, we represent them
767
- as u, v, locu, locv, where u and v are the nodes connected by the edge, and locu and locv are the locations of the overlap in the first
768
- and second fragment. Assemblies are then represented as:
769
- - Linear: ((1, 2, [8:14], [1:7]), (2, 3, [10:17], [1:8]))
770
- - Circular: ((1, 2, [8:14], [1:7]), (2, 3, [10:17], [1:8]), (3, 1, [12:17], [1:6]))
771
- Note that the first and last fragment are the same in a circular assembly.
772
-
773
- The following constrains are applied to remove duplicate assemblies:
774
- - Circular assemblies: the first subfragment is not reversed, and has the smallest index in the input fragment list.
775
- use_fragment_order is ignored.
776
- - Linear assemblies:
777
- - Using uid (see add_edges_from_match) to identify unique edges.
778
-
779
- Parameters
780
- ----------
781
-
782
- fragments : list
783
- a list of Dseqrecord objects.
784
- limit : int, optional
785
- The shortest shared homology to be considered
786
- algorithm : function, optional
787
- The algorithm used to determine the shared sequences.
788
- use_fragment_order : bool, optional
789
- Legacy pydna behaviour: only assemblies that start with the first fragment and end with the last are considered.
790
- use_all_fragments : bool, optional
791
- Constrain the assembly to use all fragments.
792
-
793
- Examples
794
- --------
795
-
796
- from assembly2 import Assembly, assembly2str
797
- from pydna.dseqrecord import Dseqrecord
798
-
799
- example_fragments = (
800
- Dseqrecord('AacgatCAtgctcc', name='a'),
801
- Dseqrecord('TtgctccTAAattctgc', name='b'),
802
- Dseqrecord('CattctgcGAGGacgatG', name='c'),
803
- )
804
-
805
- asm = Assembly(example_fragments, limit=5, use_fragment_order=False)
806
- print('Linear ===============')
807
- for assembly in asm.get_linear_assemblies():
808
- print(' ', assembly2str(assembly))
809
- print('Circular =============')
810
- for assembly in asm.get_circular_assemblies():
811
- print(' ', assembly2str(assembly))
812
-
813
- # Prints
814
- Linear ===============
815
- ('1[8:14]:2[1:7]', '2[10:17]:3[1:8]')
816
- ('2[10:17]:3[1:8]', '3[12:17]:1[1:6]')
817
- ('3[12:17]:1[1:6]', '1[8:14]:2[1:7]')
818
- ('1[1:6]:3[12:17]',)
819
- ('2[1:7]:1[8:14]',)
820
- ('3[1:8]:2[10:17]',)
821
- Circular =============
822
- ('1[8:14]:2[1:7]', '2[10:17]:3[1:8]', '3[12:17]:1[1:6]')
823
-
824
- """
825
-
826
def __init__(
    self,
    frags: list[_Dseqrecord],
    limit=25,
    algorithm=common_sub_strings,
    use_fragment_order=True,
    use_all_fragments=False,
):
    """Build the overlap graph for ``frags``.

    Every fragment gets two nodes: ``i + 1`` holding the fragment itself and
    ``-(i + 1)`` holding its reverse complement. ``algorithm`` is then run on
    every pair of distinct fragments, in all four relative orientations, and
    each match it returns is stored through ``add_edges_from_match``.

    Parameters
    ----------
    frags : list
        Fragments to assemble.
    limit : int, optional
        Passed as third argument to ``algorithm``.
    algorithm : function, optional
        Called as ``algorithm(u_seq, v_seq, limit)``; must return an iterable of matches.
    use_fragment_order : bool, optional
        Stored on the instance, read by the ``get_*_assemblies`` methods.
    use_all_fragments : bool, optional
        Stored on the instance, read by the ``get_*_assemblies`` methods.
    """
    # TODO: allow for the same fragment to be included more than once?
    graph = _nx.MultiDiGraph()
    self.G = graph

    # Forward nodes first, then the reverse-complement nodes
    for index, fragment in enumerate(frags, start=1):
        graph.add_node(index, seq=fragment)
    for index, fragment in enumerate(frags, start=1):
        graph.add_node(-index, seq=fragment.reverse_complement())

    # Every unordered pair of distinct fragments...
    forward_nodes = [node for node in graph.nodes if node > 0]
    for i, j in _itertools.combinations(forward_nodes, 2):
        # ...in every relative orientation
        for u in (i, -i):
            for v in (j, -j):
                u_seq = graph.nodes[u]['seq']
                v_seq = graph.nodes[v]['seq']
                for match in algorithm(u_seq, v_seq, limit):
                    self.add_edges_from_match(match, u, v, u_seq, v_seq)

    self.fragments = frags
    self.limit = limit
    self.algorithm = algorithm
    self.use_fragment_order = use_fragment_order
    self.use_all_fragments = use_all_fragments
858
-
859
@classmethod
def assembly_is_valid(
    cls, fragments: list[_Dseqrecord | _Primer], assembly, is_circular, use_all_fragments, is_insertion=False
):
    """Return True if ``assembly`` (a sequence of ``(u, v, loc_u, loc_v)`` edges) passes all checks below.

    The assembly is rejected when:

    - ``is_circular`` is None or the assembly has no edges.
    - ``use_all_fragments`` is set and the number of distinct fragments in the
      edges differs from ``len(fragments)``.
    - It is circular and the first ``u`` differs from the last ``v``.
    - For two consecutive edges meeting on a primer or a non-circular fragment,
      the overlap with the previous fragment does not end strictly before the
      overlap with the next fragment ends, e.g.::

          Compatible:   (1,2,[2:9],[0:7]),   (2,3,[12:19],[0:7])
          Incompatible: (1,2,[2:9],[13:20]), (2,3,[0:7],[0:7])
          Redundant:    (1,2,[2:9],[1:8]),   (2,3,[0:8],[0:8])

    - A fragment is used more than once, in either orientation.
    """
    if is_circular is None or len(assembly) == 0:
        return False

    if use_all_fragments and len(fragments) != len(set(flatten(map(abs, e[:2]) for e in assembly))):
        return False

    if is_circular:
        # A circular assembly has to close on the fragment it started with
        if assembly[0][0] != assembly[-1][1]:
            return False
        following_edges = assembly[1:] + assembly[:1]
    else:
        following_edges = assembly[1:]

    for (_u1, v1, _, start_location), (_u2, _v2, end_location, _) in zip(assembly, following_edges):
        fragment = fragments[abs(v1) - 1]
        behaves_as_linear = isinstance(fragment, _Primer) or not fragment.circular
        if behaves_as_linear and _location_boundaries(start_location)[1] >= _location_boundaries(end_location)[1]:
            return False

    # Each fragment may appear only once, whatever its orientation
    nodes_used = [
        f[0] for f in edge_representation2subfragment_representation(assembly, is_circular or is_insertion)
    ]
    return len(nodes_used) == len(set(map(abs, nodes_used)))
918
-
919
def add_edges_from_match(self, match, u: int, v: int, first: _Dseqrecord, secnd: _Dseqrecord):
    """Store a match between nodes ``u`` and ``v`` as two edges of the graph.

    ``match`` is an ``(x_start, y_start, length)`` triple as returned by an
    `algorithm` function (see pydna.common_substrings). Matches are directional:
    not every `algorithm` returns the same match for (u, v) and (v, u), so only
    these two edges are added:

    - ``(u, v)`` with the locations of the match on ``first`` and ``secnd``.
    - ``(-v, -u)`` with both locations flipped onto the reverse complements,
      in swapped order.

    Both edges carry the same ``uid`` attribute and a ``locations`` attribute;
    the edge key is the string ``'<u><loc_u>:<v><loc_v>'``.
    """
    x_start, y_start, length = match
    len_first, len_secnd = len(first), len(secnd)

    if length == 0:
        # Edge case, blunt ligation: zero-length locations
        loc_first = SimpleLocation(x_start, x_start)
        loc_secnd = SimpleLocation(y_start, y_start)
    else:
        # shift_location with a shift of 0 wraps origin-spanning features
        loc_first = _shift_location(SimpleLocation(x_start, x_start + length), 0, len_first)
        loc_secnd = _shift_location(SimpleLocation(y_start, y_start + length), 0, len_secnd)

    forward_locations = [loc_first, loc_secnd]
    reverse_locations = [loc_secnd._flip(len_secnd), loc_first._flip(len_first)]

    # Unique id shared by both orientations of the edge; it is also the key of the forward edge
    uid = f'{u}{loc_first}:{v}{loc_secnd}'
    self.G.add_edge(u, v, uid, locations=forward_locations, uid=uid)

    rc_u, rc_v = -v, -u
    self.G.add_edge(
        rc_u,
        rc_v,
        f'{rc_u}{reverse_locations[0]}:{rc_v}{reverse_locations[1]}',
        locations=reverse_locations,
        uid=uid,
    )
952
-
953
def format_assembly_edge(self, assembly_edge):
    """Convert a graph edge ``(u, v, key)`` into the assembly form ``(u, v, locu, locv)``.

    The two locations are read from the ``'locations'`` attribute of the edge.
    """
    source, target, key = assembly_edge
    edge_attributes = self.G.get_edge_data(source, target, key)
    loc_source, loc_target = edge_attributes['locations']
    return (source, target, loc_source, loc_target)
958
-
959
def get_linear_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
    """Return the valid linear assemblies of the graph.

    Node paths are found between two mock nodes, 'begin' and 'end', added to a
    copy of the graph. Paths are expanded into assemblies, filtered with
    ``assembly_is_valid`` (and ``assembly_uses_only_adjacent_edges`` when
    ``only_adjacent_edges`` is set) and passed through ``remove_subassemblies``.

    Raises
    ------
    ValueError
        If the number of candidate assemblies, counted before validation,
        exceeds ``max_assemblies``.
    """
    # Work on a copy, the mock nodes must not end up in self.G
    G = _nx.MultiDiGraph(self.G)
    G.add_nodes_from(['begin', 'end'])

    if self.use_fragment_order:
        # Paths go from the first fragment to the last one, in either orientation
        last = len(self.fragments)
        G.add_edge('begin', 1)
        G.add_edge('begin', -1)
        G.add_edge(last, 'end')
        G.add_edge(-last, 'end')
    else:
        # Any fragment can start or end a path
        fragment_nodes = [node for node in G.nodes if type(node) is int]
        for node in fragment_nodes:
            G.add_edge('begin', node)
            G.add_edge(node, 'end')

    unique_linear_paths = self.get_unique_linear_paths(G, max_assemblies)
    possible_assemblies = self.get_possible_assembly_number(unique_linear_paths)
    if possible_assemblies > max_assemblies:
        raise ValueError(f'Too many assemblies ({possible_assemblies} pre-validation) to assemble')

    candidates = _itertools.chain.from_iterable(
        self.node_path2assembly_list(path, False) for path in unique_linear_paths
    )
    out = [a for a in candidates if self.assembly_is_valid(self.fragments, a, False, self.use_all_fragments)]
    if only_adjacent_edges:
        out = [a for a in out if self.assembly_uses_only_adjacent_edges(a, False)]
    return remove_subassemblies(out)
989
-
990
def node_path2assembly_list(self, cycle, circular: bool):
    """Expand a node path such as ``[1, 2, 3]`` into every assembly that follows it.

    Two nodes can be connected by several edges, so one node path can map to
    several assemblies: with two overlaps between 1 and 2 and a single one
    between 2 and 3, two assemblies are returned for the path ``[1, 2, 3]``
    (one per combination of edges). When ``circular`` is True the last node is
    also joined back to the first one.

    Returns a list of tuples of ``(u, v, locu, locv)`` edges.
    """
    next_nodes = cycle[1:] + cycle[:1] if circular else cycle[1:]
    edges_per_step = [[(u, v, key) for key in self.G[u][v]] for u, v in zip(cycle, next_nodes)]
    return [
        tuple(self.format_assembly_edge(edge) for edge in edge_choice)
        for edge_choice in _itertools.product(*edges_per_step)
    ]
1002
-
1003
def get_unique_linear_paths(self, G_with_begin_end: _nx.MultiDiGraph, max_paths):
    """Return the node paths from 'begin' to 'end', without the two mock nodes.

    The multigraph is converted to a DiGraph first, so a path like ``[1, 2, 3]``
    is returned only once even if several edges connect 1 and 2, or 2 and 3.
    Paths that use a fragment twice are dropped; when ``use_all_fragments`` is
    set, so are paths that skip a fragment. Finally, if a path and its reverse
    complement (reversed order, opposite signs) are both present, only the one
    found first is kept.

    Parameters
    ----------
    G_with_begin_end : MultiDiGraph
        Copy of the assembly graph with the 'begin' and 'end' nodes connected.
    max_paths
        Currently unused; enumeration is capped at a fixed 10000 paths.

    Raises
    ------
    ValueError
        Raised by ``limit_iterator`` when more than 10000 paths exist.
    """
    # Cutoff has a different meaning of what one would expect, see https://github.com/networkx/networkx/issues/2762
    node_paths = [
        x[1:-1]
        for x in limit_iterator(
            _nx.all_simple_paths(_nx.DiGraph(G_with_begin_end), 'begin', 'end', cutoff=(len(self.fragments) + 1)),
            10000,
        )
    ]

    # Remove those that contain the same fragment twice (in any orientation)
    node_paths = [x for x in node_paths if len(x) == len(set(map(abs, x)))]

    if self.use_all_fragments:
        node_paths = [x for x in node_paths if len(x) == len(self.fragments)]

    # For each path, we check if its reverse complement was already kept
    # See: https://github.com/manulera/OpenCloning_backend/issues/160
    # A set of tuples gives constant-time lookups; the returned list keeps the original order.
    kept = set()
    unique_node_paths = []
    for p in node_paths:
        reverse_complement_path = tuple(-x for x in reversed(p))
        if reverse_complement_path not in kept:
            kept.add(tuple(p))
            unique_node_paths.append(p)

    return unique_node_paths
1031
-
1032
def get_possible_assembly_number(self, paths):
    """Count how many assemblies the given node paths can expand to.

    For each path, the number of parallel edges between every pair of
    consecutive nodes is multiplied (pairs not connected in the graph are
    ignored); the per-path products are then added up.
    """
    graph = self.G
    total = 0
    for node_path in paths:
        combinations_in_path = 1
        for source, target in zip(node_path, node_path[1:]):
            neighbours = graph[source]
            if target in neighbours:
                combinations_in_path *= len(neighbours[target])
        total += combinations_in_path
    return total
1041
-
1042
def get_circular_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
    """Return the valid circular assemblies of the graph.

    Cycles of the graph are rotated with ``circular_permutation_min_abs`` and
    only those whose first node is positive are kept, which removes duplicates
    of the same circle. Cycles are expanded into assemblies (see
    ``node_path2assembly_list``) and filtered with ``assembly_is_valid`` (and
    ``assembly_uses_only_adjacent_edges`` when ``only_adjacent_edges`` is set).

    Raises
    ------
    ValueError
        If more than 10000 cycles are found, or if the number of candidate
        assemblies, counted before validation, exceeds ``max_assemblies``.
    """
    n_fragments = len(self.fragments)
    raw_cycles = limit_iterator(_nx.cycles.simple_cycles(self.G, length_bound=n_fragments), 10000)

    # Constraints are applied on the node cycles already, because the number of
    # edge combinations can otherwise explode.
    sorted_cycles = []
    for raw_cycle in raw_cycles:
        cycle = circular_permutation_min_abs(raw_cycle)
        if not cycle[0] > 0:
            continue
        if self.use_all_fragments and len(cycle) != n_fragments:
            continue
        # A fragment cannot appear twice, whatever its orientation
        if len(cycle) != len(set(map(abs, cycle))):
            continue
        sorted_cycles.append(cycle)

    # Close each cycle so that the last -> first join is counted as well
    possible_assembly_number = self.get_possible_assembly_number([c + c[:1] for c in sorted_cycles])
    if possible_assembly_number > max_assemblies:
        raise ValueError(f'Too many assemblies ({possible_assembly_number} pre-validation) to assemble')

    candidates = _itertools.chain.from_iterable(self.node_path2assembly_list(c, True) for c in sorted_cycles)
    out = [a for a in candidates if self.assembly_is_valid(self.fragments, a, True, self.use_all_fragments)]
    if only_adjacent_edges:
        out = [a for a in out if self.assembly_uses_only_adjacent_edges(a, True)]
    return out
1070
-
1071
def format_insertion_assembly(self, assembly):
    """Rotate the edges of a cycle so that they describe an insertion, or return None.

    One of the joins between consecutive edges must sit on the fragment that
    receives the insertion. That fragment must be linear, and the join must
    look like this:

    ```
    --------         -------           Fragment 1
        ||            ||
        xxxxxxxx      ||               Fragment 2
              ||      ||
              oooooooooo               Fragment 3
    ```
    The above example will be [(1, 2, [4:6], [0:2]), (2, 3, [6:8], [0:2]), (3, 1, [8:10], [9:11])]

    simple_cycles may return the edges in any rotation, so they are rotated
    until the first `u` and the last `v` are the fragment receiving the
    insertion (1 in the example above). If no join, or more than one join,
    qualifies, None is returned.
    """
    qualifying_joins = []
    rotated = assembly[1:] + assembly[:1]

    # Pair every edge with the one that follows it in the cycle
    for index, ((_u1, v1, _, end_location), (_u2, _v2, start_location, _)) in enumerate(zip(assembly, rotated)):
        fragment = self.fragments[abs(v1) - 1]
        # We look for the pair of edges that should be last and first:
        # (3, 1, [8:10], [9:11]) and (1, 2, [4:6], [0:2]) in the example above.
        left_of_insertion = _location_boundaries(start_location)[0]
        right_of_insertion = _location_boundaries(end_location)[0]

        if fragment.circular:
            continue

        # The overlap test covers single-site integration. Equality is not enough
        # because the locations may extend left or right. Taking ACCGGTTT as
        # homology arm for an integration:
        #
        #   insert aaACCGGTTTccACCGGTTTtt
        #   genome aaACCGGTTTtt
        #
        # the homology locations on the genome are [0:10] and [2:12]: not
        # identical, but overlapping.
        if right_of_insertion >= left_of_insertion or _locations_overlap(
            start_location, end_location, len(fragment)
        ):
            qualifying_joins.append(index)

    # Exactly one join must qualify for the topology to make sense
    if len(qualifying_joins) != 1:
        return None

    (join_index,) = qualifying_joins
    shift_by = (join_index + 1) % len(assembly)
    return assembly[shift_by:] + assembly[:shift_by]
1121
-
1122
def format_insertion_assembly_edge_case(self, assembly):
    """Adjust a two-edge cycle whose homology locations overlap on both fragments.

    Edge case from https://github.com/manulera/OpenCloning_backend/issues/329

    A copy of ``assembly`` is returned unchanged unless all of these hold:

    - it has exactly two edges, ``(f1, f2, ...)`` and ``(f2, f1, ...)``;
    - the two locations on f1 differ, and so do the two locations on f2;
    - the two locations overlap on f1, and also on f2;
    - after ordering the edges by their start on the first fragment, that
      first fragment is not circular.

    In that case a two-element list is returned, in which the first edge has
    one of its locations rebuilt with ``create_location`` (see the end of the
    function) and the second edge is kept as is.

    Raises
    ------
    AssertionError
        If the spans compared on both fragments have the same length.
    """
    same_assembly = assembly[:]

    if len(assembly) != 2:
        return same_assembly
    ((f1, f2, loc_f1_1, loc_f2_1), (_f2, _f1, loc_f2_2, loc_f1_2)) = assembly

    # Both edges must connect the same two nodes, in opposite directions
    if f1 != _f1 or _f2 != f2:
        return same_assembly

    if loc_f2_1 == loc_f2_2 or loc_f1_2 == loc_f1_1:
        return same_assembly

    fragment1 = self.fragments[abs(f1) - 1]
    fragment2 = self.fragments[abs(f2) - 1]

    if not _locations_overlap(loc_f1_1, loc_f1_2, len(fragment1)) or not _locations_overlap(
        loc_f2_2, loc_f2_1, len(fragment2)
    ):
        return same_assembly

    # Sort to make compatible with insertion assembly
    if _location_boundaries(loc_f1_1)[0] > _location_boundaries(loc_f1_2)[0]:
        new_assembly = same_assembly[::-1]
    else:
        new_assembly = same_assembly[:]

    # Reversing the edges swaps the roles of f1 and f2, so everything is unpacked again
    ((f1, f2, loc_f1_1, loc_f2_1), (_f2, _f1, loc_f2_2, loc_f1_2)) = new_assembly

    fragment1 = self.fragments[abs(f1) - 1]
    if fragment1.circular:
        return same_assembly
    fragment2 = self.fragments[abs(f2) - 1]

    # Extract boundaries
    f2_1_start, _ = _location_boundaries(loc_f2_1)
    f2_2_start, f2_2_end = _location_boundaries(loc_f2_2)
    f1_1_start, _ = _location_boundaries(loc_f1_1)
    f1_2_start, f1_2_end = _location_boundaries(loc_f1_2)

    overlap_diff = len(fragment1[f1_1_start:f1_2_end]) - len(fragment2[f2_1_start:f2_2_end])

    if overlap_diff == 0:
        # Raised explicitly: an `assert` statement is removed under `python -O`,
        # which would let this case silently fall into the `else` branch below.
        raise AssertionError('Overlap is 0')

    if overlap_diff > 0:
        new_loc_f1_1 = create_location(f1_1_start, f1_2_start - overlap_diff, len(fragment1))
        new_loc_f2_1 = create_location(f2_1_start, f2_2_start, len(fragment2))
    else:
        new_loc_f2_1 = create_location(f2_1_start, f2_2_start + overlap_diff, len(fragment2))
        new_loc_f1_1 = create_location(f1_1_start, f1_2_start, len(fragment1))

    new_assembly = [
        (f1, f2, new_loc_f1_1, new_loc_f2_1),
        new_assembly[1],
    ]

    return new_assembly
1183
-
1184
def get_insertion_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
    """Assemblies that represent the insertion of a fragment or series of fragment inside a linear construct. For instance,
    digesting CCCCGAATTCCCCGAATTC with EcoRI and inserting the fragment with two overhangs into the EcoRI site of AAAGAATTCAAA.
    This is not so much meant for the use-case of linear fragments that represent actual linear fragments, but for linear
    fragments that represent a genome region. This can then be used to simulate homologous recombination.

    Raises
    ------
    NotImplementedError
        If ``only_adjacent_edges`` is set.
    ValueError
        If more than 10000 cycles are found, or if the number of candidate
        assemblies, counted before validation, exceeds ``max_assemblies``.
    """
    if only_adjacent_edges:
        raise NotImplementedError('only_adjacent_edges not implemented for insertion assemblies')

    # Insertions are found as cycles of the graph
    cycles = limit_iterator(_nx.cycles.simple_cycles(self.G), 10000)

    # We apply constraints already here because sometimes the combinatorial explosion is too large
    if self.use_all_fragments:
        cycles = [c for c in cycles if len(c) == len(self.fragments)]

    # Remove cycles with duplicates
    cycles = [c for c in cycles if len(c) == len(set(map(abs, c)))]

    possible_assembly_number = self.get_possible_assembly_number([c + c[:1] for c in cycles])

    if possible_assembly_number > max_assemblies:
        raise ValueError(f'Too many assemblies ({possible_assembly_number} pre-validation) to assemble')

    # Expand only the cycles that survived the filters above, instead of
    # enumerating all the cycles of the graph a second time: this way the
    # max_assemblies guard really bounds the work done below.
    assemblies = list(_itertools.chain.from_iterable(self.node_path2assembly_list(c, True) for c in cycles))
    # We format the edge case
    assemblies = [self.format_insertion_assembly_edge_case(a) for a in assemblies]
    # We select those that contain exactly one suitable join
    assemblies = [b for a in assemblies if (b := self.format_insertion_assembly(a)) is not None]
    # First fragment should be in the + orientation
    assemblies = [a for a in assemblies if a[0][0] > 0]
    return [
        a
        for a in assemblies
        if self.assembly_is_valid(self.fragments, a, False, self.use_all_fragments, is_insertion=True)
    ]
1221
-
1222
def assemble_linear(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
    """Build one product per assembly returned by self.get_linear_assemblies."""
    products = []
    for assembly in self.get_linear_assemblies(only_adjacent_edges, max_assemblies):
        products.append(assemble(self.fragments, assembly))
    return products
1226
-
1227
def assemble_circular(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
    """Build one product per assembly returned by self.get_circular_assemblies."""
    products = []
    for assembly in self.get_circular_assemblies(only_adjacent_edges, max_assemblies):
        products.append(assemble(self.fragments, assembly))
    return products
1231
-
1232
def assemble_insertion(self, only_adjacent_edges: bool = False):
    """Build one product per assembly returned by self.get_insertion_assemblies."""
    products = []
    for assembly in self.get_insertion_assemblies(only_adjacent_edges):
        products.append(assemble(self.fragments, assembly, is_insertion=True))
    return products
1236
-
1237
def get_locations_on_fragments(self) -> dict[int, dict[str, list[Location]]]:
    """Collect, for every node of the graph, the locations where it is joined to other fragments.

    The returned dictionary maps each node to ``{'left': [...], 'right': [...]}``:
    `right` holds the locations on this node of the edges that leave it, `left`
    the locations of the edges that arrive at it. Each list has no repeated
    location and is sorted by start position. `left` and `right` are often the
    same, except in restriction-ligation with partial overlap enabled, where we
    can end up with a situation like this:

    GGTCTCCCCAATT and aGGTCTCCAACCAA as fragments

    # Partial overlap in assembly 1[9:11]:2[8:10]
    GGTCTCCxxAACCAA
    CCAGAGGGGTTxxTT

    # Partial overlap in 2[10:12]:1[7:9]
    aGGTCTCCxxCCAATT
    tCCAGAGGTTGGxxAA

    Would return
    {
        1: {'left': [7:9], 'right': [9:11]},
        2: {'left': [8:10], 'right': [10:12]},
        -1: {'left': [2:4], 'right': [4:6]},
        -2: {'left': [2:4], 'right': [4:6]}
    }

    """
    locations_on_fragments = {node: {'left': [], 'right': []} for node in self.G.nodes}

    # An edge (u, v) leaves u on its right side and enters v on its left side
    for u, v, attributes in self.G.edges(data=True):
        loc_on_u, loc_on_v = attributes['locations'][0], attributes['locations'][1]
        for node, side, location in ((u, 'right', loc_on_u), (v, 'left', loc_on_v)):
            known_locations = locations_on_fragments[node][side]
            if location not in known_locations:
                known_locations.append(location)

    def start_of(location):
        return _location_boundaries(location)[0]

    for sides in locations_on_fragments.values():
        sides['left'] = sorted(sides['left'], key=start_of)
        sides['right'] = sorted(sides['right'], key=start_of)

    return locations_on_fragments
1277
-
1278
def assembly_uses_only_adjacent_edges(self, assembly, is_circular: bool) -> bool:
    """
    Tell whether every subfragment of the assembly is delimited by two adjacent join locations.

    This is useful to check if a cut and ligate assembly is valid, and to leave out partially
    digested fragments. For example, imagine the following fragment being an input for a digestion
    and ligation assembly, where the enzyme cuts at the sites indicated by the vertical lines:

               x       y       z
        -------|-------|-------|---------

    We would only want assemblies that contain subfragments start-x, x-y, y-z, z-end, and not
    start-x, y-end, for instance. The latter would indicate that the fragment was partially digested.
    """
    allowed_location_pairs = {}
    for node, sides in self.get_locations_on_fragments().items():
        fragment_len = len(self.fragments[abs(node) - 1])
        left = gather_overlapping_locations(sides['left'], fragment_len)
        right = gather_overlapping_locations(sides['right'], fragment_len)

        if is_circular:
            # Move the first location to the end, so that the last `left` is
            # paired with the first `right`
            right = right[1:] + right[:1]
        else:
            # The existing ends of the fragment are represented by None
            left = [(None,)] + left
            right = right + [(None,)]

        allowed_location_pairs[node] = [
            location_pair
            for left_group, right_group in zip(left, right)
            for location_pair in _itertools.product(left_group, right_group)
        ]

    fragment_assembly = edge_representation2subfragment_representation(assembly, is_circular)
    return all(
        (start_location, end_location) in allowed_location_pairs[node]
        for node, start_location, end_location in fragment_assembly
    )
1322
-
1323
def __repr__(self):
    """Multi-line summary: fragment lengths, limit, number of graph nodes and algorithm name."""
    # https://pyformat.info
    fragment_sizes = ' '.join('{}bp'.format(len(x)) for x in self.fragments)
    template = (
        'Assembly\n'
        'fragments..: {sequences}\n'
        'limit(bp)..: {limit}\n'
        'G.nodes....: {nodes}\n'
        'algorithm..: {al}'
    )
    summary = template.format(
        sequences=fragment_sizes,
        limit=self.limit,
        nodes=self.G.order(),
        al=self.algorithm.__name__,
    )
    return _pretty_str(summary)
1337
-
1338
-
1339
class PCRAssembly(Assembly):
    """Assembly restricted to the input ``[primer, template, primer]``.

    Edges are only searched, with ``alignment_sub_strings``, between a primer
    and the template (either orientation of the template), and between the
    template and the reverse complement of a primer. Only linear assemblies are
    supported.
    """

    def __init__(self, frags: list[_Dseqrecord | _Primer], limit=25, mismatches=0):
        """
        Parameters
        ----------
        frags : list
            Exactly three elements: primer, template, primer.
        limit : int, optional
            Passed as third argument to ``alignment_sub_strings``.
        mismatches : int, optional
            Passed as fourth argument to ``alignment_sub_strings``.

        Raises
        ------
        ValueError
            If ``frags`` is not a primer, a non-primer and a primer, in that order.
        """
        value_error = ValueError(
            'PCRAssembly assembly must be initialised with a list/tuple of primer, template, primer'
        )
        if len(frags) != 3:
            raise value_error

        # Validate the inputs: should be a series of primer, template, primer
        wrong_fragment_class = (
            not isinstance(frags[0], _Primer),
            isinstance(frags[1], _Primer),
            not isinstance(frags[2], _Primer),
        )
        if any(wrong_fragment_class):
            raise value_error

        # TODO: allow for the same fragment to be included more than once?
        self.G = _nx.MultiDiGraph()
        # Add positive and negative nodes for forward and reverse fragments
        self.G.add_nodes_from((i + 1, {'seq': f}) for (i, f) in enumerate(frags))
        self.G.add_nodes_from((-(i + 1), {'seq': f.reverse_complement()}) for (i, f) in enumerate(frags))

        # Node ids of primer, template, primer
        p1, t, p2 = 1, 2, 3
        # primer -> template, then template -> reverse complement of a primer
        pairs = list(_itertools.product([p1, p2], [t, -t]))
        pairs += list(_itertools.product([t, -t], [-p1, -p2]))

        for u, v in pairs:
            u_seq = self.G.nodes[u]['seq']
            v_seq = self.G.nodes[v]['seq']
            matches = alignment_sub_strings(u_seq, v_seq, limit, mismatches)
            for match in matches:
                self.add_edges_from_match(match, u, v, u_seq, v_seq)

        # These two are constrained
        self.use_fragment_order = False
        self.use_all_fragments = True

        self.fragments = frags
        self.limit = limit
        self.algorithm = alignment_sub_strings

    def get_linear_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
        """Same as ``Assembly.get_linear_assemblies``, but ``only_adjacent_edges`` is not supported."""
        if only_adjacent_edges:
            raise NotImplementedError('only_adjacent_edges not implemented for PCR assemblies')

        return super().get_linear_assemblies(max_assemblies=max_assemblies)

    # The two methods below accept `max_assemblies` like the methods they override, so
    # that assemble_circular / assemble_insertion reach the NotImplementedError instead
    # of failing with a TypeError on the extra argument.
    def get_circular_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
        """Not supported for PCR, always raises NotImplementedError."""
        raise NotImplementedError('get_circular_assemblies not implemented for PCR assemblies')

    def get_insertion_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
        """Not supported for PCR, always raises NotImplementedError."""
        raise NotImplementedError('get_insertion_assemblies not implemented for PCR assemblies')
1401
-
1402
class SingleFragmentAssembly(Assembly):
    """
    An assembly that represents the circularisation or splicing of a single fragment.
    """

    def __init__(self, frags: list[_Dseqrecord], limit=25, algorithm=common_sub_strings):
        """
        Parameters
        ----------
        frags : list
            A list with exactly one fragment.
        limit : int, optional
            Passed as third argument to ``algorithm``.
        algorithm : function, optional
            Called as ``algorithm(frag, frag, limit)`` to find matches of the fragment with itself.

        Raises
        ------
        ValueError
            If ``frags`` does not contain exactly one fragment.
        """
        if len(frags) != 1:
            raise ValueError('SingleFragmentAssembly assembly must be initialised with a single fragment')
        # TODO: allow for the same fragment to be included more than once?
        self.G = _nx.MultiDiGraph()
        frag = frags[0]
        # Only the forward node is added explicitly; add_edges_from_match also
        # adds edges on node -1, which creates that node in the graph.
        self.G.add_node(1, seq=frag)

        matches = algorithm(frag, frag, limit)
        for match in matches:
            self.add_edges_from_match(match, 1, 1, frag, frag)

        # To avoid duplicated outputs
        # NOTE(review): with a 2-tuple, networkx removes a single edge between
        # the two nodes of a multigraph. Confirm whether every (-1, -1) edge
        # was meant to be removed when there are several matches.
        self.G.remove_edges_from([(-1, -1)])

        # These two are constrained
        self.use_fragment_order = True
        self.use_all_fragments = True

        self.fragments = frags
        self.limit = limit
        self.algorithm = algorithm

    def get_circular_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
        """Circular assemblies of the parent class, without those joining a location to itself."""
        # We don't want the same location twice
        assemblies = filter(
            lambda x: x[0][2] != x[0][3], super().get_circular_assemblies(only_adjacent_edges, max_assemblies)
        )
        return [a for a in assemblies if self.assembly_is_valid(self.fragments, a, True, self.use_all_fragments)]

    def get_insertion_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
        """This could be renamed splicing assembly, but the essence is similar"""

        if only_adjacent_edges:
            raise NotImplementedError('only_adjacent_edges not implemented for insertion assemblies')

        def splicing_assembly_filter(x):
            # We don't want the same location twice
            if x[0][2] == x[0][3]:
                return False
            # We don't want to get overlap only (e.g. GAATTCcatGAATTC giving GAATTC)
            left_start, _ = _location_boundaries(x[0][2])
            _, right_end = _location_boundaries(x[0][3])
            if left_start == 0 and right_end == len(self.fragments[0]):
                return False
            return True

        assemblies = filter(splicing_assembly_filter, super().get_insertion_assemblies(max_assemblies=max_assemblies))
        return [
            a
            for a in assemblies
            if self.assembly_is_valid(self.fragments, a, False, self.use_all_fragments, is_insertion=True)
        ]

    def get_linear_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
        """Not supported, always raises NotImplementedError.

        The parameters of the overridden method are accepted so that
        assemble_linear reaches this error instead of failing with a TypeError.
        """
        raise NotImplementedError('Linear assembly does not make sense')