opencloning 0.3.8__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opencloning/app_settings.py +1 -0
- opencloning/batch_cloning/EBIC/example.py +1 -3
- opencloning/batch_cloning/pombe/pombe_clone.py +29 -37
- opencloning/batch_cloning/pombe/pombe_summary.py +11 -7
- opencloning/batch_cloning/ziqiang_et_al2024/__init__.py +28 -56
- opencloning/batch_cloning/ziqiang_et_al2024/ziqiang_et_al2024.json +47 -56
- opencloning/bug_fixing/README.md +5 -2
- opencloning/bug_fixing/backend_v0_3.py +12 -15
- opencloning/dna_functions.py +5 -6
- opencloning/dna_utils.py +26 -21
- opencloning/endpoints/assembly.py +27 -23
- opencloning/endpoints/no_assembly.py +8 -5
- opencloning/endpoints/no_input.py +11 -4
- opencloning/pydantic_models.py +57 -24
- opencloning/request_examples.py +4 -4
- {opencloning-0.3.8.dist-info → opencloning-0.4.2.dist-info}/METADATA +6 -5
- {opencloning-0.3.8.dist-info → opencloning-0.4.2.dist-info}/RECORD +19 -21
- opencloning/assembly2.py +0 -1467
- opencloning/batch_cloning/pombe/pombe_all.sh +0 -9
- {opencloning-0.3.8.dist-info → opencloning-0.4.2.dist-info}/LICENSE +0 -0
- {opencloning-0.3.8.dist-info → opencloning-0.4.2.dist-info}/WHEEL +0 -0
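The largest single change in this release is the removal of opencloning/assembly2.py, whose full deleted content follows. For orientation before reading the diff, here is a minimal sketch of how the removed module's Assembly class was driven, taken from the usage example embedded in its own class docstring; the fragment sequences, the limit value, and the output shown in the comment are the docstring's illustrative values, not something verified against 0.4.2.

```python
from assembly2 import Assembly, assembly2str
from pydna.dseqrecord import Dseqrecord

# Three short fragments whose ends share at least `limit` bases of homology
# (sequences as in the removed module's docstring example)
example_fragments = (
    Dseqrecord('AacgatCAtgctcc', name='a'),
    Dseqrecord('TtgctccTAAattctgc', name='b'),
    Dseqrecord('CattctgcGAGGacgatG', name='c'),
)

asm = Assembly(example_fragments, limit=5, use_fragment_order=False)
for assembly in asm.get_circular_assemblies():
    # per the docstring, prints ('1[8:14]:2[1:7]', '2[10:17]:3[1:8]', '3[12:17]:1[1:6]')
    print(assembly2str(assembly))
```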
opencloning/assembly2.py
DELETED
|
@@ -1,1467 +0,0 @@
|
|
|
1
|
-
"""Slightly different assembly implementation"""
|
|
2
|
-
|
|
3
|
-
from pydna.utils import (
|
|
4
|
-
shift_location as _shift_location,
|
|
5
|
-
flatten,
|
|
6
|
-
location_boundaries as _location_boundaries,
|
|
7
|
-
locations_overlap as _locations_overlap,
|
|
8
|
-
)
|
|
9
|
-
from pydna._pretty import pretty_str as _pretty_str
|
|
10
|
-
from pydna.common_sub_strings import common_sub_strings as common_sub_strings_str
|
|
11
|
-
from pydna.dseqrecord import Dseqrecord as _Dseqrecord
|
|
12
|
-
from pydna.dseq import Dseq as _Dseq
|
|
13
|
-
from pydna.primer import Primer as _Primer
|
|
14
|
-
from pydna.seqrecord import SeqRecord as _SeqRecord
|
|
15
|
-
import networkx as _nx
|
|
16
|
-
import itertools as _itertools
|
|
17
|
-
from Bio.SeqFeature import SimpleLocation, Location
|
|
18
|
-
from .dna_utils import sum_is_sticky, create_location
|
|
19
|
-
from Bio.Seq import reverse_complement
|
|
20
|
-
from Bio.Restriction.Restriction import RestrictionBatch, AbstractCut
|
|
21
|
-
import regex
|
|
22
|
-
import copy
|
|
23
|
-
|
|
24
|
-
# Currently unused, commented out because it's not tested
|
|
25
|
-
# def primers_clash(assembly, fragments):
|
|
26
|
-
# edge_pairs = zip(assembly, assembly[1:])
|
|
27
|
-
# for (_u1, _v1, _, start_location), (_u2, _v2, end_location, _) in edge_pairs:
|
|
28
|
-
# # Only for primer joins
|
|
29
|
-
# if not isinstance(fragments[abs(_v1) - 1], _Dseqrecord):
|
|
30
|
-
# continue
|
|
31
|
-
# if _locations_overlap(start_location, end_location, len(fragments[abs(_v1) - 1])):
|
|
32
|
-
# return True
|
|
33
|
-
# return False
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def limit_iterator(iterator, limit):
|
|
37
|
-
for i, x in enumerate(iterator):
|
|
38
|
-
if i >= limit:
|
|
39
|
-
raise ValueError(f'Too many possible paths (more than {limit})')
|
|
40
|
-
yield x
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def gather_overlapping_locations(locs: list[Location], fragment_length: int):
|
|
44
|
-
"""
|
|
45
|
-
Turn a list of locations into a list of tuples of those locations, where each tuple contains
|
|
46
|
-
locations that overlap. For example, if locs = [loc1, loc2, loc3], and loc1 and loc2 overlap,
|
|
47
|
-
the output will be [(loc1, loc2), (loc3,)].
|
|
48
|
-
"""
|
|
49
|
-
# Make a graph with all the locations as nodes
|
|
50
|
-
G = _nx.Graph()
|
|
51
|
-
for i, loc in enumerate(locs):
|
|
52
|
-
G.add_node(i, location=loc)
|
|
53
|
-
|
|
54
|
-
# Add edges between nodes that overlap
|
|
55
|
-
for i in range(len(locs)):
|
|
56
|
-
for j in range(i + 1, len(locs)):
|
|
57
|
-
if _locations_overlap(locs[i], locs[j], fragment_length):
|
|
58
|
-
G.add_edge(i, j)
|
|
59
|
-
|
|
60
|
-
# Get groups of overlapping locations
|
|
61
|
-
groups = list()
|
|
62
|
-
for loc_set in _nx.connected_components(G):
|
|
63
|
-
groups.append(tuple(locs[i] for i in loc_set))
|
|
64
|
-
|
|
65
|
-
# Sort by location of the first element in each group (does not matter which since they are overlapping)
|
|
66
|
-
groups.sort(key=lambda x: _location_boundaries(x[0])[0])
|
|
67
|
-
|
|
68
|
-
return groups
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
# def assembly_checksum(G: _nx.MultiDiGraph, edge_list):
|
|
72
|
-
# """Calculate a checksum for an assembly, from a list of edges in the form (u, v, key)."""
|
|
73
|
-
# checksum_list = list()
|
|
74
|
-
# for edge in edge_list:
|
|
75
|
-
# u, v, key = edge
|
|
76
|
-
# checksum_list.append(G.get_edge_data(u, v, key)['uid'])
|
|
77
|
-
|
|
78
|
-
# return min('-'.join(checksum_list), '-'.join(checksum_list[::-1]))
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
def ends_from_cutsite(cutsite: tuple[tuple[int, int], AbstractCut], seq: _Dseq):
|
|
82
|
-
if cutsite is None:
|
|
83
|
-
raise ValueError('None is not supported')
|
|
84
|
-
|
|
85
|
-
cut_watson, cut_crick, ovhg = seq.get_cut_parameters(cutsite, is_left=None)
|
|
86
|
-
if ovhg < 0:
|
|
87
|
-
# TODO check the edge in circular
|
|
88
|
-
return (
|
|
89
|
-
("5'", str(seq[cut_watson:cut_crick].reverse_complement()).lower()),
|
|
90
|
-
("5'", str(seq[cut_watson:cut_crick]).lower()),
|
|
91
|
-
)
|
|
92
|
-
elif ovhg > 0:
|
|
93
|
-
return (
|
|
94
|
-
("3'", str(seq[cut_crick:cut_watson]).lower()),
|
|
95
|
-
("3'", str(seq[cut_crick:cut_watson].reverse_complement()).lower()),
|
|
96
|
-
)
|
|
97
|
-
|
|
98
|
-
return ('blunt', ''), ('blunt', '')
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
def restriction_ligation_overlap(
|
|
102
|
-
seqx: _Dseqrecord, seqy: _Dseqrecord, enzymes=RestrictionBatch, partial=False, allow_blunt=False
|
|
103
|
-
):
|
|
104
|
-
"""Find overlaps. Like in stiky and gibson, the order matters"""
|
|
105
|
-
cuts_x = seqx.seq.get_cutsites(*enzymes)
|
|
106
|
-
cuts_y = seqy.seq.get_cutsites(*enzymes)
|
|
107
|
-
# If blunt ends are allowed, something similar to this could be done to allow
|
|
108
|
-
# joining with linear sequence ends, but for now it messes up with the only_adjacent_edges
|
|
109
|
-
# case
|
|
110
|
-
# if allow_blunt:
|
|
111
|
-
# if not seqx.circular:
|
|
112
|
-
# cuts_x.append(((len(seqx), 0), None))
|
|
113
|
-
# if not seqy.circular:
|
|
114
|
-
# cuts_y.append(((0, 0), None))
|
|
115
|
-
matches = list()
|
|
116
|
-
for cut_x, cut_y in _itertools.product(cuts_x, cuts_y):
|
|
117
|
-
# A blunt end
|
|
118
|
-
if allow_blunt and cut_x[0][1] == cut_y[0][1] == 0:
|
|
119
|
-
matches.append((cut_x[0][0], cut_y[0][0], 0))
|
|
120
|
-
continue
|
|
121
|
-
|
|
122
|
-
# Otherwise, test overhangs
|
|
123
|
-
overlap = sum_is_sticky(ends_from_cutsite(cut_x, seqx.seq)[0], ends_from_cutsite(cut_y, seqy.seq)[1], partial)
|
|
124
|
-
if not overlap:
|
|
125
|
-
continue
|
|
126
|
-
x_watson, x_crick, x_ovhg = seqx.seq.get_cut_parameters(cut_x, is_left=False)
|
|
127
|
-
y_watson, y_crick, y_ovhg = seqy.seq.get_cut_parameters(cut_y, is_left=True)
|
|
128
|
-
# Positions where the overlap would start for full overlap
|
|
129
|
-
left_x = x_watson if x_ovhg < 0 else x_crick
|
|
130
|
-
left_y = y_watson if y_ovhg < 0 else y_crick
|
|
131
|
-
|
|
132
|
-
# Correct por partial overlaps
|
|
133
|
-
left_x += abs(x_ovhg) - overlap
|
|
134
|
-
|
|
135
|
-
matches.append((left_x, left_y, overlap))
|
|
136
|
-
return matches
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
def combine_algorithms(*algorithms):
|
|
140
|
-
"""Combine algorithms, if any of them returns a match, the match is returned."""
|
|
141
|
-
|
|
142
|
-
def combined(seqx, seqy, limit):
|
|
143
|
-
matches = list()
|
|
144
|
-
for algorithm in algorithms:
|
|
145
|
-
matches += algorithm(seqx, seqy, limit)
|
|
146
|
-
return matches
|
|
147
|
-
|
|
148
|
-
return combined
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
def blunt_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=None):
|
|
152
|
-
"""Find blunt overlaps"""
|
|
153
|
-
if seqx.seq.three_prime_end()[0] == 'blunt' and seqy.seq.five_prime_end()[0] == 'blunt':
|
|
154
|
-
return [(len(seqx), 0, 0)]
|
|
155
|
-
return []
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
def common_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
|
|
159
|
-
query_seqx = str(seqx.seq).upper()
|
|
160
|
-
query_seqy = str(seqy.seq).upper()
|
|
161
|
-
if seqx.circular:
|
|
162
|
-
query_seqx = query_seqx * 2
|
|
163
|
-
if seqy.circular:
|
|
164
|
-
query_seqy = query_seqy * 2
|
|
165
|
-
results = common_sub_strings_str(query_seqx, query_seqy, limit)
|
|
166
|
-
|
|
167
|
-
if not seqx.circular and not seqy.circular:
|
|
168
|
-
return results
|
|
169
|
-
|
|
170
|
-
# Remove matches that start on the second copy of the sequence
|
|
171
|
-
if seqx.circular:
|
|
172
|
-
results = [r for r in results if r[0] < len(seqx)]
|
|
173
|
-
if seqy.circular:
|
|
174
|
-
results = [r for r in results if r[1] < len(seqy)]
|
|
175
|
-
|
|
176
|
-
# Trim lengths that span more than the sequence
|
|
177
|
-
if seqx.circular or seqy.circular:
|
|
178
|
-
max_match_length = min(len(seqx), len(seqy))
|
|
179
|
-
results = [(r[0], r[1], min(r[2], max_match_length)) for r in results]
|
|
180
|
-
|
|
181
|
-
# Edge case where the sequences are identical
|
|
182
|
-
if len(seqx.seq) == len(seqy.seq):
|
|
183
|
-
full_match = next((r for r in results if r[2] == len(seqx.seq)), None)
|
|
184
|
-
if full_match is not None:
|
|
185
|
-
return [full_match]
|
|
186
|
-
|
|
187
|
-
# Remove duplicate matches, see example below
|
|
188
|
-
# Let's imagine the following two sequences, where either seqy or both are circular
|
|
189
|
-
# seqx: 01234
|
|
190
|
-
# seqy: 123450, circular
|
|
191
|
-
#
|
|
192
|
-
# common_sub_strings would return [(0, 5, 5), (1, 0, 4)]
|
|
193
|
-
# Actually, (1, 0, 4) is a subset of (0, 5, 5), the part
|
|
194
|
-
# that does not span the origin. To remove matches like this,
|
|
195
|
-
# We find matches where the origin is spanned in one of the sequences
|
|
196
|
-
# only, and then remove the subset of that match that does not span the origin.
|
|
197
|
-
shifted_matches = set()
|
|
198
|
-
for x, y, length in results:
|
|
199
|
-
x_span_origin = seqx.circular and x + length > len(seqx)
|
|
200
|
-
y_span_origin = seqy.circular and y + length > len(seqy)
|
|
201
|
-
if x_span_origin and not y_span_origin:
|
|
202
|
-
shift = len(seqx) - x
|
|
203
|
-
shifted_matches.add((0, y + shift, length - shift))
|
|
204
|
-
elif not x_span_origin and y_span_origin:
|
|
205
|
-
shift = len(seqy) - y
|
|
206
|
-
shifted_matches.add((x + shift, 0, length - shift))
|
|
207
|
-
return [r for r in results if r not in shifted_matches]
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
def gibson_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=25):
|
|
211
|
-
"""
|
|
212
|
-
The order matters, we want alignments like:
|
|
213
|
-
|
|
214
|
-
oooo------xxxx
|
|
215
|
-
xxxx------oooo
|
|
216
|
-
Product: oooo------xxxx------oooo
|
|
217
|
-
|
|
218
|
-
Not like:
|
|
219
|
-
|
|
220
|
-
oooo------xxxx
|
|
221
|
-
xxxx------oooo
|
|
222
|
-
Product (unwanted): oooo
|
|
223
|
-
"""
|
|
224
|
-
|
|
225
|
-
# Because Gibson enzymes remove 5' overhangs, we remove them from the sequence
|
|
226
|
-
# when looking for homology, then we shift the location of the second fragment accordingly.
|
|
227
|
-
# This is only relevant for linear fragments, so we don't need to worry about
|
|
228
|
-
# shifting locations for circular fragments.
|
|
229
|
-
trim_x_left = -seqx.seq.ovhg if seqx.seq.ovhg < 0 else 0
|
|
230
|
-
trim_x_right = seqx.seq.watson_ovhg() if seqx.seq.watson_ovhg() < 0 else None
|
|
231
|
-
trim_y_left = -seqy.seq.ovhg if seqy.seq.ovhg < 0 else 0
|
|
232
|
-
trim_y_right = seqy.seq.watson_ovhg() if seqy.seq.watson_ovhg() < 0 else None
|
|
233
|
-
|
|
234
|
-
stringx = str(seqx.seq[trim_x_left:trim_x_right]).upper()
|
|
235
|
-
stringy = str(seqy.seq[trim_y_left:trim_y_right]).upper()
|
|
236
|
-
# We have to convert to list because we need to modify the matches
|
|
237
|
-
matches = [
|
|
238
|
-
list(m) for m in common_sub_strings_str(stringx, stringy, limit) if (m[1] == 0 and m[0] + m[2] == len(stringx))
|
|
239
|
-
]
|
|
240
|
-
for match in matches:
|
|
241
|
-
match[0] += trim_x_left
|
|
242
|
-
match[1] += trim_y_left
|
|
243
|
-
|
|
244
|
-
# convert to tuples again
|
|
245
|
-
return [tuple(m) for m in matches]
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
def sticky_end_sub_strings(seqx: _Dseqrecord, seqy: _Dseqrecord, limit=0):
|
|
249
|
-
"""For now, if limit 0 / False only full overlaps are considered."""
|
|
250
|
-
overlap = sum_is_sticky(seqx.seq.three_prime_end(), seqy.seq.five_prime_end(), limit)
|
|
251
|
-
if overlap:
|
|
252
|
-
return [(len(seqx) - overlap, 0, overlap)]
|
|
253
|
-
return []
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
def zip_match_leftwards(seqx: _SeqRecord, seqy: _SeqRecord, match: tuple[int, int, int]):
|
|
257
|
-
"""Starting from the rightmost edge of the match, return a new match encompassing the max
|
|
258
|
-
number of bases. This can be used to return a longer match if a primer aligns for longer
|
|
259
|
-
than the limit or a shorter match if there are mismatches. This is convenient to maintain
|
|
260
|
-
as many features as possible.
|
|
261
|
-
|
|
262
|
-
>>> seq = _Dseqrecord('AAAAACGTCCCGT')
|
|
263
|
-
>>> primer = _Dseqrecord('ACGTCCCGT')
|
|
264
|
-
>>> match = (13, 9, 0) # an empty match at the end of each
|
|
265
|
-
>>> zip_match_leftwards(seq, primer, match)
|
|
266
|
-
(4, 0, 9)
|
|
267
|
-
|
|
268
|
-
Works in circular molecules if the match spans the origin:
|
|
269
|
-
>>> seq = _Dseqrecord('TCCCGTAAAAACG', circular=True)
|
|
270
|
-
>>> primer = _Dseqrecord('ACGTCCCGT')
|
|
271
|
-
>>> match = (6, 9, 0)
|
|
272
|
-
>>> zip_match_leftwards(seq, primer, match)
|
|
273
|
-
>>> (10, 0, 9)
|
|
274
|
-
|
|
275
|
-
"""
|
|
276
|
-
|
|
277
|
-
query_x = seqrecord2str_for_alignment(seqx)
|
|
278
|
-
query_y = seqrecord2str_for_alignment(seqy)
|
|
279
|
-
|
|
280
|
-
# In circular sequences, the match may go beyond the left-most edge of the sequence if it spans
|
|
281
|
-
# the origin:
|
|
282
|
-
# Primer: ACGTCCCGT
|
|
283
|
-
# |||||||||
|
|
284
|
-
# Circular seq: ACGTCCCGT -> Equivalent to Dseqrecord('CCCGTACGT', circular=True)
|
|
285
|
-
# ^
|
|
286
|
-
# Origin
|
|
287
|
-
# We would start from the last T and move leftwards, but we would stop at the origin
|
|
288
|
-
# For those cases we shift by length, then go back
|
|
289
|
-
|
|
290
|
-
end_on_x = match[0] + match[2]
|
|
291
|
-
if isinstance(seqx, _Dseqrecord) and seqx.circular and end_on_x <= len(seqx):
|
|
292
|
-
end_on_x += len(seqx)
|
|
293
|
-
|
|
294
|
-
end_on_y = match[1] + match[2]
|
|
295
|
-
if isinstance(seqy, _Dseqrecord) and seqy.circular and end_on_y <= len(seqy):
|
|
296
|
-
end_on_y += len(seqy)
|
|
297
|
-
|
|
298
|
-
count = 0
|
|
299
|
-
for x, y in zip(reversed(query_x[:end_on_x]), reversed(query_y[:end_on_y])):
|
|
300
|
-
if x != y:
|
|
301
|
-
break
|
|
302
|
-
count += 1
|
|
303
|
-
|
|
304
|
-
# Shift back by length if needed
|
|
305
|
-
start_on_x = (end_on_x - count) % len(seqx)
|
|
306
|
-
start_on_y = (end_on_y - count) % len(seqy)
|
|
307
|
-
|
|
308
|
-
return (start_on_x, start_on_y, count)
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
def zip_match_rightwards(seqx: _Dseqrecord, seqy: _Dseqrecord, match: tuple[int, int, int]):
|
|
312
|
-
"""Same as zip_match_leftwards, towards the right."""
|
|
313
|
-
|
|
314
|
-
query_x = seqrecord2str_for_alignment(seqx)
|
|
315
|
-
query_y = seqrecord2str_for_alignment(seqy)
|
|
316
|
-
|
|
317
|
-
start_on_x, start_on_y, _ = match
|
|
318
|
-
count = 0
|
|
319
|
-
for x, y in zip(query_x[start_on_x:], query_y[start_on_y:]):
|
|
320
|
-
if x != y:
|
|
321
|
-
break
|
|
322
|
-
count += 1
|
|
323
|
-
return (start_on_x, start_on_y, count)
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
def seqrecord2str_for_alignment(seqr: _SeqRecord):
|
|
327
|
-
"""Transform a Dseqrecord to a string representation where U is replaced by T, everything is upper case and
|
|
328
|
-
circular sequences are repeated twice."""
|
|
329
|
-
out = str(seqr.seq).upper().replace('U', 'T')
|
|
330
|
-
if isinstance(seqr, _Dseqrecord) and seqr.circular:
|
|
331
|
-
return out * 2
|
|
332
|
-
return out
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
def alignment_sub_strings(seqx: _Dseqrecord | _Primer, seqy: _Dseqrecord | _Primer, limit=25, mismatches=0):
|
|
336
|
-
""""""
|
|
337
|
-
|
|
338
|
-
if isinstance(seqx, _Primer) and isinstance(seqy, _Dseqrecord):
|
|
339
|
-
primer = seqx
|
|
340
|
-
template = seqy
|
|
341
|
-
reverse_primer = False
|
|
342
|
-
elif isinstance(seqx, _Dseqrecord) and isinstance(seqy, _Primer):
|
|
343
|
-
primer = seqy
|
|
344
|
-
template = seqx
|
|
345
|
-
reverse_primer = True
|
|
346
|
-
else:
|
|
347
|
-
raise ValueError('One of the sequences must be a primer and the other a Dseqrecord')
|
|
348
|
-
|
|
349
|
-
if len(primer) < limit:
|
|
350
|
-
return []
|
|
351
|
-
|
|
352
|
-
subject = seqrecord2str_for_alignment(template)
|
|
353
|
-
query = (
|
|
354
|
-
seqrecord2str_for_alignment(primer[:limit]) if reverse_primer else seqrecord2str_for_alignment(primer[-limit:])
|
|
355
|
-
)
|
|
356
|
-
|
|
357
|
-
re_matches = list(regex.finditer('(' + query + '){s<=' + str(mismatches) + '}', subject, overlapped=True))
|
|
358
|
-
re_matches += list(regex.finditer('(?r)(' + query + '){s<=' + str(mismatches) + '}', subject, overlapped=True))
|
|
359
|
-
|
|
360
|
-
out = set()
|
|
361
|
-
for re_match in re_matches:
|
|
362
|
-
|
|
363
|
-
start, end = re_match.span()
|
|
364
|
-
|
|
365
|
-
# For circular sequences the same match is returned twice unless it falls
|
|
366
|
-
# on the origin, we eliminate duplicates here
|
|
367
|
-
if start >= len(template):
|
|
368
|
-
continue
|
|
369
|
-
|
|
370
|
-
# This extends match beyond the limit if the primer aligns more than that
|
|
371
|
-
# and reduces the match if the primer has mismatches
|
|
372
|
-
if reverse_primer:
|
|
373
|
-
# Match in the same format as other assembly algorithms
|
|
374
|
-
starting_match = (start, 0, end - start)
|
|
375
|
-
out.add(zip_match_rightwards(template, primer, starting_match))
|
|
376
|
-
else:
|
|
377
|
-
# Match in the same format as other assembly algorithms
|
|
378
|
-
starting_match = (len(primer) - limit, start, end - start)
|
|
379
|
-
out.add(zip_match_leftwards(primer, template, starting_match))
|
|
380
|
-
|
|
381
|
-
return list(sorted(out))
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
def fill_left(seq: _Dseq):
|
|
385
|
-
"""Fill the left overhang of a sequence with the complementary sequence."""
|
|
386
|
-
new_watson = seq.watson
|
|
387
|
-
new_crick = seq.crick
|
|
388
|
-
|
|
389
|
-
# Watson 5' overhang
|
|
390
|
-
if seq.ovhg < 0:
|
|
391
|
-
new_crick = new_crick + reverse_complement(seq.watson[: -seq.ovhg])
|
|
392
|
-
# Crick 5' overhang
|
|
393
|
-
elif seq.ovhg > 0:
|
|
394
|
-
new_watson = reverse_complement(seq.crick[-seq.ovhg :]) + new_watson
|
|
395
|
-
|
|
396
|
-
return _Dseq(new_watson, new_crick, 0)
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
def fill_right(seq: _Dseq):
|
|
400
|
-
"""Fill the right overhang of a sequence with the complementary sequence."""
|
|
401
|
-
new_watson = seq.watson
|
|
402
|
-
new_crick = seq.crick
|
|
403
|
-
|
|
404
|
-
# Watson 3' overhang
|
|
405
|
-
watson_ovhg = seq.watson_ovhg()
|
|
406
|
-
if watson_ovhg < 0:
|
|
407
|
-
new_watson = new_watson + reverse_complement(seq.crick[:-watson_ovhg])
|
|
408
|
-
|
|
409
|
-
# Crick 3' overhang
|
|
410
|
-
elif watson_ovhg > 0:
|
|
411
|
-
new_crick = reverse_complement(seq.watson[-watson_ovhg:]) + new_crick
|
|
412
|
-
|
|
413
|
-
return _Dseq(new_watson, new_crick, seq.ovhg)
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
def fill_dseq(seq: _Dseq):
|
|
417
|
-
return fill_left(fill_right(seq))
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
def reverse_complement_assembly(
|
|
421
|
-
assembly: list[tuple[int, int, Location, Location]], fragments: list[_Dseqrecord]
|
|
422
|
-
) -> list[tuple[int, int, Location, Location]]:
|
|
423
|
-
"""Complement an assembly, i.e. reverse the order of the fragments and the orientation of the overlaps."""
|
|
424
|
-
new_assembly = list()
|
|
425
|
-
for u, v, locu, locv in assembly:
|
|
426
|
-
f_u = fragments[abs(u) - 1]
|
|
427
|
-
f_v = fragments[abs(v) - 1]
|
|
428
|
-
new_assembly.append((-v, -u, locv._flip(len(f_v)), locu._flip(len(f_u))))
|
|
429
|
-
return new_assembly[::-1]
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
def filter_linear_subassemblies(linear_assemblies, circular_assemblies, fragments):
|
|
433
|
-
"""Remove linear assemblies which are sub-assemblies of circular assemblies"""
|
|
434
|
-
all_circular_assemblies = circular_assemblies + [
|
|
435
|
-
reverse_complement_assembly(c, fragments) for c in circular_assemblies
|
|
436
|
-
]
|
|
437
|
-
filtered_assemblies = [
|
|
438
|
-
assem for assem in linear_assemblies if not any(is_sublist(assem, c, True) for c in all_circular_assemblies)
|
|
439
|
-
]
|
|
440
|
-
# I don't think the line below is necessary, but just in case
|
|
441
|
-
# filtered_assemblies = [l for l in filtered_assemblies if not any(is_sublist(reverse_complement_assembly(l, fragments), c, True) for c in all_circular_assemblies)]
|
|
442
|
-
return filtered_assemblies
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
def remove_subassemblies(assemblies):
|
|
446
|
-
"""Filter out subassemblies, i.e. assemblies that are contained within another assembly.
|
|
447
|
-
|
|
448
|
-
For example:
|
|
449
|
-
[(1, 2, '1[8:14]:2[1:7]'), (2, 3, '2[10:17]:3[1:8]')]
|
|
450
|
-
[(1, 2, '1[8:14]:2[1:7]')]
|
|
451
|
-
The second one is a subassembly of the first one.
|
|
452
|
-
"""
|
|
453
|
-
|
|
454
|
-
# Sort by length, longest first
|
|
455
|
-
assemblies = sorted(assemblies, key=len, reverse=True)
|
|
456
|
-
|
|
457
|
-
filtered_assemblies = list()
|
|
458
|
-
for assembly in assemblies:
|
|
459
|
-
# Check if this assembly is a subassembly of any of the assemblies we have already found
|
|
460
|
-
if not any(is_sublist(assembly, a) for a in filtered_assemblies):
|
|
461
|
-
filtered_assemblies.append(assembly)
|
|
462
|
-
|
|
463
|
-
return filtered_assemblies
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
def assembly2str(assembly):
|
|
467
|
-
"""Convert an assembly to a string representation, for example:
|
|
468
|
-
((1, 2, [8:14], [1:7]),(2, 3, [10:17], [1:8]))
|
|
469
|
-
becomes:
|
|
470
|
-
('1[8:14]:2[1:7]', '2[10:17]:3[1:8]')
|
|
471
|
-
|
|
472
|
-
The reason for this is that by default, a feature '[8:14]' when present in a tuple
|
|
473
|
-
is printed to the console as `SimpleLocation(ExactPosition(8), ExactPosition(14), strand=1)` (very long).
|
|
474
|
-
"""
|
|
475
|
-
return str(tuple(f'{u}{lu}:{v}{lv}' for u, v, lu, lv in assembly))
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
def assembly2str_tuple(assembly):
|
|
479
|
-
"""Convert an assembly to a string representation, like
|
|
480
|
-
((1, 2, [8:14], [1:7]),(2, 3, [10:17], [1:8]))
|
|
481
|
-
"""
|
|
482
|
-
return str(tuple((u, v, str(lu), str(lv)) for u, v, lu, lv in assembly))
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
def assembly_has_mismatches(fragments, assembly):
|
|
486
|
-
for u, v, loc_u, loc_v in assembly:
|
|
487
|
-
seq_u = fragments[u - 1] if u > 0 else fragments[-u - 1].reverse_complement()
|
|
488
|
-
seq_v = fragments[v - 1] if v > 0 else fragments[-v - 1].reverse_complement()
|
|
489
|
-
# TODO: Check issue where extraction failed, and whether it would give problems here
|
|
490
|
-
if str(loc_u.extract(seq_u).seq).upper() != str(loc_v.extract(seq_v).seq).upper():
|
|
491
|
-
return True
|
|
492
|
-
return False
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
def assembly_is_circular(assembly, fragments):
|
|
496
|
-
"""
|
|
497
|
-
Note: This does not work for insertion assemblies, that's why assemble takes the optional argument is_insertion.
|
|
498
|
-
"""
|
|
499
|
-
if assembly[0][0] != assembly[-1][1]:
|
|
500
|
-
return False
|
|
501
|
-
elif isinstance(fragments[abs(assembly[0][0]) - 1], _Dseqrecord) and fragments[abs(assembly[0][0]) - 1].circular:
|
|
502
|
-
return True
|
|
503
|
-
else:
|
|
504
|
-
return _location_boundaries(assembly[0][2])[0] > _location_boundaries(assembly[-1][3])[0]
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
def assemble(fragments, assembly, is_insertion=False):
|
|
508
|
-
"""Execute an assembly, from the representation returned by get_linear_assemblies or get_circular_assemblies."""
|
|
509
|
-
|
|
510
|
-
if is_insertion:
|
|
511
|
-
is_circular = False
|
|
512
|
-
else:
|
|
513
|
-
is_circular = assembly_is_circular(assembly, fragments)
|
|
514
|
-
|
|
515
|
-
subfragment_representation = edge_representation2subfragment_representation(assembly, is_circular)
|
|
516
|
-
|
|
517
|
-
# Sanity check
|
|
518
|
-
for asm_edge in assembly:
|
|
519
|
-
u, v, loc_u, loc_v = asm_edge
|
|
520
|
-
f_u = fragments[u - 1] if u > 0 else fragments[-u - 1].reverse_complement()
|
|
521
|
-
f_v = fragments[v - 1] if v > 0 else fragments[-v - 1].reverse_complement()
|
|
522
|
-
seq_u = str(loc_u.extract(f_u).seq).upper()
|
|
523
|
-
seq_v = str(loc_v.extract(f_v).seq).upper()
|
|
524
|
-
if seq_u != seq_v:
|
|
525
|
-
raise ValueError('Mismatch in assembly')
|
|
526
|
-
|
|
527
|
-
# We transform into Dseqrecords (for primers)
|
|
528
|
-
dseqr_fragments = [f if isinstance(f, _Dseqrecord) else _Dseqrecord(f) for f in fragments]
|
|
529
|
-
subfragments = get_assembly_subfragments(dseqr_fragments, subfragment_representation)
|
|
530
|
-
|
|
531
|
-
# Length of the overlaps between consecutive assembly fragments
|
|
532
|
-
fragment_overlaps = [len(e[-1]) for e in assembly]
|
|
533
|
-
|
|
534
|
-
out_dseqrecord = _Dseqrecord(subfragments[0])
|
|
535
|
-
|
|
536
|
-
for fragment, overlap in zip(subfragments[1:], fragment_overlaps):
|
|
537
|
-
# Shift the features of the right fragment to the left by `overlap`
|
|
538
|
-
new_features = [f._shift(len(out_dseqrecord) - overlap) for f in fragment.features]
|
|
539
|
-
# Join the left sequence including the overlap with the right sequence without the overlap
|
|
540
|
-
# we use fill_right / fill_left so that it works for ligation of sticky ends
|
|
541
|
-
out_dseqrecord = _Dseqrecord(
|
|
542
|
-
fill_right(out_dseqrecord.seq) + fill_left(fragment.seq)[overlap:],
|
|
543
|
-
features=out_dseqrecord.features + new_features,
|
|
544
|
-
)
|
|
545
|
-
|
|
546
|
-
# For circular assemblies, close the loop and wrap origin-spanning features
|
|
547
|
-
if is_circular:
|
|
548
|
-
overlap = fragment_overlaps[-1]
|
|
549
|
-
|
|
550
|
-
# Special case for blunt circularisation
|
|
551
|
-
if overlap == 0:
|
|
552
|
-
return out_dseqrecord.looped()
|
|
553
|
-
|
|
554
|
-
# Remove trailing overlap
|
|
555
|
-
out_dseqrecord = _Dseqrecord(
|
|
556
|
-
fill_dseq(out_dseqrecord.seq)[:-overlap], features=out_dseqrecord.features, circular=True
|
|
557
|
-
)
|
|
558
|
-
for feature in out_dseqrecord.features:
|
|
559
|
-
start, end = _location_boundaries(feature.location)
|
|
560
|
-
if start >= len(out_dseqrecord) or end > len(out_dseqrecord):
|
|
561
|
-
# Wrap around the origin
|
|
562
|
-
feature.location = _shift_location(feature.location, 0, len(out_dseqrecord))
|
|
563
|
-
|
|
564
|
-
return out_dseqrecord
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
def annotate_primer_binding_sites(
|
|
568
|
-
input_dseqr: _Dseqrecord, fragments: list[_Dseqrecord], assembly: list[tuple[int, int, Location, Location]]
|
|
569
|
-
) -> _Dseqrecord:
|
|
570
|
-
"""Annotate the primer binding sites in a Dseqrecord."""
|
|
571
|
-
fwd, _, rvs = fragments
|
|
572
|
-
start_rvs = len(input_dseqr) - len(rvs)
|
|
573
|
-
|
|
574
|
-
output_dseqr = copy.deepcopy(input_dseqr)
|
|
575
|
-
output_dseqr.add_feature(
|
|
576
|
-
x=0, y=len(fwd), type_='primer_bind', strand=1, label=[fwd.name], note=['sequence: ' + str(fwd.seq)]
|
|
577
|
-
)
|
|
578
|
-
output_dseqr.add_feature(
|
|
579
|
-
x=start_rvs,
|
|
580
|
-
y=len(output_dseqr),
|
|
581
|
-
type_='primer_bind',
|
|
582
|
-
strand=-1,
|
|
583
|
-
label=[rvs.name],
|
|
584
|
-
note=['sequence: ' + str(rvs.seq)],
|
|
585
|
-
)
|
|
586
|
-
return output_dseqr
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
def edge_representation2subfragment_representation(assembly, is_circular):
|
|
590
|
-
"""
|
|
591
|
-
Turn this kind of edge representation fragment 1, fragment 2, right edge on 1, left edge on 2
|
|
592
|
-
a = [(1, 2, 'loc1a', 'loc2a'), (2, 3, 'loc2b', 'loc3b'), (3, 1, 'loc3c', 'loc1c')]
|
|
593
|
-
Into this: fragment 1, left edge on 1, right edge on 1
|
|
594
|
-
b = [(1, 'loc1c', 'loc1a'), (2, 'loc2a', 'loc2b'), (3, 'loc3b', 'loc3c')]
|
|
595
|
-
"""
|
|
596
|
-
|
|
597
|
-
if is_circular:
|
|
598
|
-
temp = list(assembly[-1:]) + list(assembly)
|
|
599
|
-
else:
|
|
600
|
-
temp = [(None, assembly[0][0], None, None)] + list(assembly) + [(assembly[-1][1], None, None, None)]
|
|
601
|
-
edge_pairs = zip(temp, temp[1:])
|
|
602
|
-
subfragment_representation = list()
|
|
603
|
-
for (_u1, v1, _, start_location), (_u2, _v2, end_location, _) in edge_pairs:
|
|
604
|
-
subfragment_representation.append((v1, start_location, end_location))
|
|
605
|
-
|
|
606
|
-
return tuple(subfragment_representation)
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
def subfragment_representation2edge_representation(assembly, is_circular):
|
|
610
|
-
"""
|
|
611
|
-
Turn this kind of subfragment representation fragment 1, left edge on 1, right edge on 1
|
|
612
|
-
a = [(1, 'loc1c', 'loc1a'), (2, 'loc2a', 'loc2b'), (3, 'loc3b', 'loc3c')]
|
|
613
|
-
Into this: fragment 1, fragment 2, right edge on 1, left edge on 2
|
|
614
|
-
b = [(1, 2, 'loc1a', 'loc2a'), (2, 3, 'loc2b' 'loc3b'), (3, 1, 'loc3c', 'loc1c')]
|
|
615
|
-
"""
|
|
616
|
-
|
|
617
|
-
edge_representation = []
|
|
618
|
-
|
|
619
|
-
# Iterate through the assembly pairwise to create the edge representation
|
|
620
|
-
for i in range(len(assembly) - 1):
|
|
621
|
-
frag1, left1, right1 = assembly[i]
|
|
622
|
-
frag2, left2, right2 = assembly[i + 1]
|
|
623
|
-
# Create the edge between the current and next fragment
|
|
624
|
-
edge_representation.append((frag1, frag2, right1, left2))
|
|
625
|
-
|
|
626
|
-
if is_circular:
|
|
627
|
-
# Add the edge from the last fragment back to the first
|
|
628
|
-
frag_last, left_last, right_last = assembly[-1]
|
|
629
|
-
frag_first, left_first, right_first = assembly[0]
|
|
630
|
-
edge_representation.append((frag_last, frag_first, right_last, left_first))
|
|
631
|
-
|
|
632
|
-
return tuple(edge_representation)
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
def get_assembly_subfragments(fragments: list[_Dseqrecord], subfragment_representation):
|
|
636
|
-
"""From the fragment representation returned by edge_representation2subfragment_representation, get the subfragments that are joined together.
|
|
637
|
-
|
|
638
|
-
Subfragments are the slices of the fragments that are joined together
|
|
639
|
-
|
|
640
|
-
For example:
|
|
641
|
-
```
|
|
642
|
-
--A--
|
|
643
|
-
TACGTAAT
|
|
644
|
-
--B--
|
|
645
|
-
TCGTAACGA
|
|
646
|
-
|
|
647
|
-
Gives: TACGTAA / CGTAACGA
|
|
648
|
-
```
|
|
649
|
-
To reproduce:
|
|
650
|
-
```
|
|
651
|
-
a = Dseqrecord('TACGTAAT')
|
|
652
|
-
b = Dseqrecord('TCGTAACGA')
|
|
653
|
-
f = Assembly([a, b], limit=5)
|
|
654
|
-
a0 = f.get_linear_assemblies()[0]
|
|
655
|
-
print(assembly2str(a0))
|
|
656
|
-
a0_subfragment_rep =edge_representation2subfragment_representation(a0, False)
|
|
657
|
-
for f in get_assembly_subfragments([a, b], a0_subfragment_rep):
|
|
658
|
-
print(f.seq)
|
|
659
|
-
|
|
660
|
-
# prints TACGTAA and CGTAACGA
|
|
661
|
-
```
|
|
662
|
-
|
|
663
|
-
Subfragments: `cccccgtatcgtgt`, `atcgtgtactgtcatattc`
|
|
664
|
-
"""
|
|
665
|
-
subfragments = list()
|
|
666
|
-
for node, start_location, end_location in subfragment_representation:
|
|
667
|
-
seq = fragments[node - 1] if node > 0 else fragments[-node - 1].reverse_complement()
|
|
668
|
-
subfragments.append(extract_subfragment(seq, start_location, end_location))
|
|
669
|
-
return subfragments
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
def extract_subfragment(seq: _Dseqrecord, start_location: Location, end_location: Location):
|
|
673
|
-
"""Extract a subfragment from a sequence, given the start and end locations of the subfragment."""
|
|
674
|
-
start = 0 if start_location is None else _location_boundaries(start_location)[0]
|
|
675
|
-
end = None if end_location is None else _location_boundaries(end_location)[1]
|
|
676
|
-
|
|
677
|
-
# Special case, some of it could be handled by better Dseqrecord slicing in the future
|
|
678
|
-
if (
|
|
679
|
-
seq.circular
|
|
680
|
-
and start_location is not None
|
|
681
|
-
and end_location is not None
|
|
682
|
-
and _locations_overlap(start_location, end_location, len(seq))
|
|
683
|
-
):
|
|
684
|
-
# The overhang is different for origin-spanning features, for instance
|
|
685
|
-
# for a feature join{[12:13], [0:3]} in a sequence of length 13, the overhang
|
|
686
|
-
# is -4, not 9
|
|
687
|
-
ovhg = start - end if end > start else start - end - len(seq)
|
|
688
|
-
# edge case
|
|
689
|
-
if abs(ovhg) == len(seq):
|
|
690
|
-
ovhg = 0
|
|
691
|
-
dummy_cut = ((start, ovhg), None)
|
|
692
|
-
open_seq = seq.apply_cut(dummy_cut, dummy_cut)
|
|
693
|
-
return _Dseqrecord(fill_dseq(open_seq.seq), features=open_seq.features)
|
|
694
|
-
|
|
695
|
-
return seq[start:end]
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
def is_sublist(sublist, my_list, my_list_is_cyclic=False):
|
|
699
|
-
"""Returns True if sublist is a sublist of my_list (can be treated as cyclic), False otherwise.
|
|
700
|
-
|
|
701
|
-
Examples
|
|
702
|
-
--------
|
|
703
|
-
>>> is_sublist([1, 2], [1, 2, 3], False)
|
|
704
|
-
True
|
|
705
|
-
>>> is_sublist([1, 2], [1, 3, 2], False)
|
|
706
|
-
False
|
|
707
|
-
|
|
708
|
-
# See the case here for cyclic lists
|
|
709
|
-
>>> is_sublist([3, 1], [1, 2, 3], False)
|
|
710
|
-
False
|
|
711
|
-
>>> is_sublist([3, 1], [1, 2, 3], True)
|
|
712
|
-
True
|
|
713
|
-
"""
|
|
714
|
-
n = len(sublist)
|
|
715
|
-
if my_list_is_cyclic:
|
|
716
|
-
my_list = my_list + my_list
|
|
717
|
-
for i in range(len(my_list) - n + 1):
|
|
718
|
-
# Just in case tuples were passed
|
|
719
|
-
if list(my_list[i : i + n]) == list(sublist):
|
|
720
|
-
return True
|
|
721
|
-
return False
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
def circular_permutation_min_abs(lst):
|
|
725
|
-
"""Returns the circular permutation of lst with the smallest absolute value first.
|
|
726
|
-
|
|
727
|
-
Examples
|
|
728
|
-
--------
|
|
729
|
-
>>> circular_permutation_min_abs([1, 2, 3])
|
|
730
|
-
[1, 2, 3]
|
|
731
|
-
>>> circular_permutation_min_abs([3, 1, 2])
|
|
732
|
-
[1, 2, 3]
|
|
733
|
-
"""
|
|
734
|
-
min_abs_index = min(range(len(lst)), key=lambda i: abs(lst[i]))
|
|
735
|
-
return lst[min_abs_index:] + lst[:min_abs_index]
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
class Assembly:
|
|
739
|
-
"""Assembly of a list of linear DNA fragments into linear or circular
|
|
740
|
-
constructs. The Assembly is meant to replace the Assembly method as it
|
|
741
|
-
is easier to use. Accepts a list of Dseqrecords (source fragments) to
|
|
742
|
-
initiate an Assembly object. Several methods are available for analysis
|
|
743
|
-
of overlapping sequences, graph construction and assembly.
|
|
744
|
-
|
|
745
|
-
The assembly contains a directed graph, where nodes represent fragments and
|
|
746
|
-
edges represent overlaps between fragments. :
|
|
747
|
-
- The node keys are integers, representing the index of the fragment in the
|
|
748
|
-
input list of fragments. The sign of the node key represents the orientation
|
|
749
|
-
of the fragment, positive for forward orientation, negative for reverse orientation.
|
|
750
|
-
- The edges contain the locations of the overlaps in the fragments. For an edge (u, v, key):
|
|
751
|
-
- u and v are the nodes connected by the edge.
|
|
752
|
-
- key is a string that represents the location of the overlap. In the format:
|
|
753
|
-
'u[start:end](strand):v[start:end](strand)'.
|
|
754
|
-
- Edges have a 'locations' attribute, which is a list of two FeatureLocation objects,
|
|
755
|
-
representing the location of the overlap in the u and v fragment, respectively.
|
|
756
|
-
- You can think of an edge as a representation of the join of two fragments.
|
|
757
|
-
|
|
758
|
-
If fragment 1 and 2 share a subsequence of 6bp, [8:14] in fragment 1 and [1:7] in fragment 2,
|
|
759
|
-
there will be 4 edges representing that overlap in the graph, for all possible
|
|
760
|
-
orientations of the fragments (see add_edges_from_match for details):
|
|
761
|
-
- `(1, 2, '1[8:14]:2[1:7]')`
|
|
762
|
-
- `(2, 1, '2[1:7]:1[8:14]')`
|
|
763
|
-
- `(-1, -2, '-1[0:6]:-2[10:16]')`
|
|
764
|
-
- `(-2, -1, '-2[10:16]:-1[0:6]')`
|
|
765
|
-
|
|
766
|
-
An assembly can be thought of as a tuple of graph edges, but instead of representing them with node indexes and keys, we represent them
|
|
767
|
-
as u, v, locu, locv, where u and v are the nodes connected by the edge, and locu and locv are the locations of the overlap in the first
|
|
768
|
-
and second fragment. Assemblies are then represented as:
|
|
769
|
-
- Linear: ((1, 2, [8:14], [1:7]), (2, 3, [10:17], [1:8]))
|
|
770
|
-
- Circular: ((1, 2, [8:14], [1:7]), (2, 3, [10:17], [1:8]), (3, 1, [12:17], [1:6]))
|
|
771
|
-
Note that the first and last fragment are the same in a circular assembly.
|
|
772
|
-
|
|
773
|
-
The following constrains are applied to remove duplicate assemblies:
|
|
774
|
-
- Circular assemblies: the first subfragment is not reversed, and has the smallest index in the input fragment list.
|
|
775
|
-
use_fragment_order is ignored.
|
|
776
|
-
- Linear assemblies:
|
|
777
|
-
- Using uid (see add_edges_from_match) to identify unique edges.
|
|
778
|
-
|
|
779
|
-
Parameters
|
|
780
|
-
----------
|
|
781
|
-
|
|
782
|
-
fragments : list
|
|
783
|
-
a list of Dseqrecord objects.
|
|
784
|
-
limit : int, optional
|
|
785
|
-
The shortest shared homology to be considered
|
|
786
|
-
algorithm : function, optional
|
|
787
|
-
The algorithm used to determine the shared sequences.
|
|
788
|
-
use_fragment_order : bool, optional
|
|
789
|
-
Legacy pydna behaviour: only assemblies that start with the first fragment and end with the last are considered.
|
|
790
|
-
use_all_fragments : bool, optional
|
|
791
|
-
Constrain the assembly to use all fragments.
|
|
792
|
-
|
|
793
|
-
Examples
|
|
794
|
-
--------
|
|
795
|
-
|
|
796
|
-
from assembly2 import Assembly, assembly2str
|
|
797
|
-
from pydna.dseqrecord import Dseqrecord
|
|
798
|
-
|
|
799
|
-
example_fragments = (
|
|
800
|
-
Dseqrecord('AacgatCAtgctcc', name='a'),
|
|
801
|
-
Dseqrecord('TtgctccTAAattctgc', name='b'),
|
|
802
|
-
Dseqrecord('CattctgcGAGGacgatG', name='c'),
|
|
803
|
-
)
|
|
804
|
-
|
|
805
|
-
asm = Assembly(example_fragments, limit=5, use_fragment_order=False)
|
|
806
|
-
print('Linear ===============')
|
|
807
|
-
for assembly in asm.get_linear_assemblies():
|
|
808
|
-
print(' ', assembly2str(assembly))
|
|
809
|
-
print('Circular =============')
|
|
810
|
-
for assembly in asm.get_circular_assemblies():
|
|
811
|
-
print(' ', assembly2str(assembly))
|
|
812
|
-
|
|
813
|
-
# Prints
|
|
814
|
-
Linear ===============
|
|
815
|
-
('1[8:14]:2[1:7]', '2[10:17]:3[1:8]')
|
|
816
|
-
('2[10:17]:3[1:8]', '3[12:17]:1[1:6]')
|
|
817
|
-
('3[12:17]:1[1:6]', '1[8:14]:2[1:7]')
|
|
818
|
-
('1[1:6]:3[12:17]',)
|
|
819
|
-
('2[1:7]:1[8:14]',)
|
|
820
|
-
('3[1:8]:2[10:17]',)
|
|
821
|
-
Circular =============
|
|
822
|
-
('1[8:14]:2[1:7]', '2[10:17]:3[1:8]', '3[12:17]:1[1:6]')
|
|
823
|
-
|
|
824
|
-
"""
|
|
825
|
-
|
|
826
|
-
def __init__(
|
|
827
|
-
self,
|
|
828
|
-
frags: list[_Dseqrecord],
|
|
829
|
-
limit=25,
|
|
830
|
-
algorithm=common_sub_strings,
|
|
831
|
-
use_fragment_order=True,
|
|
832
|
-
use_all_fragments=False,
|
|
833
|
-
):
|
|
834
|
-
# TODO: allow for the same fragment to be included more than once?
|
|
835
|
-
self.G = _nx.MultiDiGraph()
|
|
836
|
-
# Add positive and negative nodes for forward and reverse fragments
|
|
837
|
-
self.G.add_nodes_from((i + 1, {'seq': f}) for (i, f) in enumerate(frags))
|
|
838
|
-
self.G.add_nodes_from((-(i + 1), {'seq': f.reverse_complement()}) for (i, f) in enumerate(frags))
|
|
839
|
-
|
|
840
|
-
# Iterate over all possible combinations of fragments
|
|
841
|
-
fragment_pairs = _itertools.combinations(filter(lambda x: x > 0, self.G.nodes), 2)
|
|
842
|
-
for i, j in fragment_pairs:
|
|
843
|
-
# All the relative orientations of the fragments in the pair
|
|
844
|
-
for u, v in _itertools.product([i, -i], [j, -j]):
|
|
845
|
-
u_seq = self.G.nodes[u]['seq']
|
|
846
|
-
v_seq = self.G.nodes[v]['seq']
|
|
847
|
-
matches = algorithm(u_seq, v_seq, limit)
|
|
848
|
-
for match in matches:
|
|
849
|
-
self.add_edges_from_match(match, u, v, u_seq, v_seq)
|
|
850
|
-
|
|
851
|
-
self.fragments = frags
|
|
852
|
-
self.limit = limit
|
|
853
|
-
self.algorithm = algorithm
|
|
854
|
-
self.use_fragment_order = use_fragment_order
|
|
855
|
-
self.use_all_fragments = use_all_fragments
|
|
856
|
-
|
|
857
|
-
return
|
|
858
|
-
|
|
859
|
-
@classmethod
|
|
860
|
-
def assembly_is_valid(
|
|
861
|
-
cls, fragments: list[_Dseqrecord | _Primer], assembly, is_circular, use_all_fragments, is_insertion=False
|
|
862
|
-
):
|
|
863
|
-
"""Function used to filter paths returned from the graph, see conditions tested below."""
|
|
864
|
-
if is_circular is None:
|
|
865
|
-
return False
|
|
866
|
-
|
|
867
|
-
# Linear assemblies may get begin-1-end, begin-2-end, these are removed here.
|
|
868
|
-
if len(assembly) == 0:
|
|
869
|
-
return False
|
|
870
|
-
|
|
871
|
-
if use_all_fragments and len(fragments) != len(set(flatten(map(abs, e[:2]) for e in assembly))):
|
|
872
|
-
return False
|
|
873
|
-
|
|
874
|
-
# Here we check whether subsequent pairs of fragments are compatible, for instance:
|
|
875
|
-
# Compatible (overlap of 1 and 2 occurs before overlap of 2 and 3):
|
|
876
|
-
# (1,2,[2:9],[0:7]), (2,3,[12:19],[0:7])
|
|
877
|
-
# -- A --
|
|
878
|
-
# 1 gtatcgtgt -- B --
|
|
879
|
-
# 2 atcgtgtactgtcatattc
|
|
880
|
-
# 3 catattcaa
|
|
881
|
-
# Incompatible (overlap of 1 and 2 occurs after overlap of 2 and 3):
|
|
882
|
-
# (1,2,[2:9],[13:20]), (2,3,[0:7],[0:7])
|
|
883
|
-
# -- A --
|
|
884
|
-
# 1 -- B -- gtatcgtgt
|
|
885
|
-
# 2 catattcccccccatcgtgtactgt
|
|
886
|
-
# 3 catattcaa
|
|
887
|
-
# Redundant: overlap of 1 and 2 ends at the same spot as overlap of 2 and 3
|
|
888
|
-
# (1,2,[2:9],[1:8]), (2,3,[0:8],[0:8])
|
|
889
|
-
# -- A --
|
|
890
|
-
# gtatcgtgt
|
|
891
|
-
# catcgtgtactgtcatattc
|
|
892
|
-
# catcgtgtactgtcatattc
|
|
893
|
-
# -- B ---
|
|
894
|
-
if is_circular:
|
|
895
|
-
# In a circular assembly, first and last fragment must be the same
|
|
896
|
-
if assembly[0][0] != assembly[-1][1]:
|
|
897
|
-
return False
|
|
898
|
-
edge_pairs = zip(assembly, assembly[1:] + assembly[:1])
|
|
899
|
-
else:
|
|
900
|
-
edge_pairs = zip(assembly, assembly[1:])
|
|
901
|
-
|
|
902
|
-
for (_u1, v1, _, start_location), (_u2, _v2, end_location, _) in edge_pairs:
|
|
903
|
-
# Incompatible as described in figure above
|
|
904
|
-
fragment = fragments[abs(v1) - 1]
|
|
905
|
-
if (isinstance(fragment, _Primer) or not fragment.circular) and _location_boundaries(start_location)[
|
|
906
|
-
1
|
|
907
|
-
] >= _location_boundaries(end_location)[1]:
|
|
908
|
-
return False
|
|
909
|
-
|
|
910
|
-
# Fragments are used only once
|
|
911
|
-
nodes_used = [
|
|
912
|
-
f[0] for f in edge_representation2subfragment_representation(assembly, is_circular or is_insertion)
|
|
913
|
-
]
|
|
914
|
-
if len(nodes_used) != len(set(map(abs, nodes_used))):
|
|
915
|
-
return False
|
|
916
|
-
|
|
917
|
-
return True
|
|
918
|
-
|
|
919
|
-
def add_edges_from_match(self, match, u: int, v: int, first: _Dseqrecord, secnd: _Dseqrecord):
|
|
920
|
-
"""Add edges to the graph from a match returned by an `algorithm` function (see pydna.common_substrings). For
|
|
921
|
-
format of edges (see documentation of the Assembly class).
|
|
922
|
-
|
|
923
|
-
Matches are directional, because not all `algorithm` functions return the same match for (u,v) and (v,u). For example,
|
|
924
|
-
homologous recombination does but sticky end ligation does not. The function returns two edges:
|
|
925
|
-
- Fragments in the orientation they were passed, with locations of the match (u, v, loc_u, loc_v)
|
|
926
|
-
- Reverse complement of the fragments with inverted order, with flipped locations (-v, -u, flip(loc_v), flip(loc_u))/
|
|
927
|
-
|
|
928
|
-
"""
|
|
929
|
-
x_start, y_start, length = match
|
|
930
|
-
if length == 0:
|
|
931
|
-
# Edge case, blunt ligation
|
|
932
|
-
locs = [SimpleLocation(x_start, x_start), SimpleLocation(y_start, y_start)]
|
|
933
|
-
else:
|
|
934
|
-
# We use shift_location with 0 to wrap origin-spanning features
|
|
935
|
-
locs = [
|
|
936
|
-
_shift_location(SimpleLocation(x_start, x_start + length), 0, len(first)),
|
|
937
|
-
_shift_location(SimpleLocation(y_start, y_start + length), 0, len(secnd)),
|
|
938
|
-
]
|
|
939
|
-
|
|
940
|
-
rc_locs = [locs[0]._flip(len(first)), locs[1]._flip(len(secnd))]
|
|
941
|
-
|
|
942
|
-
# Unique id that identifies the edge in either orientation
|
|
943
|
-
uid = f'{u}{locs[0]}:{v}{locs[1]}'
|
|
944
|
-
|
|
945
|
-
combinations = (
|
|
946
|
-
(u, v, locs),
|
|
947
|
-
(-v, -u, rc_locs[::-1]),
|
|
948
|
-
)
|
|
949
|
-
|
|
950
|
-
for u, v, l in combinations:
|
|
951
|
-
self.G.add_edge(u, v, f'{u}{l[0]}:{v}{l[1]}', locations=l, uid=uid)
|
|
952
|
-
|
|
953
|
-
def format_assembly_edge(self, assembly_edge):
|
|
954
|
-
"""Go from the (u, v, key) to the (u, v, locu, locv) format."""
|
|
955
|
-
u, v, key = assembly_edge
|
|
956
|
-
locu, locv = self.G.get_edge_data(u, v, key)['locations']
|
|
957
|
-
return u, v, locu, locv
|
|
958
|
-
|
|
959
|
-
def get_linear_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
|
|
960
|
-
"""Get linear assemblies, applying the constrains described in __init__, ensuring that paths represent
|
|
961
|
-
real assemblies (see assembly_is_valid). Subassemblies are removed (see remove_subassemblies)."""
|
|
962
|
-
|
|
963
|
-
# Copy the graph since we will add the begin and end mock nodes
|
|
964
|
-
G = _nx.MultiDiGraph(self.G)
|
|
965
|
-
G.add_nodes_from(['begin', 'end'])
|
|
966
|
-
|
|
967
|
-
if self.use_fragment_order:
|
|
968
|
-
# Path must start with the first fragment and end with the last
|
|
969
|
-
G.add_edge('begin', 1)
|
|
970
|
-
G.add_edge('begin', -1)
|
|
971
|
-
G.add_edge(len(self.fragments), 'end')
|
|
972
|
-
G.add_edge(-len(self.fragments), 'end')
|
|
973
|
-
else:
|
|
974
|
-
for node in filter(lambda x: type(x) is int, G.nodes):
|
|
975
|
-
G.add_edge('begin', node)
|
|
976
|
-
G.add_edge(node, 'end')
|
|
977
|
-
|
|
978
|
-
unique_linear_paths = self.get_unique_linear_paths(G, max_assemblies)
|
|
979
|
-
possible_assemblies = self.get_possible_assembly_number(unique_linear_paths)
|
|
980
|
-
if possible_assemblies > max_assemblies:
|
|
981
|
-
raise ValueError(f'Too many assemblies ({possible_assemblies} pre-validation) to assemble')
|
|
982
|
-
|
|
983
|
-
assemblies = sum(map(lambda x: self.node_path2assembly_list(x, False), unique_linear_paths), [])
|
|
984
|
-
|
|
985
|
-
out = [a for a in assemblies if self.assembly_is_valid(self.fragments, a, False, self.use_all_fragments)]
|
|
986
|
-
if only_adjacent_edges:
|
|
987
|
-
out = [a for a in out if self.assembly_uses_only_adjacent_edges(a, False)]
|
|
988
|
-
return remove_subassemblies(out)
|
|
989
|
-
|
|
990
|
-
def node_path2assembly_list(self, cycle, circular: bool):
|
|
991
|
-
"""Convert a node path in the format [1, 2, 3] (as returned by _nx.cycles.simple_cycles) to a list of all
|
|
992
|
-
possible assemblies.
|
|
993
|
-
|
|
994
|
-
There may be multiple assemblies for a given node path, if there are several edges connecting two nodes,
|
|
995
|
-
for example two overlaps between 1 and 2, and single overlap between 2 and 3 should return 3 assemblies.
|
|
996
|
-
"""
|
|
997
|
-
combine = list()
|
|
998
|
-
pairing = zip(cycle, cycle[1:] + cycle[:1]) if circular else zip(cycle, cycle[1:])
|
|
999
|
-
for u, v in pairing:
|
|
1000
|
-
combine.append([(u, v, key) for key in self.G[u][v]])
|
|
1001
|
-
return [tuple(map(self.format_assembly_edge, x)) for x in _itertools.product(*combine)]
|
|
1002
|
-
|
|
1003
|
-
def get_unique_linear_paths(self, G_with_begin_end: _nx.MultiDiGraph, max_paths):
|
|
1004
|
-
# We remove the begin and end nodes, and get all paths without edges
|
|
1005
|
-
# e.g. we will get [1, 2, 3] only once, even if multiple edges connect
|
|
1006
|
-
# 1 and 2 or 2 and 3, by converting to DiGraph.
|
|
1007
|
-
|
|
1008
|
-
# Cutoff has a different meaning of what one would expect, see https://github.com/networkx/networkx/issues/2762
|
|
1009
|
-
node_paths = [
|
|
1010
|
-
x[1:-1]
|
|
1011
|
-
for x in limit_iterator(
|
|
1012
|
-
_nx.all_simple_paths(_nx.DiGraph(G_with_begin_end), 'begin', 'end', cutoff=(len(self.fragments) + 1)),
|
|
1013
|
-
10000,
|
|
1014
|
-
)
|
|
1015
|
-
]
|
|
1016
|
-
|
|
1017
|
-
# Remove those that contain the same node twice
|
|
1018
|
-
node_paths = [x for x in node_paths if len(x) == len(set(map(abs, x)))]
|
|
1019
|
-
|
|
1020
|
-
if self.use_all_fragments:
|
|
1021
|
-
node_paths = [x for x in node_paths if len(x) == len(self.fragments)]
|
|
1022
|
-
|
|
1023
|
-
# For each path, we check if there are reverse complement duplicates
|
|
1024
|
-
# See: https://github.com/manulera/OpenCloning_backend/issues/160
|
|
1025
|
-
unique_node_paths = list()
|
|
1026
|
-
for p in node_paths:
|
|
1027
|
-
if [-x for x in p[::-1]] not in unique_node_paths:
|
|
1028
|
-
unique_node_paths.append(p)
|
|
1029
|
-
|
|
1030
|
-
return unique_node_paths
|
|
1031
|
-
|
|
1032
|
-
def get_possible_assembly_number(self, paths):
|
|
1033
|
-
possibilities = 0
|
|
1034
|
-
for path in paths:
|
|
1035
|
-
this_path = 1
|
|
1036
|
-
for u, v in zip(path, path[1:]):
|
|
1037
|
-
if v in self.G[u]:
|
|
1038
|
-
this_path *= len(self.G[u][v])
|
|
1039
|
-
possibilities += this_path
|
|
1040
|
-
return possibilities
|
|
1041
|
-
|
|
1042
|
-
def get_circular_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
|
|
1043
|
-
"""Get circular assemblies, applying the constrains described in __init__, ensuring that paths represent
|
|
1044
|
-
real assemblies (see assembly_is_valid)."""
|
|
1045
|
-
# The constrain of circular sequence is that the first node is the fragment with the smallest index in its initial orientation,
|
|
1046
|
-
# this is ensured by the circular_permutation_min_abs function + the filter below
|
|
1047
|
-
sorted_cycles = map(
|
|
1048
|
-
circular_permutation_min_abs,
|
|
1049
|
-
limit_iterator(_nx.cycles.simple_cycles(self.G, length_bound=len(self.fragments)), 10000),
|
|
1050
|
-
)
|
|
1051
|
-
sorted_cycles = filter(lambda x: x[0] > 0, sorted_cycles)
|
|
1052
|
-
# cycles.simple_cycles returns lists [1,2,3] not assemblies, see self.cycle2circular_assemblies
|
|
1053
|
-
|
|
1054
|
-
# We apply constrains already here because sometimes the combinatorial explosion is too large
|
|
1055
|
-
if self.use_all_fragments:
|
|
1056
|
-
sorted_cycles = [c for c in sorted_cycles if len(c) == len(self.fragments)]
|
|
1057
|
-
|
|
1058
|
-
# Remove cycles with duplicates
|
|
1059
|
-
sorted_cycles = [c for c in sorted_cycles if len(c) == len(set(map(abs, c)))]
|
|
1060
|
-
possible_assembly_number = self.get_possible_assembly_number([c + c[:1] for c in sorted_cycles])
|
|
1061
|
-
if possible_assembly_number > max_assemblies:
|
|
1062
|
-
raise ValueError(f'Too many assemblies ({possible_assembly_number} pre-validation) to assemble')
|
|
1063
|
-
|
|
1064
|
-
assemblies = sum(map(lambda x: self.node_path2assembly_list(x, True), sorted_cycles), [])
|
|
1065
|
-
|
|
1066
|
-
out = [a for a in assemblies if self.assembly_is_valid(self.fragments, a, True, self.use_all_fragments)]
|
|
1067
|
-
if only_adjacent_edges:
|
|
1068
|
-
out = [a for a in out if self.assembly_uses_only_adjacent_edges(a, True)]
|
|
1069
|
-
return out
|
|
1070
|
-
|
|
1071
|
-
def format_insertion_assembly(self, assembly):
|
|
1072
|
-
"""Sorts the fragment representing a cycle so that they represent an insertion assembly if possible,
|
|
1073
|
-
else returns None.
|
|
1074
|
-
|
|
1075
|
-
Here we check if one of the joins between fragments represents the edges of an insertion assembly
|
|
1076
|
-
The fragment must be linear, and the join must be as indicated below
|
|
1077
|
-
|
|
1078
|
-
```
|
|
1079
|
-
-------- ------- Fragment 1
|
|
1080
|
-
|| ||
|
|
1081
|
-
xxxxxxxx || Fragment 2
|
|
1082
|
-
|| ||
|
|
1083
|
-
oooooooooo Fragment 3
|
|
1084
|
-
```
|
|
1085
|
-
The above example will be [(1, 2, [4:6], [0:2]), (2, 3, [6:8], [0:2]), (3, 1, [8:10], [9:11)])]
|
|
1086
|
-
|
|
1087
|
-
These could be returned in any order by simple_cycles, so we sort the edges so that the first
|
|
1088
|
-
and last `u` and `v` match the fragment that gets the insertion (1 in the example above).
|
|
1089
|
-
"""
|
|
1090
|
-
edge_pair_index = list()
|
|
1091
|
-
|
|
1092
|
-
# Pair edges with one another
|
|
1093
|
-
for i, ((_u1, v1, _, end_location), (_u2, _v2, start_location, _)) in enumerate(
|
|
1094
|
-
zip(assembly, assembly[1:] + assembly[:1])
|
|
1095
|
-
):
|
|
1096
|
-
fragment = self.fragments[abs(v1) - 1]
|
|
1097
|
-
# Find the pair of edges that should be last and first ((3, 1, [8:10], [9:11)]), (1, 2, [4:6], [0:2]) in
|
|
1098
|
-
# the example above. Only one of the pairs of edges should satisfy this condition for the topology to make sense.
|
|
1099
|
-
left_of_insertion = _location_boundaries(start_location)[0]
|
|
1100
|
-
right_of_insertion = _location_boundaries(end_location)[0]
|
|
1101
|
-
if not fragment.circular and (
|
|
1102
|
-
right_of_insertion >= left_of_insertion
|
|
1103
|
-
# The below condition is for single-site integration.
|
|
1104
|
-
# The reason to use locations_overlap instead of equality is because the location might extend
|
|
1105
|
-
# left of right. For example, let's take ACCGGTTT as homology arm for an integration:
|
|
1106
|
-
#
|
|
1107
|
-
# insert aaACCGGTTTccACCGGTTTtt
|
|
1108
|
-
# genome aaACCGGTTTtt
|
|
1109
|
-
#
|
|
1110
|
-
# The locations of homology on the genome are [0:10] and [2:12], so not identical
|
|
1111
|
-
# but they overlap.
|
|
1112
|
-
or _locations_overlap(start_location, end_location, len(fragment))
|
|
1113
|
-
):
|
|
1114
|
-
edge_pair_index.append(i)
|
|
1115
|
-
|
|
1116
|
-
if len(edge_pair_index) != 1:
|
|
1117
|
-
return None
|
|
1118
|
-
|
|
1119
|
-
shift_by = (edge_pair_index[0] + 1) % len(assembly)
|
|
1120
|
-
return assembly[shift_by:] + assembly[:shift_by]
|
|
1121
|
-
    def format_insertion_assembly_edge_case(self, assembly):
        """
        Edge case from https://github.com/manulera/OpenCloning_backend/issues/329
        """
        same_assembly = assembly[:]

        if len(assembly) != 2:
            return same_assembly
        ((f1, f2, loc_f1_1, loc_f2_1), (_f2, _f1, loc_f2_2, loc_f1_2)) = assembly

        if f1 != _f1 or _f2 != f2:
            return same_assembly

        if loc_f2_1 == loc_f2_2 or loc_f1_2 == loc_f1_1:
            return same_assembly

        fragment1 = self.fragments[abs(f1) - 1]
        fragment2 = self.fragments[abs(f2) - 1]

        if not _locations_overlap(loc_f1_1, loc_f1_2, len(fragment1)) or not _locations_overlap(
            loc_f2_2, loc_f2_1, len(fragment2)
        ):
            return same_assembly

        # Sort to make compatible with insertion assembly
        if _location_boundaries(loc_f1_1)[0] > _location_boundaries(loc_f1_2)[0]:
            new_assembly = same_assembly[::-1]
        else:
            new_assembly = same_assembly[:]

        ((f1, f2, loc_f1_1, loc_f2_1), (_f2, _f1, loc_f2_2, loc_f1_2)) = new_assembly

        fragment1 = self.fragments[abs(f1) - 1]
        if fragment1.circular:
            return same_assembly
        fragment2 = self.fragments[abs(f2) - 1]

        # Extract boundaries
        f2_1_start, _ = _location_boundaries(loc_f2_1)
        f2_2_start, f2_2_end = _location_boundaries(loc_f2_2)
        f1_1_start, _ = _location_boundaries(loc_f1_1)
        f1_2_start, f1_2_end = _location_boundaries(loc_f1_2)

        overlap_diff = len(fragment1[f1_1_start:f1_2_end]) - len(fragment2[f2_1_start:f2_2_end])

        if overlap_diff == 0:
            assert False, 'Overlap is 0'

        if overlap_diff > 0:
            new_loc_f1_1 = create_location(f1_1_start, f1_2_start - overlap_diff, len(fragment1))
            new_loc_f2_1 = create_location(f2_1_start, f2_2_start, len(fragment2))
        else:
            new_loc_f2_1 = create_location(f2_1_start, f2_2_start + overlap_diff, len(fragment2))
            new_loc_f1_1 = create_location(f1_1_start, f1_2_start, len(fragment1))

        new_assembly = [
            (f1, f2, new_loc_f1_1, new_loc_f2_1),
            new_assembly[1],
        ]

        return new_assembly

    def get_insertion_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
        """Assemblies that represent the insertion of a fragment or series of fragments inside a linear construct. For instance,
        digesting CCCCGAATTCCCCGAATTC with EcoRI and inserting the fragment with two overhangs into the EcoRI site of AAAGAATTCAAA.
        This is not so much meant for the use-case of linear fragments that represent actual linear fragments, but for linear
        fragments that represent a genome region. This can then be used to simulate homologous recombination.
        """
        if only_adjacent_edges:
            raise NotImplementedError('only_adjacent_edges not implemented for insertion assemblies')

        cycles = limit_iterator(_nx.cycles.simple_cycles(self.G), 10000)

        # We apply constraints already here because sometimes the combinatorial explosion is too large
        if self.use_all_fragments:
            cycles = [c for c in cycles if len(c) == len(self.fragments)]

        # Remove cycles with duplicates
        cycles = [c for c in cycles if len(c) == len(set(map(abs, c)))]

        possible_assembly_number = self.get_possible_assembly_number([c + c[:1] for c in cycles])

        if possible_assembly_number > max_assemblies:
            raise ValueError(f'Too many assemblies ({possible_assembly_number} pre-validation) to assemble')

        # We find cycles first
        iterator = limit_iterator(_nx.cycles.simple_cycles(self.G), 10000)
        assemblies = sum(map(lambda x: self.node_path2assembly_list(x, True), iterator), [])
        # We format the edge case
        assemblies = [self.format_insertion_assembly_edge_case(a) for a in assemblies]
        # We keep only those that contain exactly one suitable edge
        assemblies = [b for a in assemblies if (b := self.format_insertion_assembly(a)) is not None]
        # First fragment should be in the + orientation
        assemblies = list(filter(lambda x: x[0][0] > 0, assemblies))
        return [
            a
            for a in assemblies
            if self.assembly_is_valid(self.fragments, a, False, self.use_all_fragments, is_insertion=True)
        ]

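A minimal usage sketch (illustrative only, not from the package): it reuses the ACCGGTTT homology-arm scenario from the comments above and assumes that the Assembly constructor defined earlier in this module accepts a list of fragments plus a homology `limit`, as the PCRAssembly and SingleFragmentAssembly constructors below do.

from pydna.dseqrecord import Dseqrecord

genome = Dseqrecord('aaACCGGTTTtt')            # linear sequence standing in for a genome region
insert = Dseqrecord('aaACCGGTTTccACCGGTTTtt')  # insert carrying the 8 bp homology arm on both sides
asm = Assembly([genome, insert], limit=8)      # limit = length of the shared homology arm
products = asm.assemble_insertion()            # candidate single-site integration products
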
    def assemble_linear(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
        """Assemble linear constructs, from assemblies returned by self.get_linear_assemblies."""
        assemblies = self.get_linear_assemblies(only_adjacent_edges, max_assemblies)
        return [assemble(self.fragments, a) for a in assemblies]

    def assemble_circular(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
        """Assemble circular constructs, from assemblies returned by self.get_circular_assemblies."""
        assemblies = self.get_circular_assemblies(only_adjacent_edges, max_assemblies)
        return [assemble(self.fragments, a) for a in assemblies]

    def assemble_insertion(self, only_adjacent_edges: bool = False):
        """Assemble insertion constructs, from assemblies returned by self.get_insertion_assemblies."""
        assemblies = self.get_insertion_assemblies(only_adjacent_edges)
        return [assemble(self.fragments, a, is_insertion=True) for a in assemblies]

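For the circular case, a sketch along the lines of pydna's classic three-fragment example (illustrative only, not from the package; it again assumes the Assembly constructor takes a fragment list and a homology `limit`):

from pydna.dseqrecord import Dseqrecord

a = Dseqrecord('acgatgctatactgCCCCCtgtgctgtgctcta')
b = Dseqrecord('tgtgctgtgctctaTTTTTtattctggctgtatc')
c = Dseqrecord('tattctggctgtatcGGGGGtacgatgctatactg')
asm = Assembly([a, b, c], limit=14)          # consecutive fragments share at least 14 bp at their ends
circular_products = asm.assemble_circular()  # circular Dseqrecord(s) joining a, b and c
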
    def get_locations_on_fragments(self) -> dict[int, dict[str, list[Location]]]:
        """Get a dictionary where the keys are the nodes in the graph, and the values are dictionaries with keys
        `left`, `right`, containing (for each fragment) the locations where the fragment is joined to another fragment on its left
        and right side. The values in `left` and `right` are often the same, except in restriction-ligation with partial overlap enabled,
        where we can end up with a situation like this:

        GGTCTCCCCAATT and aGGTCTCCAACCAA as fragments

        # Partial overlap in assembly 1[9:11]:2[8:10]
        GGTCTCCxxAACCAA
        CCAGAGGGGTTxxTT

        # Partial overlap in 2[10:12]:1[7:9]
        aGGTCTCCxxCCAATT
        tCCAGAGGTTGGxxAA

        Would return
        {
            1: {'left': [7:9], 'right': [9:11]},
            2: {'left': [8:10], 'right': [10:12]},
            -1: {'left': [2:4], 'right': [4:6]},
            -2: {'left': [2:4], 'right': [4:6]}
        }

        """

        locations_on_fragments = dict()
        for node in self.G.nodes:
            this_dict = {'left': list(), 'right': list()}
            for edge in self.G.edges(data=True):
                for i, key in enumerate(['right', 'left']):
                    if edge[i] == node:
                        edge_location = edge[2]['locations'][i]
                        if edge_location not in this_dict[key]:
                            this_dict[key].append(edge_location)
            this_dict['left'] = sorted(this_dict['left'], key=lambda x: _location_boundaries(x)[0])
            this_dict['right'] = sorted(this_dict['right'], key=lambda x: _location_boundaries(x)[0])
            locations_on_fragments[node] = this_dict

        return locations_on_fragments

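A quick way to inspect the join sites (illustrative sketch, reusing `asm` from the circular example above; the keys are signed node ids, negative for the reverse complement, and the values are Biopython location objects):

for node, sides in asm.get_locations_on_fragments().items():
    print(node, [str(loc) for loc in sides['left']], [str(loc) for loc in sides['right']])
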
    def assembly_uses_only_adjacent_edges(self, assembly, is_circular: bool) -> bool:
        """
        Check whether only adjacent edges within each fragment are used in the assembly. This is useful to check if a cut and ligate assembly is valid,
        and prevent including partially digested fragments. For example, imagine the following fragment being an input for a digestion
        and ligation assembly, where the enzyme cuts at the sites indicated by the vertical lines:

               x       y       z
        -------|-------|-------|---------

        We would only want assemblies that contain subfragments start-x, x-y, y-z, z-end, and not start-x, y-end, for instance.
        The latter would indicate that the fragment was partially digested.
        """

        locations_on_fragments = self.get_locations_on_fragments()
        for node in locations_on_fragments:
            fragment_len = len(self.fragments[abs(node) - 1])
            for side in ['left', 'right']:
                locations_on_fragments[node][side] = gather_overlapping_locations(
                    locations_on_fragments[node][side], fragment_len
                )

        allowed_location_pairs = dict()
        for node in locations_on_fragments:
            if not is_circular:
                # We add the existing ends of the fragment
                left = [(None,)] + locations_on_fragments[node]['left']
                right = locations_on_fragments[node]['right'] + [(None,)]

            else:
                # For circular assemblies, we add the first location at the end
                # to allow for the last edge to be used
                left = locations_on_fragments[node]['left']
                right = locations_on_fragments[node]['right'][1:] + locations_on_fragments[node]['right'][:1]

            pairs = list()
            for pair in zip(left, right):
                pairs += list(_itertools.product(*pair))
            allowed_location_pairs[node] = pairs

        fragment_assembly = edge_representation2subfragment_representation(assembly, is_circular)
        for node, start_location, end_location in fragment_assembly:
            if (start_location, end_location) not in allowed_location_pairs[node]:
                return False
        return True

    def __repr__(self):
        # https://pyformat.info
        return _pretty_str(
            'Assembly\n'
            'fragments..: {sequences}\n'
            'limit(bp)..: {limit}\n'
            'G.nodes....: {nodes}\n'
            'algorithm..: {al}'.format(
                sequences=' '.join('{}bp'.format(len(x)) for x in self.fragments),
                limit=self.limit,
                nodes=self.G.order(),
                al=self.algorithm.__name__,
            )
        )


class PCRAssembly(Assembly):
    def __init__(self, frags: list[_Dseqrecord | _Primer], limit=25, mismatches=0):

        value_error = ValueError(
            'PCRAssembly assembly must be initialised with a list/tuple of primer, template, primer'
        )
        if len(frags) != 3:
            raise value_error

        # Validate the inputs: should be a series of primer, template, primer
        wrong_fragment_class = (
            not isinstance(frags[0], _Primer),
            isinstance(frags[1], _Primer),
            not isinstance(frags[2], _Primer),
        )
        if any(wrong_fragment_class):
            raise value_error

        # TODO: allow for the same fragment to be included more than once?
        self.G = _nx.MultiDiGraph()
        # Add positive and negative nodes for forward and reverse fragments
        self.G.add_nodes_from((i + 1, {'seq': f}) for (i, f) in enumerate(frags))
        self.G.add_nodes_from((-(i + 1), {'seq': f.reverse_complement()}) for (i, f) in enumerate(frags))

        pairs = list()
        primer_ids = list()
        for i in range(0, len(frags), 3):
            # primer, template, primer
            p1, t, p2 = (i + 1, i + 2, i + 3)
            primer_ids += [p1, p2]
            pairs += list(_itertools.product([p1, p2], [t, -t]))
            pairs += list(_itertools.product([t, -t], [-p1, -p2]))

        for u, v in pairs:
            u_seq = self.G.nodes[u]['seq']
            v_seq = self.G.nodes[v]['seq']
            matches = alignment_sub_strings(u_seq, v_seq, limit, mismatches)
            for match in matches:
                self.add_edges_from_match(match, u, v, u_seq, v_seq)

        # These two are constrained
        self.use_fragment_order = False
        self.use_all_fragments = True

        self.fragments = frags
        self.limit = limit
        self.algorithm = alignment_sub_strings

        return

    def get_linear_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
        if only_adjacent_edges:
            raise NotImplementedError('only_adjacent_edges not implemented for PCR assemblies')

        return super().get_linear_assemblies(max_assemblies=max_assemblies)

    def get_circular_assemblies(self, only_adjacent_edges: bool = False):
        raise NotImplementedError('get_circular_assemblies not implemented for PCR assemblies')

    def get_insertion_assemblies(self, only_adjacent_edges: bool = False):
        raise NotImplementedError('get_insertion_assemblies not implemented for PCR assemblies')


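A PCR sketch using the constructor above (illustrative only, not from the package; the primer and template sequences are made up, and the exact annealing behaviour depends on alignment_sub_strings defined earlier in the module):

from pydna.dseqrecord import Dseqrecord
from pydna.primer import Primer

template = Dseqrecord('ATGCGTACCTGAACGATCGTTGACCTGCAT')
fwd = Primer('ATGCGTACCTGAACGA')    # identical to the first 16 bases of the template
rev = Primer('ATGCAGGTCAACGATC')    # reverse complement of the last 16 bases
pcr = PCRAssembly([fwd, template, rev], limit=14)
amplicons = pcr.assemble_linear()   # expected to contain the full-length PCR product
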
class SingleFragmentAssembly(Assembly):
    """
    An assembly that represents the circularisation or splicing of a single fragment.
    """

    def __init__(self, frags: list[_Dseqrecord], limit=25, algorithm=common_sub_strings):

        if len(frags) != 1:
            raise ValueError('SingleFragmentAssembly assembly must be initialised with a single fragment')
        # TODO: allow for the same fragment to be included more than once?
        self.G = _nx.MultiDiGraph()
        frag = frags[0]
        # Add positive and negative nodes for forward and reverse fragments
        self.G.add_node(1, seq=frag)

        matches = algorithm(frag, frag, limit)
        for match in matches:
            self.add_edges_from_match(match, 1, 1, frag, frag)

        # To avoid duplicated outputs
        self.G.remove_edges_from([(-1, -1)])

        # These two are constrained
        self.use_fragment_order = True
        self.use_all_fragments = True

        self.fragments = frags
        self.limit = limit
        self.algorithm = algorithm

        return

    def get_circular_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
        # We don't want the same location twice
        assemblies = filter(
            lambda x: x[0][2] != x[0][3], super().get_circular_assemblies(only_adjacent_edges, max_assemblies)
        )
        return [a for a in assemblies if self.assembly_is_valid(self.fragments, a, True, self.use_all_fragments)]

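Circularisation sketch (illustrative only, not from the package): a fragment whose two ends share a direct repeat at least `limit` bases long can be joined into a circle.

from pydna.dseqrecord import Dseqrecord

frag = Dseqrecord('ACGATGCTATACTGtttACGATGCTATACTG')  # 14 bp direct repeat at both ends
sfa = SingleFragmentAssembly([frag], limit=14)
circles = sfa.assemble_circular()                     # circularises by joining the repeated ends
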
    def get_insertion_assemblies(self, only_adjacent_edges: bool = False, max_assemblies: int = 50):
        """This could be renamed splicing assembly, but the essence is similar"""

        if only_adjacent_edges:
            raise NotImplementedError('only_adjacent_edges not implemented for insertion assemblies')

        def splicing_assembly_filter(x):
            # We don't want the same location twice
            if x[0][2] == x[0][3]:
                return False
            # We don't want to get overlap only (e.g. GAATTCcatGAATTC giving GAATTC)
            left_start, _ = _location_boundaries(x[0][2])
            _, right_end = _location_boundaries(x[0][3])
            if left_start == 0 and right_end == len(self.fragments[0]):
                return False
            return True

        # We don't want the same location twice
        assemblies = filter(splicing_assembly_filter, super().get_insertion_assemblies(max_assemblies=max_assemblies))
        return [
            a
            for a in assemblies
            if self.assembly_is_valid(self.fragments, a, False, self.use_all_fragments, is_insertion=True)
        ]

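Splicing sketch (illustrative only, not from the package): two internal copies of GAATTC let the sequence between them be spliced out, which is exactly the kind of product the filter above keeps while rejecting the overlap-only outcome mentioned in its comment.

from pydna.dseqrecord import Dseqrecord

frag = Dseqrecord('aaaGAATTCcatGAATTCttt')  # direct repeat of GAATTC with 'cat' in between
sfa = SingleFragmentAssembly([frag], limit=6)
spliced = sfa.assemble_insertion()          # expected to drop the sequence between the repeated GAATTC sites
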
    def get_linear_assemblies(self):
        raise NotImplementedError('Linear assembly does not make sense')