opencloning 0.4.8__py3-none-any.whl → 0.5.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opencloning/app_settings.py +7 -0
- opencloning/batch_cloning/pombe/__init__.py +2 -2
- opencloning/batch_cloning/pombe/pombe_clone.py +31 -112
- opencloning/batch_cloning/pombe/pombe_summary.py +20 -8
- opencloning/batch_cloning/ziqiang_et_al2024/__init__.py +8 -8
- opencloning/batch_cloning/ziqiang_et_al2024/ziqiang_et_al2024.json +2 -9
- opencloning/bug_fixing/backend_v0_3.py +13 -5
- opencloning/catalogs/__init__.py +36 -0
- opencloning/catalogs/igem2024.yaml +2172 -0
- opencloning/catalogs/openDNA_collections.yaml +1161 -0
- opencloning/catalogs/readme.txt +1 -0
- opencloning/catalogs/seva.tsv +231 -0
- opencloning/catalogs/snapgene.yaml +2837 -0
- opencloning/dna_functions.py +155 -158
- opencloning/dna_utils.py +45 -62
- opencloning/ebic/primer_design.py +1 -1
- opencloning/endpoints/annotation.py +9 -13
- opencloning/endpoints/assembly.py +157 -378
- opencloning/endpoints/endpoint_utils.py +52 -0
- opencloning/endpoints/external_import.py +169 -124
- opencloning/endpoints/no_assembly.py +23 -39
- opencloning/endpoints/no_input.py +32 -47
- opencloning/endpoints/other.py +1 -1
- opencloning/endpoints/primer_design.py +2 -1
- opencloning/http_client.py +2 -2
- opencloning/ncbi_requests.py +113 -47
- opencloning/primer_design.py +1 -1
- opencloning/pydantic_models.py +10 -510
- opencloning/request_examples.py +10 -22
- opencloning/temp_functions.py +50 -0
- {opencloning-0.4.8.dist-info → opencloning-0.5.0.1.dist-info}/METADATA +18 -8
- opencloning-0.5.0.1.dist-info/RECORD +51 -0
- {opencloning-0.4.8.dist-info → opencloning-0.5.0.1.dist-info}/WHEEL +1 -1
- opencloning/cre_lox.py +0 -116
- opencloning/gateway.py +0 -154
- opencloning-0.4.8.dist-info/RECORD +0 -45
- {opencloning-0.4.8.dist-info → opencloning-0.5.0.1.dist-info}/licenses/LICENSE +0 -0
opencloning/dna_functions.py
CHANGED
|
@@ -1,24 +1,38 @@
|
|
|
1
|
-
from
|
|
2
|
-
from urllib.
|
|
1
|
+
from fastapi import HTTPException
|
|
2
|
+
from urllib.parse import quote
|
|
3
|
+
import math
|
|
3
4
|
from Bio.Restriction.Restriction import RestrictionBatch
|
|
4
5
|
from Bio.Seq import reverse_complement
|
|
5
6
|
from pydna.dseqrecord import Dseqrecord
|
|
6
7
|
from pydna.dseq import Dseq
|
|
7
|
-
from .
|
|
8
|
-
|
|
9
|
-
|
|
8
|
+
from opencloning_linkml.datamodel import (
|
|
9
|
+
PlannotateAnnotationReport,
|
|
10
|
+
TextFileSequence,
|
|
11
|
+
SequenceFileFormat,
|
|
12
|
+
)
|
|
13
|
+
from pydna.opencloning_models import (
|
|
14
|
+
AddgeneIdSource,
|
|
15
|
+
OpenDNACollectionsSource,
|
|
16
|
+
SEVASource,
|
|
17
|
+
SnapGenePlasmidSource,
|
|
18
|
+
WekWikGeneIdSource,
|
|
19
|
+
BenchlingUrlSource,
|
|
20
|
+
IGEMSource,
|
|
21
|
+
EuroscarfSource,
|
|
22
|
+
)
|
|
23
|
+
|
|
10
24
|
from bs4 import BeautifulSoup
|
|
11
|
-
import regex
|
|
12
|
-
from Bio.SeqFeature import SimpleLocation, Location
|
|
13
|
-
from pydna.utils import shift_location
|
|
14
25
|
from pydna.common_sub_strings import common_sub_strings
|
|
15
26
|
from Bio.SeqIO import parse as seqio_parse
|
|
16
27
|
import io
|
|
17
28
|
import warnings
|
|
18
29
|
from Bio.SeqIO.InsdcIO import GenBankScanner, GenBankIterator
|
|
19
30
|
import re
|
|
31
|
+
|
|
32
|
+
from opencloning.catalogs import iGEM2024_catalog, openDNA_collections_catalog, seva_catalog, snapgene_catalog
|
|
20
33
|
from .http_client import get_http_client, ConnectError, TimeoutException
|
|
21
34
|
from .ncbi_requests import get_genbank_sequence
|
|
35
|
+
from typing import Callable
|
|
22
36
|
|
|
23
37
|
|
|
24
38
|
def format_sequence_genbank(seq: Dseqrecord, seq_name: str = None) -> TextFileSequence:
|
|
@@ -33,12 +47,18 @@ def format_sequence_genbank(seq: Dseqrecord, seq_name: str = None) -> TextFileSe
|
|
|
33
47
|
file_content=seq.format('genbank'),
|
|
34
48
|
sequence_file_format=SequenceFileFormat('genbank'),
|
|
35
49
|
overhang_crick_3prime=seq.seq.ovhg,
|
|
36
|
-
overhang_watson_3prime=seq.seq.watson_ovhg
|
|
50
|
+
overhang_watson_3prime=seq.seq.watson_ovhg,
|
|
37
51
|
)
|
|
38
52
|
|
|
39
53
|
|
|
40
54
|
def read_dsrecord_from_json(seq: TextFileSequence) -> Dseqrecord:
|
|
41
|
-
|
|
55
|
+
with io.StringIO(seq.file_content) as handle:
|
|
56
|
+
try:
|
|
57
|
+
initial_dseqrecord: Dseqrecord = custom_file_parser(handle, 'genbank')[0]
|
|
58
|
+
except ValueError as e:
|
|
59
|
+
raise HTTPException(
|
|
60
|
+
422, f'The file for sequence with id {seq.id} is not in a valid genbank format: {e}'
|
|
61
|
+
) from e
|
|
42
62
|
if seq.overhang_watson_3prime == 0 and seq.overhang_crick_3prime == 0:
|
|
43
63
|
out_dseq_record = initial_dseqrecord
|
|
44
64
|
else:
|
|
@@ -68,117 +88,118 @@ def get_invalid_enzyme_names(enzyme_names_list: list[str | None]) -> list[str]:
|
|
|
68
88
|
|
|
69
89
|
|
|
70
90
|
async def get_sequences_from_file_url(
|
|
71
|
-
url: str,
|
|
91
|
+
url: str,
|
|
92
|
+
format: SequenceFileFormat = SequenceFileFormat('genbank'),
|
|
93
|
+
params: dict | None = None,
|
|
94
|
+
headers: dict | None = None,
|
|
95
|
+
get_function: None | Callable = None,
|
|
72
96
|
) -> list[Dseqrecord]:
|
|
73
|
-
# TODO once pydna parse is fixed it should handle urls that point to non-gb files
|
|
74
|
-
async with get_http_client() as client:
|
|
75
|
-
resp = await client.get(url)
|
|
76
97
|
|
|
77
|
-
if
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
return custom_file_parser(io.BytesIO(resp.content), format)
|
|
98
|
+
if get_function is None:
|
|
99
|
+
async with get_http_client() as client:
|
|
100
|
+
resp = await client.get(url, params=params, headers=headers)
|
|
81
101
|
else:
|
|
82
|
-
|
|
102
|
+
resp = await get_function(url, params=params, headers=headers)
|
|
103
|
+
|
|
104
|
+
if math.floor(resp.status_code / 100) == 5:
|
|
105
|
+
raise HTTPException(503, 'the external server (not OpenCloning) returned an error')
|
|
106
|
+
elif math.floor(resp.status_code / 100) != 2:
|
|
107
|
+
raise HTTPException(404, 'file requested from url not found')
|
|
108
|
+
try:
|
|
109
|
+
if format == SequenceFileFormat('snapgene'):
|
|
110
|
+
return custom_file_parser(io.BytesIO(resp.content), format)
|
|
111
|
+
else:
|
|
112
|
+
return custom_file_parser(io.StringIO(resp.text), format)
|
|
113
|
+
except ValueError as e:
|
|
114
|
+
raise HTTPException(400, f'{e}') from e
|
|
83
115
|
|
|
84
116
|
|
|
85
|
-
async def
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
117
|
+
async def request_from_snapgene(plasmid_set: dict, plasmid_name: str) -> Dseqrecord:
|
|
118
|
+
if plasmid_set not in snapgene_catalog:
|
|
119
|
+
raise HTTPException(404, 'invalid plasmid set')
|
|
120
|
+
if plasmid_name not in snapgene_catalog[plasmid_set]:
|
|
121
|
+
raise HTTPException(404, f'{plasmid_name} is not part of {plasmid_set}')
|
|
122
|
+
url = f'https://www.snapgene.com/local/fetch.php?set={plasmid_set}&plasmid={plasmid_name}'
|
|
123
|
+
seqs = await get_sequences_from_file_url(url, SequenceFileFormat('snapgene'))
|
|
124
|
+
seq = seqs[0]
|
|
125
|
+
seq.name = plasmid_name
|
|
126
|
+
seq.source = SnapGenePlasmidSource(repository_id=f'{plasmid_set}/{plasmid_name}')
|
|
127
|
+
return seq
|
|
94
128
|
|
|
95
129
|
|
|
96
|
-
async def request_from_addgene(
|
|
130
|
+
async def request_from_addgene(repository_id: str) -> Dseqrecord:
|
|
97
131
|
|
|
98
|
-
url = f'https://www.addgene.org/{
|
|
132
|
+
url = f'https://www.addgene.org/{repository_id}/sequences/'
|
|
99
133
|
async with get_http_client() as client:
|
|
100
134
|
resp = await client.get(url)
|
|
101
135
|
if resp.status_code == 404:
|
|
102
|
-
raise
|
|
136
|
+
raise HTTPException(404, 'wrong addgene id')
|
|
103
137
|
soup = BeautifulSoup(resp.content, 'html.parser')
|
|
104
138
|
|
|
105
139
|
# Get a span.material-name from the soup, see https://github.com/manulera/OpenCloning_backend/issues/182
|
|
106
140
|
plasmid_name = soup.find('span', class_='material-name').text.replace(' ', '_')
|
|
107
141
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
sequence_file_url_dict[_type] = [
|
|
118
|
-
a.get('href') for a in soup.find(id=_type).findAll(class_='genbank-file-download')
|
|
119
|
-
]
|
|
120
|
-
|
|
121
|
-
# TODO provide addgene sequencing data supporting the sequence
|
|
122
|
-
# We prefer to return addgene full if both available
|
|
123
|
-
products = list()
|
|
124
|
-
sources = list()
|
|
125
|
-
for _type in ['addgene-full', 'depositor-full']:
|
|
126
|
-
if len(sequence_file_url_dict[_type]) > 0:
|
|
127
|
-
for seq_url in sequence_file_url_dict[_type]:
|
|
128
|
-
new_source = source.model_copy()
|
|
129
|
-
new_source.sequence_file_url = seq_url
|
|
130
|
-
new_source.addgene_sequence_type = _type
|
|
131
|
-
sources.append(new_source)
|
|
132
|
-
# There should be only one sequence
|
|
133
|
-
products.append((await get_sequences_from_file_url(seq_url))[0])
|
|
134
|
-
|
|
135
|
-
if len(products) == 0:
|
|
136
|
-
# They may have only partial sequences
|
|
137
|
-
raise HTTPError(
|
|
138
|
-
url,
|
|
142
|
+
# Find the link to either the addgene-full (preferred) or depositor-full (secondary)
|
|
143
|
+
for addgene_sequence_type in ['depositor-full', 'addgene-full']:
|
|
144
|
+
if soup.find(id=addgene_sequence_type) is not None:
|
|
145
|
+
sequence_file_url = next(
|
|
146
|
+
a.get('href') for a in soup.find(id=addgene_sequence_type).findAll(class_='genbank-file-download')
|
|
147
|
+
)
|
|
148
|
+
break
|
|
149
|
+
else:
|
|
150
|
+
raise HTTPException(
|
|
139
151
|
404,
|
|
140
|
-
f'The requested plasmid does not have full sequences, see https://www.addgene.org/{
|
|
141
|
-
f'The requested plasmid does not have full sequences, see https://www.addgene.org/{source.repository_id}/sequences/',
|
|
142
|
-
None,
|
|
152
|
+
f'The requested plasmid does not have full sequences, see https://www.addgene.org/{repository_id}/sequences/',
|
|
143
153
|
)
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
154
|
+
dseqr = (await get_sequences_from_file_url(sequence_file_url))[0]
|
|
155
|
+
dseqr.name = plasmid_name
|
|
156
|
+
dseqr.source = AddgeneIdSource(
|
|
157
|
+
repository_id=repository_id,
|
|
158
|
+
sequence_file_url=sequence_file_url,
|
|
159
|
+
addgene_sequence_type=addgene_sequence_type,
|
|
160
|
+
)
|
|
161
|
+
return dseqr
|
|
149
162
|
|
|
150
163
|
|
|
151
|
-
async def request_from_wekwikgene(
|
|
152
|
-
url = f'https://wekwikgene.wllsb.edu.cn/plasmids/{
|
|
164
|
+
async def request_from_wekwikgene(repository_id: str) -> Dseqrecord:
|
|
165
|
+
url = f'https://wekwikgene.wllsb.edu.cn/plasmids/{repository_id}'
|
|
153
166
|
async with get_http_client() as client:
|
|
154
167
|
resp = await client.get(url)
|
|
155
168
|
if resp.status_code == 404:
|
|
156
|
-
raise
|
|
169
|
+
raise HTTPException(404, 'invalid wekwikgene id')
|
|
157
170
|
soup = BeautifulSoup(resp.content, 'html.parser')
|
|
158
171
|
# Get the sequence file URL from the page
|
|
159
172
|
sequence_file_url = soup.find('a', text=lambda x: x and 'Download Sequence' in x).get('href')
|
|
160
173
|
sequence_name = soup.find('h1', class_='plasmid__info__name').text.replace(' ', '_')
|
|
161
174
|
seq = (await get_sequences_from_file_url(sequence_file_url, 'snapgene'))[0]
|
|
162
175
|
seq.name = sequence_name
|
|
163
|
-
source
|
|
164
|
-
return seq
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
async def get_seva_plasmid(
|
|
168
|
-
if
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
seq_list = await get_sequences_from_file_url(source.sequence_file_url)
|
|
174
|
-
if len(seq_list) == 0:
|
|
175
|
-
raise ValueError('No sequences found in SEVA file')
|
|
176
|
-
seq = seq_list[0]
|
|
176
|
+
seq.source = WekWikGeneIdSource(repository_id=repository_id, sequence_file_url=sequence_file_url)
|
|
177
|
+
return seq
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
async def get_seva_plasmid(repository_id: str) -> Dseqrecord:
|
|
181
|
+
if repository_id not in seva_catalog:
|
|
182
|
+
raise HTTPException(404, 'invalid SEVA id')
|
|
183
|
+
link = seva_catalog[repository_id]
|
|
184
|
+
if 'http' not in link:
|
|
185
|
+
seq = await get_genbank_sequence(link)
|
|
177
186
|
else:
|
|
178
|
-
|
|
187
|
+
seqs = await get_sequences_from_file_url(link)
|
|
188
|
+
seq = seqs[0]
|
|
189
|
+
|
|
179
190
|
if not seq.circular:
|
|
180
191
|
seq = seq.looped()
|
|
181
|
-
|
|
192
|
+
seq.name = repository_id
|
|
193
|
+
sequence_file_url = link if 'http' in link else f'https://www.ncbi.nlm.nih.gov/nuccore/{link}'
|
|
194
|
+
seq.source = SEVASource(repository_id=repository_id, sequence_file_url=sequence_file_url)
|
|
195
|
+
return seq
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
async def get_sequence_from_benchling_url(url: str) -> Dseqrecord:
|
|
199
|
+
dseqs = await get_sequences_from_file_url(url)
|
|
200
|
+
dseq = dseqs[0]
|
|
201
|
+
dseq.source = BenchlingUrlSource(repository_id=url)
|
|
202
|
+
return dseq
|
|
182
203
|
|
|
183
204
|
|
|
184
205
|
def correct_name(dseq: Dseqrecord):
|
|
@@ -187,57 +208,6 @@ def correct_name(dseq: Dseqrecord):
|
|
|
187
208
|
dseq.name = dseq.annotations['keywords'][0].replace(' ', '_')
|
|
188
209
|
|
|
189
210
|
|
|
190
|
-
def location_sorter(x, y) -> int:
|
|
191
|
-
"""
|
|
192
|
-
Sort by start, then length, then strand.
|
|
193
|
-
"""
|
|
194
|
-
if x.parts[0].start != y.parts[0].start:
|
|
195
|
-
return x.parts[0].start - y.parts[0].start
|
|
196
|
-
elif x.parts[-1].end != y.parts[-1].end:
|
|
197
|
-
return x.parts[-1].end - y.parts[-1].end
|
|
198
|
-
return x.strand - y.strand
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
def get_all_regex_feature_edges(pattern: str, seq: str, is_circular: bool) -> list[tuple[int, int]]:
|
|
202
|
-
|
|
203
|
-
subject = 2 * seq if is_circular else seq
|
|
204
|
-
|
|
205
|
-
compiled_pattern = regex.compile(pattern, regex.IGNORECASE)
|
|
206
|
-
compiled_pattern_rev = regex.compile('(?r)' + pattern, regex.IGNORECASE)
|
|
207
|
-
|
|
208
|
-
matches = list(regex.finditer(compiled_pattern, subject, overlapped=True))
|
|
209
|
-
matches += list(regex.finditer(compiled_pattern_rev, subject, overlapped=True))
|
|
210
|
-
|
|
211
|
-
# In circular objects we remove the matches that span the sequence more than once: m.end() - m.start() <= len(seq)
|
|
212
|
-
return list(set([(m.start(), m.end()) for m in matches if (m.end() - m.start() <= len(seq))]))
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
def find_sequence_regex(pattern: str, seq: str, is_circular: bool) -> list[Location]:
|
|
216
|
-
|
|
217
|
-
feature_locations = list()
|
|
218
|
-
|
|
219
|
-
# Strand 1
|
|
220
|
-
feature_edges = get_all_regex_feature_edges(pattern, seq, is_circular)
|
|
221
|
-
# We use shift_location to format origin-spanning features in circular DNA
|
|
222
|
-
feature_locations += [shift_location(SimpleLocation(start, end, 1), 0, len(seq)) for start, end in feature_edges]
|
|
223
|
-
|
|
224
|
-
# Strand -1
|
|
225
|
-
feature_edges = get_all_regex_feature_edges(pattern, reverse_complement(seq), is_circular)
|
|
226
|
-
feature_locations += [
|
|
227
|
-
shift_location(SimpleLocation(start, end, 1)._flip(len(seq)), 0, len(seq)) for start, end in feature_edges
|
|
228
|
-
]
|
|
229
|
-
|
|
230
|
-
# We return a unique list, cannot use a set because Location is not hashable
|
|
231
|
-
return sorted(
|
|
232
|
-
[x for i, x in enumerate(feature_locations) if x not in feature_locations[:i]], key=cmp_to_key(location_sorter)
|
|
233
|
-
)
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
# Could be useful at some point
|
|
237
|
-
# def seq_overlap_length(dseq: Dseq) -> int:
|
|
238
|
-
# return len(dseq) - abs(dseq.ovhg) - abs(dseq.watson_ovhg())
|
|
239
|
-
|
|
240
|
-
|
|
241
211
|
def oligonucleotide_hybridization_overhangs(
|
|
242
212
|
fwd_oligo_seq: str, rvs_oligo_seq: str, minimal_annealing: int
|
|
243
213
|
) -> list[int]:
|
|
@@ -327,38 +297,38 @@ def custom_file_parser(
|
|
|
327
297
|
)
|
|
328
298
|
out.append(Dseqrecord(parsed_seq, circular=circularize))
|
|
329
299
|
|
|
300
|
+
if len(out) == 0:
|
|
301
|
+
raise ValueError('No sequences found in file')
|
|
330
302
|
return out
|
|
331
303
|
|
|
332
304
|
|
|
333
305
|
async def get_sequence_from_euroscarf_url(plasmid_id: str) -> Dseqrecord:
|
|
334
306
|
url = f'http://www.euroscarf.de/plasmid_details.php?accno={plasmid_id}'
|
|
335
307
|
async with get_http_client() as client:
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
except ConnectError as e:
|
|
339
|
-
raise HTTPError(url, 504, 'could not connect to euroscarf', 'could not connect to euroscarf', None) from e
|
|
340
|
-
# I don't think this ever happens
|
|
341
|
-
if resp.status_code != 200:
|
|
342
|
-
raise HTTPError(
|
|
343
|
-
url, resp.status_code, 'could not connect to euroscarf', 'could not connect to euroscarf', None
|
|
344
|
-
)
|
|
308
|
+
resp = await client.get(url)
|
|
309
|
+
|
|
345
310
|
# Use beautifulsoup to parse the html
|
|
346
311
|
soup = BeautifulSoup(resp.text, 'html.parser')
|
|
347
312
|
# Identify if it's an error (seems to be a php error log without a body tag)
|
|
348
313
|
body_tag = soup.find('body')
|
|
349
314
|
if body_tag is None:
|
|
350
315
|
if 'Call to a member function getName()' in resp.text:
|
|
351
|
-
raise
|
|
316
|
+
raise HTTPException(404, 'invalid euroscarf id')
|
|
352
317
|
else:
|
|
353
318
|
msg = f'Could not retrieve plasmid details, double-check the euroscarf site: {url}'
|
|
354
|
-
raise
|
|
319
|
+
raise HTTPException(503, msg)
|
|
355
320
|
# Get the download link
|
|
356
321
|
subpath = soup.find('a', href=lambda x: x and x.startswith('files/dna'))
|
|
357
322
|
if subpath is None:
|
|
358
323
|
msg = f'Could not retrieve plasmid details, double-check the euroscarf site: {url}'
|
|
359
|
-
raise
|
|
324
|
+
raise HTTPException(503, msg)
|
|
360
325
|
genbank_url = f'http://www.euroscarf.de/{subpath.get("href")}'
|
|
361
|
-
|
|
326
|
+
seq = (await get_sequences_from_file_url(genbank_url))[0]
|
|
327
|
+
# Sometimes the files do not contain correct topology information, so we loop them
|
|
328
|
+
if not seq.circular:
|
|
329
|
+
seq = seq.looped()
|
|
330
|
+
seq.source = EuroscarfSource(repository_id=plasmid_id)
|
|
331
|
+
return seq
|
|
362
332
|
|
|
363
333
|
|
|
364
334
|
async def annotate_with_plannotate(
|
|
@@ -373,14 +343,41 @@ async def annotate_with_plannotate(
|
|
|
373
343
|
)
|
|
374
344
|
if response.status_code != 200:
|
|
375
345
|
detail = response.json().get('detail', 'plannotate server error')
|
|
376
|
-
raise
|
|
346
|
+
raise HTTPException(response.status_code, detail)
|
|
377
347
|
data = response.json()
|
|
378
348
|
dseqr = custom_file_parser(io.StringIO(data['gb_file']), 'genbank')[0]
|
|
379
349
|
report = [PlannotateAnnotationReport.model_validate(r) for r in data['report']]
|
|
380
350
|
return dseqr, report, data['version']
|
|
381
351
|
except TimeoutException as e:
|
|
382
|
-
raise
|
|
352
|
+
raise HTTPException(504, 'plannotate server timeout') from e
|
|
383
353
|
except ConnectError as e:
|
|
384
|
-
raise
|
|
385
|
-
|
|
386
|
-
|
|
354
|
+
raise HTTPException(500, 'cannot connect to plannotate server') from e
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
async def get_sequence_from_openDNA_collections(collection_name: str, plasmid_id: str) -> Dseqrecord:
|
|
358
|
+
if collection_name not in openDNA_collections_catalog:
|
|
359
|
+
raise HTTPException(404, 'invalid openDNA collections collection')
|
|
360
|
+
plasmid = next((item for item in openDNA_collections_catalog[collection_name] if item['id'] == plasmid_id), None)
|
|
361
|
+
if plasmid is None:
|
|
362
|
+
raise HTTPException(404, f'plasmid {plasmid_id} not found in {collection_name}')
|
|
363
|
+
|
|
364
|
+
path = quote(plasmid['path'])
|
|
365
|
+
url = f'https://assets.opencloning.org/open-dna-collections/{path}'
|
|
366
|
+
seqs = await get_sequences_from_file_url(url)
|
|
367
|
+
seq = seqs[0]
|
|
368
|
+
seq.name = plasmid['name'] if plasmid['name'] is not None else plasmid_id
|
|
369
|
+
seq.source = OpenDNACollectionsSource(repository_id=f'{collection_name}/{plasmid_id}', sequence_file_url=url)
|
|
370
|
+
return seq
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
async def get_sequence_from_iGEM2024(part: str, backbone: str) -> Dseqrecord:
|
|
374
|
+
all_plasmids = [item for collection in iGEM2024_catalog.values() for item in collection]
|
|
375
|
+
plasmid = next((item for item in all_plasmids if item['part'] == part and item['backbone'] == backbone), None)
|
|
376
|
+
if plasmid is None:
|
|
377
|
+
raise HTTPException(404, f'plasmid {part}-{backbone} not found in iGEM 2024')
|
|
378
|
+
url = f'https://assets.opencloning.org/annotated-igem-distribution/results/plasmids/{plasmid["id"]}.gb'
|
|
379
|
+
seqs = await get_sequences_from_file_url(url)
|
|
380
|
+
seq = seqs[0]
|
|
381
|
+
seq.name = f'{part}-{backbone}'
|
|
382
|
+
seq.source = IGEMSource(repository_id=f'{part}-{backbone}', sequence_file_url=url)
|
|
383
|
+
return seq
|
opencloning/dna_utils.py
CHANGED
|
@@ -4,18 +4,16 @@ Utility functions moved here to avoid circular imports.
|
|
|
4
4
|
|
|
5
5
|
from Bio.Seq import reverse_complement
|
|
6
6
|
from pydna.dseqrecord import Dseqrecord
|
|
7
|
-
from pydna.dseq import Dseq
|
|
8
7
|
import tempfile
|
|
9
8
|
import subprocess
|
|
10
9
|
import os
|
|
11
10
|
import shutil
|
|
12
11
|
from pydna.parsers import parse
|
|
13
|
-
from Bio.Align import PairwiseAligner
|
|
12
|
+
from Bio.Align import PairwiseAligner, Alignment
|
|
14
13
|
from Bio.Data.IUPACData import ambiguous_dna_values as _ambiguous_dna_values
|
|
15
|
-
import re
|
|
16
|
-
from Bio.SeqFeature import Location, SimpleLocation
|
|
17
|
-
from pydna.utils import shift_location
|
|
18
14
|
from pairwise_alignments_to_msa.alignment import aligned_tuples_to_MSA
|
|
15
|
+
from copy import deepcopy
|
|
16
|
+
import numpy as np
|
|
19
17
|
|
|
20
18
|
aligner = PairwiseAligner(scoring='blastn')
|
|
21
19
|
|
|
@@ -24,6 +22,39 @@ for normal_base in 'ACGT':
|
|
|
24
22
|
del ambiguous_only_dna_values[normal_base]
|
|
25
23
|
|
|
26
24
|
|
|
25
|
+
def get_sequence_shift(sequence: str, reference: str) -> int:
|
|
26
|
+
"""Given two identical but shifted sequences, return the shift."""
|
|
27
|
+
if sequence == reference:
|
|
28
|
+
return 0
|
|
29
|
+
else:
|
|
30
|
+
result = (sequence.upper() * 2).find(reference.upper())
|
|
31
|
+
if result == -1:
|
|
32
|
+
raise ValueError('Sequence not found in reference')
|
|
33
|
+
return result % len(sequence)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def remove_padding(alignment: Alignment, reference: str) -> (str, str):
|
|
37
|
+
"""Remove the padding from the permutated sequence."""
|
|
38
|
+
new_alignment = deepcopy(alignment)
|
|
39
|
+
permutated_sequence = new_alignment.sequences[1]
|
|
40
|
+
sequence_shift = get_sequence_shift(permutated_sequence, reference)
|
|
41
|
+
padding = len(permutated_sequence) - len(reference)
|
|
42
|
+
if padding == 0:
|
|
43
|
+
return tuple(new_alignment)
|
|
44
|
+
unshifted = permutated_sequence[sequence_shift:] + permutated_sequence[:sequence_shift]
|
|
45
|
+
replaced = unshifted[:-padding] + '-' * padding
|
|
46
|
+
new_alignment.sequences[1] = replaced[-sequence_shift:] + replaced[:-sequence_shift]
|
|
47
|
+
|
|
48
|
+
# Remove positions in the alignment where both positions contain a dash
|
|
49
|
+
# this happens because of - matching Ns in the permutated sequence.
|
|
50
|
+
# It's not the best way to do this, but it works for now.
|
|
51
|
+
out_seqs = tuple(new_alignment)
|
|
52
|
+
seqs_array = np.array([list(s) for s in out_seqs])
|
|
53
|
+
# Drop positions where both sequences are dashes
|
|
54
|
+
seqs_array = seqs_array[:, ~np.all(seqs_array == '-', axis=0)]
|
|
55
|
+
return tuple(''.join(s) for s in seqs_array)
|
|
56
|
+
|
|
57
|
+
|
|
27
58
|
def sum_is_sticky(three_prime_end: tuple[str, str], five_prime_end: tuple[str, str], partial: bool = False) -> int:
|
|
28
59
|
"""Return the overlap length if the 3' end of seq1 and 5' end of seq2 ends are sticky and compatible for ligation.
|
|
29
60
|
Return 0 if they are not compatible."""
|
|
@@ -52,31 +83,6 @@ def sum_is_sticky(three_prime_end: tuple[str, str], five_prime_end: tuple[str, s
|
|
|
52
83
|
return 0
|
|
53
84
|
|
|
54
85
|
|
|
55
|
-
def get_alignment_shift(alignment: Dseq, shift: int) -> int:
|
|
56
|
-
"""Shift the alignment by the given number of positions, ignoring gap characters (-).
|
|
57
|
-
|
|
58
|
-
Parameters
|
|
59
|
-
----------
|
|
60
|
-
alignment : Dseq
|
|
61
|
-
The alignment sequence that may contain gap characters (-)
|
|
62
|
-
shift : int
|
|
63
|
-
Number of positions to shift the sequence by
|
|
64
|
-
|
|
65
|
-
"""
|
|
66
|
-
|
|
67
|
-
nucleotides_shifted = 0
|
|
68
|
-
positions_shifted = 0
|
|
69
|
-
corrected_shift = shift if shift >= 0 else len(alignment) + shift
|
|
70
|
-
alignment_str = str(alignment)
|
|
71
|
-
|
|
72
|
-
while nucleotides_shifted != corrected_shift:
|
|
73
|
-
if alignment_str[positions_shifted] != '-':
|
|
74
|
-
nucleotides_shifted += 1
|
|
75
|
-
positions_shifted += 1
|
|
76
|
-
|
|
77
|
-
return positions_shifted
|
|
78
|
-
|
|
79
|
-
|
|
80
86
|
def align_with_mafft(inputs: list[str], orientation_known: bool) -> list[str]:
|
|
81
87
|
"""Align a sanger track to a dseqr sequence"""
|
|
82
88
|
|
|
@@ -140,12 +146,13 @@ def align_sanger_traces(dseqr: Dseqrecord, sanger_traces: list[str]) -> list[str
|
|
|
140
146
|
aligned_pairs = []
|
|
141
147
|
for trace in sanger_traces:
|
|
142
148
|
# If the sequence is circular, permutate both fwd and reverse complement
|
|
149
|
+
rc_trace = reverse_complement(trace)
|
|
143
150
|
if dseqr.circular:
|
|
144
151
|
fwd = permutate_trace(query_str, trace)
|
|
145
|
-
rvs = permutate_trace(query_str,
|
|
152
|
+
rvs = permutate_trace(query_str, rc_trace)
|
|
146
153
|
else:
|
|
147
154
|
fwd = trace
|
|
148
|
-
rvs =
|
|
155
|
+
rvs = rc_trace
|
|
149
156
|
|
|
150
157
|
# Pairwise-align and keep the best alignment
|
|
151
158
|
fwd_alignment = next(aligner.align(query_str, fwd))
|
|
@@ -153,35 +160,11 @@ def align_sanger_traces(dseqr: Dseqrecord, sanger_traces: list[str]) -> list[str
|
|
|
153
160
|
|
|
154
161
|
best_alignment = fwd_alignment if fwd_alignment.score > rvs_alignment.score else rvs_alignment
|
|
155
162
|
|
|
156
|
-
|
|
157
|
-
|
|
163
|
+
if dseqr.circular:
|
|
164
|
+
trace4padding = trace if best_alignment is fwd_alignment else rc_trace
|
|
165
|
+
formatted_alignment = remove_padding(best_alignment, trace4padding)
|
|
166
|
+
else:
|
|
167
|
+
formatted_alignment = tuple(best_alignment)
|
|
168
|
+
aligned_pairs.append(formatted_alignment)
|
|
158
169
|
|
|
159
170
|
return aligned_tuples_to_MSA(aligned_pairs)
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
def compute_regex_site(site: str) -> str:
|
|
163
|
-
upper_site = site.upper()
|
|
164
|
-
for k, v in ambiguous_only_dna_values.items():
|
|
165
|
-
if len(v) > 1:
|
|
166
|
-
upper_site = upper_site.replace(k, f"[{''.join(v)}]")
|
|
167
|
-
|
|
168
|
-
# Make case insensitive
|
|
169
|
-
upper_site = f'(?i){upper_site}'
|
|
170
|
-
return upper_site
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
def dseqrecord_finditer(pattern: str, seq: Dseqrecord) -> list[re.Match]:
|
|
174
|
-
query = str(seq.seq) if not seq.circular else str(seq.seq) * 2
|
|
175
|
-
matches = re.finditer(pattern, query)
|
|
176
|
-
return (m for m in matches if m.start() <= len(seq))
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
def create_location(start: int, end: int, lim: int) -> Location:
|
|
180
|
-
while start < 0:
|
|
181
|
-
start += lim
|
|
182
|
-
while end < 0:
|
|
183
|
-
end += lim
|
|
184
|
-
if end > start:
|
|
185
|
-
return SimpleLocation(start, end)
|
|
186
|
-
else:
|
|
187
|
-
return shift_location(SimpleLocation(start, end + lim), 0, lim)
|
|
@@ -2,7 +2,7 @@ from pydna.dseqrecord import Dseqrecord
|
|
|
2
2
|
from Bio.SeqFeature import SimpleLocation
|
|
3
3
|
from ..primer3_functions import PrimerDesignSettings, primer3_design_primers
|
|
4
4
|
|
|
5
|
-
from
|
|
5
|
+
from opencloning_linkml.datamodel import Primer as PrimerModel
|
|
6
6
|
from .primer_design_settings import amanda_settings
|
|
7
7
|
|
|
8
8
|
adapter_left_fwd = 'ataGGTCTCtGGAG'
|
|
@@ -1,15 +1,14 @@
|
|
|
1
|
-
from fastapi import Query
|
|
1
|
+
from fastapi import Query
|
|
2
2
|
from pydantic import create_model
|
|
3
|
-
from urllib.error import HTTPError
|
|
4
3
|
|
|
5
4
|
from ..get_router import get_router
|
|
6
|
-
from
|
|
5
|
+
from opencloning_linkml.datamodel import TextFileSequence, AnnotationSource
|
|
7
6
|
from ..dna_functions import (
|
|
8
7
|
read_dsrecord_from_json,
|
|
9
8
|
annotate_with_plannotate as _annotate_with_plannotate,
|
|
10
9
|
format_sequence_genbank,
|
|
11
10
|
)
|
|
12
|
-
from
|
|
11
|
+
from pydna.gateway import find_gateway_sites
|
|
13
12
|
from ..app_settings import settings
|
|
14
13
|
|
|
15
14
|
router = get_router()
|
|
@@ -46,15 +45,12 @@ if settings.PLANNOTATE_URL is not None:
|
|
|
46
45
|
):
|
|
47
46
|
input_seqr = read_dsrecord_from_json(sequence)
|
|
48
47
|
# Make a request submitting sequence as a file:
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
)
|
|
56
|
-
except HTTPError as e:
|
|
57
|
-
raise HTTPException(e.code, e.msg) from e
|
|
48
|
+
seqr, annotations, version = await _annotate_with_plannotate(
|
|
49
|
+
sequence.file_content,
|
|
50
|
+
f'{sequence.id}.gb',
|
|
51
|
+
settings.PLANNOTATE_URL + 'annotate',
|
|
52
|
+
settings.PLANNOTATE_TIMEOUT,
|
|
53
|
+
)
|
|
58
54
|
|
|
59
55
|
source.annotation_report = annotations
|
|
60
56
|
source.annotation_tool = 'plannotate'
|