opencloning 0.4.8__py3-none-any.whl → 0.5.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. opencloning/app_settings.py +7 -0
  2. opencloning/batch_cloning/pombe/__init__.py +2 -2
  3. opencloning/batch_cloning/pombe/pombe_clone.py +31 -112
  4. opencloning/batch_cloning/pombe/pombe_summary.py +20 -8
  5. opencloning/batch_cloning/ziqiang_et_al2024/__init__.py +8 -8
  6. opencloning/batch_cloning/ziqiang_et_al2024/ziqiang_et_al2024.json +2 -9
  7. opencloning/bug_fixing/backend_v0_3.py +13 -5
  8. opencloning/catalogs/__init__.py +36 -0
  9. opencloning/catalogs/igem2024.yaml +2172 -0
  10. opencloning/catalogs/openDNA_collections.yaml +1161 -0
  11. opencloning/catalogs/readme.txt +1 -0
  12. opencloning/catalogs/seva.tsv +231 -0
  13. opencloning/catalogs/snapgene.yaml +2837 -0
  14. opencloning/dna_functions.py +155 -158
  15. opencloning/dna_utils.py +45 -62
  16. opencloning/ebic/primer_design.py +1 -1
  17. opencloning/endpoints/annotation.py +9 -13
  18. opencloning/endpoints/assembly.py +157 -378
  19. opencloning/endpoints/endpoint_utils.py +52 -0
  20. opencloning/endpoints/external_import.py +169 -124
  21. opencloning/endpoints/no_assembly.py +23 -39
  22. opencloning/endpoints/no_input.py +32 -47
  23. opencloning/endpoints/other.py +1 -1
  24. opencloning/endpoints/primer_design.py +2 -1
  25. opencloning/http_client.py +2 -2
  26. opencloning/ncbi_requests.py +113 -47
  27. opencloning/primer_design.py +1 -1
  28. opencloning/pydantic_models.py +10 -510
  29. opencloning/request_examples.py +10 -22
  30. opencloning/temp_functions.py +50 -0
  31. {opencloning-0.4.8.dist-info → opencloning-0.5.0.1.dist-info}/METADATA +18 -8
  32. opencloning-0.5.0.1.dist-info/RECORD +51 -0
  33. {opencloning-0.4.8.dist-info → opencloning-0.5.0.1.dist-info}/WHEEL +1 -1
  34. opencloning/cre_lox.py +0 -116
  35. opencloning/gateway.py +0 -154
  36. opencloning-0.4.8.dist-info/RECORD +0 -45
  37. {opencloning-0.4.8.dist-info → opencloning-0.5.0.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,24 +1,38 @@
1
- from functools import cmp_to_key
2
- from urllib.error import HTTPError
1
+ from fastapi import HTTPException
2
+ from urllib.parse import quote
3
+ import math
3
4
  from Bio.Restriction.Restriction import RestrictionBatch
4
5
  from Bio.Seq import reverse_complement
5
6
  from pydna.dseqrecord import Dseqrecord
6
7
  from pydna.dseq import Dseq
7
- from .pydantic_models import TextFileSequence, AddgeneIdSource, SequenceFileFormat, WekWikGeneIdSource, SEVASource
8
- from opencloning_linkml.datamodel import PlannotateAnnotationReport
9
- from pydna.parsers import parse as pydna_parse
8
+ from opencloning_linkml.datamodel import (
9
+ PlannotateAnnotationReport,
10
+ TextFileSequence,
11
+ SequenceFileFormat,
12
+ )
13
+ from pydna.opencloning_models import (
14
+ AddgeneIdSource,
15
+ OpenDNACollectionsSource,
16
+ SEVASource,
17
+ SnapGenePlasmidSource,
18
+ WekWikGeneIdSource,
19
+ BenchlingUrlSource,
20
+ IGEMSource,
21
+ EuroscarfSource,
22
+ )
23
+
10
24
  from bs4 import BeautifulSoup
11
- import regex
12
- from Bio.SeqFeature import SimpleLocation, Location
13
- from pydna.utils import shift_location
14
25
  from pydna.common_sub_strings import common_sub_strings
15
26
  from Bio.SeqIO import parse as seqio_parse
16
27
  import io
17
28
  import warnings
18
29
  from Bio.SeqIO.InsdcIO import GenBankScanner, GenBankIterator
19
30
  import re
31
+
32
+ from opencloning.catalogs import iGEM2024_catalog, openDNA_collections_catalog, seva_catalog, snapgene_catalog
20
33
  from .http_client import get_http_client, ConnectError, TimeoutException
21
34
  from .ncbi_requests import get_genbank_sequence
35
+ from typing import Callable
22
36
 
23
37
 
24
38
  def format_sequence_genbank(seq: Dseqrecord, seq_name: str = None) -> TextFileSequence:
@@ -33,12 +47,18 @@ def format_sequence_genbank(seq: Dseqrecord, seq_name: str = None) -> TextFileSe
33
47
  file_content=seq.format('genbank'),
34
48
  sequence_file_format=SequenceFileFormat('genbank'),
35
49
  overhang_crick_3prime=seq.seq.ovhg,
36
- overhang_watson_3prime=seq.seq.watson_ovhg(),
50
+ overhang_watson_3prime=seq.seq.watson_ovhg,
37
51
  )
38
52
 
39
53
 
40
54
  def read_dsrecord_from_json(seq: TextFileSequence) -> Dseqrecord:
41
- initial_dseqrecord: Dseqrecord = pydna_parse(seq.file_content)[0]
55
+ with io.StringIO(seq.file_content) as handle:
56
+ try:
57
+ initial_dseqrecord: Dseqrecord = custom_file_parser(handle, 'genbank')[0]
58
+ except ValueError as e:
59
+ raise HTTPException(
60
+ 422, f'The file for sequence with id {seq.id} is not in a valid genbank format: {e}'
61
+ ) from e
42
62
  if seq.overhang_watson_3prime == 0 and seq.overhang_crick_3prime == 0:
43
63
  out_dseq_record = initial_dseqrecord
44
64
  else:
@@ -68,117 +88,118 @@ def get_invalid_enzyme_names(enzyme_names_list: list[str | None]) -> list[str]:
68
88
 
69
89
 
70
90
  async def get_sequences_from_file_url(
71
- url: str, format: SequenceFileFormat = SequenceFileFormat('genbank')
91
+ url: str,
92
+ format: SequenceFileFormat = SequenceFileFormat('genbank'),
93
+ params: dict | None = None,
94
+ headers: dict | None = None,
95
+ get_function: None | Callable = None,
72
96
  ) -> list[Dseqrecord]:
73
- # TODO once pydna parse is fixed it should handle urls that point to non-gb files
74
- async with get_http_client() as client:
75
- resp = await client.get(url)
76
97
 
77
- if resp.status_code != 200:
78
- raise HTTPError(url, 404, 'file requested from url not found', 'file requested from url not found', None)
79
- if format == SequenceFileFormat('snapgene'):
80
- return custom_file_parser(io.BytesIO(resp.content), format)
98
+ if get_function is None:
99
+ async with get_http_client() as client:
100
+ resp = await client.get(url, params=params, headers=headers)
81
101
  else:
82
- return custom_file_parser(io.StringIO(resp.text), format)
102
+ resp = await get_function(url, params=params, headers=headers)
103
+
104
+ if math.floor(resp.status_code / 100) == 5:
105
+ raise HTTPException(503, 'the external server (not OpenCloning) returned an error')
106
+ elif math.floor(resp.status_code / 100) != 2:
107
+ raise HTTPException(404, 'file requested from url not found')
108
+ try:
109
+ if format == SequenceFileFormat('snapgene'):
110
+ return custom_file_parser(io.BytesIO(resp.content), format)
111
+ else:
112
+ return custom_file_parser(io.StringIO(resp.text), format)
113
+ except ValueError as e:
114
+ raise HTTPException(400, f'{e}') from e
83
115
 
84
116
 
85
- async def get_sequence_from_snapgene_url(url: str) -> Dseqrecord:
86
- async with get_http_client() as client:
87
- resp = await client.get(url)
88
- # Check that resp.content is not empty
89
- if len(resp.content) == 0:
90
- raise HTTPError(url, 404, 'invalid snapgene id', 'invalid snapgene id', None)
91
- parsed_seq = next(seqio_parse(io.BytesIO(resp.content), 'snapgene'))
92
- circularize = 'topology' in parsed_seq.annotations.keys() and parsed_seq.annotations['topology'] == 'circular'
93
- return Dseqrecord(parsed_seq, circular=circularize)
117
+ async def request_from_snapgene(plasmid_set: dict, plasmid_name: str) -> Dseqrecord:
118
+ if plasmid_set not in snapgene_catalog:
119
+ raise HTTPException(404, 'invalid plasmid set')
120
+ if plasmid_name not in snapgene_catalog[plasmid_set]:
121
+ raise HTTPException(404, f'{plasmid_name} is not part of {plasmid_set}')
122
+ url = f'https://www.snapgene.com/local/fetch.php?set={plasmid_set}&plasmid={plasmid_name}'
123
+ seqs = await get_sequences_from_file_url(url, SequenceFileFormat('snapgene'))
124
+ seq = seqs[0]
125
+ seq.name = plasmid_name
126
+ seq.source = SnapGenePlasmidSource(repository_id=f'{plasmid_set}/{plasmid_name}')
127
+ return seq
94
128
 
95
129
 
96
- async def request_from_addgene(source: AddgeneIdSource) -> tuple[Dseqrecord, AddgeneIdSource]:
130
+ async def request_from_addgene(repository_id: str) -> Dseqrecord:
97
131
 
98
- url = f'https://www.addgene.org/{source.repository_id}/sequences/'
132
+ url = f'https://www.addgene.org/{repository_id}/sequences/'
99
133
  async with get_http_client() as client:
100
134
  resp = await client.get(url)
101
135
  if resp.status_code == 404:
102
- raise HTTPError(url, 404, 'wrong addgene id', 'wrong addgene id', None)
136
+ raise HTTPException(404, 'wrong addgene id')
103
137
  soup = BeautifulSoup(resp.content, 'html.parser')
104
138
 
105
139
  # Get a span.material-name from the soup, see https://github.com/manulera/OpenCloning_backend/issues/182
106
140
  plasmid_name = soup.find('span', class_='material-name').text.replace(' ', '_')
107
141
 
108
- if source.sequence_file_url:
109
- dseqr = (await get_sequences_from_file_url(source.sequence_file_url))[0]
110
- dseqr.name = plasmid_name
111
- return dseqr, source
112
-
113
- sequence_file_url_dict = dict()
114
- for _type in ['depositor-full', 'depositor-partial', 'addgene-full', 'addgene-partial']:
115
- sequence_file_url_dict[_type] = []
116
- if soup.find(id=_type) is not None:
117
- sequence_file_url_dict[_type] = [
118
- a.get('href') for a in soup.find(id=_type).findAll(class_='genbank-file-download')
119
- ]
120
-
121
- # TODO provide addgene sequencing data supporting the sequence
122
- # We prefer to return addgene full if both available
123
- products = list()
124
- sources = list()
125
- for _type in ['addgene-full', 'depositor-full']:
126
- if len(sequence_file_url_dict[_type]) > 0:
127
- for seq_url in sequence_file_url_dict[_type]:
128
- new_source = source.model_copy()
129
- new_source.sequence_file_url = seq_url
130
- new_source.addgene_sequence_type = _type
131
- sources.append(new_source)
132
- # There should be only one sequence
133
- products.append((await get_sequences_from_file_url(seq_url))[0])
134
-
135
- if len(products) == 0:
136
- # They may have only partial sequences
137
- raise HTTPError(
138
- url,
142
+ # Find the link to either the addgene-full (preferred) or depositor-full (secondary)
143
+ for addgene_sequence_type in ['depositor-full', 'addgene-full']:
144
+ if soup.find(id=addgene_sequence_type) is not None:
145
+ sequence_file_url = next(
146
+ a.get('href') for a in soup.find(id=addgene_sequence_type).findAll(class_='genbank-file-download')
147
+ )
148
+ break
149
+ else:
150
+ raise HTTPException(
139
151
  404,
140
- f'The requested plasmid does not have full sequences, see https://www.addgene.org/{source.repository_id}/sequences/',
141
- f'The requested plasmid does not have full sequences, see https://www.addgene.org/{source.repository_id}/sequences/',
142
- None,
152
+ f'The requested plasmid does not have full sequences, see https://www.addgene.org/{repository_id}/sequences/',
143
153
  )
144
-
145
- # Rename the plasmid
146
- for p in products:
147
- p.name = plasmid_name
148
- return products[0], sources[0]
154
+ dseqr = (await get_sequences_from_file_url(sequence_file_url))[0]
155
+ dseqr.name = plasmid_name
156
+ dseqr.source = AddgeneIdSource(
157
+ repository_id=repository_id,
158
+ sequence_file_url=sequence_file_url,
159
+ addgene_sequence_type=addgene_sequence_type,
160
+ )
161
+ return dseqr
149
162
 
150
163
 
151
- async def request_from_wekwikgene(source: WekWikGeneIdSource) -> tuple[Dseqrecord, WekWikGeneIdSource]:
152
- url = f'https://wekwikgene.wllsb.edu.cn/plasmids/{source.repository_id}'
164
+ async def request_from_wekwikgene(repository_id: str) -> Dseqrecord:
165
+ url = f'https://wekwikgene.wllsb.edu.cn/plasmids/{repository_id}'
153
166
  async with get_http_client() as client:
154
167
  resp = await client.get(url)
155
168
  if resp.status_code == 404:
156
- raise HTTPError(url, 404, 'invalid wekwikgene id', 'invalid wekwikgene id', None)
169
+ raise HTTPException(404, 'invalid wekwikgene id')
157
170
  soup = BeautifulSoup(resp.content, 'html.parser')
158
171
  # Get the sequence file URL from the page
159
172
  sequence_file_url = soup.find('a', text=lambda x: x and 'Download Sequence' in x).get('href')
160
173
  sequence_name = soup.find('h1', class_='plasmid__info__name').text.replace(' ', '_')
161
174
  seq = (await get_sequences_from_file_url(sequence_file_url, 'snapgene'))[0]
162
175
  seq.name = sequence_name
163
- source.sequence_file_url = sequence_file_url
164
- return seq, source
165
-
166
-
167
- async def get_seva_plasmid(source: SEVASource) -> tuple[Dseqrecord, SEVASource]:
168
- if 'ncbi.nlm.nih.gov/nuccore' in source.sequence_file_url:
169
- genbank_id = source.sequence_file_url.split('/')[-1]
170
- seq = await get_genbank_sequence(genbank_id)
171
- seq.name = source.repository_id
172
- elif source.sequence_file_url.startswith('https://seva-plasmids.com'):
173
- seq_list = await get_sequences_from_file_url(source.sequence_file_url)
174
- if len(seq_list) == 0:
175
- raise ValueError('No sequences found in SEVA file')
176
- seq = seq_list[0]
176
+ seq.source = WekWikGeneIdSource(repository_id=repository_id, sequence_file_url=sequence_file_url)
177
+ return seq
178
+
179
+
180
+ async def get_seva_plasmid(repository_id: str) -> Dseqrecord:
181
+ if repository_id not in seva_catalog:
182
+ raise HTTPException(404, 'invalid SEVA id')
183
+ link = seva_catalog[repository_id]
184
+ if 'http' not in link:
185
+ seq = await get_genbank_sequence(link)
177
186
  else:
178
- raise HTTPError(source.sequence_file_url, 404, 'invalid SEVA url', 'invalid SEVA url', None)
187
+ seqs = await get_sequences_from_file_url(link)
188
+ seq = seqs[0]
189
+
179
190
  if not seq.circular:
180
191
  seq = seq.looped()
181
- return seq, source
192
+ seq.name = repository_id
193
+ sequence_file_url = link if 'http' in link else f'https://www.ncbi.nlm.nih.gov/nuccore/{link}'
194
+ seq.source = SEVASource(repository_id=repository_id, sequence_file_url=sequence_file_url)
195
+ return seq
196
+
197
+
198
+ async def get_sequence_from_benchling_url(url: str) -> Dseqrecord:
199
+ dseqs = await get_sequences_from_file_url(url)
200
+ dseq = dseqs[0]
201
+ dseq.source = BenchlingUrlSource(repository_id=url)
202
+ return dseq
182
203
 
183
204
 
184
205
  def correct_name(dseq: Dseqrecord):
@@ -187,57 +208,6 @@ def correct_name(dseq: Dseqrecord):
187
208
  dseq.name = dseq.annotations['keywords'][0].replace(' ', '_')
188
209
 
189
210
 
190
- def location_sorter(x, y) -> int:
191
- """
192
- Sort by start, then length, then strand.
193
- """
194
- if x.parts[0].start != y.parts[0].start:
195
- return x.parts[0].start - y.parts[0].start
196
- elif x.parts[-1].end != y.parts[-1].end:
197
- return x.parts[-1].end - y.parts[-1].end
198
- return x.strand - y.strand
199
-
200
-
201
- def get_all_regex_feature_edges(pattern: str, seq: str, is_circular: bool) -> list[tuple[int, int]]:
202
-
203
- subject = 2 * seq if is_circular else seq
204
-
205
- compiled_pattern = regex.compile(pattern, regex.IGNORECASE)
206
- compiled_pattern_rev = regex.compile('(?r)' + pattern, regex.IGNORECASE)
207
-
208
- matches = list(regex.finditer(compiled_pattern, subject, overlapped=True))
209
- matches += list(regex.finditer(compiled_pattern_rev, subject, overlapped=True))
210
-
211
- # In circular objects we remove the matches that span the sequence more than once: m.end() - m.start() <= len(seq)
212
- return list(set([(m.start(), m.end()) for m in matches if (m.end() - m.start() <= len(seq))]))
213
-
214
-
215
- def find_sequence_regex(pattern: str, seq: str, is_circular: bool) -> list[Location]:
216
-
217
- feature_locations = list()
218
-
219
- # Strand 1
220
- feature_edges = get_all_regex_feature_edges(pattern, seq, is_circular)
221
- # We use shift_location to format origin-spanning features in circular DNA
222
- feature_locations += [shift_location(SimpleLocation(start, end, 1), 0, len(seq)) for start, end in feature_edges]
223
-
224
- # Strand -1
225
- feature_edges = get_all_regex_feature_edges(pattern, reverse_complement(seq), is_circular)
226
- feature_locations += [
227
- shift_location(SimpleLocation(start, end, 1)._flip(len(seq)), 0, len(seq)) for start, end in feature_edges
228
- ]
229
-
230
- # We return a unique list, cannot use a set because Location is not hashable
231
- return sorted(
232
- [x for i, x in enumerate(feature_locations) if x not in feature_locations[:i]], key=cmp_to_key(location_sorter)
233
- )
234
-
235
-
236
- # Could be useful at some point
237
- # def seq_overlap_length(dseq: Dseq) -> int:
238
- # return len(dseq) - abs(dseq.ovhg) - abs(dseq.watson_ovhg())
239
-
240
-
241
211
  def oligonucleotide_hybridization_overhangs(
242
212
  fwd_oligo_seq: str, rvs_oligo_seq: str, minimal_annealing: int
243
213
  ) -> list[int]:
@@ -327,38 +297,38 @@ def custom_file_parser(
327
297
  )
328
298
  out.append(Dseqrecord(parsed_seq, circular=circularize))
329
299
 
300
+ if len(out) == 0:
301
+ raise ValueError('No sequences found in file')
330
302
  return out
331
303
 
332
304
 
333
305
  async def get_sequence_from_euroscarf_url(plasmid_id: str) -> Dseqrecord:
334
306
  url = f'http://www.euroscarf.de/plasmid_details.php?accno={plasmid_id}'
335
307
  async with get_http_client() as client:
336
- try:
337
- resp = await client.get(url)
338
- except ConnectError as e:
339
- raise HTTPError(url, 504, 'could not connect to euroscarf', 'could not connect to euroscarf', None) from e
340
- # I don't think this ever happens
341
- if resp.status_code != 200:
342
- raise HTTPError(
343
- url, resp.status_code, 'could not connect to euroscarf', 'could not connect to euroscarf', None
344
- )
308
+ resp = await client.get(url)
309
+
345
310
  # Use beautifulsoup to parse the html
346
311
  soup = BeautifulSoup(resp.text, 'html.parser')
347
312
  # Identify if it's an error (seems to be a php error log without a body tag)
348
313
  body_tag = soup.find('body')
349
314
  if body_tag is None:
350
315
  if 'Call to a member function getName()' in resp.text:
351
- raise HTTPError(url, 404, 'invalid euroscarf id', 'invalid euroscarf id', None)
316
+ raise HTTPException(404, 'invalid euroscarf id')
352
317
  else:
353
318
  msg = f'Could not retrieve plasmid details, double-check the euroscarf site: {url}'
354
- raise HTTPError(url, 503, msg, msg, None)
319
+ raise HTTPException(503, msg)
355
320
  # Get the download link
356
321
  subpath = soup.find('a', href=lambda x: x and x.startswith('files/dna'))
357
322
  if subpath is None:
358
323
  msg = f'Could not retrieve plasmid details, double-check the euroscarf site: {url}'
359
- raise HTTPError(url, 503, msg, msg, None)
324
+ raise HTTPException(503, msg)
360
325
  genbank_url = f'http://www.euroscarf.de/{subpath.get("href")}'
361
- return (await get_sequences_from_file_url(genbank_url))[0]
326
+ seq = (await get_sequences_from_file_url(genbank_url))[0]
327
+ # Sometimes the files do not contain correct topology information, so we loop them
328
+ if not seq.circular:
329
+ seq = seq.looped()
330
+ seq.source = EuroscarfSource(repository_id=plasmid_id)
331
+ return seq
362
332
 
363
333
 
364
334
  async def annotate_with_plannotate(
@@ -373,14 +343,41 @@ async def annotate_with_plannotate(
373
343
  )
374
344
  if response.status_code != 200:
375
345
  detail = response.json().get('detail', 'plannotate server error')
376
- raise HTTPError(url, response.status_code, detail, detail, None)
346
+ raise HTTPException(response.status_code, detail)
377
347
  data = response.json()
378
348
  dseqr = custom_file_parser(io.StringIO(data['gb_file']), 'genbank')[0]
379
349
  report = [PlannotateAnnotationReport.model_validate(r) for r in data['report']]
380
350
  return dseqr, report, data['version']
381
351
  except TimeoutException as e:
382
- raise HTTPError(url, 504, 'plannotate server timeout', 'plannotate server timeout', None) from e
352
+ raise HTTPException(504, 'plannotate server timeout') from e
383
353
  except ConnectError as e:
384
- raise HTTPError(
385
- url, 500, 'cannot connect to plannotate server', 'cannot connect to plannotate server', None
386
- ) from e
354
+ raise HTTPException(500, 'cannot connect to plannotate server') from e
355
+
356
+
357
+ async def get_sequence_from_openDNA_collections(collection_name: str, plasmid_id: str) -> Dseqrecord:
358
+ if collection_name not in openDNA_collections_catalog:
359
+ raise HTTPException(404, 'invalid openDNA collections collection')
360
+ plasmid = next((item for item in openDNA_collections_catalog[collection_name] if item['id'] == plasmid_id), None)
361
+ if plasmid is None:
362
+ raise HTTPException(404, f'plasmid {plasmid_id} not found in {collection_name}')
363
+
364
+ path = quote(plasmid['path'])
365
+ url = f'https://assets.opencloning.org/open-dna-collections/{path}'
366
+ seqs = await get_sequences_from_file_url(url)
367
+ seq = seqs[0]
368
+ seq.name = plasmid['name'] if plasmid['name'] is not None else plasmid_id
369
+ seq.source = OpenDNACollectionsSource(repository_id=f'{collection_name}/{plasmid_id}', sequence_file_url=url)
370
+ return seq
371
+
372
+
373
+ async def get_sequence_from_iGEM2024(part: str, backbone: str) -> Dseqrecord:
374
+ all_plasmids = [item for collection in iGEM2024_catalog.values() for item in collection]
375
+ plasmid = next((item for item in all_plasmids if item['part'] == part and item['backbone'] == backbone), None)
376
+ if plasmid is None:
377
+ raise HTTPException(404, f'plasmid {part}-{backbone} not found in iGEM 2024')
378
+ url = f'https://assets.opencloning.org/annotated-igem-distribution/results/plasmids/{plasmid["id"]}.gb'
379
+ seqs = await get_sequences_from_file_url(url)
380
+ seq = seqs[0]
381
+ seq.name = f'{part}-{backbone}'
382
+ seq.source = IGEMSource(repository_id=f'{part}-{backbone}', sequence_file_url=url)
383
+ return seq
opencloning/dna_utils.py CHANGED
@@ -4,18 +4,16 @@ Utility functions moved here to avoid circular imports.
4
4
 
5
5
  from Bio.Seq import reverse_complement
6
6
  from pydna.dseqrecord import Dseqrecord
7
- from pydna.dseq import Dseq
8
7
  import tempfile
9
8
  import subprocess
10
9
  import os
11
10
  import shutil
12
11
  from pydna.parsers import parse
13
- from Bio.Align import PairwiseAligner
12
+ from Bio.Align import PairwiseAligner, Alignment
14
13
  from Bio.Data.IUPACData import ambiguous_dna_values as _ambiguous_dna_values
15
- import re
16
- from Bio.SeqFeature import Location, SimpleLocation
17
- from pydna.utils import shift_location
18
14
  from pairwise_alignments_to_msa.alignment import aligned_tuples_to_MSA
15
+ from copy import deepcopy
16
+ import numpy as np
19
17
 
20
18
  aligner = PairwiseAligner(scoring='blastn')
21
19
 
@@ -24,6 +22,39 @@ for normal_base in 'ACGT':
24
22
  del ambiguous_only_dna_values[normal_base]
25
23
 
26
24
 
25
+ def get_sequence_shift(sequence: str, reference: str) -> int:
26
+ """Given two identical but shifted sequences, return the shift."""
27
+ if sequence == reference:
28
+ return 0
29
+ else:
30
+ result = (sequence.upper() * 2).find(reference.upper())
31
+ if result == -1:
32
+ raise ValueError('Sequence not found in reference')
33
+ return result % len(sequence)
34
+
35
+
36
+ def remove_padding(alignment: Alignment, reference: str) -> (str, str):
37
+ """Remove the padding from the permutated sequence."""
38
+ new_alignment = deepcopy(alignment)
39
+ permutated_sequence = new_alignment.sequences[1]
40
+ sequence_shift = get_sequence_shift(permutated_sequence, reference)
41
+ padding = len(permutated_sequence) - len(reference)
42
+ if padding == 0:
43
+ return tuple(new_alignment)
44
+ unshifted = permutated_sequence[sequence_shift:] + permutated_sequence[:sequence_shift]
45
+ replaced = unshifted[:-padding] + '-' * padding
46
+ new_alignment.sequences[1] = replaced[-sequence_shift:] + replaced[:-sequence_shift]
47
+
48
+ # Remove positions in the alignment where both positions contain a dash
49
+ # this happens because of - matching Ns in the permutated sequence.
50
+ # It's not the best way to do this, but it works for now.
51
+ out_seqs = tuple(new_alignment)
52
+ seqs_array = np.array([list(s) for s in out_seqs])
53
+ # Drop positions where both sequences are dashes
54
+ seqs_array = seqs_array[:, ~np.all(seqs_array == '-', axis=0)]
55
+ return tuple(''.join(s) for s in seqs_array)
56
+
57
+
27
58
  def sum_is_sticky(three_prime_end: tuple[str, str], five_prime_end: tuple[str, str], partial: bool = False) -> int:
28
59
  """Return the overlap length if the 3' end of seq1 and 5' end of seq2 ends are sticky and compatible for ligation.
29
60
  Return 0 if they are not compatible."""
@@ -52,31 +83,6 @@ def sum_is_sticky(three_prime_end: tuple[str, str], five_prime_end: tuple[str, s
52
83
  return 0
53
84
 
54
85
 
55
- def get_alignment_shift(alignment: Dseq, shift: int) -> int:
56
- """Shift the alignment by the given number of positions, ignoring gap characters (-).
57
-
58
- Parameters
59
- ----------
60
- alignment : Dseq
61
- The alignment sequence that may contain gap characters (-)
62
- shift : int
63
- Number of positions to shift the sequence by
64
-
65
- """
66
-
67
- nucleotides_shifted = 0
68
- positions_shifted = 0
69
- corrected_shift = shift if shift >= 0 else len(alignment) + shift
70
- alignment_str = str(alignment)
71
-
72
- while nucleotides_shifted != corrected_shift:
73
- if alignment_str[positions_shifted] != '-':
74
- nucleotides_shifted += 1
75
- positions_shifted += 1
76
-
77
- return positions_shifted
78
-
79
-
80
86
  def align_with_mafft(inputs: list[str], orientation_known: bool) -> list[str]:
81
87
  """Align a sanger track to a dseqr sequence"""
82
88
 
@@ -140,12 +146,13 @@ def align_sanger_traces(dseqr: Dseqrecord, sanger_traces: list[str]) -> list[str
140
146
  aligned_pairs = []
141
147
  for trace in sanger_traces:
142
148
  # If the sequence is circular, permutate both fwd and reverse complement
149
+ rc_trace = reverse_complement(trace)
143
150
  if dseqr.circular:
144
151
  fwd = permutate_trace(query_str, trace)
145
- rvs = permutate_trace(query_str, reverse_complement(trace))
152
+ rvs = permutate_trace(query_str, rc_trace)
146
153
  else:
147
154
  fwd = trace
148
- rvs = reverse_complement(trace)
155
+ rvs = rc_trace
149
156
 
150
157
  # Pairwise-align and keep the best alignment
151
158
  fwd_alignment = next(aligner.align(query_str, fwd))
@@ -153,35 +160,11 @@ def align_sanger_traces(dseqr: Dseqrecord, sanger_traces: list[str]) -> list[str
153
160
 
154
161
  best_alignment = fwd_alignment if fwd_alignment.score > rvs_alignment.score else rvs_alignment
155
162
 
156
- formatted_alignment = best_alignment.format('fasta').split()[1::2]
157
- aligned_pairs.append(tuple(formatted_alignment))
163
+ if dseqr.circular:
164
+ trace4padding = trace if best_alignment is fwd_alignment else rc_trace
165
+ formatted_alignment = remove_padding(best_alignment, trace4padding)
166
+ else:
167
+ formatted_alignment = tuple(best_alignment)
168
+ aligned_pairs.append(formatted_alignment)
158
169
 
159
170
  return aligned_tuples_to_MSA(aligned_pairs)
160
-
161
-
162
- def compute_regex_site(site: str) -> str:
163
- upper_site = site.upper()
164
- for k, v in ambiguous_only_dna_values.items():
165
- if len(v) > 1:
166
- upper_site = upper_site.replace(k, f"[{''.join(v)}]")
167
-
168
- # Make case insensitive
169
- upper_site = f'(?i){upper_site}'
170
- return upper_site
171
-
172
-
173
- def dseqrecord_finditer(pattern: str, seq: Dseqrecord) -> list[re.Match]:
174
- query = str(seq.seq) if not seq.circular else str(seq.seq) * 2
175
- matches = re.finditer(pattern, query)
176
- return (m for m in matches if m.start() <= len(seq))
177
-
178
-
179
- def create_location(start: int, end: int, lim: int) -> Location:
180
- while start < 0:
181
- start += lim
182
- while end < 0:
183
- end += lim
184
- if end > start:
185
- return SimpleLocation(start, end)
186
- else:
187
- return shift_location(SimpleLocation(start, end + lim), 0, lim)
@@ -2,7 +2,7 @@ from pydna.dseqrecord import Dseqrecord
2
2
  from Bio.SeqFeature import SimpleLocation
3
3
  from ..primer3_functions import PrimerDesignSettings, primer3_design_primers
4
4
 
5
- from ..pydantic_models import PrimerModel
5
+ from opencloning_linkml.datamodel import Primer as PrimerModel
6
6
  from .primer_design_settings import amanda_settings
7
7
 
8
8
  adapter_left_fwd = 'ataGGTCTCtGGAG'
@@ -1,15 +1,14 @@
1
- from fastapi import Query, HTTPException
1
+ from fastapi import Query
2
2
  from pydantic import create_model
3
- from urllib.error import HTTPError
4
3
 
5
4
  from ..get_router import get_router
6
- from ..pydantic_models import TextFileSequence, AnnotationSource
5
+ from opencloning_linkml.datamodel import TextFileSequence, AnnotationSource
7
6
  from ..dna_functions import (
8
7
  read_dsrecord_from_json,
9
8
  annotate_with_plannotate as _annotate_with_plannotate,
10
9
  format_sequence_genbank,
11
10
  )
12
- from ..gateway import find_gateway_sites
11
+ from pydna.gateway import find_gateway_sites
13
12
  from ..app_settings import settings
14
13
 
15
14
  router = get_router()
@@ -46,15 +45,12 @@ if settings.PLANNOTATE_URL is not None:
46
45
  ):
47
46
  input_seqr = read_dsrecord_from_json(sequence)
48
47
  # Make a request submitting sequence as a file:
49
- try:
50
- seqr, annotations, version = await _annotate_with_plannotate(
51
- sequence.file_content,
52
- f'{sequence.id}.gb',
53
- settings.PLANNOTATE_URL + 'annotate',
54
- settings.PLANNOTATE_TIMEOUT,
55
- )
56
- except HTTPError as e:
57
- raise HTTPException(e.code, e.msg) from e
48
+ seqr, annotations, version = await _annotate_with_plannotate(
49
+ sequence.file_content,
50
+ f'{sequence.id}.gb',
51
+ settings.PLANNOTATE_URL + 'annotate',
52
+ settings.PLANNOTATE_TIMEOUT,
53
+ )
58
54
 
59
55
  source.annotation_report = annotations
60
56
  source.annotation_tool = 'plannotate'