opencloning 0.4.8__py3-none-any.whl → 0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. opencloning/app_settings.py +7 -0
  2. opencloning/batch_cloning/pombe/__init__.py +2 -2
  3. opencloning/batch_cloning/pombe/pombe_clone.py +31 -112
  4. opencloning/batch_cloning/pombe/pombe_summary.py +20 -8
  5. opencloning/batch_cloning/ziqiang_et_al2024/__init__.py +8 -8
  6. opencloning/batch_cloning/ziqiang_et_al2024/ziqiang_et_al2024.json +2 -9
  7. opencloning/bug_fixing/backend_v0_3.py +13 -5
  8. opencloning/catalogs/__init__.py +36 -0
  9. opencloning/catalogs/igem2024.yaml +2172 -0
  10. opencloning/catalogs/openDNA_collections.yaml +1161 -0
  11. opencloning/catalogs/readme.txt +1 -0
  12. opencloning/catalogs/seva.tsv +231 -0
  13. opencloning/catalogs/snapgene.yaml +2837 -0
  14. opencloning/dna_functions.py +155 -158
  15. opencloning/dna_utils.py +45 -62
  16. opencloning/ebic/primer_design.py +1 -1
  17. opencloning/endpoints/annotation.py +9 -13
  18. opencloning/endpoints/assembly.py +157 -378
  19. opencloning/endpoints/endpoint_utils.py +52 -0
  20. opencloning/endpoints/external_import.py +169 -124
  21. opencloning/endpoints/no_assembly.py +23 -39
  22. opencloning/endpoints/no_input.py +32 -47
  23. opencloning/endpoints/other.py +1 -1
  24. opencloning/endpoints/primer_design.py +2 -1
  25. opencloning/http_client.py +2 -2
  26. opencloning/ncbi_requests.py +113 -47
  27. opencloning/primer_design.py +1 -1
  28. opencloning/pydantic_models.py +10 -510
  29. opencloning/request_examples.py +10 -22
  30. opencloning/temp_functions.py +50 -0
  31. {opencloning-0.4.8.dist-info → opencloning-0.5.dist-info}/METADATA +18 -8
  32. opencloning-0.5.dist-info/RECORD +51 -0
  33. {opencloning-0.4.8.dist-info → opencloning-0.5.dist-info}/WHEEL +1 -1
  34. opencloning/cre_lox.py +0 -116
  35. opencloning/gateway.py +0 -154
  36. opencloning-0.4.8.dist-info/RECORD +0 -45
  37. {opencloning-0.4.8.dist-info → opencloning-0.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,19 +1,22 @@
1
1
  from fastapi import Query, HTTPException
2
2
  from pydna.dseqrecord import Dseqrecord
3
3
  from pydna.dseq import Dseq
4
+ from pydna.primer import Primer as PydnaPrimer
5
+ from pydna.oligonucleotide_hybridization import oligonucleotide_hybridization as _oligonucleotide_hybridization
4
6
  from pydantic import create_model, Field
5
7
  from typing import Annotated
6
8
 
9
+ from opencloning.endpoints.endpoint_utils import format_products
10
+
7
11
  from ..dna_functions import (
8
12
  format_sequence_genbank,
9
- oligonucleotide_hybridization_overhangs,
10
13
  )
11
- from ..pydantic_models import (
12
- PrimerModel,
14
+ from opencloning_linkml.datamodel import (
15
+ Primer as PrimerModel,
13
16
  TextFileSequence,
14
17
  ManuallyTypedSource,
15
18
  OligoHybridizationSource,
16
- SourceInput,
19
+ ManuallyTypedSequence,
17
20
  )
18
21
 
19
22
  from .. import request_examples
@@ -28,14 +31,14 @@ router = get_router()
28
31
  'ManuallyTypedResponse', sources=(list[ManuallyTypedSource], ...), sequences=(list[TextFileSequence], ...)
29
32
  ),
30
33
  )
31
- async def manually_typed(source: ManuallyTypedSource):
34
+ async def manually_typed(source: ManuallyTypedSource, sequence: ManuallyTypedSequence):
32
35
  """Return the sequence from a manually typed sequence"""
33
- if source.circular:
34
- seq = Dseqrecord(source.user_input, circular=source.circular)
36
+ if sequence.circular:
37
+ seq = Dseqrecord(sequence.sequence, circular=sequence.circular)
35
38
  else:
36
39
  seq = Dseqrecord(
37
40
  Dseq.from_full_sequence_and_overhangs(
38
- source.user_input, source.overhang_crick_3prime, source.overhang_watson_3prime
41
+ sequence.sequence, sequence.overhang_crick_3prime, sequence.overhang_watson_3prime
39
42
  )
40
43
  )
41
44
  return {'sequences': [format_sequence_genbank(seq, source.output_name)], 'sources': [source]}
@@ -59,57 +62,39 @@ async def oligonucleotide_hybridization(
59
62
  primers: Annotated[list[PrimerModel], Field(min_length=1, max_length=2)],
60
63
  minimal_annealing: int = Query(20, description='The minimal annealing length for each primer.'),
61
64
  ):
65
+
62
66
  if len(source.input):
63
- watson_seq = next((p.sequence for p in primers if p.id == source.input[0].sequence), None)
64
- crick_seq = next((p.sequence for p in primers if p.id == source.input[1].sequence), None)
67
+ fwd_primer = next((p for p in primers if p.id == source.input[0].sequence), None)
68
+ rvs_primer = next((p for p in primers if p.id == source.input[1].sequence), None)
65
69
  else:
66
- watson_seq = primers[0].sequence
67
- crick_seq = primers[1].sequence if len(primers) > 1 else watson_seq
68
- source.input = [SourceInput(sequence=primers[0].id), SourceInput(sequence=primers[1].id)]
70
+ fwd_primer = primers[0]
71
+ rvs_primer = primers[1] if len(primers) > 1 else fwd_primer
69
72
 
70
- if watson_seq is None or crick_seq is None:
73
+ if fwd_primer is None or rvs_primer is None:
71
74
  raise HTTPException(404, 'Invalid oligo id.')
72
75
 
73
- # The overhang is provided
76
+ fwd_primer = PydnaPrimer(fwd_primer.sequence, id=str(fwd_primer.id), name=fwd_primer.name)
77
+ rvs_primer = PydnaPrimer(rvs_primer.sequence, id=str(rvs_primer.id), name=rvs_primer.name)
78
+
79
+ # If the overhang is provided, the minimal annealing is set from that
74
80
  if source.overhang_crick_3prime is not None:
75
- ovhg_watson = len(watson_seq) - len(crick_seq) + source.overhang_crick_3prime
76
- minimal_annealing = len(watson_seq)
81
+ ovhg_watson = len(fwd_primer.seq) - len(rvs_primer.seq) + source.overhang_crick_3prime
82
+ minimal_annealing = len(fwd_primer.seq)
77
83
  if source.overhang_crick_3prime < 0:
78
84
  minimal_annealing += source.overhang_crick_3prime
79
85
  if ovhg_watson > 0:
80
86
  minimal_annealing -= ovhg_watson
81
87
 
82
88
  try:
83
- possible_overhangs = oligonucleotide_hybridization_overhangs(watson_seq, crick_seq, minimal_annealing)
89
+ dseqs = _oligonucleotide_hybridization(fwd_primer, rvs_primer, minimal_annealing)
84
90
  except ValueError as e:
85
91
  raise HTTPException(400, *e.args)
86
92
 
87
- if len(possible_overhangs) == 0:
88
- raise HTTPException(400, 'No pair of annealing oligos was found. Try changing the annealing settings.')
89
-
90
- if source.overhang_crick_3prime is not None:
91
- if source.overhang_crick_3prime not in possible_overhangs:
92
- raise HTTPException(400, 'The provided overhang is not compatible with the primers.')
93
-
94
- return {
95
- 'sources': [source],
96
- 'sequences': [
97
- format_sequence_genbank(
98
- Dseqrecord(Dseq(watson_seq, crick_seq, source.overhang_crick_3prime)), source.output_name
99
- )
100
- ],
101
- }
102
-
103
- out_sources = list()
104
- out_sequences = list()
105
- for overhang in possible_overhangs:
106
- new_source = source.model_copy()
107
- new_source.overhang_crick_3prime = overhang
108
- out_sources.append(new_source)
109
- out_sequences.append(
110
- format_sequence_genbank(
111
- Dseqrecord(Dseq(watson_seq, crick_seq, new_source.overhang_crick_3prime)), source.output_name
112
- )
113
- )
114
-
115
- return {'sources': out_sources, 'sequences': out_sequences}
93
+ return format_products(
94
+ source.id,
95
+ dseqs,
96
+ source if source.overhang_crick_3prime is not None else None,
97
+ source.output_name,
98
+ no_products_error_message='No pair of annealing oligos was found. Try changing the annealing settings.',
99
+ wrong_completed_source_error_message='The provided source is not valid.',
100
+ )
@@ -13,9 +13,9 @@ from ..dna_functions import (
13
13
  )
14
14
  from ..dna_utils import align_sanger_traces
15
15
  from ..pydantic_models import (
16
- TextFileSequence,
17
16
  BaseCloningStrategy,
18
17
  )
18
+ from opencloning_linkml.datamodel import TextFileSequence
19
19
  from ..get_router import get_router
20
20
  from .._version import __version__ as backend_version
21
21
 
@@ -5,7 +5,8 @@ from Bio.Restriction import RestrictionBatch
5
5
  from Bio.SeqUtils import gc_fraction
6
6
 
7
7
  from ..dna_functions import get_invalid_enzyme_names
8
- from ..pydantic_models import PrimerModel, PrimerDesignQuery
8
+ from opencloning_linkml.datamodel import Primer as PrimerModel
9
+ from opencloning.pydantic_models import PrimerDesignQuery
9
10
  from ..dna_functions import read_dsrecord_from_json
10
11
  from ..primer_design import (
11
12
  homologous_recombination_primers,
@@ -7,7 +7,7 @@ from httpx import ( # noqa: F401
7
7
  AsyncHTTPTransport,
8
8
  Request,
9
9
  )
10
- from urllib.error import HTTPError
10
+ from fastapi import HTTPException
11
11
  import ssl
12
12
  import certifi
13
13
  from .app_settings import settings
@@ -22,7 +22,7 @@ class AllowedExternalUrlsTransport(AsyncHTTPTransport):
22
22
  async def handle_async_request(self, request: Request) -> Response:
23
23
  if any(str(request.url).startswith(url) for url in allowed_external_urls):
24
24
  return await super().handle_async_request(request)
25
- raise HTTPError(request.url, 403, f'Request to {request.url} is not allowed', None, None)
25
+ raise HTTPException(403, f'Request to {request.url} is not allowed')
26
26
 
27
27
 
28
28
  proxy = None
@@ -1,6 +1,9 @@
1
1
  from fastapi import HTTPException
2
- from pydna.parsers import parse as pydna_parse
2
+ import math
3
3
  from pydna.dseqrecord import Dseqrecord
4
+ from pydna.opencloning_models import GenomeCoordinatesSource, NCBISequenceSource
5
+ from Bio.SeqFeature import Location
6
+
4
7
  from .app_settings import settings
5
8
  from .http_client import get_http_client, Response
6
9
 
@@ -9,7 +12,14 @@ headers = None if settings.NCBI_API_KEY is None else {'api_key': settings.NCBI_A
9
12
 
10
13
async def async_get(url, headers, params=None) -> Response:
    """GET ``url`` through the shared HTTP client and screen server-side failures.

    A 200 response and any 4xx response are returned untouched so that the
    caller can interpret them. Every other status is converted into an
    ``HTTPException`` with status 503.

    Args:
        url: Address to request.
        headers: Headers forwarded to the client (may be ``None``).
        params: Optional query parameters.

    Returns:
        The upstream response when its status is 200 or in the 4xx range.

    Raises:
        HTTPException: 503 when the upstream status is neither 200 nor 4xx.
    """
    async with get_http_client() as client:
        resp = await client.get(url, headers=headers, params=params, timeout=20.0)
        status = resp.status_code
        if status == 500:
            # 500 is "Internal Server Error": the service answered but failed.
            raise HTTPException(503, 'NCBI returned an internal server error')
        if status == 503:
            # 503 is "Service Unavailable": the service is down or overloaded.
            raise HTTPException(503, 'NCBI is down, try again later')
        if status != 200 and status // 100 != 4:
            raise HTTPException(503, 'NCBI returned an unexpected error')
        return resp
13
23
 
14
24
 
15
25
  # TODO: this does not return old assembly accessions, see https://github.com/ncbi/datasets/issues/380#issuecomment-2231142816
@@ -43,23 +53,11 @@ async def get_sequence_accessions_from_assembly_accession(assembly_accession: st
43
53
 
44
54
 
45
55
async def get_annotation_from_locus_tag(locus_tag: str, assembly_accession: str) -> dict:
    """Return the single annotation of an assembly whose locus tag matches ``locus_tag``.

    The assembly is queried with ``get_annotations_from_query`` and the results
    are narrowed to records whose ``locus_tag`` contains the requested tag,
    compared case-insensitively.

    Args:
        locus_tag: Locus tag to look for.
        assembly_accession: Accession of the assembly to search.

    Returns:
        The one matching annotation record.

    Raises:
        HTTPException: 404 when no record matches, 400 when several do.
    """
    annotations = await get_annotations_from_query(locus_tag, assembly_accession)
    wanted = locus_tag.upper()
    # Records without a locus_tag cannot match; treat them as an empty tag
    # instead of failing with a KeyError.
    matches = [a for a in annotations if wanted in (a.get('locus_tag') or '').upper()]
    if not matches:
        # The query returned records, but none carries this tag: that is
        # "not found", not an ambiguity.
        raise HTTPException(404, 'wrong locus_tag')
    if len(matches) > 1:
        raise HTTPException(400, 'multiple matches for locus_tag')
    return matches[0]
63
61
 
64
62
 
65
63
  async def get_annotations_from_query(query: str, assembly_accession: str) -> list[dict]:
@@ -72,9 +70,6 @@ async def get_annotations_from_query(query: str, assembly_accession: str) -> lis
72
70
  if 'reports' not in data:
73
71
  raise HTTPException(404, f'query "{query}" gave no results')
74
72
 
75
- if len(data['reports']) > 1:
76
- raise HTTPException(400, 'multiple matches for query')
77
-
78
73
  return [r['annotation'] for r in data['reports']]
79
74
 
80
75
 
@@ -94,6 +89,12 @@ async def get_sequence_length_from_sequence_accession(sequence_accession: str) -
94
89
 
95
90
 
96
91
  async def get_genbank_sequence(sequence_accession, start=None, end=None, strand=None) -> Dseqrecord:
92
+ from opencloning.dna_functions import get_sequences_from_file_url
93
+
94
+ # Ensure that start, end, and strand are either all None or none are None
95
+ if (start is None or end is None or strand is None) and not (start is None and end is None and strand is None):
96
+ raise ValueError('start, end, and strand must either all be None or none be None')
97
+
97
98
  gb_strand = 1 if strand == 1 or strand is None else 2
98
99
  url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
99
100
  params = {
@@ -108,30 +109,95 @@ async def get_genbank_sequence(sequence_accession, start=None, end=None, strand=
108
109
  if headers is not None:
109
110
  params['api_key'] = headers['api_key']
110
111
 
111
- resp = await async_get(url, headers=headers, params=params)
112
- if resp.status_code == 200:
113
- try:
114
- return pydna_parse(resp.text)[0]
115
- except Exception:
116
- # Now the ncbi returns something like this:
117
- # Example: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=blah&rettype=gbwithparts&retmode=text
118
- # 'Error: F a i l e d t o u n d e r s t a n d i d : b l a h '
119
- raise HTTPException(404, 'wrong sequence accession')
120
- elif resp.status_code == 400:
121
- raise HTTPException(404, 'wrong sequence accession')
122
- elif resp.status_code == 503:
123
- raise HTTPException(503, 'NCBI returned an error')
112
+ try:
113
+ seq = (await get_sequences_from_file_url(url, params=params, headers=headers, get_function=async_get))[0]
114
+ except HTTPException as e:
115
+ # Now the ncbi returns something like this:
116
+ # Example: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=blah&rettype=gbwithparts&retmode=text
117
+ # 'Error: F a i l e d t o u n d e r s t a n d i d : b l a h '
118
+ if 'No sequences found in file' in e.detail:
119
+ raise HTTPException(404, 'invalid sequence accession') from e
120
+ raise e
121
+ except Exception as e:
122
+ raise e
123
+
124
+ if start is not None:
125
+ if strand == -1:
126
+ location = Location.fromstring(f'complement({start}..{end})')
127
+ else:
128
+ location = Location.fromstring(f'{start}..{end}')
124
129
  else:
125
- raise HTTPException(500, 'NCBI returned an unexpected error')
126
-
127
-
128
- def validate_coordinates_pre_request(start, end, strand):
129
- # TODO: move this to the class
130
- if strand not in [1, -1]:
131
- raise HTTPException(422, 'strand must be 1 or -1')
132
- if start >= end:
133
- raise HTTPException(422, 'start must be less than end')
134
- if start < 1:
135
- raise HTTPException(422, 'start must be greater than 0')
136
- if end - start > 100000:
137
- raise HTTPException(400, 'sequence is too long (max 100000 bp)')
130
+ location = None
131
+
132
+ seq.source = NCBISequenceSource(repository_id=sequence_accession, coordinates=location)
133
+ return seq
134
+
135
+
136
+ def get_info_from_annotation(annotation: dict) -> dict:
137
+ start = int(annotation['genomic_regions'][0]['gene_range']['range'][0]['begin'])
138
+ end = int(annotation['genomic_regions'][0]['gene_range']['range'][0]['end'])
139
+ strand = 1 if annotation['genomic_regions'][0]['gene_range']['range'][0]['orientation'] == 'plus' else -1
140
+ sequence_accession = annotation['genomic_regions'][0]['gene_range']['accession_version']
141
+ locus_tag = annotation['locus_tag'] if 'locus_tag' in annotation else None
142
+ gene_id = int(annotation['gene_id']) if 'gene_id' in annotation else None
143
+ try:
144
+ assembly_accession = annotation['annotations'][0]['assembly_accession']
145
+ except KeyError:
146
+ assembly_accession = None
147
+ except IndexError:
148
+ assembly_accession = None
149
+
150
+ return start, end, strand, gene_id, sequence_accession, locus_tag, assembly_accession
151
+
152
+
153
async def validate_locus_tag(
    locus_tag: str, assembly_accession: str, gene_id: int | None, start: int, end: int, strand: int
) -> int | None:
    """Check that a locus tag exists in an assembly and lies inside the requested region.

    Args:
        locus_tag: Locus tag to look up.
        assembly_accession: Accession of the assembly to search.
        gene_id: Gene id supplied by the caller, or ``None`` to take it from
            the annotation.
        start: First position of the requested region.
        end: Last position of the requested region.
        strand: Strand of the requested region (``1`` or ``-1``).

    Returns:
        The gene id: the caller's value when given, otherwise the one found in
        the annotation, or ``None`` when neither provides one.

    Raises:
        HTTPException: 400 when ``gene_id`` is given but the annotation has
            none or a different one, or when the gene is not contained in the
            requested coordinates on the requested strand. Errors raised while
            looking up the locus tag propagate unchanged.
    """
    annotation = await get_annotation_from_locus_tag(locus_tag, assembly_accession)
    gene_start, gene_end, gene_strand, gene_id_annotation, *_ = get_info_from_annotation(annotation)

    # gene_id will not be present in all annotations, but should be there in
    # reference genomes. get_info_from_annotation reports a missing one as None.
    if gene_id is None:
        gene_id = gene_id_annotation
    elif gene_id_annotation is None:
        raise HTTPException(400, 'gene_id is set, but not found in the annotation')
    elif gene_id != gene_id_annotation:
        raise HTTPException(400, 'gene_id does not match the locus_tag')

    # The gene should fall within the range (the range might be bigger if
    # bases were requested upstream or downstream).
    if gene_start < start or gene_end > end or gene_strand != strand:
        raise HTTPException(
            400,
            f'wrong coordinates, the gene should fall within the requested coordinates, {start}, {end} on strand: {strand}',
        )

    return gene_id
181
+
182
+
183
async def get_genome_region_from_annotation(
    annotation: dict, padding_left: int = 0, padding_right: int = 0
) -> Dseqrecord:
    """Fetch the sequence of an annotated gene, optionally widened on both sides.

    Args:
        annotation: Annotation record understood by ``get_info_from_annotation``.
        padding_left: Number of positions subtracted from the gene start.
        padding_right: Number of positions added to the gene end.

    Returns:
        The record returned by ``get_genbank_sequence`` for the widened
        coordinates, named after the locus tag and carrying a
        ``GenomeCoordinatesSource`` that describes where it came from.
    """
    info = get_info_from_annotation(annotation)
    gene_start, gene_end, strand, gene_id, sequence_accession, locus_tag, assembly_accession = info

    start = gene_start - padding_left
    end = gene_end + padding_right

    seq = await get_genbank_sequence(sequence_accession, start, end, strand)

    # Reverse-strand regions are written as a complement() location.
    if strand == -1:
        location_str = f'complement({start}..{end})'
    else:
        location_str = f'{start}..{end}'

    source = GenomeCoordinatesSource(
        assembly_accession=assembly_accession,
        repository_id=sequence_accession,
        coordinates=Location.fromstring(location_str),
        locus_tag=locus_tag,
        gene_id=gene_id,
    )
    seq.name = locus_tag
    seq.source = source
    return seq
@@ -3,12 +3,12 @@ from pydna.design import primer_design, assembly_fragments
3
3
  from Bio.SeqFeature import SimpleLocation
4
4
  from pydna.utils import locations_overlap, shift_location, location_boundaries
5
5
  from pydna.amplicon import Amplicon
6
- from .pydantic_models import PrimerModel
7
6
  from Bio.Seq import reverse_complement
8
7
  from Bio.Restriction.Restriction import RestrictionType
9
8
  from Bio.Data.IUPACData import ambiguous_dna_values as _ambiguous_dna_values
10
9
  from typing import Callable
11
10
  from .primer3_functions import primer3_calc_tm, PrimerDesignSettings
11
+ from opencloning_linkml.datamodel import Primer as PrimerModel
12
12
 
13
13
  ambiguous_dna_values = _ambiguous_dna_values.copy()
14
14
  # Remove acgt