opencloning 0.2.8.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,138 @@
# Bug Fixing

## backend_v0_3.py

The PR fixing these bugs is here: https://github.com/manulera/OpenCloning_backend/pull/305

### Bug in assemblies with locations spanning the origin

Before version 0.3, there was a bug affecting assembly fields that include locations spanning the origin. For example, take the following two circular sequences from [this test file](../../../tests/test_files/bug_fixing/digestion_spanning_origin.json):

```
ttcaaaagaa

ttcccccccgaa
```

In both of them, the EcoRI site `GAATTC` is split by the origin. The assembly field in the current format should be:

```json
{
  "assembly": [
    {
      "sequence": 2,
      "left_location": "join(9..10,1..2)",
      "right_location": "join(9..10,1..2)",
      "reverse_complemented": false
    },
    {
      "sequence": 4,
      "left_location": "join(11..12,1..2)",
      "right_location": "join(11..12,1..2)",
      "reverse_complemented": false
    }
  ],
  "restriction_enzymes": [
    "EcoRI"
  ]
}
```
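
Here the length of each `join` location is 4, corresponding to the `AATT` overhang left by EcoRI, which spans the origin. This can be checked with the same location helper used elsewhere in this repository (a minimal sketch, not part of the package):

```python
from opencloning.pydantic_models import SequenceLocationStr

# "join(9..10,1..2)" wraps around the origin of the 10 bp circular sequence
overlap = SequenceLocationStr("join(9..10,1..2)").to_biopython_location()
print(len(overlap))  # 4 -> the AATT sticky end produced by EcoRI
```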

However, the old code was not handling this use case correctly, and produced something like this (`left_location` and `right_location` span the entire sequence rather than the common part):

```json
{
  "assembly": [
    {
      "sequence": 2,
      "left_location": "1..10",
      "right_location": "1..10",
      "reverse_complemented": false
    },
    {
      "sequence": 4,
      "left_location": "1..12",
      "right_location": "1..12",
      "reverse_complemented": false
    }
  ],
  "restriction_enzymes": [
    "EcoRI"
  ]
}
```

These locations were then used in `generate_assemblies`, producing wrong assembly products.

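The `backend_v0_3.py` script below flags affected files by checking that the two locations of each assembly join have the same length; with the buggy output above they do not (a small sketch using the same location helper, not part of the package):

```python
from opencloning.pydantic_models import SequenceLocationStr

left = SequenceLocationStr("1..10").to_biopython_location()
right = SequenceLocationStr("1..12").to_biopython_location()
print(len(left), len(right))  # 10 12 -> mismatched lengths reveal the bug
```
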
### Bug in gateway assemblies (rare, but could happen)

`gateway_overlap` was returning the entire match of a regex like `twtGTACAAAaaa` (for attB1) as the overlap. That could create assemblies in which the overlapping parts have mismatches at the `w` positions (rare, but possible). Now, instead of returning the whole `twtGTACAAAaaa` match as the overlap, it returns only the constant core `GTACAAA`. See, for example, the [test file](../../../tests/test_files/bug_fixing/gateway_13bp_overlap.json).

Wrong (before fix):

```json
{
  "assembly": [
    {
      "sequence": 4,
      "left_location": "2893..2905", # < Length 13 (applies to all locations)
      "right_location": "649..661",
      "reverse_complemented": false
    },
    {
      "sequence": 8,
      "left_location": "10..22",
      "right_location": "3112..3124",
      "reverse_complemented": false
    }
  ],
  "reaction_type": "BP",
}
```

Right (after fix):

```json
{
  "assembly": [
    {
      "sequence": 4,
      "left_location": "2896..2902", # < Length 7 (common part, all locations)
      "right_location": "652..658",
      "reverse_complemented": false
    },
    {
      "sequence": 8,
      "left_location": "13..19",
      "right_location": "3115..3121",
      "reverse_complemented": false
    }
  ],
  "reaction_type": "BP",
}
```
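
To illustrate the change with a simplified attB1-like pattern (illustrative only; the real site definitions live in `opencloning/gateway.py`): the old code reported the whole 13 bp regex match as the overlap, while the fixed code reports only the constant 7 bp core, 3 bp into the match.

```python
import re

# Simplified attB1-like pattern, where w can be A or T (illustrative only)
pattern = re.compile("t[at]tgtacaaaaaa", re.IGNORECASE)
seq = "cctatgtacaaaaaacc"

m = pattern.search(seq)
old_overlap = m.group()                           # 13 bp, includes the ambiguous w position
new_overlap = seq[m.start() + 3:m.start() + 10]   # constant 7 bp core
print(old_overlap.upper(), new_overlap.upper())   # TATGTACAAAAAA GTACAAA
```
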
### Fixing these bugs

If you load a JSON file into the web application, the fix is applied automatically.

If you want to fix several files from the command line, you can use the `backend_v0_3.py` script as shown below.

Before running this script, you need to migrate the data to the latest version of the schema. See the [full documentation](https://github.com/OpenCloning/OpenCloning_LinkML?tab=readme-ov-file#migration-from-previous-versions-of-the-schema), but basically:

```bash
python -m opencloning.migrations.migrate file1.json file2.json ...
```

Then, you can run the script:

```bash
python -m opencloning.bug_fixing.backend_v0_3 file1.json file2.json ...
```

For each file:
* If the file does not need fixing, it will be skipped.
* If the file needs fixing, the script will create a new file with a `_needs_fixing.json` suffix (e.g. `file1_needs_fixing.json`) in the same folder as the original, with the problematic sources replaced by templates.
* You can then load these files into the web application and run the correct steps manually.

Unless you use Gateway cloning a lot, most files should not need fixing.
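
For reference, the same check can also be run programmatically (a minimal sketch of what the script does per file; `my_strategy.json` is a hypothetical file name, and the version bookkeeping done by the script is omitted):

```python
import json

from opencloning.bug_fixing.backend_v0_3 import fix_backend_v0_3

with open("my_strategy.json") as f:  # hypothetical input file
    data = json.load(f)

fixed = fix_backend_v0_3(data)  # returns None when the file does not need fixing
if fixed is not None:
    with open("my_strategy_needs_fixing.json", "w") as f:
        f.write(fixed.model_dump_json(indent=2, exclude_none=True))
```
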
@@ -0,0 +1,117 @@
"""
See info in README.md
"""

from ..pydantic_models import (
    BaseCloningStrategy as CloningStrategy,
    AssemblySource,
    TextFileSequence,
    PrimerModel,
    SequenceLocationStr,
)
from .._version import __version__
import json
import os
from packaging import version
import copy


def fix_backend_v0_3(input_data: dict) -> CloningStrategy | None:

    data = copy.deepcopy(input_data)
    # Make sure that it is a valid CloningStrategy
    cs = CloningStrategy.model_validate(data)

    # First fix gateway assemblies
    problematic_source_ids = set()

    for source in data['sources']:
        if source['type'] == 'GatewaySource':
            # Take the first assembly fragment and check that its location has length 7 (the constant core)
            assembly = source['assembly']
            if len(assembly):
                feat2check = (
                    assembly[0]['left_location']
                    if assembly[0]['left_location'] is not None
                    else assembly[0]['right_location']
                )
                if len(SequenceLocationStr(feat2check).to_biopython_location()) != 7:
                    problematic_source_ids.add(source['id'])

        elif 'assembly' in source:
            assembly_source = AssemblySource(
                id=source['id'],
                input=source['input'],
                output=source['output'],
                circular=source['circular'],
                assembly=source['assembly'],
            )
            input_seqs = [
                TextFileSequence.model_validate(s) for s in data['sequences'] if s['id'] in assembly_source.input
            ]
            # Sort input_seqs as in input
            input_seqs.sort(key=lambda x: assembly_source.input.index(x.id))
            if source['type'] == 'PCRSource':
                # For PCRs, the first and third assembly fragments are the primers
                primer_ids = [assembly_source.assembly[0].sequence, assembly_source.assembly[2].sequence]
                primers = [PrimerModel.model_validate(p) for p in data['primers'] if p['id'] in primer_ids]
                input_seqs = [primers[0], input_seqs[0], primers[1]]

            # In a correct assembly, the two locations of each join have the same length
            assembly_plan = assembly_source.get_assembly_plan(input_seqs)
            for join in assembly_plan:
                if len(join[2]) != len(join[3]):
                    problematic_source_ids.add(source['id'])
                    break

    if len(problematic_source_ids) == 0:
        return None

    # Replace problematic sources and their output sequences by templates
    problematic_source_ids.update(sum([cs.all_children_source_ids(s) for s in problematic_source_ids], []))
    for source_id in problematic_source_ids:
        source = next(s for s in data['sources'] if s['id'] == source_id)
        output_seq = next(s for s in data['sequences'] if s['id'] == source['output'])
        remove_keys = ['assembly', 'circular']
        source_keep = {key: value for key, value in source.items() if key not in remove_keys}
        source.clear()
        source.update(source_keep)

        seq_keep = {'id': output_seq['id'], 'type': 'TemplateSequence'}
        output_seq.clear()
        output_seq.update(seq_keep)

    return CloningStrategy.model_validate(data)


def main(file_path: str):
    file_dir = os.path.dirname(file_path)
    file_base = os.path.splitext(os.path.basename(file_path))[0]
    new_file_path = os.path.join(file_dir, f'{file_base}_needs_fixing.json')

    with open(file_path, 'r') as f:
        data = json.load(f)

    if 'backend_version' not in data or data['backend_version'] is None:

        # Fix the data
        cs = fix_backend_v0_3(data)

        if cs is not None:
            cs.backend_version = __version__ if version.parse(__version__) > version.parse('0.3') else '0.3'
            with open(new_file_path, 'w') as f:
                f.write(cs.model_dump_json(indent=2, exclude_none=True))


if __name__ == '__main__':
    import sys

    if len(sys.argv) == 1:
        print('Usage: python -m opencloning.bug_fixing.backend_v0_3 <file1> <file2> ...')
        sys.exit(1)

    file_paths = sys.argv[1:]

    for file_path in file_paths:
        if file_path.endswith('_needs_fixing.json'):
            print(f'Skipping {file_path}')
            continue
        main(file_path)
opencloning/cre_lox.py CHANGED
@@ -3,6 +3,8 @@ from pydna.dseqrecord import Dseqrecord
  from Bio.Data.IUPACData import ambiguous_dna_values
  from Bio.Seq import reverse_complement
  from .dna_utils import compute_regex_site, dseqrecord_finditer
+ from Bio.SeqFeature import Location, SimpleLocation, SeqFeature
+ from pydna.utils import shift_location

  # We create a dictionary to map ambiguous bases to their consensus base
  # For example, ambigous_base_dict['ACGT'] -> 'N'
@@ -56,3 +58,59 @@ def cre_loxP_overlap(x: Dseqrecord, y: Dseqrecord, _l: None = None) -> list[tupl
          if item not in unique_out:
              unique_out.append(item)
      return unique_out
+
+
+ loxP_dict = {
+     'loxP': 'ATAACTTCGTATANNNTANNNTATACGAAGTTAT',
+     'lox66': 'ATAACTTCGTATANNNTANNNTATACGAACGGTA',
+     'lox71': 'TACCGTTCGTATANNNTANNNTATACGAAGTTAT',
+     'loxP_mutant': 'TACCGTTCGTATANNNTANNNTATACGAACGGTA',
+ }
+
+
+ def get_regex_dict(original_dict: dict[str, str]) -> dict[str, str]:
+     """Get the regex dictionary for the original dictionary."""
+     out = dict()
+     for site in original_dict:
+         consensus_seq = original_dict[site]
+         is_palindromic = consensus_seq == reverse_complement(consensus_seq)
+         out[site] = {
+             'forward_regex': compute_regex_site(original_dict[site]),
+             'reverse_regex': None if is_palindromic else compute_regex_site(reverse_complement(original_dict[site])),
+         }
+     return out
+
+
+ def find_loxP_sites(seq: Dseqrecord) -> dict[str, list[Location]]:
+     """Find all loxP sites in a sequence and return a dictionary with the name and positions of the sites."""
+
+     out = dict()
+     regex_dict = get_regex_dict(loxP_dict)
+     for site in loxP_dict:
+
+         for pattern in ['forward_regex', 'reverse_regex']:
+             # Palindromic sequences have no reverse complement
+             if regex_dict[site][pattern] is None:
+                 continue
+             matches = list(dseqrecord_finditer(regex_dict[site][pattern], seq))
+             for match in matches:
+                 if site not in out:
+                     out[site] = []
+                 strand = 1 if pattern == 'forward_regex' else -1
+                 loc = SimpleLocation(match.start(), match.end(), strand)
+                 loc = shift_location(loc, 0, len(seq))
+                 out[site].append(loc)
+     return out
+
+
+ def annotate_loxP_sites(seq: Dseqrecord) -> Dseqrecord:
+     sites = find_loxP_sites(seq)
+     for site in sites:
+         for loc in sites[site]:
+             # Don't add the same feature twice
+             if not any(
+                 f.location == loc and f.type == 'protein_bind' and f.qualifiers.get('label', []) == [site]
+                 for f in seq.features
+             ):
+                 seq.features.append(SeqFeature(loc, type='protein_bind', qualifiers={'label': [site]}))
+     return seq
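
For context, the new `annotate_loxP_sites` helper can be used on its own (a minimal sketch, not taken from the package's tests; the example sequence is a canonical loxP site and the expected output is an assumption based on the code above):

```python
from pydna.dseqrecord import Dseqrecord
from opencloning.cre_lox import annotate_loxP_sites

# A canonical loxP site (13 bp arm + ATGTATGC spacer + 13 bp arm)
seq = Dseqrecord("ATAACTTCGTATAATGTATGCTATACGAAGTTAT")
annotated = annotate_loxP_sites(seq)
print([f.qualifiers['label'] for f in annotated.features])  # expected: [['loxP']]
```
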
@@ -5,7 +5,7 @@ from pydna.primer import Primer as PydnaPrimer
  from pydna.crispr import cas9
  from pydantic import conlist, create_model
  from Bio.Restriction.Restriction import RestrictionBatch
- from opencloning.cre_lox import cre_loxP_overlap
+ from opencloning.cre_lox import cre_loxP_overlap, annotate_loxP_sites
  from ..dna_functions import (
      get_invalid_enzyme_names,
      format_sequence_genbank,
@@ -57,7 +57,9 @@ def format_known_assembly_response(
      # If a specific assembly is requested
      assembly_plan = source.get_assembly_plan(fragments)
      for s in out_sources:
-         if s == source:
+         # TODO: it seems that assemble() is not getting is_insertion ever
+         other_assembly_plan = s.get_assembly_plan(fragments)
+         if assembly_plan == other_assembly_plan:
              return {
                  'sequences': [
                      format_sequence_genbank(product_callback(assemble(fragments, assembly_plan)), s.output_name)
@@ -553,7 +555,14 @@ async def cre_lox_recombination(source: CreLoxRecombinationSource, sequences: co
      )

      resp = generate_assemblies(
-         source, create_source, fragments, False, cre_loxP_overlap, True, recombination_mode=True
+         source,
+         create_source,
+         fragments,
+         False,
+         cre_loxP_overlap,
+         True,
+         recombination_mode=True,
+         product_callback=annotate_loxP_sites,
      )

      if len(resp['sources']) == 0:
@@ -24,7 +24,7 @@ from ..pydantic_models import (
      GenomeCoordinatesSource,
      SequenceFileFormat,
      SEVASource,
-     SimpleSequenceLocation,
+     SequenceLocationStr,
  )
  from ..dna_functions import (
      format_sequence_genbank,
@@ -150,12 +150,12 @@ async def read_from_file(

      seq_feature = None
      if start is not None and end is not None:
-         seq_feature = SimpleSequenceLocation(start=start, end=end)
      extracted_sequences = list()
      for dseq in dseqs:
          try:
+             seq_feature = SequenceLocationStr.from_start_and_end(start=start, end=end, seq_len=len(dseq))
              # TODO: We could use extract when this is addressed: https://github.com/biopython/biopython/issues/4989
-             location = seq_feature.to_biopython_location(circular=dseq.circular, seq_len=len(dseq))
+             location = seq_feature.to_biopython_location()
              i, j = location_boundaries(location)
              extracted_sequence = dseq[i:j]
              # Only add the sequence if the interval is not out of bounds
@@ -1,6 +1,10 @@
- from fastapi import Query, HTTPException
+ from fastapi import Query, HTTPException, Response
  from Bio.Restriction.Restriction_Dictionary import rest_dict
+ from pydantic import ValidationError
+ from opencloning_linkml.migrations import migrate
+ from opencloning_linkml._version import __version__ as schema_version

+ from ..bug_fixing.backend_v0_3 import fix_backend_v0_3

  from ..dna_functions import (
      format_sequence_genbank,
@@ -12,7 +16,7 @@ from ..pydantic_models import (
      BaseCloningStrategy,
  )
  from ..get_router import get_router
- from ..utils import api_version
+ from .._version import __version__ as backend_version


  router = get_router()
@@ -20,7 +24,7 @@ router = get_router()

  @router.get('/version')
  async def get_version():
-     return api_version()
+     return {'backend_version': backend_version, 'schema_version': schema_version}


  @router.get('/restriction_enzyme_list', response_model=dict[str, list[str]])
@@ -32,12 +36,50 @@ async def get_restriction_enzyme_list():
  @router.post(
      '/validate',
      summary='Validate a cloning strategy',
+     responses={
+         200: {
+             'description': 'The cloning strategy is valid',
+             'headers': {
+                 'x-warning': {
+                     'description': 'A warning returned if the file either contains errors or is in a previous version of the model',
+                     'schema': {'type': 'string'},
+                 },
+             },
+         },
+         422: {
+             'description': 'The cloning strategy is invalid',
+         },
+     },
  )
- async def cloning_strategy_is_valid(
-     cloning_strategy: BaseCloningStrategy,
- ) -> bool:
+ async def cloning_strategy_is_valid(data: dict, response: Response):
      """Validate a cloning strategy"""
-     return True
+     warnings = []
+     if any(key not in data for key in ['primers', 'sources', 'sequences']):
+         raise HTTPException(status_code=422, detail='The cloning strategy is invalid')
+
+     try:
+         migrated_data = migrate(data)
+         if migrated_data is None:
+             BaseCloningStrategy.model_validate(data)
+             return None
+
+         data = migrated_data
+         warnings.append(
+             'The cloning strategy is in a previous version of the model and has been migrated to the latest version.'
+         )
+
+         fixed_data = fix_backend_v0_3(data)
+         if fixed_data is not None:
+             data = fixed_data
+             warnings.append('The cloning strategy contained an error and has been turned into a template.')
+         cs = BaseCloningStrategy.model_validate(data)
+         if len(warnings) > 0:
+             response.headers['x-warning'] = ';'.join(warnings)
+             return cs
+         return None
+
+     except ValidationError:
+         raise HTTPException(status_code=422, detail='The cloning strategy is invalid')


  @router.post('/rename_sequence', response_model=TextFileSequence)
@@ -62,10 +62,10 @@ async def primer_design_homologous_recombination(
      validate_spacers(spacers, 1, False)

      pcr_seq = read_dsrecord_from_json(pcr_template.sequence)
-     pcr_loc = pcr_template.location.to_biopython_location(pcr_seq.circular, len(pcr_seq))
+     pcr_loc = pcr_template.location.to_biopython_location()

      hr_seq = read_dsrecord_from_json(homologous_recombination_target.sequence)
-     hr_loc = homologous_recombination_target.location.to_biopython_location(hr_seq.circular, len(hr_seq))
+     hr_loc = homologous_recombination_target.location.to_biopython_location()

      insert_forward = pcr_template.forward_orientation

@@ -112,7 +112,7 @@ async def primer_design_gibson_assembly(
      templates = list()
      for query in pcr_templates:
          dseqr = read_dsrecord_from_json(query.sequence)
-         location = query.location.to_biopython_location(dseqr.circular, len(dseqr))
+         location = query.location.to_biopython_location()
          template = location.extract(dseqr)
          if not query.forward_orientation:
              template = template.reverse_complement()
@@ -167,7 +167,7 @@ async def primer_design_simple_pair(
      validate_spacers(spacers, 1, False)

      dseqr = read_dsrecord_from_json(pcr_template.sequence)
-     location = pcr_template.location.to_biopython_location(dseqr.circular, len(dseqr))
+     location = pcr_template.location.to_biopython_location()
      template = location.extract(dseqr)
      if not pcr_template.forward_orientation:
          template = template.reverse_complement()
@@ -201,8 +201,7 @@ async def primer_design_ebic(
  ):
      """Design primers for EBIC"""
      dseqr = read_dsrecord_from_json(template.sequence)
-     location = template.location.to_biopython_location(dseqr.circular, len(dseqr))
-
+     location = template.location.to_biopython_location()
      return {'primers': ebic_primers(dseqr, location, max_inside, max_outside)}

opencloning/gateway.py CHANGED
@@ -105,7 +105,8 @@ def gateway_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, reaction: str, greedy:
              continue

          for match_x, match_y in _itertools.product(matches_x, matches_y):
-             # Find the overlap sequence within each match
+             # Find the overlap sequence within each match, and use the
+             # core 7 bp that are constant
              overlap_x = re.search(overlap_regex, match_x.group())
              overlap_y = re.search(overlap_regex, match_y.group())

@@ -116,9 +117,9 @@ def gateway_overlap(seqx: _Dseqrecord, seqy: _Dseqrecord, reaction: str, greedy:

          out.append(
              (
-                 match_x.start() + overlap_x.start(),
-                 match_y.start() + overlap_y.start(),
-                 len(overlap_x.group()),
+                 match_x.start() + overlap_x.start() + 3,
+                 match_y.start() + overlap_y.start() + 3,
+                 7,
              )
          )