pydna 5.5.2__py3-none-any.whl → 5.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydna/cre_lox.py ADDED
@@ -0,0 +1,130 @@
1
+ # -*- coding: utf-8 -*-
2
+ from itertools import product
3
+ from pydna.dseqrecord import Dseqrecord
4
+ from Bio.Data.IUPACData import ambiguous_dna_values
5
+ from Bio.Seq import reverse_complement
6
+ from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer
7
+ from Bio.SeqFeature import Location, SimpleLocation, SeqFeature
8
+ from pydna.utils import shift_location
9
+
10
+ # We create a dictionary to map ambiguous bases to their consensus base
11
+ # For example, ambiguous_base_dict['ACGT'] -> 'N'
12
+ ambiguous_base_dict = {}
13
+ for ambiguous, bases in ambiguous_dna_values.items():
14
+ ambiguous_base_dict["".join(sorted(bases))] = ambiguous
15
+
16
+ # To handle N values
17
+ ambiguous_base_dict["N"] = "N"
18
+
19
+ # This is the original loxP sequence, here for reference
20
+ LOXP_SEQUENCE = "ATAACTTCGTATAGCATACATTATACGAAGTTAT"
21
+
22
+ loxP_sequences = [
23
+ # https://blog.addgene.org/plasmids-101-cre-lox
24
+ # loxP
25
+ "ATAACTTCGTATANNNTANNNTATACGAAGTTAT",
26
+ # PMID:12202778
27
+ # lox66
28
+ "ATAACTTCGTATANNNTANNNTATACGAACGGTA",
29
+ # lox71
30
+ "TACCGTTCGTATANNNTANNNTATACGAAGTTAT",
31
+ ]
32
+
33
# Build a single IUPAC consensus that covers every loxP variant listed in
# loxP_sequences: at each position, gather the letters used by the variants
# and look up the ambiguity code registered for that (sorted) set of letters.
consensus_letters = []
for pos in range(len(LOXP_SEQUENCE)):
    all_letters = {variant[pos] for variant in loxP_sequences}
    key = "".join(sorted(all_letters))
    consensus_letters.append(ambiguous_base_dict[key])
loxP_consensus = "".join(consensus_letters)

# Pair of regexes: one built from the consensus as written, one built from
# its reverse complement.
loxP_regex = (
    compute_regex_site(loxP_consensus),
    compute_regex_site(reverse_complement(loxP_consensus)),
)
45
+
46
+
47
def cre_loxP_overlap(
    x: Dseqrecord, y: Dseqrecord, _l: None = None
) -> list[tuple[int, int, int]]:
    """Find matching loxP sites between two sequences.

    Both sequences are searched with each regex in ``loxP_regex``. Every
    pair of matches (one in ``x``, one in ``y``, both found with the same
    regex) whose characters 13 to 20 are identical yields one overlap.
    In the site patterns used to build the consensus, those 8 characters
    are the ``NNNTANNN`` stretch between the two 13-character arms.

    Parameters
    ----------
    x, y :
        The two sequences to search.
    _l :
        Accepted but never read by this function.
        NOTE(review): presumably kept so the signature matches what callers
        expect from overlap functions - confirm.

    Returns
    -------
    list of (int, int, int)
        ``(start_in_x, start_in_y, 8)`` for each matching pair, where each
        start is the match start plus 13. Duplicates are removed and the
        order of first appearance is kept.
    """
    overlaps = []
    for pattern in loxP_regex:
        matches_x = dseqrecord_finditer(pattern, x)
        matches_y = dseqrecord_finditer(pattern, y)

        for match_x, match_y in product(matches_x, matches_y):
            # Only sites sharing the same 8-character middle stretch pair up
            if match_x.group()[13:21] == match_y.group()[13:21]:
                overlaps.append((match_x.start() + 13, match_y.start() + 13, 8))

    # dict keys keep insertion order, so this drops duplicates while
    # preserving the order of first appearance, without a quadratic
    # "item not in list" scan.
    return list(dict.fromkeys(overlaps))
67
+
68
+
69
+ loxP_dict = {
70
+ "loxP": "ATAACTTCGTATANNNTANNNTATACGAAGTTAT",
71
+ "lox66": "ATAACTTCGTATANNNTANNNTATACGAACGGTA",
72
+ "lox71": "TACCGTTCGTATANNNTANNNTATACGAAGTTAT",
73
+ "loxP_mutant": "TACCGTTCGTATANNNTANNNTATACGAACGGTA",
74
+ }
75
+
76
+
77
def get_regex_dict(original_dict: dict[str, str]) -> dict[str, dict]:
    """Build the forward and reverse regexes for every site in a dictionary.

    Parameters
    ----------
    original_dict :
        Maps a site name to its consensus sequence.

    Returns
    -------
    dict
        Maps each site name to a dictionary with two keys:
        ``"forward_regex"``, the result of ``compute_regex_site`` on the
        consensus sequence, and ``"reverse_regex"``, the result of
        ``compute_regex_site`` on its reverse complement. ``"reverse_regex"``
        is ``None`` when the consensus sequence equals its own reverse
        complement, since the forward regex already covers that case.
    """
    out = {}
    for site, consensus_seq in original_dict.items():
        # Computed once and reused for both the palindrome check and the regex
        rc_seq = reverse_complement(consensus_seq)
        is_palindromic = consensus_seq == rc_seq
        out[site] = {
            "forward_regex": compute_regex_site(consensus_seq),
            "reverse_regex": None if is_palindromic else compute_regex_site(rc_seq),
        }
    return out
92
+
93
+
94
def find_loxP_sites(seq: Dseqrecord) -> dict[str, list[Location]]:
    """Locate every loxP site of ``loxP_dict`` in a sequence.

    Returns a dictionary mapping the site name to the list of locations
    where it was found. Matches of the forward regex get strand ``1`` and
    matches of the reverse regex get strand ``-1``. Site names without any
    match are absent from the result.
    """
    strand_by_regex = (("forward_regex", 1), ("reverse_regex", -1))
    site_regexes = get_regex_dict(loxP_dict)
    found = {}
    for name in loxP_dict:
        for regex_key, strand in strand_by_regex:
            regex = site_regexes[name][regex_key]
            # Palindromic sites have no separate reverse regex
            if regex is None:
                continue
            for hit in dseqrecord_finditer(regex, seq):
                span = SimpleLocation(hit.start(), hit.end(), strand)
                # NOTE(review): presumably normalises matches that run past
                # the end of a circular sequence - confirm in shift_location
                span = shift_location(span, 0, len(seq))
                found.setdefault(name, []).append(span)
    return found
114
+
115
+
116
def annotate_loxP_sites(seq: Dseqrecord) -> Dseqrecord:
    """Add a ``protein_bind`` feature for every loxP site found in ``seq``.

    The features are appended to ``seq.features`` in place, labelled with
    the site name, and the same ``seq`` object is returned. A feature is
    not added if ``seq`` already has a ``protein_bind`` feature with the
    same location and label.
    """
    for label, locations in find_loxP_sites(seq).items():
        for location in locations:
            already_annotated = False
            for feature in seq.features:
                if (
                    feature.location == location
                    and feature.type == "protein_bind"
                    and feature.qualifiers.get("label", []) == [label]
                ):
                    already_annotated = True
                    break
            # Don't add the same feature twice
            if already_annotated:
                continue
            new_feature = SeqFeature(
                location, type="protein_bind", qualifiers={"label": [label]}
            )
            seq.features.append(new_feature)
    return seq
pydna/dseqrecord.py CHANGED
@@ -35,6 +35,11 @@ import os as _os
35
35
  import re as _re
36
36
  import time as _time
37
37
  import datetime as _datetime
38
+ from typing import Union, TYPE_CHECKING
39
+ from pydna.opencloning_models import SequenceCutSource
40
+
41
+ if TYPE_CHECKING: # pragma: no cover
42
+ from pydna.opencloning_models import Source
38
43
 
39
44
 
40
45
  # import logging as _logging
@@ -128,6 +133,7 @@ class Dseqrecord(_SeqRecord):
128
133
  """
129
134
 
130
135
  seq: _Dseq
136
+ source: Union["Source", None] = None
131
137
 
132
138
  def __init__(
133
139
  self,
@@ -135,6 +141,7 @@ class Dseqrecord(_SeqRecord):
135
141
  *args,
136
142
  circular=None,
137
143
  n=5e-14, # mol ( = 0.05 pmol)
144
+ source=None,
138
145
  **kwargs,
139
146
  ):
140
147
  # _module_logger.info("### Dseqrecord initialized ###")
@@ -202,6 +209,7 @@ class Dseqrecord(_SeqRecord):
202
209
  self.map_target = None
203
210
  self.n = n # amount, set to 5E-14 which is 5 pmols
204
211
  self.annotations.update({"molecule_type": "DNA"})
212
+ self.source = source
205
213
 
206
214
  @classmethod
207
215
  def from_string(
@@ -256,6 +264,7 @@ class Dseqrecord(_SeqRecord):
256
264
  obj.features = record.features
257
265
  obj.map_target = None
258
266
  obj.n = n
267
+ obj.source = None
259
268
  if circular is None:
260
269
  circular = record.annotations.get("topology") == "circular"
261
270
  obj.seq = _Dseq.quick(
@@ -875,7 +884,11 @@ class Dseqrecord(_SeqRecord):
875
884
  def __eq__(self, other):
876
885
  """docstring."""
877
886
  try:
878
- if self.seq == other.seq and str(self.__dict__) == str(other.__dict__):
887
+ this_dict = self.__dict__.copy()
888
+ other_dict = other.__dict__.copy()
889
+ del this_dict["source"]
890
+ del other_dict["source"]
891
+ if self.seq == other.seq and str(this_dict) == str(other_dict):
879
892
  return True
880
893
  except AttributeError:
881
894
  pass
@@ -1419,4 +1432,39 @@ class Dseqrecord(_SeqRecord):
1419
1432
  right_edge = right_watson if right_ovhg > 0 else right_crick
1420
1433
  features = self[left_edge:right_edge].features
1421
1434
 
1422
- return Dseqrecord(dseq, features=features)
1435
+ # This will need to be generalised to all types of cuts
1436
+ source = SequenceCutSource.from_parent(self, left_cut, right_cut)
1437
+ return Dseqrecord(dseq, features=features, source=source)
1438
+
1439
+ def history(self):
1440
+ """
1441
+ Returns a string representation of the cloning history of the sequence.
1442
+ Returns an empty string if the sequence has no source.
1443
+
1444
+ Check the documentation notebooks for extensive examples.
1445
+
1446
+ Returns
1447
+ -------
1448
+ str: A string representation of the cloning history of the sequence.
1449
+
1450
+ Examples
1451
+ --------
1452
+ >>> from pydna.dseqrecord import Dseqrecord
1453
+ >>> from pydna.assembly2 import gibson_assembly
1454
+ >>> fragments = [
1455
+ ... Dseqrecord("TTTTacgatAAtgctccCCCC", circular=False, name="fragment1"),
1456
+ ... Dseqrecord("CCCCtcatGGGG", circular=False, name="fragment2"),
1457
+ ... Dseqrecord("GGGGatataTTTT", circular=False, name="fragment3"),
1458
+ ... ]
1459
+ >>> product, *_ = gibson_assembly(fragments, limit=4)
1460
+ >>> product.name = "product_name"
1461
+ >>> print(product.history())
1462
+ ╙── product_name (Dseqrecord(o34))
1463
+ └─╼ GibsonAssemblySource
1464
+ ├─╼ fragment1 (Dseqrecord(-21))
1465
+ ├─╼ fragment2 (Dseqrecord(-12))
1466
+ └─╼ fragment3 (Dseqrecord(-13))
1467
+ """
1468
+ if self.source is None:
1469
+ return ""
1470
+ return self.source.history_string(self)
pydna/gateway.py CHANGED
@@ -1,162 +1,164 @@
1
- #!/usr/bin/env python3
2
1
  # -*- coding: utf-8 -*-
3
- # Copyright 2013-2023 by Björn Johansson. All rights reserved.
4
- # This code is part of the Python-dna distribution and governed by its
5
- # license. Please see the LICENSE.txt file that should have been included
6
- # as part of this package.
7
-
8
- """Assembly of sequences by Gateway recombination.
9
-
10
- Given a list of sequences (Dseqrecords), all sequences are analyzed for
11
- presence of att(P|B|L|R)N where N is 1,2,3 or 4.
12
-
13
- A graph is constructed where the att sites form a nodes and
14
- sequences separating att sites form edges.
15
-
16
- The NetworkX package is used to trace linear and circular paths through the
17
- graph.
18
- """
19
- # from Bio.SeqFeature import ExactPosition as _ExactPosition
20
- # from Bio.SeqFeature import SimpleLocation as _SimpleLocation
21
- # from Bio.SeqFeature import CompoundLocation as _CompoundLocation
22
- # from pydna.utils import rc as _rc
23
-
24
- # from pydna._pretty import pretty_str as _pretty_str
25
- # from pydna.contig import Contig as _Contig
26
- # from pydna.common_sub_strings import common_sub_strings
27
- # from pydna.dseqrecord import Dseqrecord as _Dseqrecord
28
- # import networkx as _nx
29
- # from copy import deepcopy as _deepcopy
30
- # import itertools as _itertools
31
- # import logging as _logging
32
-
33
- # _module_logger = _logging.getLogger("pydna." + __name__)
34
-
35
- ambiguous_dna_regex = {
36
- "A": "T",
37
- "C": "G",
38
- "G": "C",
39
- "T": "A",
40
- "M": "[ACM]",
41
- "R": "[AGR]",
42
- "W": "[ATW]",
43
- "S": "[CGS]",
44
- "Y": "[CTY]",
45
- "K": "[GTK]",
46
- "V": "[ACGVMSR]",
47
- "H": "[ACTHMYW]",
48
- "D": "[AGTDRWK]",
49
- "B": "[CGTBSKY]",
50
- "X": "X",
51
- "N": "[ACGTBDHKMNRSVWY]",
2
+ from Bio.Seq import reverse_complement
3
+ from pydna.dseqrecord import Dseqrecord as _Dseqrecord
4
+ import re
5
+ import itertools as _itertools
6
+ from Bio.SeqFeature import SimpleLocation, SeqFeature
7
+ from pydna.utils import shift_location
8
+ from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer
9
+
10
+
11
+ raw_gateway_common = {
12
+ "attB1": "CHWVTWTGTACAAAAAANNNG",
13
+ "attB2": "CHWVTWTGTACAAGAAANNNG",
14
+ "attB3": "CHWVTWTGTATAATAAANNNG",
15
+ "attB4": "CHWVTWTGTATAGAAAANNNG",
16
+ "attB5": "CHWVTWTGTATACAAAANNNG",
17
+ "attL1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAAAAANNNG",
18
+ "attL2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAGAAANNNG",
19
+ "attL3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAATAAANNNG",
20
+ "attL4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAGAAAANNNG",
21
+ "attL5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATACAAAANNNG",
22
+ "attR1": "CHWVTWTGTACAAAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
23
+ "attR2": "CHWVTWTGTACAAGAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
24
+ "attR3": "CHWVTWTGTATAATAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
25
+ "attR4": "CHWVTWTGTATAGAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
26
+ "attR5": "CHWVTWTGTATACAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
27
+ "overlap_1": "twtGTACAAAaaa",
28
+ "overlap_2": "twtGTACAAGaaa",
29
+ "overlap_3": "twtGTATAATaaa",
30
+ "overlap_4": "twtGTATAGAaaa",
31
+ "overlap_5": "twtGTATACAaaa",
52
32
  }
53
33
 
54
- atts = """
55
- attP1 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
56
- attP2 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAG AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
57
- attP3 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAAT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
58
- attP4 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAGA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
59
- attP5 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATACA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
60
-
61
- attB1 CMASTWT GTACAAA AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
62
- attB2 CMASTWT GTACAAG AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
63
- attB3 CMASTWT GTATAAT AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
64
- attB4 CMASTWT GTATAGA AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
65
- attB5 CMASTWT GTATACA AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
66
-
67
- attR1 CMASTWT GTACAAA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
68
- attR2 CMASTWT GTACAAG AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
69
- attR3 CMASTWT GTATAAT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
70
- attR4 CMASTWT GTATAGA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
71
- attR5 CMASTWT GTATACA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
72
-
73
- attL1 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAA AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
74
- attL2 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAG AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
75
- attL3 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAAT AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
76
- attL4 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAGA AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
77
- attL5 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATACA AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
78
- """
79
-
80
-
81
- retable = str.maketrans(ambiguous_dna_regex)
82
-
83
- for line in (line for line in atts.splitlines() if line.strip()):
84
- name, *parts = line.split()
85
- for part in parts:
86
- part.translate(retable)
87
-
88
-
89
- class Gateway(object):
90
- """Assembly of linear DNA fragments into linear or circular constructs.
91
-
92
- The Assembly is meant to replace the Assembly method as it
93
- is easier to use. Accepts a list of Dseqrecords (source fragments) to
94
- initiate an Assembly object. Several methods are available for analysis
95
- of overlapping sequences, graph construction and assembly.
96
-
97
- Parameters
98
- ----------
99
- fragments : list
100
- a list of Dseqrecord objects.
101
- """
102
-
103
- def __init__(self, molecules=None):
104
- self.molecules = molecules
105
-
106
-
107
- """
108
- Created on Sat Aug 21 15:41:42 2021
109
-
110
- @author: bjorn
111
-
112
-
113
- https://en.wikipedia.org/wiki/Cre-Lox_recombination
114
-
115
- 13bp 8bp 13bp
116
- ATAACTTCGTATA-NNNTANNN-TATACGAAGTTAT
117
-
118
-
119
- Name 13 bp 8 bp 13 bp
120
- Recognition Spacer Recognition
121
- Region Region Region
122
-
123
- Wild-Type ATAACTTCGTATA ATGTATGC TATACGAAGTTAT
124
- lox 511 ATAACTTCGTATA ATGTATaC TATACGAAGTTAT
125
- lox 5171 ATAACTTCGTATA ATGTgTaC TATACGAAGTTAT
126
- lox 2272 ATAACTTCGTATA AaGTATcC TATACGAAGTTAT
127
- M2 ATAACTTCGTATA AgaaAcca TATACGAAGTTAT
128
- M3 ATAACTTCGTATA taaTACCA TATACGAAGTTAT
129
- M7 ATAACTTCGTATA AgaTAGAA TATACGAAGTTAT
130
- M11 ATAACTTCGTATA cgaTAcca TATACGAAGTTAT
131
- lox 71 TACCGTTCGTATA NNNTANNN TATACGAAGTTAT
132
- lox 66 ATAACTTCGTATA NNNTANNN TATACGAACGGTA
133
34
 
134
- """
135
-
136
-
137
- """
138
-
139
- https://blog.addgene.org/plasmids-101-cre-lox
35
+ raw_gateway_sites_greedy = {
36
+ **raw_gateway_common,
37
+ "attP1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
38
+ "attP2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAGAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
39
+ "attP3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAATAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
40
+ "attP4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAGAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
41
+ "attP5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATACAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
42
+ }
140
43
 
141
- https://en.wikipedia.org/wiki/Cre-Lox_recombination
44
+ raw_gateway_sites_conservative = {
45
+ **raw_gateway_common,
46
+ "attP1": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTACAAAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
47
+ "attP2": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTACAAGAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
48
+ "attP3": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATAATAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
49
+ "attP4": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATAGAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
50
+ "attP5": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATACAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
51
+ }
142
52
 
143
- 13bp 8bp 13bp
144
- ATAACTTCGTATA-NNNTANNN-TATACGAAGTTAT
53
+ gateway_sites_greedy = {
54
+ k: {
55
+ "forward_regex": compute_regex_site(v),
56
+ "reverse_regex": compute_regex_site(reverse_complement(v)),
57
+ "consensus_sequence": v,
58
+ }
59
+ for k, v in raw_gateway_sites_greedy.items()
60
+ }
145
61
 
62
+ gateway_sites_conservative = {
63
+ k: {
64
+ "forward_regex": compute_regex_site(v),
65
+ "reverse_regex": compute_regex_site(reverse_complement(v)),
66
+ "consensus_sequence": v,
67
+ }
68
+ for k, v in raw_gateway_sites_conservative.items()
69
+ }
146
70
 
147
- Name 13 bp 8 bp 13 bp
148
- Recognition Spacer Recognition
149
- Region Region Region
71
+ # From snapgene - ask Valerie
72
+ primer_design_attB = {
73
+ "attB1": "ACAAGTTTGTACAAAAAAGCAGGCT",
74
+ "attB2": "ACCACTTTGTACAAGAAAGCTGGGT",
75
+ "attB3": "ACAACTTTGTATAATAAAGTTGTA",
76
+ "attB4": "ACAACTTTGTATAGAAAAGTTGTA",
77
+ "attB5": "ACAACTTTGTATACAAAAGTTGTA",
78
+ }
150
79
 
151
- Wild-Type ATAACTTCGTATA ATGTATGC TATACGAAGTTAT
152
- lox511 ATAACTTCGTATA ATGTATaC TATACGAAGTTAT
153
- lox5171 ATAACTTCGTATA ATGTgTaC TATACGAAGTTAT
154
- lox2272 ATAACTTCGTATA AaGTATcC TATACGAAGTTAT
155
- M2 ATAACTTCGTATA AgaaAcca TATACGAAGTTAT
156
- M3 ATAACTTCGTATA taaTACCA TATACGAAGTTAT
157
- M7 ATAACTTCGTATA AgaTAGAA TATACGAAGTTAT
158
- M11 ATAACTTCGTATA cgaTAcca TATACGAAGTTAT
159
- lox71 TACCGTTCGTATA NNNTANNN TATACGAAGTTAT
160
- lox66 ATAACTTCGTATA NNNTANNN TATACGAACGGTA
161
80
 
162
- """
81
def gateway_overlap(
    seqx: _Dseqrecord, seqy: _Dseqrecord, reaction: str, greedy: bool
) -> list[tuple[int, int, int]]:
    """
    Find gateway overlaps between two sequences.

    ``reaction`` must be ``"BP"`` or ``"LR"``; anything else raises
    ``ValueError``. If ``greedy`` is True, the greedy site table is used,
    whose attP consensus sequences are more permissive and might give false
    positives.

    Each returned tuple is ``(start_in_seqx, start_in_seqy, 7)``: the start
    is the site match start, plus the position of the overlap regex within
    the matched text, plus 3. The overlap consensus sequences are written as
    3 lower-case + 7 upper-case + 3 lower-case letters, so the offset of 3
    and the length of 7 select the upper-case core.
    """
    if reaction not in ["BP", "LR"]:
        raise ValueError(f"Invalid overlap type: {reaction}")

    site_table = gateway_sites_greedy if greedy else gateway_sites_conservative
    # (B, P) and (P, B) for BP; (L, R) and (R, L) for LR
    site_pairs = list(zip(reaction, reaction[::-1]))
    overlaps = []

    # NOTE(review): only site numbers 1-4 are iterated, although the site
    # tables also define att*5 / overlap_5 entries - confirm this is intended.
    for number in range(1, 5):
        # Both sites have to be in the same orientation (fwd + fwd or rev + rev)
        for orientation in ["forward_regex", "reverse_regex"]:
            # The overlap regex is shared by all site types with this number
            core_regex = site_table[f"overlap_{number}"][orientation]

            for kind_x, kind_y in site_pairs:
                regex_x = site_table[f"att{kind_x}{number}"][orientation]
                hits_x = list(dseqrecord_finditer(regex_x, seqx))
                if not hits_x:
                    continue

                regex_y = site_table[f"att{kind_y}{number}"][orientation]
                hits_y = list(dseqrecord_finditer(regex_y, seqy))
                if not hits_y:
                    continue

                for hit_x, hit_y in _itertools.product(hits_x, hits_y):
                    # Locate the overlap inside each matched site and keep
                    # the core 7 bps that are constant
                    core_x = re.search(core_regex, hit_x.group())
                    core_y = re.search(core_regex, hit_y.group())

                    # Sanity check
                    assert not (
                        core_x is None or core_y is None
                    ), "Something went wrong, no overlap found within the matches"

                    start_x = hit_x.start() + core_x.start() + 3
                    start_y = hit_y.start() + core_y.start() + 3
                    overlaps.append((start_x, start_y, 7))

    return overlaps
133
+
134
+
135
def find_gateway_sites(
    seq: _Dseqrecord, greedy: bool
) -> dict[str, list[SimpleLocation]]:
    """Locate every gateway att site in a sequence.

    Returns a dictionary mapping the site name to the list of locations
    where it was found. The greedy site table is used when ``greedy`` is
    True, the conservative one otherwise. Table entries whose name does not
    start with ``"att"`` (the overlap entries) are skipped. Matches of the
    forward regex get strand ``1`` and matches of the reverse regex get
    strand ``-1``. Site names without any match are absent from the result.
    """
    site_table = gateway_sites_greedy if greedy else gateway_sites_conservative
    strand_by_regex = (("forward_regex", 1), ("reverse_regex", -1))
    found = {}
    for name, regexes in site_table.items():
        if not name.startswith("att"):
            continue

        for regex_key, strand in strand_by_regex:
            for hit in dseqrecord_finditer(regexes[regex_key], seq):
                span = SimpleLocation(hit.start(), hit.end(), strand)
                # NOTE(review): presumably normalises matches that run past
                # the end of a circular sequence - confirm in shift_location
                span = shift_location(span, 0, len(seq))
                found.setdefault(name, []).append(span)
    return found
155
+
156
+
157
def annotate_gateway_sites(seq: _Dseqrecord, greedy: bool) -> _Dseqrecord:
    """Add a ``protein_bind`` feature for every gateway site found in ``seq``.

    The sites are located with ``find_gateway_sites(seq, greedy)``. The
    features are appended to ``seq.features`` in place, labelled with the
    site name, and the same ``seq`` object is returned.

    A feature is not added if ``seq`` already has a ``protein_bind`` feature
    with the same location and label, so annotating the same record more
    than once does not pile up duplicate features.
    """
    sites = find_gateway_sites(seq, greedy)
    for site, locations in sites.items():
        for loc in locations:
            # Don't add the same feature twice
            already_annotated = any(
                f.location == loc
                and f.type == "protein_bind"
                and f.qualifiers.get("label", []) == [site]
                for f in seq.features
            )
            if not already_annotated:
                seq.features.append(
                    SeqFeature(loc, type="protein_bind", qualifiers={"label": [site]})
                )
    return seq