pydna 5.5.2__py3-none-any.whl → 5.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydna/__init__.py +1 -1
- pydna/assembly2.py +1116 -135
- pydna/cre_lox.py +130 -0
- pydna/dseqrecord.py +50 -2
- pydna/gateway.py +154 -152
- pydna/opencloning_models.py +553 -0
- pydna/parsers.py +23 -0
- pydna/seqrecord.py +1 -1
- pydna/sequence_regex.py +44 -0
- pydna/types.py +5 -2
- {pydna-5.5.2.dist-info → pydna-5.5.4.dist-info}/METADATA +14 -56
- {pydna-5.5.2.dist-info → pydna-5.5.4.dist-info}/RECORD +14 -11
- {pydna-5.5.2.dist-info → pydna-5.5.4.dist-info}/WHEEL +1 -1
- {pydna-5.5.2.dist-info → pydna-5.5.4.dist-info/licenses}/LICENSE.txt +0 -0
pydna/cre_lox.py
ADDED
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+from itertools import product
+from pydna.dseqrecord import Dseqrecord
+from Bio.Data.IUPACData import ambiguous_dna_values
+from Bio.Seq import reverse_complement
+from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer
+from Bio.SeqFeature import Location, SimpleLocation, SeqFeature
+from pydna.utils import shift_location
+
+# We create a dictionary to map ambiguous bases to their consensus base
+# For example, ambigous_base_dict['ACGT'] -> 'N'
+ambiguous_base_dict = {}
+for ambiguous, bases in ambiguous_dna_values.items():
+    ambiguous_base_dict["".join(sorted(bases))] = ambiguous
+
+# To handle N values
+ambiguous_base_dict["N"] = "N"
+
+# This is the original loxP sequence, here for reference
+LOXP_SEQUENCE = "ATAACTTCGTATAGCATACATTATACGAAGTTAT"
+
+loxP_sequences = [
+    # https://blog.addgene.org/plasmids-101-cre-lox
+    # loxP
+    "ATAACTTCGTATANNNTANNNTATACGAAGTTAT",
+    # PMID:12202778
+    # lox66
+    "ATAACTTCGTATANNNTANNNTATACGAACGGTA",
+    # lox71
+    "TACCGTTCGTATANNNTANNNTATACGAAGTTAT",
+]
+
+loxP_consensus = ""
+
+for pos in range(len(LOXP_SEQUENCE)):
+    all_letters = set(seq[pos] for seq in loxP_sequences)
+    key = "".join(sorted(all_letters))
+    loxP_consensus += ambiguous_base_dict[key]
+
+# We compute the regex for the forward and reverse loxP sequences
+loxP_regex = (
+    compute_regex_site(loxP_consensus),
+    compute_regex_site(reverse_complement(loxP_consensus)),
+)
+
+
+def cre_loxP_overlap(
+    x: Dseqrecord, y: Dseqrecord, _l: None = None
+) -> list[tuple[int, int, int]]:
+    """Find matching loxP sites between two sequences."""
+    out = list()
+    for pattern in loxP_regex:
+        matches_x = dseqrecord_finditer(pattern, x)
+        matches_y = dseqrecord_finditer(pattern, y)
+
+        for match_x, match_y in product(matches_x, matches_y):
+            value_x = match_x.group()
+            value_y = match_y.group()
+            if value_x[13:21] == value_y[13:21]:
+                out.append((match_x.start() + 13, match_y.start() + 13, 8))
+    # Unique values (keeping the order)
+    unique_out = []
+    for item in out:
+        if item not in unique_out:
+            unique_out.append(item)
+    return unique_out
+
+
+loxP_dict = {
+    "loxP": "ATAACTTCGTATANNNTANNNTATACGAAGTTAT",
+    "lox66": "ATAACTTCGTATANNNTANNNTATACGAACGGTA",
+    "lox71": "TACCGTTCGTATANNNTANNNTATACGAAGTTAT",
+    "loxP_mutant": "TACCGTTCGTATANNNTANNNTATACGAACGGTA",
+}
+
+
+def get_regex_dict(original_dict: dict[str, str]) -> dict[str, str]:
+    """Get the regex dictionary for the original dictionary."""
+    out = dict()
+    for site in original_dict:
+        consensus_seq = original_dict[site]
+        is_palindromic = consensus_seq == reverse_complement(consensus_seq)
+        out[site] = {
+            "forward_regex": compute_regex_site(original_dict[site]),
+            "reverse_regex": (
+                None
+                if is_palindromic
+                else compute_regex_site(reverse_complement(original_dict[site]))
+            ),
+        }
+    return out
+
+
+def find_loxP_sites(seq: Dseqrecord) -> dict[str, list[Location]]:
+    """Find all loxP sites in a sequence and return a dictionary with the name and positions of the sites."""
+
+    out = dict()
+    regex_dict = get_regex_dict(loxP_dict)
+    for site in loxP_dict:
+
+        for pattern in ["forward_regex", "reverse_regex"]:
+            # Palindromic sequences have no reverse complement
+            if regex_dict[site][pattern] is None:
+                continue
+            matches = list(dseqrecord_finditer(regex_dict[site][pattern], seq))
+            for match in matches:
+                if site not in out:
+                    out[site] = []
+                strand = 1 if pattern == "forward_regex" else -1
+                loc = SimpleLocation(match.start(), match.end(), strand)
+                loc = shift_location(loc, 0, len(seq))
+                out[site].append(loc)
+    return out
+
+
+def annotate_loxP_sites(seq: Dseqrecord) -> Dseqrecord:
+    sites = find_loxP_sites(seq)
+    for site in sites:
+        for loc in sites[site]:
+            # Don't add the same feature twice
+            if not any(
+                f.location == loc
+                and f.type == "protein_bind"
+                and f.qualifiers.get("label", []) == [site]
+                for f in seq.features
+            ):
+                seq.features.append(
+                    SeqFeature(loc, type="protein_bind", qualifiers={"label": [site]})
+                )
+    return seq
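A minimal usage sketch for the new module (not part of the diff; the exact return values are an assumption): it builds a toy record carrying one wild-type loxP site and exercises the helpers added above.

    from pydna.dseqrecord import Dseqrecord
    from pydna.cre_lox import find_loxP_sites, annotate_loxP_sites

    # One wild-type loxP site: ATAACTTCGTATA (13 bp arm) ATGTATGC (8 bp spacer) TATACGAAGTTAT (13 bp arm)
    plasmid = Dseqrecord("AAAA" + "ATAACTTCGTATAATGTATGCTATACGAAGTTAT" + "TTTT", circular=True)

    sites = find_loxP_sites(plasmid)        # expected to report the site under the "loxP" key
    annotated = annotate_loxP_sites(plasmid)
    # One protein_bind feature labelled "loxP" should be added; running it again
    # should not duplicate it, thanks to the any(...) guard in annotate_loxP_sites.
    print(sites, len(annotated.features))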
pydna/dseqrecord.py
CHANGED
@@ -35,6 +35,11 @@ import os as _os
 import re as _re
 import time as _time
 import datetime as _datetime
+from typing import Union, TYPE_CHECKING
+from pydna.opencloning_models import SequenceCutSource
+
+if TYPE_CHECKING:  # pragma: no cover
+    from pydna.opencloning_models import Source


 # import logging as _logging
@@ -128,6 +133,7 @@ class Dseqrecord(_SeqRecord):
     """

     seq: _Dseq
+    source: Union["Source", None] = None

     def __init__(
         self,
@@ -135,6 +141,7 @@ class Dseqrecord(_SeqRecord):
         *args,
         circular=None,
         n=5e-14,  # mol ( = 0.05 pmol)
+        source=None,
         **kwargs,
     ):
         # _module_logger.info("### Dseqrecord initialized ###")
@@ -202,6 +209,7 @@ class Dseqrecord(_SeqRecord):
         self.map_target = None
         self.n = n  # amount, set to 5E-14 which is 5 pmols
         self.annotations.update({"molecule_type": "DNA"})
+        self.source = source

     @classmethod
     def from_string(
@@ -256,6 +264,7 @@ class Dseqrecord(_SeqRecord):
         obj.features = record.features
         obj.map_target = None
         obj.n = n
+        obj.source = None
         if circular is None:
             circular = record.annotations.get("topology") == "circular"
         obj.seq = _Dseq.quick(
@@ -875,7 +884,11 @@ class Dseqrecord(_SeqRecord):
     def __eq__(self, other):
         """docstring."""
         try:
-
+            this_dict = self.__dict__.copy()
+            other_dict = other.__dict__.copy()
+            del this_dict["source"]
+            del other_dict["source"]
+            if self.seq == other.seq and str(this_dict) == str(other_dict):
                 return True
         except AttributeError:
             pass
@@ -1419,4 +1432,39 @@ class Dseqrecord(_SeqRecord):
         right_edge = right_watson if right_ovhg > 0 else right_crick
         features = self[left_edge:right_edge].features

-
+        # This will need to be generalised to all types of cuts
+        source = SequenceCutSource.from_parent(self, left_cut, right_cut)
+        return Dseqrecord(dseq, features=features, source=source)
+
+    def history(self):
+        """
+        Returns a string representation of the cloning history of the sequence.
+        Returns an empty string if the sequence has no source.
+
+        Check the documentation notebooks for extensive examples.
+
+        Returns
+        -------
+        str: A string representation of the cloning history of the sequence.
+
+        Examples
+        --------
+        >>> from pydna.dseqrecord import Dseqrecord
+        >>> from pydna.assembly2 import gibson_assembly
+        >>> fragments = [
+        ...     Dseqrecord("TTTTacgatAAtgctccCCCC", circular=False, name="fragment1"),
+        ...     Dseqrecord("CCCCtcatGGGG", circular=False, name="fragment2"),
+        ...     Dseqrecord("GGGGatataTTTT", circular=False, name="fragment3"),
+        ... ]
+        >>> product, *_ = gibson_assembly(fragments, limit=4)
+        >>> product.name = "product_name"
+        >>> print(product.history())
+        ╙── product_name (Dseqrecord(o34))
+            └─╼ GibsonAssemblySource
+                ├─╼ fragment1 (Dseqrecord(-21))
+                ├─╼ fragment2 (Dseqrecord(-12))
+                └─╼ fragment3 (Dseqrecord(-13))
+        """
+        if self.source is None:
+            return ""
+        return self.source.history_string(self)
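A short sketch (not from the diff) of the new source/history() behaviour on a record without provenance:

    from pydna.dseqrecord import Dseqrecord

    rec = Dseqrecord("atgcatgc")
    print(rec.source)            # None: records built directly carry no Source
    print(repr(rec.history()))   # '' since history() returns an empty string when source is None

Records produced by operations that attach a source (cut fragments receive a SequenceCutSource per the hunk above; Gibson assembly products carry a GibsonAssemblySource, as in the history() doctest) print a provenance tree instead.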
pydna/gateway.py
CHANGED
@@ -1,162 +1,164 @@
-#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-"""
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# _module_logger = _logging.getLogger("pydna." + __name__)
-
-ambiguous_dna_regex = {
-    "A": "T",
-    "C": "G",
-    "G": "C",
-    "T": "A",
-    "M": "[ACM]",
-    "R": "[AGR]",
-    "W": "[ATW]",
-    "S": "[CGS]",
-    "Y": "[CTY]",
-    "K": "[GTK]",
-    "V": "[ACGVMSR]",
-    "H": "[ACTHMYW]",
-    "D": "[AGTDRWK]",
-    "B": "[CGTBSKY]",
-    "X": "X",
-    "N": "[ACGTBDHKMNRSVWY]",
+from Bio.Seq import reverse_complement
+from pydna.dseqrecord import Dseqrecord as _Dseqrecord
+import re
+import itertools as _itertools
+from Bio.SeqFeature import SimpleLocation, SeqFeature
+from pydna.utils import shift_location
+from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer
+
+
+raw_gateway_common = {
+    "attB1": "CHWVTWTGTACAAAAAANNNG",
+    "attB2": "CHWVTWTGTACAAGAAANNNG",
+    "attB3": "CHWVTWTGTATAATAAANNNG",
+    "attB4": "CHWVTWTGTATAGAAAANNNG",
+    "attB5": "CHWVTWTGTATACAAAANNNG",
+    "attL1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAAAAANNNG",
+    "attL2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAGAAANNNG",
+    "attL3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAATAAANNNG",
+    "attL4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAGAAAANNNG",
+    "attL5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATACAAAANNNG",
+    "attR1": "CHWVTWTGTACAAAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
+    "attR2": "CHWVTWTGTACAAGAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
+    "attR3": "CHWVTWTGTATAATAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
+    "attR4": "CHWVTWTGTATAGAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
+    "attR5": "CHWVTWTGTATACAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
+    "overlap_1": "twtGTACAAAaaa",
+    "overlap_2": "twtGTACAAGaaa",
+    "overlap_3": "twtGTATAATaaa",
+    "overlap_4": "twtGTATAGAaaa",
+    "overlap_5": "twtGTATACAaaa",
 }

-atts = """
-attP1 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
-attP2 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAG AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
-attP3 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAAT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
-attP4 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAGA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
-attP5 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATACA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
-
-attB1 CMASTWT GTACAAA AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
-attB2 CMASTWT GTACAAG AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
-attB3 CMASTWT GTATAAT AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
-attB4 CMASTWT GTATAGA AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
-attB5 CMASTWT GTATACA AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
-
-attR1 CMASTWT GTACAAA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
-attR2 CMASTWT GTACAAG AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
-attR3 CMASTWT GTATAAT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
-attR4 CMASTWT GTATAGA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
-attR5 CMASTWT GTATACA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
-
-attL1 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAA AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
-attL2 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAG AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
-attL3 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAAT AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
-attL4 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAGA AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
-attL5 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATACA AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
-"""
-
-
-retable = str.maketrans(ambiguous_dna_regex)
-
-for line in (line for line in atts.splitlines() if line.strip()):
-    name, *parts = line.split()
-    for part in parts:
-        part.translate(retable)
-
-
-class Gateway(object):
-    """Assembly of linear DNA fragments into linear or circular constructs.
-
-    The Assembly is meant to replace the Assembly method as it
-    is easier to use. Accepts a list of Dseqrecords (source fragments) to
-    initiate an Assembly object. Several methods are available for analysis
-    of overlapping sequences, graph construction and assembly.
-
-    Parameters
-    ----------
-    fragments : list
-        a list of Dseqrecord objects.
-    """
-
-    def __init__(self, molecules=None):
-        self.molecules = molecules
-
-
-"""
-Created on Sat Aug 21 15:41:42 2021
-
-@author: bjorn
-
-
-https://en.wikipedia.org/wiki/Cre-Lox_recombination
-
-13bp 8bp 13bp
-ATAACTTCGTATA-NNNTANNN-TATACGAAGTTAT
-
-
-Name 13 bp 8 bp 13 bp
-Recognition Spacer Recognition
-Region Region Region
-
-Wild-Type ATAACTTCGTATA ATGTATGC TATACGAAGTTAT
-lox 511 ATAACTTCGTATA ATGTATaC TATACGAAGTTAT
-lox 5171 ATAACTTCGTATA ATGTgTaC TATACGAAGTTAT
-lox 2272 ATAACTTCGTATA AaGTATcC TATACGAAGTTAT
-M2 ATAACTTCGTATA AgaaAcca TATACGAAGTTAT
-M3 ATAACTTCGTATA taaTACCA TATACGAAGTTAT
-M7 ATAACTTCGTATA AgaTAGAA TATACGAAGTTAT
-M11 ATAACTTCGTATA cgaTAcca TATACGAAGTTAT
-lox 71 TACCGTTCGTATA NNNTANNN TATACGAAGTTAT
-lox 66 ATAACTTCGTATA NNNTANNN TATACGAACGGTA

-
-
-
-"""
-
-
+raw_gateway_sites_greedy = {
+    **raw_gateway_common,
+    "attP1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
+    "attP2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAGAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
+    "attP3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAATAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
+    "attP4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAGAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
+    "attP5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATACAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
+}

-
+raw_gateway_sites_conservative = {
+    **raw_gateway_common,
+    "attP1": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTACAAAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
+    "attP2": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTACAAGAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
+    "attP3": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATAATAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
+    "attP4": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATAGAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
+    "attP5": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATACAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
+}

-
-
+gateway_sites_greedy = {
+    k: {
+        "forward_regex": compute_regex_site(v),
+        "reverse_regex": compute_regex_site(reverse_complement(v)),
+        "consensus_sequence": v,
+    }
+    for k, v in raw_gateway_sites_greedy.items()
+}

+gateway_sites_conservative = {
+    k: {
+        "forward_regex": compute_regex_site(v),
+        "reverse_regex": compute_regex_site(reverse_complement(v)),
+        "consensus_sequence": v,
+    }
+    for k, v in raw_gateway_sites_conservative.items()
+}

-
-
-
+# From snapgene - ask Valerie
+primer_design_attB = {
+    "attB1": "ACAAGTTTGTACAAAAAAGCAGGCT",
+    "attB2": "ACCACTTTGTACAAGAAAGCTGGGT",
+    "attB3": "ACAACTTTGTATAATAAAGTTGTA",
+    "attB4": "ACAACTTTGTATAGAAAAGTTGTA",
+    "attB5": "ACAACTTTGTATACAAAAGTTGTA",
+}

-Wild-Type ATAACTTCGTATA ATGTATGC TATACGAAGTTAT
-lox511 ATAACTTCGTATA ATGTATaC TATACGAAGTTAT
-lox5171 ATAACTTCGTATA ATGTgTaC TATACGAAGTTAT
-lox2272 ATAACTTCGTATA AaGTATcC TATACGAAGTTAT
-M2 ATAACTTCGTATA AgaaAcca TATACGAAGTTAT
-M3 ATAACTTCGTATA taaTACCA TATACGAAGTTAT
-M7 ATAACTTCGTATA AgaTAGAA TATACGAAGTTAT
-M11 ATAACTTCGTATA cgaTAcca TATACGAAGTTAT
-lox71 TACCGTTCGTATA NNNTANNN TATACGAAGTTAT
-lox66 ATAACTTCGTATA NNNTANNN TATACGAACGGTA

-
+def gateway_overlap(
+    seqx: _Dseqrecord, seqy: _Dseqrecord, reaction: str, greedy: bool
+) -> list[tuple[int, int, int]]:
+    """
+    Find gateway overlaps. If greedy is True, it uses a more greedy consensus site to find attP sites,
+    which might give false positives
+    """
+    if reaction not in ["BP", "LR"]:
+        raise ValueError(f"Invalid overlap type: {reaction}")
+
+    gateway_sites = gateway_sites_greedy if greedy else gateway_sites_conservative
+    out = list()
+    # Iterate over the four possible att sites
+    for num in range(1, 5):
+        # Iterate over the two possible orientations
+        # The sites have to be in the same orientation (fwd + fwd or rev + rev)
+        for pattern in ["forward_regex", "reverse_regex"]:
+            # The overlap regex is the same for all types
+            overlap_regex = gateway_sites[f"overlap_{num}"][pattern]
+
+            # Iterate over pairs B, P and P, B for BP and L, R and R, L for LR
+            for site_x, site_y in zip(reaction, reaction[::-1]):
+                site_x_regex = gateway_sites[f"att{site_x}{num}"][pattern]
+                matches_x = list(dseqrecord_finditer(site_x_regex, seqx))
+                if len(matches_x) == 0:
+                    continue
+
+                site_y_regex = gateway_sites[f"att{site_y}{num}"][pattern]
+                matches_y = list(dseqrecord_finditer(site_y_regex, seqy))
+                if len(matches_y) == 0:
+                    continue
+
+                for match_x, match_y in _itertools.product(matches_x, matches_y):
+                    # Find the overlap sequence within each match, and use the
+                    # core 7 pbs that are constant
+                    overlap_x = re.search(overlap_regex, match_x.group())
+                    overlap_y = re.search(overlap_regex, match_y.group())
+
+                    # Sanity check
+                    assert (
+                        overlap_x is not None and overlap_y is not None
+                    ), "Something went wrong, no overlap found within the matches"
+
+                    out.append(
+                        (
+                            match_x.start() + overlap_x.start() + 3,
+                            match_y.start() + overlap_y.start() + 3,
+                            7,
+                        )
+                    )
+
+    return out
+
+
+def find_gateway_sites(
+    seq: _Dseqrecord, greedy: bool
+) -> dict[str, list[SimpleLocation]]:
+    """Find all gateway sites in a sequence and return a dictionary with the name and positions of the sites."""
+    gateway_sites = gateway_sites_greedy if greedy else gateway_sites_conservative
+    out = dict()
+    for site in gateway_sites:
+        if not site.startswith("att"):
+            continue
+
+        for pattern in ["forward_regex", "reverse_regex"]:
+            matches = list(dseqrecord_finditer(gateway_sites[site][pattern], seq))
+            for match in matches:
+                if site not in out:
+                    out[site] = []
+                strand = 1 if pattern == "forward_regex" else -1
+                loc = SimpleLocation(match.start(), match.end(), strand)
+                loc = shift_location(loc, 0, len(seq))
+                out[site].append(loc)
+    return out
+
+
+def annotate_gateway_sites(seq: _Dseqrecord, greedy: bool) -> _Dseqrecord:
+    sites = find_gateway_sites(seq, greedy)
+    for site in sites:
+        for loc in sites[site]:
+            seq.features.append(
+                SeqFeature(loc, type="protein_bind", qualifiers={"label": [site]})
+            )
+    return seq
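A hedged usage sketch for the rewritten module (not part of the diff; the concrete match is an assumption based on the attB1 consensus CHWVTWTGTACAAAAAANNNG, which the attB1 primer tail above satisfies):

    from pydna.dseqrecord import Dseqrecord
    from pydna.gateway import find_gateway_sites, annotate_gateway_sites, primer_design_attB

    # A linear record whose only att site is the attB1 primer tail.
    rec = Dseqrecord("AAAA" + primer_design_attB["attB1"] + "TTTT")

    print(find_gateway_sites(rec, greedy=False))   # expected: {'attB1': [<forward SimpleLocation>]}
    annotated = annotate_gateway_sites(rec, greedy=False)
    print(len(annotated.features))                 # one protein_bind feature labelled "attB1"

gateway_overlap(seqx, seqy, "BP", greedy=False) uses the same machinery but pairs attB sites on one fragment with attP sites on the other (attL with attR for "LR"), returning (start_x, start_y, 7) tuples over the constant 7 bp core of each overlap.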