pydna 5.5.1__py3-none-any.whl → 5.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydna/__init__.py +116 -134
- pydna/_pretty.py +2 -14
- pydna/all.py +10 -20
- pydna/amplicon.py +25 -20
- pydna/amplify.py +46 -26
- pydna/assembly.py +50 -27
- pydna/assembly2.py +2627 -0
- pydna/common_sub_strings.py +2 -12
- pydna/contig.py +39 -22
- pydna/cre_lox.py +130 -0
- pydna/crispr.py +8 -13
- pydna/design.py +89 -59
- pydna/download.py +10 -18
- pydna/dseq.py +119 -59
- pydna/dseqrecord.py +88 -45
- pydna/fakeseq.py +0 -11
- pydna/fusionpcr.py +3 -1
- pydna/gateway.py +154 -152
- pydna/gel.py +8 -13
- pydna/genbank.py +33 -32
- pydna/genbankfile.py +8 -13
- pydna/genbankfixer.py +41 -28
- pydna/genbankrecord.py +11 -14
- pydna/goldengate.py +2 -2
- pydna/ladders.py +4 -11
- pydna/ligate.py +8 -14
- pydna/parsers.py +25 -9
- pydna/primer.py +3 -12
- pydna/readers.py +0 -11
- pydna/seq.py +21 -18
- pydna/seqrecord.py +20 -20
- pydna/sequence_picker.py +3 -12
- pydna/sequence_regex.py +44 -0
- pydna/tm.py +13 -15
- pydna/types.py +41 -0
- pydna/utils.py +173 -58
- {pydna-5.5.1.dist-info → pydna-5.5.3.dist-info}/METADATA +22 -18
- pydna-5.5.3.dist-info/RECORD +45 -0
- pydna/editor.py +0 -119
- pydna/myenzymes.py +0 -51
- pydna/myprimers.py +0 -219
- pydna-5.5.1.dist-info/RECORD +0 -44
- {pydna-5.5.1.dist-info → pydna-5.5.3.dist-info}/LICENSE.txt +0 -0
- {pydna-5.5.1.dist-info → pydna-5.5.3.dist-info}/WHEEL +0 -0
pydna/dseqrecord.py
CHANGED
|
@@ -37,9 +37,9 @@ import time as _time
|
|
|
37
37
|
import datetime as _datetime
|
|
38
38
|
|
|
39
39
|
|
|
40
|
-
import logging as _logging
|
|
40
|
+
# import logging as _logging
|
|
41
41
|
|
|
42
|
-
_module_logger = _logging.getLogger("pydna." + __name__)
|
|
42
|
+
# _module_logger = _logging.getLogger("pydna." + __name__)
|
|
43
43
|
|
|
44
44
|
|
|
45
45
|
try:
|
|
@@ -127,6 +127,8 @@ class Dseqrecord(_SeqRecord):
|
|
|
127
127
|
|
|
128
128
|
"""
|
|
129
129
|
|
|
130
|
+
seq: _Dseq
|
|
131
|
+
|
|
130
132
|
def __init__(
|
|
131
133
|
self,
|
|
132
134
|
record,
|
|
@@ -135,12 +137,12 @@ class Dseqrecord(_SeqRecord):
|
|
|
135
137
|
n=5e-14, # mol ( = 0.05 pmol)
|
|
136
138
|
**kwargs,
|
|
137
139
|
):
|
|
138
|
-
_module_logger.info("### Dseqrecord initialized ###")
|
|
139
|
-
_module_logger.info("argument circular = %s", circular)
|
|
140
|
-
_module_logger.info("circular = %s", circular)
|
|
140
|
+
# _module_logger.info("### Dseqrecord initialized ###")
|
|
141
|
+
# _module_logger.info("argument circular = %s", circular)
|
|
142
|
+
# _module_logger.info("circular = %s", circular)
|
|
141
143
|
|
|
142
144
|
if isinstance(record, str):
|
|
143
|
-
_module_logger.info("record is a string")
|
|
145
|
+
# _module_logger.info("record is a string")
|
|
144
146
|
super().__init__(
|
|
145
147
|
_Dseq.from_string(
|
|
146
148
|
record,
|
|
@@ -157,12 +159,12 @@ class Dseqrecord(_SeqRecord):
|
|
|
157
159
|
record = record[:]
|
|
158
160
|
elif circular is True:
|
|
159
161
|
record = record.looped()
|
|
160
|
-
_module_logger.info("record is a Dseq object")
|
|
162
|
+
# _module_logger.info("record is a Dseq object")
|
|
161
163
|
super().__init__(record, *args, **kwargs)
|
|
162
164
|
|
|
163
165
|
# record is a Bio.Seq object ?
|
|
164
166
|
elif hasattr(record, "transcribe"):
|
|
165
|
-
_module_logger.info("record is a Seq object")
|
|
167
|
+
# _module_logger.info("record is a Seq object")
|
|
166
168
|
super().__init__(
|
|
167
169
|
_Dseq(
|
|
168
170
|
str(record),
|
|
@@ -175,7 +177,7 @@ class Dseqrecord(_SeqRecord):
|
|
|
175
177
|
|
|
176
178
|
# record is a Bio.SeqRecord or Dseqrecord object ?
|
|
177
179
|
elif hasattr(record, "features"):
|
|
178
|
-
_module_logger.info("record is a Bio.SeqRecord or Dseqrecord object")
|
|
180
|
+
# _module_logger.info("record is a Bio.SeqRecord or Dseqrecord object")
|
|
179
181
|
for key, value in list(record.__dict__.items()):
|
|
180
182
|
setattr(self, key, value)
|
|
181
183
|
self.letter_annotations = {}
|
|
@@ -256,7 +258,9 @@ class Dseqrecord(_SeqRecord):
|
|
|
256
258
|
obj.n = n
|
|
257
259
|
if circular is None:
|
|
258
260
|
circular = record.annotations.get("topology") == "circular"
|
|
259
|
-
obj.seq = _Dseq.quick(
|
|
261
|
+
obj.seq = _Dseq.quick(
|
|
262
|
+
str(record.seq), _rc(str(record.seq)), ovhg=0, circular=circular
|
|
263
|
+
)
|
|
260
264
|
return obj
|
|
261
265
|
|
|
262
266
|
@property
|
|
@@ -295,7 +299,9 @@ class Dseqrecord(_SeqRecord):
|
|
|
295
299
|
"""
|
|
296
300
|
return super().extract_feature(n)
|
|
297
301
|
|
|
298
|
-
def add_feature(
|
|
302
|
+
def add_feature(
|
|
303
|
+
self, x=None, y=None, seq=None, type_="misc", strand=1, *args, **kwargs
|
|
304
|
+
):
|
|
299
305
|
"""Add a feature of type misc to the feature list of the sequence.
|
|
300
306
|
|
|
301
307
|
Parameters
|
|
@@ -392,13 +398,19 @@ class Dseqrecord(_SeqRecord):
|
|
|
392
398
|
elif five_prime[0] == "3'":
|
|
393
399
|
fn.location = fn.location + (-self.seq.ovhg)
|
|
394
400
|
if fn.location.start < 0:
|
|
395
|
-
loc1 = _SimpleLocation(
|
|
401
|
+
loc1 = _SimpleLocation(
|
|
402
|
+
len(new) + fn.location.start, len(new), strand=fn.location.strand
|
|
403
|
+
)
|
|
396
404
|
loc2 = _SimpleLocation(0, fn.location.end, strand=fn.location.strand)
|
|
397
405
|
fn.location = _CompoundLocation([loc1, loc2])
|
|
398
406
|
|
|
399
407
|
if fn.location.end > len(new):
|
|
400
|
-
loc1 = _SimpleLocation(
|
|
401
|
-
|
|
408
|
+
loc1 = _SimpleLocation(
|
|
409
|
+
fn.location.start, len(new), strand=fn.location.strand
|
|
410
|
+
)
|
|
411
|
+
loc2 = _SimpleLocation(
|
|
412
|
+
0, fn.location.end - len(new), strand=fn.location.strand
|
|
413
|
+
)
|
|
402
414
|
fn.location = _CompoundLocation([loc1, loc2])
|
|
403
415
|
|
|
404
416
|
fn.qualifiers = fo.qualifiers
|
|
@@ -428,7 +440,9 @@ class Dseqrecord(_SeqRecord):
|
|
|
428
440
|
from pydna import _PydnaDeprecationWarning
|
|
429
441
|
|
|
430
442
|
_warnings.warn(
|
|
431
|
-
"tolinear method is obsolete; "
|
|
443
|
+
"tolinear method is obsolete; "
|
|
444
|
+
"please use obj[:] "
|
|
445
|
+
"instead of obj.tolinear().",
|
|
432
446
|
_PydnaDeprecationWarning,
|
|
433
447
|
)
|
|
434
448
|
new = _copy.copy(self)
|
|
@@ -533,13 +547,17 @@ class Dseqrecord(_SeqRecord):
|
|
|
533
547
|
if self.seq != old_file.seq:
|
|
534
548
|
# If new sequence is different, the old file is
|
|
535
549
|
# renamed with "_OLD_" suffix:
|
|
536
|
-
oldmtime = _datetime.datetime.fromtimestamp(
|
|
550
|
+
oldmtime = _datetime.datetime.fromtimestamp(
|
|
551
|
+
_os.path.getmtime(filename)
|
|
552
|
+
).isoformat()
|
|
537
553
|
tstmp = int(_time.time() * 1_000_000)
|
|
538
554
|
old_filename = f"{name}_OLD_{tstmp}{ext}"
|
|
539
555
|
_os.rename(filename, old_filename)
|
|
540
556
|
with open(filename, "w", encoding="utf8") as fp:
|
|
541
557
|
fp.write(self.format(f))
|
|
542
|
-
newmtime = _datetime.datetime.fromtimestamp(
|
|
558
|
+
newmtime = _datetime.datetime.fromtimestamp(
|
|
559
|
+
_os.path.getmtime(filename)
|
|
560
|
+
).isoformat()
|
|
543
561
|
msg = f"""
|
|
544
562
|
<table style="padding:10px 10px;
|
|
545
563
|
word-break:normal;
|
|
@@ -589,7 +607,9 @@ class Dseqrecord(_SeqRecord):
|
|
|
589
607
|
newdescription = self.description
|
|
590
608
|
if oldstamp and newstamp:
|
|
591
609
|
if oldstamp.group(0)[:35] == newstamp.group(0)[:35]:
|
|
592
|
-
newdescription = newdescription.replace(
|
|
610
|
+
newdescription = newdescription.replace(
|
|
611
|
+
newstamp.group(0), oldstamp.group(0)
|
|
612
|
+
)
|
|
593
613
|
elif oldstamp:
|
|
594
614
|
newdescription += " " + oldstamp.group(0)
|
|
595
615
|
newobj = _copy.copy(self)
|
|
@@ -616,9 +636,9 @@ class Dseqrecord(_SeqRecord):
|
|
|
616
636
|
return s.find(o)
|
|
617
637
|
|
|
618
638
|
def __str__(self):
|
|
619
|
-
return ("Dseqrecord\n" "circular: {}\n" "size: {}\n").format(
|
|
620
|
-
self
|
|
621
|
-
)
|
|
639
|
+
return ("Dseqrecord\n" "circular: {}\n" "size: {}\n").format(
|
|
640
|
+
self.circular, len(self)
|
|
641
|
+
) + _SeqRecord.__str__(self)
|
|
622
642
|
|
|
623
643
|
def __contains__(self, other):
|
|
624
644
|
if other.lower() in str(self.seq).lower():
|
|
@@ -757,10 +777,16 @@ class Dseqrecord(_SeqRecord):
|
|
|
757
777
|
return [x.annotations["filename"] for x in matching_reads]
|
|
758
778
|
|
|
759
779
|
def __repr__(self):
|
|
760
|
-
return "Dseqrecord({}{})".format(
|
|
780
|
+
return "Dseqrecord({}{})".format(
|
|
781
|
+
{True: "-", False: "o"}[not self.circular], len(self)
|
|
782
|
+
)
|
|
761
783
|
|
|
762
784
|
def _repr_pretty_(self, p, cycle):
|
|
763
|
-
p.text(
|
|
785
|
+
p.text(
|
|
786
|
+
"Dseqrecord({}{})".format(
|
|
787
|
+
{True: "-", False: "o"}[not self.circular], len(self)
|
|
788
|
+
)
|
|
789
|
+
)
|
|
764
790
|
|
|
765
791
|
def __add__(self, other):
|
|
766
792
|
if hasattr(other, "seq") and hasattr(other.seq, "watson"):
|
|
@@ -784,7 +810,11 @@ class Dseqrecord(_SeqRecord):
|
|
|
784
810
|
|
|
785
811
|
def __mul__(self, number):
|
|
786
812
|
if not isinstance(number, int):
|
|
787
|
-
raise TypeError(
|
|
813
|
+
raise TypeError(
|
|
814
|
+
"TypeError: can't multiply Dseqrecord by non-int of type {}".format(
|
|
815
|
+
type(number)
|
|
816
|
+
)
|
|
817
|
+
)
|
|
788
818
|
if self.circular:
|
|
789
819
|
raise TypeError("TypeError: can't multiply circular Dseqrecord.")
|
|
790
820
|
if number > 0:
|
|
@@ -821,7 +851,8 @@ class Dseqrecord(_SeqRecord):
|
|
|
821
851
|
for f in answer.features
|
|
822
852
|
if (
|
|
823
853
|
_location_boundaries(f.location)[1] <= answer.seq.length
|
|
824
|
-
and _location_boundaries(f.location)[0]
|
|
854
|
+
and _location_boundaries(f.location)[0]
|
|
855
|
+
< _location_boundaries(f.location)[1]
|
|
825
856
|
)
|
|
826
857
|
]
|
|
827
858
|
|
|
@@ -1032,7 +1063,7 @@ class Dseqrecord(_SeqRecord):
|
|
|
1032
1063
|
result = newseq
|
|
1033
1064
|
else:
|
|
1034
1065
|
result = newseq.shifted(start)
|
|
1035
|
-
_module_logger.info("synced")
|
|
1066
|
+
# _module_logger.info("synced")
|
|
1036
1067
|
return result
|
|
1037
1068
|
|
|
1038
1069
|
def upper(self):
|
|
@@ -1118,7 +1149,10 @@ class Dseqrecord(_SeqRecord):
|
|
|
1118
1149
|
type="CDS",
|
|
1119
1150
|
qualifiers={
|
|
1120
1151
|
"note": f"{y - x}bp {(y - x) // 3}aa",
|
|
1121
|
-
"checksum": [
|
|
1152
|
+
"checksum": [
|
|
1153
|
+
orf.seguid() + " (DNA)",
|
|
1154
|
+
prt.seguid() + " (protein)",
|
|
1155
|
+
],
|
|
1122
1156
|
"codon_start": 1,
|
|
1123
1157
|
"transl_table": 11,
|
|
1124
1158
|
"translation": str(prt.seq),
|
|
@@ -1148,7 +1182,9 @@ class Dseqrecord(_SeqRecord):
|
|
|
1148
1182
|
"""docstring."""
|
|
1149
1183
|
if self.features:
|
|
1150
1184
|
f = self.features[feature]
|
|
1151
|
-
locations = sorted(
|
|
1185
|
+
locations = sorted(
|
|
1186
|
+
self.features[feature].location.parts, key=_SimpleLocation.start.fget
|
|
1187
|
+
)
|
|
1152
1188
|
strand = f.location.strand
|
|
1153
1189
|
else:
|
|
1154
1190
|
locations = [_SimpleLocation(0, 0, 1)]
|
|
@@ -1229,7 +1265,10 @@ class Dseqrecord(_SeqRecord):
|
|
|
1229
1265
|
|
|
1230
1266
|
"""
|
|
1231
1267
|
if not self.circular:
|
|
1232
|
-
raise TypeError(
|
|
1268
|
+
raise TypeError(
|
|
1269
|
+
"Sequence is linear, origin can only be "
|
|
1270
|
+
"shifted for circular sequences.\n"
|
|
1271
|
+
)
|
|
1233
1272
|
ln = len(self)
|
|
1234
1273
|
if not shift % ln:
|
|
1235
1274
|
return _copy.deepcopy(self) # shift is a multiple of ln or 0
|
|
@@ -1311,7 +1350,9 @@ class Dseqrecord(_SeqRecord):
|
|
|
1311
1350
|
# 000
|
|
1312
1351
|
# 2222
|
|
1313
1352
|
#
|
|
1314
|
-
left_watson, left_crick, left_ovhg = self.seq.get_cut_parameters(
|
|
1353
|
+
left_watson, left_crick, left_ovhg = self.seq.get_cut_parameters(
|
|
1354
|
+
left_cut, True
|
|
1355
|
+
)
|
|
1315
1356
|
initial_shift = left_watson if left_ovhg < 0 else left_crick
|
|
1316
1357
|
features = self.shifted(initial_shift).features
|
|
1317
1358
|
# for f in features:
|
|
@@ -1327,10 +1368,13 @@ class Dseqrecord(_SeqRecord):
|
|
|
1327
1368
|
# 2222
|
|
1328
1369
|
|
|
1329
1370
|
features_need_transfer = [
|
|
1330
|
-
f
|
|
1371
|
+
f
|
|
1372
|
+
for f in features
|
|
1373
|
+
if (_location_boundaries(f.location)[1] <= abs(left_ovhg))
|
|
1331
1374
|
]
|
|
1332
1375
|
features_need_transfer = [
|
|
1333
|
-
_shift_feature(f, -abs(left_ovhg), len(self))
|
|
1376
|
+
_shift_feature(f, -abs(left_ovhg), len(self))
|
|
1377
|
+
for f in features_need_transfer
|
|
1334
1378
|
]
|
|
1335
1379
|
|
|
1336
1380
|
# ^ ^^^^^^^^^
|
|
@@ -1345,7 +1389,10 @@ class Dseqrecord(_SeqRecord):
|
|
|
1345
1389
|
# The features 0 and 1 would have the right location if the final sequence had the same length
|
|
1346
1390
|
# as the original one. However, the final product is longer because of the overhang.
|
|
1347
1391
|
|
|
1348
|
-
features += [
|
|
1392
|
+
features += [
|
|
1393
|
+
_shift_feature(f, abs(left_ovhg), len(dseq))
|
|
1394
|
+
for f in features_need_transfer
|
|
1395
|
+
]
|
|
1349
1396
|
# ^ ^^^^^^^^^
|
|
1350
1397
|
# So we shift back by the same amount in the opposite direction, but this time we pass the
|
|
1351
1398
|
# length of the final product.
|
|
@@ -1356,24 +1403,20 @@ class Dseqrecord(_SeqRecord):
|
|
|
1356
1403
|
for f in features
|
|
1357
1404
|
if (
|
|
1358
1405
|
_location_boundaries(f.location)[1] <= len(dseq)
|
|
1359
|
-
and _location_boundaries(f.location)[0]
|
|
1406
|
+
and _location_boundaries(f.location)[0]
|
|
1407
|
+
<= _location_boundaries(f.location)[1]
|
|
1360
1408
|
)
|
|
1361
1409
|
]
|
|
1362
1410
|
else:
|
|
1363
|
-
left_watson, left_crick, left_ovhg = self.seq.get_cut_parameters(
|
|
1364
|
-
|
|
1411
|
+
left_watson, left_crick, left_ovhg = self.seq.get_cut_parameters(
|
|
1412
|
+
left_cut, True
|
|
1413
|
+
)
|
|
1414
|
+
right_watson, right_crick, right_ovhg = self.seq.get_cut_parameters(
|
|
1415
|
+
right_cut, False
|
|
1416
|
+
)
|
|
1365
1417
|
|
|
1366
1418
|
left_edge = left_crick if left_ovhg > 0 else left_watson
|
|
1367
1419
|
right_edge = right_watson if right_ovhg > 0 else right_crick
|
|
1368
1420
|
features = self[left_edge:right_edge].features
|
|
1369
1421
|
|
|
1370
1422
|
return Dseqrecord(dseq, features=features)
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
if __name__ == "__main__":
|
|
1374
|
-
cache = _os.getenv("pydna_cache")
|
|
1375
|
-
_os.environ["pydna_cache"] = "nocache"
|
|
1376
|
-
import doctest
|
|
1377
|
-
|
|
1378
|
-
doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS)
|
|
1379
|
-
# _os.environ["pydna_cache"] = cache
|
pydna/fakeseq.py
CHANGED
|
@@ -44,14 +44,3 @@ class FakeSeq:
|
|
|
44
44
|
def __str__(self) -> str:
|
|
45
45
|
"""docstring."""
|
|
46
46
|
return self.__repr__()
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
if __name__ == "__main__":
|
|
50
|
-
import os as _os
|
|
51
|
-
|
|
52
|
-
cached = _os.getenv("pydna_cached_funcs", "")
|
|
53
|
-
_os.environ["pydna_cached_funcs"] = ""
|
|
54
|
-
import doctest
|
|
55
|
-
|
|
56
|
-
doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS)
|
|
57
|
-
_os.environ["pydna_cached_funcs"] = cached
|
pydna/fusionpcr.py
CHANGED
|
@@ -17,7 +17,9 @@ def fuse_by_pcr(fragments, limit=15):
|
|
|
17
17
|
new = None
|
|
18
18
|
for a, b in [(x, y), (x, y.rc()), (x.rc(), y)]:
|
|
19
19
|
try:
|
|
20
|
-
((s1, s2, ln), *r) = terminal_overlap(
|
|
20
|
+
((s1, s2, ln), *r) = terminal_overlap(
|
|
21
|
+
a.seq.watson.lower(), rc(b.seq.crick.lower()), limit=limit
|
|
22
|
+
)
|
|
21
23
|
except ValueError as err:
|
|
22
24
|
if "not enough values to unpack" not in str(err):
|
|
23
25
|
raise err
|
pydna/gateway.py
CHANGED
|
@@ -1,162 +1,164 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
1
|
# -*- coding: utf-8 -*-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
"""
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
_module_logger = _logging.getLogger("pydna." + __name__)
|
|
34
|
-
|
|
35
|
-
ambiguous_dna_regex = {
|
|
36
|
-
"A": "T",
|
|
37
|
-
"C": "G",
|
|
38
|
-
"G": "C",
|
|
39
|
-
"T": "A",
|
|
40
|
-
"M": "[ACM]",
|
|
41
|
-
"R": "[AGR]",
|
|
42
|
-
"W": "[ATW]",
|
|
43
|
-
"S": "[CGS]",
|
|
44
|
-
"Y": "[CTY]",
|
|
45
|
-
"K": "[GTK]",
|
|
46
|
-
"V": "[ACGVMSR]",
|
|
47
|
-
"H": "[ACTHMYW]",
|
|
48
|
-
"D": "[AGTDRWK]",
|
|
49
|
-
"B": "[CGTBSKY]",
|
|
50
|
-
"X": "X",
|
|
51
|
-
"N": "[ACGTBDHKMNRSVWY]",
|
|
2
|
+
from Bio.Seq import reverse_complement
|
|
3
|
+
from pydna.dseqrecord import Dseqrecord as _Dseqrecord
|
|
4
|
+
import re
|
|
5
|
+
import itertools as _itertools
|
|
6
|
+
from Bio.SeqFeature import SimpleLocation, SeqFeature
|
|
7
|
+
from pydna.utils import shift_location
|
|
8
|
+
from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
raw_gateway_common = {
|
|
12
|
+
"attB1": "CHWVTWTGTACAAAAAANNNG",
|
|
13
|
+
"attB2": "CHWVTWTGTACAAGAAANNNG",
|
|
14
|
+
"attB3": "CHWVTWTGTATAATAAANNNG",
|
|
15
|
+
"attB4": "CHWVTWTGTATAGAAAANNNG",
|
|
16
|
+
"attB5": "CHWVTWTGTATACAAAANNNG",
|
|
17
|
+
"attL1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAAAAANNNG",
|
|
18
|
+
"attL2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAGAAANNNG",
|
|
19
|
+
"attL3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAATAAANNNG",
|
|
20
|
+
"attL4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAGAAAANNNG",
|
|
21
|
+
"attL5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATACAAAANNNG",
|
|
22
|
+
"attR1": "CHWVTWTGTACAAAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
|
|
23
|
+
"attR2": "CHWVTWTGTACAAGAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
|
|
24
|
+
"attR3": "CHWVTWTGTATAATAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
|
|
25
|
+
"attR4": "CHWVTWTGTATAGAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
|
|
26
|
+
"attR5": "CHWVTWTGTATACAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
|
|
27
|
+
"overlap_1": "twtGTACAAAaaa",
|
|
28
|
+
"overlap_2": "twtGTACAAGaaa",
|
|
29
|
+
"overlap_3": "twtGTATAATaaa",
|
|
30
|
+
"overlap_4": "twtGTATAGAaaa",
|
|
31
|
+
"overlap_5": "twtGTATACAaaa",
|
|
52
32
|
}
|
|
53
33
|
|
|
54
|
-
atts = """
|
|
55
|
-
attP1 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
|
|
56
|
-
attP2 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAG AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
|
|
57
|
-
attP3 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAAT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
|
|
58
|
-
attP4 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAGA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
|
|
59
|
-
attP5 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATACA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
|
|
60
|
-
|
|
61
|
-
attB1 CMASTWT GTACAAA AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
|
|
62
|
-
attB2 CMASTWT GTACAAG AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
|
|
63
|
-
attB3 CMASTWT GTATAAT AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
|
|
64
|
-
attB4 CMASTWT GTATAGA AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
|
|
65
|
-
attB5 CMASTWT GTATACA AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
|
|
66
|
-
|
|
67
|
-
attR1 CMASTWT GTACAAA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
|
|
68
|
-
attR2 CMASTWT GTACAAG AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
|
|
69
|
-
attR3 CMASTWT GTATAAT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
|
|
70
|
-
attR4 CMASTWT GTATAGA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
|
|
71
|
-
attR5 CMASTWT GTATACA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
|
|
72
|
-
|
|
73
|
-
attL1 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAA AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
|
|
74
|
-
attL2 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAG AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
|
|
75
|
-
attL3 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAAT AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
|
|
76
|
-
attL4 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAGA AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
|
|
77
|
-
attL5 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATACA AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
|
|
78
|
-
"""
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
retable = str.maketrans(ambiguous_dna_regex)
|
|
82
|
-
|
|
83
|
-
for line in (line for line in atts.splitlines() if line.strip()):
|
|
84
|
-
name, *parts = line.split()
|
|
85
|
-
for part in parts:
|
|
86
|
-
part.translate(retable)
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
class Gateway(object):
|
|
90
|
-
"""Assembly of linear DNA fragments into linear or circular constructs.
|
|
91
|
-
|
|
92
|
-
The Assembly is meant to replace the Assembly method as it
|
|
93
|
-
is easier to use. Accepts a list of Dseqrecords (source fragments) to
|
|
94
|
-
initiate an Assembly object. Several methods are available for analysis
|
|
95
|
-
of overlapping sequences, graph construction and assembly.
|
|
96
|
-
|
|
97
|
-
Parameters
|
|
98
|
-
----------
|
|
99
|
-
fragments : list
|
|
100
|
-
a list of Dseqrecord objects.
|
|
101
|
-
"""
|
|
102
|
-
|
|
103
|
-
def __init__(self, molecules=None):
|
|
104
|
-
self.molecules = molecules
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
"""
|
|
108
|
-
Created on Sat Aug 21 15:41:42 2021
|
|
109
|
-
|
|
110
|
-
@author: bjorn
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
https://en.wikipedia.org/wiki/Cre-Lox_recombination
|
|
114
|
-
|
|
115
|
-
13bp 8bp 13bp
|
|
116
|
-
ATAACTTCGTATA-NNNTANNN-TATACGAAGTTAT
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
Name 13 bp 8 bp 13 bp
|
|
120
|
-
Recognition Spacer Recognition
|
|
121
|
-
Region Region Region
|
|
122
|
-
|
|
123
|
-
Wild-Type ATAACTTCGTATA ATGTATGC TATACGAAGTTAT
|
|
124
|
-
lox 511 ATAACTTCGTATA ATGTATaC TATACGAAGTTAT
|
|
125
|
-
lox 5171 ATAACTTCGTATA ATGTgTaC TATACGAAGTTAT
|
|
126
|
-
lox 2272 ATAACTTCGTATA AaGTATcC TATACGAAGTTAT
|
|
127
|
-
M2 ATAACTTCGTATA AgaaAcca TATACGAAGTTAT
|
|
128
|
-
M3 ATAACTTCGTATA taaTACCA TATACGAAGTTAT
|
|
129
|
-
M7 ATAACTTCGTATA AgaTAGAA TATACGAAGTTAT
|
|
130
|
-
M11 ATAACTTCGTATA cgaTAcca TATACGAAGTTAT
|
|
131
|
-
lox 71 TACCGTTCGTATA NNNTANNN TATACGAAGTTAT
|
|
132
|
-
lox 66 ATAACTTCGTATA NNNTANNN TATACGAACGGTA
|
|
133
34
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
"""
|
|
138
|
-
|
|
139
|
-
|
|
35
|
+
raw_gateway_sites_greedy = {
|
|
36
|
+
**raw_gateway_common,
|
|
37
|
+
"attP1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
|
|
38
|
+
"attP2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAGAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
|
|
39
|
+
"attP3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAATAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
|
|
40
|
+
"attP4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAGAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
|
|
41
|
+
"attP5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATACAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
|
|
42
|
+
}
|
|
140
43
|
|
|
141
|
-
|
|
44
|
+
raw_gateway_sites_conservative = {
|
|
45
|
+
**raw_gateway_common,
|
|
46
|
+
"attP1": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTACAAAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
|
|
47
|
+
"attP2": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTACAAGAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
|
|
48
|
+
"attP3": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATAATAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
|
|
49
|
+
"attP4": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATAGAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
|
|
50
|
+
"attP5": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATACAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
|
|
51
|
+
}
|
|
142
52
|
|
|
143
|
-
|
|
144
|
-
|
|
53
|
+
def _compile_gateway_sites(raw_sites: dict) -> dict:
    """Build the regex lookup table for a dict of raw consensus sequences.

    Parameters
    ----------
    raw_sites : dict
        Maps a site name (e.g. ``"attB1"`` or ``"overlap_1"``) to its
        consensus sequence string.

    Returns
    -------
    dict
        Maps each site name to a dict with three keys:
        ``"forward_regex"`` (``compute_regex_site`` of the consensus),
        ``"reverse_regex"`` (``compute_regex_site`` of the reverse
        complement of the consensus) and ``"consensus_sequence"`` (the
        unmodified consensus string).
    """
    return {
        name: {
            "forward_regex": compute_regex_site(consensus),
            "reverse_regex": compute_regex_site(reverse_complement(consensus)),
            "consensus_sequence": consensus,
        }
        for name, consensus in raw_sites.items()
    }


# Both tables are built by the same helper so they cannot drift apart; they
# differ only in the attP consensus sequences of their raw input dicts.
gateway_sites_greedy = _compile_gateway_sites(raw_gateway_sites_greedy)

gateway_sites_conservative = _compile_gateway_sites(raw_gateway_sites_conservative)
|
|
146
70
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
71
|
+
# From snapgene - ask Valerie
|
|
72
|
+
primer_design_attB = {
|
|
73
|
+
"attB1": "ACAAGTTTGTACAAAAAAGCAGGCT",
|
|
74
|
+
"attB2": "ACCACTTTGTACAAGAAAGCTGGGT",
|
|
75
|
+
"attB3": "ACAACTTTGTATAATAAAGTTGTA",
|
|
76
|
+
"attB4": "ACAACTTTGTATAGAAAAGTTGTA",
|
|
77
|
+
"attB5": "ACAACTTTGTATACAAAAGTTGTA",
|
|
78
|
+
}
|
|
150
79
|
|
|
151
|
-
Wild-Type ATAACTTCGTATA ATGTATGC TATACGAAGTTAT
|
|
152
|
-
lox511 ATAACTTCGTATA ATGTATaC TATACGAAGTTAT
|
|
153
|
-
lox5171 ATAACTTCGTATA ATGTgTaC TATACGAAGTTAT
|
|
154
|
-
lox2272 ATAACTTCGTATA AaGTATcC TATACGAAGTTAT
|
|
155
|
-
M2 ATAACTTCGTATA AgaaAcca TATACGAAGTTAT
|
|
156
|
-
M3 ATAACTTCGTATA taaTACCA TATACGAAGTTAT
|
|
157
|
-
M7 ATAACTTCGTATA AgaTAGAA TATACGAAGTTAT
|
|
158
|
-
M11 ATAACTTCGTATA cgaTAcca TATACGAAGTTAT
|
|
159
|
-
lox71 TACCGTTCGTATA NNNTANNN TATACGAAGTTAT
|
|
160
|
-
lox66 ATAACTTCGTATA NNNTANNN TATACGAACGGTA
|
|
161
80
|
|
|
162
|
-
|
|
81
|
+
def gateway_overlap(
    seqx: _Dseqrecord, seqy: _Dseqrecord, reaction: str, greedy: bool
) -> list[tuple[int, int, int]]:
    """Find Gateway recombination overlaps between two sequences.

    For every att site family and for both orientations, the first site type
    of the reaction is searched in ``seqx`` and its partner in ``seqy``; the
    search is then repeated with the two site types swapped (B/P and P/B for
    a BP reaction, L/R and R/L for an LR reaction). Both sites of a pair are
    always searched with the same orientation (forward + forward or
    reverse + reverse).

    Parameters
    ----------
    seqx : Dseqrecord
        First sequence to search.
    seqy : Dseqrecord
        Second sequence to search.
    reaction : str
        Either ``"BP"`` or ``"LR"``.
    greedy : bool
        If True, ``gateway_sites_greedy`` is used instead of
        ``gateway_sites_conservative``. The greedy table uses a more
        permissive consensus to find attP sites, which might give false
        positives.

    Returns
    -------
    list of tuple of (int, int, int)
        One ``(start_in_seqx, start_in_seqy, 7)`` tuple for every pair of
        matching sites, where the starts point at the 7-base core of the
        overlap and are derived from the match positions reported by
        ``dseqrecord_finditer``.

    Raises
    ------
    ValueError
        If ``reaction`` is not ``"BP"`` or ``"LR"``.
    """
    if reaction not in ["BP", "LR"]:
        raise ValueError(f"Invalid overlap type: {reaction}")

    gateway_sites = gateway_sites_greedy if greedy else gateway_sites_conservative
    out = []
    # The site tables define families 1 to 5 (attB1..attB5, overlap_1..overlap_5,
    # ...). The previous range(1, 5) stopped at family 4, so family 5 sites
    # could never produce an overlap.
    for num in range(1, 6):
        # The sites have to be in the same orientation (fwd + fwd or rev + rev)
        for pattern in ("forward_regex", "reverse_regex"):
            # The overlap regex is the same for all site types of a family
            overlap_regex = gateway_sites[f"overlap_{num}"][pattern]

            # Iterate over pairs B, P and P, B for BP and L, R and R, L for LR
            for site_x, site_y in zip(reaction, reaction[::-1]):
                site_x_regex = gateway_sites[f"att{site_x}{num}"][pattern]
                matches_x = list(dseqrecord_finditer(site_x_regex, seqx))
                if not matches_x:
                    continue

                site_y_regex = gateway_sites[f"att{site_y}{num}"][pattern]
                matches_y = list(dseqrecord_finditer(site_y_regex, seqy))
                if not matches_y:
                    continue

                for match_x, match_y in _itertools.product(matches_x, matches_y):
                    # Find the overlap sequence within each match, and use the
                    # core 7 bp that are constant
                    overlap_x = re.search(overlap_regex, match_x.group())
                    overlap_y = re.search(overlap_regex, match_y.group())

                    # Sanity check
                    assert (
                        overlap_x is not None and overlap_y is not None
                    ), "Something went wrong, no overlap found within the matches"

                    # The overlap consensus is written as 3 flanking bases,
                    # the 7-base core, then 3 flanking bases; hence the +3
                    # offset and the fixed length of 7.
                    out.append(
                        (
                            match_x.start() + overlap_x.start() + 3,
                            match_y.start() + overlap_y.start() + 3,
                            7,
                        )
                    )

    return out
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def find_gateway_sites(
    seq: _Dseqrecord, greedy: bool
) -> dict[str, list[SimpleLocation]]:
    """Locate every Gateway att site in ``seq``.

    Parameters
    ----------
    seq : Dseqrecord
        Sequence to search.
    greedy : bool
        If True, search with ``gateway_sites_greedy``; otherwise with
        ``gateway_sites_conservative``.

    Returns
    -------
    dict
        Maps a site name (only names starting with ``"att"``) to the list of
        locations where it was found. Forward-regex hits get strand 1 and
        reverse-regex hits get strand -1. Sites without any hit are absent
        from the dict.
    """
    site_table = gateway_sites_greedy if greedy else gateway_sites_conservative
    # Each regex key paired with the strand assigned to its hits.
    orientations = (("forward_regex", 1), ("reverse_regex", -1))

    found = {}
    for name, entry in site_table.items():
        # The tables also hold "overlap_N" entries, which are not att sites.
        if not name.startswith("att"):
            continue

        for regex_key, strand in orientations:
            for hit in list(dseqrecord_finditer(entry[regex_key], seq)):
                location = SimpleLocation(hit.start(), hit.end(), strand)
                # NOTE(review): presumably normalises hits that run past the
                # end of a circular sequence - confirm against shift_location.
                location = shift_location(location, 0, len(seq))
                found.setdefault(name, []).append(location)
    return found
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def annotate_gateway_sites(seq: _Dseqrecord, greedy: bool) -> _Dseqrecord:
    """Add a ``protein_bind`` feature for each Gateway site found in ``seq``.

    The features are appended to ``seq.features`` in place, and the same
    ``seq`` object is returned. Each feature is labelled with the name of
    the site it marks.

    Parameters
    ----------
    seq : Dseqrecord
        Sequence to annotate; it is modified in place.
    greedy : bool
        Passed on to ``find_gateway_sites`` to select the site table.

    Returns
    -------
    Dseqrecord
        The ``seq`` argument itself, with the new features appended.
    """
    for name, locations in find_gateway_sites(seq, greedy).items():
        for location in locations:
            feature = SeqFeature(
                location, type="protein_bind", qualifiers={"label": [name]}
            )
            seq.features.append(feature)
    return seq
|