pydna 5.5.1__py3-none-any.whl → 5.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydna/dseqrecord.py CHANGED
@@ -37,9 +37,9 @@ import time as _time
37
37
  import datetime as _datetime
38
38
 
39
39
 
40
- import logging as _logging
40
+ # import logging as _logging
41
41
 
42
- _module_logger = _logging.getLogger("pydna." + __name__)
42
+ # _module_logger = _logging.getLogger("pydna." + __name__)
43
43
 
44
44
 
45
45
  try:
@@ -127,6 +127,8 @@ class Dseqrecord(_SeqRecord):
127
127
 
128
128
  """
129
129
 
130
+ seq: _Dseq
131
+
130
132
  def __init__(
131
133
  self,
132
134
  record,
@@ -135,12 +137,12 @@ class Dseqrecord(_SeqRecord):
135
137
  n=5e-14, # mol ( = 0.05 pmol)
136
138
  **kwargs,
137
139
  ):
138
- _module_logger.info("### Dseqrecord initialized ###")
139
- _module_logger.info("argument circular = %s", circular)
140
- _module_logger.info("circular = %s", circular)
140
+ # _module_logger.info("### Dseqrecord initialized ###")
141
+ # _module_logger.info("argument circular = %s", circular)
142
+ # _module_logger.info("circular = %s", circular)
141
143
 
142
144
  if isinstance(record, str):
143
- _module_logger.info("record is a string")
145
+ # _module_logger.info("record is a string")
144
146
  super().__init__(
145
147
  _Dseq.from_string(
146
148
  record,
@@ -157,12 +159,12 @@ class Dseqrecord(_SeqRecord):
157
159
  record = record[:]
158
160
  elif circular is True:
159
161
  record = record.looped()
160
- _module_logger.info("record is a Dseq object")
162
+ # _module_logger.info("record is a Dseq object")
161
163
  super().__init__(record, *args, **kwargs)
162
164
 
163
165
  # record is a Bio.Seq object ?
164
166
  elif hasattr(record, "transcribe"):
165
- _module_logger.info("record is a Seq object")
167
+ # _module_logger.info("record is a Seq object")
166
168
  super().__init__(
167
169
  _Dseq(
168
170
  str(record),
@@ -175,7 +177,7 @@ class Dseqrecord(_SeqRecord):
175
177
 
176
178
  # record is a Bio.SeqRecord or Dseqrecord object ?
177
179
  elif hasattr(record, "features"):
178
- _module_logger.info("record is a Bio.SeqRecord or Dseqrecord object")
180
+ # _module_logger.info("record is a Bio.SeqRecord or Dseqrecord object")
179
181
  for key, value in list(record.__dict__.items()):
180
182
  setattr(self, key, value)
181
183
  self.letter_annotations = {}
@@ -256,7 +258,9 @@ class Dseqrecord(_SeqRecord):
256
258
  obj.n = n
257
259
  if circular is None:
258
260
  circular = record.annotations.get("topology") == "circular"
259
- obj.seq = _Dseq.quick(str(record.seq), _rc(str(record.seq)), ovhg=0, circular=circular)
261
+ obj.seq = _Dseq.quick(
262
+ str(record.seq), _rc(str(record.seq)), ovhg=0, circular=circular
263
+ )
260
264
  return obj
261
265
 
262
266
  @property
@@ -295,7 +299,9 @@ class Dseqrecord(_SeqRecord):
295
299
  """
296
300
  return super().extract_feature(n)
297
301
 
298
- def add_feature(self, x=None, y=None, seq=None, type_="misc", strand=1, *args, **kwargs):
302
+ def add_feature(
303
+ self, x=None, y=None, seq=None, type_="misc", strand=1, *args, **kwargs
304
+ ):
299
305
  """Add a feature of type misc to the feature list of the sequence.
300
306
 
301
307
  Parameters
@@ -392,13 +398,19 @@ class Dseqrecord(_SeqRecord):
392
398
  elif five_prime[0] == "3'":
393
399
  fn.location = fn.location + (-self.seq.ovhg)
394
400
  if fn.location.start < 0:
395
- loc1 = _SimpleLocation(len(new) + fn.location.start, len(new), strand=fn.location.strand)
401
+ loc1 = _SimpleLocation(
402
+ len(new) + fn.location.start, len(new), strand=fn.location.strand
403
+ )
396
404
  loc2 = _SimpleLocation(0, fn.location.end, strand=fn.location.strand)
397
405
  fn.location = _CompoundLocation([loc1, loc2])
398
406
 
399
407
  if fn.location.end > len(new):
400
- loc1 = _SimpleLocation(fn.location.start, len(new), strand=fn.location.strand)
401
- loc2 = _SimpleLocation(0, fn.location.end - len(new), strand=fn.location.strand)
408
+ loc1 = _SimpleLocation(
409
+ fn.location.start, len(new), strand=fn.location.strand
410
+ )
411
+ loc2 = _SimpleLocation(
412
+ 0, fn.location.end - len(new), strand=fn.location.strand
413
+ )
402
414
  fn.location = _CompoundLocation([loc1, loc2])
403
415
 
404
416
  fn.qualifiers = fo.qualifiers
@@ -428,7 +440,9 @@ class Dseqrecord(_SeqRecord):
428
440
  from pydna import _PydnaDeprecationWarning
429
441
 
430
442
  _warnings.warn(
431
- "tolinear method is obsolete; " "please use obj[:] " "instead of obj.tolinear().",
443
+ "tolinear method is obsolete; "
444
+ "please use obj[:] "
445
+ "instead of obj.tolinear().",
432
446
  _PydnaDeprecationWarning,
433
447
  )
434
448
  new = _copy.copy(self)
@@ -533,13 +547,17 @@ class Dseqrecord(_SeqRecord):
533
547
  if self.seq != old_file.seq:
534
548
  # If new sequence is different, the old file is
535
549
  # renamed with "_OLD_" suffix:
536
- oldmtime = _datetime.datetime.fromtimestamp(_os.path.getmtime(filename)).isoformat()
550
+ oldmtime = _datetime.datetime.fromtimestamp(
551
+ _os.path.getmtime(filename)
552
+ ).isoformat()
537
553
  tstmp = int(_time.time() * 1_000_000)
538
554
  old_filename = f"{name}_OLD_{tstmp}{ext}"
539
555
  _os.rename(filename, old_filename)
540
556
  with open(filename, "w", encoding="utf8") as fp:
541
557
  fp.write(self.format(f))
542
- newmtime = _datetime.datetime.fromtimestamp(_os.path.getmtime(filename)).isoformat()
558
+ newmtime = _datetime.datetime.fromtimestamp(
559
+ _os.path.getmtime(filename)
560
+ ).isoformat()
543
561
  msg = f"""
544
562
  <table style="padding:10px 10px;
545
563
  word-break:normal;
@@ -589,7 +607,9 @@ class Dseqrecord(_SeqRecord):
589
607
  newdescription = self.description
590
608
  if oldstamp and newstamp:
591
609
  if oldstamp.group(0)[:35] == newstamp.group(0)[:35]:
592
- newdescription = newdescription.replace(newstamp.group(0), oldstamp.group(0))
610
+ newdescription = newdescription.replace(
611
+ newstamp.group(0), oldstamp.group(0)
612
+ )
593
613
  elif oldstamp:
594
614
  newdescription += " " + oldstamp.group(0)
595
615
  newobj = _copy.copy(self)
@@ -616,9 +636,9 @@ class Dseqrecord(_SeqRecord):
616
636
  return s.find(o)
617
637
 
618
638
  def __str__(self):
619
- return ("Dseqrecord\n" "circular: {}\n" "size: {}\n").format(self.circular, len(self)) + _SeqRecord.__str__(
620
- self
621
- )
639
+ return ("Dseqrecord\n" "circular: {}\n" "size: {}\n").format(
640
+ self.circular, len(self)
641
+ ) + _SeqRecord.__str__(self)
622
642
 
623
643
  def __contains__(self, other):
624
644
  if other.lower() in str(self.seq).lower():
@@ -757,10 +777,16 @@ class Dseqrecord(_SeqRecord):
757
777
  return [x.annotations["filename"] for x in matching_reads]
758
778
 
759
779
  def __repr__(self):
760
- return "Dseqrecord({}{})".format({True: "-", False: "o"}[not self.circular], len(self))
780
+ return "Dseqrecord({}{})".format(
781
+ {True: "-", False: "o"}[not self.circular], len(self)
782
+ )
761
783
 
762
784
  def _repr_pretty_(self, p, cycle):
763
- p.text("Dseqrecord({}{})".format({True: "-", False: "o"}[not self.circular], len(self)))
785
+ p.text(
786
+ "Dseqrecord({}{})".format(
787
+ {True: "-", False: "o"}[not self.circular], len(self)
788
+ )
789
+ )
764
790
 
765
791
  def __add__(self, other):
766
792
  if hasattr(other, "seq") and hasattr(other.seq, "watson"):
@@ -784,7 +810,11 @@ class Dseqrecord(_SeqRecord):
784
810
 
785
811
  def __mul__(self, number):
786
812
  if not isinstance(number, int):
787
- raise TypeError("TypeError: can't multiply Dseqrecord by non-int of type {}".format(type(number)))
813
+ raise TypeError(
814
+ "TypeError: can't multiply Dseqrecord by non-int of type {}".format(
815
+ type(number)
816
+ )
817
+ )
788
818
  if self.circular:
789
819
  raise TypeError("TypeError: can't multiply circular Dseqrecord.")
790
820
  if number > 0:
@@ -821,7 +851,8 @@ class Dseqrecord(_SeqRecord):
821
851
  for f in answer.features
822
852
  if (
823
853
  _location_boundaries(f.location)[1] <= answer.seq.length
824
- and _location_boundaries(f.location)[0] < _location_boundaries(f.location)[1]
854
+ and _location_boundaries(f.location)[0]
855
+ < _location_boundaries(f.location)[1]
825
856
  )
826
857
  ]
827
858
 
@@ -1032,7 +1063,7 @@ class Dseqrecord(_SeqRecord):
1032
1063
  result = newseq
1033
1064
  else:
1034
1065
  result = newseq.shifted(start)
1035
- _module_logger.info("synced")
1066
+ # _module_logger.info("synced")
1036
1067
  return result
1037
1068
 
1038
1069
  def upper(self):
@@ -1118,7 +1149,10 @@ class Dseqrecord(_SeqRecord):
1118
1149
  type="CDS",
1119
1150
  qualifiers={
1120
1151
  "note": f"{y - x}bp {(y - x) // 3}aa",
1121
- "checksum": [orf.seguid() + " (DNA)", prt.seguid() + " (protein)"],
1152
+ "checksum": [
1153
+ orf.seguid() + " (DNA)",
1154
+ prt.seguid() + " (protein)",
1155
+ ],
1122
1156
  "codon_start": 1,
1123
1157
  "transl_table": 11,
1124
1158
  "translation": str(prt.seq),
@@ -1148,7 +1182,9 @@ class Dseqrecord(_SeqRecord):
1148
1182
  """docstring."""
1149
1183
  if self.features:
1150
1184
  f = self.features[feature]
1151
- locations = sorted(self.features[feature].location.parts, key=_SimpleLocation.start.fget)
1185
+ locations = sorted(
1186
+ self.features[feature].location.parts, key=_SimpleLocation.start.fget
1187
+ )
1152
1188
  strand = f.location.strand
1153
1189
  else:
1154
1190
  locations = [_SimpleLocation(0, 0, 1)]
@@ -1229,7 +1265,10 @@ class Dseqrecord(_SeqRecord):
1229
1265
 
1230
1266
  """
1231
1267
  if not self.circular:
1232
- raise TypeError("Sequence is linear, origin can only be " "shifted for circular sequences.\n")
1268
+ raise TypeError(
1269
+ "Sequence is linear, origin can only be "
1270
+ "shifted for circular sequences.\n"
1271
+ )
1233
1272
  ln = len(self)
1234
1273
  if not shift % ln:
1235
1274
  return _copy.deepcopy(self) # shift is a multiple of ln or 0
@@ -1311,7 +1350,9 @@ class Dseqrecord(_SeqRecord):
1311
1350
  # 000
1312
1351
  # 2222
1313
1352
  #
1314
- left_watson, left_crick, left_ovhg = self.seq.get_cut_parameters(left_cut, True)
1353
+ left_watson, left_crick, left_ovhg = self.seq.get_cut_parameters(
1354
+ left_cut, True
1355
+ )
1315
1356
  initial_shift = left_watson if left_ovhg < 0 else left_crick
1316
1357
  features = self.shifted(initial_shift).features
1317
1358
  # for f in features:
@@ -1327,10 +1368,13 @@ class Dseqrecord(_SeqRecord):
1327
1368
  # 2222
1328
1369
 
1329
1370
  features_need_transfer = [
1330
- f for f in features if (_location_boundaries(f.location)[1] <= abs(left_ovhg))
1371
+ f
1372
+ for f in features
1373
+ if (_location_boundaries(f.location)[1] <= abs(left_ovhg))
1331
1374
  ]
1332
1375
  features_need_transfer = [
1333
- _shift_feature(f, -abs(left_ovhg), len(self)) for f in features_need_transfer
1376
+ _shift_feature(f, -abs(left_ovhg), len(self))
1377
+ for f in features_need_transfer
1334
1378
  ]
1335
1379
 
1336
1380
  # ^ ^^^^^^^^^
@@ -1345,7 +1389,10 @@ class Dseqrecord(_SeqRecord):
1345
1389
  # The features 0 and 1 would have the right location if the final sequence had the same length
1346
1390
  # as the original one. However, the final product is longer because of the overhang.
1347
1391
 
1348
- features += [_shift_feature(f, abs(left_ovhg), len(dseq)) for f in features_need_transfer]
1392
+ features += [
1393
+ _shift_feature(f, abs(left_ovhg), len(dseq))
1394
+ for f in features_need_transfer
1395
+ ]
1349
1396
  # ^ ^^^^^^^^^
1350
1397
  # So we shift back by the same amount in the opposite direction, but this time we pass the
1351
1398
  # length of the final product.
@@ -1356,24 +1403,20 @@ class Dseqrecord(_SeqRecord):
1356
1403
  for f in features
1357
1404
  if (
1358
1405
  _location_boundaries(f.location)[1] <= len(dseq)
1359
- and _location_boundaries(f.location)[0] <= _location_boundaries(f.location)[1]
1406
+ and _location_boundaries(f.location)[0]
1407
+ <= _location_boundaries(f.location)[1]
1360
1408
  )
1361
1409
  ]
1362
1410
  else:
1363
- left_watson, left_crick, left_ovhg = self.seq.get_cut_parameters(left_cut, True)
1364
- right_watson, right_crick, right_ovhg = self.seq.get_cut_parameters(right_cut, False)
1411
+ left_watson, left_crick, left_ovhg = self.seq.get_cut_parameters(
1412
+ left_cut, True
1413
+ )
1414
+ right_watson, right_crick, right_ovhg = self.seq.get_cut_parameters(
1415
+ right_cut, False
1416
+ )
1365
1417
 
1366
1418
  left_edge = left_crick if left_ovhg > 0 else left_watson
1367
1419
  right_edge = right_watson if right_ovhg > 0 else right_crick
1368
1420
  features = self[left_edge:right_edge].features
1369
1421
 
1370
1422
  return Dseqrecord(dseq, features=features)
1371
-
1372
-
1373
- if __name__ == "__main__":
1374
- cache = _os.getenv("pydna_cache")
1375
- _os.environ["pydna_cache"] = "nocache"
1376
- import doctest
1377
-
1378
- doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS)
1379
- # _os.environ["pydna_cache"] = cache
pydna/fakeseq.py CHANGED
@@ -44,14 +44,3 @@ class FakeSeq:
44
44
  def __str__(self) -> str:
45
45
  """docstring."""
46
46
  return self.__repr__()
47
-
48
-
49
- if __name__ == "__main__":
50
- import os as _os
51
-
52
- cached = _os.getenv("pydna_cached_funcs", "")
53
- _os.environ["pydna_cached_funcs"] = ""
54
- import doctest
55
-
56
- doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS)
57
- _os.environ["pydna_cached_funcs"] = cached
pydna/fusionpcr.py CHANGED
@@ -17,7 +17,9 @@ def fuse_by_pcr(fragments, limit=15):
17
17
  new = None
18
18
  for a, b in [(x, y), (x, y.rc()), (x.rc(), y)]:
19
19
  try:
20
- ((s1, s2, ln), *r) = terminal_overlap(a.seq.watson.lower(), rc(b.seq.crick.lower()), limit=limit)
20
+ ((s1, s2, ln), *r) = terminal_overlap(
21
+ a.seq.watson.lower(), rc(b.seq.crick.lower()), limit=limit
22
+ )
21
23
  except ValueError as err:
22
24
  if "not enough values to unpack" not in str(err):
23
25
  raise err
pydna/gateway.py CHANGED
@@ -1,162 +1,164 @@
1
- #!/usr/bin/env python3
2
1
  # -*- coding: utf-8 -*-
3
- # Copyright 2013-2023 by Björn Johansson. All rights reserved.
4
- # This code is part of the Python-dna distribution and governed by its
5
- # license. Please see the LICENSE.txt file that should have been included
6
- # as part of this package.
7
-
8
- """Assembly of sequences by Gateway recombination.
9
-
10
- Given a list of sequences (Dseqrecords), all sequences are analyzed for
11
- presence of att(P|B|L|R)N where N is 1,2,3 or 4.
12
-
13
- A graph is constructed where the att sites form a nodes and
14
- sequences separating att sites form edges.
15
-
16
- The NetworkX package is used to trace linear and circular paths through the
17
- graph.
18
- """
19
- # from Bio.SeqFeature import ExactPosition as _ExactPosition
20
- # from Bio.SeqFeature import SimpleLocation as _SimpleLocation
21
- # from Bio.SeqFeature import CompoundLocation as _CompoundLocation
22
- # from pydna.utils import rc as _rc
23
-
24
- # from pydna._pretty import pretty_str as _pretty_str
25
- # from pydna.contig import Contig as _Contig
26
- # from pydna.common_sub_strings import common_sub_strings
27
- # from pydna.dseqrecord import Dseqrecord as _Dseqrecord
28
- # import networkx as _nx
29
- # from copy import deepcopy as _deepcopy
30
- # import itertools as _itertools
31
- import logging as _logging
32
-
33
- _module_logger = _logging.getLogger("pydna." + __name__)
34
-
35
- ambiguous_dna_regex = {
36
- "A": "T",
37
- "C": "G",
38
- "G": "C",
39
- "T": "A",
40
- "M": "[ACM]",
41
- "R": "[AGR]",
42
- "W": "[ATW]",
43
- "S": "[CGS]",
44
- "Y": "[CTY]",
45
- "K": "[GTK]",
46
- "V": "[ACGVMSR]",
47
- "H": "[ACTHMYW]",
48
- "D": "[AGTDRWK]",
49
- "B": "[CGTBSKY]",
50
- "X": "X",
51
- "N": "[ACGTBDHKMNRSVWY]",
2
+ from Bio.Seq import reverse_complement
3
+ from pydna.dseqrecord import Dseqrecord as _Dseqrecord
4
+ import re
5
+ import itertools as _itertools
6
+ from Bio.SeqFeature import SimpleLocation, SeqFeature
7
+ from pydna.utils import shift_location
8
+ from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer
9
+
10
+
11
+ raw_gateway_common = {
12
+ "attB1": "CHWVTWTGTACAAAAAANNNG",
13
+ "attB2": "CHWVTWTGTACAAGAAANNNG",
14
+ "attB3": "CHWVTWTGTATAATAAANNNG",
15
+ "attB4": "CHWVTWTGTATAGAAAANNNG",
16
+ "attB5": "CHWVTWTGTATACAAAANNNG",
17
+ "attL1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAAAAANNNG",
18
+ "attL2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAGAAANNNG",
19
+ "attL3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAATAAANNNG",
20
+ "attL4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAGAAAANNNG",
21
+ "attL5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATACAAAANNNG",
22
+ "attR1": "CHWVTWTGTACAAAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
23
+ "attR2": "CHWVTWTGTACAAGAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
24
+ "attR3": "CHWVTWTGTATAATAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
25
+ "attR4": "CHWVTWTGTATAGAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
26
+ "attR5": "CHWVTWTGTATACAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
27
+ "overlap_1": "twtGTACAAAaaa",
28
+ "overlap_2": "twtGTACAAGaaa",
29
+ "overlap_3": "twtGTATAATaaa",
30
+ "overlap_4": "twtGTATAGAaaa",
31
+ "overlap_5": "twtGTATACAaaa",
52
32
  }
53
33
 
54
- atts = """
55
- attP1 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
56
- attP2 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAG AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
57
- attP3 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAAT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
58
- attP4 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAGA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
59
- attP5 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATACA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG CMASTWT AAAGYWG
60
-
61
- attB1 CMASTWT GTACAAA AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
62
- attB2 CMASTWT GTACAAG AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
63
- attB3 CMASTWT GTATAAT AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
64
- attB4 CMASTWT GTATAGA AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
65
- attB5 CMASTWT GTATACA AAAGYWG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
66
-
67
- attR1 CMASTWT GTACAAA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
68
- attR2 CMASTWT GTACAAG AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
69
- attR3 CMASTWT GTATAAT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
70
- attR4 CMASTWT GTATAGA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
71
- attR5 CMASTWT GTATACA AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT AAAGYWG
72
-
73
- attL1 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAA AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
74
- attL2 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTACAAG AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
75
- attL3 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAAT AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
76
- attL4 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATAGA AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
77
- attL5 AAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATKMTTTYTTATAATGCCMASTTT GTATACA AAAGYWG CMASTWT AAAGYWGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATRCTGTAAAACACAACATATSCAGTCAYWWTG
78
- """
79
-
80
-
81
- retable = str.maketrans(ambiguous_dna_regex)
82
-
83
- for line in (line for line in atts.splitlines() if line.strip()):
84
- name, *parts = line.split()
85
- for part in parts:
86
- part.translate(retable)
87
-
88
-
89
- class Gateway(object):
90
- """Assembly of linear DNA fragments into linear or circular constructs.
91
-
92
- The Assembly is meant to replace the Assembly method as it
93
- is easier to use. Accepts a list of Dseqrecords (source fragments) to
94
- initiate an Assembly object. Several methods are available for analysis
95
- of overlapping sequences, graph construction and assembly.
96
-
97
- Parameters
98
- ----------
99
- fragments : list
100
- a list of Dseqrecord objects.
101
- """
102
-
103
- def __init__(self, molecules=None):
104
- self.molecules = molecules
105
-
106
-
107
- """
108
- Created on Sat Aug 21 15:41:42 2021
109
-
110
- @author: bjorn
111
-
112
-
113
- https://en.wikipedia.org/wiki/Cre-Lox_recombination
114
-
115
- 13bp 8bp 13bp
116
- ATAACTTCGTATA-NNNTANNN-TATACGAAGTTAT
117
-
118
-
119
- Name 13 bp 8 bp 13 bp
120
- Recognition Spacer Recognition
121
- Region Region Region
122
-
123
- Wild-Type ATAACTTCGTATA ATGTATGC TATACGAAGTTAT
124
- lox 511 ATAACTTCGTATA ATGTATaC TATACGAAGTTAT
125
- lox 5171 ATAACTTCGTATA ATGTgTaC TATACGAAGTTAT
126
- lox 2272 ATAACTTCGTATA AaGTATcC TATACGAAGTTAT
127
- M2 ATAACTTCGTATA AgaaAcca TATACGAAGTTAT
128
- M3 ATAACTTCGTATA taaTACCA TATACGAAGTTAT
129
- M7 ATAACTTCGTATA AgaTAGAA TATACGAAGTTAT
130
- M11 ATAACTTCGTATA cgaTAcca TATACGAAGTTAT
131
- lox 71 TACCGTTCGTATA NNNTANNN TATACGAAGTTAT
132
- lox 66 ATAACTTCGTATA NNNTANNN TATACGAACGGTA
133
34
 
134
- """
135
-
136
-
137
- """
138
-
139
- https://blog.addgene.org/plasmids-101-cre-lox
35
+ raw_gateway_sites_greedy = {
36
+ **raw_gateway_common,
37
+ "attP1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
38
+ "attP2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAGAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
39
+ "attP3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAATAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
40
+ "attP4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAGAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
41
+ "attP5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATACAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
42
+ }
140
43
 
141
- https://en.wikipedia.org/wiki/Cre-Lox_recombination
44
+ raw_gateway_sites_conservative = {
45
+ **raw_gateway_common,
46
+ "attP1": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTACAAAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
47
+ "attP2": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTACAAGAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
48
+ "attP3": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATAATAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
49
+ "attP4": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATAGAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
50
+ "attP5": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATACAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
51
+ }
142
52
 
143
- 13bp 8bp 13bp
144
- ATAACTTCGTATA-NNNTANNN-TATACGAAGTTAT
53
+ gateway_sites_greedy = {
54
+ k: {
55
+ "forward_regex": compute_regex_site(v),
56
+ "reverse_regex": compute_regex_site(reverse_complement(v)),
57
+ "consensus_sequence": v,
58
+ }
59
+ for k, v in raw_gateway_sites_greedy.items()
60
+ }
145
61
 
62
+ gateway_sites_conservative = {
63
+ k: {
64
+ "forward_regex": compute_regex_site(v),
65
+ "reverse_regex": compute_regex_site(reverse_complement(v)),
66
+ "consensus_sequence": v,
67
+ }
68
+ for k, v in raw_gateway_sites_conservative.items()
69
+ }
146
70
 
147
- Name 13 bp 8 bp 13 bp
148
- Recognition Spacer Recognition
149
- Region Region Region
71
+ # From snapgene - ask Valerie
72
+ primer_design_attB = {
73
+ "attB1": "ACAAGTTTGTACAAAAAAGCAGGCT",
74
+ "attB2": "ACCACTTTGTACAAGAAAGCTGGGT",
75
+ "attB3": "ACAACTTTGTATAATAAAGTTGTA",
76
+ "attB4": "ACAACTTTGTATAGAAAAGTTGTA",
77
+ "attB5": "ACAACTTTGTATACAAAAGTTGTA",
78
+ }
150
79
 
151
- Wild-Type ATAACTTCGTATA ATGTATGC TATACGAAGTTAT
152
- lox511 ATAACTTCGTATA ATGTATaC TATACGAAGTTAT
153
- lox5171 ATAACTTCGTATA ATGTgTaC TATACGAAGTTAT
154
- lox2272 ATAACTTCGTATA AaGTATcC TATACGAAGTTAT
155
- M2 ATAACTTCGTATA AgaaAcca TATACGAAGTTAT
156
- M3 ATAACTTCGTATA taaTACCA TATACGAAGTTAT
157
- M7 ATAACTTCGTATA AgaTAGAA TATACGAAGTTAT
158
- M11 ATAACTTCGTATA cgaTAcca TATACGAAGTTAT
159
- lox71 TACCGTTCGTATA NNNTANNN TATACGAAGTTAT
160
- lox66 ATAACTTCGTATA NNNTANNN TATACGAACGGTA
161
80
 
162
- """
81
+ def gateway_overlap(
82
+ seqx: _Dseqrecord, seqy: _Dseqrecord, reaction: str, greedy: bool
83
+ ) -> list[tuple[int, int, int]]:
84
+ """
85
+ Find gateway overlaps. If greedy is True, it uses a more greedy consensus site to find attP sites,
86
+ which might give false positives
87
+ """
88
+ if reaction not in ["BP", "LR"]:
89
+ raise ValueError(f"Invalid overlap type: {reaction}")
90
+
91
+ gateway_sites = gateway_sites_greedy if greedy else gateway_sites_conservative
92
+ out = list()
93
+ # Iterate over the four possible att sites
94
+ for num in range(1, 5):
95
+ # Iterate over the two possible orientations
96
+ # The sites have to be in the same orientation (fwd + fwd or rev + rev)
97
+ for pattern in ["forward_regex", "reverse_regex"]:
98
+ # The overlap regex is the same for all types
99
+ overlap_regex = gateway_sites[f"overlap_{num}"][pattern]
100
+
101
+ # Iterate over pairs B, P and P, B for BP and L, R and R, L for LR
102
+ for site_x, site_y in zip(reaction, reaction[::-1]):
103
+ site_x_regex = gateway_sites[f"att{site_x}{num}"][pattern]
104
+ matches_x = list(dseqrecord_finditer(site_x_regex, seqx))
105
+ if len(matches_x) == 0:
106
+ continue
107
+
108
+ site_y_regex = gateway_sites[f"att{site_y}{num}"][pattern]
109
+ matches_y = list(dseqrecord_finditer(site_y_regex, seqy))
110
+ if len(matches_y) == 0:
111
+ continue
112
+
113
+ for match_x, match_y in _itertools.product(matches_x, matches_y):
114
+ # Find the overlap sequence within each match, and use the
115
+ # core 7 pbs that are constant
116
+ overlap_x = re.search(overlap_regex, match_x.group())
117
+ overlap_y = re.search(overlap_regex, match_y.group())
118
+
119
+ # Sanity check
120
+ assert (
121
+ overlap_x is not None and overlap_y is not None
122
+ ), "Something went wrong, no overlap found within the matches"
123
+
124
+ out.append(
125
+ (
126
+ match_x.start() + overlap_x.start() + 3,
127
+ match_y.start() + overlap_y.start() + 3,
128
+ 7,
129
+ )
130
+ )
131
+
132
+ return out
133
+
134
+
135
+ def find_gateway_sites(
136
+ seq: _Dseqrecord, greedy: bool
137
+ ) -> dict[str, list[SimpleLocation]]:
138
+ """Find all gateway sites in a sequence and return a dictionary with the name and positions of the sites."""
139
+ gateway_sites = gateway_sites_greedy if greedy else gateway_sites_conservative
140
+ out = dict()
141
+ for site in gateway_sites:
142
+ if not site.startswith("att"):
143
+ continue
144
+
145
+ for pattern in ["forward_regex", "reverse_regex"]:
146
+ matches = list(dseqrecord_finditer(gateway_sites[site][pattern], seq))
147
+ for match in matches:
148
+ if site not in out:
149
+ out[site] = []
150
+ strand = 1 if pattern == "forward_regex" else -1
151
+ loc = SimpleLocation(match.start(), match.end(), strand)
152
+ loc = shift_location(loc, 0, len(seq))
153
+ out[site].append(loc)
154
+ return out
155
+
156
+
157
+ def annotate_gateway_sites(seq: _Dseqrecord, greedy: bool) -> _Dseqrecord:
158
+ sites = find_gateway_sites(seq, greedy)
159
+ for site in sites:
160
+ for loc in sites[site]:
161
+ seq.features.append(
162
+ SeqFeature(loc, type="protein_bind", qualifiers={"label": [site]})
163
+ )
164
+ return seq