pydna-5.5.1-py3-none-any.whl → pydna-5.5.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydna/__init__.py +116 -134
- pydna/_pretty.py +2 -14
- pydna/all.py +10 -20
- pydna/amplicon.py +25 -20
- pydna/amplify.py +46 -26
- pydna/assembly.py +50 -27
- pydna/assembly2.py +2627 -0
- pydna/common_sub_strings.py +2 -12
- pydna/contig.py +39 -22
- pydna/cre_lox.py +130 -0
- pydna/crispr.py +8 -13
- pydna/design.py +89 -59
- pydna/download.py +10 -18
- pydna/dseq.py +119 -59
- pydna/dseqrecord.py +88 -45
- pydna/fakeseq.py +0 -11
- pydna/fusionpcr.py +3 -1
- pydna/gateway.py +154 -152
- pydna/gel.py +8 -13
- pydna/genbank.py +33 -32
- pydna/genbankfile.py +8 -13
- pydna/genbankfixer.py +41 -28
- pydna/genbankrecord.py +11 -14
- pydna/goldengate.py +2 -2
- pydna/ladders.py +4 -11
- pydna/ligate.py +8 -14
- pydna/parsers.py +25 -9
- pydna/primer.py +3 -12
- pydna/readers.py +0 -11
- pydna/seq.py +21 -18
- pydna/seqrecord.py +20 -20
- pydna/sequence_picker.py +3 -12
- pydna/sequence_regex.py +44 -0
- pydna/tm.py +13 -15
- pydna/types.py +41 -0
- pydna/utils.py +173 -58
- {pydna-5.5.1.dist-info → pydna-5.5.3.dist-info}/METADATA +22 -18
- pydna-5.5.3.dist-info/RECORD +45 -0
- pydna/editor.py +0 -119
- pydna/myenzymes.py +0 -51
- pydna/myprimers.py +0 -219
- pydna-5.5.1.dist-info/RECORD +0 -44
- {pydna-5.5.1.dist-info → pydna-5.5.3.dist-info}/LICENSE.txt +0 -0
- {pydna-5.5.1.dist-info → pydna-5.5.3.dist-info}/WHEEL +0 -0
pydna/seq.py
CHANGED

@@ -25,9 +25,10 @@ from Bio.Seq import Seq as _Seq
 from pydna._pretty import PrettyTable as _PrettyTable
 
 from typing import List as _List, Optional as _Optional, Tuple as _Tuple
-import logging as _logging
 
-
+# import logging as _logging
+
+# _module_logger = _logging.getLogger("pydna." + __name__)
 
 
 class Seq(_Seq):
@@ -43,7 +44,9 @@ class Seq(_Seq):
         **kwargs,
     ) -> "ProteinSeq":
         """Translate.."""
-        p = super().translate(*args, stop_symbol=stop_symbol, to_stop=to_stop, cds=cds, gap=gap, **kwargs)
+        p = super().translate(
+            *args, stop_symbol=stop_symbol, to_stop=to_stop, cds=cds, gap=gap, **kwargs
+        )
         return ProteinSeq(p._data)
 
     def gc(self) -> float:
@@ -78,10 +81,17 @@ class Seq(_Seq):
 
     def express(self, organism: str = "sce") -> _PrettyTable:
         """docstring."""
-        x = _PrettyTable(["cds", "len", "cai", "gc", "sta", "stp", "n-end"] + _rare_codons[organism] + ["rare"])
+        x = _PrettyTable(
+            ["cds", "len", "cai", "gc", "sta", "stp", "n-end"]
+            + _rare_codons[organism]
+            + ["rare"]
+        )
         val = []
 
-        val.append(f"{self._data.upper().decode('ASCII')[:3]}..." f"{self._data.upper().decode('ASCII')[-3:]}")
+        val.append(
+            f"{self._data.upper().decode('ASCII')[:3]}..."
+            f"{self._data.upper().decode('ASCII')[-3:]}"
+        )
         val.append(len(self) / 3)
         val.append(self.cai(organism))
         val.append(self.gc())
@@ -103,7 +113,9 @@ class Seq(_Seq):
 
     def orfs2(self, minsize: int = 30) -> _List[str]:
         """docstring."""
-        orf = _re.compile(f"ATG(?:...){{{minsize},}}?(?:TAG|TAA|TGA)", flags=_re.IGNORECASE)
+        orf = _re.compile(
+            f"ATG(?:...){{{minsize},}}?(?:TAG|TAA|TGA)", flags=_re.IGNORECASE
+        )
         start = 0
         matches: _List[slice] = []
         s = self._data.decode("ASCII")
@@ -203,7 +215,9 @@ class ProteinSeq(_Seq):
         ----------
         .. [#] http://wiki.christophchamp.com/index.php/SEGUID
         """
-        return _lsseguid(self._data.decode("utf8").upper(), alphabet="{protein-extended}")
+        return _lsseguid(
+            self._data.decode("utf8").upper(), alphabet="{protein-extended}"
+        )
 
     def __getitem__(self, key):
         result = super().__getitem__(key)
@@ -232,14 +246,3 @@ class ProteinSeq(_Seq):
         Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990).
         """
         return self._pa().instability_index()
-
-
-if __name__ == "__main__":
-    import os as _os
-
-    cached = _os.getenv("pydna_cached_funcs", "")
-    _os.environ["pydna_cached_funcs"] = ""
-    import doctest
-
-    doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS)
-    _os.environ["pydna_cached_funcs"] = cached
pydna/seqrecord.py
CHANGED

@@ -35,10 +35,10 @@ from copy import copy as _copy
 from pydna import _PydnaWarning
 from warnings import warn as _warn
 
-import logging as _logging
+# import logging as _logging
 import datetime
 
-_module_logger = _logging.getLogger("pydna." + __name__)
+# _module_logger = _logging.getLogger("pydna." + __name__)
 
 
 class SeqRecord(_SeqRecord):
@@ -87,7 +87,9 @@ class SeqRecord(_SeqRecord):
             self.seq = _Seq(self.seq)
 
         self.seq._data = b"".join(self.seq._data.split())  # remove whitespaces
-        self.annotations = {_pretty_str(k): _pretty_str(v) for k, v in self.annotations.items()}
+        self.annotations = {
+            _pretty_str(k): _pretty_str(v) for k, v in self.annotations.items()
+        }
 
     @classmethod
     def from_Bio_SeqRecord(clc, sr: _SeqRecord):
@@ -109,7 +111,9 @@ class SeqRecord(_SeqRecord):
         if len(value) > 16:
             shortvalue = value[:16]
             _warn(
-                ("locus property {} truncated" "to 16 chars {}").format(value, shortvalue),
+                ("locus property {} truncated" "to 16 chars {}").format(
+                    value, shortvalue
+                ),
                 _PydnaWarning,
                 stacklevel=2,
             )
@@ -193,7 +197,7 @@ class SeqRecord(_SeqRecord):
     def translate(self):
         """docstring."""
         p = super().translate()
-        return ProteinSeqRecord(_ProteinSeq(p.seq
+        return ProteinSeqRecord(_ProteinSeq(p.seq))
 
     def add_colors_to_features_for_ape(self):
         """Assign colors to features.
@@ -239,7 +243,9 @@ class SeqRecord(_SeqRecord):
             f.qualifiers["ApEinfo_fwdcolor"] = [cols[i % len(cols)]]
             f.qualifiers["ApEinfo_revcolor"] = [cols[::-1][i % len(cols)]]
 
-    def add_feature(self, x=None, y=None, seq=None, type_="misc", strand=1, *args, **kwargs):
+    def add_feature(
+        self, x=None, y=None, seq=None, type_="misc", strand=1, *args, **kwargs
+    ):
         """Add a feature of type misc to the feature list of the sequence.
 
         Parameters
@@ -327,7 +333,9 @@ class SeqRecord(_SeqRecord):
        | 0 | L:ft2 | --> | 2 | 4 | 2 | misc | no |
        +-----+---------------+-----+-----+-----+-----+------+------+
        """
-        x = _PrettyTable(["Ft#", "Label or Note", "Dir", "Sta", "End", "Len", "type", "orf?"])
+        x = _PrettyTable(
+            ["Ft#", "Label or Note", "Dir", "Sta", "End", "Len", "type", "orf?"]
+        )
         x.align["Ft#"] = "r"  # Left align
         x.align["Label or Note"] = "l"  # Left align
         x.align["Len"] = "r"
@@ -357,7 +365,8 @@ class SeqRecord(_SeqRecord):
                     len(sf),
                     sf.type,
                     {True: "yes", False: "no"}[
-                        self.extract_feature(i).isorf() or self.extract_feature(i).reverse_complement().isorf()
+                        self.extract_feature(i).isorf()
+                        or self.extract_feature(i).reverse_complement().isorf()
                     ],
                 ]
             )
@@ -480,7 +489,9 @@ class SeqRecord(_SeqRecord):
                 f"Stamp change.\nNew: {chksum}\nOld: {oldstamp[0]}",
                 _PydnaWarning,
             )
-        self.annotations["comment"] = (f"{oldcomment}\n" f"{tool} {chksum} {now()} {comment}").strip()
+        self.annotations["comment"] = (
+            f"{oldcomment}\n" f"{tool} {chksum} {now()} {comment}"
+        ).strip()
         return _pretty_str(chksum)
 
     def lcs(self, other, *args, limit=25, **kwargs):
@@ -729,14 +740,3 @@ class ProteinSeqRecord(SeqRecord):
     def __format__(self, format):
         """docstring."""
         return _pretty_str(_SeqRecord.__format__(self, format))
-
-
-if __name__ == "__main__":
-    import os as _os
-
-    cached = _os.getenv("pydna_cached_funcs", "")
-    _os.environ["pydna_cached_funcs"] = ""
-    import doctest
-
-    doctest.testmod(verbose=True, optionflags=(doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE))
-    _os.environ["pydna_cached_funcs"] = cached
pydna/sequence_picker.py
CHANGED

@@ -7,11 +7,12 @@
 
 from pydna.dseqrecord import Dseqrecord
 import os as _os
-
+
+# import logging as _logging
 from Bio.Blast import NCBIWWW
 from Bio.Blast import NCBIXML
 
-_module_logger = _logging.getLogger("pydna." + __name__)
+# _module_logger = _logging.getLogger("pydna." + __name__)
 
 
 email = _os.getenv("pydna_email")
@@ -51,13 +52,3 @@ def genbank_accession(s: str) -> Dseqrecord:
         description=(f"{best_alignment.accession} " f"REGION: {start}..{stop}"),
     )
     return result
-
-
-if __name__ == "__main__":
-    cached = _os.getenv("pydna_cached_funcs", "")
-    _os.environ["pydna_cached_funcs"] = ""
-    import doctest
-
-    doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS)
-    _os.environ["pydna_cached_funcs"] = cached
-    pass
pydna/sequence_regex.py
ADDED

@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+from pydna.dseqrecord import Dseqrecord as _Dseqrecord
+import re
+from Bio.Data.IUPACData import ambiguous_dna_values as _ambiguous_dna_values
+
+ambiguous_only_dna_values = {**_ambiguous_dna_values}
+for normal_base in "ACGT":
+    del ambiguous_only_dna_values[normal_base]
+
+
+def compute_regex_site(site: str) -> str:
+    """
+    Creates a regex pattern from a string that may contain degenerate bases.
+
+    Args:
+        site: The string to convert to a regex pattern.
+
+    Returns:
+        The regex pattern.
+    """
+    upper_site = site.upper()
+    for k, v in ambiguous_only_dna_values.items():
+        if len(v) > 1:
+            upper_site = upper_site.replace(k, f"[{''.join(v)}]")
+
+    # Make case insensitive
+    upper_site = f"(?i){upper_site}"
+    return upper_site
+
+
+def dseqrecord_finditer(pattern: str, seq: _Dseqrecord) -> list[re.Match]:
+    """
+    Finds all matches of a regex pattern in a Dseqrecord.
+
+    Args:
+        pattern: The regex pattern to search for.
+        seq: The Dseqrecord to search in.
+
+    Returns:
+        A list of matches.
+    """
+    query = str(seq.seq) if not seq.circular else str(seq.seq) * 2
+    matches = re.finditer(pattern, query)
+    return (m for m in matches if m.start() <= len(seq))
pydna/tm.py
CHANGED

@@ -213,7 +213,9 @@ def dbd_program(amplicon, tm=tm_dbd, ta=ta_dbd):
 
     """
     PfuSso7d_extension_rate = 15  # seconds/kB PCR product
-    extension_time_PfuSso7d = max(10, int(PfuSso7d_extension_rate * len(amplicon) / 1000))  # seconds
+    extension_time_PfuSso7d = max(
+        10, int(PfuSso7d_extension_rate * len(amplicon) / 1000)
+    )  # seconds
 
     # The program returned is eaither a two step or three step progrem
     # This depends on the tm and length of the primers in the
@@ -324,7 +326,10 @@ def tmbresluc(primer: str, *args, primerc=500.0, saltc=50, **kwargs):
         dH += _thermodynamic_data.dHBr[n1 - 97][n2 - 97]
         dS += _thermodynamic_data.dSBr[n1 - 97][n2 - 97]
 
-    tm = (dH / (1.9872 * _math.log(pri / 1600) + dS) + (16.6 * _math.log(saltc)) / _math.log(10)) - 273.15
+    tm = (
+        dH / (1.9872 * _math.log(pri / 1600) + dS)
+        + (16.6 * _math.log(saltc)) / _math.log(10)
+    ) - 273.15
 
     return tm
 
@@ -365,25 +370,18 @@ def tm_neb(primer, conc=0.5, prodcode="q5-0"):
     try:
         res = requests.get(url, params=params, headers=headers)
     except requests.exceptions.ConnectionError as e:
-        raise requests.exceptions.ConnectionError("Could not connect to NEB API.") from e
+        raise requests.exceptions.ConnectionError(
+            "Could not connect to NEB API."
+        ) from e
     if res.status_code != 200:
         if "error" in res.json():
             raise requests.exceptions.HTTPError(res.status_code, res.json()["error"])
         else:
-            raise requests.exceptions.HTTPError(res.status_code, res.text)  # pragma: no cover
+            raise requests.exceptions.HTTPError(
+                res.status_code, res.text
+            )  # pragma: no cover
     r = res.json()
     if r["success"]:
         return r["data"]["tm1"]
    else:
        raise requests.exceptions.HTTPError(r["error"])
-
-
-if __name__ == "__main__":
-    import os as _os
-
-    cached = _os.getenv("pydna_cached_funcs", "")
-    _os.environ["pydna_cached_funcs"] = ""
-    import doctest
-
-    doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS)
-    _os.environ["pydna_cached_funcs"] = cached
pydna/types.py
ADDED

@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+"""
+Types used in the pydna package.
+"""
+
+from typing import (
+    TYPE_CHECKING,
+    Tuple as _Tuple,
+    Union as _Union,
+    TypeVar as _TypeVar,
+    Iterable as _Iterable,
+    Callable as _Callable,
+)
+
+if TYPE_CHECKING:
+    from Bio.Restriction import AbstractCut as _AbstractCut
+    from Bio.Restriction import RestrictionBatch as _RestrictionBatch
+    from pydna.dseq import Dseq
+    from Bio.SeqFeature import Location as _Location
+    from pydna.dseqrecord import Dseqrecord as _Dseqrecord
+
+
+# To represent any subclass of Dseq
+DseqType = _TypeVar("DseqType", bound="Dseq")
+EnzymesType = _TypeVar(
+    "EnzymesType", "_RestrictionBatch", _Iterable["_AbstractCut"], "_AbstractCut"
+)
+CutSiteType = _Tuple[_Tuple[int, int], _Union["_AbstractCut", None]]
+AssemblyEdgeType = _Tuple[int, int, "_Location | None", "_Location | None"]
+AssemblySubFragmentType = _Tuple[int, "_Location | None", "_Location | None"]
+EdgeRepresentationAssembly = list[AssemblyEdgeType]
+SubFragmentRepresentationAssembly = list[AssemblySubFragmentType]
+
+
+# Type alias that describes overlap between two sequences x and y
+# the two first numbers are the positions where the overlap starts on x and y
+# the third number is the length of the overlap
+SequenceOverlap = _Tuple[int, int, int]
+AssemblyAlgorithmType = _Callable[
+    ["_Dseqrecord", "_Dseqrecord", int], list[SequenceOverlap]
+]
pydna/utils.py
CHANGED

@@ -8,13 +8,15 @@
 
 from Bio.Data.IUPACData import ambiguous_dna_complement as _ambiguous_dna_complement
 from Bio.Seq import _maketrans
-
-import
+
+# import shelve as _shelve
+# import os as _os
 import re as _re
-
-import
-import
-import
+
+# import logging as _logging
+# import base64 as _base64
+# import pickle as _pickle
+# import hashlib as _hashlib
 import keyword as _keyword
 import collections as _collections
 import itertools as _itertools
@@ -31,13 +33,14 @@ from pydna.codon import rare_codons as _rare_codons
 
 from Bio.SeqFeature import SimpleLocation as _sl
 from Bio.SeqFeature import CompoundLocation as _cl
+from Bio.SeqFeature import Location as _Location
 
 from typing import Union as _Union, TypeVar as _TypeVar, List as _List
 
 # For functions that take str or bytes as input and return str or bytes as output, matching the input type
 StrOrBytes = _TypeVar("StrOrBytes", str, bytes)
 
-_module_logger = _logging.getLogger("pydna." + __name__)
+# _module_logger = _logging.getLogger("pydna." + __name__)
 _ambiguous_dna_complement.update({"U": "A"})
 _complement_table = _maketrans(_ambiguous_dna_complement)
 
@@ -71,7 +74,9 @@ def three_frame_orfs(
                pass
            else:
                if stopindex - startindex >= limit:
-                    orfs.append((frame, startindex * 3 + frame, (stopindex + 1) * 3 + frame))
+                    orfs.append(
+                        (frame, startindex * 3 + frame, (stopindex + 1) * 3 + frame)
+                    )
                    # print(stopindex, startindex, limit)
    return orfs
 
@@ -82,13 +87,17 @@ def shift_location(original_location, shift, lim):
     strand = original_location.strand
     if lim is None:
         if min(original_location) + shift < 0:
-            raise ValueError("Shift moves location below zero, use a `lim` to loop around if sequence is circular.")
+            raise ValueError(
+                "Shift moves location below zero, use a `lim` to loop around if sequence is circular."
+            )
         lim = _sys.maxsize
 
     for part in original_location.parts:
         new_start = (part.start + shift) % lim
         new_end = (part.end + shift) % lim or lim
-        old_start, old_end = ((newparts[-1].start, newparts[-1].end) if len(newparts) else (None, None))
+        old_start, old_end = (
+            (newparts[-1].start, newparts[-1].end) if len(newparts) else (None, None)
+        )
 
         # The "join with old" cases are for features with multiple parts
         # in which consecutive parts do not have any bases between them.
@@ -278,49 +287,49 @@ def complement(sequence: str):
     return sequence.translate(_complement_table)
 
 
-def memorize(filename):
-    """Cache functions and classes.
+# def memorize(filename):
+#     """Cache functions and classes.
 
-    see pydna.download
-    """
+#     see pydna.download
+#     """
 
-    def decorator(f):
-        def wrappee(*args, **kwargs):
-            _module_logger.info("#### memorizer ####")
-            _module_logger.info("cache filename = %s", filename)
-            _module_logger.info(
-                "os.environ['pydna_cached_funcs'] = %s",
-                _os.getenv("pydna_cached_funcs", ""),
-            )
-            if filename not in _os.getenv("pydna_cached_funcs", ""):
-                _module_logger.info("cache filename not among cached functions, made it new!")
-                return f(*args, **kwargs)
-            key = _base64.urlsafe_b64encode(_hashlib.sha1(_pickle.dumps((args, kwargs))).digest()).decode("ascii")
-            _module_logger.info("key = %s", key)
-            cache = _shelve.open(
-                _os.path.join(_os.environ["pydna_data_dir"], identifier_from_string(filename)),
-                writeback=False,
-            )
-            try:
-                result = cache[key]
-            except KeyError:
-                _module_logger.info(
-                    "no result for key %s in shelve %s",
-                    key,
-                    identifier_from_string(filename),
-                )
-                result = f(*args, **kwargs)
-                _module_logger.info("made it new!")
-                cache[key] = result
-                _module_logger.info("saved result under key %s", key)
-            else:
-                _module_logger.info("found %s in cache", key)
-            cache.close()
-            return result
+#     def decorator(f):
+#         def wrappee(*args, **kwargs):
+#             _module_logger.info("#### memorizer ####")
+#             _module_logger.info("cache filename = %s", filename)
+#             _module_logger.info(
+#                 "os.environ['pydna_cached_funcs'] = %s",
+#                 _os.getenv("pydna_cached_funcs", ""),
+#             )
+#             if filename not in _os.getenv("pydna_cached_funcs", ""):
+#                 _module_logger.info("cache filename not among cached functions, made it new!")
+#                 return f(*args, **kwargs)
+#             key = _base64.urlsafe_b64encode(_hashlib.sha1(_pickle.dumps((args, kwargs))).digest()).decode("ascii")
+#             _module_logger.info("key = %s", key)
+#             cache = _shelve.open(
+#                 _os.path.join(_os.environ["pydna_data_dir"], identifier_from_string(filename)),
+#                 writeback=False,
+#             )
+#             try:
+#                 result = cache[key]
+#             except KeyError:
+#                 _module_logger.info(
+#                     "no result for key %s in shelve %s",
+#                     key,
+#                     identifier_from_string(filename),
+#                 )
+#                 result = f(*args, **kwargs)
+#                 _module_logger.info("made it new!")
+#                 cache[key] = result
+#                 _module_logger.info("saved result under key %s", key)
+#             else:
+#                 _module_logger.info("found %s in cache", key)
+#             cache.close()
+#             return result
 
-        return wrappee
+#         return wrappee
 
-    return decorator
+#     return decorator
 
 
 def identifier_from_string(s: str) -> str:
@@ -505,7 +514,11 @@ def randomORF(length, maxlength=None):
     starts = ("ATG",)
     stops = ("TAA", "TAG", "TGA")
 
-    return random.choice(starts) + "".join([random.choice(cdns) for x in range(length)]) + random.choice(stops)
+    return (
+        random.choice(starts)
+        + "".join([random.choice(cdns) for x in range(length)])
+        + random.choice(stops)
+    )
 
 
 def randomprot(length, maxlength=None):
@@ -614,7 +627,9 @@ def eq(*args, **kwargs):
        if kwargs["circular"] is False:
            topology = "linear"
        else:
-            topology = set([arg.circular if hasattr(arg, "circular") else None for arg in args])
+            topology = set(
+                [arg.circular if hasattr(arg, "circular") else None for arg in args]
+            )
 
    if len(topology) != 1:
        raise ValueError("sequences have different topologies")
@@ -625,7 +640,10 @@
            topology = "circular"
 
    args = [arg.seq if hasattr(arg, "seq") else arg for arg in args]
-    args_string_list = [arg.watson.lower() if hasattr(arg, "watson") else str(arg).lower() for arg in args]
+    args_string_list = [
+        arg.watson.lower() if hasattr(arg, "watson") else str(arg).lower()
+        for arg in args
+    ]
 
    length = set((len(s) for s in args_string_list))
 
@@ -735,10 +753,107 @@ def locations_overlap(loc1: _Union[_sl, _cl], loc2: _Union[_sl, _cl], seq_len):
     return False
 
 
-
-
-
-
+def sum_is_sticky(
+    three_prime_end: tuple[str, str],
+    five_prime_end: tuple[str, str],
+    partial: bool = False,
+) -> int:
+    """Return the overlap length if the 3' end of seq1 and 5' end of seq2 ends are sticky and compatible for ligation.
+    Return 0 if they are not compatible."""
+    type_seq1, sticky_seq1 = three_prime_end
+    type_seq2, sticky_seq2 = five_prime_end
+
+    if (
+        "blunt" != type_seq2
+        and type_seq2 == type_seq1
+        and str(sticky_seq2) == str(rc(sticky_seq1))
+    ):
+        return len(sticky_seq1)
+
+    if not partial:
+        return 0
+
+    if type_seq1 != type_seq2 or type_seq2 == "blunt":
+        return 0
+    elif type_seq2 == "5'":
+        sticky_seq1 = str(rc(sticky_seq1))
+    elif type_seq2 == "3'":
+        sticky_seq2 = str(rc(sticky_seq2))
+
+    ovhg_len = min(len(sticky_seq1), len(sticky_seq2))
+    # [::-1] to try the longest overhangs first
+    for i in range(1, ovhg_len + 1)[::-1]:
+        if sticky_seq1[-i:] == sticky_seq2[:i]:
+            return i
+    else:
+        return 0
+
+
+def limit_iterator(iterator, limit):
+    """
+    Call the function with an iterator to raise an error if the number of items is greater than the limit.
+    """
+    for i, x in enumerate(iterator):
+        if i >= limit:
+            raise ValueError(f"Too many possible paths (more than {limit})")
+        yield x
+
+
+def create_location(
+    start: int, end: int, lim: int, strand: int | None = None
+) -> _Location:
+    """
+    Create a location object from a start and end position.
+    If the end position is less than the start position, the location is circular. It handles negative positions.
+
+    Parameters
+    ----------
+    start : int
+        The start position of the location.
+    end : int
+        The end position of the location.
+    lim : int
+        The length of the sequence.
+    strand : int, optional
+        The strand of the location. None, 1 or -1.
 
-
-
+    Returns
+    -------
+    location : Location
+        The location object. Can be a SimpleLocation or a CompoundLocation if the feature spans the origin of
+        a circular sequence.
+
+    Examples
+    --------
+    >>> from pydna.utils import create_location
+    >>> str(create_location(0, 5, 10,-1))
+    '[0:5](-)'
+    >>> str(create_location(0, 5, 10,+1))
+    '[0:5](+)'
+    >>> str(create_location(0, 5, 10))
+    '[0:5]'
+    >>> str(create_location(8, 2, 10))
+    'join{[8:10], [0:2]}'
+    >>> str(create_location(8, 2, 10,-1))
+    'join{[0:2](-), [8:10](-)}'
+    >>> str(create_location(-2, 2, 10))
+    'join{[8:10], [0:2]}'
+
+    Note this special case, 0 is the same as len(seq)
+    >>> str(create_location(5, 0, 10))
+    '[5:10]'
+
+    Note the special case where if start and end are the same,
+    the location spans the entire sequence (it's not empty).
+    >>> str(create_location(5, 5, 10))
+    'join{[5:10], [0:5]}'
+
+    """
+    while start < 0:
+        start += lim
+    while end < 0:
+        end += lim
+    if end > start:
+        return _sl(start, end, strand)
+    else:
+        return shift_location(_sl(start, end + lim, strand), 0, lim)