pydna 5.5.3__py3-none-any.whl → 5.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydna/__init__.py +24 -193
- pydna/_pretty.py +8 -8
- pydna/_thermodynamic_data.py +3 -3
- pydna/alphabet.py +995 -0
- pydna/amplicon.py +19 -24
- pydna/amplify.py +75 -95
- pydna/assembly.py +64 -81
- pydna/assembly2.py +650 -405
- pydna/codon.py +4 -4
- pydna/common_sub_strings.py +6 -8
- pydna/contig.py +203 -10
- pydna/design.py +176 -60
- pydna/download.py +6 -15
- pydna/dseq.py +1794 -718
- pydna/dseqrecord.py +220 -171
- pydna/gateway.py +6 -6
- pydna/gel.py +5 -5
- pydna/genbank.py +43 -46
- pydna/genbankfixer.py +89 -92
- pydna/ladders.py +11 -12
- pydna/oligonucleotide_hybridization.py +124 -0
- pydna/opencloning_models.py +680 -0
- pydna/parsers.py +45 -32
- pydna/primer.py +4 -4
- pydna/primer_screen.py +833 -0
- pydna/readers.py +14 -9
- pydna/seq.py +137 -47
- pydna/seqrecord.py +54 -62
- pydna/sequence_picker.py +2 -5
- pydna/sequence_regex.py +6 -6
- pydna/tm.py +17 -17
- pydna/types.py +21 -18
- pydna/utils.py +97 -75
- {pydna-5.5.3.dist-info → pydna-5.5.5.dist-info}/METADATA +14 -46
- pydna-5.5.5.dist-info/RECORD +43 -0
- {pydna-5.5.3.dist-info → pydna-5.5.5.dist-info}/WHEEL +1 -1
- pydna/conftest.py +0 -42
- pydna/genbankfile.py +0 -42
- pydna/genbankrecord.py +0 -168
- pydna/goldengate.py +0 -45
- pydna/ligate.py +0 -62
- pydna/user_cloning.py +0 -29
- pydna-5.5.3.dist-info/RECORD +0 -45
- {pydna-5.5.3.dist-info → pydna-5.5.5.dist-info/licenses}/LICENSE.txt +0 -0
pydna/alphabet.py
ADDED
|
@@ -0,0 +1,995 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
dscode - The nucleic acid alphabet used in pydna
|
|
6
|
+
|
|
7
|
+
This file serves to define dscode, the DNA alphabet used in pydna.
|
|
8
|
+
Each symbol represents a basepair (two opposing bases in the two antiparallel
|
|
9
|
+
DNA strands).
|
|
10
|
+
|
|
11
|
+
The alphabet is defined at the end of this docstring, which serves as the single
|
|
12
|
+
source of truth. The alphabet is used to construct the codestrings dictionary
|
|
13
|
+
which has the following keys (strings) in the order indicated:
|
|
14
|
+
|
|
15
|
+
1. un_ambiguous_ds_dna
|
|
16
|
+
2. ds_rna
|
|
17
|
+
3. ambiguous_ds_dna
|
|
18
|
+
4. single_stranded_dna_rna
|
|
19
|
+
5. loops_dna_rna
|
|
20
|
+
6. mismatched_dna_rna
|
|
21
|
+
7. gap
|
|
22
|
+
|
|
23
|
+
Each value of the codestrings dictionary is a multiline string. This string
|
|
24
|
+
has five lines following this form:
|
|
25
|
+
|
|
26
|
+
::
|
|
27
|
+
|
|
28
|
+
W 1 Watson symbol
|
|
29
|
+
| 2 Pipe
|
|
30
|
+
C 3 Crick symbol
|
|
31
|
+
<empty line> 4
|
|
32
|
+
S 5 dscode symbol
|
|
33
|
+
|
|
34
|
+
W (line 1) and C (line 3) are complementary bases in a double stranded DNA
|
|
35
|
+
molecule and S (line 5) are the symbols of the alphabet used to
|
|
36
|
+
describe the base pair above the symbol.
|
|
37
|
+
|
|
38
|
+
Line 2 must contain only the pipe character, indicating basepairing and
|
|
39
|
+
line 4 must be empty. The lines must be of equal length and a series of
|
|
40
|
+
tests are performed to ensure the integrity of the alphabet.
|
|
41
|
+
|
|
42
|
+
The string definition as well as the keys for the codestrings dict follow this
|
|
43
|
+
line and is contained in the last 13 lines of the docstring:
|
|
44
|
+
|
|
45
|
+
un_ambiguous_ds_dna
|
|
46
|
+
| ds_rna
|
|
47
|
+
| | ambiguous_ds_dna
|
|
48
|
+
| | | single_stranded_dna_rna
|
|
49
|
+
| | | | loops_dna_rna
|
|
50
|
+
| | | | | mismatched_dna_rna
|
|
51
|
+
| | | | | | gap
|
|
52
|
+
| | | | | | |
|
|
53
|
+
GATC UA RYMKSWHBVDN GATC••••U• -----AGCTU AAACCCGGGTTTUUUGCT •
|
|
54
|
+
|||| || ||||||||||| |||||||||| |||||||||| |||||||||||||||||| |
|
|
55
|
+
CTAG AU YRKMSWDVBHN ••••CTAG•U AGCTU----- ACGACTAGTCGTGCTUUU •
|
|
56
|
+
|
|
57
|
+
GATC UO RYMKSWHBVDN PEXIQFZJ$% 0123456789 !#{}&*()<>@:?[]=_; •
|
|
58
|
+
|
|
59
|
+
"""
|
|
60
|
+
import re
|
|
61
|
+
from dataclasses import dataclass
|
|
62
|
+
|
|
63
|
+
__all__ = [
|
|
64
|
+
# Core alphabet dictionaries
|
|
65
|
+
"basepair_dict",
|
|
66
|
+
"annealing_dict",
|
|
67
|
+
"annealing_dict_w_holes",
|
|
68
|
+
"complement_dict_for_dscode",
|
|
69
|
+
# Translation tables (str.translate, bytes.translate)
|
|
70
|
+
"complement_table_for_dscode",
|
|
71
|
+
"dscode_to_watson_table",
|
|
72
|
+
"dscode_to_crick_table",
|
|
73
|
+
"dscode_to_watson_tail_table",
|
|
74
|
+
"dscode_to_crick_tail_table",
|
|
75
|
+
"dscode_to_full_sequence_table",
|
|
76
|
+
# Alphabet subsets
|
|
77
|
+
"ds_letters",
|
|
78
|
+
"ss_letters_watson",
|
|
79
|
+
"ss_letters_crick",
|
|
80
|
+
# Regex helpers and factories
|
|
81
|
+
"iupac_compl_regex",
|
|
82
|
+
"regex_ss_melt_factory",
|
|
83
|
+
"regex_ds_melt_factory",
|
|
84
|
+
# Data structures
|
|
85
|
+
"DseqParts",
|
|
86
|
+
# Public helper functions
|
|
87
|
+
"get_parts",
|
|
88
|
+
"dsbreaks",
|
|
89
|
+
"representation_tuple",
|
|
90
|
+
"anneal_strands",
|
|
91
|
+
]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# An alias for whitespace
emptyspace = chr(32)

# ============================================================================
# Alphabet definition extracted from module docstring
# ============================================================================

# Only the last 13 lines of the module docstring are read: the lines naming
# the groups, one line of connectors and the five line alphabet definition.
lines = __doc__.rstrip().splitlines()[-13:]

# The last five lines are: Watson, pipes, Crick, an empty line and the symbols.
uppers, pipes, lowers = lines[-5:-2]
dscode = lines[-1]

assert lines[-2] == ""  # line 4 has to be empty
assert set(pipes) == {" ", "|"}  # line 2 holds pipes (and separators) only

# Every line has to be made up of the same number of whitespace separated groups
_group_count = len(dscode.split())
assert all(len(row.split()) == _group_count for row in (uppers, lowers, pipes))

# The first lines that were read name the groups, one name per group
names = [line.strip("| ") for line in lines[:_group_count]]
|
|
121
|
+
|
|
122
|
+
# ============================================================================
# Construct the codestrings dict
# ============================================================================

# One five line string (Watson, pipes, Crick, empty line, symbols) per group
# name. The bullet character that marks an empty position in the docstring is
# replaced by a space.
codestrings = {
    name.strip(): "\n".join((upper, pipe, lower, "", code, "")).replace(
        "•", emptyspace
    )
    for upper, pipe, lower, code, name in zip(
        uppers.split(), pipes.split(), lowers.split(), dscode.split(), names
    )
}
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# ============================================================================
# Define ascii letters not used in the alphabet
# ============================================================================

letters_not_in_dscode = "lL\"',-./\\^`|+~"


# ============================================================================
# The for loop below carries out a series of consistency checks
# ============================================================================

for name, codestring in codestrings.items():

    lines = codestring.splitlines()

    assert len(lines) == 5, f'codestring["{name}"] does not have 5 lines'

    # Watson, pipes, Crick, an empty line and the dscode symbols, in this order
    watsn, pipes, crick, empty, symbl = lines

    # The fourth line separates the base pairs from the symbols
    assert not empty, f'codestring["{name}"] has a non-empty line 4'

    # Check that all letters are ascii symbols.
    assert all(
        ln.isascii() for ln in (watsn, crick, symbl)
    ), f'codestring["{name}"] has non-ascii letters'

    # Verify that every character that has an uppercase form is uppercase.
    # Comparing with the upper cased line also covers lines that mix letters
    # with spaces, digits or punctuation.
    assert all(
        ln == ln.upper() for ln in (watsn, crick, symbl)
    ), f'codestring["{name}"] has non-uppercase letters'

    # Check that the second line contains only "|"
    assert set(pipes) == {
        "|"
    }, f'codestring["{name}"] has non-pipe character(s) in line 2'

    # Check that the strings are the same length
    assert all(
        len(ln) == len(watsn) for ln in (pipes, crick, symbl)
    ), f'codestring["{name}"] has lines of unequal length'

    # Check that the letters in the letters_not_in_dscode string
    # are not used as symbols.
    assert not any(
        letter in letters_not_in_dscode for letter in symbl
    ), f'codestring["{name}"] has chars outside alphabet'
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# ============================================================================
# The `codes` dictionary is a dict of dicts holding the information of the
# code strings. The keys are the group names and each value is a
# {tuple: string} dict with this structure:
#
# (Watson letter, Crick letter): dscode symbol
# ============================================================================

codes = {}

for name, codestring in codestrings.items():
    # The pipes (second line) and the empty fourth line carry no information.
    watsons, _, cricks, _, symbols = codestring.splitlines()

    # Read the three remaining lines column by column.
    codes[name] = dict(zip(zip(watsons, cricks), symbols))
|
|
208
|
+
|
|
209
|
+
# ============================================================================
# The `basepair_dict` dictionary is a merge of a subset of the `codes` dict.
# The "mismatched_dna_rna" and "loops_dna_rna" groups are not merged.
# ============================================================================

basepair_dict = {
    **codes["un_ambiguous_ds_dna"],
    **codes["ambiguous_ds_dna"],
    **codes["ds_rna"],
    **codes["single_stranded_dna_rna"],
    **codes["gap"],
}
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
# ============================================================================
# The `annealing_dict` dictionary contains pairs of dscode symbols for single
# stranded DNA and the dscode symbol of the base pair formed when they anneal.
# ============================================================================

# The annealing_dict holds what is needed to tell if the single stranded
# regions of two DNA fragments (like a and b below) can anneal.
#
# The dict has the form (x, y): s
#
# x and y are dscode symbols taken from the values of the
# codes["single_stranded_dna_rna"] dictionary and s is the dscode symbol for
# the base pair that is formed.
#
# For example, one key-value pair is ('P', 'Q'): 'G' which describes the first
# of the four new base pairs formed between a and b in the example below.
#
#   (a)
#   gggPEXI           (dscode for a)
#
#   gggGATC
#   ccc
#             aaa     (b)
#         CTAGttt
#
#         QFZJaaa     (dscode for b)
#
#
#   gggGATCaaa        (annealing product between a and b)
#   cccCTAGttt
#
# The loop below goes through the base pairs where the upper or the lower
# position is empty. (x, y), symbol is ("G", " "), "P" in the first iteration.

annealing_dict = {}

temp = {**codes["un_ambiguous_ds_dna"], **codes["ds_rna"]}

# Alias to make the code below more readable.
d = codes["single_stranded_dna_rna"]

for (x, y), symbol in d.items():
    if emptyspace not in (x, y):
        raise ValueError("This should not happen")

    if y == emptyspace:
        # Only the Watson base is present: look up its first Crick partner
        other = next(c for w, c in temp if w == x)
        symbol_other = d[emptyspace, other]
        annealing_dict[symbol, symbol_other] = temp[x, other]
        annealing_dict[symbol_other, symbol] = temp[x, other]
    else:
        # Only the Crick base is present: look up its first Watson partner
        other = next(w for w, c in temp if c == y)
        symbol_other = d[other, emptyspace]
        annealing_dict[symbol, symbol_other] = temp[other, y]
        annealing_dict[symbol_other, symbol] = temp[other, y]

del d, temp
|
|
285
|
+
|
|
286
|
+
# ============================================================================
# The `annealing_dict_w_holes` contains the `annealing_dict` and additional
# keys where one of the two positions is empty. The value for such a key is
# the symbol that is present.
# ============================================================================

temp = {}

for x, y in annealing_dict:
    temp.update({(x, emptyspace): x, (emptyspace, y): y})

annealing_dict_w_holes = {**annealing_dict, **temp}

del temp
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
# ============================================================================
# translation tables
# ============================================================================

# Translation tables are a practical way to obtain the Watson and Crick
# strands from dscode, or the complement, when needed.

# The tables are meant to be used with the bytes.translate method.


# ============================================================================
# The translation table "complement_table_for_dscode" is used to obtain the
# complement of a DNA sequence in dscode format.
# ============================================================================

# The complement of a symbol is the symbol for the same base pair with the
# Watson and Crick letters swapped.
complement_dict_for_dscode = {}

for (w, c), s in basepair_dict.items():
    complement_dict_for_dscode[s] = basepair_dict[c, w]

from_letters = "".join(complement_dict_for_dscode)
to_letters = "".join(complement_dict_for_dscode.values())

# Lower case symbols translate to lower case complements
from_letters = from_letters + from_letters.lower()
to_letters = to_letters + to_letters.lower()

complement_table_for_dscode = bytes.maketrans(
    from_letters.encode("ascii"), to_letters.encode("ascii")
)
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
# ============================================================================
# dscode_to_watson_table and dscode_to_crick_table
# ============================================================================

# dscode_to_watson_table and dscode_to_crick_table are used to obtain the Watson
# and (reverse) Crick strands from dscode.

# Two extra letters (placeholder1 and placeholder2) are added to the tables.
# Each of them translates to an empty space in one strand and to the interval
# character in the other. They are used in the representation_tuple function
# to add range indicators ("..") in the watson or crick strings for
# representation of long sequences.

dscode_sense = ""
dscode_compl = ""
watson = ""
crick = ""
dscode_sense_lower = ""
dscode_compl_lower = ""
watson_lower = ""
crick_lower = ""

# NOTE: the loop variable rebinds the module level name `dscode`.
for (w, c), dscode in basepair_dict.items():
    complement = basepair_dict[c, w]
    dscode_sense += dscode
    dscode_compl += complement
    watson += w
    crick += c
    dscode_lower = dscode.lower()
    # Symbols without a lower case form are already present in dscode_sense,
    # adding them a second time would only duplicate table entries.
    if dscode_lower in dscode_sense:
        continue
    dscode_sense_lower += dscode_lower
    watson_lower += w.lower()
    crick_lower += c.lower()
    # Add the complement of this symbol only, so that dscode_compl_lower stays
    # aligned position by position with dscode_sense_lower.
    dscode_compl_lower += complement.lower()

placeholder1 = "~"
placeholder2 = "+"
interval = "."

# The extra letters must not collide with any dscode symbol
assert placeholder1 in letters_not_in_dscode
assert placeholder2 in letters_not_in_dscode
assert interval in letters_not_in_dscode

dscode_to_watson_table = bytes.maketrans(
    (dscode_sense + dscode_sense_lower + placeholder1 + placeholder2).encode("ascii"),
    (watson + watson_lower + emptyspace + interval).encode("ascii"),
)

dscode_to_crick_table = bytes.maketrans(
    (dscode_sense + dscode_sense_lower + placeholder1 + placeholder2).encode("ascii"),
    (crick + crick_lower + interval + emptyspace).encode("ascii"),
)
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
# ============================================================================
# dscode_to_watson_tail_table
# ============================================================================

# Watson letter -> dscode symbol for positions where the Crick strand is empty
watson_tail_letter_dict = {}

for (w, c), s in codes["single_stranded_dna_rna"].items():
    if c.isspace():
        watson_tail_letter_dict[w] = s

from_letters = "".join(watson_tail_letter_dict)
to_letters = "".join(watson_tail_letter_dict.values())

# Extend both strings with their lower case forms
from_letters = from_letters + from_letters.lower()
to_letters = to_letters + to_letters.lower()

dscode_to_watson_tail_table = bytes.maketrans(
    from_letters.encode("ascii"), to_letters.encode("ascii")
)

# The single stranded symbols are also the first half of the input letters of
# the dscode_to_full_sequence_table, which translates in the other direction.
five_prime_ss_letters = to_letters
from_letters_full = to_letters
to_letters_full = from_letters
|
|
412
|
+
|
|
413
|
+
# ============================================================================
# dscode_to_crick_tail_table
# ============================================================================

# Complement of the Crick letter -> dscode symbol for positions where the
# Watson strand is empty
crick_tail_letter_dict = {}

for (w, c), s in codes["single_stranded_dna_rna"].items():
    if w.isspace():
        crick_tail_letter_dict[complement_dict_for_dscode[c]] = s

from_letters = "".join(crick_tail_letter_dict)
to_letters = "".join(crick_tail_letter_dict.values())

# Extend both strings with their lower case forms
from_letters = from_letters + from_letters.lower()
to_letters = to_letters + to_letters.lower()

dscode_to_crick_tail_table = bytes.maketrans(
    from_letters.encode("ascii"), to_letters.encode("ascii")
)

three_prime_ss_letters = to_letters
from_letters_full = from_letters_full + to_letters
to_letters_full = to_letters_full + from_letters


# ============================================================================
# dscode_to_full_sequence_table
# ============================================================================

# Translates the single stranded symbols collected for the two tail tables
# back to the letters they were made from.
dscode_to_full_sequence_table = bytes.maketrans(
    from_letters_full.encode("ascii"), to_letters_full.encode("ascii")
)
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
# Add lower and mixed case keys to basepair_dict. The case of the symbol
# follows the case of the first (Watson) letter of the key. Where one position
# is empty, the case follows the letter that is present.
mixed_case_dict = {}

for (x, y), symbol in basepair_dict.items():
    entries = [
        ((x.lower(), y.lower()), symbol.lower()),
        ((x.lower(), y.upper()), symbol.lower()),
        ((x.upper(), y.lower()), symbol.upper()),
    ]
    if x == emptyspace:
        entries += [
            ((x, y.lower()), symbol.lower()),
            ((x, y.upper()), symbol.upper()),
        ]
    if y == emptyspace:
        entries += [
            ((x.lower(), y), symbol.lower()),
            ((x.upper(), y), symbol.upper()),
        ]
    # Later entries replace earlier ones for the same key
    mixed_case_dict.update(entries)

del entries

# Add mixed case entries to the dict
basepair_dict.update(mixed_case_dict)

# The same for annealing_dict, which has no keys with an empty position.
mixed_case_dict = {}

for (x, y), symbol in annealing_dict.items():
    mixed_case_dict.update(
        [
            ((x.lower(), y.lower()), symbol.lower()),
            ((x.lower(), y.upper()), symbol.lower()),
            ((x.upper(), y.lower()), symbol.upper()),
        ]
    )

# Add mixed case entries to the dict
annealing_dict.update(mixed_case_dict)
|
|
476
|
+
|
|
477
|
+
# Symbols for base paired positions, upper case followed by lower case
ds_letters = "".join(
    "".join(codes[group].values())
    for group in ("un_ambiguous_ds_dna", "ds_rna", "ambiguous_ds_dna")
)
ds_letters = ds_letters + ds_letters.lower()

# Symbols for positions where only the Watson strand is present
ss_letters_watson = "".join(
    symbol
    for (_, crick_base), symbol in codes["single_stranded_dna_rna"].items()
    if crick_base == emptyspace
)
ss_letters_watson = ss_letters_watson + ss_letters_watson.lower()

# Symbols for positions where only the Crick strand is present
ss_letters_crick = "".join(
    symbol
    for (watson_base, _), symbol in codes["single_stranded_dna_rna"].items()
    if watson_base == emptyspace
)
ss_letters_crick = ss_letters_crick + ss_letters_crick.lower()
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
# ============================================================================
# iupac_compl_regex dict of regexes below cover IUPAC Ambiguity Code
# complements and is used in the amplify module.
#
# Each value is a non-capturing group that matches exactly one letter: any
# base that can pair with the key, or the ambiguity code for those bases.
# ============================================================================

iupac_compl_regex = {
    "A": "(?:T|U)",
    "C": "(?:G)",
    "G": "(?:C)",
    "T": "(?:A)",
    "U": "(?:A)",
    "R": "(?:T|C|Y)",
    "Y": "(?:G|A|R)",
    "S": "(?:G|C|S)",
    "W": "(?:A|T|W)",
    # K is G or T, so its complement is C, A or the code M (A or C).
    "K": "(?:C|A|M)",
    "M": "(?:T|G|K)",
    "B": "(?:C|G|A|V)",
    "D": "(?:A|C|T|H)",
    "H": "(?:A|G|T|D)",
    "V": "(?:T|C|G|B)",
    "N": "(?:A|G|C|T|N)",
}
|
|
518
|
+
|
|
519
|
+
# Add lower and mixed case keys to annealing_dict_w_holes. The case of the
# symbol follows the case of the first letter of the key.
mixed_case_dict = {}

for (x, y), symbol in annealing_dict_w_holes.items():
    mixed_case_dict[x.lower(), y.lower()] = symbol.lower()
    mixed_case_dict[x.lower(), y.upper()] = symbol.lower()
    mixed_case_dict[x.upper(), y.lower()] = symbol.upper()

    # An empty position has no case, so the three generic assignments above
    # would give keys with an empty first position the inverted case. Let the
    # case follow the letter that is present instead, in the same way as for
    # basepair_dict.
    if x == emptyspace:
        mixed_case_dict[x, y.lower()] = symbol.lower()
        mixed_case_dict[x, y.upper()] = symbol.upper()
    if y == emptyspace:
        mixed_case_dict[x.lower(), y] = symbol.lower()
        mixed_case_dict[x.upper(), y] = symbol.upper()

# Add mixed case entries to the dict
annealing_dict_w_holes.update(mixed_case_dict)
|
|
528
|
+
|
|
529
|
+
# ============================================================================
|
|
530
|
+
# DseqParts dataclass
|
|
531
|
+
# ============================================================================
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
@dataclass
class DseqParts:
    """
    The seven string fields that get_parts extracts from a dscode string.

    Instances can be unpacked and indexed like a tuple, the fields come in
    the order they are declared.
    """

    # Single stranded symbols at the left end (Watson, then Crick strand)
    sticky_left5: str
    sticky_left3: str
    # Everything between the two ends
    middle: str
    # Single stranded symbols at the right end (Watson, then Crick strand)
    sticky_right3: str
    sticky_right5: str
    # Filled when only one of the two strands is present
    single_watson: str
    single_crick: str

    def __iter__(self):
        """
        Iterate over the field values, which makes unpacking possible.

        >>> from pydna.alphabet import get_parts
        >>> sticky_left5, sticky_left3, middle, sticky_right3, sticky_right5, single_watson, single_crick = get_parts("eeATCGuggCCGgg")
        >>> sticky_left5
        'ee'
        >>> middle
        'ATCGuggCCGgg'
        """
        field_order = (
            "sticky_left5",
            "sticky_left3",
            "middle",
            "sticky_right3",
            "sticky_right5",
            "single_watson",
            "single_crick",
        )
        return iter(tuple(getattr(self, field) for field in field_order))

    def __getitem__(self, index: int) -> str:
        """
        Return the field value at position ``index``.

        >>> from pydna.alphabet import get_parts
        >>> parts = get_parts("eeATCGuggCCGgg")
        >>> parts[0]
        'ee'
        >>> parts[2]
        'ATCGuggCCGgg'
        """
        values = tuple(self)
        return values[index]
|
|
577
|
+
|
|
578
|
+
|
|
579
|
+
def get_parts(datastring: str) -> DseqParts:
    """
    Returns a DseqParts instance containing the parts of a dsDNA sequence.

    The datastring argument should contain a string with dscode symbols.

    A regular expression is used to capture the single stranded regions at
    the ends as well as the ds region in the middle, if any.

    The figures below number the regex capture groups (counting from zero)
    and show what they capture as well as the DseqParts field name for each
    group.

    ::

        group 0 "sticky_left5"
        |
        |      group 4 "sticky_right5"
        |      |
       ---    ---
       GGGATCC
          TAGGTCA
          ----
            |
            group 2 "middle"



        group 1 "sticky_left3"
        |
        |      group 3 "sticky_right3"
        |      |
       ---    ---
          ATCCAGT
       CCCTAGG
          ----
            |
            group 2 "middle"



        group 5 "single_watson" (only an upper strand)
        |
       -------
       ATCCAGT
       |||||||



        group 6 "single_crick" (only a lower strand)
        |
       -------

       |||||||
       CCCTAGG

    Up to seven groups (0..6) are captured. Some are mutually exclusive
    which means that one of them is an empty string:

    0 or 1, not both, a DNA fragment has either 5' or 3' sticky end.

    2 or 5 or 6, a DNA molecule has a ds region or is entirely single stranded.

    3 or 4, not both, either 5' or 3' sticky end.

    Note that internal single stranded regions are not identified and will
    be contained in the middle part if they are present.

    Examples
    --------
    >>> from pydna.alphabet import get_parts
    >>> parts = get_parts("eeATCGuggCCGgg")
    >>> parts.sticky_left5
    'ee'
    >>> parts.middle
    'ATCGuggCCGgg'
    >>> get_parts("PEXI").single_watson
    'PEXI'

    Parameters
    ----------
    datastring : str
        A string with dscode.

    Returns
    -------
    DseqParts
        Seven string fields describing the DNA molecule. All fields are
        empty strings if the beginning of datastring can not be interpreted:
        DseqParts(sticky_left5='', sticky_left3='',
        middle='',
        sticky_right3='', sticky_right5='',
        single_watson='', single_crick='')

    """

    # Escape the letters since they are placed inside character classes.
    watson_ss = re.escape(ss_letters_watson)
    crick_ss = re.escape(ss_letters_crick)
    ds = re.escape(ds_letters)

    m = re.match(
        f"([{watson_ss}]*)"  # capture group 0 ssDNA in watson strand
        f"([{crick_ss}]*)"  # "             1 ssDNA in crick strand
        f"(?=[{ds}])"  # positive lookahead for dsDNA, no capture
        "(.*)"  # capture group 2 everything in the middle
        f"(?<=[{ds}])"  # positive look behind for dsDNA, no capture
        f"([{watson_ss}]*)"  # capture group 3 ssDNA in watson strand
        f"([{crick_ss}]*)|"  # "             4 ssDNA in crick strand
        f"([{watson_ss}]+)|"  # "             5 if data contains only upper strand
        f"([{crick_ss}]+)",  # "             6 if data contains only lower strand
        datastring,
    )

    # Groups of the alternatives that did not take part in the match, and all
    # groups if there was no match at all, are reported as empty strings.
    result = m.groups(default="") if m else ("",) * 7

    left5, left3, middle, right3, right5, only_watson, only_crick = result

    return DseqParts(
        sticky_left5=left5,
        sticky_left3=left3,
        middle=middle,
        sticky_right3=right3,
        sticky_right5=right5,
        single_watson=only_watson,
        single_crick=only_crick,
    )
|
|
692
|
+
|
|
693
|
+
|
|
694
|
+
def dsbreaks(datastring: str) -> list[str]:
    """
    Find double strand breaks in DNA in dscode format.

    An empty watson position next to an empty crick position in the dsDNA
    leads to a discontinuous DNA. This function is used to show breaks in
    DNA in Dseq.__init__.

    >>> from pydna.alphabet import dsbreaks
    >>> x, = dsbreaks("GATPFTAA")
    >>> print(x)
    [0:8]
    GATG TAA
    CTA TATT
    >>> dsbreaks("GATC")
    []

    Parameters
    ----------
    datastring : str
        A string representing DNA in dscode format.

    Returns
    -------
    list[str]
        One string for each break: the location of the match written as
        ``[start:end]`` on the first line, followed by the two lines
        returned by representation_tuple for the break and up to three
        symbols of context on either side of it.

    """

    watson_only = re.escape(five_prime_ss_letters)
    crick_only = re.escape(three_prime_ss_letters)

    # Up to three symbols of context on each side, if present.
    context = "(.{0,3})"
    # Two adjacent single stranded symbols from different strands.
    junction = f"([{watson_only}][{crick_only}]|[{crick_only}][{watson_only}])"
    pattern = context + junction + context

    breaks = []
    for hit in re.finditer(pattern, datastring):
        upper, lower = representation_tuple(hit.group())
        breaks.append(f"[{hit.start()}:{hit.end()}]\n{upper}\n{lower}\n")
    return breaks
|
|
737
|
+
|
|
738
|
+
|
|
739
|
+
def representation_tuple(
    datastring: str = "", length_limit_for_repr: int = 30, chunk: int = 4
) -> tuple[str, str]:
    """
    Two line string representation of a sequence of dscode symbols.

    See pydna.alphabet module for the definition of the pydna dscode
    alphabet. The dscode has a symbol (ascii) character for base pairs
    and single stranded DNA.

    This function is used by the Dseq.__repr__() method.

    The sequence is split into parts with get_parts(). If datastring is
    longer than length_limit_for_repr, each sticky end and the middle part
    that is longer than ``length_limit_for_repr // 3`` is shortened to its
    first and last ``chunk`` symbols with two filler symbols in between.

    Parameters
    ----------
    datastring : str, optional
        DNA in dscode format. The default is "".
    length_limit_for_repr : int, optional
        Sequences longer than this are abbreviated. The default is 30.
    chunk : int, optional
        Number of symbols kept at each end of an abbreviated part. It is
        applied to the sticky ends and to the middle part alike. If chunk
        is zero or negative, an abbreviated part is reduced to the two
        filler symbols only. The default is 4.

    Returns
    -------
    tuple[str, str]
        The Watson and Crick strands, in that order, each with trailing
        whitespace removed.

    """

    def _abbreviate(part: str, filler: str) -> str:
        # Keep `chunk` symbols from each end around two filler symbols.
        # part[-0:] would return the whole part, so chunk <= 0 is handled
        # separately instead of silently keeping everything.
        if chunk <= 0:
            return filler * 2
        return part[:chunk] + filler * 2 + part[-chunk:]

    # NOTE(review): this unpacking is positional. The visible end of
    # get_parts() builds its result with sticky_right3 before sticky_right5,
    # which is the opposite of the naming used here - confirm against the
    # DseqParts field order. The local names are kept as they were so that
    # behaviour is unchanged.
    (
        sticky_left5,
        sticky_left3,
        middle,
        sticky_right5,
        sticky_right3,
        single_watson,
        single_crick,
    ) = get_parts(datastring)

    if len(datastring) > length_limit_for_repr:
        # We need to shorten the repr if the sequence is longer than the
        # limit imposed by length_limit_for_repr.
        #
        # The representation has three parts, so we divide by three for
        # each part.
        #
        # Long DNA strands are interrupted by interval notation, like
        # agc..att where the two dots indicate intervening hidden sequence.
        #
        # Dseq(-71)
        # GAAA..AATCaaaa..aaaa
        # tttt..ttttCTAA..AAAG
        #
        # placeholder1, placeholder2 are two letters that are replaced by
        # interval characters in the upper or lower strands by the
        # translation.
        part_limit = length_limit_for_repr // 3

        if len(sticky_left5) > part_limit:
            sticky_left5 = _abbreviate(sticky_left5, placeholder2)

        if len(sticky_left3) > part_limit:
            sticky_left3 = _abbreviate(sticky_left3, placeholder1)

        if len(middle) > part_limit:
            # Previously hard-coded to 4 symbols; now follows `chunk` like
            # the sticky ends do.
            middle = _abbreviate(middle, interval)

        if len(sticky_right5) > part_limit:
            sticky_right5 = _abbreviate(sticky_right5, placeholder2)

        if len(sticky_right3) > part_limit:
            sticky_right3 = _abbreviate(sticky_right3, placeholder1)

    # The processed string that will be used to obtain a watson and crick
    # strand. The single stranded parts are only used when the left end,
    # middle and right end are all empty.
    double_stranded_view = (
        (sticky_left5 or sticky_left3) + middle + (sticky_right5 or sticky_right3)
    )
    processed_dscode = double_stranded_view or (single_watson + single_crick)

    watson = processed_dscode.translate(dscode_to_watson_table).rstrip()
    crick = processed_dscode.translate(dscode_to_crick_table).rstrip()

    return watson, crick
|
|
827
|
+
|
|
828
|
+
|
|
829
|
+
def regex_ss_melt_factory(length: int) -> re.Pattern:
    """
    A regular expression for finding double-stranded regions flanked by single-stranded DNA
    that can be melted to shed a single-stranded fragment.

    The returned expression finds double stranded patches (of length <= length)
    that are flanked by empty positions on the same side in dscode format
    (see figure below). Melting of this kind of sites leads to the shedding
    of a single stranded fragment.

    ::

        GFTTAJA  <-- dscode representing the ds DNA below.

        G TTA A  <-- "TTA" is found by the regex for length <= 3
        CTAATGT

    The expression has two named groups:

    - ``watson`` : a patch preceded by a symbol from ``ss_letters_crick`` and
      followed by a symbol that is neither in ``ss_letters_watson`` nor in
      ``ds_letters``.
    - ``crick`` : a patch preceded by a symbol from ``ss_letters_watson`` and
      followed by a symbol that is neither in ``ss_letters_crick`` nor in
      ``ds_letters``.

    Both flanking symbols have to be present, so a patch at the very
    beginning or end of the data is not matched. The pattern is compiled
    from bytes and has to be applied to bytes data.

    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> regex = regex_ss_melt_factory(3)
    >>> s = Dseq("GFTTAJA")
    >>> s
    Dseq(-7)
    G TTA A
    CTAATGT
    >>> mobj = regex.search(s._data)
    >>> mobj.groupdict()
    {'watson': b'TTA', 'crick': None}


    Parameters
    ----------
    length : int
        Max length of double stranded region flanked by single stranded
        regions.

    Returns
    -------
    re.Pattern
        Compiled (bytes) regular expression object.

    """
    # One to `length` consecutive double stranded symbols.
    ds_patch = f"([{ds_letters}]{{1,{length}}})"

    watson_alt = (
        f"(?P<watson>((?<=[{ss_letters_crick}]))"
        f"{ds_patch}"
        f"((?=[^{ss_letters_watson}{ds_letters}])))"
    )
    crick_alt = (
        f"(?P<crick>((?<=[{ss_letters_watson}]))"
        f"{ds_patch}"
        f"((?=[^{ss_letters_crick}{ds_letters}])))"
    )

    pattern = "|".join((watson_alt, crick_alt))

    return re.compile(pattern.encode("ascii"))
|
|
888
|
+
|
|
889
|
+
|
|
890
|
+
def regex_ds_melt_factory(length: int) -> re.Pattern:
    """
    A regular expression for finding double-stranded regions flanked by single-stranded DNA
    that can be melted to shed multiple double stranded fragments.

    The returned expression finds double stranded patches (of length <= length)
    that are flanked by empty positions on opposite sides in dscode format
    (see figure below). Melting of this kind of sites leads to separation
    into multiple double stranded fragments.

    ::

        aaaGFTTAIAttt  <-- dscode

        aaaG TTACAttt  <-- "TTA" is found by the regex for length <= 3
        tttCTAAT Taaa

    The expression has two named groups:

    - ``watson`` : a patch preceded by a symbol from ``ss_letters_watson``
      (or located at the start of the data) and followed by a symbol that
      is neither in ``ss_letters_watson`` nor in ``ds_letters`` (or located
      at the end of the data).
    - ``crick`` : the same, using ``ss_letters_crick`` in both places.

    The pattern is compiled from bytes and has to be applied to bytes data.

    Examples
    --------

    >>> from pydna.dseq import Dseq
    >>> regex = regex_ds_melt_factory(3)
    >>> s = Dseq("aaaGFTTAIAttt")
    >>> s
    Dseq(-13)
    aaaG TTACAttt
    tttCTAAT Taaa
    >>> mobj = regex.search(s._data)
    >>> mobj.groupdict()
    {'watson': None, 'crick': b'TTA'}

    Parameters
    ----------
    length : int
        Max length of double stranded region flanked by single stranded
        regions.

    Returns
    -------
    re.Pattern
        Compiled (bytes) regular expression object.

    """
    # One to `length` consecutive double stranded symbols.
    ds_patch = f"([{ds_letters}]{{1,{length}}})"

    watson_alt = (
        f"(?P<watson>((?<=[{ss_letters_watson}])|^)"
        f"{ds_patch}"
        f"((?=[^{ss_letters_watson}{ds_letters}])|$))"
    )
    crick_alt = (
        f"(?P<crick>((?<=[{ss_letters_crick}])|^)"
        f"{ds_patch}"
        f"((?=[^{ss_letters_crick}{ds_letters}])|$))"
    )

    pattern = "|".join((watson_alt, crick_alt))

    return re.compile(pattern.encode("ascii"))
|
|
947
|
+
|
|
948
|
+
|
|
949
|
+
def anneal_strands(strand_a: str, strand_b: str) -> bool:
    """
    Test if two DNA strands containing dscode anneal or not.

    Both strands are assumed to be given in 5' -> 3' direction.

    strand_b is complemented and reversed before the comparison, and the
    two strands are then compared position by position. Every pair of
    symbols has to be a key of ``basepair_dict``. The comparison stops at
    the end of the shorter strand.

    Examples
    --------

    >>> from pydna.alphabet import anneal_strands
    >>> a = "TTA"
    >>> b = "AAT"[::-1]
    >>> anneal_strands(a, b)
    True
    >>> anneal_strands(b, a)
    True
    >>> c = "UUA"
    >>> anneal_strands(c, b)
    True
    >>> anneal_strands(a.lower(), b)
    True
    >>> anneal_strands("TG", "AA")
    False

    Parameters
    ----------
    strand_a : str
        A single DNA strand.
    strand_b : str
        A single DNA strand.

    Returns
    -------
    bool
        True if annealing is perfect.

    """
    upper = strand_a.translate(dscode_to_watson_table)

    complemented = strand_b.translate(complement_table_for_dscode)
    lower = complemented.translate(dscode_to_crick_table)[::-1]

    try:
        # A plain lookup per position; the first pair that is not a known
        # base pair raises KeyError and ends the comparison.
        for pair in zip(upper, lower):
            basepair_dict[pair]
    except KeyError:
        return False
    return True
|