pydna 5.5.4__py3-none-any.whl → 5.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydna/__init__.py +30 -195
- pydna/_pretty.py +8 -8
- pydna/_thermodynamic_data.py +3 -3
- pydna/all.py +1 -12
- pydna/alphabet.py +995 -0
- pydna/amplicon.py +19 -24
- pydna/amplify.py +75 -95
- pydna/assembly.py +64 -81
- pydna/assembly2.py +375 -310
- pydna/codon.py +4 -4
- pydna/common_sub_strings.py +6 -8
- pydna/contig.py +203 -10
- pydna/design.py +176 -60
- pydna/dseq.py +1788 -718
- pydna/dseqrecord.py +197 -179
- pydna/gateway.py +6 -6
- pydna/gel.py +5 -5
- pydna/genbank.py +43 -46
- pydna/genbankfixer.py +89 -92
- pydna/ladders.py +11 -12
- pydna/oligonucleotide_hybridization.py +124 -0
- pydna/opencloning_models.py +187 -60
- pydna/parsers.py +45 -32
- pydna/primer.py +4 -4
- pydna/primer_screen.py +833 -0
- pydna/readers.py +14 -9
- pydna/seq.py +137 -47
- pydna/seqrecord.py +54 -62
- pydna/sequence_picker.py +2 -5
- pydna/sequence_regex.py +6 -6
- pydna/tm.py +17 -17
- pydna/types.py +19 -19
- pydna/utils.py +97 -75
- {pydna-5.5.4.dist-info → pydna-5.5.6.dist-info}/METADATA +8 -8
- pydna-5.5.6.dist-info/RECORD +42 -0
- {pydna-5.5.4.dist-info → pydna-5.5.6.dist-info}/WHEEL +1 -1
- pydna/conftest.py +0 -42
- pydna/download.py +0 -32
- pydna/genbankfile.py +0 -42
- pydna/genbankrecord.py +0 -168
- pydna/goldengate.py +0 -45
- pydna/ligate.py +0 -62
- pydna/user_cloning.py +0 -29
- pydna-5.5.4.dist-info/RECORD +0 -46
- {pydna-5.5.4.dist-info → pydna-5.5.6.dist-info}/licenses/LICENSE.txt +0 -0
pydna/primer_screen.py
ADDED
|
@@ -0,0 +1,833 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Fast primer screening
|
|
5
|
+
---------------------
|
|
6
|
+
|
|
7
|
+
This module provides fast primer screening using the Aho-Corasick string-search
|
|
8
|
+
algorithm. It is useful for PCR diagnostic purposes when given a list of primers
|
|
9
|
+
and a single sequence or list of sequences to analyze.
|
|
10
|
+
|
|
11
|
+
The primer list can consist of `Primer` objects returned by :func:`pydna.parsers.parse_primers`
|
|
12
|
+
or any objects with a ``seq`` attribute, such as :class:`pydna.seqrecord.SeqRecord`
|
|
13
|
+
or :class:`Bio.SeqRecord.SeqRecord`.
|
|
14
|
+
|
|
15
|
+
The Aho-Corasick algorithm efficiently finds all occurrences of a set of sequences
|
|
16
|
+
within a larger text. If the same primer list is used repeatedly, creating an
|
|
17
|
+
automaton greatly speeds up repeated searches. See :func:`make_automaton` for
|
|
18
|
+
information on creating, saving, and loading such automata.
|
|
19
|
+
|
|
20
|
+
Functions
|
|
21
|
+
---------
|
|
22
|
+
|
|
23
|
+
- :func:`forward_primers`
|
|
24
|
+
- :func:`reverse_primers`
|
|
25
|
+
- :func:`primer_pairs`
|
|
26
|
+
- :func:`flanking_primer_pairs`
|
|
27
|
+
- :func:`diff_primer_pairs`
|
|
28
|
+
- :func:`diff_primer_triplets`
|
|
29
|
+
|
|
30
|
+
References
|
|
31
|
+
----------
|
|
32
|
+
|
|
33
|
+
Aho-Corasick algorithm:
|
|
34
|
+
https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
|
|
35
|
+
|
|
36
|
+
This module uses `pyahocorasick`:
|
|
37
|
+
Documentation: https://pyahocorasick.readthedocs.io/en/latest
|
|
38
|
+
GitHub: https://github.com/WojciechMula/pyahocorasick
|
|
39
|
+
PyPI: https://pypi.python.org/pypi/pyahocorasick
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# TODO: circular templates
|
|
44
|
+
|
|
45
|
+
from itertools import product
|
|
46
|
+
from itertools import combinations
|
|
47
|
+
from itertools import pairwise
|
|
48
|
+
from collections import defaultdict
|
|
49
|
+
from collections import Counter
|
|
50
|
+
from collections import namedtuple
|
|
51
|
+
from collections.abc import Callable
|
|
52
|
+
from collections.abc import Sequence
|
|
53
|
+
|
|
54
|
+
from pydna.dseqrecord import Dseqrecord
|
|
55
|
+
from pydna.primer import Primer
|
|
56
|
+
|
|
57
|
+
import ahocorasick
|
|
58
|
+
|
|
59
|
+
import warnings
|
|
60
|
+
|
|
61
|
+
from Bio.Data.IUPACData import ambiguous_dna_values
|
|
62
|
+
|
|
63
|
+
# Importing this module emits a FutureWarning: the API is not yet stable.
warnings.warn(
    "The primer_screen module is experimental and not yet extensively tested. "
    "api may change in future versions.",
    FutureWarning,
)

# One primer pair on one template: primer_list indices of the forward and
# reverse primers, their positions on the template and the product size.
amplicon_tuple = namedtuple(
    "amplicon_tuple", ["fp", "rp", "fposition", "rposition", "size"]
)

# One diagnostic PCR product: the template sequence, the primer_list indices
# of the forward and reverse primers and the product size.
primer_tuple = namedtuple("primer_tuple", ["seq", "fp", "rp", "size"])
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def closest_diff(nums: list[int]) -> int:
    """
    Smallest gap between neighbouring values once the input is sorted.

    The input does not need to be sorted, a sorted copy is made internally.
    For the integers 1, 5, 7, 11, 19 the closest neighbours are 5 and 7,
    so the result is 7-5 = 2.

    >>> closest_diff([1, 5, 7, 11, 19])
    2


    Parameters
    ----------
    nums : list[int]
        Integers to compare.

    Raises
    ------
    ValueError
        If fewer than two numbers are given.

    Returns
    -------
    int
        The smallest difference, always >= 0.

    """
    if len(nums) < 2:
        raise ValueError("Need at least two numbers")

    ordered = sorted(nums)

    # In sorted order the two closest values are always adjacent, so only
    # neighbouring pairs have to be inspected.
    return min(upper - lower for lower, upper in zip(ordered, ordered[1:]))
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def expand_iupac_to_dna(seq: str) -> list[str]:
    """
    Expand an extended IUPAC DNA string into all unambiguous DNA strings.

    Every character of `seq` is looked up (case-insensitively) in Biopython's
    ``ambiguous_dna_values`` table, extended so that U (uracil) is read as T.
    All combinations of the possible bases at each position are returned.

    Example:

    >>> expand_iupac_to_dna("ATNG")
    ['ATGG', 'ATAG', 'ATTG', 'ATCG']
    >>> x = expand_iupac_to_dna("ACGTURYSWKMBDHVN")
    >>> len(x)
    20736


    Parameters
    ----------
    seq : str
        String containing extended IUPAC DNA (ACGTURYSWKMBDHVN).

    Raises
    ------
    KeyError
        If `seq` contains a character missing from the lookup table.

    Returns
    -------
    list[str]
        List of strings in the unambiguous nucleotide alphabet.

    """
    # Copy the Biopython table so the shared module-level dict is not modified.
    alphabet = dict(ambiguous_dna_values)
    # Include RNA
    alphabet["U"] = "T"

    per_position = (alphabet[base] for base in seq.upper())

    # Cartesian product of the choices at each position.
    return ["".join(combo) for combo in product(*per_position)]
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def make_automaton(
    primer_list: Sequence[Primer | None], limit: int = 16
) -> ahocorasick.Automaton:
    """
    Build an Aho-Corasick automaton for a list of primers.

    An `automaton <https://github.com/WojciechMula/pyahocorasick>`__ can be
    built once for a list of Primer objects and then reused as the optional
    ``automaton`` argument of :func:`forward_primers`,
    :func:`reverse_primers`, :func:`primer_pairs`,
    :func:`flanking_primer_pairs`, :func:`diff_primer_pairs`, and
    :func:`diff_primer_triplets`.

    Entries in `primer_list` that evaluate to False (such as None) and primers
    shorter than `limit` are skipped. A skipped entry still occupies its
    index, so None can be used to remove a primer from the automaton while
    keeping the original index of every other primer.

    Only the uppercase 3'-part of each primer, `limit` characters long, is
    added to the automaton. The automaton has to be rebuilt if a different
    limit is needed.

    The primers can contain ambiguous bases from the extended IUPAC DNA
    alphabet. These are expanded with :func:`expand_iupac_to_dna`, so one
    primer may contribute several words. Each word stores a tuple with the
    indices of all primers that share it.

    The automaton can be saved and loaded like this (from the pyahocorasick
    docs):

    ::

        import pickle
        import ahocorasick
        from pydna import primer_screen

        # build automaton
        atm = primer_screen.make_automaton(primer_list, limit=16)

        # save automaton
        atm.save("atm.automaton", pickle.dumps)

        # load automaton
        atm = ahocorasick.load("atm.automaton", pickle.loads)

        # use automaton
        fps = primer_screen.forward_primers(template, primer_list, automaton=atm)


    Parameters
    ----------
    primer_list : list[Primer] | tuple[Primer]
        This is a list of pydna.primer.Primer objects or
        any object with a seq property such as Bio.SeqRecord.SeqRecord.
        Entries may be None.
    limit : int, optional
        Length of the part in the 3'-end of each primer that has to
        anneal. The default is 16.

    Returns
    -------
    ahocorasick.Automaton
        pyahocorasick automaton made for the list of Primer objects.

    """
    suffix_dict = defaultdict(list)

    for i, s in enumerate(primer_list):
        # filter for primers that evaluate to False such as None
        # or primers that are too short.
        if not s or (len(s) < limit):
            continue
        # Primers may share suffix, so primer indices pertaining to a
        # certain suffix are collected together.
        for footprint in expand_iupac_to_dna(str(s.seq)[-limit:].upper()):
            suffix_dict[footprint].append(i)

    automaton = ahocorasick.Automaton()

    for footprint, indices in suffix_dict.items():
        automaton.add_word(footprint, tuple(indices))

    automaton.make_automaton()

    return automaton
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def callback(a: int, b: int) -> bool:
    """
    Default quality control for a pair of PCR product sizes.

    Takes two integers representing PCR product sizes and tells whether
    the two products are expected to be easy to tell apart on a typical
    agarose gel.

    Parameters
    ----------
    a : int
        One size.
    b : int
        Another size.

    Returns
    -------
    bool
        True if the sizes differ by at least 20% of the larger size,
        False otherwise.

    """
    larger = max(a, b)
    gap = abs(a - b)
    # The length difference has to be at least 20% of the larger fragment.
    return gap >= 0.2 * larger
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def forward_primers(
    seq: Dseqrecord,
    primer_list: Sequence[Primer | None],
    limit: int = 16,
    automaton: ahocorasick.Automaton | None = None,
) -> dict[int, list[int]]:
    """
    Forward primers from `primer_list` annealing to `seq` with at least `limit`
    base pairs.

    The optional automaton can speed up the primer search if the same primer
    list is often used, see :func:`make_automaton` for more information.
    If an automaton is given, `limit` is ignored since the limit is built
    into the automaton.

    The resulting dict has the form:

    ::

        { primer_A_index : [location1, location2, ...]
          primer_B_index : [location1, location2, ...] }

    Where a key such as primer_A_index (integer) is the index for a primer
    in `primer_list` and the value is a list of locations (integers) where
    the primer binds. Primers that do not bind are absent from the dict.

    The concept of location is the same as used in :mod:`pydna.primer`.
    The forward primer in the figure below anneals at position 14 on the
    template.

    ::

        5-gtcatgatctagtcgatgtta-3
          |||||||||||||||||||||

                5'-tagtcg-3' = forward primer, location = 14
                   ||||||
          |||||||||||||||||||||
        3-cagtactagatcagctacaat-5
          |
          012345678911111111112 position
                    01234567890

    NOTE(review): the code stores ``end_index + 1``, the index just past the
    last matched template base, which would be 15 for the site drawn above.
    Confirm which convention the figure intends.


    Parameters
    ----------
    seq : Dseqrecord
        Target sequence to find primer annealing positions.
    primer_list : list[Primer] | tuple[Primer]
        This is a list of pydna.primer.Primer objects or any object
        with a seq property such as Bio.SeqRecord.SeqRecord.
    limit : int, optional
        Length of the part at the 3'-end of each primer that has to
        anneal. The default is 16. Only used when no automaton is given.
    automaton : ahocorasick.Automaton, optional
        Automaton made with the :func:`make_automaton`. The default is None.

    Returns
    -------
    dict[int, list[int]]
        Dict of lists where keys are primer indices in primer_list and
        values are lists with primer locations.

    """
    # if no automaton is given, we make one.
    if automaton is None:
        automaton = make_automaton(primer_list, limit=limit)

    # Nothing can match when the automaton holds no words (every primer was
    # None or shorter than the limit), so return early rather than searching
    # with an automaton that was never populated.
    if len(automaton) == 0:
        return {}

    # A defaultdict of lists is used to collect primer locations since
    # different primers can anneal in the same place.
    fps = defaultdict(list)

    # iter() reports the index of the last matched character; the stored
    # location is one past that index.
    for end_index, ids in automaton.iter(str(seq.seq).upper()):
        for i in ids:
            fps[i].append(end_index + 1)

    return dict(fps)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def reverse_primers(
    seq: Dseqrecord,
    primer_list: Sequence[Primer | None],
    limit: int = 16,
    automaton: ahocorasick.Automaton | None = None,
) -> dict[int, list[int]]:
    """
    Primers from `primer_list` annealing in reverse to `seq` with at least
    `limit` base pairs.

    The optional automaton can speed up the primer search if the same primer
    list is often used, see :func:`make_automaton` for more information.
    If an automaton is given, `limit` is ignored since the limit is built
    into the automaton.

    The resulting dict has the form:

    ::

        { primer_A_index : [location1, location2, ...]
          primer_B_index : [location1, location2, ...] }

    Where a key such as primer_A_index (integer) is the index for a primer
    in `primer_list` and the value is a list of locations (integers) where
    the primer binds. Primers that do not bind are absent from the dict.

    The concept of location is the same as used in :mod:`pydna.primer`.
    The reverse primer below anneals at position 9, the index of the first
    template base covered by the annealing part of the primer.

    ::

        5-gtcatgatctagtcgatgtta-3
          |||||||||||||||||||||
                   ||||||
                 3-atcagc-5 = reverse primer, location = 9

          |||||||||||||||||||||
        3-cagtactagatcagctacaat-5
          |
          012345678911111111112 position
                    01234567890


    Parameters
    ----------
    seq : Dseqrecord
        Target sequence to find primer annealing positions.
    primer_list : list[Primer] | tuple[Primer]
        This is a list of pydna.primer.Primer objects or any object
        with a seq property such as Bio.SeqRecord.SeqRecord.
    limit : int, optional
        Length of the part in the 3'-end of each primer that has to
        anneal. The default is 16. Only used when no automaton is given.
    automaton : ahocorasick.Automaton, optional
        Automaton made with the :func:`make_automaton`. The default is None.

    Returns
    -------
    dict[int, list[int]]
        Dict of lists where keys are primer indices in primer_list and
        values are lists with primer locations.

    """
    # if no automaton is given, we make one.
    if automaton is None:
        automaton = make_automaton(primer_list, limit=limit)

    # Nothing can match when the automaton holds no words (every primer was
    # None or shorter than the limit), so return early rather than searching
    # with an automaton that was never populated.
    if len(automaton) == 0:
        return {}

    # A defaultdict of lists is used to collect primer locations since
    # different primers can anneal in the same place.
    rps = defaultdict(list)
    ln = len(seq)

    # We use the reverse complement of the sequence instead of taking the
    # reverse complement of each primer. A match ending at end_index on the
    # reverse complement starts at ln - (end_index + 1) on the given strand.
    for end_index, ids in automaton.iter(str(seq.seq.reverse_complement()).upper()):
        for i in ids:
            rps[i].append(ln - (end_index + 1))

    return dict(rps)
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def primer_pairs(
    seq: Dseqrecord,
    primer_list: list[Primer] | tuple[Primer],
    short: int = 500,
    long: int = 2000,
    limit: int = 16,
    automaton: ahocorasick.Automaton | None = None,
) -> list[amplicon_tuple[int, int, int, int, int]]:
    """
    Primer pairs that form PCR products of at least `short` and at most
    `long` base pairs.

    The PCR product size includes the PCR primers. Only unique primer pairs
    are returned. This means that the forward and reverse primers can only
    bind in one position on the template each.

    If you suspect that primers bind on multiple locations, use the
    :func:`forward_primers` and :func:`reverse_primers` functions.

    The function returns a list of flat 5-namedtuples (``amplicon_tuple``)
    of integers with this form:

    ::

        [
         (index_fp1, index_rp1, position_fp1, position_rp1, size1),
         (index_fp2, index_rp2, position_fp2, position_rp2, size2),
        ]

    The fields are named ``fp``, ``rp``, ``fposition``, ``rposition`` and
    ``size``. The indices are the `primer_list` indices and positions are the
    positions of the primers as described in :func:`forward_primers` and
    :func:`reverse_primers` functions.
    The size includes the length of each primer, so it is the true total length
    of the PCR product. Pairs where the forward position lies after the
    reverse position are discarded.

    Parameters
    ----------
    seq : Dseqrecord
        Target sequence to find primer annealing positions.
    primer_list : list[Primer] | tuple[Primer]
        This is a list of pydna.primer.Primer objects or any object
        with a seq property such as Bio.SeqRecord.SeqRecord.
    short : int, optional
        Lower limit for the size of the PCR products. The default is 500.
    long : int, optional
        Upper limit for the size of the PCR products. The default is 2000.
    limit : int, optional
        Length of the part in the 3'-end of each primer that has to
        anneal. The default is 16. Only used when no automaton is given.
    automaton : ahocorasick.Automaton, optional
        Automaton made with the :func:`make_automaton`. The default is None.

    Returns
    -------
    list[amplicon_tuple]
        List of tuples (index_fp, index_rp, position_fp, position_rp, size)

    """
    # The automaton is built once here and shared by both searches below.
    if automaton is None:
        automaton = make_automaton(primer_list, limit=limit)

    # Without any words in the automaton no primer can anneal. Returning
    # here also avoids handing an empty automaton to the search functions.
    if len(automaton) == 0:
        return []

    # Unique forward primers are collected
    fps = {
        fp: pos[0]
        for fp, pos in forward_primers(
            seq, primer_list, limit=limit, automaton=automaton
        ).items()
        if len(pos) == 1
    }

    # Unique reverse primers are collected
    rps = {
        rp: pos[0]
        for rp, pos in reverse_primers(
            seq, primer_list, limit=limit, automaton=automaton
        ).items()
        if len(pos) == 1
    }

    products = []

    for fp, fposition in fps.items():
        fp_length = len(primer_list[fp])
        for rp, rposition in rps.items():
            # The forward primer has to anneal upstream of the reverse primer.
            if fposition > rposition:
                continue
            # We calculate the size of a potential PCR product
            size = fp_length + rposition - fposition + len(primer_list[rp])
            # If the size falls within long and short, the data is kept.
            if short <= size <= long:
                products.append(amplicon_tuple(fp, rp, fposition, rposition, size))

    return products
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
def flanking_primer_pairs(
    seq: Dseqrecord,
    primer_list: list[Primer] | tuple[Primer],
    target: tuple[int, int],
    limit: int = 16,
    automaton: ahocorasick.Automaton | None = None,
) -> list[amplicon_tuple[int, int, int, int, int]]:
    """
    Primer pairs that flank a target position (begin..end). This means that
    forward primers have to bind before or at the begin position and reverse primers
    have to bind at or after the end position.

    Candidate pairs come from :func:`primer_pairs` with product sizes between
    ``end - begin`` and ``len(seq)``. The function returns a list of the same
    flat 5-namedtuples of integers returned from the :func:`primer_pairs`
    function, in reverse of the order in which :func:`primer_pairs`
    produced them.

    ::

        [
         (index_fp1, index_rp1, position_fp1, position_rp1, size1),
         (index_fp2, index_rp2, position_fp2, position_rp2, size2),
        ]


    Parameters
    ----------
    seq : Dseqrecord
        Target sequence to find primer annealing positions.
    primer_list : list[Primer] | tuple[Primer]
        This is a list of pydna.primer.Primer objects or any object
        with a seq property such as Bio.SeqRecord.SeqRecord.
    target : tuple[int, int]
        Start and stop position for target sequence. Start has to be
        smaller than stop.
    limit : int, optional
        Length of the part in the 3'-end of each primer that has to
        anneal. The default is 16. Only used when no automaton is given.
    automaton : ahocorasick.Automaton, optional
        Automaton made with the :func:`make_automaton`. The default is None.


    Returns
    -------
    list[amplicon_tuple]
        List of tuples (index_fp, index_rp, position_fp, position_rp, size).

    """
    if automaton is None:
        automaton = make_automaton(primer_list, limit=limit)

    begin, end = target

    assert begin < end, "begin has to be smaller than end."

    # Without any words in the automaton no primer can anneal.
    if len(automaton) == 0:
        return []

    amplicons = primer_pairs(
        seq,
        primer_list,
        short=end - begin,
        long=len(seq),
        limit=limit,
        automaton=automaton,
    )

    # Keep pairs that enclose the target: the forward primer ends at or
    # before `begin` and the reverse primer starts at or after `end`.
    products = [
        amplicon
        for amplicon in amplicons
        if amplicon.fposition <= begin and end <= amplicon.rposition
    ]

    return products[::-1]
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
def diff_primer_pairs(
    sequences: list[Dseqrecord] | tuple[Dseqrecord],
    primer_list: list[Primer] | tuple[Primer],
    short: int = 500,
    long: int = 1500,
    limit: int = 16,
    automaton: ahocorasick.Automaton | None = None,
    callback: Callable[[int, int], bool] = callback,
) -> list[tuple[primer_tuple, ...]]:
    """
    Primer pairs for diagnostic PCR.

    Given an iterable of sequences and a primer list, primers are selected that result in
    unique product sizes from each of the input sequences.

    Primers 1 and 2 both form PCR products from sequenceA and B below, but of
    different sizes. Primers 1 and 2 could be used to verify genetic modifications such
    as cloning an insert into a plasmid vector.

    ::

         1>           <2
        -------NNNNNNNNN---- sequenceA


         1>       <2
        -------XXXXX-------- sequenceB


    The callback function is called with two product sizes and returns true or false.
    It is meant to filter for PCR products that are likely to migrate to
    sufficiently distinct locations to be distinguishable on a typical agarose gel.
    Every combination of two product sizes of a primer pair has to pass.

    Only products of at least `short` and at most `long` base pairs are returned.

    An example of the output for two sequences (Dseqrecord(-3308), Dseqrecord(-3613)).
    Primers 501 and 1806 would yield a 933 bp product with the 3308 bp sequence and the same
    primer pair would give 1212 bp with the 3613 bp sequence.

    A list of tuples is returned. Each tuple holds named 4-tuples
    (seq, fp, rp, size), one for each sequence in the input argument.
    The list is sorted so that the primer pairs with the largest minimal
    size difference between products come first.

    ::

        [
         ((Dseqrecord(-3308), 501, 1806, 933), (Dseqrecord(-3613), 501, 1806, 1212)),
        ]


    Parameters
    ----------
    sequences : list[Dseqrecord] | tuple[Dseqrecord]
        Target sequences to find primer annealing positions.
    primer_list : list[Primer] | tuple[Primer]
        This is a list of pydna.primer.Primer objects or any object
        with a seq property such as Bio.SeqRecord.SeqRecord.
    short : int, optional
        Lower limit for the size of the PCR products. The default is 500.
    long : int, optional
        Upper limit for the size of the PCR products. The default is 1500.
    limit : int, optional
        Length of the part in the 3'-end of each primer that has to
        anneal. The default is 16. Only used when no automaton is given.
    automaton : ahocorasick.Automaton, optional
        Automaton made with the :func:`make_automaton`. The default is None.
    callback : callable[[int, int], bool], optional
        A function accepting two product sizes and returning True or False.
        The default is callback.

    Returns
    -------
    list[tuple[primer_tuple, ...]]
        (Sequence, forward_primer, reverse_primer, size_bp)

    """
    if automaton is None:
        automaton = make_automaton(primer_list, limit=limit)

    # Without any words in the automaton no primer can anneal.
    if len(automaton) == 0:
        return []

    number_of_sequences = len(sequences)

    # {frozenset({fp, rp}): {size: (fp, rp, seq), ...}, ...}
    # Products of equal size overwrite each other, so a primer pair only
    # reaches number_of_sequences entries if every product size is distinct.
    primer_pair_dict = defaultdict(dict)

    for seq in sequences:
        for fp, rp, *_, size in primer_pairs(
            seq, primer_list, short=short, long=long, limit=limit, automaton=automaton
        ):
            primer_pair_dict[frozenset((fp, rp))][size] = fp, rp, seq

    result = []

    for seqd in primer_pair_dict.values():
        # One distinct product size is needed per sequence.
        if len(seqd) != number_of_sequences:
            continue
        # All size combinations are checked, not only neighbouring ones, so
        # no two products can go uncompared when there are three or more.
        if not all(callback(a, b) for a, b in combinations(seqd, 2)):
            continue
        result.append(
            (
                closest_diff(seqd.keys()),
                tuple(
                    primer_tuple(s, fp, rp, size) for size, (fp, rp, s) in seqd.items()
                ),
            )
        )

    # Sort on the smallest size difference only, so that ties never fall
    # back to comparing the sequence objects held in the tuples.
    result.sort(key=lambda item: item[0], reverse=True)

    return [products for _, products in result]
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
def diff_primer_triplets(
    sequences: list[Dseqrecord] | tuple[Dseqrecord],
    primer_list: list[Primer] | tuple[Primer],
    limit: int = 16,
    short: int = 500,
    long: int = 1500,
    automaton: ahocorasick.Automaton | None = None,
    callback: Callable[[int, int], bool] = callback,
) -> list[tuple[primer_tuple, ...]]:
    """
    Primer triplets for diagnostic PCR.

    Given a list of sequences and a primer list, primer triplets are selected that result in
    PCR products of different sizes from each of the input sequences.

    Primers 1, 2 and 3 form PCR products from sequenceA and B below, but of
    different sizes. Primer 1 binds both sequences while primers 2 and 3 bind one
    sequence each. This primer triplet could be used to verify genetic
    modifications.

    ::

         1>           <2
        -------NNNNNNNNN---- sequenceA

         1>       <3
        -------XXXXX-------- sequenceB



    The callback function is called with the two product sizes of a candidate
    triplet and returns True or False. It is used to decide if the PCR products
    are likely to migrate to distinct locations on a typical agarose gel.

    Only products of at least `short` and at most `long` base pairs are returned.

    Primer pairs that give a product with more than one of the sequences are
    not used, and a triplet that can be formed in more than one way is
    discarded. Each triplet holds two products, so results are only reported
    when the number of sequences equals the number of products of a triplet.

    An example of the output for two sequences = [Dseqrecord(-7664), Dseqrecord(-3613)].
    Primer pair 701, 700 would produce a 724 bp product with the 7664 bp sequence while
    the primer pair 701, 1564 would give a 1450 bp product with the 3613 bp sequence.

    ::

        [
         ((Dseqrecord(-7664), 701, 700, 724), (Dseqrecord(-3613), 701, 1564, 1450)),
        ]

    Parameters
    ----------
    sequences : list[Dseqrecord] | tuple[Dseqrecord]
        Target sequences to find primer annealing positions.
    primer_list : list[Primer] | tuple[Primer]
        This is a list of pydna.primer.Primer objects or any object
        with a seq property such as Bio.SeqRecord.SeqRecord.
    limit : int, optional
        Length of the part in the 3'-end of each primer that has to
        anneal. The default is 16. Only used when no automaton is given.
    short : int, optional
        Lower limit for the size of the PCR products. The default is 500.
    long : int, optional
        Upper limit for the size of the PCR products. The default is 1500.
    automaton : ahocorasick.Automaton, optional
        Automaton made with the :func:`make_automaton`. The default is None.
    callback : callable[[int, int], bool], optional
        A function accepting two product sizes and returning True or False.
        The default is callback.

    Returns
    -------
    list[tuple[primer_tuple, ...]]
        (Sequence, forward_primer, reverse_primer, size_bp)

    """
    if automaton is None:
        automaton = make_automaton(primer_list, limit=limit)

    # Without any words in the automaton no primer can anneal.
    if len(automaton) == 0:
        return []

    number_of_sequences = len(sequences)

    # pp = { seq1: [(a,b,c,d,e), ...], seq2: [(i,j,k,l,m), ... ]}
    # All primer pairs for each sequence are collected.
    pp = {
        seq: primer_pairs(
            seq, primer_list, short=short, long=long, limit=limit, automaton=automaton
        )
        for seq in sequences
    }

    # We count all the times a specific pair occurs.
    # The first two integers of each tuple are the primer indices, unordered.
    pair_counter = Counter(
        frozenset(t[:2]) for tuples in pp.values() for t in tuples
    )

    # Pick pairs that appear more than once.
    pairs_to_remove = {pair for pair, count in pair_counter.items() if count > 1}

    # Remove pairs that appear more than once.
    for seq in pp:
        pp[seq] = [t for t in pp[seq] if frozenset(t[:2]) not in pairs_to_remove]

    # {frozenset({p1, p2, p3}): {size1: (fp1, rp1, seq1), size2: (fp2, rp2, seq2)}}
    primertrios = {}
    # Triplets that could be formed in more than one way. They stay excluded
    # even if they turn up a third time.
    ambiguous = set()

    for seq1, seq2 in combinations(sequences, 2):
        for fp1, rp1, *_, size1 in pp[seq1]:
            for fp2, rp2, *_, size2 in pp[seq2]:
                primertrio = frozenset((fp1, rp1, fp2, rp2))
                # Two pairs sharing exactly one primer make a triplet.
                if len(primertrio) != 3 or not callback(size1, size2):
                    continue
                if primertrio in ambiguous:
                    continue
                if primertrio in primertrios:
                    del primertrios[primertrio]
                    ambiguous.add(primertrio)
                else:
                    primertrios[primertrio] = {
                        size1: (fp1, rp1, seq1),
                        size2: (fp2, rp2, seq2),
                    }

    result = []
    for seqd in primertrios.values():
        # Every input sequence has to be represented by one product.
        if len(seqd) == number_of_sequences and set(sequences) == {
            s for *_, s in seqd.values()
        }:
            result.append(
                (
                    closest_diff(seqd.keys()),
                    tuple(
                        primer_tuple(s, fp, rp, size)
                        for size, (fp, rp, s) in seqd.items()
                    ),
                )
            )

    # Triplets with the largest minimal size difference come first.
    result.sort(key=lambda item: item[0], reverse=True)
    return [products for _, products in result]
|