pydna 5.5.4__py3-none-any.whl → 5.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydna/primer_screen.py ADDED
@@ -0,0 +1,833 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Fast primer screening
5
+ ---------------------
6
+
7
+ This module provides fast primer screening using the Aho-Corasick string-search
8
+ algorithm. It is useful for PCR diagnostic purposes when given a list of primers
9
+ and a single sequence or list of sequences to analyze.
10
+
11
+ The primer list can consist of `Primer` objects returned by :func:`pydna.parsers.parse_primers`
12
+ or any objects with a ``seq`` attribute, such as :class:`pydna.seqrecord.SeqRecord`
13
+ or :class:`Bio.SeqRecord.SeqRecord`.
14
+
15
+ The Aho-Corasick algorithm efficiently finds all occurrences of a set of sequences
16
+ within a larger text. If the same primer list is used repeatedly, creating an
17
+ automaton greatly speeds up repeated searches. See :func:`make_automaton` for
18
+ information on creating, saving, and loading such automata.
19
+
20
+ Functions
21
+ ---------
22
+
23
+ - :func:`forward_primers`
24
+ - :func:`reverse_primers`
25
+ - :func:`primer_pairs`
26
+ - :func:`flanking_primer_pairs`
27
+ - :func:`diff_primer_pairs`
28
+ - :func:`diff_primer_triplets`
29
+
30
+ References
31
+ ----------
32
+
33
+ Aho-Corasick algorithm:
34
+ https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
35
+
36
+ This module uses `pyahocorasick`:
37
+ Documentation: https://pyahocorasick.readthedocs.io/en/latest
38
+ GitHub: https://github.com/WojciechMula/pyahocorasick
39
+ PyPI: https://pypi.python.org/pypi/pyahocorasick
40
+ """
41
+
42
+
43
+ # TODO: circular templates
44
+
45
+ from itertools import product
46
+ from itertools import combinations
47
+ from itertools import pairwise
48
+ from collections import defaultdict
49
+ from collections import Counter
50
+ from collections import namedtuple
51
+ from collections.abc import Callable
52
+ from collections.abc import Sequence
53
+
54
+ from pydna.dseqrecord import Dseqrecord
55
+ from pydna.primer import Primer
56
+
57
+ import ahocorasick
58
+
59
+ import warnings
60
+
61
+ from Bio.Data.IUPACData import ambiguous_dna_values
62
+
63
# Emitted once, when the module is first imported, to flag that the
# interface is still experimental.
warnings.warn(
    "The primer_screen module is experimental "
    "and not yet extensively tested. "
    "api may change in future versions.",
    category=FutureWarning,
)

# One candidate PCR product: indices of the forward and reverse primer in the
# primer list, their locations on the template and the product size.
amplicon_tuple = namedtuple(
    typename="amplicon_tuple", field_names="fp, rp, fposition, rposition, size"
)
# One diagnostic PCR result: the template sequence, the indices of the forward
# and reverse primer in the primer list and the product size.
primer_tuple = namedtuple(typename="primer_tuple", field_names="seq, fp, rp, size")
74
+
75
+
76
def closest_diff(nums: list[int]) -> int:
    """
    Narrowest gap between neighbouring values once `nums` is sorted.

    The input does not need to be sorted, a sorted copy is made first.
    For 1, 5, 7, 11, 19 the closest neighbours are 5 and 7, so the
    result is 2.

    >>> closest_diff([1, 5, 7, 11, 19])
    2


    Parameters
    ----------
    nums : list[int]
        Integers to compare, in any order.

    Raises
    ------
    ValueError
        If fewer than two numbers are given.

    Returns
    -------
    int
        The smallest gap, always >= 0.

    """
    if len(nums) < 2:
        raise ValueError("Need at least two numbers")

    ordered = sorted(nums)
    # After sorting, the two closest values are always neighbours.
    return min(upper - lower for lower, upper in zip(ordered, ordered[1:]))
116
+
117
+
118
def expand_iupac_to_dna(seq: str) -> list[str]:
    """
    Enumerate every unambiguous DNA string matching an IUPAC string.

    Each letter of `seq` (case-insensitive) is looked up in Biopython's
    ``ambiguous_dna_values`` table, extended so that U (uracil) is read
    as T. The result is the cartesian product of the alternatives at each
    position, so its length is the product of the number of alternatives
    per letter.

    Example:

    >>> expand_iupac_to_dna("ATNG")
    ['ATGG', 'ATAG', 'ATTG', 'ATCG']
    >>> x = expand_iupac_to_dna("ACGTURYSWKMBDHVN")
    >>> len(x)
    20736


    Parameters
    ----------
    seq : str
        String containing extended IUPAC DNA (U is accepted).

    Raises
    ------
    KeyError
        If `seq` contains a letter missing from the lookup table.

    Returns
    -------
    list[str]
        All strings in the unambiguous nucleotide alphabet that `seq`
        can represent.

    """
    # The RNA letter U is treated as its DNA counterpart T.
    alphabet = dict(ambiguous_dna_values, U="T")
    alternatives = (alphabet[letter] for letter in seq.upper())
    return ["".join(combo) for combo in product(*alternatives)]
153
+
154
+
155
def make_automaton(
    primer_list: Sequence[Primer | None], limit: int = 16
) -> ahocorasick.Automaton:
    """
    Aho-Corasick automaton for a list of primers.

    An automaton `here <https://github.com/WojciechMula/pyahocorasick>`__ can
    be made prior to primer screening for a list of Primer
    objects for faster primer search.

    This automaton can be reused as an optional argument across calls to
    :func:`forward_primers`, :func:`reverse_primers`, :func:`primer_pairs`,
    :func:`flanking_primer_pairs`, :func:`diff_primer_pairs`, and
    :func:`diff_primer_triplets`.

    Entries of `primer_list` that evaluate to False (such as None) and
    primers shorter than `limit` are skipped. Putting None in the list is
    therefore a way to exclude a primer while every other primer keeps its
    original index.

    The automaton stores the uppercase last `limit` letters (the 3' end) of
    each primer. It has to be rebuilt if a different limit is needed.

    The primers can contain ambiguous bases from the extended IUPAC DNA
    alphabet; every unambiguous variant of the 3' end is stored, see
    :func:`expand_iupac_to_dna`.

    The automaton can be saved and loaded like this (from the pyahocorasick
    docs). Loading relies on pickle, so only load files from a trusted source:

    ::

        import pickle
        import ahocorasick
        from pydna.primer_screen import make_automaton, forward_primers

        # build automaton
        atm = make_automaton(primer_list, limit=16)

        # save automaton
        atm.save("atm.automaton", pickle.dumps)

        # load automaton
        atm = ahocorasick.load("atm.automaton", pickle.loads)

        # use automaton
        fps = forward_primers(template, primer_list, automaton=atm)


    Parameters
    ----------
    primer_list : list[Primer] | tuple[Primer]
        This is a list of pydna.primer.Primer objects or
        any object with a seq property such as Bio.SeqRecord.SeqRecord.
        Entries may be None.
    limit : int, optional
        Number of letters at the 3'-end of each primer that have to
        anneal. The default is 16.

    Returns
    -------
    ahocorasick.Automaton
        pyahocorasick automaton where each stored 3' end maps to a tuple
        with the `primer_list` indices of the primers ending with it.

    """
    # Primers may share the same 3' end, so the indices of all primers
    # ending with a given footprint are collected under that footprint.
    suffix_dict = defaultdict(list)

    for i, s in enumerate(primer_list):
        # Skip placeholders such as None and primers that are too short.
        if not s or len(s) < limit:
            continue
        for footprint in expand_iupac_to_dna(str(s.seq)[-limit:].upper()):
            suffix_dict[footprint].append(i)

    automaton = ahocorasick.Automaton()

    for footprint, indices in suffix_dict.items():
        automaton.add_word(footprint, tuple(indices))

    automaton.make_automaton()

    return automaton
236
+
237
+
238
def callback(a: int, b: int) -> bool:
    """
    Decide whether two PCR product sizes can be told apart.

    The two sizes are considered distinguishable (for example on a typical
    agarose gel) when they differ by at least 20% of the larger one.

    Parameters
    ----------
    a : int
        One size.
    b : int
        Another size.

    Returns
    -------
    bool
        True if the sizes are sufficiently different, False otherwise.

    """
    gap = abs(a - b)
    # Required gap: 20% of the larger fragment.
    threshold = 0.2 * max((a, b))
    return gap >= threshold
262
+
263
+
264
def forward_primers(
    seq: Dseqrecord,
    primer_list: Sequence[Primer | None],
    limit: int = 16,
    automaton: ahocorasick.Automaton | None = None,
) -> dict[int, list[int]]:
    """
    Forward primers from `primer_list` annealing to `seq` with at least `limit`
    base pairs.

    The optional automaton can speed up the primer search if the same primer
    list is often used, see :func:`make_automaton` for more information.
    When a non-empty automaton is given, the `limit` argument is not used.

    The resulting dict has the form:

    ::

        { primer_A_index : [location1, location2, ...]
          primer_B_index : [location1, location2, ...] }

    Where a key such as primer_A_index (integer) is the index for a primer
    in `primer_list` and the value is a list of locations (integers) where
    the primer binds. Primers that do not bind are absent from the dict.

    The location of a forward primer is the index of the template position
    immediately after the 3' end of the primer. The forward primer in the
    figure below ends at position 14, so its location is 15.

    ::

        5-gtcatgatctagtcgatgtta-3
          |||||||||||||||||||||
        3-cagtactagatcagctacaat-5

                 5-tagtcg-3          forward primer
                         ^ location = 15
          012345678911111111112      position
                    01234567890


    Parameters
    ----------
    seq : Dseqrecord
        Target sequence to find primer annealing positions.
    primer_list : list[Primer] | tuple[Primer]
        This is a list of pydna.primer.Primer objects or any object
        with a seq property such as Bio.SeqRecord.SeqRecord.
    limit : int, optional
        This is the part at the 3'-end of each primer that has to
        anneal. The default is 16.
    automaton : ahocorasick.Automaton, optional
        Automaton made with the :func:`make_automaton`. The default is None.

    Returns
    -------
    dict[int, list[int]]
        Dict of lists where keys are primer indices in primer_list and
        values are lists with primer locations. Empty if the automaton
        holds no primers.

    """
    # If no (or an empty) automaton is given, we make one.
    automaton = automaton or make_automaton(primer_list, limit=limit)

    # An automaton without any words cannot be searched (pyahocorasick
    # refuses to iterate it), and nothing could anneal anyway.
    if not automaton:
        return {}

    # A defaultdict of lists is used to collect primer locations since
    # a primer can anneal in more than one place.
    fps = defaultdict(list)

    # end_index is the template index of the last matched base.
    for end_index, ids in automaton.iter(str(seq.seq).upper()):
        for i in ids:
            fps[i].append(end_index + 1)

    return dict(fps)
343
+
344
+
345
def reverse_primers(
    seq: Dseqrecord,
    primer_list: list[Primer] | tuple[Primer],
    limit: int = 16,
    automaton: ahocorasick.Automaton | None = None,
) -> dict[int, list[int]]:
    """
    Primers from `primer_list` annealing in reverse to `seq` with at least
    `limit` base pairs.

    The optional automaton can speed up the primer search if the same primer
    list is often used, see :func:`make_automaton` for more information.
    When a non-empty automaton is given, the `limit` argument is not used.

    The resulting dict has the form:

    ::

        { primer_A_index : [location1, location2, ...]
          primer_B_index : [location1, location2, ...] }

    Where a key such as primer_A_index (integer) is the index for a primer
    in `primer_list` and the value is a list of locations (integers) where
    the primer binds. Primers that do not bind are absent from the dict.

    The location of a reverse primer is the index of the template position
    that pairs with the 3' end of the primer. The reverse primer below
    anneals at position 9.

    ::

        5-gtcatgatctagtcgatgtta-3
          |||||||||||||||||||||
        3-cagtactagatcagctacaat-5

                 3-atcagc-5          reverse primer
                   ^ location = 9
          012345678911111111112      position
                    01234567890


    Parameters
    ----------
    seq : Dseqrecord
        Target sequence to find primer annealing positions.
    primer_list : list[Primer] | tuple[Primer]
        This is a list of pydna.primer.Primer objects or any object
        with a seq property such as Bio.SeqRecord.SeqRecord.
    limit : int, optional
        This is the part in the 3'-end of each primer that has to
        anneal. The default is 16.
    automaton : ahocorasick.Automaton, optional
        Automaton made with the :func:`make_automaton`. The default is None.

    Returns
    -------
    dict[int, list[int]]
        Dict of lists where keys are primer indices in primer_list and
        values are lists with primer locations. Empty if the automaton
        holds no primers.

    """
    # If no (or an empty) automaton is given, we make one.
    automaton = automaton or make_automaton(primer_list, limit=limit)

    # An automaton without any words cannot be searched (pyahocorasick
    # refuses to iterate it), and nothing could anneal anyway.
    if not automaton:
        return {}

    # A defaultdict of lists is used to collect primer locations since
    # a primer can anneal in more than one place.
    rps = defaultdict(list)
    ln = len(seq)

    # We search the reverse complement of the sequence instead of taking the
    # reverse complement of each primer. end_index refers to the reverse
    # complement, so it is converted back to a position on `seq`.
    for end_index, ids in automaton.iter(str(seq.seq.reverse_complement()).upper()):
        for i in ids:
            rps[i].append(ln - (end_index + 1))

    return dict(rps)
425
+
426
+
427
def primer_pairs(
    seq: Dseqrecord,
    primer_list: list[Primer] | tuple[Primer],
    short: int = 500,
    long: int = 2000,
    limit: int = 16,
    automaton: ahocorasick.Automaton | None = None,
) -> list[amplicon_tuple[int, int, int, int, int]]:
    """
    Primer pairs that form PCR products of at least `short` and at most
    `long` base pairs.

    The PCR product size includes the PCR primers. Only primers that bind
    in exactly one position as forward primer are used as forward primers,
    and only primers that bind in exactly one position as reverse primer are
    used as reverse primers.

    If you suspect that primers bind on multiple locations, use the
    :func:`forward_primers` and :func:`reverse_primers` functions.

    The function returns a list of flat 5-namedtuples of integers with
    this form:

    ::

        [
         (index_fp1, index_rp1, position_fp1, position_rp1, size1),
         (index_fp2, index_rp2, position_fp2, position_rp2, size2),
        ]


    The indices are the `primer_list` indices and positions are the positions of
    the primers as described in :func:`forward_primers` and :func:`reverse_primers`
    functions.
    The size includes the length of each primer, so it is the true total length
    of the PCR product.

    Parameters
    ----------
    seq : Dseqrecord
        Target sequence to find primer annealing positions.
    primer_list : list[Primer] | tuple[Primer]
        This is a list of pydna.primer.Primer objects or any object
        with a seq property such as Bio.SeqRecord.SeqRecord.
    short : int, optional
        Lower limit (inclusive) for the size of the PCR products.
        The default is 500.
    long : int, optional
        Upper limit (inclusive) for the size of the PCR products.
        The default is 2000.
    limit : int, optional
        This is the part in the 3'-end of each primer that has to
        anneal. The default is 16. Not used when a non-empty automaton
        is given.
    automaton : ahocorasick.Automaton, optional
        Automaton made with the :func:`make_automaton`. The default is None.

    Returns
    -------
    list[amplicon_tuple]
        List of named tuples with the fields
        (fp, rp, fposition, rposition, size). Empty if no primer is long
        enough to be part of the automaton.

    """
    automaton = automaton or make_automaton(primer_list, limit=limit)

    # Without any usable primer there cannot be a product. Returning here
    # also keeps `limit` from being replaced by a longest word of zero,
    # which would make the nested searches rebuild an automaton that
    # ignores the requested limit.
    if not automaton:
        return []

    # The effective limit is the length of the words in the automaton.
    limit = automaton.get_stats()["longest_word"]

    # Forward primers binding in exactly one place: {index: location}
    fps = {
        fp: pos[0]
        for fp, pos in forward_primers(
            seq, primer_list, limit=limit, automaton=automaton
        ).items()
        if len(pos) == 1
    }

    # Reverse primers binding in exactly one place: {index: location}
    rps = {
        rp: pos[0]
        for rp, pos in reverse_primers(
            seq, primer_list, limit=limit, automaton=automaton
        ).items()
        if len(pos) == 1
    }

    products = []

    for fp, fposition in fps.items():
        for rp, rposition in rps.items():
            # Both complete primers plus the template between their 3' ends.
            size = len(primer_list[fp]) + rposition - fposition + len(primer_list[rp])
            # The forward primer has to end before the reverse primer begins
            # and the size has to fall within short..long.
            if short <= size <= long and fposition <= rposition:
                products.append(amplicon_tuple(fp, rp, fposition, rposition, size))

    return products
516
+
517
+
518
def flanking_primer_pairs(
    seq: Dseqrecord,
    primer_list: list[Primer] | tuple[Primer],
    target: tuple[int, int],
    limit: int = 16,
    automaton: ahocorasick.Automaton | None = None,
) -> list[amplicon_tuple[int, int, int, int, int]]:
    """
    Primer pairs that flank a target position (begin..end). This means that
    forward primers have to bind before or at the begin position and reverse primers
    have to bind at or after the end position.

    Candidate pairs come from :func:`primer_pairs` with a product size
    between the target length (end - begin) and the length of `seq`.

    The function returns a list of the same flat 5-namedtuples of integers returned
    from the :func:`primer_pairs` function, in the reverse of the order in
    which :func:`primer_pairs` reports them.

    ::

        [
         (index_fp1, index_rp1, position_fp1, position_rp1, size1),
         (index_fp2, index_rp2, position_fp2, position_rp2, size2),
        ]


    Parameters
    ----------
    seq : Dseqrecord
        Target sequence to find primer annealing positions.
    primer_list : list[Primer] | tuple[Primer]
        This is a list of pydna.primer.Primer objects or any object
        with a seq property such as Bio.SeqRecord.SeqRecord.
    target : tuple[int, int]
        Start and stop position for target sequence. Start has to be
        smaller than stop.
    limit : int, optional
        This is the part in the 3'-end of each primer that has to
        anneal. The default is 16. Not used when a non-empty automaton
        is given.
    automaton : ahocorasick.Automaton, optional
        Automaton made with the :func:`make_automaton`. The default is None.

    Raises
    ------
    AssertionError
        If begin is not smaller than end.

    Returns
    -------
    list[amplicon_tuple]
        List of named tuples with the fields
        (fp, rp, fposition, rposition, size).

    """
    begin, end = target

    assert begin < end, "begin has to be smaller than end."

    automaton = automaton or make_automaton(primer_list, limit=limit)

    # No usable primer means no flanking pair. Returning here also keeps
    # `limit` from being replaced by a longest word of zero.
    if not automaton:
        return []

    limit = automaton.get_stats()["longest_word"]

    amplicons = primer_pairs(
        seq,
        primer_list,
        short=end - begin,
        long=len(seq),
        limit=limit,
        automaton=automaton,
    )

    # The forward primer has to end at or before the target begins and the
    # reverse primer has to start at or after the target ends.
    products = [
        amplicon
        for amplicon in amplicons
        if amplicon.fposition <= begin and end <= amplicon.rposition
    ]

    return products[::-1]
586
+
587
+
588
def diff_primer_pairs(
    sequences: list[Dseqrecord] | tuple[Dseqrecord],
    primer_list: list[Primer] | tuple[Primer],
    short: int = 500,
    long: int = 1500,
    limit: int = 16,
    automaton: ahocorasick.Automaton | None = None,
    callback: Callable[[int, int], bool] = callback,
) -> list[tuple[primer_tuple, ...]]:
    """
    Primer pairs for diagnostic PCR.

    Given an iterable of sequences and a primer list, primers are selected that result in
    unique product sizes from each of the input sequences.

    Primers 1 and 2 both form PCR products from sequenceA and B below, but of
    different sizes. Primers 1 and 2 could be used to verify genetic modifications such
    as cloning an insert into a plasmid vector.

    ::

        1>           <2
        -------NNNNNNNNN----   sequenceA


        1>       <2
        -------XXXXX--------   sequenceB


    The callback function is called with two product sizes and returns True
    or False. It is meant to filter for PCR products that are likely to
    migrate to sufficiently distinct locations to be distinguishable on a
    typical agarose gel. A primer pair is kept only if the callback returns
    True for every combination of two of its product sizes.

    Only products of at least `short` and at most `long` base pairs are
    considered, and a primer pair is kept only if it gives a product from
    every input sequence.

    An example of the output for two sequences (Dseqrecord(-3308), Dseqrecord(-3613)).
    Primers 501 and 1806 would yield a 933 bp product with the 3308 bp sequence and the same
    primer pair would give 1212 bp with the 3613 bp sequence.

    A list of tuples is returned, one tuple per primer pair. Each of these
    holds one named 4-tuple (seq, fp, rp, size) for each sequence in the
    input argument.

    ::

        [
         ((Dseqrecord(-3308), 501, 1806, 933), (Dseqrecord(-3613), 501, 1806, 1212)),
        ]

    The list is ordered by the smallest size difference within each primer
    pair, largest first.

    Parameters
    ----------
    sequences : list[Dseqrecord] | tuple[Dseqrecord]
        Target sequences to find primer annealing positions.
    primer_list : list[Primer] | tuple[Primer]
        This is a list of pydna.primer.Primer objects or any object
        with a seq property such as Bio.SeqRecord.SeqRecord.
    short : int, optional
        Lower limit for the size of the PCR products. The default is 500.
    long : int, optional
        Upper limit for the size of the PCR products. The default is 1500.
    limit : int, optional
        This is the part in the 3'-end of each primer that has to
        anneal. The default is 16. Not used when a non-empty automaton
        is given.
    automaton : ahocorasick.Automaton, optional
        Automaton made with the :func:`make_automaton`. The default is None.
    callback : callable[[int, int], bool], optional
        A function accepting two product sizes and returning True or False.
        The default is :func:`callback`.

    Returns
    -------
    list[tuple[primer_tuple, ...]]
        One tuple per primer pair holding named tuples
        (seq, fp, rp, size), one per input sequence.

    """
    automaton = automaton or make_automaton(primer_list, limit=limit)

    # No usable primer means no diagnostic pair. Returning here also keeps
    # `limit` from being replaced by a longest word of zero.
    if not automaton:
        return []

    limit = automaton.get_stats()["longest_word"]
    number_of_sequences = len(sequences)

    # {primer pair: {product size: (fp, rp, sequence, sequence index)}}
    # The pair is a frozenset, so the orientation of the primers is ignored.
    primer_pair_dict = defaultdict(dict)

    for seq_index, seq in enumerate(sequences):
        for fp, rp, *_, size in primer_pairs(
            seq, primer_list, short=short, long=long, limit=limit, automaton=automaton
        ):
            primer_pair_dict[frozenset((fp, rp))][size] = fp, rp, seq, seq_index

    result = []

    for seqd in primer_pair_dict.values():
        # One distinct product size is needed per sequence ...
        if len(seqd) != number_of_sequences:
            continue
        # ... and every sequence has to contribute one of them. Without this
        # check, two products from the same sequence (the pair binding in
        # both orientations) could stand in for a missing sequence.
        if len({index for *_, index in seqd.values()}) != number_of_sequences:
            continue
        # Every pair of sizes has to be distinguishable, not only sizes that
        # happen to be adjacent in insertion order.
        if not all(callback(a, b) for a, b in combinations(seqd, 2)):
            continue
        result.append(
            (
                closest_diff(seqd.keys()),
                tuple(
                    primer_tuple(s, fp, rp, size)
                    for size, (fp, rp, s, _) in seqd.items()
                ),
            )
        )

    # Sort on the size difference only, so that ties never lead to a
    # comparison of the sequence records themselves.
    result.sort(key=lambda item: item[0], reverse=True)

    return [b for a, b in result]
702
+
703
+
704
def diff_primer_triplets(
    sequences: list[Dseqrecord] | tuple[Dseqrecord],
    primer_list: list[Primer] | tuple[Primer],
    limit: int = 16,
    short: int = 500,
    long: int = 1500,
    automaton: ahocorasick.Automaton | None = None,
    callback: Callable[[int, int], bool] = callback,
) -> list[tuple[primer_tuple, ...]]:
    """
    Primer triplets for diagnostic PCR.

    Given a list of sequences and a primer list, primer triplets are selected that result in
    PCR products of different sizes from each of the input sequences.

    Primers 1, 2 and 3 form PCR products from sequenceA and B below, but of
    different sizes. Primer 1 binds both sequences while primers 2 and 3 bind one
    sequence each. This primer triplet could be used to verify genetic
    modifications.

    ::

        1>           <2
        -------NNNNNNNNN----   sequenceA

        1>       <3
        -------XXXXX--------   sequenceB


    The callback function is called with the two product sizes of a
    candidate triplet and returns True or False. It is used to decide if the
    PCR products are likely to migrate to distinct locations on a typical
    agarose gel.

    Only products of at least `short` and at most `long` base pairs are
    considered. Primer pairs that give a product more than once (from the
    same or from different sequences) are discarded before triplets are
    formed.

    An example of the output for two sequences = [Dseqrecord(-7664), Dseqrecord(-3613)].
    Primer pair 701, 700 would produce a 724 bp product with the 7664 bp sequence while
    the primer pair 701, 1564 would give a 1450 bp product with the 3613 bp sequence.

    ::

        [
         ((Dseqrecord(-7664), 701, 700, 724), (Dseqrecord(-3613), 701, 1564, 1450)),
        ]

    The list is ordered by the smallest size difference within each triplet,
    largest first.

    Parameters
    ----------
    sequences : list[Dseqrecord] | tuple[Dseqrecord]
        Target sequences to find primer annealing positions. They are used
        as dict keys and therefore have to be hashable.
    primer_list : list[Primer] | tuple[Primer]
        This is a list of pydna.primer.Primer objects or any object
        with a seq property such as Bio.SeqRecord.SeqRecord.
    limit : int, optional
        This is the part in the 3'-end of each primer that has to
        anneal. The default is 16. Not used when a non-empty automaton
        is given.
    short : int, optional
        Lower limit for the size of the PCR products. The default is 500.
    long : int, optional
        Upper limit for the size of the PCR products. The default is 1500.
    automaton : ahocorasick.Automaton, optional
        Automaton made with the :func:`make_automaton`. The default is None.
    callback : callable[[int, int], bool], optional
        A function accepting two product sizes and returning True or False.
        The default is :func:`callback`.

    Returns
    -------
    list[tuple[primer_tuple, ...]]
        One tuple per primer triplet holding named tuples
        (seq, fp, rp, size), one per input sequence.

    """
    automaton = automaton or make_automaton(primer_list, limit=limit)

    # No usable primer means no diagnostic triplet. Returning here also keeps
    # `limit` from being replaced by a longest word of zero.
    if not automaton:
        return []

    limit = automaton.get_stats()["longest_word"]
    number_of_sequences = len(sequences)
    pp = {}
    # pp = { seq1: [(a,b,c,d,e), ...], seq2: [(i,j,k,l,m), ... ]}

    # All primer pairs for each sequence are collected.
    for seq in sequences:
        pp[seq] = primer_pairs(
            seq, primer_list, short=short, long=long, limit=limit, automaton=automaton
        )

    # We count all the times a specific pair occurs
    pair_counter = Counter()

    for seq, tuples in pp.items():
        for t in tuples:
            pair = frozenset(t[:2])  # first two integers, unordered
            pair_counter[pair] += 1

    # Pick pairs that appear more than once.
    pairs_to_remove = {pair for pair, count in pair_counter.items() if count > 1}

    # Remove pairs that appear more than once.
    for seq in pp:
        pp[seq] = [t for t in pp[seq] if frozenset(t[:2]) not in pairs_to_remove]

    # {primer trio: {product size: (fp, rp, sequence)}}
    primertrios = defaultdict(dict)

    for seq1, seq2 in combinations(sequences, 2):
        for fp1, rp1, *_, size1 in pp[seq1]:
            for fp2, rp2, *_, size2 in pp[seq2]:
                primertrio = frozenset((fp1, rp1, fp2, rp2))
                # Two pairs sharing exactly one primer make three primers.
                if len(primertrio) == 3 and callback(size1, size2):
                    if primertrios[primertrio]:
                        # The same three primers were already recorded from
                        # another combination of pairs, so the entry is dropped.
                        del primertrios[primertrio]
                    else:
                        primertrios[primertrio][size1] = (fp1, rp1, seq1)
                        primertrios[primertrio][size2] = (fp2, rp2, seq2)

    # NOTE(review): a trio only ever receives the two entries set above, so
    # with more than two sequences the length check below cannot pass --
    # confirm whether more than two sequences are meant to be supported.
    result = []
    for primertrio, seqd in primertrios.items():
        if len(seqd) == number_of_sequences and set(sequences) == set(
            s for *_, s in seqd.values()
        ):
            result.append(
                (
                    closest_diff(seqd.keys()),
                    tuple(
                        primer_tuple(s, fp, rp, size)
                        for size, (fp, rp, s) in seqd.items()
                    ),
                )
            )

    result.sort(key=lambda item: item[0], reverse=True)
    return [b for a, b in result]