varvamp 1.2.2__py3-none-any.whl → 1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- varvamp/__init__.py +6 -3
- varvamp/command.py +131 -57
- varvamp/scripts/alignment.py +54 -164
- varvamp/scripts/default_config.py +5 -3
- varvamp/scripts/logging.py +66 -20
- varvamp/scripts/param_estimation.py +84 -62
- varvamp/scripts/primers.py +190 -46
- varvamp/scripts/qpcr.py +141 -117
- varvamp/scripts/reporting.py +41 -34
- varvamp/scripts/scheme.py +101 -52
- varvamp-1.3.dist-info/METADATA +760 -0
- varvamp-1.3.dist-info/RECORD +22 -0
- {varvamp-1.2.2.dist-info → varvamp-1.3.dist-info}/WHEEL +1 -1
- varvamp-1.3.dist-info/licenses/LICENSE +674 -0
- varvamp-1.2.2.dist-info/METADATA +0 -87
- varvamp-1.2.2.dist-info/RECORD +0 -21
- {varvamp-1.2.2.dist-info → varvamp-1.3.dist-info}/entry_points.txt +0 -0
- {varvamp-1.2.2.dist-info → varvamp-1.3.dist-info}/top_level.txt +0 -0
varvamp/scripts/primers.py
CHANGED
|
@@ -2,10 +2,16 @@
|
|
|
2
2
|
primer creation and evaluation
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
# BUILTIN
|
|
6
|
+
import itertools
|
|
7
|
+
import re
|
|
8
|
+
import multiprocessing
|
|
9
|
+
import functools
|
|
10
|
+
|
|
5
11
|
# LIBS
|
|
6
|
-
from Bio.Seq import
|
|
12
|
+
from Bio.Seq import MutableSeq
|
|
13
|
+
from Bio import SeqIO
|
|
7
14
|
import primer3 as p3
|
|
8
|
-
import math
|
|
9
15
|
|
|
10
16
|
# varVAMP
|
|
11
17
|
from varvamp.scripts import config
|
|
@@ -60,6 +66,50 @@ def calc_dimer(seq1, seq2, structure=False):
|
|
|
60
66
|
)
|
|
61
67
|
|
|
62
68
|
|
|
69
|
+
def has_end_overlap(dimer_result):
    """
    checks if two oligos overlap at their ends
    Example:
    xxxxxxxxtagc-------
    --------atcgxxxxxxx

    dimer_result: object exposing `structure_found` and
    `ascii_structure_lines` (what calc_dimer returns with structure=True).
    Returns True only if the paired region is longer than
    config.END_OVERLAP, contains no two adjacent internal gaps and sits
    exactly at the ends of both oligos. Returns False otherwise, also when
    no structure was found.
    """
    if not dimer_result.structure_found:
        return False

    # clean structure: drop the first four characters of every line
    # (presumably the "SEQ\t"/"STR\t" line label - confirm against primer3)
    structure = [line[4:] for line in dimer_result.ascii_structure_lines]
    paired_line = structure[1]

    # check if we have an overlap that is large enough
    overlap = len(paired_line.replace(" ", ""))
    if overlap <= config.END_OVERLAP:
        return False

    # not more than one consecutive internal mismatch: a single gap inside
    # the paired region is tolerated, two adjacent gaps are not.
    # NOTE(review): the rendered source showed a one-character literal here,
    # which would reject every internal gap and contradict this rule.
    if "  " in paired_line.lstrip(" "):
        return False

    # The alignment length of the ASCII structure is equal to the length of
    # its first line and the maximum possible alignment length is the
    # cumulative length of both primers (-> no overlap at all).
    alignment_length = len(structure[0])
    maximum_alignment_length = len(re.findall("[ATCG]", "".join(structure)))

    # for a perfect end overlap the alignment length is equal to:
    # len(primer1) + len(primer2) - overlap.
    return alignment_length == maximum_alignment_length - overlap
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def is_dimer(seq1, seq2):
    """
    decide whether two sequences form an unwanted dimer

    A pair counts as a dimer if the predicted melting temperature is above
    config.PRIMER_MAX_DIMER_TMP, if the predicted deltaG is below
    config.PRIMER_MAX_DIMER_DELTAG, or if the oligos overlap at their ends.
    Returns True or False.
    """
    result = calc_dimer(seq1, seq2, structure=True)

    # thermodynamic thresholds: too stable by temperature or by deltaG
    too_stable = (
        result.tm > config.PRIMER_MAX_DIMER_TMP
        or result.dg < config.PRIMER_MAX_DIMER_DELTAG
    )
    if too_stable:
        return True

    # perfect end overlaps can result in primer extension even though
    # tm/dg are within limits
    return bool(has_end_overlap(result))
|
|
111
|
+
|
|
112
|
+
|
|
63
113
|
def calc_max_polyx(seq):
|
|
64
114
|
"""
|
|
65
115
|
calculate maximum polyx of a seq
|
|
@@ -126,7 +176,7 @@ def rev_complement(seq):
|
|
|
126
176
|
"""
|
|
127
177
|
reverse complement a sequence
|
|
128
178
|
"""
|
|
129
|
-
return str(
|
|
179
|
+
return str(MutableSeq(seq).reverse_complement(inplace=True))
|
|
130
180
|
|
|
131
181
|
|
|
132
182
|
def calc_permutation_penalty(amb_seq):
|
|
@@ -262,13 +312,14 @@ def filter_kmer_direction_independent(seq, primer_temps=config.PRIMER_TMP, gc_ra
|
|
|
262
312
|
filter kmer for temperature, gc content,
|
|
263
313
|
poly x, dinucleotide repeats and homodimerization
|
|
264
314
|
"""
|
|
315
|
+
|
|
265
316
|
return(
|
|
266
317
|
(primer_temps[0] <= calc_temp(seq) <= primer_temps[1])
|
|
267
318
|
and (gc_range[0] <= calc_gc(seq) <= gc_range[1])
|
|
268
319
|
and (calc_max_polyx(seq) <= config.PRIMER_MAX_POLYX)
|
|
269
320
|
and (calc_max_dinuc_repeats(seq) <= config.PRIMER_MAX_DINUC_REPEATS)
|
|
270
321
|
and (calc_base_penalty(seq, primer_temps, gc_range, primer_sizes) <= config.PRIMER_MAX_BASE_PENALTY)
|
|
271
|
-
and (
|
|
322
|
+
and not is_dimer(seq, seq)
|
|
272
323
|
)
|
|
273
324
|
|
|
274
325
|
|
|
@@ -292,51 +343,66 @@ def filter_kmer_direction_dependend(direction, kmer, ambiguous_consensus):
|
|
|
292
343
|
)
|
|
293
344
|
|
|
294
345
|
|
|
295
|
-
def
|
|
346
|
+
def _process_kmer_batch(ambiguous_consensus, alignment, kmers):
    """
    Worker for multiprocessing: turn one batch of kmers into primer candidates.

    Every kmer that passes the direction independent filter is scored and
    then tested once per direction. Each accepted candidate is stored as
    [sequence, start, stop, penalty, per_base_mismatches]; candidates for the
    "-" direction carry the reverse complement of the kmer sequence.
    Returns the tuple (left_primers, right_primers).
    """
    # "+" collects left primers, "-" collects right primers
    candidates = {"+": [], "-": []}

    for kmer in kmers:
        kmer_seq, start, stop = kmer[0], kmer[1], kmer[2]
        if filter_kmer_direction_independent(kmer_seq):
            # penalties that do not depend on the primer direction
            base_penalty = calc_base_penalty(
                kmer_seq, config.PRIMER_TMP, config.PRIMER_GC_RANGE, config.PRIMER_SIZES
            )
            per_base_mismatches = calc_per_base_mismatches(kmer, alignment, ambiguous_consensus)
            permutation_penalty = calc_permutation_penalty(ambiguous_consensus[start:stop])

            for direction, collected in candidates.items():
                # some filters depend on the direction of each primer
                if not filter_kmer_direction_dependend(direction, kmer, ambiguous_consensus):
                    continue
                three_prime_penalty = calc_3_prime_penalty(direction, per_base_mismatches)
                primer_penalty = base_penalty + permutation_penalty + three_prime_penalty
                # right primers are reported as reverse complement
                primer_seq = kmer_seq if direction == "+" else rev_complement(kmer_seq)
                collected.append([primer_seq, start, stop, primer_penalty, per_base_mismatches])

    return candidates["+"], candidates["-"]
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def find_primers(kmers, ambiguous_consensus, alignment, num_processes):
    """
    Filter kmers and attach penalties --> potential primers.

    The kmers are split into consecutive batches that are evaluated by
    _process_kmer_batch in a pool of num_processes worker processes.
    Returns (left_primer_candidates, right_primer_candidates); both are
    empty lists if no kmers were given.
    """
    if not kmers:
        return [], []

    # kmers may arrive as a set, slicing needs a list
    kmer_list = list(kmers)
    n_kmers = len(kmer_list)
    batch_size = max(1, int(n_kmers/num_processes))
    batches = [
        kmer_list[start:start + batch_size]
        for start in range(0, n_kmers, batch_size)
    ]

    # evaluate the kmer batches in parallel
    worker = functools.partial(_process_kmer_batch, ambiguous_consensus, alignment)
    with multiprocessing.Pool(processes=num_processes) as pool:
        batch_results = pool.map(worker, batches)

    # flatten the per-batch results, keeping the batch order
    left_primer_candidates = [primer for left, _ in batch_results for primer in left]
    right_primer_candidates = [primer for _, right in batch_results for primer in right]

    return left_primer_candidates, right_primer_candidates
|
|
342
408
|
|
|
@@ -351,7 +417,7 @@ def create_primer_dictionary(primer_candidates, direction):
|
|
|
351
417
|
for primer in primer_candidates:
|
|
352
418
|
if direction == "+":
|
|
353
419
|
direction_name = "LEFT"
|
|
354
|
-
|
|
420
|
+
else:
|
|
355
421
|
direction_name = "RIGHT"
|
|
356
422
|
primer_name = f"{direction_name}_{primer_idx}"
|
|
357
423
|
primer_dict[primer_name] = primer
|
|
@@ -360,7 +426,7 @@ def create_primer_dictionary(primer_candidates, direction):
|
|
|
360
426
|
return primer_dict
|
|
361
427
|
|
|
362
428
|
|
|
363
|
-
def find_best_primers(left_primer_candidates, right_primer_candidates):
|
|
429
|
+
def find_best_primers(left_primer_candidates, right_primer_candidates, high_conservation:bool=False):
|
|
364
430
|
"""
|
|
365
431
|
Primer candidates are likely overlapping. Here, the list of primers
|
|
366
432
|
is sorted for the lowest to highest penalty. Then, the next lowest
|
|
@@ -386,16 +452,20 @@ def find_best_primers(left_primer_candidates, right_primer_candidates):
|
|
|
386
452
|
primer_candidates.sort(key=lambda x: (x[3], x[1]))
|
|
387
453
|
# ini everything with the primer with the lowest penalty
|
|
388
454
|
to_retain = [primer_candidates[0]]
|
|
389
|
-
|
|
390
|
-
primer_set = set(primer_ranges)
|
|
455
|
+
primer_set = set(range(primer_candidates[0][1], primer_candidates[0][2]))
|
|
391
456
|
|
|
392
|
-
for primer in primer_candidates:
|
|
457
|
+
for primer in primer_candidates[1:]:
|
|
458
|
+
# for highly conserved alignments exclude everything that overlaps with the best primer
|
|
459
|
+
# this reduces graph complexity by quite a large margin
|
|
460
|
+
if high_conservation:
|
|
461
|
+
primer_positions =set(range(primer[1], primer[2]))
|
|
393
462
|
# get the thirds of the primer, only consider the middle
|
|
394
|
-
|
|
395
|
-
|
|
463
|
+
else:
|
|
464
|
+
thirds_len = int((primer[2] - primer[1])/3)
|
|
465
|
+
primer_positions = set(range(primer[1] + thirds_len, primer[2] - thirds_len))
|
|
396
466
|
# check if none of the nucleotides of the next primer
|
|
397
467
|
# are already covered by a better primer
|
|
398
|
-
if
|
|
468
|
+
if primer_set.isdisjoint(primer_positions):
|
|
399
469
|
# update the primer set
|
|
400
470
|
primer_set.update(primer_positions)
|
|
401
471
|
# append this primer as it has a low penalty and is not overlapping
|
|
@@ -409,3 +479,77 @@ def find_best_primers(left_primer_candidates, right_primer_candidates):
|
|
|
409
479
|
|
|
410
480
|
# and create a dict
|
|
411
481
|
return all_primers
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
def get_permutations(seq):
    """
    expand an ambiguous sequence into all sequences it can stand for

    Each character is looked up in config.AMBIG_NUCS; characters without an
    entry are kept as they are. Returns a list with one string per
    combination.
    """
    options_per_position = []
    for nuc in seq:
        options_per_position.append(config.AMBIG_NUCS.get(nuc, [nuc]))

    return list(map("".join, itertools.product(*options_per_position)))
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def parse_primer_fasta(fasta_path, max_length=40):
    """
    Parse a primer FASTA file and return a list of sequences using BioPython.

    fasta_path: FASTA file (anything SeqIO.parse accepts).
    max_length: records longer than this are skipped (default 40).

    Every record is lower-cased and expanded with get_permutations. The
    returned list holds each resulting sequence once, in order of first
    appearance, so the output is identical between runs.
    """
    # dict keys keep insertion order -> deterministic deduplication
    unique_sequences = {}

    for record in SeqIO.parse(fasta_path, "fasta"):
        seq = str(record.seq).lower()
        # Only include primers up to max_length nucleotides
        if len(seq) <= max_length:
            unique_sequences.update(dict.fromkeys(get_permutations(seq)))

    return list(unique_sequences)
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
def check_primer_against_externals(external_sequences, primer):
    """
    Worker function: test one primer against every external sequence.

    primer is either a list-like candidate whose first element is the
    sequence, or a (name, data) tuple where data[0] is the sequence.
    Returns the primer unchanged if it forms no dimer with any external
    sequence, None otherwise.
    """
    # (name, data) tuples come from dictionaries, everything else is a plain candidate
    if isinstance(primer, tuple):
        _, data = primer
        seq = data[0]
    else:
        seq = primer[0]

    # stop at the first external sequence that dimerizes with the primer
    if any(is_dimer(seq, ext_seq) for ext_seq in external_sequences):
        return None

    return primer
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
def filter_non_dimer_candidates(primer_candidates, external_sequences, n_processes):
    """
    Remove primer candidates that form dimers with external sequences.

    primer_candidates may be a dictionary (name -> data) or a list-like of
    candidates; the result has the matching type (dict or list). The checks
    run in a pool of n_processes worker processes.
    """
    worker = functools.partial(check_primer_against_externals, external_sequences)

    # dictionaries (e.g. qpcr probes) are checked as (name, data) tuples and
    # rebuilt into a dictionary afterwards
    if isinstance(primer_candidates, dict):
        work_items = primer_candidates.items()
        restore = dict
    else:
        work_items = primer_candidates
        restore = list

    with multiprocessing.Pool(processes=n_processes) as pool:
        checked = pool.map(worker, work_items)

    # the worker returns None for every candidate that has to be dropped
    return restore(result for result in checked if result is not None)
|
varvamp/scripts/qpcr.py
CHANGED
|
@@ -7,11 +7,11 @@ import re
|
|
|
7
7
|
import seqfold
|
|
8
8
|
import itertools
|
|
9
9
|
import multiprocessing
|
|
10
|
+
import functools
|
|
10
11
|
|
|
11
12
|
# varVAMP
|
|
12
13
|
from varvamp.scripts import config
|
|
13
14
|
from varvamp.scripts import primers
|
|
14
|
-
from varvamp.scripts import reporting
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
def choose_probe_direction(seq):
|
|
@@ -51,35 +51,25 @@ def filter_probe_direction_dependent(seq):
|
|
|
51
51
|
)
|
|
52
52
|
|
|
53
53
|
|
|
54
|
-
def
|
|
54
|
+
def _process_kmer_batch_probes(ambiguous_consensus, alignment_cleaned, kmers):
|
|
55
55
|
"""
|
|
56
|
-
|
|
56
|
+
Helper function for multiprocessing: process a batch of kmers for probes.
|
|
57
|
+
Returns probe_candidates dictionary.
|
|
57
58
|
"""
|
|
58
59
|
probe_candidates = {}
|
|
59
60
|
probe_idx = 0
|
|
60
61
|
|
|
61
62
|
for kmer in kmers:
|
|
62
|
-
|
|
63
|
-
if not primers.filter_kmer_direction_independent(kmer[0], config.QPROBE_TMP, config.QPROBE_GC_RANGE,
|
|
64
|
-
config.QPROBE_SIZES):
|
|
63
|
+
if not primers.filter_kmer_direction_independent(kmer[0], config.QPROBE_TMP, config.QPROBE_GC_RANGE, config.QPROBE_SIZES):
|
|
65
64
|
continue
|
|
66
|
-
# do not allow ambiguous chars at both ends
|
|
67
65
|
if ambiguous_ends(ambiguous_consensus[kmer[1]:kmer[2]]):
|
|
68
66
|
continue
|
|
69
|
-
|
|
70
|
-
base_penalty = primers.calc_base_penalty(kmer[0], config.QPROBE_TMP, config.QPROBE_GC_RANGE,
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
kmer,
|
|
74
|
-
alignment_cleaned,
|
|
75
|
-
ambiguous_consensus
|
|
76
|
-
)
|
|
77
|
-
permutation_penalty = primers.calc_permutation_penalty(
|
|
78
|
-
ambiguous_consensus[kmer[1]:kmer[2]]
|
|
79
|
-
)
|
|
80
|
-
# determine the direction with more cytosine or set both if 50 %
|
|
67
|
+
|
|
68
|
+
base_penalty = primers.calc_base_penalty(kmer[0], config.QPROBE_TMP, config.QPROBE_GC_RANGE, config.QPROBE_SIZES)
|
|
69
|
+
per_base_mismatches = primers.calc_per_base_mismatches(kmer, alignment_cleaned, ambiguous_consensus)
|
|
70
|
+
permutation_penalty = primers.calc_permutation_penalty(ambiguous_consensus[kmer[1]:kmer[2]])
|
|
81
71
|
direction = choose_probe_direction(kmer[0])
|
|
82
|
-
|
|
72
|
+
|
|
83
73
|
if "+" in direction:
|
|
84
74
|
if filter_probe_direction_dependent(kmer[0]):
|
|
85
75
|
probe_name = f"PROBE_{probe_idx}_LEFT"
|
|
@@ -96,7 +86,44 @@ def get_qpcr_probes(kmers, ambiguous_consensus, alignment_cleaned):
|
|
|
96
86
|
base_penalty + permutation_penalty + three_prime_penalty,
|
|
97
87
|
per_base_mismatches, direction]
|
|
98
88
|
probe_idx += 1
|
|
99
|
-
|
|
89
|
+
|
|
90
|
+
return probe_candidates
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def get_qpcr_probes(kmers, ambiguous_consensus, alignment_cleaned, num_processes):
    """
    Find potential qPCR probes using multiprocessing.

    The kmers are split into consecutive batches that are evaluated by
    _process_kmer_batch_probes in a pool of num_processes worker processes.
    Probe names are re-numbered across all batches
    (PROBE_<idx>_LEFT / PROBE_<idx>_RIGHT) and the resulting dictionary is
    sorted by the penalty stored at index 3 of each probe entry.
    Returns an empty dictionary if no kmers were given.
    """
    # nothing to do -> do not spin up worker processes (same as find_primers)
    if not kmers:
        return {}

    # Convert kmers set to list for batching
    kmers = list(kmers)

    # Split kmers into batches
    batch_size = max(1, int(len(kmers) / num_processes))
    batches = [kmers[i:i + batch_size] for i in range(0, len(kmers), batch_size)]

    # Bind the arguments shared by all batches
    callable_f = functools.partial(
        _process_kmer_batch_probes,
        ambiguous_consensus, alignment_cleaned
    )
    with multiprocessing.Pool(processes=num_processes) as pool:
        results = pool.map(callable_f, batches)

    # Aggregate results and re-index probe names, the per-batch indices
    # would otherwise collide between batches
    probe_candidates = {}
    probe_idx = 0
    for batch_probes in results:
        if batch_probes is None:
            continue
        for probe_name, probe_data in batch_probes.items():
            # Extract direction from original probe name
            direction = "LEFT" if "LEFT" in probe_name else "RIGHT"
            probe_candidates[f"PROBE_{probe_idx}_{direction}"] = probe_data
            probe_idx += 1

    # Sort by penalty
    probe_candidates = dict(sorted(probe_candidates.items(), key=lambda x: x[1][3]))

    return probe_candidates
|
|
@@ -139,54 +166,30 @@ def hardfilter_amplicon(majority_consensus, left_primer, right_primer):
|
|
|
139
166
|
)
|
|
140
167
|
|
|
141
168
|
|
|
142
|
-
def
|
|
143
|
-
"""
|
|
144
|
-
checks if two oligos overlap at their ends (pretty rare)
|
|
145
|
-
Example:
|
|
146
|
-
xxxxxxxxtagc-------
|
|
147
|
-
--------atcgxxxxxxx
|
|
148
|
-
"""
|
|
149
|
-
if dimer_result.structure_found:
|
|
150
|
-
# clean structure
|
|
151
|
-
structure = [x[4:] for x in dimer_result.ascii_structure_lines]
|
|
152
|
-
# calc overlap and the cumulative len of the oligos
|
|
153
|
-
overlap = len(structure[1].replace(" ", ""))
|
|
154
|
-
nt_count = len(re.findall("[ATCG]", "".join(structure)))
|
|
155
|
-
# check for overlaps at the ends and the min overlap (allows for some amount of mismatches)
|
|
156
|
-
if overlap > config.END_OVERLAP and nt_count <= len(structure[0]) + overlap + 1 and " " not in structure[1].lstrip(" "):
|
|
157
|
-
return True
|
|
158
|
-
|
|
159
|
-
return False
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
def forms_dimer_or_overhangs(right_primer, left_primer, probe, ambiguous_consensus):
|
|
169
|
+
def dimer_in_combinations(right_primer, left_primer, probe, ambiguous_consensus):
|
|
163
170
|
"""
|
|
164
|
-
checks if combinations of primers/probe form dimers
|
|
171
|
+
checks if primers cause dimers and if combinations of primers/probe including all permutations form dimers
|
|
165
172
|
"""
|
|
166
173
|
|
|
167
174
|
forms_structure = False
|
|
168
175
|
|
|
169
176
|
# first check if there are dimers between the two flanking primers
|
|
170
|
-
if primers.
|
|
177
|
+
if primers.is_dimer(left_primer[0], right_primer[0]):
|
|
171
178
|
return True
|
|
172
179
|
# for the probe check all permutations and possible overhangs to ensure
|
|
173
180
|
# that none of the primers could cause unspecific probe binding.
|
|
174
181
|
# first get all permutations
|
|
175
|
-
probe_per =
|
|
176
|
-
left_per =
|
|
177
|
-
right_per =
|
|
182
|
+
probe_per = primers.get_permutations(ambiguous_consensus[probe[1]:probe[2]])
|
|
183
|
+
left_per = primers.get_permutations(ambiguous_consensus[left_primer[1]:left_primer[2]])
|
|
184
|
+
right_per = primers.get_permutations(ambiguous_consensus[right_primer[1]:right_primer[2]])
|
|
178
185
|
# then check all permutations
|
|
179
186
|
for combination in [(probe_per, left_per), (probe_per, right_per)]:
|
|
180
|
-
for oligo1 in combination
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
if dimer_result.tm >= config.PRIMER_MAX_DIMER_TMP or check_end_overlap(dimer_result):
|
|
184
|
-
forms_structure = True
|
|
185
|
-
break
|
|
186
|
-
# break all loops because we found an unwanted structure in one of the permutations
|
|
187
|
-
# (either dimer formation or a too long overlap at the ends of the primer)
|
|
188
|
-
if forms_structure:
|
|
187
|
+
for oligo1, oligo2 in itertools.product(*combination):
|
|
188
|
+
if primers.is_dimer(oligo1, oligo2):
|
|
189
|
+
forms_structure = True
|
|
189
190
|
break
|
|
191
|
+
# break also outer loop because we found an unwanted structure in one of the permutations
|
|
192
|
+
# (either dimer formation or a too long overlap at the ends of the primer)
|
|
190
193
|
if forms_structure:
|
|
191
194
|
break
|
|
192
195
|
|
|
@@ -231,7 +234,7 @@ def assess_amplicons(left_subset, right_subset, qpcr_probes, probe, majority_con
|
|
|
231
234
|
[config.QPROBE_TEMP_DIFF[0] <= probe_temp - x <= config.QPROBE_TEMP_DIFF[1] for x in primer_temps]):
|
|
232
235
|
continue
|
|
233
236
|
# .... all combination of oligos do not form dimers or overhangs.
|
|
234
|
-
if
|
|
237
|
+
if dimer_in_combinations(right_primer, left_primer, qpcr_probes[probe], ambiguous_consensus):
|
|
235
238
|
continue
|
|
236
239
|
# append to list and break as this is the primer combi
|
|
237
240
|
# with the lowest penalty (primers are sorted by penalty)
|
|
@@ -245,54 +248,74 @@ def assess_amplicons(left_subset, right_subset, qpcr_probes, probe, majority_con
|
|
|
245
248
|
return primer_combinations
|
|
246
249
|
|
|
247
250
|
|
|
248
|
-
def
|
|
249
|
-
|
|
251
|
+
def find_single_qpcr_scheme(left_primer_candidates, right_primer_candidates, qpcr_probes,
                            majority_consensus, ambiguous_consensus, probe):
    """
    Evaluate one probe: look for a flanking primer pair that works with it.

    probe is a (probe_name, probe_data) tuple. Returns
    (probe_name, primer_combination); the second element is None if there
    are no flanking primers on one of the two sides, otherwise it is
    whatever assess_amplicons returns.
    """
    probe_name, probe_data = probe

    # the flanking subsets are generated here, inside the worker process
    left_subset = flanking_primer_subset(left_primer_candidates, "+", probe_data)
    right_subset = flanking_primer_subset(right_primer_candidates, "-", probe_data)

    if left_subset and right_subset:
        primer_combination = assess_amplicons(
            left_subset, right_subset, qpcr_probes, probe_name,
            majority_consensus, ambiguous_consensus
        )
    else:
        # without primers on both sides there is no amplicon
        primer_combination = None

    return probe_name, primer_combination
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def find_qcr_schemes(qpcr_probes, left_primer_candidates, right_primer_candidates,
                     majority_consensus, ambiguous_consensus, num_processes):
    """
    Find final qPCR schemes, evaluating the probes in parallel.

    Each probe is handled by find_single_qpcr_scheme in a pool of
    num_processes worker processes. Results are read back in the iteration
    order of qpcr_probes; a primer combination that was already accepted for
    an earlier probe is skipped. Returns a list of dictionaries with the keys
    "id", "penalty", "PROBE", "LEFT" and "RIGHT", sorted by "penalty".
    """
    # the full primer lists are handed to every worker
    chunk_size = max(1, int(len(qpcr_probes) / num_processes))
    worker = functools.partial(
        find_single_qpcr_scheme,
        left_primer_candidates, right_primer_candidates, qpcr_probes, majority_consensus, ambiguous_consensus
    )

    with multiprocessing.Pool(processes=num_processes) as pool:
        results = pool.map(worker, qpcr_probes.items(), chunksize=chunk_size)

    qpcr_scheme_candidates = []
    found_amplicons = []

    for probe_name, primer_combination in results:
        # no usable primer pair, or the pair is already taken by an earlier probe
        if not primer_combination or primer_combination in found_amplicons:
            continue

        probe = qpcr_probes[probe_name]
        left_primer, right_primer = primer_combination[0], primer_combination[1]
        qpcr_scheme_candidates.append({
            # amplicons are numbered in the order they are accepted
            "id": f"AMPLICON_{len(found_amplicons)}",
            "penalty": probe[3] + left_primer[3] + right_primer[3],
            "PROBE": probe,
            "LEFT": left_primer,
            "RIGHT": right_primer
        })
        found_amplicons.append(primer_combination)

    # best (lowest total penalty) scheme first
    qpcr_scheme_candidates.sort(key=lambda scheme: scheme["penalty"])

    return qpcr_scheme_candidates
|
|
293
316
|
|
|
294
317
|
|
|
295
|
-
def process_single_amplicon_deltaG(
|
|
318
|
+
def process_single_amplicon_deltaG(majority_consensus, amplicon):
|
|
296
319
|
"""
|
|
297
320
|
Process a single amplicon to test its deltaG and apply filtering.
|
|
298
321
|
This function will be called concurrently by multiple threads.
|
|
@@ -310,7 +333,7 @@ def process_single_amplicon_deltaG(amplicon, majority_consensus):
|
|
|
310
333
|
return amplicon
|
|
311
334
|
|
|
312
335
|
|
|
313
|
-
def test_amplicon_deltaG_parallel(qpcr_schemes_candidates, majority_consensus, n_to_test, deltaG_cutoff,
|
|
336
|
+
def test_amplicon_deltaG_parallel(qpcr_schemes_candidates, majority_consensus, n_to_test, deltaG_cutoff, n_processes):
|
|
314
337
|
"""
|
|
315
338
|
Test all amplicon deltaGs for the top n hits at the lowest primer temperature
|
|
316
339
|
and filters if they fall below the cutoff. Multiple processes are used
|
|
@@ -318,32 +341,33 @@ def test_amplicon_deltaG_parallel(qpcr_schemes_candidates, majority_consensus, n
|
|
|
318
341
|
"""
|
|
319
342
|
final_amplicons = []
|
|
320
343
|
|
|
321
|
-
# Create a
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
344
|
+
# Create a list of the first n amplicon tuples for processing
|
|
345
|
+
# The list is sorted first on whether offset targets were predicted for the amplicon,
|
|
346
|
+
# then by penalty. This ensures that amplicons with offset targets are always considered last
|
|
347
|
+
amplicons = list(sorted(qpcr_schemes_candidates, key=lambda x: (x.get("offset_targets", False), x["penalty"])))[:n_to_test]
|
|
348
|
+
# process amplicons concurrently
|
|
349
|
+
batch_size = max(1, int(n_to_test / n_processes))
|
|
350
|
+
callable_f = functools.partial(
|
|
351
|
+
process_single_amplicon_deltaG,
|
|
352
|
+
majority_consensus
|
|
353
|
+
)
|
|
354
|
+
with multiprocessing.Pool(processes=n_processes) as pool:
|
|
355
|
+
results = pool.map(callable_f, amplicons, chunksize=batch_size)
|
|
356
|
+
# Process the results
|
|
357
|
+
retained_ranges = []
|
|
358
|
+
for amp in results:
|
|
359
|
+
# check if the amplicon overlaps with an amplicon that was previously
|
|
360
|
+
# found and had a high enough deltaG
|
|
361
|
+
if amp["deltaG"] <= deltaG_cutoff:
|
|
362
|
+
continue
|
|
363
|
+
amp_range = range(amp["LEFT"][1], amp["RIGHT"][2])
|
|
364
|
+
overlaps_retained = False
|
|
365
|
+
for r in retained_ranges:
|
|
366
|
+
if amp_range.start < r.stop and r.start < amp_range.stop:
|
|
367
|
+
overlaps_retained = True
|
|
368
|
+
break
|
|
369
|
+
if not overlaps_retained:
|
|
370
|
+
final_amplicons.append(amp)
|
|
371
|
+
retained_ranges.append(amp_range)
|
|
348
372
|
|
|
349
373
|
return final_amplicons
|