varvamp 1.2.1__py3-none-any.whl → 1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,10 +2,16 @@
2
2
  primer creation and evaluation
3
3
  """
4
4
 
5
+ # BUILTIN
6
+ import itertools
7
+ import re
8
+ import multiprocessing
9
+ import functools
10
+
5
11
  # LIBS
6
- from Bio.Seq import Seq
12
+ from Bio.Seq import MutableSeq
13
+ from Bio import SeqIO
7
14
  import primer3 as p3
8
- import math
9
15
 
10
16
  # varVAMP
11
17
  from varvamp.scripts import config
@@ -60,6 +66,50 @@ def calc_dimer(seq1, seq2, structure=False):
60
66
  )
61
67
 
62
68
 
69
+ def has_end_overlap(dimer_result):
70
+ """
71
+ checks if two oligos overlap at their ends
72
+ Example:
73
+ xxxxxxxxtagc-------
74
+ --------atcgxxxxxxx
75
+ """
76
+ if dimer_result.structure_found:
77
+ # clean structure
78
+ structure = [x[4:] for x in dimer_result.ascii_structure_lines]
79
+ # check if we have an overlap that is large enough
80
+ overlap = len(structure[1].replace(" ", ""))
81
+ if overlap <= config.END_OVERLAP:
82
+ return False
83
+ # not more than one conseq. internal mismatch
84
+ if ' ' in structure[1].lstrip(" "):
85
+ return False
86
+ # The alignment length of the ACII structure is equal to the first part of the structure
87
+ # and the maximum possible alignment length is the cumulative length of both primers (-> no overlap at all)
88
+ alignment_length = len(structure[0])
89
+ maximum_alignment_length = len(re.findall("[ATCG]", "".join(structure)))
90
+ # this means that for a perfect end overlap the alignment length is equal to:
91
+ # len(primer1) + len(primer2) - overlap.
92
+ if alignment_length == maximum_alignment_length - overlap:
93
+ return True
94
+
95
+ return False
96
+
97
+
98
+ def is_dimer(seq1, seq2):
99
+ """
100
+ check if two sequences dimerize above threshold or are overlapping at their ends
101
+ """
102
+ dimer_result = calc_dimer(seq1, seq2, structure=True)
103
+ # check both the temperature and the deltaG
104
+ if dimer_result.tm > config.PRIMER_MAX_DIMER_TMP or dimer_result.dg < config.PRIMER_MAX_DIMER_DELTAG:
105
+ return True
106
+ # check for perfect end overlaps (this can result in primer extensions even though the tm/dg are okay)
107
+ if has_end_overlap(dimer_result):
108
+ return True
109
+
110
+ return False
111
+
112
+
63
113
  def calc_max_polyx(seq):
64
114
  """
65
115
  calculate maximum polyx of a seq
@@ -126,7 +176,7 @@ def rev_complement(seq):
126
176
  """
127
177
  reverse complement a sequence
128
178
  """
129
- return str(Seq(seq).reverse_complement())
179
+ return str(MutableSeq(seq).reverse_complement(inplace=True))
130
180
 
131
181
 
132
182
  def calc_permutation_penalty(amb_seq):
@@ -262,13 +312,14 @@ def filter_kmer_direction_independent(seq, primer_temps=config.PRIMER_TMP, gc_ra
262
312
  filter kmer for temperature, gc content,
263
313
  poly x, dinucleotide repeats and homodimerization
264
314
  """
315
+
265
316
  return(
266
317
  (primer_temps[0] <= calc_temp(seq) <= primer_temps[1])
267
318
  and (gc_range[0] <= calc_gc(seq) <= gc_range[1])
268
319
  and (calc_max_polyx(seq) <= config.PRIMER_MAX_POLYX)
269
320
  and (calc_max_dinuc_repeats(seq) <= config.PRIMER_MAX_DINUC_REPEATS)
270
321
  and (calc_base_penalty(seq, primer_temps, gc_range, primer_sizes) <= config.PRIMER_MAX_BASE_PENALTY)
271
- and (calc_dimer(seq, seq).tm <= config.PRIMER_MAX_DIMER_TMP)
322
+ and not is_dimer(seq, seq)
272
323
  )
273
324
 
274
325
 
@@ -292,51 +343,66 @@ def filter_kmer_direction_dependend(direction, kmer, ambiguous_consensus):
292
343
  )
293
344
 
294
345
 
295
- def find_primers(kmers, ambiguous_consensus, alignment):
346
+ def _process_kmer_batch(ambiguous_consensus, alignment, kmers):
296
347
  """
297
- filter kmers direction specific and append penalties
298
- --> potential primers
348
+ Helper function for multiprocessing: process a batch of kmers.
349
+ Returns (left_primers, right_primers) tuples.
299
350
  """
300
- left_primer_candidates = []
301
- right_primer_candidates = []
351
+ left_primers = []
352
+ right_primers = []
302
353
 
303
354
  for kmer in kmers:
304
- # filter kmers based on their direction independend stats
305
355
  if not filter_kmer_direction_independent(kmer[0]):
306
356
  continue
307
- # calc base penalty
308
- base_penalty = calc_base_penalty(kmer[0],config.PRIMER_TMP, config.PRIMER_GC_RANGE, config.PRIMER_SIZES)
309
- # calculate per base mismatches
310
- per_base_mismatches = calc_per_base_mismatches(
311
- kmer,
312
- alignment,
313
- ambiguous_consensus
314
- )
315
- # calculate permutation penealty
316
- permutation_penalty = calc_permutation_penalty(
317
- ambiguous_consensus[kmer[1]:kmer[2]]
318
- )
319
- # now check direction specific
357
+ # calc penalties
358
+ base_penalty = calc_base_penalty(kmer[0], config.PRIMER_TMP, config.PRIMER_GC_RANGE, config.PRIMER_SIZES)
359
+ per_base_mismatches = calc_per_base_mismatches(kmer, alignment, ambiguous_consensus)
360
+ permutation_penalty = calc_permutation_penalty(ambiguous_consensus[kmer[1]:kmer[2]])
361
+ # some filters depend on the direction of each primer
320
362
  for direction in ["+", "-"]:
321
- # check if kmer passes direction filter
322
363
  if not filter_kmer_direction_dependend(direction, kmer, ambiguous_consensus):
323
364
  continue
324
- # calculate the 3' penalty
325
- three_prime_penalty = calc_3_prime_penalty(
326
- direction,
327
- per_base_mismatches
328
- )
329
- # add all penalties
365
+ # calc penalties
366
+ three_prime_penalty = calc_3_prime_penalty(direction, per_base_mismatches)
330
367
  primer_penalty = base_penalty + permutation_penalty + three_prime_penalty
331
- # sort into lists
368
+ # add to lists depending on their direction
332
369
  if direction == "+":
333
- left_primer_candidates.append(
334
- [kmer[0], kmer[1], kmer[2], primer_penalty, per_base_mismatches]
335
- )
336
- if direction == "-":
337
- right_primer_candidates.append(
338
- [rev_complement(kmer[0]), kmer[1], kmer[2], primer_penalty, per_base_mismatches]
339
- )
370
+ left_primers.append([kmer[0], kmer[1], kmer[2], primer_penalty, per_base_mismatches])
371
+ else:
372
+ right_primers.append([rev_complement(kmer[0]), kmer[1], kmer[2], primer_penalty, per_base_mismatches])
373
+
374
+ return left_primers, right_primers
375
+
376
+
377
+ def find_primers(kmers, ambiguous_consensus, alignment, num_processes):
378
+ """
379
+ Filter kmers direction specific and append penalties --> potential primers.
380
+ Uses multiprocessing to process kmers in parallel.
381
+ """
382
+ if not kmers:
383
+ return [], []
384
+
385
+ # Convert kmers set to list for slicing
386
+ kmers = list(kmers)
387
+ batch_size = max(1, int(len(kmers)/num_processes))
388
+
389
+ # Split kmers into batches
390
+ batches = [kmers[i:i + batch_size] for i in range(0, len(kmers), batch_size)]
391
+ callable_f = functools.partial(
392
+ _process_kmer_batch,
393
+ ambiguous_consensus, alignment
394
+ )
395
+
396
+ # Solve dimers in parallel
397
+ with multiprocessing.Pool(processes=num_processes) as pool:
398
+ results = pool.map(callable_f, batches)
399
+
400
+ # Aggregate results
401
+ left_primer_candidates = []
402
+ right_primer_candidates = []
403
+ for left_primers, right_primers in results:
404
+ left_primer_candidates.extend(left_primers)
405
+ right_primer_candidates.extend(right_primers)
340
406
 
341
407
  return left_primer_candidates, right_primer_candidates
342
408
 
@@ -351,7 +417,7 @@ def create_primer_dictionary(primer_candidates, direction):
351
417
  for primer in primer_candidates:
352
418
  if direction == "+":
353
419
  direction_name = "LEFT"
354
- elif direction == "-":
420
+ else:
355
421
  direction_name = "RIGHT"
356
422
  primer_name = f"{direction_name}_{primer_idx}"
357
423
  primer_dict[primer_name] = primer
@@ -360,7 +426,7 @@ def create_primer_dictionary(primer_candidates, direction):
360
426
  return primer_dict
361
427
 
362
428
 
363
- def find_best_primers(left_primer_candidates, right_primer_candidates):
429
+ def find_best_primers(left_primer_candidates, right_primer_candidates, high_conservation:bool=False):
364
430
  """
365
431
  Primer candidates are likely overlapping. Here, the list of primers
366
432
  is sorted for the lowest to highest penalty. Then, the next lowest
@@ -386,16 +452,20 @@ def find_best_primers(left_primer_candidates, right_primer_candidates):
386
452
  primer_candidates.sort(key=lambda x: (x[3], x[1]))
387
453
  # ini everything with the primer with the lowest penalty
388
454
  to_retain = [primer_candidates[0]]
389
- primer_ranges = list(range(primer_candidates[0][1], primer_candidates[0][2]))
390
- primer_set = set(primer_ranges)
455
+ primer_set = set(range(primer_candidates[0][1], primer_candidates[0][2]))
391
456
 
392
- for primer in primer_candidates:
457
+ for primer in primer_candidates[1:]:
458
+ # for highly conserved alignments exclude everything that overlaps with the best primer
459
+ # this reduces graph complexity by quite a large margin
460
+ if high_conservation:
461
+ primer_positions =set(range(primer[1], primer[2]))
393
462
  # get the thirds of the primer, only consider the middle
394
- thirds_len = int((primer[2] - primer[1])/3)
395
- primer_positions = list(range(primer[1] + thirds_len, primer[2] - thirds_len))
463
+ else:
464
+ thirds_len = int((primer[2] - primer[1])/3)
465
+ primer_positions = set(range(primer[1] + thirds_len, primer[2] - thirds_len))
396
466
  # check if none of the nucleotides of the next primer
397
467
  # are already covered by a better primer
398
- if not any(x in primer_positions for x in primer_set):
468
+ if primer_set.isdisjoint(primer_positions):
399
469
  # update the primer set
400
470
  primer_set.update(primer_positions)
401
471
  # append this primer as it has a low penalty and is not overlapping
@@ -409,3 +479,77 @@ def find_best_primers(left_primer_candidates, right_primer_candidates):
409
479
 
410
480
  # and create a dict
411
481
  return all_primers
482
+
483
+
484
+ def get_permutations(seq):
485
+ """
486
+ get all permutations of an ambiguous sequence.
487
+ """
488
+ splits = [config.AMBIG_NUCS.get(nuc, [nuc]) for nuc in seq]
489
+
490
+ return[''.join(p) for p in itertools.product(*splits)]
491
+
492
+
493
+ def parse_primer_fasta(fasta_path):
494
+ """
495
+ Parse a primer FASTA file and return a list of sequences using BioPython.
496
+ """
497
+
498
+ sequences = []
499
+
500
+ for record in SeqIO.parse(fasta_path, "fasta"):
501
+ seq = str(record.seq).lower()
502
+ # Only include primers up to 40 nucleotides
503
+ if len(seq) <= 40:
504
+ sequences += get_permutations(seq)
505
+
506
+ return list(set(sequences)) # deduplication
507
+
508
+
509
+ def check_primer_against_externals(external_sequences, primer):
510
+ """
511
+ Worker function to check a single primer against all external sequences.
512
+ Returns the primer if it passes, None otherwise.
513
+ Handles both list format and dict format (name, data) tuples.
514
+ """
515
+
516
+ # Extract sequence based on input format
517
+ if isinstance(primer, tuple):
518
+ name, data = primer
519
+ seq = data[0]
520
+ else:
521
+ seq = primer[0]
522
+
523
+ for ext_seq in external_sequences:
524
+ if is_dimer(seq, ext_seq):
525
+ return None
526
+
527
+ return primer
528
+
529
+
530
+ def filter_non_dimer_candidates(primer_candidates, external_sequences, n_processes):
531
+ """
532
+ Filter out primer candidates that form dimers with external sequences.
533
+ Uses multiprocessing to speed up checks.
534
+ """
535
+ is_dict = isinstance(primer_candidates, dict)
536
+
537
+ callable_f = functools.partial(
538
+ check_primer_against_externals,
539
+ external_sequences
540
+ )
541
+
542
+ with multiprocessing.Pool(processes=n_processes) as pool:
543
+ # Prepare arguments based on input type
544
+ # qpcr probes are stored in dictionaries --> result in tuples when unpacked
545
+ if is_dict:
546
+ results = pool.map(callable_f, primer_candidates.items())
547
+ else:
548
+ results = pool.map(callable_f, primer_candidates)
549
+
550
+ # Filter and restore original format
551
+ if is_dict:
552
+ filtered_results = [result for result in results if result is not None]
553
+ return {name: data for name, data in filtered_results}
554
+ else:
555
+ return [primer for primer in results if primer is not None]
varvamp/scripts/qpcr.py CHANGED
@@ -7,11 +7,11 @@ import re
7
7
  import seqfold
8
8
  import itertools
9
9
  import multiprocessing
10
+ import functools
10
11
 
11
12
  # varVAMP
12
13
  from varvamp.scripts import config
13
14
  from varvamp.scripts import primers
14
- from varvamp.scripts import reporting
15
15
 
16
16
 
17
17
  def choose_probe_direction(seq):
@@ -51,35 +51,25 @@ def filter_probe_direction_dependent(seq):
51
51
  )
52
52
 
53
53
 
54
- def get_qpcr_probes(kmers, ambiguous_consensus, alignment_cleaned):
54
+ def _process_kmer_batch_probes(ambiguous_consensus, alignment_cleaned, kmers):
55
55
  """
56
- find potential qPCR probes
56
+ Helper function for multiprocessing: process a batch of kmers for probes.
57
+ Returns probe_candidates dictionary.
57
58
  """
58
59
  probe_candidates = {}
59
60
  probe_idx = 0
60
61
 
61
62
  for kmer in kmers:
62
- # filter probe for base params
63
- if not primers.filter_kmer_direction_independent(kmer[0], config.QPROBE_TMP, config.QPROBE_GC_RANGE,
64
- config.QPROBE_SIZES):
63
+ if not primers.filter_kmer_direction_independent(kmer[0], config.QPROBE_TMP, config.QPROBE_GC_RANGE, config.QPROBE_SIZES):
65
64
  continue
66
- # do not allow ambiguous chars at both ends
67
65
  if ambiguous_ends(ambiguous_consensus[kmer[1]:kmer[2]]):
68
66
  continue
69
- # calc penalties analogous to primer search
70
- base_penalty = primers.calc_base_penalty(kmer[0], config.QPROBE_TMP, config.QPROBE_GC_RANGE,
71
- config.QPROBE_SIZES)
72
- per_base_mismatches = primers.calc_per_base_mismatches(
73
- kmer,
74
- alignment_cleaned,
75
- ambiguous_consensus
76
- )
77
- permutation_penalty = primers.calc_permutation_penalty(
78
- ambiguous_consensus[kmer[1]:kmer[2]]
79
- )
80
- # determine the direction with more cytosine or set both if 50 %
67
+
68
+ base_penalty = primers.calc_base_penalty(kmer[0], config.QPROBE_TMP, config.QPROBE_GC_RANGE, config.QPROBE_SIZES)
69
+ per_base_mismatches = primers.calc_per_base_mismatches(kmer, alignment_cleaned, ambiguous_consensus)
70
+ permutation_penalty = primers.calc_permutation_penalty(ambiguous_consensus[kmer[1]:kmer[2]])
81
71
  direction = choose_probe_direction(kmer[0])
82
- # create probe dictionary
72
+
83
73
  if "+" in direction:
84
74
  if filter_probe_direction_dependent(kmer[0]):
85
75
  probe_name = f"PROBE_{probe_idx}_LEFT"
@@ -96,7 +86,44 @@ def get_qpcr_probes(kmers, ambiguous_consensus, alignment_cleaned):
96
86
  base_penalty + permutation_penalty + three_prime_penalty,
97
87
  per_base_mismatches, direction]
98
88
  probe_idx += 1
99
- # sort by penalty
89
+
90
+ return probe_candidates
91
+
92
+
93
+ def get_qpcr_probes(kmers, ambiguous_consensus, alignment_cleaned, num_processes):
94
+ """
95
+ Find potential qPCR probes using multiprocessing.
96
+ """
97
+
98
+ # Convert kmers set to list for batching
99
+ kmers = list(kmers)
100
+
101
+ # Split kmers into batches
102
+ batch_size = max(1, int(len(kmers) / num_processes))
103
+ batches = [kmers[i:i + batch_size] for i in range(0, len(kmers), batch_size)]
104
+
105
+ # Prepare arguments for each dimer
106
+ callable_f = functools.partial(
107
+ _process_kmer_batch_probes,
108
+ ambiguous_consensus, alignment_cleaned
109
+ )
110
+ with multiprocessing.Pool(processes=num_processes) as pool:
111
+ results = pool.map(callable_f, batches)
112
+
113
+ # Aggregate results and re-index probe names
114
+ probe_candidates = {}
115
+ probe_idx = 0
116
+ for batch_probes in results:
117
+ if batch_probes is None:
118
+ continue
119
+ for probe_name, probe_data in batch_probes.items():
120
+ # Extract direction from original probe name
121
+ direction = "LEFT" if "LEFT" in probe_name else "RIGHT"
122
+ new_probe_name = f"PROBE_{probe_idx}_{direction}"
123
+ probe_candidates[new_probe_name] = probe_data
124
+ probe_idx += 1
125
+
126
+ # Sort by penalty
100
127
  probe_candidates = dict(sorted(probe_candidates.items(), key=lambda x: x[1][3]))
101
128
 
102
129
  return probe_candidates
@@ -139,54 +166,30 @@ def hardfilter_amplicon(majority_consensus, left_primer, right_primer):
139
166
  )
140
167
 
141
168
 
142
- def check_end_overlap(dimer_result):
143
- """
144
- checks if two oligos overlap at their ends (pretty rare)
145
- Example:
146
- xxxxxxxxtagc-------
147
- --------atcgxxxxxxx
148
- """
149
- if dimer_result.structure_found:
150
- # clean structure
151
- structure = [x[4:] for x in dimer_result.ascii_structure_lines]
152
- # calc overlap and the cumulative len of the oligos
153
- overlap = len(structure[1].replace(" ", ""))
154
- nt_count = len(re.findall("[ATCG]", "".join(structure)))
155
- # check for overlaps at the ends and the min overlap (allows for some amount of mismatches)
156
- if overlap > config.END_OVERLAP and nt_count <= len(structure[0]) + overlap + 1 and " " not in structure[1].lstrip(" "):
157
- return True
158
-
159
- return False
160
-
161
-
162
- def forms_dimer_or_overhangs(right_primer, left_primer, probe, ambiguous_consensus):
169
+ def dimer_in_combinations(right_primer, left_primer, probe, ambiguous_consensus):
163
170
  """
164
- checks if combinations of primers/probe form dimers or overhangs
171
+ checks if primers cause dimers and if combinations of primers/probe including all permutations form dimers
165
172
  """
166
173
 
167
174
  forms_structure = False
168
175
 
169
176
  # first check if there are dimers between the two flanking primers
170
- if primers.calc_dimer(left_primer[0], right_primer[0]).tm > config.PRIMER_MAX_DIMER_TMP:
177
+ if primers.is_dimer(left_primer[0], right_primer[0]):
171
178
  return True
172
179
  # for the probe check all permutations and possible overhangs to ensure
173
180
  # that none of the primers could cause unspecific probe binding.
174
181
  # first get all permutations
175
- probe_per = reporting.get_permutations(ambiguous_consensus[probe[1]:probe[2]])
176
- left_per = reporting.get_permutations(ambiguous_consensus[left_primer[1]:left_primer[2]])
177
- right_per = reporting.get_permutations(ambiguous_consensus[right_primer[1]:right_primer[2]])
182
+ probe_per = primers.get_permutations(ambiguous_consensus[probe[1]:probe[2]])
183
+ left_per = primers.get_permutations(ambiguous_consensus[left_primer[1]:left_primer[2]])
184
+ right_per = primers.get_permutations(ambiguous_consensus[right_primer[1]:right_primer[2]])
178
185
  # then check all permutations
179
186
  for combination in [(probe_per, left_per), (probe_per, right_per)]:
180
- for oligo1 in combination[0]:
181
- for oligo2 in combination[1]:
182
- dimer_result = primers.calc_dimer(oligo1, oligo2, structure=True)
183
- if dimer_result.tm >= config.PRIMER_MAX_DIMER_TMP or check_end_overlap(dimer_result):
184
- forms_structure = True
185
- break
186
- # break all loops because we found an unwanted structure in one of the permutations
187
- # (either dimer formation or a too long overlap at the ends of the primer)
188
- if forms_structure:
187
+ for oligo1, oligo2 in itertools.product(*combination):
188
+ if primers.is_dimer(oligo1, oligo2):
189
+ forms_structure = True
189
190
  break
191
+ # break also outer loop because we found an unwanted structure in one of the permutations
192
+ # (either dimer formation or a too long overlap at the ends of the primer)
190
193
  if forms_structure:
191
194
  break
192
195
 
@@ -231,7 +234,7 @@ def assess_amplicons(left_subset, right_subset, qpcr_probes, probe, majority_con
231
234
  [config.QPROBE_TEMP_DIFF[0] <= probe_temp - x <= config.QPROBE_TEMP_DIFF[1] for x in primer_temps]):
232
235
  continue
233
236
  # .... all combination of oligos do not form dimers or overhangs.
234
- if forms_dimer_or_overhangs(right_primer, left_primer, qpcr_probes[probe], ambiguous_consensus):
237
+ if dimer_in_combinations(right_primer, left_primer, qpcr_probes[probe], ambiguous_consensus):
235
238
  continue
236
239
  # append to list and break as this is the primer combi
237
240
  # with the lowest penalty (primers are sorted by penalty)
@@ -245,54 +248,74 @@ def assess_amplicons(left_subset, right_subset, qpcr_probes, probe, majority_con
245
248
  return primer_combinations
246
249
 
247
250
 
248
- def find_qcr_schemes(qpcr_probes, left_primer_candidates, right_primer_candidates, majority_consensus,
249
- ambiguous_consensus):
251
+ def find_single_qpcr_scheme(left_primer_candidates, right_primer_candidates, qpcr_probes,
252
+ majority_consensus, ambiguous_consensus, probe):
250
253
  """
251
- this finds the final qPCR schemes. it slices for primers flanking a probe and
252
- test all left/right combinations whether they are potential amplicons. as primers
253
- are sorted by penalty, only the very first match is considered as this has the
254
- lowest penalty. however, probes are overlapping and there is a high chance that
255
- left and right primers are found multiple times. to consider only one primer-probe
256
- combination the probes are also sorted by penalty. therefore, if a primer
257
- combination has been found already the optimal probe was already selected and
258
- there is no need to consider this primer probe combination.
254
+ Find a qPCR scheme for a single probe.
259
255
  """
260
256
 
257
+ probe_name, probe_data = probe
258
+
259
+ # Generate flanking subsets within the worker process
260
+ left_subset = flanking_primer_subset(left_primer_candidates, "+", probe_data)
261
+ right_subset = flanking_primer_subset(right_primer_candidates, "-", probe_data)
262
+
263
+ if not left_subset or not right_subset:
264
+ return probe_name, None
265
+
266
+ primer_combination = assess_amplicons(
267
+ left_subset, right_subset, qpcr_probes, probe_name,
268
+ majority_consensus, ambiguous_consensus
269
+ )
270
+
271
+ return probe_name, primer_combination
272
+
273
+
274
+ def find_qcr_schemes(qpcr_probes, left_primer_candidates, right_primer_candidates,
275
+ majority_consensus, ambiguous_consensus, num_processes):
276
+ """
277
+ Find final qPCR schemes using multiprocessing to evaluate probes in parallel.
278
+ Probes are sorted by penalty, ensuring optimal probe selection.
279
+ """
261
280
  qpcr_scheme_candidates = []
262
281
  found_amplicons = []
263
282
  amplicon_nr = -1
264
283
 
265
- for probe in qpcr_probes:
266
- left_subset = flanking_primer_subset(left_primer_candidates, "+", qpcr_probes[probe])
267
- right_subset = flanking_primer_subset(right_primer_candidates, "-", qpcr_probes[probe])
268
- # consider if there are primers flanking the probe ...
269
- if not left_subset or not right_subset:
270
- continue
271
- primer_combination = assess_amplicons(left_subset, right_subset, qpcr_probes, probe, majority_consensus,
272
- ambiguous_consensus)
273
- # ... a combi has been found, ...
284
+ # Prepare arguments for parallel processing - pass full primer lists
285
+ batch_size = max(1, int(len(qpcr_probes) / num_processes))
286
+ callable_f = functools.partial(
287
+ find_single_qpcr_scheme,
288
+ left_primer_candidates, right_primer_candidates, qpcr_probes, majority_consensus, ambiguous_consensus
289
+ )
290
+
291
+ # Process probes in parallel
292
+ with multiprocessing.Pool(processes=num_processes) as pool:
293
+ results = pool.map(callable_f, qpcr_probes.items(), chunksize=batch_size)
294
+
295
+ # Aggregate results in original probe order (sorted by penalty)
296
+ for probe_name, primer_combination in results:
274
297
  if not primer_combination:
275
298
  continue
276
- # ...and this combi is not already present for a probe with a better penalty.
277
299
  if primer_combination in found_amplicons:
278
300
  continue
279
- # populate the primer dictionary:
301
+
280
302
  amplicon_nr += 1
281
303
  found_amplicons.append(primer_combination)
282
- qpcr_scheme_candidates.append(
283
- {
284
- "id": f"AMPLICON_{amplicon_nr}",
285
- "penalty": qpcr_probes[probe][3] + primer_combination[0][3] + primer_combination[1][3],
286
- "PROBE": qpcr_probes[probe],
287
- "LEFT": primer_combination[0],
288
- "RIGHT": primer_combination[1]
289
- }
290
- )
291
- # and again sort by total penalty (left + right + probe)
304
+ qpcr_scheme_candidates.append({
305
+ "id": f"AMPLICON_{amplicon_nr}",
306
+ "penalty": qpcr_probes[probe_name][3] + primer_combination[0][3] + primer_combination[1][3],
307
+ "PROBE": qpcr_probes[probe_name],
308
+ "LEFT": primer_combination[0],
309
+ "RIGHT": primer_combination[1]
310
+ })
311
+
312
+ # Sort by total penalty
313
+ qpcr_scheme_candidates.sort(key=lambda x: x["penalty"])
314
+
292
315
  return qpcr_scheme_candidates
293
316
 
294
317
 
295
- def process_single_amplicon_deltaG(amplicon, majority_consensus):
318
+ def process_single_amplicon_deltaG(majority_consensus, amplicon):
296
319
  """
297
320
  Process a single amplicon to test its deltaG and apply filtering.
298
321
  This function will be called concurrently by multiple threads.
@@ -310,7 +333,7 @@ def process_single_amplicon_deltaG(amplicon, majority_consensus):
310
333
  return amplicon
311
334
 
312
335
 
313
- def test_amplicon_deltaG_parallel(qpcr_schemes_candidates, majority_consensus, n_to_test, deltaG_cutoff, n_threads):
336
+ def test_amplicon_deltaG_parallel(qpcr_schemes_candidates, majority_consensus, n_to_test, deltaG_cutoff, n_processes):
314
337
  """
315
338
  Test all amplicon deltaGs for the top n hits at the lowest primer temperature
316
339
  and filters if they fall below the cutoff. Multiple processes are used
@@ -318,32 +341,33 @@ def test_amplicon_deltaG_parallel(qpcr_schemes_candidates, majority_consensus, n
318
341
  """
319
342
  final_amplicons = []
320
343
 
321
- # Create a pool of processes to handle the concurrent processing
322
- with multiprocessing.Pool(processes=n_threads) as pool:
323
- # Create a list of the first n amplicon tuples for processing
324
- # The list is sorted first on whether offset targets were predicted for the amplicon,
325
- # then by penalty. This ensures that amplicons with offset targets are always considered last
326
- amplicons = itertools.islice(
327
- sorted(qpcr_schemes_candidates, key=lambda x: (x.get("offset_targets", False), x["penalty"])),
328
- n_to_test
329
- )
330
- # process amplicons concurrently
331
- results = pool.starmap(process_single_amplicon_deltaG, [(amp, majority_consensus) for amp in amplicons])
332
- # Process the results
333
- retained_ranges = []
334
- for amp in results:
335
- # check if the amplicon overlaps with an amplicon that was previously
336
- # found and had a high enough deltaG
337
- if amp["deltaG"] <= deltaG_cutoff:
338
- continue
339
- amp_range = range(amp["LEFT"][1], amp["RIGHT"][2])
340
- overlaps_retained = False
341
- for r in retained_ranges:
342
- if amp_range.start < r.stop and r.start < amp_range.stop:
343
- overlaps_retained = True
344
- break
345
- if not overlaps_retained:
346
- final_amplicons.append(amp)
347
- retained_ranges.append(amp_range)
344
+ # Create a list of the first n amplicon tuples for processing
345
+ # The list is sorted first on whether offset targets were predicted for the amplicon,
346
+ # then by penalty. This ensures that amplicons with offset targets are always considered last
347
+ amplicons = list(sorted(qpcr_schemes_candidates, key=lambda x: (x.get("offset_targets", False), x["penalty"])))[:n_to_test]
348
+ # process amplicons concurrently
349
+ batch_size = max(1, int(n_to_test / n_processes))
350
+ callable_f = functools.partial(
351
+ process_single_amplicon_deltaG,
352
+ majority_consensus
353
+ )
354
+ with multiprocessing.Pool(processes=n_processes) as pool:
355
+ results = pool.map(callable_f, amplicons, chunksize=batch_size)
356
+ # Process the results
357
+ retained_ranges = []
358
+ for amp in results:
359
+ # check if the amplicon overlaps with an amplicon that was previously
360
+ # found and had a high enough deltaG
361
+ if amp["deltaG"] <= deltaG_cutoff:
362
+ continue
363
+ amp_range = range(amp["LEFT"][1], amp["RIGHT"][2])
364
+ overlaps_retained = False
365
+ for r in retained_ranges:
366
+ if amp_range.start < r.stop and r.start < amp_range.stop:
367
+ overlaps_retained = True
368
+ break
369
+ if not overlaps_retained:
370
+ final_amplicons.append(amp)
371
+ retained_ranges.append(amp_range)
348
372
 
349
373
  return final_amplicons