varvamp 1.2.2__py3-none-any.whl → 1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,6 @@ data writing and visualization.
4
4
  # BUILT-INS
5
5
  import os
6
6
  import math
7
- import itertools
8
7
 
9
8
  # LIBS
10
9
  import pandas as pd
@@ -94,20 +93,6 @@ def write_all_primers(path, scheme_name, all_primers):
94
93
  write_primers_to_bed(outfile, scheme_name, primer, all_primers[direction][primer], round(all_primers[direction][primer][3], 2), direction)
95
94
 
96
95
 
97
- def get_permutations(seq):
98
- """
99
- get all permutations of an ambiguous sequence. needed to
100
- correctly report the gc and the temperature.
101
- """
102
- groups = itertools.groupby(seq, lambda char: char not in config.AMBIG_NUCS)
103
- splits = []
104
- for b, group in groups:
105
- if b:
106
- splits.extend([[g] for g in group])
107
- else:
108
- for nuc in group:
109
- splits.append(config.AMBIG_NUCS[nuc])
110
- return[''.join(p) for p in itertools.product(*splits)]
111
96
 
112
97
 
113
98
  def calc_mean_stats(permutations):
@@ -190,7 +175,7 @@ def write_qpcr_to_files(path, final_schemes, ambiguous_consensus, scheme_name, l
190
175
  else:
191
176
  direction = "+"
192
177
 
193
- permutations = get_permutations(seq)
178
+ permutations = primers.get_permutations(seq)
194
179
  gc, temp = calc_mean_stats(permutations)
195
180
  primer_name = f"{amp_name}_{oligo_type}"
196
181
 
@@ -224,7 +209,7 @@ def write_qpcr_to_files(path, final_schemes, ambiguous_consensus, scheme_name, l
224
209
  print(f">{primer_name}\n{seq.upper()}", file=fasta)
225
210
 
226
211
 
227
- def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_name, mode, log_file):
212
+ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_name, mode, log_file, primer_dimers=None):
228
213
  """
229
214
  write all relevant bed files and a tsv file with all primer stats
230
215
  """
@@ -233,6 +218,9 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
233
218
  amplicon_bed_file = os.path.join(path, "amplicons.bed")
234
219
  tabular_file = os.path.join(path, "primer_to_amplicon_assignment.tabular")
235
220
 
221
+ # Map old primer names to new amplicon-based names
222
+ name_mapping = {}
223
+
236
224
  # open files to write
237
225
  with open(tsv_file, "w") as tsv, open(amplicon_bed_file, "w") as bed, open(tabular_file, "w") as tabular:
238
226
  # write header for primer tsv
@@ -248,11 +236,11 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
248
236
  if mode == "single":
249
237
  primer_fasta_file = os.path.join(path, "primers.fasta")
250
238
  else:
251
- primer_fasta_file = os.path.join(path, f"primers_pool_{pool+1}.fasta")
239
+ primer_fasta_file = os.path.join(path, f"primers_pool_{pool + 1}.fasta")
252
240
  with open(primer_fasta_file, "w") as primer_fasta:
253
241
  for counter, amp in enumerate(amplicon_scheme[pool::len(pools)]):
254
242
  # give a new amplicon name
255
- amplicon_index = counter*len(pools) + pool
243
+ amplicon_index = counter * len(pools) + pool
256
244
  amp_name = f"{scheme_name}_{amplicon_index}"
257
245
  # get left and right primers and their names
258
246
  amp_length = amp["RIGHT"][2] - amp["LEFT"][1]
@@ -266,7 +254,7 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
266
254
  amplicon_has_off_target = "n.d."
267
255
  # write amplicon bed
268
256
  if mode == "tiled":
269
- bed_score = pool+1
257
+ bed_score = pool + 1
270
258
  elif mode == "single":
271
259
  bed_score = round(amp["LEFT"][3] + amp["RIGHT"][3], 1)
272
260
  amplicon_bed_records.append(
@@ -284,6 +272,10 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
284
272
  (f"{amp_name}_LEFT", f"{amp_name}_RIGHT")
285
273
  )
286
274
  )
275
+ # Build name mapping for dimers
276
+ name_mapping[amp["LEFT"][-1]] = f"{amp_name}_LEFT"
277
+ name_mapping[amp["RIGHT"][-1]] = f"{amp_name}_RIGHT"
278
+
287
279
  # write primer tsv and primer bed
288
280
  for direction, primer in [("+", amp["LEFT"]), ("-", amp["RIGHT"])]:
289
281
  seq = ambiguous_consensus[primer[1]:primer[2]]
@@ -295,7 +287,7 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
295
287
  # write primers to fasta pool file
296
288
  print(f">{primer_name}\n{seq.upper()}", file=primer_fasta)
297
289
  # calc primer parameters for all permutations
298
- permutations = get_permutations(seq)
290
+ permutations = primers.get_permutations(seq)
299
291
  gc, temp = calc_mean_stats(permutations)
300
292
  # write tsv file
301
293
  print(
@@ -303,7 +295,7 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
303
295
  amp_length,
304
296
  primer_name,
305
297
  primer[-1],
306
- pool+1,
298
+ pool + 1,
307
299
  primer[1] + 1,
308
300
  primer[2],
309
301
  seq.upper(),
@@ -321,7 +313,7 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
321
313
  (
322
314
  # will need amplicon_index for sorting
323
315
  amplicon_index,
324
- (primer_name, primer, pool+1, direction, seq.upper())
316
+ (primer_name, primer, pool + 1, direction, seq.upper())
325
317
  )
326
318
  )
327
319
  # write amplicon bed with amplicons sorted by start position
@@ -348,26 +340,41 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
348
340
  *record[1]
349
341
  )
350
342
 
343
+ # Write dimers with renamed primers
344
+ if primer_dimers:
345
+ write_dimers(path, primer_dimers, name_mapping)
351
346
 
352
- def write_dimers(path, primer_dimers):
347
+
348
+ def write_dimers(path, primer_dimers, name_mapping):
353
349
  """
354
350
  write dimers for which no replacement was found to file
355
351
  """
356
- tsv_file = os.path.join(path, "unsolvable_primer_dimers.tsv")
357
- with open(tsv_file, "w") as tsv:
358
- print(
359
- "pool\tprimer_name_1\tprimer_name_2\tdimer melting temp",
360
- file=tsv
361
- )
352
+ file = os.path.join(path, "unsolvable_primer_dimers.txt")
353
+ with open(file, "w") as f:
362
354
  for pool, primer1, primer2 in primer_dimers:
355
+ dimer_result = primers.calc_dimer(primer1[2][0], primer2[2][0], structure=True)
356
+ print(
357
+ "pool\tprimer 1\tprimer 2\tdimer melting temp\tdeltaG",
358
+ file=f
359
+ )
363
360
  print(
364
361
  pool+1,
365
- primer1[1],
366
- primer2[1],
367
- round(primers.calc_dimer(primer1[2][0], primer2[2][0]).tm, 1),
362
+ name_mapping[primer1[1]],
363
+ name_mapping[primer2[1]],
364
+ round(dimer_result.tm, 1),
365
+ dimer_result.dg,
368
366
  sep="\t",
369
- file=tsv
367
+ file=f
370
368
  )
369
+ structure = [x[4:] for x in dimer_result.ascii_structure_lines]
370
+ print("\nDimer structure:", file=f)
371
+ for line in structure:
372
+ print(
373
+ line,
374
+ file=f
375
+ )
376
+ print(file=f)
377
+
371
378
 
372
379
  def entropy(chars, states):
373
380
  """
varvamp/scripts/scheme.py CHANGED
@@ -5,6 +5,8 @@ amplicon search
5
5
  # BUILT-INS
6
6
  import heapq
7
7
  import math
8
+ import multiprocessing
9
+ import functools
8
10
 
9
11
  # varVAMP
10
12
  from varvamp.scripts import config, primers
@@ -73,7 +75,7 @@ def find_amplicons(all_primers, opt_len, max_len):
73
75
  amplicon_length = right_primer[2] - left_primer[1]
74
76
  if not opt_len <= amplicon_length <= max_len:
75
77
  continue
76
- if primers.calc_dimer(left_primer[0], right_primer[0]).tm > config.PRIMER_MAX_DIMER_TMP:
78
+ if primers.is_dimer(left_primer[0], right_primer[0]):
77
79
  continue
78
80
  # calculate length dependend amplicon costs as the cumulative primer
79
81
  # penalty multiplied by the e^(fold length of the optimal length).
@@ -92,6 +94,26 @@ def find_amplicons(all_primers, opt_len, max_len):
92
94
  return amplicons
93
95
 
94
96
 
97
+ def has_qualifying_overlap(current_amplicon, next_amplicon, min_overlap):
98
+ """
99
+ check if two amplicons overlap sufficiently to connect them in the graph
100
+ """
101
+ # connect amplicons if they sufficiently overlap because:
102
+ # ... the start of next amplicon lies in the second half of the prior amplicon
103
+ if next_amplicon["LEFT"][1] < current_amplicon["LEFT"][1] + current_amplicon["length"] / 2:
104
+ return False
105
+ # ... the stop of the left primer of the next amplicon does not lie in the minimum amplicon insert
106
+ if next_amplicon["LEFT"][2] > current_amplicon["RIGHT"][1] - min_overlap:
107
+ return False
108
+ # ... half of the next amplicon does not overlap with the previous amplicon --> enough space for a
109
+ # further amplicon that lies in the second half next amplicon and cannot overlap with a primer of the
110
+ # current amplicon
111
+ if next_amplicon["RIGHT"][2] <= current_amplicon["RIGHT"][2] + next_amplicon["length"] / 2:
112
+ return False
113
+
114
+ return True
115
+
116
+
95
117
  def create_amplicon_graph(amplicons, min_overlap):
96
118
  """
97
119
  creates the amplicon graph.
@@ -100,34 +122,26 @@ def create_amplicon_graph(amplicons, min_overlap):
100
122
  amplicon_graph = {}
101
123
  nodes = []
102
124
 
103
- # add the maximum len of a primer to ensure that possible amplicon starts
104
- # before the min overlap
105
- min_overlap = min_overlap + config.PRIMER_SIZES[1]
106
-
107
125
  for current_amplicon in amplicons:
108
126
  # remember all vertices
109
127
  amplicon_id = current_amplicon["id"]
110
128
  nodes.append(amplicon_id)
111
- start = current_amplicon["LEFT"][1] + current_amplicon["length"]/2
112
- stop = current_amplicon["RIGHT"][1] - min_overlap
113
129
  for next_amplicon in amplicons:
114
- # check if the next amplicon lies within the start/stop range of
115
- # the current amplicon and if its non-overlapping part is large
116
- # enough to ensure space for a primer and the min overlap of the
117
- # following amplicon.
118
- if start <= next_amplicon["LEFT"][1] <= stop and next_amplicon["RIGHT"][2] > current_amplicon["RIGHT"][2] + next_amplicon["length"]/2:
119
- if amplicon_id not in amplicon_graph:
120
- amplicon_graph[amplicon_id] = {
121
- next_amplicon["id"]: (
122
- next_amplicon.get("off_targets", False),
123
- next_amplicon["penalty"]
124
- )
125
- }
126
- else:
127
- amplicon_graph[amplicon_id][next_amplicon["id"]] = (
130
+ if not has_qualifying_overlap(current_amplicon, next_amplicon, min_overlap):
131
+ continue
132
+ # --> write to graph
133
+ if amplicon_id not in amplicon_graph:
134
+ amplicon_graph[amplicon_id] = {
135
+ next_amplicon["id"]: (
128
136
  next_amplicon.get("off_targets", False),
129
137
  next_amplicon["penalty"]
130
138
  )
139
+ }
140
+ else:
141
+ amplicon_graph[amplicon_id][next_amplicon["id"]] = (
142
+ next_amplicon.get("off_targets", False),
143
+ next_amplicon["penalty"]
144
+ )
131
145
 
132
146
  # return a graph object
133
147
  return Graph(nodes, amplicon_graph)
@@ -274,6 +288,7 @@ def find_best_covering_scheme(amplicons, amplicon_graph):
274
288
  # if no previous nodes are found but the single amplicon results in the largest
275
289
  # coverage - return as the best scheme
276
290
  amplicon_path = [best_start_node]
291
+
277
292
  return best_coverage, create_scheme(amplicon_path, amps_by_id)
278
293
 
279
294
 
@@ -283,8 +298,15 @@ def test_scheme_for_dimers(amplicon_scheme):
283
298
  """
284
299
 
285
300
  primer_dimers = []
286
- pools = {amp["pool"] for amp in amplicon_scheme}
287
- for pool in pools:
301
+ non_dimers = {amp["pool"]:set() for amp in amplicon_scheme}
302
+ # write all primer sequences in the respective pools -->
303
+ # these primers should not be violated by primer switching
304
+ # and primers are only switched later if no primer dimers
305
+ # with the existing 'good' scheme are created
306
+ for amp in amplicon_scheme:
307
+ non_dimers[amp["pool"]].add(amp["LEFT"][0])
308
+ non_dimers[amp["pool"]].add(amp["RIGHT"][0])
309
+ for pool in non_dimers:
288
310
  # test the primer dimers only within the respective pools
289
311
  tested_primers = []
290
312
  for amp_index, amp in enumerate(amplicon_scheme):
@@ -297,13 +319,16 @@ def test_scheme_for_dimers(amplicon_scheme):
297
319
  current_seq = current_primer[2][0]
298
320
  for tested in tested_primers:
299
321
  tested_seq = tested[2][0]
300
- if primers.calc_dimer(current_seq, tested_seq).tm <= config.PRIMER_MAX_DIMER_TMP:
322
+ if not primers.is_dimer(current_seq, tested_seq):
301
323
  continue
302
324
  primer_dimers.append((current_primer, tested))
325
+ non_dimers[pool].discard(current_seq)
326
+ non_dimers[pool].discard(tested_seq)
303
327
  # and remember all tested primers
304
328
  tested_primers.append(current_primer)
305
329
 
306
- return primer_dimers
330
+ # report both dimers and non-dimers
331
+ return primer_dimers, non_dimers
307
332
 
308
333
 
309
334
  def get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidates):
@@ -317,13 +342,16 @@ def get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidat
317
342
  # test each primer in dimer
318
343
  for amp_index, primer_name, primer in dimer:
319
344
  overlapping_primers_temp = []
320
- thirds_len = int((primer[2] - primer[1]) / 3)
321
- # get the middle third of the primer (here are the previously excluded primers)
322
- overlap_set = set(range(primer[1] + thirds_len, primer[2] - thirds_len))
323
- # check in which list to look for them
345
+ # as switching could violate overlap criteria,
346
+ # only consider primers that overlap in the left half (LEFT primers)
347
+ # or right half (RIGHT primers) respectively, however this can result in slightly
348
+ # longer amplicons than allowed.
349
+ half_length = int((primer[2] - primer[1]) / 2)
324
350
  if "RIGHT" in primer_name:
351
+ overlap_set = set(range(primer[1] + half_length, primer[2]))
325
352
  primers_to_test = right_primer_candidates
326
353
  else:
354
+ overlap_set = set(range(primer[1], primer[1] + half_length))
327
355
  primers_to_test = left_primer_candidates
328
356
  # and check this list for all primers that overlap
329
357
  for potential_new in primers_to_test:
@@ -337,40 +365,60 @@ def get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidat
337
365
  return overlapping_primers
338
366
 
339
367
 
340
- def test_overlaps_for_dimers(overlapping_primers):
368
+ def test_overlaps_for_dimers(overlapping_primers, non_dimers):
341
369
  """
342
- test the overlapping primers for dimers. return new primers.
370
+ test all possible overlapping primers against each other for dimers
371
+ and return the first pair that doesn't form a dimer with each other
372
+ and with all non-dimer forming primers in the pool.
343
373
  """
344
374
  for first_overlap in overlapping_primers[0]:
375
+ if any(primers.is_dimer(seq, first_overlap[2][0]) for seq in non_dimers):
376
+ continue
345
377
  for second_overlap in overlapping_primers[1]:
346
- # return the first match. primers are sorted by penalty.
347
- # first pair that makes it has the lowest penalty
348
- if primers.calc_dimer(first_overlap[2][0], second_overlap[2][0]).tm <= config.PRIMER_MAX_DIMER_TMP:
378
+ if any(primers.is_dimer(seq, second_overlap[2][0]) for seq in non_dimers):
379
+ continue
380
+ if not primers.is_dimer(first_overlap[2][0], second_overlap[2][0]):
349
381
  return [first_overlap, second_overlap]
350
382
 
351
383
 
352
- def check_and_solve_heterodimers(amplicon_scheme, left_primer_candidates, right_primer_candidates, all_primers):
384
+ def _solve_single_dimer(amplicon_scheme, left_primer_candidates, right_primer_candidates, non_dimers_all_pools, dimer):
385
+ """
386
+ Helper function for multiprocessing: solve a single dimer independently.
387
+ Returns (amp_index, primer_name, new_primer) tuples or empty list if no solution.
388
+ """
389
+ pool = amplicon_scheme[dimer[0][0]]["pool"]
390
+ non_dimers = non_dimers_all_pools[pool]
391
+
392
+ overlapping_primers = get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidates)
393
+ new_primers = test_overlaps_for_dimers(overlapping_primers, non_dimers)
394
+
395
+ return new_primers if new_primers else []
396
+
397
+
398
+ def check_and_solve_heterodimers(amplicon_scheme, left_primer_candidates, right_primer_candidates, all_primers, num_processes):
353
399
  """
354
400
  check scheme for heterodimers, try to find
355
401
  new primers that overlap and replace the existing ones.
356
- this can lead to new primer dimers. therefore the scheme
357
- is checked a second time. if there are still primer dimers
358
- present the non-solvable dimers are returned
402
+ Uses multiprocessing to solve dimers in parallel.
359
403
  """
404
+ primer_dimers, non_dimers_all_pools = test_scheme_for_dimers(amplicon_scheme)
405
+ n_initial_dimers = len(primer_dimers)
360
406
 
361
- primer_dimers = test_scheme_for_dimers(amplicon_scheme)
407
+ if not primer_dimers:
408
+ return [], 0
362
409
 
363
- if primer_dimers:
364
- print(f"varVAMP found {len(primer_dimers)} dimer pairs in scheme ... trying to find replacements")
365
- else:
366
- return []
367
-
368
- for dimer in primer_dimers:
369
- # get overlapping primers that have not been considered
370
- overlapping_primers = get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidates)
371
- # test all possible primers against each other for dimers
372
- new_primers = test_overlaps_for_dimers(overlapping_primers)
373
- # now change these primers in the scheme
410
+ # Prepare arguments for each dimer
411
+ callable_f = functools.partial(
412
+ _solve_single_dimer,
413
+ amplicon_scheme, left_primer_candidates, right_primer_candidates, non_dimers_all_pools
414
+ )
415
+
416
+ # Solve dimers in parallel
417
+ with multiprocessing.Pool(processes=num_processes) as pool:
418
+ results = pool.map(callable_f, primer_dimers)
419
+
420
+ # Apply all solutions to the scheme
421
+ for new_primers in results:
374
422
  if new_primers:
375
423
  for amp_index, primer_name, primer in new_primers:
376
424
  # overwrite in final scheme
@@ -386,12 +434,13 @@ def check_and_solve_heterodimers(amplicon_scheme, left_primer_candidates, right_
386
434
  # and in all primers
387
435
  all_primers[strand][primer_name] = primer
388
436
  # get remaining dimers in the revised scheme and add pool identifier for reporting
437
+ remaining_primer_dimers, _ = test_scheme_for_dimers(amplicon_scheme)
389
438
  primer_dimers = [
390
439
  (amplicon_scheme[primer1[0]]["pool"], primer1, primer2)
391
- for primer1, primer2 in test_scheme_for_dimers(amplicon_scheme)
440
+ for primer1, primer2 in remaining_primer_dimers
392
441
  ]
393
442
 
394
- return primer_dimers
443
+ return primer_dimers, n_initial_dimers
395
444
 
396
445
 
397
446
  def find_single_amplicons(amplicons, n):