varvamp 1.2.1__py3-none-any.whl → 1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,6 @@ data writing and visualization.
4
4
  # BUILT-INS
5
5
  import os
6
6
  import math
7
- import itertools
8
7
 
9
8
  # LIBS
10
9
  import pandas as pd
@@ -53,7 +52,7 @@ def write_regions_to_bed(primer_regions, scheme_name, path, mode=None):
53
52
  with open(outfile, 'w') as o:
54
53
  for counter, region in enumerate(primer_regions):
55
54
  print(
56
- f"{scheme_name}_consensus",
55
+ f"{scheme_name}_ambiguous_consensus",
57
56
  region[0],
58
57
  region[1],
59
58
  "REGION_"+str(counter),
@@ -68,9 +67,7 @@ def write_primers_to_bed(outfile, scheme_name, primer_name, primer_properties, n
68
67
  """
69
68
  with open(outfile, 'a') as o:
70
69
  # write header for primer bed
71
- if os.path.getsize(outfile) == 0 and sequence is not None:
72
- print("#chrom\tchromStart\tchromEnd\tprimer-name\tpool\tstrand\tprimer-sequence", file=o)
73
- data = [f"{scheme_name}_consensus",
70
+ data = [f"{scheme_name}_ambiguous_consensus",
74
71
  primer_properties[1], # start
75
72
  primer_properties[2], # stop
76
73
  primer_name,
@@ -96,20 +93,6 @@ def write_all_primers(path, scheme_name, all_primers):
96
93
  write_primers_to_bed(outfile, scheme_name, primer, all_primers[direction][primer], round(all_primers[direction][primer][3], 2), direction)
97
94
 
98
95
 
99
- def get_permutations(seq):
100
- """
101
- get all permutations of an ambiguous sequence. needed to
102
- correctly report the gc and the temperature.
103
- """
104
- groups = itertools.groupby(seq, lambda char: char not in config.AMBIG_NUCS)
105
- splits = []
106
- for b, group in groups:
107
- if b:
108
- splits.extend([[g] for g in group])
109
- else:
110
- for nuc in group:
111
- splits.append(config.AMBIG_NUCS[nuc])
112
- return[''.join(p) for p in itertools.product(*splits)]
113
96
 
114
97
 
115
98
  def calc_mean_stats(permutations):
@@ -150,7 +133,7 @@ def write_qpcr_to_files(path, final_schemes, ambiguous_consensus, scheme_name, l
150
133
  amp_name = f"{scheme_name}_{n}"
151
134
  # write bed amplicon file
152
135
  print(
153
- f"{scheme_name}_consensus",
136
+ f"{scheme_name}_ambiguous_consensus",
154
137
  amp["LEFT"][1],
155
138
  amp["RIGHT"][2],
156
139
  amp_name,
@@ -192,7 +175,7 @@ def write_qpcr_to_files(path, final_schemes, ambiguous_consensus, scheme_name, l
192
175
  else:
193
176
  direction = "+"
194
177
 
195
- permutations = get_permutations(seq)
178
+ permutations = primers.get_permutations(seq)
196
179
  gc, temp = calc_mean_stats(permutations)
197
180
  primer_name = f"{amp_name}_{oligo_type}"
198
181
 
@@ -226,7 +209,7 @@ def write_qpcr_to_files(path, final_schemes, ambiguous_consensus, scheme_name, l
226
209
  print(f">{primer_name}\n{seq.upper()}", file=fasta)
227
210
 
228
211
 
229
- def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_name, mode, log_file):
212
+ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_name, mode, log_file, primer_dimers=None):
230
213
  """
231
214
  write all relevant bed files and a tsv file with all primer stats
232
215
  """
@@ -235,6 +218,9 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
235
218
  amplicon_bed_file = os.path.join(path, "amplicons.bed")
236
219
  tabular_file = os.path.join(path, "primer_to_amplicon_assignment.tabular")
237
220
 
221
+ # Map old primer names to new amplicon-based names
222
+ name_mapping = {}
223
+
238
224
  # open files to write
239
225
  with open(tsv_file, "w") as tsv, open(amplicon_bed_file, "w") as bed, open(tabular_file, "w") as tabular:
240
226
  # write header for primer tsv
@@ -250,11 +236,11 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
250
236
  if mode == "single":
251
237
  primer_fasta_file = os.path.join(path, "primers.fasta")
252
238
  else:
253
- primer_fasta_file = os.path.join(path, f"primers_pool_{pool+1}.fasta")
239
+ primer_fasta_file = os.path.join(path, f"primers_pool_{pool + 1}.fasta")
254
240
  with open(primer_fasta_file, "w") as primer_fasta:
255
241
  for counter, amp in enumerate(amplicon_scheme[pool::len(pools)]):
256
242
  # give a new amplicon name
257
- amplicon_index = counter*len(pools) + pool
243
+ amplicon_index = counter * len(pools) + pool
258
244
  amp_name = f"{scheme_name}_{amplicon_index}"
259
245
  # get left and right primers and their names
260
246
  amp_length = amp["RIGHT"][2] - amp["LEFT"][1]
@@ -268,7 +254,7 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
268
254
  amplicon_has_off_target = "n.d."
269
255
  # write amplicon bed
270
256
  if mode == "tiled":
271
- bed_score = pool+1
257
+ bed_score = pool + 1
272
258
  elif mode == "single":
273
259
  bed_score = round(amp["LEFT"][3] + amp["RIGHT"][3], 1)
274
260
  amplicon_bed_records.append(
@@ -286,6 +272,10 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
286
272
  (f"{amp_name}_LEFT", f"{amp_name}_RIGHT")
287
273
  )
288
274
  )
275
+ # Build name mapping for dimers
276
+ name_mapping[amp["LEFT"][-1]] = f"{amp_name}_LEFT"
277
+ name_mapping[amp["RIGHT"][-1]] = f"{amp_name}_RIGHT"
278
+
289
279
  # write primer tsv and primer bed
290
280
  for direction, primer in [("+", amp["LEFT"]), ("-", amp["RIGHT"])]:
291
281
  seq = ambiguous_consensus[primer[1]:primer[2]]
@@ -297,7 +287,7 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
297
287
  # write primers to fasta pool file
298
288
  print(f">{primer_name}\n{seq.upper()}", file=primer_fasta)
299
289
  # calc primer parameters for all permutations
300
- permutations = get_permutations(seq)
290
+ permutations = primers.get_permutations(seq)
301
291
  gc, temp = calc_mean_stats(permutations)
302
292
  # write tsv file
303
293
  print(
@@ -305,7 +295,7 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
305
295
  amp_length,
306
296
  primer_name,
307
297
  primer[-1],
308
- pool+1,
298
+ pool + 1,
309
299
  primer[1] + 1,
310
300
  primer[2],
311
301
  seq.upper(),
@@ -323,13 +313,13 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
323
313
  (
324
314
  # will need amplicon_index for sorting
325
315
  amplicon_index,
326
- (primer_name, primer, pool+1, direction, seq.upper())
316
+ (primer_name, primer, pool + 1, direction, seq.upper())
327
317
  )
328
318
  )
329
319
  # write amplicon bed with amplicons sorted by start position
330
320
  for record in sorted(amplicon_bed_records, key=lambda x: x[0]):
331
321
  print(
332
- f"{scheme_name}_consensus",
322
+ f"{scheme_name}_ambiguous_consensus",
333
323
  *record,
334
324
  ".",
335
325
  sep="\t",
@@ -350,26 +340,41 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
350
340
  *record[1]
351
341
  )
352
342
 
343
+ # Write dimers with renamed primers
344
+ if primer_dimers:
345
+ write_dimers(path, primer_dimers, name_mapping)
353
346
 
354
- def write_dimers(path, primer_dimers):
347
+
348
+ def write_dimers(path, primer_dimers, name_mapping):
355
349
  """
356
350
  write dimers for which no replacement was found to file
357
351
  """
358
- tsv_file = os.path.join(path, "unsolvable_primer_dimers.tsv")
359
- with open(tsv_file, "w") as tsv:
360
- print(
361
- "pool\tprimer_name_1\tprimer_name_2\tdimer melting temp",
362
- file=tsv
363
- )
352
+ file = os.path.join(path, "unsolvable_primer_dimers.txt")
353
+ with open(file, "w") as f:
364
354
  for pool, primer1, primer2 in primer_dimers:
355
+ dimer_result = primers.calc_dimer(primer1[2][0], primer2[2][0], structure=True)
356
+ print(
357
+ "pool\tprimer 1\tprimer 2\tdimer melting temp\tdeltaG",
358
+ file=f
359
+ )
365
360
  print(
366
361
  pool+1,
367
- primer1[1],
368
- primer2[1],
369
- round(primers.calc_dimer(primer1[2][0], primer2[2][0]).tm, 1),
362
+ name_mapping[primer1[1]],
363
+ name_mapping[primer2[1]],
364
+ round(dimer_result.tm, 1),
365
+ dimer_result.dg,
370
366
  sep="\t",
371
- file=tsv
367
+ file=f
372
368
  )
369
+ structure = [x[4:] for x in dimer_result.ascii_structure_lines]
370
+ print("\nDimer structure:", file=f)
371
+ for line in structure:
372
+ print(
373
+ line,
374
+ file=f
375
+ )
376
+ print(file=f)
377
+
373
378
 
374
379
  def entropy(chars, states):
375
380
  """
varvamp/scripts/scheme.py CHANGED
@@ -5,6 +5,8 @@ amplicon search
5
5
  # BUILT-INS
6
6
  import heapq
7
7
  import math
8
+ import multiprocessing
9
+ import functools
8
10
 
9
11
  # varVAMP
10
12
  from varvamp.scripts import config, primers
@@ -73,7 +75,7 @@ def find_amplicons(all_primers, opt_len, max_len):
73
75
  amplicon_length = right_primer[2] - left_primer[1]
74
76
  if not opt_len <= amplicon_length <= max_len:
75
77
  continue
76
- if primers.calc_dimer(left_primer[0], right_primer[0]).tm > config.PRIMER_MAX_DIMER_TMP:
78
+ if primers.is_dimer(left_primer[0], right_primer[0]):
77
79
  continue
78
80
  # calculate length-dependent amplicon costs as the cumulative primer
79
81
  # penalty multiplied by the e^(fold length of the optimal length).
@@ -92,6 +94,26 @@ def find_amplicons(all_primers, opt_len, max_len):
92
94
  return amplicons
93
95
 
94
96
 
97
+ def has_qualifying_overlap(current_amplicon, next_amplicon, min_overlap):
98
+ """
99
+ check if two amplicons overlap sufficiently to connect them in the graph
100
+ """
101
+ # connect amplicons if they sufficiently overlap because:
102
+ # ... the start of next amplicon lies in the second half of the prior amplicon
103
+ if next_amplicon["LEFT"][1] < current_amplicon["LEFT"][1] + current_amplicon["length"] / 2:
104
+ return False
105
+ # ... the stop of the left primer of the next amplicon does not lie in the minimum amplicon insert
106
+ if next_amplicon["LEFT"][2] > current_amplicon["RIGHT"][1] - min_overlap:
107
+ return False
108
+ # ... half of the next amplicon does not overlap with the previous amplicon --> enough space for a
109
+ # further amplicon that lies in the second half of the next amplicon and cannot overlap with a primer of the
110
+ # current amplicon
111
+ if next_amplicon["RIGHT"][2] <= current_amplicon["RIGHT"][2] + next_amplicon["length"] / 2:
112
+ return False
113
+
114
+ return True
115
+
116
+
95
117
  def create_amplicon_graph(amplicons, min_overlap):
96
118
  """
97
119
  creates the amplicon graph.
@@ -100,34 +122,26 @@ def create_amplicon_graph(amplicons, min_overlap):
100
122
  amplicon_graph = {}
101
123
  nodes = []
102
124
 
103
- # add the maximum len of a primer to ensure that possible amplicon starts
104
- # before the min overlap
105
- min_overlap = min_overlap + config.PRIMER_SIZES[2]
106
-
107
125
  for current_amplicon in amplicons:
108
126
  # remember all vertices
109
127
  amplicon_id = current_amplicon["id"]
110
128
  nodes.append(amplicon_id)
111
- start = current_amplicon["LEFT"][1] + current_amplicon["length"]/2
112
- stop = current_amplicon["RIGHT"][2] - min_overlap
113
129
  for next_amplicon in amplicons:
114
- # check if the next amplicon lies within the start/stop range of
115
- # the current amplicon and if its non-overlapping part is large
116
- # enough to ensure space for a primer and the min overlap of the
117
- # following amplicon.
118
- if start <= next_amplicon["LEFT"][1] <= stop and next_amplicon["RIGHT"][2] > current_amplicon["RIGHT"][2] + next_amplicon["length"]/2:
119
- if amplicon_id not in amplicon_graph:
120
- amplicon_graph[amplicon_id] = {
121
- next_amplicon["id"]: (
122
- next_amplicon.get("off_targets", False),
123
- next_amplicon["penalty"]
124
- )
125
- }
126
- else:
127
- amplicon_graph[amplicon_id][next_amplicon["id"]] = (
130
+ if not has_qualifying_overlap(current_amplicon, next_amplicon, min_overlap):
131
+ continue
132
+ # --> write to graph
133
+ if amplicon_id not in amplicon_graph:
134
+ amplicon_graph[amplicon_id] = {
135
+ next_amplicon["id"]: (
128
136
  next_amplicon.get("off_targets", False),
129
137
  next_amplicon["penalty"]
130
138
  )
139
+ }
140
+ else:
141
+ amplicon_graph[amplicon_id][next_amplicon["id"]] = (
142
+ next_amplicon.get("off_targets", False),
143
+ next_amplicon["penalty"]
144
+ )
131
145
 
132
146
  # return a graph object
133
147
  return Graph(nodes, amplicon_graph)
@@ -274,6 +288,7 @@ def find_best_covering_scheme(amplicons, amplicon_graph):
274
288
  # if no previous nodes are found but the single amplicon results in the largest
275
289
  # coverage - return as the best scheme
276
290
  amplicon_path = [best_start_node]
291
+
277
292
  return best_coverage, create_scheme(amplicon_path, amps_by_id)
278
293
 
279
294
 
@@ -283,8 +298,15 @@ def test_scheme_for_dimers(amplicon_scheme):
283
298
  """
284
299
 
285
300
  primer_dimers = []
286
- pools = {amp["pool"] for amp in amplicon_scheme}
287
- for pool in pools:
301
+ non_dimers = {amp["pool"]:set() for amp in amplicon_scheme}
302
+ # write all primer sequences in the respective pools -->
303
+ # these primers should not be violated by primer switching
304
+ # and primers are only switched later if no primer dimers
305
+ # with the existing 'good' scheme are created
306
+ for amp in amplicon_scheme:
307
+ non_dimers[amp["pool"]].add(amp["LEFT"][0])
308
+ non_dimers[amp["pool"]].add(amp["RIGHT"][0])
309
+ for pool in non_dimers:
288
310
  # test the primer dimers only within the respective pools
289
311
  tested_primers = []
290
312
  for amp_index, amp in enumerate(amplicon_scheme):
@@ -297,13 +319,16 @@ def test_scheme_for_dimers(amplicon_scheme):
297
319
  current_seq = current_primer[2][0]
298
320
  for tested in tested_primers:
299
321
  tested_seq = tested[2][0]
300
- if primers.calc_dimer(current_seq, tested_seq).tm <= config.PRIMER_MAX_DIMER_TMP:
322
+ if not primers.is_dimer(current_seq, tested_seq):
301
323
  continue
302
324
  primer_dimers.append((current_primer, tested))
325
+ non_dimers[pool].discard(current_seq)
326
+ non_dimers[pool].discard(tested_seq)
303
327
  # and remember all tested primers
304
328
  tested_primers.append(current_primer)
305
329
 
306
- return primer_dimers
330
+ # report both dimers and non-dimers
331
+ return primer_dimers, non_dimers
307
332
 
308
333
 
309
334
  def get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidates):
@@ -317,13 +342,16 @@ def get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidat
317
342
  # test each primer in dimer
318
343
  for amp_index, primer_name, primer in dimer:
319
344
  overlapping_primers_temp = []
320
- thirds_len = int((primer[2] - primer[1]) / 3)
321
- # get the middle third of the primer (here are the previously excluded primers)
322
- overlap_set = set(range(primer[1] + thirds_len, primer[2] - thirds_len))
323
- # check in which list to look for them
345
+ # as switching could violate overlap criteria,
346
+ # only consider primers that overlap in the left half (LEFT primers)
347
+ # or right half (RIGHT primers) respectively, however this can result in slightly
348
+ # longer amplicons than allowed.
349
+ half_length = int((primer[2] - primer[1]) / 2)
324
350
  if "RIGHT" in primer_name:
351
+ overlap_set = set(range(primer[1] + half_length, primer[2]))
325
352
  primers_to_test = right_primer_candidates
326
353
  else:
354
+ overlap_set = set(range(primer[1], primer[1] + half_length))
327
355
  primers_to_test = left_primer_candidates
328
356
  # and check this list for all primers that overlap
329
357
  for potential_new in primers_to_test:
@@ -337,40 +365,60 @@ def get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidat
337
365
  return overlapping_primers
338
366
 
339
367
 
340
- def test_overlaps_for_dimers(overlapping_primers):
368
+ def test_overlaps_for_dimers(overlapping_primers, non_dimers):
341
369
  """
342
- test the overlapping primers for dimers. return new primers.
370
+ test all possible overlapping primers against each other for dimers
371
+ and return the first pair that doesn't form a dimer with each other
372
+ and with all non-dimer forming primers in the pool.
343
373
  """
344
374
  for first_overlap in overlapping_primers[0]:
375
+ if any(primers.is_dimer(seq, first_overlap[2][0]) for seq in non_dimers):
376
+ continue
345
377
  for second_overlap in overlapping_primers[1]:
346
- # return the first match. primers are sorted by penalty.
347
- # first pair that makes it has the lowest penalty
348
- if primers.calc_dimer(first_overlap[2][0], second_overlap[2][0]).tm <= config.PRIMER_MAX_DIMER_TMP:
378
+ if any(primers.is_dimer(seq, second_overlap[2][0]) for seq in non_dimers):
379
+ continue
380
+ if not primers.is_dimer(first_overlap[2][0], second_overlap[2][0]):
349
381
  return [first_overlap, second_overlap]
350
382
 
351
383
 
352
- def check_and_solve_heterodimers(amplicon_scheme, left_primer_candidates, right_primer_candidates, all_primers):
384
+ def _solve_single_dimer(amplicon_scheme, left_primer_candidates, right_primer_candidates, non_dimers_all_pools, dimer):
385
+ """
386
+ Helper function for multiprocessing: solve a single dimer independently.
387
+ Returns (amp_index, primer_name, new_primer) tuples or empty list if no solution.
388
+ """
389
+ pool = amplicon_scheme[dimer[0][0]]["pool"]
390
+ non_dimers = non_dimers_all_pools[pool]
391
+
392
+ overlapping_primers = get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidates)
393
+ new_primers = test_overlaps_for_dimers(overlapping_primers, non_dimers)
394
+
395
+ return new_primers if new_primers else []
396
+
397
+
398
+ def check_and_solve_heterodimers(amplicon_scheme, left_primer_candidates, right_primer_candidates, all_primers, num_processes):
353
399
  """
354
400
  check scheme for heterodimers, try to find
355
401
  new primers that overlap and replace the existing ones.
356
- this can lead to new primer dimers. therefore the scheme
357
- is checked a second time. if there are still primer dimers
358
- present the non-solvable dimers are returned
402
+ Uses multiprocessing to solve dimers in parallel.
359
403
  """
404
+ primer_dimers, non_dimers_all_pools = test_scheme_for_dimers(amplicon_scheme)
405
+ n_initial_dimers = len(primer_dimers)
360
406
 
361
- primer_dimers = test_scheme_for_dimers(amplicon_scheme)
407
+ if not primer_dimers:
408
+ return [], 0
362
409
 
363
- if primer_dimers:
364
- print(f"varVAMP found {len(primer_dimers)} dimer pairs in scheme ... trying to find replacements")
365
- else:
366
- return []
367
-
368
- for dimer in primer_dimers:
369
- # get overlapping primers that have not been considered
370
- overlapping_primers = get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidates)
371
- # test all possible primers against each other for dimers
372
- new_primers = test_overlaps_for_dimers(overlapping_primers)
373
- # now change these primers in the scheme
410
+ # Prepare arguments for each dimer
411
+ callable_f = functools.partial(
412
+ _solve_single_dimer,
413
+ amplicon_scheme, left_primer_candidates, right_primer_candidates, non_dimers_all_pools
414
+ )
415
+
416
+ # Solve dimers in parallel
417
+ with multiprocessing.Pool(processes=num_processes) as pool:
418
+ results = pool.map(callable_f, primer_dimers)
419
+
420
+ # Apply all solutions to the scheme
421
+ for new_primers in results:
374
422
  if new_primers:
375
423
  for amp_index, primer_name, primer in new_primers:
376
424
  # overwrite in final scheme
@@ -386,12 +434,13 @@ def check_and_solve_heterodimers(amplicon_scheme, left_primer_candidates, right_
386
434
  # and in all primers
387
435
  all_primers[strand][primer_name] = primer
388
436
  # get remaining dimers in the revised scheme and add pool identifier for reporting
437
+ remaining_primer_dimers, _ = test_scheme_for_dimers(amplicon_scheme)
389
438
  primer_dimers = [
390
439
  (amplicon_scheme[primer1[0]]["pool"], primer1, primer2)
391
- for primer1, primer2 in test_scheme_for_dimers(amplicon_scheme)
440
+ for primer1, primer2 in remaining_primer_dimers
392
441
  ]
393
442
 
394
- return primer_dimers
443
+ return primer_dimers, n_initial_dimers
395
444
 
396
445
 
397
446
  def find_single_amplicons(amplicons, n):