varvamp 1.2.1__py3-none-any.whl → 1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
varvamp/__init__.py CHANGED
@@ -1,3 +1,6 @@
- """Tool to design amplicons for highly variable virusgenomes"""
- _program = "varvamp"
- __version__ = "1.2.1"
+ from importlib.metadata import version, PackageNotFoundError
+
+ try:
+     __version__ = version("varvamp")
+ except PackageNotFoundError:
+     __version__ = "unknown"
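Note: the rewritten __init__.py resolves __version__ from the installed package metadata instead of a hardcoded string, so the version number only has to be maintained in the build configuration. A minimal standalone sketch of this lookup (demo script, not part of the package):

from importlib.metadata import version, PackageNotFoundError

try:
    # reads the version recorded in the installed distribution's metadata
    print(version("varvamp"))  # e.g. "1.3" for this release
except PackageNotFoundError:
    # raised when varvamp was never installed, e.g. a bare source checkout
    print("unknown")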
varvamp/command.py CHANGED
@@ -7,7 +7,6 @@ import sys
  import os
  import datetime
  import argparse
- import multiprocessing

  # varVAMP
  from varvamp.scripts import alignment
@@ -22,7 +21,6 @@ from varvamp.scripts import reporting
  from varvamp.scripts import scheme
  from varvamp.scripts import blast
  from varvamp import __version__
- from . import _program


  def get_args(sysargs):
@@ -30,7 +28,6 @@ def get_args(sysargs):
      arg parsing for varvamp
      """
      parser = argparse.ArgumentParser(
-         prog=_program,
          usage='''\tvarvamp <mode> --help\n\tvarvamp <mode> [mode optional arguments] <alignment> <output dir>''')
      mode_parser = parser.add_subparsers(
          title="varvamp mode",
@@ -49,7 +46,7 @@ def get_args(sysargs):
      QPCR_parser = mode_parser.add_parser(
          "qpcr",
          help="design qPCR primers",
-         usage="varvamp qpcr [optional arguments] <alignment> <output dir>"
+         usage="varvamp qpcr -t <threshold> [optional arguments] <alignment> <output dir>"
      )
      parser.add_argument(
          "input",
@@ -57,20 +54,12 @@ def get_args(sysargs):
          help="alignment file and dir to write results"
      )
      for par in (SINGLE_parser, TILED_parser, QPCR_parser):
-         par.add_argument(
-             "-t",
-             "--threshold",
-             metavar="",
-             type=float,
-             default=None,
-             help="threshold for consensus nucleotides"
-         )
          par.add_argument(
              "-a",
              "--n-ambig",
-             metavar="",
+             metavar="2",
              type=int,
-             default=None,
+             default=2,
              help="max number of ambiguous characters in a primer"
          )
          par.add_argument(
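Note: varVAMP surfaces defaults in --help through argparse's metavar (the placeholder rendered after the option strings), which is why metavar is set to the default value "2" above. A minimal standalone sketch of the effect (prog name and demo values are illustrative):

import argparse

parser = argparse.ArgumentParser(prog="demo")
parser.add_argument("-a", "--n-ambig", metavar="2", type=int, default=2,
                    help="max number of ambiguous characters in a primer")
# --help renders the option as "-a 2, --n-ambig 2  max number of ..."
print(parser.parse_args([]).n_ambig)           # -> 2
print(parser.parse_args(["-a", "3"]).n_ambig)  # -> 3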
@@ -96,7 +85,23 @@ def get_args(sysargs):
              type=str,
              default="varVAMP"
          )
+         par.add_argument(
+             "--compatible-primers",
+             metavar="None",
+             type=str,
+             default=None,
+             help="FASTA primer file with which new primers should not form dimers. Sequences >40 nt are ignored. Can significantly increase runtime."
+         )
+
      for par in (SINGLE_parser, TILED_parser):
+         par.add_argument(
+             "-t",
+             "--threshold",
+             metavar="",
+             type=float,
+             default=None,
+             help="consensus threshold (0-1) - if not set it will be estimated (higher values result in higher specificity at the expense of found primers)"
+         )
          par.add_argument(
              "-ol",
              "--opt-length",
@@ -117,9 +122,9 @@ def get_args(sysargs):
          "-o",
          "--overlap",
          type=int,
-         metavar="100",
-         default=100,
-         help="min overlap of the amplicons"
+         metavar="25",
+         default=25,
+         help="min overlap of the amplicon inserts"
      )
      SINGLE_parser.add_argument(
          "-n",
@@ -137,6 +142,13 @@ def get_args(sysargs):
          default=None,
          help="max number of ambiguous characters in a probe"
      )
+     QPCR_parser.add_argument(
+         "-t",
+         "--threshold",
+         required=True,
+         type=float,
+         help="consensus threshold (0-1) - higher values result in higher specificity at the expense of found primers"
+     )
      QPCR_parser.add_argument(
          "-n",
          "--test-n",
@@ -151,7 +163,7 @@ def get_args(sysargs):
          type=int,
          metavar="-3",
          default=-3,
-         help="minimum free energy (kcal/mol/K) cutoff at the lowest primer melting temp"
+         help="minimum free energy (kcal/mol/K) cutoff at the lowest primer melting temperature"
      )
      parser.add_argument(
          "--verbose",
@@ -178,19 +190,30 @@ def shared_workflow(args, log_file):
      """
      # start varvamp
      logging.varvamp_progress(log_file, mode=args.mode)
-
      # read in alignment and preprocess
      preprocessed_alignment = alignment.preprocess(args.input[0])
-     # check alignment length distribution
+     # read in external primer sequences with which new primers should not form dimers
+     if args.compatible_primers is not None:
+         compatible_primers = primers.parse_primer_fasta(args.compatible_primers)
+         if not compatible_primers:
+             logging.raise_error(
+                 "no valid primers found in the provided primer file.\n",
+                 log_file,
+             )
+     else:
+         compatible_primers = None
+     # check alignment length and number of gaps and report if its significantly more/less than expected
      logging.check_alignment_length(preprocessed_alignment, log_file)
+     logging.check_gaped_sequences(preprocessed_alignment, log_file)

      # estimate threshold or number of ambiguous bases if args were not supplied
-     if args.threshold is None or args.n_ambig is None:
-         args.threshold, args.n_ambig = param_estimation.get_parameters(preprocessed_alignment, args, log_file)
+     if args.threshold is None and not args.mode == 'qpcr':
+         args.threshold = param_estimation.get_parameters(preprocessed_alignment, args, log_file)
+     # set the number of ambiguous chars for qPCR probes to one less than for primers if not given
      if args.mode == "qpcr" and args.pn_ambig is None:
          if args.n_ambig == 0:
              args.pn_ambig = 0
-         if args.n_ambig > 0:
+         else:
              args.pn_ambig = args.n_ambig - 1
          with open(log_file, "a") as f:
              print(f"Automatic parameter selection set -pa {args.pn_ambig}.", file=f)
@@ -211,7 +234,6 @@ def shared_workflow(args, log_file):
      alignment_cleaned, gaps_to_mask = alignment.process_alignment(
          preprocessed_alignment,
          args.threshold,
-         args.threads
      )
      logging.varvamp_progress(
          log_file,
@@ -237,6 +259,9 @@ def shared_workflow(args, log_file):
          ambiguous_consensus,
          args.n_ambig
      )
+
+     potential_primer_regions = regions.mean(primer_regions, majority_consensus)
+
      if not primer_regions:
          logging.raise_error(
              "no primer regions found. Lower the threshold!",
@@ -247,7 +272,7 @@ def shared_workflow(args, log_file):
          log_file,
          progress=0.4,
          job="Finding primer regions.",
-         progress_text=f"{regions.mean(primer_regions, majority_consensus)} % of the consensus sequence will be evaluated for primers"
+         progress_text=f"{potential_primer_regions} % of the consensus sequence will be evaluated for primers"
      )

      # produce kmers for all primer regions
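Note: regions.mean is now computed once and kept in potential_primer_regions because the percentage is reused later to choose the primer-selection mode. Its implementation is not part of this diff; the sketch below is only an assumed reading based on how the value is used:

def mean_sketch(primer_regions, majority_consensus):
    # assumption: percent of consensus positions covered by primer regions
    covered = sum(stop - start for start, stop in primer_regions)
    return round(covered / len(majority_consensus) * 100, 1)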
@@ -266,7 +291,8 @@ def shared_workflow(args, log_file):
      left_primer_candidates, right_primer_candidates = primers.find_primers(
          kmers,
          ambiguous_consensus,
-         alignment_cleaned
+         alignment_cleaned,
+         args.threads
      )
      for primer_type, primer_candidates in [("+", left_primer_candidates), ("-", right_primer_candidates)]:
          if not primer_candidates:
@@ -282,20 +308,41 @@ def shared_workflow(args, log_file):
          progress_text=f"{len(left_primer_candidates)} fw and {len(right_primer_candidates)} rv potential primers"
      )

-     return alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates
+     # filter primers against user-provided list of compatible primers, can use multi-processing
+     if compatible_primers:
+         left_primer_candidates = primers.filter_non_dimer_candidates(
+             left_primer_candidates, compatible_primers, args.threads
+         )
+         right_primer_candidates = primers.filter_non_dimer_candidates(
+             right_primer_candidates, compatible_primers, args.threads
+         )
+         logging.varvamp_progress(
+             log_file,
+             progress=0.65,
+             job="Filtering primers against provided primers.",
+             progress_text=f"{len(left_primer_candidates)} fw and {len(right_primer_candidates)} rv primers after filtering"
+         )
+
+     return alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates, potential_primer_regions, compatible_primers


- def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_candidates, data_dir, log_file):
+ def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_candidates, potential_primer_regions, data_dir, log_file):
      """
      part of the workflow shared by the single and tiled mode
      """

      # find best primers and create primer dict
-     all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates)
+     # depending on the percentage of potential primer regions use high conservation mode
+     if potential_primer_regions >= 90:
+         all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates, high_conservation=True)
+         job_text = "Excluding overlapping primers (stringent)."
+     else:
+         all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates, high_conservation=False)
+         job_text = "Excluding overlapping primers."
      logging.varvamp_progress(
          log_file,
          progress=0.7,
-         job="Considering primers with low penalties.",
+         job=f"{job_text}",
          progress_text=f"{len(all_primers['+'])} fw and {len(all_primers['-'])} rv primers"
      )

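Note: primers.filter_non_dimer_candidates (used above) drops candidates that could dimerise with the user-supplied primers; its actual thermodynamic test is not shown in this diff. The seed-match heuristic below is only an illustrative stand-in for the idea (assumes uppercase ACGT sequences; all names are hypothetical):

COMPLEMENT = str.maketrans("ACGT", "TGCA")

def revcomp(seq):
    return seq.translate(COMPLEMENT)[::-1]

def could_dimerise(candidate, other, seed=8):
    # crude proxy: an 8-mer of the candidate appearing in the reverse
    # complement of the other primer suggests possible hybridisation
    rc = revcomp(other)
    return any(candidate[i:i + seed] in rc for i in range(len(candidate) - seed + 1))

def filter_candidates(candidates, compatible_primers):
    return [c for c in candidates
            if not any(could_dimerise(c, p) for p in compatible_primers)]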
@@ -307,8 +354,7 @@ def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_
      )
      if not amplicons:
          logging.raise_error(
-             "no amplicons found. Increase the max amplicon length or \
-             number of ambiguous bases or lower threshold!\n",
+             "no amplicons found. Increase the max amplicon length or number of ambiguous bases or lower threshold!\n",
              log_file,
              exit=True
          )
@@ -353,7 +399,7 @@ def single_workflow(args, amplicons, log_file):
      return amplicon_scheme


- def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candidates, all_primers, ambiguous_consensus, log_file, results_dir):
+ def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candidates, all_primers, ambiguous_consensus, log_file):
      """
      part of the workflow specific for the tiled mode
      """
@@ -367,21 +413,9 @@ def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candida
          amplicon_graph
      )

-     # check for dimers
-     dimers_not_solved = scheme.check_and_solve_heterodimers(
-         amplicon_scheme,
-         left_primer_candidates,
-         right_primer_candidates,
-         all_primers)
-     if dimers_not_solved:
-         logging.raise_error(
-             f"varVAMP found {len(dimers_not_solved)} primer dimers without replacements. Check the dimer file and perform the PCR for incomaptible amplicons in a sperate reaction.",
-             log_file
-         )
-         reporting.write_dimers(results_dir, dimers_not_solved)
-
      # evaluate coverage
-     # ATTENTION: Genome coverage of the scheme might still change slightly through resolution of primer dimers, but this potential, minor inaccuracy is currently accepted.
+     # ATTENTION: Genome coverage of the scheme might still change slightly through resolution of primer dimers,
+     # but this potential, minor inaccuracy is currently accepted.
      percent_coverage = round(coverage/len(ambiguous_consensus)*100, 2)
      logging.varvamp_progress(
          log_file,
@@ -398,10 +432,36 @@ def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candida
              "\t - relax primer settings (not recommended)\n",
              log_file
          )
-     return amplicon_scheme

+     # check for dimers
+     dimers_not_solved, n_initial_dimers = scheme.check_and_solve_heterodimers(
+         amplicon_scheme,
+         left_primer_candidates,
+         right_primer_candidates,
+         all_primers,
+         args.threads
+     )
+
+     # report dimers solve
+     if n_initial_dimers > 0 and not dimers_not_solved:
+         logging.varvamp_progress(
+             log_file,
+             progress=0.95,
+             job="Trying to solve primer dimers.",
+             progress_text=f"all dimers (n={n_initial_dimers}) could be resolved"
+         )
+     elif dimers_not_solved:
+         logging.varvamp_progress(
+             log_file,
+             progress=0.95,
+             job="Trying to solve primer dimers.",
+             progress_text=f"{len(dimers_not_solved)}/{n_initial_dimers} dimers could not be resolved"
+         )
+
+     return amplicon_scheme, dimers_not_solved

- def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majority_consensus, left_primer_candidates, right_primer_candidates, log_file):
+
+ def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majority_consensus, left_primer_candidates, right_primer_candidates, compatible_primers, log_file):
      """
      part of the workflow specific for the tiled mode
      """
@@ -412,7 +472,7 @@ def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majori
      )
      if not probe_regions:
          logging.raise_error(
-             "no regions that fullfill probe criteria! lower threshold or increase number of ambiguous chars in probe\n",
+             "no regions that fulfill probe criteria! lower threshold or increase number of ambiguous chars in probe\n",
              log_file,
              exit=True
          )
@@ -424,7 +484,7 @@ def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majori
          config.QPROBE_SIZES
      )
      # find potential probes
-     qpcr_probes = qpcr.get_qpcr_probes(probe_kmers, ambiguous_consensus, alignment_cleaned)
+     qpcr_probes = qpcr.get_qpcr_probes(probe_kmers, ambiguous_consensus, alignment_cleaned, args.threads)
      if not qpcr_probes:
          logging.raise_error(
              "no qpcr probes found\n",
@@ -438,8 +498,21 @@ def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majori
          progress_text=f"{len(qpcr_probes)} potential qPCR probes"
      )

+     # filter primers against non-dimer sequences if provided
+     if compatible_primers:
+         qpcr_probes = primers.filter_non_dimer_candidates(
+             qpcr_probes, compatible_primers, args.threads)
+         logging.varvamp_progress(
+             log_file,
+             progress=0.75,
+             job="Filtering probes against provided primers.",
+             progress_text=f"{len(qpcr_probes)} potential qPCR probes after filtering"
+         )
+
      # find unique amplicons with a low penalty and an internal probe
-     qpcr_scheme_candidates = qpcr.find_qcr_schemes(qpcr_probes, left_primer_candidates, right_primer_candidates, majority_consensus, ambiguous_consensus)
+     qpcr_scheme_candidates = qpcr.find_qcr_schemes(
+         qpcr_probes, left_primer_candidates, right_primer_candidates, majority_consensus, ambiguous_consensus, args.threads
+     )
      if not qpcr_scheme_candidates:
          logging.raise_error(
              "no qPCR scheme candidates found. lower threshold or increase number of ambiguous chars in primer and/or probe\n",
@@ -500,13 +573,13 @@ def main():
      blast.check_BLAST_installation(log_file)

      # mode unspecific part of the workflow
-     alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates = shared_workflow(args, log_file)
+     alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates, potential_primer_regions, compatible_primers = shared_workflow(args, log_file)

      # write files that are shared in all modes
      reporting.write_regions_to_bed(primer_regions, args.name, data_dir)
      reporting.write_alignment(data_dir, alignment_cleaned)
-     reporting.write_fasta(data_dir, f"majority_consensus", f"{args.name}_consensus",majority_consensus)
-     reporting.write_fasta(results_dir, f"ambiguous_consensus", f"{args.name}_consensus", ambiguous_consensus)
+     reporting.write_fasta(data_dir, f"majority_consensus", f"{args.name}_majority_consensus",majority_consensus)
+     reporting.write_fasta(results_dir, f"ambiguous_consensus", f"{args.name}_ambiguous_consensus", ambiguous_consensus)

      # Functions called from here on return lists of amplicons that are refined step-wise into final schemes.
      # These lists that are passed between functions and later used for reporting consist of dictionary elemnts,
@@ -522,10 +595,12 @@

      # SINGLE/TILED mode
      if args.mode == "tiled" or args.mode == "single":
+         dimers_not_solved = None
          all_primers, amplicons = single_and_tiled_shared_workflow(
              args,
              left_primer_candidates,
              right_primer_candidates,
+             potential_primer_regions,
              data_dir,
              log_file
          )
@@ -536,7 +611,7 @@
              log_file
          )
      elif args.mode == "tiled":
-         amplicon_scheme = tiled_workflow(
+         amplicon_scheme, dimers_not_solved = tiled_workflow(
              args,
              amplicons,
              left_primer_candidates,
@@ -544,11 +619,9 @@
              all_primers,
              ambiguous_consensus,
              log_file,
-             results_dir
          )

      # write files
-
      if args.mode == "tiled":
          # assign amplicon numbers from 5' to 3' along the genome
          amplicon_scheme.sort(key=lambda x: x["LEFT"][1])
@@ -562,7 +635,8 @@
              ambiguous_consensus,
              args.name,
              args.mode,
-             log_file
+             log_file,
+             dimers_not_solved
          )
          reporting.varvamp_plot(
              results_dir,
@@ -584,11 +658,11 @@
              majority_consensus,
              left_primer_candidates,
              right_primer_candidates,
+             compatible_primers,
              log_file
          )

          # write files
-
          # make sure amplicons with no off-target products and with low penalties get the lowest numbers
          final_schemes.sort(key=lambda x: (x.get("off_targets", False), x["penalty"]))
          reporting.write_regions_to_bed(probe_regions, args.name, data_dir, "probe")
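Note on the sort above: Python compares tuples elementwise and False < True, so schemes without off-target products are numbered first, ordered by ascending penalty within each group. Toy illustration:

schemes = [
    {"off_targets": True, "penalty": 1.0},
    {"penalty": 3.0},                        # missing key counts as False
    {"off_targets": False, "penalty": 2.0},
]
schemes.sort(key=lambda x: (x.get("off_targets", False), x["penalty"]))
# order: penalty 2.0, then penalty 3.0 (both without off-targets), then the off-target scheme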
varvamp/scripts/alignment.py CHANGED
@@ -2,14 +2,11 @@
  alignment preprocessing
  """

- # BUILT-INS
- import re
- import multiprocessing
-
  # varVAMP
  from varvamp.scripts import config

  # LIBS
+ import numpy as np
  from Bio import AlignIO
  from Bio.Seq import Seq

@@ -47,181 +44,74 @@ def preprocess(alignment_path):
      return preprocessed_alignment


- def find_internal_gaps(unique_gaps, gap):
-     """
-     find all unique gaps that
-     lie within the current gap
-     """
-     overlapping_gaps = []
-
-     if gap[1] - gap[0] == 0:
-         # if the gap length = 1 there are
-         # no overlapping gaps
-         overlapping_gaps = [gap]
-     else:
-         # for each unique gap check if the intersection with the
-         # gap is the same as the unique gap -> internal gap of
-         # the current gap
-         for unique_gap in unique_gaps:
-             unique_set = set(range(unique_gap[0], unique_gap[1]))
-             current_range = range(gap[0], gap[1])
-             intersection = unique_set.intersection(current_range)
-             if not intersection:
-                 continue
-             if min(intersection) == unique_gap[0] and max(intersection) + 1 == unique_gap[1]:
-                 overlapping_gaps.append(unique_gap)
-
-     return overlapping_gaps
-
-
- def find_overlapping_gaps_worker(gap_list, unique_gaps):
-     """
-     Worker function to find overlapping gaps and count their occurrences.
-     """
-     gap_dict_part = {}
-     for gap in gap_list:
-         overlapping_gaps = find_internal_gaps(unique_gaps, gap)
-         for overlapping_gap in overlapping_gaps:
-             if overlapping_gap in gap_dict_part:
-                 gap_dict_part[overlapping_gap] += 1
-             else:
-                 gap_dict_part[overlapping_gap] = 1
-     return gap_dict_part
-
-
- def create_gap_dictionary(unique_gaps, all_gaps, n_threads):
-     """
-     Creates a dictionary with all gap counts.
-     Counts also all overlapping gaps per gap.
-     Uses multiprocessing for parallelization.
-     """
-
-     with multiprocessing.Pool(processes=n_threads) as pool:
-         results = pool.starmap(find_overlapping_gaps_worker, [(gap_list, unique_gaps) for gap_list in all_gaps])
-
-     gap_dict = {}
-     for gap_dict_part in results:
-         for gap, count in gap_dict_part.items():
-             if gap in gap_dict:
-                 gap_dict[gap] += count
-             else:
-                 gap_dict[gap] = count
-
-     return gap_dict
-
-
- def find_gaps_to_mask(gap_dict, cutoff):
-     """
-     filters gaps for their freq cutoff.
-     condenses final gaps if there is
-     an overlap.
-     """
-     gaps_to_mask = []
-     potential_gaps = []
-     opened_region = []
-
-     # check for each region if it is covered
-     # by enough sequences
-     for gap in gap_dict:
-         if gap_dict[gap] > cutoff:
-             potential_gaps.append(gap)
-
-     # sort by start and stop
-     potential_gaps = sorted(potential_gaps)
-
-     # get the min and max of overlapping gaps
-     for i, region in enumerate(potential_gaps):
-         region = list(region)
-         if opened_region:
-             # write the opened region if the start of the current region
-             # > opened_region[stop] and the last still opened region
-             if region[0] > opened_region[1] or i == len(potential_gaps) - 1:
-                 gaps_to_mask.append(opened_region)
-                 opened_region = region
-             else:
-                 # 1 case: same start and further stop -> new stop
-                 if region[0] == opened_region[0]:
-                     opened_region[1] = region[1]
-                 # 2 case: further start and further stop -> new stop
-                 if region[0] > opened_region[0] and region[1] > opened_region[1]:
-                     opened_region[1] = region[1]
-         else:
-             opened_region = region
-
-     return gaps_to_mask
-
-
  def clean_gaps(alignment, gaps_to_mask):
      """
-     clean an alignment of large common deletions.
+     Clean an alignment of large common deletions based on gaps_to_mask.
+     gaps_to_mask: list of [start, end] (inclusive), sorted by start.
      """
      cleaned_alignment = []
+     gaps_to_mask = sorted(gaps_to_mask, key=lambda x: x[0])

-     for sequence in alignment:
+     for seq_id, seq in alignment:
          start = 0
-         masked_seq = str()
-         for region in gaps_to_mask:
-             # check if it is three bases or more and mask with 2 Ns
-             if region[1] - region[0] >= config.QAMPLICON_DEL_CUTOFF:
+         pieces = []
+         # for each seq in the alignment, mask the regions
+         for region_start, region_end in gaps_to_mask:
+             # mask length for this region
+             if (region_end - region_start + 1) >= config.QAMPLICON_DEL_CUTOFF:
                  mask = "NN"
-             # or mask with one N (small deletion)
              else:
                  mask = "N"
-             stop = region[0]
-             masked_seq_temp = sequence[1][start:stop]
-             # check if the deletion is at the start
-             if start == 0 and len(masked_seq_temp) == 0:
-                 masked_seq = mask
-             # check if deletion is not at start
-             elif start == 0 and len(masked_seq_temp) != 0:
-                 masked_seq = masked_seq_temp
-             # else we are in the middle of the alignment
-             else:
-                 masked_seq = masked_seq + mask + masked_seq_temp
-             start = region[1] + 1
-         if max(gaps_to_mask)[1] < len(sequence[1]) - 1:
-             # append the last seq if no gap is at
-             # the end of the sequence
-             start = max(gaps_to_mask)[1]
-             stop = len(sequence[1]) - 1
-             masked_seq_temp = sequence[1][start:stop]
-             masked_seq = masked_seq + mask + masked_seq_temp
-         else:
-             # append the mask to the end of the seq
-             masked_seq = masked_seq + mask
-
-         cleaned_alignment.append([sequence[0], masked_seq])
+             # part before region
+             pieces.append(seq[start:region_start])
+             # mask for region
+             pieces.append(mask)
+             # next start is after region
+             start = region_end + 1
+
+         # tail after last masked region
+         if start < len(seq):
+             pieces.append(seq[start:])
+
+         cleaned_alignment.append([seq_id, "".join(pieces)])

      return cleaned_alignment


- def process_alignment(preprocessed_alignment, threshold, n_threads):
+ def process_alignment(preprocessed_alignment, threshold):
      """
-     proprocesses alignment and cleans gaps
+     - build an array of shape (n_seq, seq_len)
+     - for each column, count how many sequences are '-'
+     - mark columns to mask if count > cutoff
+     - turn those columns into contiguous regions
      """
-     all_gaps = []
-
-     gap_cutoff = len(preprocessed_alignment) * (1 - threshold)
-     for seq in preprocessed_alignment:
-         # find all gaps for all sequences with regular expression -{min}
-         all_gaps.append(
-             [(gap.start(0), gap.end(0) - 1) for gap in re.finditer(
-                 "-{1,}", seq[1])]
-         )
-     unique_gaps = list(set(gaps for gap_list in all_gaps for gaps in gap_list))
-
-     if unique_gaps:
-         gap_dic = create_gap_dictionary(unique_gaps, all_gaps, n_threads)
-         gaps_to_mask = find_gaps_to_mask(gap_dic, gap_cutoff)
-         if gaps_to_mask:
-             alignment_cleaned = clean_gaps(
-                 preprocessed_alignment, gaps_to_mask
-             )
-         else:
-             alignment_cleaned = preprocessed_alignment
-     else:
-         gaps_to_mask = []
-         alignment_cleaned = preprocessed_alignment
+
+     # build char array
+     seqs = [seq for seq_id, seq in preprocessed_alignment]
+     arr = np.array([list(s) for s in seqs], dtype="U1")
+     n_seq, len_seq = arr.shape
+
+     # per-column gap counts
+     cols_to_mask = (arr == "-").sum(axis=0) > n_seq * (1 - threshold)
+
+     # convert bool mask into list of (start, end) regions (end inclusive)
+     gaps_to_mask = []
+     in_gap = False
+     start = None
+     for i, is_gap in enumerate(cols_to_mask):
+         if is_gap and not in_gap:
+             in_gap = True
+             start = i
+         elif not is_gap and in_gap:
+             in_gap = False
+             gaps_to_mask.append([start, i - 1])
+     if in_gap:
+         gaps_to_mask.append([start, len_seq - 1])
+
+     if not gaps_to_mask:
+         return preprocessed_alignment, []
+
+     alignment_cleaned = clean_gaps(preprocessed_alignment, gaps_to_mask)

      return alignment_cleaned, gaps_to_mask