tb-consensus-aligner 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,491 @@
1
+ import argparse
2
+ import re
3
+ import os
4
+ from collections import defaultdict
5
+ from pathlib import Path
6
+ from Bio import SeqIO
7
+ import gzip
8
+ '''
9
+ This script creates consensus fastas from a collection of VCF files and the MTBC ancestor reference genome.
10
+ SNPs with a frequency >90% are encoded with the alternative base from the VCF.
11
+ SNPs with a frequency between 10% and 90% are encoded with the ambiguity base.
12
+ SNPs with a frequency <10% are encoded with the ancestral base from the reference genome.
13
+
14
+ Large deletions (coverage 0) are encoded by dashes (-).
15
+ Small deletions with frequency >90% are encoded by dashes (-).
16
+ Small deletions with frequency between 10% and 90% are encoded with a 'N'.
17
+ Small deletions with frequency <10% are encoded with the ancestral base from the reference genome.
18
+
19
+ Small insertions (alternative bases longer than reference base) are encoded with the ancestral state from the reference genome.
20
+
21
+ Sites to exclude specified in the bed files are encoded with a 'N'.
22
+ Variants with a QUAL score below 20 are encoded with a 'N'.
23
+ Sites that are not in the VCF and are covered by less than 5 reads (via depth file) are encoded with a 'N'.
24
+
25
+ '''
26
+
27
+ # Extract the basename to use in output file naming and consensus file filling
28
def get_basename(vcf_path):
    """Return the sample name encoded in a VCF path.

    The directory part is dropped and the file name is cut at the first
    case-insensitive occurrence of ``.vcf``
    (e.g. ``input_vcfs/Sample_01.recal.vcf.gz`` -> ``Sample_01.recal``).
    A file name without ``.vcf`` is returned unchanged.
    """
    file_name = os.path.basename(vcf_path)
    # Everything in front of the first ".vcf" is the sample name.
    stem, *_ = re.split(r'\.vcf', file_name, flags=re.IGNORECASE)
    return stem
36
+
37
def flatten(t):
    """Concatenate the items of every iterable in *t* into one flat list."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat
39
+
40
+
41
+ # Split a string of characters into a list of the individual characters (list comprehension) useful to examine each base of f.e. ACTG individually
42
def split(word):
    """Return the characters of *word* as a list, one item per character."""
    return list(word)
44
+
45
+
46
+ # Function to open either gzipped or unzipped vcf files
47
def open_vcf(path):
    """Open a plain or gzip-compressed VCF file for text reading.

    Compression is detected from the two-byte gzip magic number instead of
    the file suffix, so compressed files that do not end in ``.gz`` are
    still decompressed. This matches ``open_vcf`` in the SNP aligner module.

    Args:
        path: Path (str or ``Path``) of the VCF file.

    Returns:
        A text-mode file object; the caller is responsible for closing it
        (normally through a ``with`` statement).
    """
    path = Path(path)

    # Peek at the first two bytes; b'\x1f\x8b' marks a gzip stream.
    with open(path, "rb") as probe:
        is_gzip = probe.read(2) == b"\x1f\x8b"

    if is_gzip:
        return gzip.open(path, "rt")
    return open(path, "r")
55
+
56
+
57
+ # Function to get the length of the reference sequence in case of testing with a smaller reference genome
58
def get_reference_length(args):
    """Return the number of bases of the FASTA record at ``args.reference``.

    The file is parsed with ``SeqIO.read``, i.e. it is read as a single
    FASTA record.
    """
    reference_record = SeqIO.read(str(args.reference), "fasta")
    sequence = reference_record.seq
    return len(sequence)
61
+
62
+ # Parse the BED files to get a dictionary of positions to exclude
63
def get_pos_to_exclude(bed_files):
    """Collect every position covered by the intervals of the BED files.

    Blank lines, ``#`` comments and ``track`` / ``browser`` lines are
    ignored. A leading column-header line (first record whose start/end
    fields are not integers) is skipped as well; a first line that is a
    regular interval is kept instead of being dropped unconditionally.

    Args:
        bed_files: Iterable of paths to tab-separated BED files, or an empty
            value (``None`` / ``[]``) when nothing has to be masked.

    Returns:
        dict: Positions (int) in ascending order, each mapped to ``''``.
        Only the keys carry information.

    Raises:
        ValueError, IndexError: If a record other than the first one has
            missing or non-integer start/end columns.
    """
    pos_to_exclude = {}

    for bed_path in bed_files or ():
        with Path(bed_path).open() as bed_file:
            first_record = True

            for raw_line in bed_file:
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue
                if line.split(None, 1)[0] in ('track', 'browser'):
                    continue

                fields = line.split('\t')
                is_first, first_record = first_record, False
                try:
                    start_position = int(fields[1])
                    end_position = int(fields[2])
                except (IndexError, ValueError):
                    if is_first:
                        # Column-header line such as "chrom<TAB>start<TAB>end".
                        continue
                    raise

                # NOTE(review): positions start..end-1 are used as-is and are
                # later compared with 1-based genome positions, whereas BED
                # coordinates are 0-based half-open by specification. Confirm
                # which convention the mask files follow before shifting.
                pos_to_exclude.update(dict.fromkeys(range(start_position, end_position), ''))

    return dict(sorted(pos_to_exclude.items()))
87
+
88
+
89
+ # Create a dictionary where the keys are the genomic positions and values are the bases of the reference genome
90
def fasta2dict(args):
    """Map every 1-based genomic position to its reference base.

    The reference is the FASTA record read from ``args.reference`` with
    ``SeqIO.read``.
    """
    reference_record = SeqIO.read(str(args.reference), "fasta")
    bases = reference_record.seq

    # Key = 1-based position, value = base at that position.
    return {index + 1: bases[index] for index in range(len(bases))}
101
+
102
+ # Create a dictionary from the VCFs where POS is the key and REF, ALT, QUAL and AF(as list) are values
103
def vcf2dict(vcf_files):
    """Parse VCF files into ``{POS: [REF, ALT, QUAL, AF]}``.

    ``AF`` is a list with one frequency per ALT allele, computed from the
    per-sample read counts as ``AO / (sum(AO) + RO)``. RO and AO are located
    through the FORMAT column when it names them; otherwise the third and
    fifth sample fields are used, which is the layout the positional lookup
    originally assumed. The counts are taken from the last column of the row.

    Args:
        vcf_files: Iterable of paths to plain or gzip-compressed VCF files.

    Returns:
        dict: int position -> ``[ref, alt, qual, allele_frequencies]``.
        A later record at the same position overwrites an earlier one.

    Raises:
        ValueError, IndexError: On rows whose QUAL or read-count fields
            cannot be parsed.
    """
    vcf_dict = {}

    for vcf_path in vcf_files:
        with open_vcf(Path(vcf_path)) as vcf_file:
            for row in vcf_file:
                # Header lines and blank lines carry no variant.
                if row.startswith("#") or not row.strip():
                    continue

                fields = row.strip().split('\t')

                position = int(fields[1])
                ref = fields[3]
                alt = fields[4]
                qual = float(fields[5])

                sample_values = fields[-1].split(":")
                # With at least one sample column, column 9 (index 8) is FORMAT.
                format_keys = fields[8].split(":") if len(fields) > 9 else []
                if "RO" in format_keys and "AO" in format_keys:
                    ro_index = format_keys.index("RO")
                    ao_index = format_keys.index("AO")
                else:
                    ro_index, ao_index = 2, 4

                ref_count = int(sample_values[ro_index])
                alt_counts = [int(ao) for ao in sample_values[ao_index].split(",")]

                # sum(alt_counts) accommodates records with several ALT alleles.
                total_count = sum(alt_counts) + ref_count
                if total_count:
                    allele_freqs = [ao / total_count for ao in alt_counts]
                else:
                    # No supporting reads at all: report 0.0 instead of
                    # dividing by zero.
                    allele_freqs = [0.0] * len(alt_counts)

                vcf_dict[position] = [ref, alt, qual, allele_freqs]

    return vcf_dict
134
+
135
+ # Parse the depth file to get a dictionary with the depth of each genomic position for one sample (one depth column)
136
+ # {pos1: depth1, pos2: depth2, ...}
137
def get_sample_depth(depth_file_path, column_index):
    """Read the depth of one sample from a tab-separated multi-sample depth file.

    Columns are CHROM, POS and then one depth column per sample.
    ``column_index`` is the 0-based index of the sample, i.e. the position
    of its VCF in the list of input VCFs.

    Lines starting with ``#`` and blank lines are skipped. A first line
    whose POS field is not an integer is treated as an unprefixed column
    header; a first line that is a regular record is kept, so files written
    without a header do not lose their first position.

    Args:
        depth_file_path: Path of the depth file.
        column_index: 0-based sample index (0, 1, 2...).

    Returns:
        dict: int position -> int depth for the selected sample.

    Raises:
        ValueError, IndexError: On malformed records after the first line.
    """
    sample_depths = {}

    # Col 0 = CHROM, Col 1 = POS, Col 2 = first sample (index 0).
    actual_col = column_index + 2

    with open(depth_file_path, 'rt') as depth_file:
        for line_number, line in enumerate(depth_file):
            if line.startswith("#") or not line.strip():
                continue

            parts = line.strip().split("\t")
            try:
                position = int(parts[1])
            except (IndexError, ValueError):
                if line_number == 0:
                    # Header without a leading '#'.
                    continue
                raise

            sample_depths[position] = int(parts[actual_col])

    return sample_depths
159
+
160
+ # Define the ambiguity bases to return ambiguity base if SNP between AF 10% and 90%
161
def ambiguity_code(ref, alt):
    """Return the IUPAC ambiguity code for a pair of bases.

    The pair is unordered (``C``/``T`` and ``T``/``C`` both give ``Y``).
    Identical arguments are returned unchanged; any other combination,
    including lowercase or non-ACGT bases, yields an empty string.
    """
    if ref == alt:
        return ref

    iupac_by_pair = {
        frozenset(('C', 'T')): 'Y',
        frozenset(('A', 'G')): 'R',
        frozenset(('A', 'T')): 'W',
        frozenset(('G', 'C')): 'S',
        frozenset(('T', 'G')): 'K',
        frozenset(('C', 'A')): 'M',
    }
    return iupac_by_pair.get(frozenset((ref, alt)), '')
205
+
206
+ # Main function to loop over VCFs in input directory and create the respective consensus genomes
207
+
208
def _consensus_bases(ref_allele, alt_allele, freq, ancestral_base):
    """Translate one VCF record into consensus characters.

    Args:
        ref_allele: REF string of the record.
        alt_allele: The single ALT allele to evaluate.
        freq: Allele frequency of ``alt_allele`` (0.0 - 1.0).
        ancestral_base: Reference-genome base at the record position.

    Returns:
        tuple: ``(bases, step)`` - the list of characters to append to the
        consensus and the number of reference positions they cover.
    """
    ref_len = len(ref_allele)
    alt_len = len(alt_allele)

    # SNP: REF and ALT are both a single base.
    if ref_len == 1 and alt_len == 1:
        if freq >= 0.90:
            return [alt_allele], 1
        if freq >= 0.10:
            return [ambiguity_code(ref_allele, alt_allele)], 1
        return [ancestral_base], 1

    # Small deletion: ALT shorter than REF.
    if alt_len < ref_len:
        if alt_len == 1:
            if freq >= 0.90:
                # Keep the anchor base, dash out the deleted bases.
                return [alt_allele[0]] + ['-'] * (ref_len - 1), ref_len
            if freq >= 0.10:
                return ['N'] * ref_len, ref_len
            return [ancestral_base], 1
        # ALT longer than one base although shorter than REF: keep ancestral.
        return [ancestral_base], 1

    # Small insertion: always encoded with the ancestral base.
    if alt_len > ref_len:
        return [ancestral_base], 1

    # MNP: REF and ALT have the same length (> 1).
    if freq >= 0.90:
        return list(alt_allele), alt_len
    if freq >= 0.10:
        return [ambiguity_code(r, a) for r, a in zip(ref_allele, alt_allele)], alt_len
    return list(ref_allele), ref_len


def main(args):
    """Build one consensus FASTA per input VCF.

    For every VCF in ``args.vcf_files`` the reference genome is walked
    position by position:

    * positions listed in the BED files become ``N``;
    * positions without a VCF record are encoded from the sample's depth
      column: more than 5 reads -> ancestral base, 1 to 5 reads -> ``N``,
      0 reads -> ``-``. A position missing from the depth file is treated
      as depth 0, which is how ``samtools depth`` without ``-a`` reports
      uncovered sites;
    * VCF records with QUAL < 20 become ``N``; all others are encoded by
      ``_consensus_bases`` using the ALT allele with the highest frequency.

    The FASTA ``<output_dir>/<sample>.consensus.fasta`` is only written when
    the consensus has exactly the expected length: the reference length in
    test mode, otherwise ``args.calculated_ref_len`` (default 4411532).

    Args:
        args: Parsed command-line namespace; reads ``reference``,
            ``bed_files``, ``vcf_files``, ``depth``, ``test_mode``,
            ``output_dir`` and optionally ``calculated_ref_len``.
    """
    REF = fasta2dict(args)
    EXCLUDED_POS = get_pos_to_exclude(args.bed_files)
    # NOTE(review): this prints every masked position; kept for output
    # compatibility, consider replacing it with a count.
    print(EXCLUDED_POS)

    vcf_files = args.vcf_files
    genome_length = len(REF)

    # The sample index selects the matching column of the depth file.
    for sample_idx, vcf_file in enumerate(vcf_files):

        VCF = vcf2dict([vcf_file])
        VCF_DEPTH = get_sample_depth(args.depth, sample_idx)

        fasta_sequence = []
        i = 1

        while i <= genome_length:

            if i in EXCLUDED_POS:  # Masked by a BED file
                fasta_sequence.append('N')
                i += 1
                continue

            if i not in VCF:  # No variant -> ancestral, low coverage or deletion
                # Absent from the depth file counts as zero coverage; without
                # this default the loop would never advance past the position.
                depth = VCF_DEPTH.get(i, 0)
                if depth > 5:
                    fasta_sequence.append(REF[i])
                elif depth >= 1:
                    fasta_sequence.append('N')
                else:
                    fasta_sequence.append('-')
                i += 1
                continue

            reference_base, alternative_base, quality, allele_freq = VCF[i]

            if quality >= 20:  # Variants with a 99% confidence (phred-score >= 20)

                if len(allele_freq) == 1:  # Only one ALT allele
                    chosen_alt = alternative_base
                    chosen_freq = allele_freq[0]
                else:
                    # Several ALT alleles: use the most frequent one.
                    chosen_freq = max(allele_freq)
                    chosen_alt = alternative_base.split(',')[allele_freq.index(chosen_freq)]
                    print(i, VCF[i], chosen_freq, split(chosen_alt), len(chosen_alt), reference_base, REF[i])

                bases, step = _consensus_bases(reference_base, chosen_alt, chosen_freq, REF[i])
                fasta_sequence += bases
                # NOTE(review): multi-base records advance past every covered
                # position, including masked ones and ones beyond the genome end.
                i += step

            else:  # QUAL < 20
                fasta_sequence.append('N')
                i += 1

        print("Lenght of sequence:", sum(len(base) for base in fasta_sequence))

        # Report empty strings (e.g. base pairs without an ambiguity code).
        for index, base in enumerate(fasta_sequence):
            if len(base) == 0:
                print(index, base)

        # In test mode the length is re-read from the (possibly smaller)
        # reference; otherwise the length relayed by the caller is used,
        # falling back to the ancestral MTBC reference length.
        if args.test_mode:
            print("Running in test mode - smaller genomes accepted")
            target_len = get_reference_length(args)
        else:
            target_len = getattr(args, 'calculated_ref_len', 4411532)

        if len(fasta_sequence) < target_len:  # Consensus too short
            print("Fasta file not created")
            print(f"Fasta has less than {target_len} bp: {len(fasta_sequence)}")

        elif len(fasta_sequence) == target_len:  # Consensus has correct length
            sample_name = get_basename(vcf_file)
            output_filename = f"{sample_name}.consensus.fasta"
            output_path = os.path.join(args.output_dir, output_filename)
            with open(output_path, 'w') as output_file:
                output_file.write(f">{sample_name}\n")
                output_file.write("".join(fasta_sequence))

        else:  # Consensus too long
            print("Fasta file not created")
            print(f"Fasta more than {target_len} bp: {len(fasta_sequence)}")
466
+
467
+
468
+
469
+
470
+
471
+
472
+
473
+
474
+
475
+
476
+
477
+
478
+
479
+
480
+
481
+
482
+
483
+
484
+
485
+
486
+
487
+
488
+
489
+
490
+
491
+
@@ -0,0 +1,161 @@
1
+ import os
2
+ import glob
3
+ import sys
4
+ import argparse
5
+ from collections import defaultdict
6
+ from pathlib import Path
7
+ from Bio import SeqIO
8
+ import gzip
9
+ from . import consensus_galaxy
10
+ from . import snp_aligner_galaxy
11
+
12
+ '''
13
+
14
+ This script is the main script for TBConsensusAligner.
15
+ It is built on top of consensus.py and snp_aligner.py.
16
+
17
+ The user can choose the following with the -s option.
18
+ -s all: Input = VCFs, output = SNP alignment -> runs consensus.py and snp_aligner.py subsequently.
19
+ -s consensus: Input = VCFs, output = consensus fasta files -> runs just consensus.py
20
+ When -s all is chosen, -m option gives choice whether to just output the SNP alignment
21
+ with -m alignment_only or both the SNP alignment and the consensus fasta files with -m everything
22
+
23
+ '''
24
+
25
+ # Define arguments used in the script
26
def get_args():
    """Parse the command line and return the resulting namespace.

    ``dest`` gives every single-letter option a descriptive attribute name
    (``-s`` -> ``step``, ``-v`` -> ``vcf_files``, ...). ``-v`` and ``-b``
    may be repeated and collect their values in lists.
    """
    parser = argparse.ArgumentParser(description='Main script for consensus.py and snp_aligner.py')
    add = parser.add_argument

    # Workflow selection
    add('-s', choices=['consensus', 'all'], dest='step',
        help='Run either consensus, or both consensus and snp_aligner', required=True)
    add('-m', choices=['alignment_only', 'everything'], dest='mode',
        help='If -s all then output either just SNP alignment or both the alignment and the consensus fasta files',
        required=False)

    # Inputs
    add('-v', action='append', dest='vcf_files',
        help='path to the input directory with all the vcf files', required=True)
    add('-r', dest='reference', help='path to reference genome file', required=True)
    add('-d', dest='depth', help='Depth file per position, output of samtools depth', required=True)
    add('-c', dest='outgroup_vcf', help='Outgroup VCF required for variable alignment', required=False)
    add('-n', dest='outgroup_name', type=str, help='Clean name for outgroup header in Galaxy')
    add('-b', dest='bed_files', action='append', default=[],
        help='Optional BED files to mask certain genomic regions', required=False)

    # Outputs and tuning
    add('-o', dest='output_dir',
        help='Output directory for consensus files and the variable alignment', required=False)
    add('-g', dest='undefined_states',
        help='Percentage of undefined states allowed per polymorphic position in the alignment',
        type=float, default=0.9, required=False)
    add('-t', dest='test_mode', help='allows to use smaller files with less genomic positions',
        action='store_true')

    return parser.parse_args()
48
+
49
+
50
+ # Check if all arguments for the consensus script are there
51
def check_consensus_args(args):
    """Ensure the arguments needed by the consensus step are present.

    Raises:
        ValueError: Listing every attribute among ``vcf_files``,
            ``reference`` and ``depth`` that is absent or ``None``.
    """
    missing_consensus_args = []
    for field in ('vcf_files', 'reference', 'depth'):
        if getattr(args, field, None) is None:
            missing_consensus_args.append(field)

    if not missing_consensus_args:
        return

    raise ValueError(f"Missing required arguments: {', '.join(missing_consensus_args)}")
61
+
62
+ # Check if all arguments for the snp_aligner script are there
63
def check_snp_aligner_args(args):
    """Ensure the arguments needed by the SNP aligner step are present.

    Raises:
        ValueError: Listing every attribute among ``outgroup_vcf`` and
            ``undefined_states`` that is absent or ``None``.
    """
    missing_snp_aligner_args = []
    for field in ('outgroup_vcf', 'undefined_states'):
        if getattr(args, field, None) is None:
            missing_snp_aligner_args.append(field)

    if not missing_snp_aligner_args:
        return

    raise ValueError(f"Missing required arguments: {', '.join(missing_snp_aligner_args)}")
73
+
74
+ # Function to prevent crashing by checking for 1) reference genome length and 2) amounts of variants in VCF
75
def safety_check(args, max_total_variants=200000, max_ref_length=16100000):
    """Abort early on inputs that are implausibly large.

    The run is terminated through ``sys.exit`` when the reference cannot be
    parsed or is longer than ``max_ref_length``, or when a VCF cannot be
    read or holds more than ``max_total_variants`` non-header lines.

    Args:
        args: Namespace providing ``reference`` and ``vcf_files``.
        max_total_variants: Allowed number of variant lines per VCF.
        max_ref_length: Allowed reference length in bp.

    Returns:
        int: Length of the reference genome in bp.
    """
    print("Executing safety check")

    # Reference genome: must parse and stay below the length ceiling.
    try:
        genome_len = len(SeqIO.read(str(args.reference), "fasta").seq)
    except Exception as e:
        sys.exit(f"ERROR: Failed to parse the reference genome: {e}")

    if genome_len > max_ref_length:
        sys.exit(f"ERROR: Reference genome length ({genome_len} bp) exceeds the maximum allowed bacterial ceiling of {max_ref_length} bp.")

    # Every VCF: count the non-header lines.
    for vcf_path in args.vcf_files:
        try:
            variant_count = 0
            with consensus_galaxy.open_vcf(vcf_path) as file:
                for line in file:
                    if not line.startswith('#'):
                        variant_count += 1
        except Exception as e:
            sys.exit(f"ERROR: Failed reading VCF file {vcf_path}: {e}")

        if variant_count > max_total_variants:
            sys.exit(
                f"ERROR: Variant overload. File '{os.path.basename(vcf_path)}' "
                f"contains {variant_count} variant lines.\n"
                f"This exceeds the safety limit of {max_total_variants} variants per sample. "
                f"Filter or remove this VCF running the pipeline."
            )

    print(f"Safety check passed, Reference size: {genome_len} bp. Variants of all VCFs within safety margin.")
    return genome_len
112
+
113
+
114
+ # Main logic of the program
115
def main():
    """Run the workflow selected with ``-s``.

    * ``-s consensus``: build the consensus FASTA files only.
    * ``-s all``: build the consensus FASTA files, then the SNP alignment.
      With ``-m alignment_only`` the per-sample consensus files are removed
      afterwards while the alignment itself is kept.

    The output directory defaults to ``output`` and is created on demand.
    """
    args = get_args()

    if args.output_dir is None:
        args.output_dir = "output"

    os.makedirs(args.output_dir, exist_ok=True)

    # Execute safety check and relay the reference length to consensus_galaxy.
    ref_genome_length = safety_check(args)
    args.calculated_ref_len = ref_genome_length

    if args.step == 'consensus':

        check_consensus_args(args)
        consensus_galaxy.main(args)

    elif args.step == 'all':

        check_consensus_args(args)
        check_snp_aligner_args(args)

        consensus_galaxy.main(args)
        snp_aligner_galaxy.main(args)

        if args.mode == 'alignment_only':
            # Remove only the per-sample consensus files. A plain "*.fasta"
            # pattern would also delete snp_alignment.fasta, the one file
            # this mode is supposed to keep.
            pattern = os.path.join(glob.escape(args.output_dir), "*.consensus.fasta")
            for file in glob.glob(pattern):
                os.remove(file)


if __name__ == '__main__':
    main()
@@ -0,0 +1,264 @@
1
+ import argparse
2
+ import os
3
+ from collections import defaultdict
4
+ from pathlib import Path
5
+ from Bio import SeqIO
6
+ from pprint import pprint
7
+ import gzip
8
+ '''
9
+
10
+ This script creates a SNP alignment in multi fasta format from a collection of consensus genomes
11
+ one of which should be the outgroup if intended to be used later for phylogenetic analyses.
12
+
13
+ The user can choose, how many undefined states ('-' or 'N') for a polymorphic position are accepted with
14
+ the -g argument. By default, if a position has more than 90% undefined states, it will be excluded (-g 0.9).
15
+ For a polymorphic site with Ns or - or both to be included in the final SNP alignment, the site has to be poly-
16
+ morphic in terms of at least 2 different bases, plus Ns or -s or both.
17
+ A site with just one base and Ns and or -s is not considered polymorphic.
18
+
19
+ '''
20
+
21
+
22
def flatten(t):
    """Concatenate the items of every iterable in *t* into one flat list."""
    merged = []
    for sublist in t:
        merged.extend(sublist)
    return merged
24
+
25
+
26
+ # Function to open both zipped and unzipped vcf files
27
def open_vcf(path):
    """Open a plain or gzip-compressed VCF file in text mode.

    The first two bytes of the file are compared with the gzip magic number
    to decide which opener to use; the file suffix is not consulted.
    """
    vcf_path = Path(path)

    with open(vcf_path, 'rb') as probe:
        magic = probe.read(2)

    if magic != b'\x1f\x8b':
        return open(vcf_path, "r")
    return gzip.open(vcf_path, "rt")
37
+
38
+ # Create a dictionary of fasta sequences where the sample name is the key and the value the string of bases
39
def fastas2dict(args):
    """Load the consensus FASTA files of ``args.output_dir``.

    Every ``*.fasta`` file in the directory is read in sorted order, except
    ``snp_alignment.fasta``: that is the file the aligner itself writes into
    the same directory, and a copy left by an earlier run must not be
    loaded as a sample.

    Args:
        args: Namespace providing ``output_dir``.

    Returns:
        dict: header (without ``>``) -> sequence string. Files without a
        header line are ignored.
    """
    sequences_dict = {}
    consensus_dir = Path(args.output_dir)

    for fasta in sorted(consensus_dir.glob("*.fasta")):
        if fasta.name == 'snp_alignment.fasta':
            continue

        with fasta.open() as fh:
            header = None
            seq_parts = []
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                if line.startswith(">"):
                    header = line[1:]
                else:
                    seq_parts.append(line)

            if header:
                # Join only once at the very end for this sample.
                sequences_dict[header] = "".join(seq_parts)
                print(f"Loaded {header}: {len(sequences_dict[header])} bases")

    return sequences_dict
63
+
64
+
65
+
66
+
67
+ # Check if all sequences (values, string of characters) in a dictionary are the same length, return True if so
68
def check_length(sequences_dict):
    """Return True when all sequences (dict values) share one length.

    An empty dictionary also yields True.
    """
    distinct_lengths = {len(sequence) for sequence in sequences_dict.values()}
    return len(distinct_lengths) <= 1
73
+
74
+
75
+ # Create a 2D array from dictionary of sequences with sequence names at pos 0 and first genomic position at python pos 1
76
def dict2array(sequences_dict):
    """Turn ``{name: sequence}`` into rows ``[name, base1, base2, ...]``.

    Returns:
        list: One row per sequence, in dictionary order, or ``None`` (after
        printing a message) when the sequences differ in length.
    """
    observed_lengths = set(len(seq) for seq in sequences_dict.values())

    print("DEBUG dict2array:")
    print(" Number of sequences:", len(sequences_dict))
    print(" Unique sequence lengths:", observed_lengths)

    # More than one distinct length means the sequences are not aligned.
    if len(observed_lengths) > 1:
        print("Not all sequences are of equal length")
        return None

    rows = []
    for name, seq in sequences_dict.items():
        rows.append([name, *seq])
    return rows
94
+
95
+ # Get positions from outgroup VCF corresponding to genomic positions of polymorphic sites found in alignment
96
def get_outgroup_vcf_var_pos(args, var_positions):
    """Build the outgroup row of the SNP alignment from the outgroup VCF.

    For every distinct entry of ``var_positions`` (first-seen order) one
    character is emitted:

    * REF when the outgroup record has ALT ``.``;
    * ``N`` when REF or ALT is longer than one character;
    * ALT otherwise;
    * ``-`` when the position has no record in the outgroup VCF.

    Args:
        args: Namespace providing ``outgroup_vcf`` and ``outgroup_name``.
        var_positions: Genomic positions of the alignment columns.

    Returns:
        list: ``[outgroup_name, char1, char2, ...]``.
    """
    # Header: explicit name if given, else the VCF file stem; ".vcf" and
    # ".gz" fragments are removed in either case.
    if args.outgroup_name:
        outgroup_name = args.outgroup_name
    else:
        outgroup_name = Path(args.outgroup_vcf).stem
    outgroup_name = outgroup_name.replace(".vcf", "").replace(".gz", "")

    # POS (as string) -> [REF, ALT] for every record of the outgroup VCF.
    outgroup_vcf_dict = {}
    with open_vcf(args.outgroup_vcf) as outgroup_vcf:
        for var in outgroup_vcf:
            if var.startswith('#') or not var.strip():
                continue

            fields = var.strip().split('\t')
            if len(fields) >= 5:
                outgroup_vcf_dict[fields[1].strip()] = [fields[3].strip(), fields[4].strip()]

    # De-duplicate while keeping order; keys of the dict are strings.
    unique_var_positions = [str(p) for p in dict.fromkeys(var_positions)]

    print(f"DEBUG: Input positions count: {len(unique_var_positions)}")
    print(f"DEBUG: Outgroup VCF dict entries: {len(outgroup_vcf_dict)}")
    if unique_var_positions:
        print(f"DEBUG: First search key: '{unique_var_positions[0]}'")

    outgroup_sequence = []
    for key in unique_var_positions:
        if key not in outgroup_vcf_dict:
            print(f"MATCH FAILED: Position {key} not found in Outgroup VCF keys.")
            outgroup_sequence.append("-")
            continue

        ref_val, alt_val = outgroup_vcf_dict[key]
        if alt_val == '.':  # No ALT allele -> use REF
            base = ref_val
        elif len(alt_val) > 1 or len(ref_val) > 1:  # Indel
            base = 'N'
        else:  # SNP
            base = alt_val
        outgroup_sequence.append(base)
        print(f"MATCH FOUND: Position {key} is in Outgroup VCF. Base: {outgroup_vcf_dict[key]}")

    result = [outgroup_name] + outgroup_sequence

    print(f"DEBUG: Finalll outgroup row length: {len(result)}")
    return result
160
+
161
+
162
+ # Main script to extract polymorphic positions from the alignment
163
def main(args):
    """Extract the polymorphic columns of the consensus genomes.

    A column is kept when it shows at least two different states other than
    ``N`` / ``-`` and, if it contains ``N`` or ``-`` at all, when their
    share does not exceed ``args.undefined_states``. ``X`` is treated as
    ``N``. When ``args.outgroup_vcf`` is set, an outgroup row derived from
    that VCF is appended. The result is written to
    ``<output_dir>/snp_alignment.fasta``.

    Args:
        args: Namespace providing ``undefined_states``, ``output_dir``,
            ``outgroup_vcf`` and ``outgroup_name``.

    Raises:
        ValueError: If no consensus sequence is found or the sequences
            differ in length.
    """
    UNDEF_STAT = args.undefined_states
    SEQ_DICT = fastas2dict(args)

    seq_names = list(SEQ_DICT.keys())
    num_seqs = len(seq_names)
    if num_seqs == 0:
        raise ValueError(
            f"No consensus FASTA files found in '{args.output_dir}'; cannot build a SNP alignment"
        )

    # Normalise 'X' to 'N' once per sequence instead of once per base.
    sequences = [SEQ_DICT[name].replace('X', 'N') for name in seq_names]

    if len({len(seq) for seq in sequences}) > 1:
        # Columns can only be compared between sequences of equal length.
        raise ValueError("Not all sequences are of equal length")
    genome_length = len(sequences[0])

    # Each row starts with the sequence name; bases are appended per column.
    pol_pos = [[name] for name in seq_names]
    polymorphic_indices = []

    print(f"Analyzing {num_seqs} sequences across {genome_length} bp...")

    for col_index, column in enumerate(zip(*sequences)):
        unique_states = set(column)
        if len(unique_states) <= 1:
            continue

        count_undef = column.count('N') + column.count('-')
        if count_undef == 0:
            should_add = True
        else:
            # With missing data, at least two real states must remain.
            should_add = (
                (count_undef / num_seqs) <= UNDEF_STAT
                and len(unique_states - {'N', '-'}) >= 2
            )

        if should_add:
            for row, base in zip(pol_pos, column):
                row.append(base)
            # Store the 1-based genomic index
            polymorphic_indices.append(col_index + 1)

    print("\npol_pos preview (first 5 sequences, first 10 entries):")
    for row in pol_pos[:5]:
        print(row[:10])

    # Append the outgroup sequence if provided via VCF
    if args.outgroup_vcf:
        unique_indices = sorted(set(polymorphic_indices))

        outgroup_line = get_outgroup_vcf_var_pos(args, var_positions=unique_indices)

        # Row length must match (Name + Bases); a mismatch is only reported.
        if len(outgroup_line) != len(pol_pos[0]):
            print(f"CRITICAL ERROR: Outgroup row length ({len(outgroup_line)}) "
                  f"mismatch with Sample row length ({len(pol_pos[0])})")

        pol_pos.append(outgroup_line)

    print("Number of polymorphic sites:", len(pol_pos[0]) - 1)
    print("Number of polymorphic_indices:", len(polymorphic_indices))

    # Write the output file
    output_alignment_path = os.path.join(args.output_dir, 'snp_alignment.fasta')
    with open(output_alignment_path, 'w') as output_file:
        for row in pol_pos:
            sequence_name = row[0]
            # str() guards against a non-string entry in the row.
            sequence = "".join(str(base) for base in row[1:])

            # Standard FASTA format: >Header\nSequence\n
            output_file.write(f">{sequence_name}\n")
            output_file.write(f"{sequence}\n")

    print(f"Successfully wrote {len(pol_pos)} sequences to {output_alignment_path}")
243
+
244
+
245
+
246
+
247
+
248
+
249
+
250
+
251
+
252
+
253
+
254
+
255
+
256
+
257
+
258
+
259
+
260
+
261
+
262
+
263
+
264
+
@@ -0,0 +1,151 @@
1
+ Metadata-Version: 2.4
2
+ Name: tb_consensus_aligner
3
+ Version: 1.0.0
4
+ Summary: Building consensus fasta files and variable multi-sequence alignments for mycobacterial genomes.
5
+ Author-email: scg40 <gian.schuepbach@swisstph.ch>
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: Operating System :: OS Independent
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
10
+ Requires-Python: >=3.8
11
+ Description-Content-Type: text/markdown
12
+ Requires-Dist: biopython>=1.80
13
+
14
+ # TBConsensusAligner
15
+ TBConsensusAligner is a tool designed to create a multi-sequence alignment via consensus
16
+ genomes created from VCF files.
17
+ It can currently be used in the following modes:
18
+ - VCF to SNP alignment `-s all`
19
+ - input = VCFs, output = SNP alignment
20
+ - runs `consensus_galaxy.py` and `snp_aligner_galaxy.py`
21
+
22
+ - VCF to consensus FASTA files `-s consensus`
23
+ - input = VCFs, output = consensus FASTA files
24
+ - runs only `consensus_galaxy.py`
25
+
26
+ If the script produces a SNP alignment from VCFs, the user can choose whether to output just the SNP alignment
27
+ or the used consensus FASTA files as well with the `-m` option.
28
+
29
+ When running `consensus_galaxy.py`, the user has to provide the VCF files, the reference genome used in their creation and the multisample
30
+ depth file from `samtools depth`. Optionally, the user can provide one or several BED files to mask certain regions of the genome.
31
+
32
+ ## Usage
33
+ The script is run via the command line.
34
+
35
+ ***usage***:
36
+
37
+ ```
38
+ Usage: TBConsensusAligner [options]
39
+
40
+ Options: -s STR ['consensus' or 'all'] Create either consensus files or consensus files and a variable alignment
41
+
42
+ -m STR ['everything' or 'alignment_only'] If -s all is chosen, output either consensus files and the alignment or just the alignment
43
+
44
+ -v list[STR] Path to the input VCF files
45
+
46
+ -r STR Path to the reference genome
47
+
48
+ -d STR Path to the depth file created by samtools depth
49
+
50
+ -c STR Path to the outgroup VCF for the alignment
51
+
52
+ -b list[STR] Path to the BED files to mask certain genomic regions
53
+
54
+ -o STR Path to the output directory
55
+
56
+ -g FLOAT Maximum proportion of gaps or undefined states ('-' or 'N') allowed per polymorphic position in the alignment (default: 0.9, i.e. 90 percent)
57
+
58
+ -t BOOLEAN Test Mode to run the script with smaller genomes
59
+
60
+ ```
61
+
62
+ ## Example usages command line
63
+
64
+ ### From 3 VCFs to alignment, output consensus FASTAs and SNP alignment, masked by two bedfiles
65
+
66
+ ```
67
+ python yourfolder/main_galaxy.py \
68
+ -s all \
69
+ -m everything \
70
+ -v path/to/vcf1/vcf1.vcf -v path/to/vcf2/vcf2.vcf -v path/to/vcf3/vcf3.vcf \
71
+ -r path/to/reference/reference_genome.reference.fasta \
72
+ -d path/to/depthfile/depthfile.tabular \
73
+ -c path/to/outgroupvcf/outgroup.vcf \
74
+ -b path/to/bedfile1/bed1.bed -b path/to/bedfile2/bed2.bed \
75
+ -o output \
76
+ -g 0.9
77
+
78
+ ```
79
+
80
+ ## Logic consensus_galaxy.py
81
+
82
+ This script produces consensus FASTA files from VCFs. The user provides the VCFs, the reference genome,
83
+ the multisample depth file obtained from `samtools depth` and optionally BED files to mask certain genomic regions.
84
+
85
+ ***algorithm***
86
+
87
+ We loop through each position in the reference genome and build the consensus sequence base per base with the following rules:
88
+
89
+ ```
90
+ - SNPs and MNPs with a frequency >90% are encoded with the alternative base from the VCF.
91
+ - SNPs and MNPs with a frequency between 10% and 90% are encoded with the ambiguity base.
92
+ - SNPs and MNPs with a frequency <10% are encoded with the ancestral base from the reference genome.
93
+
94
+ - Large deletions (coverage 0) are encoded by dashes (-).
95
+ - Small deletions with frequency >90% are encoded by dashes (-).
96
+ - Small deletions with frequency between 10% and 90% are encoded with a 'N'.
97
+ - Small deletions with frequency <10% are encoded with the ancestral base from the reference genome.
98
+
99
+ - Small insertions (alternative bases longer than reference base) are encoded with the ancestral state from the reference genome.
100
+
101
+ - Sites to exclude specified in the bed files are encoded with a 'N'.
102
+ - Variants that have a phred score < 20 are encoded with a 'N'.
103
+ - Sites that are not in the VCF and are covered by less than 5 reads (via depth file) are encoded with a 'N'.
104
+
105
+ - If more than one ALT allele is present, we consider the one with the highest allele frequency.
106
+ ```
107
+ Since the VCFs this script is tailored to do not have the allele frequency `AF` calculated, we calculate it using
108
+ `RO = REF allele occurrence` and `AO = ALT allele occurrence` with
109
+ `AF = AO / ( sum(all AO's) + RO )`.
110
+
111
+ ## Logic snp_aligner_galaxy.py
112
+
113
+ This script produces a SNP alignment i.e. multi-sequence alignment of polymorphic positions in FASTA format from consensus FASTA files.
114
+ The consensus FASTAs are generated in the previous step by `consensus_galaxy.py` wrapped in `TBConsensusAligner`.
115
+
116
+ ***algorithm***
117
+
118
+ The script creates a multi-sequence alignment from the consensus FASTAs in the form of a 2D array where each row represents a sequence
119
+ and each column a genomic position. Each column is checked for the presence of a polymorphism. If such a polymorphism is detected,
120
+ the column is appended to the alignment of polymorphic positions. For each polymorphic position, the corresponding nucleotide from
121
+ the outgroup VCF is retrieved to get the outgroup's sequence.
122
+
123
+ The algorithm to populate the multi-sequence alignment of polymorphic positions is as follows.
124
+ The user can control the proportion of gaps or undefined states (`-` or `N`) for a polymorphic position
125
+ to be kept in the alignment (argument `-g`). By default, this is set to `0.9`, meaning that if a polymorphic position has
126
+ more than 90% gaps or undefined states, it will not be in the final alignment.
127
+
128
+ ## Directory structure used for development
129
+
130
+ * TBConsensusAligner
131
+ * test_data
132
+ - snp_alignment.fasta
133
+ - test_G77777.consensus.fasta
134
+ - test_G77777.vcf.gz
135
+ - test_G88888_k1.consensus.fasta
136
+ - test_G88888_k1.vcf.gz
137
+ - test_G99999.k2.consensus.fasta
138
+ - test_G99999.k2.vcf.gz
139
+ - test_Galaxy_multiple_depths_header.tabular
140
+ - test_reference_200bp.reference.fasta
141
+ - test_regions_blindspots_modlin_farhat_and_PE_PPE_PGRS.bed
142
+ - test.outgroup.all.pos.vcf.gz
143
+ - consensus_galaxy.py
144
+ - main_galaxy.py
145
+ - README.md
146
+ - snp_aligner_galaxy.py
147
+ - TBConsensusAligner.xml
148
+
149
+
150
+
151
+
@@ -0,0 +1,9 @@
1
+ tb_consensus_aligner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ tb_consensus_aligner/consensus_galaxy.py,sha256=NnwO3_rT43NquvkOFQR0si_ivbAP0JRfOn4o1ezkdJw,20165
3
+ tb_consensus_aligner/main_galaxy.py,sha256=8m_73RdxwDM3W_sjtNtKyvxhEmMYWEXFp2BJoCd1kR4,7376
4
+ tb_consensus_aligner/snp_aligner_galaxy.py,sha256=tv8J2LmBsHzr9K-T5fLhd82B7q7CYDJEuE3UDK38UkY,9218
5
+ tb_consensus_aligner-1.0.0.dist-info/METADATA,sha256=2WWLuwLTnqcbzQXzKW9EhSREDX8CAVGMliPVb2z_ALk,6415
6
+ tb_consensus_aligner-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
7
+ tb_consensus_aligner-1.0.0.dist-info/entry_points.txt,sha256=cBK81ccXI6DIOvhvxfZNr4v2j21Rdq_7HEJT_1RGKUw,79
8
+ tb_consensus_aligner-1.0.0.dist-info/top_level.txt,sha256=Q8H2spBV19grmt3Mw6TTiZ_D-1CvaK0LeqkqTT84gbA,21
9
+ tb_consensus_aligner-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ tb-consensus-aligner = tb_consensus_aligner.main_galaxy:main
@@ -0,0 +1 @@
1
+ tb_consensus_aligner