tb-consensus-aligner 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tb_consensus_aligner/__init__.py +0 -0
- tb_consensus_aligner/consensus_galaxy.py +491 -0
- tb_consensus_aligner/main_galaxy.py +161 -0
- tb_consensus_aligner/snp_aligner_galaxy.py +264 -0
- tb_consensus_aligner-1.0.0.dist-info/METADATA +151 -0
- tb_consensus_aligner-1.0.0.dist-info/RECORD +9 -0
- tb_consensus_aligner-1.0.0.dist-info/WHEEL +5 -0
- tb_consensus_aligner-1.0.0.dist-info/entry_points.txt +2 -0
- tb_consensus_aligner-1.0.0.dist-info/top_level.txt +1 -0
|
File without changes
|
|
@@ -0,0 +1,491 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import re
|
|
3
|
+
import os
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from Bio import SeqIO
|
|
7
|
+
import gzip
|
|
8
|
+
'''
|
|
9
|
+
This script creates consensus fastas from a collection of VCF files and the MTBC ancestor reference genome.
|
|
10
|
+
SNPs with a frequency >90% are encoded with the alternative base from the VCF.
|
|
11
|
+
SNPs with a frequency between 10% and 90% are encoded with the ambiguity base.
|
|
12
|
+
SNPs with a frequency <10% are encoded with the ancestral base from the reference genome.
|
|
13
|
+
|
|
14
|
+
Large deletions (coverage 0) are encoded by dashes (-).
|
|
15
|
+
Small deletions with frequency >90% are encoded by dashes (-).
|
|
16
|
+
Small deletions with frequency between 10% and 90% are encoded with a 'N'.
|
|
17
|
+
Small deletions with frequency <10% are encoded with the ancestral base from the reference genome.
|
|
18
|
+
|
|
19
|
+
Small insertions (alternative bases longer than reference base) are encoded with the ancestral state from the reference genome.
|
|
20
|
+
|
|
21
|
+
Sites to exclude specified in the bed files are encoded with a 'N'.
|
|
22
|
+
Variants with a QUAL score below 20 are encoded with a 'N'.
|
|
23
|
+
Sites that are not in the VCF and are covered by 1 to 5 reads (via depth file) are encoded with a 'N'.
|
|
24
|
+
|
|
25
|
+
'''
|
|
26
|
+
|
|
27
|
+
# Extract the basename to use in output file naming and consensus file filling
|
|
28
|
+
def get_basename(vcf_path):
    """Return the sample name derived from a VCF path.

    The directory part is dropped and the file name is cut at the first
    occurrence of '.vcf' (case-insensitive), e.g.
    'input_vcfs/Sample_01.recal.vcf.gz' -> 'Sample_01.recal'.
    A file name without '.vcf' is returned unchanged.
    """
    file_name = os.path.basename(vcf_path)

    # Locate the first '.vcf' and keep only what precedes it.
    hit = re.search(r'\.vcf', file_name, flags=re.IGNORECASE)
    if hit is None:
        return file_name
    return file_name[:hit.start()]
|
|
36
|
+
|
|
37
|
+
def flatten(t):
    """Concatenate the items of every sub-iterable in *t* into one flat list."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Split a string of characters into a list of the individual characters (list comprehension) useful to examine each base of f.e. ACTG individually
|
|
42
|
+
def split(word):
    """Return the individual characters of *word* as a list ('ACTG' -> ['A', 'C', 'T', 'G'])."""
    return list(word)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Function to open either gzipped or unzipped vcf files
|
|
47
|
+
def open_vcf(path):
    """Open a plain-text or gzip-compressed VCF for reading in text mode.

    Compression is detected from the two-byte gzip magic number rather than
    from the file extension, so compressed files whose name does not end in
    '.gz' are still decompressed.

    Args:
        path: Path (str or Path) of the VCF file.

    Returns:
        A text-mode file object; the caller is responsible for closing it
        (typically via ``with``).
    """
    path = Path(path)

    # Peek at the first two bytes; gzip streams always start with 1f 8b.
    with open(path, 'rb') as probe:
        is_gzip = probe.read(2) == b'\x1f\x8b'

    if is_gzip:
        return gzip.open(path, "rt")
    return open(path, "r")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Function to get the length of the reference sequence in case of testing with a smaller reference genome
|
|
58
|
+
def get_reference_length(args):
    """Return the number of bases in the single-record FASTA at ``args.reference``."""
    return len(SeqIO.read(str(args.reference), "fasta").seq)
|
|
61
|
+
|
|
62
|
+
# Parse the BED files to get a dictionary of positions to exclude
|
|
63
|
+
def get_pos_to_exclude(bed_files):
    """Collect every position covered by the intervals of the given BED files.

    Columns 2 and 3 of each line are read as integer start and end; every
    position in ``range(start, end)`` (start inclusive, end exclusive, used
    exactly as written in the file) is recorded.

    The first line of a file is treated as a header only when its
    coordinates are not integers, so header-less BED files no longer lose
    their first interval. Blank lines, '#' comments and lines with fewer
    than three tab-separated fields are skipped.

    Args:
        bed_files: Iterable of BED file paths; may be empty or None.

    Returns:
        dict mapping each excluded position to '' with keys in ascending
        order.

    Raises:
        ValueError: if a line other than the first has non-integer
            coordinates.
    """
    pos_to_exclude = {}

    for bed_path in bed_files or []:
        with Path(bed_path).open() as bed:
            for line_number, line in enumerate(bed):
                fields = line.strip().split('\t')
                if line.startswith('#') or len(fields) < 3:
                    continue

                try:
                    start, end = int(fields[1]), int(fields[2])
                except ValueError:
                    if line_number == 0:
                        continue  # column-name header row
                    raise

                for pos in range(start, end):
                    pos_to_exclude[pos] = ''

    return dict(sorted(pos_to_exclude.items()))
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# Create a dictionary where the keys are the genomic positions and values are the bases of the reference genome
|
|
90
|
+
def fasta2dict(args):
    """Map each 1-based genomic position to its base in the reference FASTA.

    The file at ``args.reference`` is read with ``SeqIO.read`` as a single
    FASTA record.
    """
    reference_seq = SeqIO.read(str(args.reference), "fasta").seq

    # Keys start at 1 so they line up with VCF / depth-file coordinates.
    return {index + 1: reference_seq[index] for index in range(len(reference_seq))}
|
|
101
|
+
|
|
102
|
+
# Create a dictionary from the VCFs where POS is the key and REF, ALT, QUAL and AF(as list) are values
|
|
103
|
+
def vcf2dict(vcf_files):
    """Parse VCF files into ``{POS: [REF, ALT, QUAL, AF]}``.

    AF is a list with one allele frequency per ALT allele, computed from the
    read counts of the last sample column as ``AO / (sum(AO) + RO)``.

    The RO and AO sub-fields are located by name through the FORMAT column
    when a FORMAT column is present; otherwise the sub-fields at index 2
    (RO) and 4 (AO) are used. A missing QUAL ('.') is stored as 0.0. When no
    read supports either allele, every frequency is 0.0 instead of raising
    ZeroDivisionError.

    Args:
        vcf_files: Iterable of paths to plain or gzip-compressed VCF files.

    Returns:
        dict keyed by integer position; a later record at the same position
        overwrites an earlier one.
    """
    vcf_dict = {}

    for vcf_path in vcf_files:
        with open_vcf(Path(vcf_path)) as vcf_file:
            for row in vcf_file:
                if row.startswith("#") or not row.strip():
                    continue

                fields = row.strip().split('\t')
                position, ref, alt = fields[1], fields[3], fields[4]
                qual = 0.0 if fields[5] == '.' else float(fields[5])

                # Column 9 (index 8) names the sub-fields of the sample columns.
                format_keys = fields[8].split(":") if len(fields) > 9 else []
                ro_index = format_keys.index("RO") if "RO" in format_keys else 2
                ao_index = format_keys.index("AO") if "AO" in format_keys else 4

                sample_values = fields[-1].split(":")
                ref_count = int(sample_values[ro_index])
                alt_counts = [int(ao) for ao in sample_values[ao_index].split(",")]

                # sum(alt_counts) so that every ALT allele contributes to the total depth.
                total = sum(alt_counts) + ref_count
                if total:
                    allele_freqs = [ao / total for ao in alt_counts]
                else:
                    allele_freqs = [0.0] * len(alt_counts)

                vcf_dict[int(position)] = [ref, alt, qual, allele_freqs]

    return vcf_dict
|
|
134
|
+
|
|
135
|
+
# Parse the depth file to get a nested dictionary with with the depths of each genomic position for each BAM file
|
|
136
|
+
# {{ BAM0: {pos1: depth1, pos2: depth2, ...}, BAM1: {pos1: depth1, pos2: depth2, ...}}
|
|
137
|
+
def get_sample_depth(depth_file_path, column_index):
    """Read one sample's per-position depth from a tab-separated depth table.

    The table has CHROM in column 0, POS in column 1 and one depth column
    per sample from column 2 onwards.

    Lines starting with '#' and blank lines are skipped. The first line is
    additionally skipped when it does not parse as data (a header without a
    leading '#'); a first line that is data is kept, so header-less depth
    files no longer lose their first position.

    Args:
        depth_file_path: Path of the depth file.
        column_index: Zero-based index of the sample (0, 1, 2...), i.e. the
            order in which the VCFs were passed.

    Returns:
        dict mapping integer position to integer depth for that sample.

    Raises:
        ValueError / IndexError: if a line other than the first is malformed.
    """
    sample_depths = {}

    # Col 0 = CHROM, Col 1 = POS, Col 2 = first sample (index 0).
    actual_col = column_index + 2

    with open(depth_file_path, 'rt') as f:
        for line_number, line in enumerate(f):
            if line.startswith("#") or not line.strip():
                continue

            parts = line.strip().split("\t")
            try:
                position = int(parts[1])
                depth = int(parts[actual_col])
            except (ValueError, IndexError):
                if line_number == 0:
                    continue  # column-name header row
                raise

            sample_depths[position] = depth

    return sample_depths
|
|
159
|
+
|
|
160
|
+
# Define the ambiguity bases to return ambiguity base if SNP between AF 10% and 90%
|
|
161
|
+
# IUPAC codes for every unordered pair of distinct nucleotides.
_IUPAC_TWO_BASE_CODES = {
    frozenset('CT'): 'Y',
    frozenset('AG'): 'R',
    frozenset('AT'): 'W',
    frozenset('CG'): 'S',
    frozenset('GT'): 'K',
    frozenset('AC'): 'M',
}


def ambiguity_code(ref, alt):
    """Return the IUPAC ambiguity code covering both *ref* and *alt*.

    Args:
        ref: Reference base (single character).
        alt: Alternative base (single character).

    Returns:
        The two-base IUPAC code (Y, R, W, S, K or M) for the pair, matched
        case-insensitively; *ref* itself when both bases are identical; and
        'N' for any other input (e.g. an 'N' base), so the caller always
        receives exactly one character.
    """
    if ref == alt:
        return ref

    ref_upper, alt_upper = ref.upper(), alt.upper()
    if ref_upper == alt_upper:
        return ref_upper

    return _IUPAC_TWO_BASE_CODES.get(frozenset((ref_upper, alt_upper)), 'N')
|
|
205
|
+
|
|
206
|
+
# Main function to loop over VCFs in input directory and create the respective consensus genomes
|
|
207
|
+
|
|
208
|
+
def _select_alt_allele(alt_field, allele_freqs):
    """Return the (ALT allele, frequency) pair to call; for multi-allelic sites, the most frequent ALT."""
    if len(allele_freqs) == 1:
        return alt_field, allele_freqs[0]
    highest_af = max(allele_freqs)
    return alt_field.split(',')[allele_freqs.index(highest_af)], highest_af


def _encode_uncalled_site(depth, ancestral_base):
    """Return the consensus character for a position that has no VCF record."""
    if depth is not None and depth > 5:
        return ancestral_base  # well covered and no variant -> ancestral base
    if depth is not None and depth >= 1:
        return 'N'  # 1-5 reads: too little evidence
    return '-'  # zero coverage, or position missing from the depth file


def _encode_variant(ref_allele, alt_allele, allele_freq, ancestral_base):
    """Return the consensus characters for one variant that passed the QUAL filter.

    The length of the returned list is the number of reference positions the
    variant consumes.
    """
    ref_len, alt_len = len(ref_allele), len(alt_allele)
    fixed = allele_freq >= 0.90
    mixed = 0.10 <= allele_freq < 0.90

    if ref_len == 1 and alt_len == 1:  # SNP
        if fixed:
            return [alt_allele]
        if mixed:
            return [ambiguity_code(ref_allele, alt_allele) or 'N']
        return [ancestral_base]

    if alt_len < ref_len:  # small deletion
        if alt_len == 1 and fixed:
            return [alt_allele] + ['-'] * (ref_len - 1)
        if alt_len == 1 and mixed:
            return ['N'] * ref_len
        # Low-frequency deletion, or a complex event with a multi-base ALT.
        return [ancestral_base]

    if alt_len > ref_len:  # small insertion: always the ancestral base
        return [ancestral_base]

    # MNP: REF and ALT have the same length > 1.
    if fixed:
        return list(alt_allele)
    if mixed:
        return [ambiguity_code(r, a) or 'N' for r, a in zip(ref_allele, alt_allele)]
    return list(ref_allele)


def main(args):
    """Write one consensus FASTA per input VCF.

    For every VCF in ``args.vcf_files`` the reference is walked position by
    position:

    * positions listed in the BED files become 'N';
    * positions without a VCF record are encoded from the sample's depth
      ('-' for no coverage or no depth entry, 'N' for 1-5 reads, the
      reference base otherwise);
    * variants with QUAL < 20 become 'N';
    * all other variants are encoded by type and allele frequency.

    A multi-base variant consumes as many reference positions as it emits
    characters. The consensus is written to
    ``<args.output_dir>/<sample>.consensus.fasta`` only when its length
    equals the expected length: the reference length in test mode, otherwise
    ``args.calculated_ref_len`` (default 4411532).

    Args:
        args: Namespace providing ``reference``, ``bed_files``,
            ``vcf_files``, ``depth``, ``output_dir``, ``test_mode`` and
            optionally ``calculated_ref_len``.
    """
    REF = fasta2dict(args)
    EXCLUDED_POS = get_pos_to_exclude(args.bed_files)
    # Report a count only: the full mapping can hold hundreds of thousands of keys.
    print(f"Positions excluded via BED files: {len(EXCLUDED_POS)}")

    if args.test_mode:
        print("Running in test mode - smaller genomes accepted")
        expected_len = get_reference_length(args)
    else:
        # Default reference length is the ancestral MTBC reference length.
        expected_len = getattr(args, 'calculated_ref_len', 4411532)

    reference_size = len(REF)

    # Each VCF is paired with the depth column of the same index.
    for sample_idx, vcf_file in enumerate(args.vcf_files):
        VCF = vcf2dict([vcf_file])
        VCF_DEPTH = get_sample_depth(args.depth, sample_idx)

        fasta_sequence = []
        i = 1

        while i <= reference_size:
            if i in EXCLUDED_POS:
                called = ['N']
            elif i not in VCF:
                # .get(): positions missing from the depth file count as uncovered
                # instead of stalling the loop.
                called = [_encode_uncalled_site(VCF_DEPTH.get(i), REF[i])]
            else:
                reference_base, alt_field, quality, allele_freqs = VCF[i]
                if quality >= 20:
                    alt_allele, allele_freq = _select_alt_allele(alt_field, allele_freqs)
                    called = _encode_variant(reference_base, alt_allele, allele_freq, REF[i])
                else:
                    called = ['N']

            fasta_sequence += called
            i += len(called)

        sequence_length = len(fasta_sequence)
        print("Length of sequence:", sequence_length)

        if sequence_length < expected_len:
            print("Fasta file not created")
            print(f"Fasta has less than {expected_len} bp: {sequence_length}")
        elif sequence_length == expected_len:
            sample_name = get_basename(vcf_file)
            output_path = os.path.join(args.output_dir, f"{sample_name}.consensus.fasta")
            with open(output_path, 'w') as output_file:
                output_file.write(f">{sample_name}\n")
                output_file.write("".join(fasta_sequence))
        else:
            print("Fasta file not created")
            print(f"Fasta more than {expected_len} bp: {sequence_length}")
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import sys
|
|
4
|
+
import argparse
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from Bio import SeqIO
|
|
8
|
+
import gzip
|
|
9
|
+
from . import consensus_galaxy
|
|
10
|
+
from . import snp_aligner_galaxy
|
|
11
|
+
|
|
12
|
+
'''
|
|
13
|
+
|
|
14
|
+
This script is the main script for the "insert name here".
|
|
15
|
+
It is built on top of consensus.py and snp_aligner.py.
|
|
16
|
+
|
|
17
|
+
The user can choose the following with the -s option.
|
|
18
|
+
-s all: Input = VCFs, output = SNP alignment -> runs consensus.py and snp_aligner.py subsequently.
|
|
19
|
+
-s consensus: Input = VCFs, output = consensus fasta files -> runs just consensus.py
|
|
20
|
+
When -s all is chosen, -m option gives choice whether to just output the SNP alignment
|
|
21
|
+
with -m alignment_only or both the SNP alignment and the consensus fasta files with -m everything
|
|
22
|
+
|
|
23
|
+
'''
|
|
24
|
+
|
|
25
|
+
# Define arguments used in the script
|
|
26
|
+
def get_args():
    """Define the command-line interface and return the parsed arguments."""
    cli = argparse.ArgumentParser(description='Main script for consensus.py and snp_aligner.py')
    add = cli.add_argument

    # dest gives every option a descriptive attribute name on the result.
    add('-s', dest='step', choices=['consensus', 'all'], required=True,
        help='Run either consensus, or both consensus and snp_aligner')
    add('-m', dest='mode', choices=['alignment_only', 'everything'],
        help='If -s all then output either just SNP alignment or both the alignment and the consensus fasta files')
    add('-v', dest='vcf_files', action='append', required=True,
        help='path to the input directory with all the vcf files')
    add('-r', dest='reference', required=True,
        help='path to reference genome file')
    add('-d', dest='depth', required=True,
        help='Depth file per position, output of samtools depth')
    add('-c', dest='outgroup_vcf',
        help='Outgroup VCF required for variable alignment')
    add('-n', dest='outgroup_name', type=str,
        help='Clean name for outgroup header in Galaxy')
    add('-b', dest='bed_files', action='append', default=[],
        help='Optional BED files to mask certain genomic regions')
    add('-o', dest='output_dir',
        help='Output directory for consensus files and the variable alignment')
    add('-g', dest='undefined_states', type=float, default=0.9,
        help='Percentage of undefined states allowed per polymorphic position in the alignment')
    add('-t', dest='test_mode', action='store_true',
        help='allows to use smaller files with less genomic positions')

    return cli.parse_args()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# Check if all arguments for the consensus script are there
|
|
51
|
+
def check_consensus_args(args):
    """Raise ValueError naming every consensus input that is unset on *args*.

    The inputs checked are 'vcf_files', 'reference' and 'depth'; an
    attribute counts as missing when it is absent or None.
    """
    missing = []
    for field in ('vcf_files', 'reference', 'depth'):
        if getattr(args, field, None) is None:
            missing.append(field)

    if missing:
        raise ValueError(f"Missing required arguments: {', '.join(missing)}")
|
|
61
|
+
|
|
62
|
+
# Check if all arguments for the snp_aligner script are there
|
|
63
|
+
def check_snp_aligner_args(args):
    """Raise ValueError naming every snp_aligner input that is unset on *args*.

    The inputs checked are 'outgroup_vcf' and 'undefined_states'; an
    attribute counts as missing when it is absent or None.
    """
    missing = []
    for field in ('outgroup_vcf', 'undefined_states'):
        if getattr(args, field, None) is None:
            missing.append(field)

    if missing:
        raise ValueError(f"Missing required arguments: {', '.join(missing)}")
|
|
73
|
+
|
|
74
|
+
# Function to prevent crashing by checking for 1) reference genome length and 2) amounts of variants in VCF
|
|
75
|
+
def safety_check(args, max_total_variants=200000, max_ref_length=16100000):
    """Abort early on inputs that are too large; return the reference length.

    The program exits via ``sys.exit`` when the reference cannot be parsed,
    when it is longer than *max_ref_length*, when a VCF cannot be read, or
    when a VCF holds more than *max_total_variants* non-header lines.

    Args:
        args: Namespace providing ``reference`` and ``vcf_files``.
        max_total_variants: Per-file ceiling on variant lines.
        max_ref_length: Ceiling on reference length in bp.

    Returns:
        Length of the reference genome in bp.
    """
    print("Executing safety check")

    # Reference genome: must parse as a single FASTA record and stay under the ceiling.
    try:
        genome_len = len(SeqIO.read(str(args.reference), "fasta").seq)
    except Exception as e:
        sys.exit(f"ERROR: Failed to parse the reference genome: {e}")

    if genome_len > max_ref_length:
        sys.exit(f"ERROR: Reference genome length ({genome_len} bp) exceeds the maximum allowed bacterial ceiling of {max_ref_length} bp.")

    # VCFs: count the non-header lines of each file.
    for vcf_path in args.vcf_files:
        try:
            variant_count = 0
            with consensus_galaxy.open_vcf(vcf_path) as file:
                for line in file:
                    if not line.startswith('#'):
                        variant_count += 1

            # A single oversized VCF stops the run.
            if variant_count > max_total_variants:
                sys.exit(
                    f"ERROR: Variant overload. File '{os.path.basename(vcf_path)}' "
                    f"contains {variant_count} variant lines.\n"
                    f"This exceeds the safety limit of {max_total_variants} variants per sample. "
                    f"Filter or remove this VCF running the pipeline."
                )

        except Exception as e:
            sys.exit(f"ERROR: Failed reading VCF file {vcf_path}: {e}")

    print(f"Safety check passed, Reference size: {genome_len} bp. Variants of all VCFs within safety margin.")
    return genome_len
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# Main logic of the program
|
|
115
|
+
def main():
    """Command-line entry point: parse arguments, run the safety check, then the requested steps.

    With ``-s consensus`` only ``consensus_galaxy.main`` runs; with ``-s all``
    ``consensus_galaxy.main`` is followed by ``snp_aligner_galaxy.main``.
    When the mode is 'alignment_only', every ``*.fasta`` file in the output
    directory is deleted afterwards.
    """

    args = get_args()

    # Fall back to ./output when no -o was given.
    if args.output_dir is None:
        args.output_dir = "output"

    os.makedirs(args.output_dir, exist_ok=True)

    # Find input files and attribute them to arguments
    #args.reference, args.depth, args.outgroup_vcf, args.vcf_files, args.bed_files = find_input_files(args.input_dir)
    #print("REFERENCE:", args.reference)
    #print("DEPTH:", args.depth)
    #print("OUTGROUP:", args.outgroup_vcf)
    #print("VCFS:", args.vcf_files)
    #print("BEDS:", args.bed_files)

    # Execute safety check and get the length of the reference genome
    ref_genome_length = safety_check(args)

    # Relay reference genome length to consensus_galaxy
    args.calculated_ref_len = ref_genome_length

    # -s consensus: Input = VCFs, output = consensus fasta files -> runs just consensus.py
    if args.step == 'consensus':

        check_consensus_args(args)

        consensus_galaxy.main(args)

    # -s all: Input = VCFs, output = SNP alignment -> runs consensus.py and snp_aligner.py subsequently
    elif args.step == 'all':

        check_consensus_args(args)
        check_snp_aligner_args(args)

        consensus_galaxy.main(args)
        snp_aligner_galaxy.main(args)

        # After consensus is run and -m alignment_only is chosen, delete the fasta files again
        # NOTE(review): indentation was lost in the reviewed copy; this cleanup is assumed
        # to belong to the 'all' branch - confirm against the packaged file.
        # NOTE(review): the glob removes every *.fasta in output_dir - verify that the
        # alignment written by snp_aligner_galaxy does not use that extension.
        if args.mode == 'alignment_only':

            for file in glob.glob(os.path.join(args.output_dir, "*.fasta")):
                os.remove(file)


# Run the pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import os
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from Bio import SeqIO
|
|
6
|
+
from pprint import pprint
|
|
7
|
+
import gzip
|
|
8
|
+
'''
|
|
9
|
+
|
|
10
|
+
This script creates a SNP alignment in multi fasta format from a collection of consensus genomes
|
|
11
|
+
one of which should be the outgroup if intended to be used later for phylogenetic analyses.
|
|
12
|
+
|
|
13
|
+
The user can choose, how many undefined states ('-' or 'N') for a polymorphic position are accepted with
|
|
14
|
+
the -g argument. By default, if a position has more than 90% undefined states, it will be excluded (-g 0.9).
|
|
15
|
+
For a polymorphic site with Ns or - or both to be included in the final SNP alignment, the site has to be poly-
|
|
16
|
+
morphic in terms of at least 2 different bases, plus Ns or -s or both.
|
|
17
|
+
A site with just one base and Ns and or -s is not considered polymorphic.
|
|
18
|
+
|
|
19
|
+
'''
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def flatten(t):
    """Return one flat list holding the items of every sub-iterable of *t*, in order."""
    merged = []
    for group in t:
        merged.extend(group)
    return merged
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Function to open both zipped and unzipped vcf files
|
|
27
|
+
def open_vcf(path):
    """Open a VCF in text mode, decompressing it when it is gzip-compressed.

    Compression is recognised from the file's first two bytes (the gzip
    magic number), not from its name.
    """
    vcf_path = Path(path)

    with open(vcf_path, 'rb') as probe:
        magic = probe.read(2)

    if magic != b'\x1f\x8b':
        return open(vcf_path, "r")
    return gzip.open(vcf_path, "rt")
|
|
37
|
+
|
|
38
|
+
# Create a dictionary of fasta sequences where the sample name is the key and the value the string of bases
|
|
39
|
+
def fastas2dict(args):
    """Load every ``*.fasta`` file in ``args.output_dir`` into ``{header: sequence}``.

    Files are read in sorted name order. Sequence lines are joined without
    separators and blank lines are ignored. Each record of a multi-record
    file is stored under its own header rather than being merged into the
    last one. Records with an empty header are not stored.

    Args:
        args: Namespace providing ``output_dir``.

    Returns:
        dict mapping the header text (without '>') to the sequence string.
    """
    sequences_dict = {}

    def store(header, seq_parts):
        # Join only once per record.
        sequences_dict[header] = "".join(seq_parts)
        print(f"Loaded {header}: {len(sequences_dict[header])} bases")

    for fasta in sorted(Path(args.output_dir).glob("*.fasta")):
        with fasta.open() as fh:
            header = None
            seq_parts = []
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                if line.startswith(">"):
                    # A new record begins: flush the one collected so far.
                    if header:
                        store(header, seq_parts)
                        seq_parts = []
                    header = line[1:]
                else:
                    seq_parts.append(line)

            if header:
                store(header, seq_parts)

    return sequences_dict
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Check if all sequences (values, string of characters) in a dictionary are the same length, return True if so
|
|
68
|
+
def check_length(sequences_dict):
    """Tell whether all sequences in a dictionary have the same length.

    Args:
        sequences_dict: Mapping of sequence name to sequence string.

    Returns:
        bool: True when every value has the same length. An empty mapping
        also yields True.
    """
    return len({len(seq) for seq in sequences_dict.values()}) <= 1
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# Create a 2D array from dictionary of sequences with sequence names at pos 0 and first genomic position at python pos 1
|
|
76
|
+
def dict2array(sequences_dict):
    """Turn a dictionary of equal-length sequences into a 2D list.

    Args:
        sequences_dict: Mapping of sequence name to sequence string.

    Returns:
        list | None: One row per sequence, with the sequence name at index 0
        and the first genomic position at index 1. Returns None (after
        printing a message) when the sequences differ in length.
    """
    print("DEBUG dict2array:")
    print(" Number of sequences:", len(sequences_dict))
    print(" Unique sequence lengths:", set(len(seq) for seq in sequences_dict.values()))

    if not check_length(sequences_dict):
        print("Not all sequences are of equal length")
        return None

    return [[name] + list(seq) for name, seq in sequences_dict.items()]
|
|
94
|
+
|
|
95
|
+
# Get positions from outgroup VCF corresponding to genomic positions of polymorphic sites found in alignment
|
|
96
|
+
def get_outgroup_vcf_var_pos(args, var_positions):
    """Build the outgroup row of the SNP alignment from the outgroup VCF.

    Args:
        args: Parsed arguments. Reads ``args.outgroup_vcf`` (path to the
            plain or gzip-compressed VCF) and ``args.outgroup_name``
            (optional name; falls back to the VCF file stem).
        var_positions: Genomic positions of the polymorphic alignment
            columns. Duplicates are dropped, keeping first-seen order.

    Returns:
        list: The outgroup name followed by one entry per unique position:
        the REF value when ALT is ``.``, the ALT base for a SNP, ``N`` when
        REF or ALT is longer than one character, and ``-`` when the position
        does not occur in the VCF.
    """
    # 1. Clean the outgroup name for the FASTA header
    outgroup_name = args.outgroup_name if args.outgroup_name else Path(args.outgroup_vcf).stem
    outgroup_name = outgroup_name.replace(".vcf", "").replace(".gz", "")

    # 2. Index the VCF by POS (kept as a stripped string); a position listed
    #    more than once keeps its last record.
    outgroup_vcf_dict = {}
    with open_vcf(args.outgroup_vcf) as outgroup_vcf:
        for var in outgroup_vcf:
            if var.startswith('#') or not var.strip():
                continue

            # Strict tab splitting and stripping to avoid hidden characters
            fields = var.strip().split('\t')
            if len(fields) < 5:
                continue

            pos = fields[1].strip()
            ref = fields[3].strip()
            alt = fields[4].strip()
            outgroup_vcf_dict[pos] = [ref, alt]

    # 3. De-duplicate the requested positions and convert them to strings so
    #    they match the dictionary keys.
    unique_var_positions = [str(p) for p in dict.fromkeys(var_positions)]

    # DEBUG SECTION
    print(f"DEBUG: Input positions count: {len(unique_var_positions)}")
    print(f"DEBUG: Outgroup VCF dict entries: {len(outgroup_vcf_dict)}")
    if unique_var_positions:
        print(f"DEBUG: First search key: '{unique_var_positions[0]}'")

    outgroup_sequence = []
    for key in unique_var_positions:
        if key in outgroup_vcf_dict:
            ref_val, alt_val = outgroup_vcf_dict[key]

            if alt_val == '.':  # No ALT allele -> use REF
                outgroup_sequence.append(ref_val)
            elif len(alt_val) > 1 or len(ref_val) > 1:  # Indel
                outgroup_sequence.append('N')
            else:  # SNP
                outgroup_sequence.append(alt_val)

            # NOTE(review): the original nesting level of this debug print
            # could not be recovered; it is emitted for every matched position.
            print(f"MATCH FOUND: Position {key} is in Outgroup VCF. Base: {outgroup_vcf_dict[key]}")
        else:
            # Position absent from the outgroup VCF: encode it as a gap.
            print(f"MATCH FAILED: Position {key} not found in Outgroup VCF keys.")
            outgroup_sequence.append("-")

    # Add the header at the start
    result = [outgroup_name] + outgroup_sequence

    print(f"DEBUG: Finalll outgroup row length: {len(result)}")
    return result
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# Main script to extract polymorphic positions from the alignment
|
|
163
|
+
def main(args):
    """Extract the polymorphic positions of the consensus FASTAs and write them as a SNP alignment.

    A column is kept when it holds at least two different states other than
    ``N`` and ``-`` and, if it contains any ``N`` or ``-``, when the share of
    those undefined states does not exceed ``args.undefined_states``. ``X``
    is treated as ``N``.

    Args:
        args: Parsed arguments. Reads ``args.undefined_states``,
            ``args.output_dir`` and ``args.outgroup_vcf`` here; the helper
            functions called from here read further attributes.

    Raises:
        ValueError: If no sequences were loaded or the sequences differ in
            length. Previously these cases ended in an ``IndexError`` or in
            silent truncation to the length of the first sequence.

    Side effects:
        Writes ``snp_alignment.fasta`` into ``args.output_dir`` and prints
        progress information.
    """
    UNDEF_STAT = args.undefined_states  # maximum allowed share of N / - per column
    SEQ_DICT = fastas2dict(args)

    if not SEQ_DICT:
        raise ValueError(f"No consensus FASTA sequences found in {args.output_dir}")
    if not check_length(SEQ_DICT):
        raise ValueError("Not all sequences are of equal length")

    seq_names = list(SEQ_DICT.keys())
    sequences = [SEQ_DICT[name] for name in seq_names]  # hoisted dict lookups
    num_seqs = len(seq_names)
    genome_length = len(sequences[0])

    # Initialize pol_pos with sequence names as the first element of each row
    pol_pos = [[name] for name in seq_names]
    polymorphic_indices = []
    undefined_states = {'N', '-'}

    print(f"Analyzing {num_seqs} sequences across {genome_length} bp...")

    # zip(*sequences) yields one column at a time, so the full alignment
    # matrix is never materialized. Positions are numbered from 1.
    for position, raw_column in enumerate(zip(*sequences), start=1):
        # Cheap exit for invariant columns: X -> N cannot make them variable.
        if len(set(raw_column)) == 1:
            continue

        column = ['N' if base == 'X' else base for base in raw_column]
        unique_states = set(column)
        if len(unique_states) < 2:
            continue

        count_undef = column.count('N') + column.count('-')
        if count_undef:
            if (count_undef / num_seqs) > UNDEF_STAT:
                continue
            # With N and/or - present, at least two other states are required.
            if len(unique_states - undefined_states) < 2:
                continue

        for row, base in zip(pol_pos, column):
            row.append(base)
        polymorphic_indices.append(position)

    print("\npol_pos preview (first 5 sequences, first 10 entries):")
    for row in pol_pos[:5]:
        print(row[:10])

    # Append the outgroup sequence if provided via VCF
    if args.outgroup_vcf:
        # polymorphic_indices is already unique and in ascending order.
        outgroup_line = get_outgroup_vcf_var_pos(args, var_positions=polymorphic_indices)

        # Safety check: Row length must match (Name + Bases)
        if len(outgroup_line) != len(pol_pos[0]):
            print(f"CRITICAL ERROR: Outgroup row length ({len(outgroup_line)}) "
                  f"mismatch with Sample row length ({len(pol_pos[0])})")

        pol_pos.append(outgroup_line)

    print("Number of polymorphic sites:", len(pol_pos[0]) - 1)
    print("Number of polymorphic_indices:", len(polymorphic_indices))

    # Write the output file in FASTA format: >Header\nSequence\n
    output_alignment_path = os.path.join(args.output_dir, 'snp_alignment.fasta')
    with open(output_alignment_path, 'w') as output_file:
        for row in pol_pos:
            sequence_name = row[0]
            # str() guards against a non-string entry slipping into a row.
            sequence = "".join(str(base) for base in row[1:])
            output_file.write(f">{sequence_name}\n")
            output_file.write(f"{sequence}\n")

    print(f"Successfully wrote {len(pol_pos)} sequences to {output_alignment_path}")
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tb_consensus_aligner
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Building consensus fasta files and variable multi-sequence alignments for mycobacterial genomes.
|
|
5
|
+
Author-email: scg40 <gian.schuepbach@swisstph.ch>
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Classifier: Intended Audience :: Science/Research
|
|
9
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
10
|
+
Requires-Python: >=3.8
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
Requires-Dist: biopython>=1.80
|
|
13
|
+
|
|
14
|
+
# TBConsensusAligner
|
|
15
|
+
TBConsensusAligner is a tool designed to create a multi-sequence alignment via consensus
|
|
16
|
+
genomes created from VCF files.
|
|
17
|
+
It can currently be used in different modes as follows
|
|
18
|
+
- VCF to SNP alignment `-s all`
|
|
19
|
+
- input = VCFs, output = SNP alignment
|
|
20
|
+
- runs `consensus_galaxy.py` and `snp_aligner_galaxy.py`
|
|
21
|
+
|
|
22
|
+
- VCF to consensus FASTA files `-s consensus`
|
|
23
|
+
- input = VCFs, output = consensus FASTA files
|
|
24
|
+
- runs only `consensus_galaxy.py`
|
|
25
|
+
|
|
26
|
+
If the script produces a SNP alignment from VCFs, the user can choose whether to output just the SNP alignment
|
|
27
|
+
or the used consensus FASTA files as well with the `-m` option.
|
|
28
|
+
|
|
29
|
+
When running the `consensus_galaxy.py` the user has to provide the VCF files, the reference genome used in their creation and the multisample
|
|
30
|
+
depth file from samtools depth. Optionally, the user can provide one or several BED files to mask certain regions of the genome.
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
The script is run via the command line.
|
|
34
|
+
|
|
35
|
+
***usage***:
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
Usage: TBConsensusAligner [options]
|
|
39
|
+
|
|
40
|
+
Options: -s STR ['consensus' or 'all'] Create either consensus files or consensus files and a variable alignment
|
|
41
|
+
|
|
42
|
+
-m STR ['everything' or 'alignment_only'] If -s all is chosen, output either consensus files and the alignment or just the alignment
|
|
43
|
+
|
|
44
|
+
-v list[STR] Path to the input VCF files
|
|
45
|
+
|
|
46
|
+
-r STR Path to the reference genome
|
|
47
|
+
|
|
48
|
+
-d STR Path to the depth file created by samtools depth
|
|
49
|
+
|
|
50
|
+
-c STR Path to the outgroup VCF for the alignment
|
|
51
|
+
|
|
52
|
+
-b list[STR] Path to the BED files to mask certain genomic regions
|
|
53
|
+
|
|
54
|
+
-o STR Path to the output directory
|
|
55
|
+
|
|
56
|
+
-g FLOAT Proportion of undefined states allowed per polymorphic position in the alignment (default: 0.9, i.e. 90 percent)
|
|
57
|
+
|
|
58
|
+
-t BOOLEAN Test Mode to run the script with smaller genomes
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Example usages command line
|
|
63
|
+
|
|
64
|
+
### From 3 VCFs to alignment, output consensus FASTAs and SNP alignment, masked by two bedfiles
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
python yourfolder/main_galaxy.py \
|
|
68
|
+
-s all \
|
|
69
|
+
-m everything \
|
|
70
|
+
-v path/to/vcf1/vcf1.vcf -v path/to/vcf2/vcf2.vcf -v path/to/vcf3/vcf3.vcf \
|
|
71
|
+
-r path/to/reference/reference_genome.reference.fasta \
|
|
72
|
+
-d path/to/depthfile/depthfile.tabular \
|
|
73
|
+
-c path/to/outgroupvcf/outgroup.vcf \
|
|
74
|
+
-b path/to/bedfile1/bed1.bed -b path/to/bedfile2/bed2.bed \
|
|
75
|
+
-o output \
|
|
76
|
+
-g 0.9
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Logic consensus_galaxy.py
|
|
81
|
+
|
|
82
|
+
This script produces consensus FASTA files from VCFs. The user provides the VCFs, the reference genome,
|
|
83
|
+
the multisample depth file obtained from `samtools depth` and optionally BED files to mask certain genomic regions.
|
|
84
|
+
|
|
85
|
+
***algorithm***
|
|
86
|
+
|
|
87
|
+
We loop through each position in the reference genome and build the consensus sequence base per base with the following rules:
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
- SNPs and MNPs with a frequency >90% are encoded with the alternative base from the VCF.
|
|
91
|
+
- SNPs and MNPs with a frequency between 10% and 90% are encoded with the ambiguity base.
|
|
92
|
+
- SNPs and MNPs with a frequency <10% are encoded with the ancestral base from the reference genome.
|
|
93
|
+
|
|
94
|
+
- Large deletions (coverage 0) are encoded by dashes (-).
|
|
95
|
+
- Small deletions with frequency >90% are encoded by dashes (-).
|
|
96
|
+
- Small deletions with frequency between 10% and 90% are encoded with a 'N'.
|
|
97
|
+
- Small deletions with frequency <10% are encoded with the ancestral base from the reference genome.
|
|
98
|
+
|
|
99
|
+
- Small insertions (alternative bases longer than reference base) are encoded with the ancestral state from the reference genome.
|
|
100
|
+
|
|
101
|
+
- Sites to exclude specified in the bed files are encoded with a 'N'.
|
|
102
|
+
- Variants that have a phred score < 20 are encoded with a 'N'.
|
|
103
|
+
- Sites that are not in the VCF and are covered by less than 5 reads (via depth file) are encoded with a 'N'.
|
|
104
|
+
|
|
105
|
+
- If more than one ALT allele is present, we consider the one with the highest allele frequency.
|
|
106
|
+
```
|
|
107
|
+
Since the VCFs this script is tailored to do not have the allele frequency `AF` calculated, we calculate it using
|
|
108
|
+
`RO = REF allele occurrence` and `AO = ALT allele occurrence` with
|
|
109
|
+
`AF = AO / ( sum(all AO's) + RO )`.
|
|
110
|
+
|
|
111
|
+
## Logic snp_aligner_galaxy.py
|
|
112
|
+
|
|
113
|
+
This script produces a SNP alignment i.e. multi-sequence alignment of polymorphic positions in FASTA format from consensus FASTA files.
|
|
114
|
+
The consensus FASTAs are generated in the previous step by `consensus_galaxy.py` wrapped in `TBConsensusAligner`.
|
|
115
|
+
|
|
116
|
+
***algorithm***
|
|
117
|
+
|
|
118
|
+
The script creates a multi-sequence alignment from the consensus FASTAs in the form of a 2D array where each row represents a sequence
|
|
119
|
+
and each column a genomic position. Each column is checked for the presence of a polymorphism. If such a polymorphism is detected,
|
|
120
|
+
the column is appended to the alignment of polymorphic positions. For each polymorphic position, the corresponding nucleotide from
|
|
121
|
+
the outgroup VCF is retrieved to get the outgroup's sequence.
|
|
122
|
+
|
|
123
|
+
The algorithm to populate the multi-sequence alignment of polymorphic positions is as follows.
|
|
124
|
+
The user can control the proportion of gaps or undefined states (`-` or `N`) for a polymorphic position
|
|
125
|
+
to be kept in the alignment (argument `-g`). By default, this is set to `0.9`, meaning that if a polymorphic position has
|
|
126
|
+
more than 90% gaps or undefined states, it will not be in the final alignment.
|
|
127
|
+
|
|
128
|
+
## Directory structure used for development
|
|
129
|
+
|
|
130
|
+
* TBConsensusAligner
|
|
131
|
+
* test_data
|
|
132
|
+
- snp_alignment.fasta
|
|
133
|
+
- test_G77777.consensus.fasta
|
|
134
|
+
- test_G77777.vcf.gz
|
|
135
|
+
- test_G88888_k1.consensus.fasta
|
|
136
|
+
- test_G88888_k1.vcf.gz
|
|
137
|
+
- test_G99999.k2.consensus.fasta
|
|
138
|
+
- test_G99999.k2.vcf.gz
|
|
139
|
+
- test_Galaxy_multiple_depths_header.tabular
|
|
140
|
+
- test_reference_200bp.reference.fasta
|
|
141
|
+
- test_regions_blindspots_modlin_farhat_and_PE_PPE_PGRS.bed
|
|
142
|
+
- test.outgroup.all.pos.vcf.gz
|
|
143
|
+
- consensus_galaxy.py
|
|
144
|
+
- main_galaxy.py
|
|
145
|
+
- README.md
|
|
146
|
+
- snp_aligner_galaxy.py
|
|
147
|
+
- TBConsensusAligner.xml
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
tb_consensus_aligner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
tb_consensus_aligner/consensus_galaxy.py,sha256=NnwO3_rT43NquvkOFQR0si_ivbAP0JRfOn4o1ezkdJw,20165
|
|
3
|
+
tb_consensus_aligner/main_galaxy.py,sha256=8m_73RdxwDM3W_sjtNtKyvxhEmMYWEXFp2BJoCd1kR4,7376
|
|
4
|
+
tb_consensus_aligner/snp_aligner_galaxy.py,sha256=tv8J2LmBsHzr9K-T5fLhd82B7q7CYDJEuE3UDK38UkY,9218
|
|
5
|
+
tb_consensus_aligner-1.0.0.dist-info/METADATA,sha256=2WWLuwLTnqcbzQXzKW9EhSREDX8CAVGMliPVb2z_ALk,6415
|
|
6
|
+
tb_consensus_aligner-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
7
|
+
tb_consensus_aligner-1.0.0.dist-info/entry_points.txt,sha256=cBK81ccXI6DIOvhvxfZNr4v2j21Rdq_7HEJT_1RGKUw,79
|
|
8
|
+
tb_consensus_aligner-1.0.0.dist-info/top_level.txt,sha256=Q8H2spBV19grmt3Mw6TTiZ_D-1CvaK0LeqkqTT84gbA,21
|
|
9
|
+
tb_consensus_aligner-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
tb_consensus_aligner
|