snp-search 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,196 @@
1
+
2
+ #This is the method that creates the database and imports the data.
3
+
4
+ #Called in lib/snp-search.rb
5
+
6
# Guesses the format of the reference sequence file from its extension.
# The extension is compared case-insensitively.
#
# reference_genome - path (String) of the reference sequence file.
#
# Returns :genbank for .gbk/.genbank/.gb, :embl for .embl/.emb,
# and nil for any other (or missing) extension.
def guess_sequence_format(reference_genome)
  extension = File.extname(reference_genome).downcase

  if [".gbk", ".genbank", ".gb"].include?(extension)
    :genbank
  elsif [".embl", ".emb"].include?(extension)
    :embl
  end
end
18
+
19
# Populates the features and annotations tables from the parsed gbk/embl record.
#
# Every feature except 'source' and 'gene' is stored (those two only repeat
# information; 'CDS' carries the gene itself). For each stored feature the
# start, end and strand of its first location are saved, together with the
# nucleotide sequence cut out of the record's sequence for that location.
# Each qualifier/value pair of the feature becomes one Annotation row attached
# to the feature. The 'locations' method comes from bioruby, which must be
# required by the caller.
#
# sequence_file - parsed record responding to `features` and `seq`.
def populate_features_and_annotations(sequence_file)
  puts "Adding features and their annotations...."
  ActiveRecord::Base.transaction do
    added = 0
    sequence_file.features.each do |feature|
      # 'source' and 'gene' entries are skipped on purpose
      next if feature.feature == "source" || feature.feature == "gene"

      added += 1
      puts "Total number of features and annotations added: #{added}" if added % 100 == 0

      location = feature.locations.first
      record = Feature.new
      record.name = feature.feature
      record.start = location.from
      record.end = location.to
      record.strand = location.strand
      # Coordinates are 1-based, string indexes are 0-based
      record.sequence = sequence_file.seq[(location.from - 1)..(location.to - 1)]
      record.save

      # One Annotation row per qualifier of the feature
      feature.qualifiers.each do |qualifier|
        annotation = Annotation.new
        annotation.qualifier = qualifier.qualifier
        annotation.value = qualifier.value
        annotation.save
        record.annotations << annotation
      end
    end
  end
end
51
+
52
# Populates the rest of the database from a VCF file: strains, SNPs, alleles
# and genotypes, and finally links every SNP to the features that span it.
#
# vcf_file   - path of the VCF file to import.
# cuttoff_ad - minimum fraction (0..1) of the allele depth (AD) that must
#              support the called allele in every sample; a record is skipped
#              when any sample falls below it. Only applied when the FORMAT
#              column contains AD.
#
# A record is imported only when every sample has a homozygous (or haploid)
# call with coverage. Records without a GT key or without sample columns are
# skipped.
def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
  puts "Adding SNPs........"

  # The AD ratio is a fraction, so the cutoff has to be compared as a float
  # (to_i would turn e.g. 0.9 into 0 and disable the filter).
  min_ad_ratio = cuttoff_ad.to_f

  # open vcf file and parse each line
  File.open(vcf_file) do |f|
    # Skip the meta-information lines until the column header line
    header = nil
    while (line = f.gets)
      if line =~ /CHROM/
        header = line.chomp
        break
      end
    end

    if header
      # Sample columns start at index 9; anything after the first dot of the name is dropped
      strain_names = (header.split("\t")[9..-1] || []).map { |name| name.sub(/\..*/, '') }

      # One Strain row per sample column, kept in column order
      strains = strain_names.map do |strain_name|
        strain = Strain.new
        strain.name = strain_name
        strain.save
        strain
      end

      good_snps = 0
      # start parsing snps
      while (line = f.gets)
        line.chomp!
        details = line.split("\t")
        next if details.size < 10 # blank line or record without sample columns

        ref_pos = details[1].to_i
        ref_base = details[3]
        snp_bases = details[4].split(",")
        snp_qual = details[5]
        format = details[8].split(":")
        gt_array_position = format.index("GT")
        gq_array_position = format.index("GQ")
        ad_array_position = format.index("AD")
        samples = details[9..-1]

        next unless gt_array_position # nothing to genotype without GT
        next if samples.any? { |sample| sample =~ /\.\/\./ } # no coverage in at least one sample

        gts = []
        gqs = []
        ad_ratios = []

        samples.each do |sample|
          format_values = sample.split(":") # output (e.g.): ["0/0", "0,255,209", "99"]
          # "/" separates unphased calls, "|" phased ones
          gt = format_values[gt_array_position].to_s.split(/[\/|]/)
          next if gt.size > 1 && (gt.first != gt.last) # if its 0/1, 1/2 etc then ignore
          next if gt.first.nil? || gt.first == "." # no coverage
          gt = gt.first.to_i

          # A missing GQ is stored as 0.0, like a missing per-sample value
          gq = gq_array_position ? format_values[gq_array_position].to_f : 0.0

          if ad_array_position
            # AD is the allele specific depth, i.e. if ref is 'A', alt is 'G' and AD is '6,9'
            # there are 6 A reads and 9 G reads.
            ad = format_values[ad_array_position].to_s.split(",").map { |ad_value| ad_value.to_i }
            # Sum of all reads reported by AD (15 in the example above)
            sum_of_ad = ad.inject(0) { |sum, x| sum + x }
            # With no reads at all the call has no support: ratio 0 instead of NaN
            ad_ratios << (sum_of_ad.zero? ? 0.0 : ad[gt].to_i / sum_of_ad.to_f)
          end

          gqs << gq
          gts << gt
        end

        # exclude if any sample has a call ratio of less than the cutoff set by the user
        next if ad_ratios.any? { |ad_ratio| ad_ratio < min_ad_ratio }
        # some gts have been rejected due to heterozygote or no coverage
        next unless gts.size == samples.size

        good_snps += 1

        # populate snps
        ActiveRecord::Base.transaction do
          s = Snp.new
          s.ref_pos = ref_pos
          s.qual = snp_qual
          s.save

          # create ref allele
          ref_allele = Allele.new
          ref_allele.base = ref_base
          ref_allele.snp = s
          ref_allele.save

          s.reference_allele = ref_allele
          s.save

          # Alternative alleles keyed by their genotype index (1 = first ALT base),
          # so each genotype below is matched to the allele it actually called.
          snp_alleles = {}
          gts.uniq.select { |gt| gt > 0 }.each do |gt|
            snp_allele = Allele.new
            snp_allele.base = snp_bases[gt - 1]
            snp_allele.snp = s
            snp_allele.save
            snp_alleles[gt] = snp_allele
          end

          genos = gts.each_with_index.map do |gt, index|
            genotype = Genotype.new
            genotype.strain = strains[index]
            genotype.geno_qual = gqs[index]
            # gt 0 is the wild type, anything else a snp type
            genotype.allele = gt == 0 ? ref_allele : snp_alleles[gt]
            genotype
          end

          # Using activerecord-import to speed up importing
          Genotype.import genos, :validate => false
          puts "Total SNPs added so far: #{good_snps}" if good_snps % 100 == 0
        end
      end
    end
  end

  # Here we link the features to snps.
  puts "Linking features to SNPs"
  ActiveRecord::Base.transaction do
    Snp.all.each_with_index do |snp, index|
      puts "Total SNPs linked to features added so far: #{index}" if index % 100 == 0
      features = Feature.where("features.start <= ? AND features.end >= ?", snp.ref_pos, snp.ref_pos)
      features.each { |feature| snp.features << feature }
    end
  end
end
@@ -0,0 +1,130 @@
1
+
2
+ # require "/Volumes/NGS2_DataRAID/projects/ali/GAS/snp-search-2.0.0/lib/information_methods.rb"
3
+ # This method performs several queries to ignore elements of the data for fasta or tabular output.
4
+ # It is called in lib/snp-search.rb
5
+
6
+ require 'output_information_methods'
7
+
8
+ def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cuttoff_genotype, cuttoff_snp, tree, fasttree_path)
9
+
10
+ strains = Strain.all
11
+
12
+ sequence_hash = Hash.new
13
+ sequence_hash["ref"] = Array.new
14
+ strains.each do |strain|
15
+ sequence_hash[strain.name] = Array.new
16
+ end
17
+
18
+ snps_array = Array.new
19
+ snp_positions = Array.new
20
+
21
+ # output opened for data input
22
+ output = File.open(out, "w")
23
+ tab_delim_file_name = File.basename(out, File.extname(out)) + "_snps.tsv"
24
+ tab_delim_file = File.open(tab_delim_file_name, "w")
25
+ position_map_file_name = File.basename(out, File.extname(out)) + "_snps_positions.txt"
26
+ position_map_file = File.open(position_map_file_name, "w")
27
+
28
+
29
+ snps_within_features_with_annotation = ""
30
+ # Perform query
31
+ # puts ignore_snps_on_annotation.inspect
32
+ if ignore_snps_on_annotation
33
+ annotations_where = ignore_snps_on_annotation.split(",").map{|annotation| "annotations.value LIKE '%#{annotation}%'"}.join(" OR ")
34
+ features_with_annotation = Feature.joins(:annotations).where(annotations_where)
35
+ snps_within_features_with_annotation = Snp.joins(:features).where("features.id IN (?)", features_with_annotation.collect{|feature| feature.id})
36
+ end
37
+
38
+ if snps_within_features_with_annotation.empty?
39
+ snps = Snp.all
40
+ else
41
+ snps = Snp.where("snps.id NOT IN (?)", snps_within_features_with_annotation.collect{|snp| snp.id})
42
+ end
43
+
44
+ positions_to_ignore = Array.new
45
+ if ignore_snps_in_range
46
+ range_strings = ignore_snps_in_range.split(",")
47
+ range_strings.each do |range|
48
+ start_position, end_position = range.split("..")
49
+ positions_to_ignore += (start_position.to_i..end_position.to_i).to_a
50
+ end
51
+ end
52
+
53
+ if ignore_strains
54
+ strains_to_ignore = ignore_strains.split(",")
55
+ end
56
+
57
+
58
+ i = 0
59
+ puts "Your Query is submitted and is being processed......."
60
+ strains = Strain.find(:all)
61
+ if ignore_strains
62
+ strains_to_ignore = ignore_strains.split(",")
63
+ strains.reject!{|strain| strains_to_ignore.include?(strain.name)}
64
+ end
65
+
66
+ snps.each do |snp|
67
+
68
+ ActiveRecord::Base.transaction do
69
+ i += 1
70
+ next if positions_to_ignore.include?(snp.ref_pos) # Ignore positions that user specified
71
+ alleles = snp.alleles
72
+
73
+ genotypes = snp.alleles.collect{|allele| allele.genotypes}.flatten
74
+
75
+ snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}")
76
+ # ignore snp if the snp qual is less than cuttoff.
77
+ next if snp_qual.any?{|snps_quality| snps_quality.qual < cuttoff_snp.to_i}
78
+
79
+ next if alleles.any?{|allele| allele.base.length > 1} # indel
80
+ next unless genotypes.all?{|genotype| genotype.geno_qual >= cuttoff_genotype} # all geno quals > cutoff
81
+ # puts "#{i} SNPs processed so far" if i % 100 == 0
82
+ strain_alleles = Hash.new
83
+ strains.each do |strain|
84
+ strain_genotype = genotypes.select{|genotype| genotype.strain_id == strain.id}.first
85
+ strain_allele = alleles.select{|allele| allele.id == strain_genotype.allele_id}.first
86
+
87
+ strain_alleles[strain.name] = strain_allele.base
88
+ end
89
+
90
+ if remove_non_informative_snps
91
+ next if strain_alleles.values.uniq.size == 1 # remove non-informative SNPs
92
+ end
93
+
94
+ snp_positions << snp.ref_pos
95
+ snps_array << snp
96
+ strain_alleles.each do |strain_name, allele_base|
97
+ sequence_hash[strain_name] << allele_base
98
+ end
99
+ sequence_hash["ref"] << snp.reference_allele.base
100
+ end
101
+ end
102
+
103
+ # If user has specified a tabular output
104
+ if tabular_output
105
+ output_information_methods(snps_array, output, cuttoff_genotype, cuttoff_snp, true)
106
+ # If user has specified a fasta output
107
+ elsif fasta_output
108
+ # generate FASTA file
109
+ output.puts ">ref\n#{sequence_hash["ref"].join("")}"
110
+ tab_delim_file.puts "\t#{snp_positions.join("\t")}"
111
+ tab_delim_file.puts "ref\t#{sequence_hash["ref"].join("\t")}"
112
+ strains.each do |strain|
113
+ output.puts ">#{strain.name}\n#{sequence_hash[strain.name].join("")}"
114
+ tab_delim_file.puts "#{strain.name}\t#{sequence_hash[strain.name].join("\t")}"
115
+ end
116
+
117
+ snp_positions.each_with_index do |snp_position, index|
118
+ position_map_file.puts "#{index+1} => #{snp_position}"
119
+ end
120
+ end
121
+ # If user has chosen a newick output.
122
+ if tree
123
+ nwk_out_file_name = File.basename(out, File.extname(out)) + ".nwk"
124
+ puts "running phylogeny"
125
+ `#{fasttree_path} -fastest -nt #{output} > #{nwk_out_file_name}`
126
+ end
127
+
128
+ output.close
129
+ tab_delim_file.close
130
+ end
@@ -0,0 +1,117 @@
1
+
2
+ # This method outputs information for each SNP, e.g. synonymous and non-synonymous etc.
3
+ # It is called in lib/snp-search.rb
4
+
5
+ def information()
6
+
7
+ strains = Strain.all
8
+
9
+ output = File.open("#{out}", "w")
10
+
11
+ output.puts "start_cds_in_ref\tend_cds_in_ref\tpos_of_SNP_in_ref\tSNP_base\tsynonymous or non-synonymous\tpossible_pseudogene?\tamino_acid_original\tamino_acid_change\tchange_in_hydrophobicity_of_AA?\tchange_in_polarisation_of_AA?\tchange_in_size_of_AA?\t#{strains.map{|strain| strain.name}.join("\t")}"
12
+
13
+ snp_info = 0
14
+
15
+ snps = Snp.find_by_sql("SELECT snps.* from snps inner join alleles on snps.id = alleles.snp_id")
16
+
17
+ snps.each do |snp|
18
+ snp.alleles.each do |allele|
19
+
20
+ features = Feature.joins(:snps).where("snps.id = ?", snp.id)
21
+
22
+ features.each do |feature|
23
+ if feature.feature == "CDS" || "RNA"
24
+ annotation = Annotation.where("annotations.qualifier = 'product' and annotations.feature_id = ?", feature.id).first
25
+ else
26
+ annotation = Annotation.where("annotations.qualifier = 'note' and annotations.feature_id = ?", feature.id).first
27
+ end
28
+
29
+
30
+ all_seqs_mutated = genome_sequence.seq
31
+ mutated_seq_translated = []
32
+ original_seq_translated = []
33
+
34
+ # Mutate the sequence with the SNP
35
+ all_seqs_mutated[snp.ref_pos.to_i-1] = allele.base
36
+
37
+ # The reason why we use all the sequence from the genome and not just the feature sequence is because the ref_pos is based on the genome sequence and when we come to mutate it we have to have the full sequence to find the position.
38
+
39
+ mutated_seq = Bio::Sequence.auto(all_seqs_mutated[feature.start-1..feature.end-1])
40
+
41
+ original_seq = Bio::Sequence.auto(all_seqs_original[feature.start-1..feature.end-1])
42
+
43
+ #If the strand is negative then reverse complement
44
+
45
+ if feature.strand == -1
46
+ mutated_seq_translated << mutated_seq.reverse_complement.translate
47
+ original_seq_translated << original_seq.reverse_complement.translate
48
+
49
+ else
50
+ mutated_seq_translated << mutated_seq.translate
51
+ original_seq_translated << original_seq.translate
52
+
53
+ end
54
+
55
+ # Remove the star at the end of each translated sequence.
56
+
57
+ mutated_seq_translated.zip(original_seq_translated).each do |mut, org|
58
+ mutated_seq_translated_clean = mut.gsub(/\*$/,"")
59
+ original_seq_translated_clean = org.gsub(/\*$/,"")
60
+
61
+ # Amino acid properties
62
+
63
+ hydrophobic = ["I", "L", "V", "C", "A", "G", "M", "F", "Y", "W", "H", "T"]
64
+ non_hydrophobic = ["K", "E", "Q", "D", "N", "S", "P", "B"]
65
+
66
+ polar = ["Y", "W", "H", "K", "R", "E", "Q", "D", "N", "S", "P", "B"]
67
+ non_polar = ["I", "L", "V", "C", "A", "G", "M", "F", "T"]
68
+
69
+ small = ["V","C","A","G","D","N","S","T","P"]
70
+ non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"]
71
+
72
+ alleles_array = []
73
+ strains.each do |strain|
74
+ allele = Allele.joins(:genotypes => :strain).where("strains.id = ? AND alleles.snp_id = ?", strain.id, snp.id).first
75
+ alleles_array << allele.base
76
+ end
77
+
78
+ snp_info +=1
79
+
80
+ if original_seq_translated_clean == mutated_seq_translated_clean
81
+ if mutated_seq_translated_clean =~ /\*/
82
+ output.puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tsynonymous\tYes\t\t\t\t\t\t#{alleles_array.join("\t")}"
83
+ else
84
+ output.puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tsynonymous\t\t\t\t\t\t\t#{alleles_array.join("\t")}"
85
+ end
86
+ else
87
+
88
+ diffs = Diff::LCS.diff(original_seq_translated_clean, mutated_seq_translated_clean)
89
+
90
+ if mutated_seq_translated_clean =~ /\*/
91
+ output.puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tnon-synonymous\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t")}"
92
+ puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tnon-synonymous\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t")}"
93
+ else
94
+ output.puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tnon-synonymous\t\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t")}"
95
+ puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tnon-synonymous\t\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t")}"
96
+ end
97
+ end
98
+ puts "Total SNPs outputted so far: #{snp_info}" if snp_info % 50 == 0
99
+ end
100
+ end
101
+ end
102
+
103
+ #Take all SNP positions in ref genome
104
+ # snp_positions = Feature.find_by_sql("select snps.ref_pos from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|snp| snp.ref_pos}
105
+
106
+ # # Take all SNP nucleotide
107
+ # snps = Feature.find_by_sql("select alleles.base from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|allele| allele.base}
108
+
109
+ # # Mutate (substitute) the original sequence with the SNPs
110
+
111
+ # # Here all_seqs_original are all the nucelotide sequences but with the snps subsituted in them
112
+
113
+ # #Get start position of CDS with SNP
114
+ # coordinates_start = Feature.find_by_sql("select start from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where features.name = 'CDS' and alleles.id <> snps.reference_allele_id").map{|feature| feature.start}
115
+
116
+ # #Get end position of CDS with SNP
117
+ # coordinates_end = Feature.find_by_sql("select end from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where features.name = 'CDS' and alleles.id <> snps.reference_allele_id").map{|feature| feature.end}
@@ -0,0 +1,131 @@
1
+
2
# Writes a tab-delimited report with one line per alternative allele of each
# SNP, followed by summary totals. It is called in lib/snp-search.rb.
#
# For SNPs inside a CDS that has a 'product' annotation the line states whether
# the change is synonymous or non-synonymous, whether the mutated protein
# contains an internal stop (possible pseudogene), the amino acid before and
# after the change and whether its hydrophobicity, polarity or size class
# changed. SNPs outside any feature, or in a CDS without a product annotation,
# get a short line with position and bases only. Features other than CDS
# produce no line.
#
# snps             - collection of Snp records to report.
# outfile          - open IO object the report is written to.
# cuttoff_genotype - SNPs with any genotype quality below this are skipped.
# cuttoff_snp      - SNPs with a quality below this are skipped.
# info             - when true, the base of every strain is appended to each line.
def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, info = true)
  strains = Strain.all

  outfile.puts "pos_of_SNP_in_ref\tref_base\tSNP_base\tsynonymous or non-synonymous\tGene_annotation\tpossible_pseudogene?\tamino_acid_original\tamino_acid_change\tchange_in_hydrophobicity_of_AA?\tchange_in_polarisation_of_AA?\tchange_in_size_of_AA?\t#{strains.map{|strain| strain.name}.join("\t") if info}"

  # Amino acid properties
  hydrophobic = ["I", "L", "V", "C", "A", "G", "M", "F", "Y", "W", "H", "T"]
  non_hydrophobic = ["K", "E", "Q", "D", "N", "S", "P", "B"]

  polar = ["Y", "W", "H", "K", "R", "E", "Q", "D", "N", "S", "P", "B"]
  non_polar = ["I", "L", "V", "C", "A", "G", "M", "F", "T"]

  small = ["V","C","A","G","D","N","S","T","P"]
  non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"]

  # 'Yes' when the original residue's membership of `group` equals the new
  # residue's membership of the complementary group, 'No' otherwise.
  property_changed = lambda do |group, complement, from, to|
    group.include?(from) == complement.include?(to) ? 'Yes' : 'No'
  end

  snps_counter = 0
  total_number_of_syn_snps = 0
  total_number_of_non_syn_snps = 0
  total_number_of_pseudo = 0

  snps.each do |snp|
    ActiveRecord::Base.transaction do
      snps_counter += 1
      alleles = snp.alleles

      # The filters below depend on the SNP only, so they are evaluated once
      # per SNP instead of once per allele.
      next if alleles.any? { |candidate| candidate.base.length > 1 } # indel
      # ignore snp if the snp qual is less than cuttoff.
      next if snp.qual < cuttoff_snp.to_i
      # get all genotype qualities for each snp.
      gqs = Genotype.find_by_sql(["select geno_qual from genotypes inner join alleles on alleles.id = genotypes.allele_id inner join snps on snps.id = alleles.snp_id where snps.id = ?", snp.id])
      # ignore snp if any of its genotype qualities is lower than the cuttoff.
      next if gqs.any? { |genotype_quality| genotype_quality.geno_qual < cuttoff_genotype.to_i }

      # get annotation (if there is any) for each SNP
      features = Feature.joins(:snps).where("snps.id = ?", snp.id)
      ref_base = Bio::Sequence.auto(Allele.find(snp.reference_allele_id).base)
      strain_bases = nil # bases of every strain for this SNP, looked up on first use

      alleles.each do |allele|
        next if allele.id == snp.reference_allele_id

        snp_base = Bio::Sequence.auto(allele.base)

        # If the feature is empty then just output basic information about the snp.
        if features.empty?
          outfile.puts "#{snp.ref_pos}\t#{ref_base.upcase}\t#{snp_base.upcase}"
          next
        end

        features.each do |feature|
          next unless feature.name == "CDS"

          # Bases are reported on the strand of the feature
          forward = feature.strand == 1
          shown_ref = forward ? ref_base.upcase : ref_base.reverse_complement.upcase
          shown_snp = forward ? snp_base.upcase : snp_base.reverse_complement.upcase
          line_start = "#{snp.ref_pos}\t#{shown_ref}\t#{shown_snp}"

          annotation = Annotation.where("annotations.qualifier = 'product' and annotations.feature_id = ?", feature.id).first
          if annotation.nil?
            outfile.puts line_start
            next
          end

          feature_sequence = feature.sequence
          feature_sequence_bio = Bio::Sequence::NA.new(feature_sequence)

          # Mutate a copy of the sequence with the SNP, so the sequence held by
          # the feature record stays untouched for the next allele or SNP.
          feature_sequence_mutated = feature_sequence.dup
          feature_sequence_snp_pos = (snp.ref_pos - 1) - (feature.start - 1)
          feature_sequence_mutated[feature_sequence_snp_pos] = allele.base
          feature_sequence_mutated_bio = Bio::Sequence::NA.new(feature_sequence_mutated)

          # Translate the sequences
          if feature.strand == -1
            mutated_seq_translated = feature_sequence_mutated_bio.reverse_complement.translate
            original_seq_translated = feature_sequence_bio.reverse_complement.translate
          else
            mutated_seq_translated = feature_sequence_mutated_bio.translate
            original_seq_translated = feature_sequence_bio.translate
          end

          # Remove the star at the end of each translated sequence.
          mutated_seq_translated_clean = mutated_seq_translated.gsub(/\*$/,"")
          original_seq_translated_clean = original_seq_translated.gsub(/\*$/,"")

          # Get alleles for each strain
          if info
            strain_bases ||= strains.map do |strain|
              Allele.joins(:genotypes => :strain).where("strains.id = ? AND alleles.snp_id = ?", strain.id, snp.id).first.base
            end
          end
          strain_columns = strain_bases.join("\t") if info

          # A stop codon left after trimming the terminal one marks a possible pseudogene
          possible_pseudogene = mutated_seq_translated_clean =~ /\*/
          total_number_of_pseudo += 1 if possible_pseudogene
          pseudogene_column = possible_pseudogene ? "Yes" : "No"

          # If no difference between the amino acids then its synonymous SNP, if different then its non-synonymous.
          if original_seq_translated_clean == mutated_seq_translated_clean
            total_number_of_syn_snps += 1
            outfile.puts "#{line_start}\tsynonymous\t#{annotation.value}\t#{pseudogene_column}\tN/A\tN/A\tN/A\tN/A\tN/A\t#{strain_columns}"
          else
            total_number_of_non_syn_snps += 1
            # A single base substitution changes one codon, so the first residue
            # that differs is the affected amino acid. The two proteins can differ
            # in length by one when the terminal stop codon itself is hit.
            longest = [original_seq_translated_clean.length, mutated_seq_translated_clean.length].max
            changed_at = (0...longest).detect { |i| original_seq_translated_clean[i, 1] != mutated_seq_translated_clean[i, 1] }
            amino_acid_original = original_seq_translated_clean[changed_at, 1].to_s
            amino_acid_change = mutated_seq_translated_clean[changed_at, 1].to_s

            outfile.puts "#{line_start}\tnon-synonymous\t#{annotation.value}\t#{pseudogene_column}\t#{amino_acid_original}\t#{amino_acid_change}\t#{property_changed.call(hydrophobic, non_hydrophobic, amino_acid_original, amino_acid_change)}\t#{property_changed.call(polar, non_polar, amino_acid_original, amino_acid_change)}\t#{property_changed.call(small, non_small, amino_acid_original, amino_acid_change)}\t#{strain_columns}"
          end
        end
      end
    end
    puts "Total SNPs added so far: #{snps_counter}" if snps_counter % 100 == 0
  end

  puts "Total number of snps: #{snps_counter}"
  puts "Total number of synonymous SNPs #{total_number_of_syn_snps}"
  puts "Total number of non-synonymous SNPs #{total_number_of_non_syn_snps}"
  puts "Total number of pseudogenes #{total_number_of_pseudo}"
  outfile.puts "Total number of snps: #{snps_counter}"
  outfile.puts "Total number of synonymous SNPs: #{total_number_of_syn_snps}"
  outfile.puts "Total number of non-synonymous SNPs: #{total_number_of_non_syn_snps}"
  outfile.puts "Total number of possible pseudogenes: #{total_number_of_pseudo}"
end