snp-search 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -2
- data/Gemfile.lock +2 -3
- data/README +0 -105
- data/README.rdoc +35 -29
- data/Rakefile +2 -2
- data/VERSION +1 -1
- data/bin/snp-search +174 -261
- data/lib/create_methods.rb +196 -0
- data/lib/filter_ignore_snps_methods.rb +130 -0
- data/lib/information_methods.rb +117 -0
- data/lib/output_information_methods.rb +131 -0
- data/lib/snp-search.rb +18 -280
- data/lib/snp_db_connection.rb +1 -2
- data/lib/snp_db_models.rb +3 -3
- data/lib/snp_db_schema.rb +119 -80
- data/pkg/snp-search-1.1.0.gem +0 -0
- data/pkg/snp-search-1.2.0.gem +0 -0
- data/pkg/snp-search-2.3.0.gem +0 -0
- data/snp-search.gemspec +15 -12
- metadata +73 -33
- data/.rspec +0 -1
@@ -0,0 +1,196 @@
|
|
1
|
+
|
2
|
+
#This is the method that creates the database and imports the data.
|
3
|
+
|
4
|
+
#Called in lib/snp-search.rb
|
5
|
+
|
6
|
+
# Guesses the format of the reference sequence file from its file extension.
#
# The comparison is case-insensitive.
#
# @param reference_genome [String] path or file name of the reference file
# @return [Symbol, nil] :genbank for .gbk/.genbank/.gb, :embl for .embl/.emb,
#   nil when the extension is not one of those
def guess_sequence_format(reference_genome)
  # `case` is an expression: an unmatched extension falls through to nil.
  case File.extname(reference_genome).downcase
  when ".gbk", ".genbank", ".gb" then :genbank
  when ".embl", ".emb" then :embl
  end
end
|
18
|
+
|
19
|
+
# Fills the features and annotations tables from a parsed GenBank/EMBL entry.
#
# Every feature except 'source' and 'gene' is stored (those two repeat
# information carried by the others; 'CDS' represents the gene). For each
# stored feature the start, end and strand of its FIRST location are saved,
# together with the matching slice of the entry's nucleotide sequence, and
# every qualifier/value pair becomes an Annotation attached to the feature.
#
# @param sequence_file [#features, #seq] parsed entry; features must respond
#   to feature, locations and qualifiers
def populate_features_and_annotations(sequence_file)
  puts "Adding features and their annotations...."
  ActiveRecord::Base.transaction do
    added = 0
    sequence_file.features.each do |feature|
      next if feature.feature == "source" || feature.feature == "gene"

      added += 1
      puts "Total number of features and annotations added: #{added}" if added % 100 == 0

      location = feature.locations.first

      db_feature = Feature.new
      db_feature.name = feature.feature
      db_feature.start = location.from
      db_feature.end = location.to
      db_feature.strand = location.strand
      # Coordinates are 1-based and inclusive; the sequence string is 0-based.
      db_feature.sequence = sequence_file.seq[(location.from - 1)..(location.to - 1)]
      db_feature.save

      # One Annotation row per qualifier of the feature.
      feature.qualifiers.each do |qualifier|
        annotation = Annotation.new
        annotation.qualifier = qualifier.qualifier
        annotation.value = qualifier.value
        annotation.save
        db_feature.annotations << annotation
      end
    end
  end
end
|
51
|
+
|
52
|
+
# Populates strains, SNPs, alleles and genotypes from a VCF file, then links
# every stored SNP to the features whose start..end range contains it.
#
# Lines are skipped until one matching /CHROM/ is found; its columns from the
# tenth onwards name the strains (anything after the first '.' is dropped).
# Each following line is one candidate variant. A variant is stored only when
#   * no sample contains "./." (no coverage),
#   * every sample is homozygous and called,
#   * and, when the FORMAT column has AD, every sample's ratio
#     AD[called allele] / sum(AD) is at least cuttoff_ad.
#
# @param vcf_file [String] path of the VCF file
# @param cuttoff_ad [Numeric, String] minimum allele-depth ratio; converted
#   with to_f because the ratio it is compared against is a fraction
def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
  puts "Adding SNPs........"
  # to_f, not to_i: the AD ratio lies between 0 and 1, so an integer
  # conversion would turn any fractional cutoff into 0 and disable the filter.
  min_ad_ratio = cuttoff_ad.to_f

  File.open(vcf_file) do |f|
    # Skip everything up to the column-heading line.
    while line = f.gets
      next unless line =~ /CHROM/

      column_headings = line.chomp.split("\t")
      strain_names = column_headings[9..-1].map { |name| name.sub(/\..*/, '') }

      strain_names.each do |strain_name|
        new_strain = Strain.new
        new_strain.name = strain_name
        new_strain.save
      end
      # Same order as the sample columns, so strains[i] belongs to sample i.
      strains = strain_names.map { |strain_name| Strain.find_by_name(strain_name) }

      good_snps = 0
      # The remaining lines of the file are variant records.
      while line = f.gets
        details = line.chomp.split("\t")
        # Blank or truncated lines have no FORMAT/sample columns to parse.
        next if details.size < 10

        ref_pos = details[1].to_i
        ref_base = details[3]
        snp_bases = details[4].split(",")
        snp_qual = details[5]
        format = details[8].split(":")
        gt_array_position = format.index("GT")
        gq_array_position = format.index("GQ")
        ad_array_position = format.index("AD")
        samples = details[9..-1]

        next if samples.any? { |sample| sample =~ /\.\/\./ } # no coverage in at least one sample

        gts = []
        gqs = []
        ad_ratios = []
        samples.each do |sample|
          format_values = sample.split(":")
          gt = format_values[gt_array_position].split("/")
          next if gt.size > 1 && (gt.first != gt.last) # heterozygous call (0/1, 1/2, ...)
          next if gt.first == "."                       # no call
          gt = gt.first.to_i

          gq = format_values[gq_array_position].to_f

          if ad_array_position
            # AD holds one read depth per allele (reference first), e.g.
            # "6,9" means 6 reads for REF and 9 for the first ALT.
            ad = format_values[ad_array_position].split(",").map { |ad_value| ad_value.to_i }
            sum_of_ad = ad.inject { |sum, x| sum + x }
            ad_ratios << ad[gt] / sum_of_ad.to_f
          end

          gqs << gq
          gts << gt
        end

        # Drop the variant if any sample's supporting-read ratio is too low.
        next if ad_ratios.any? { |ad_ratio| ad_ratio < min_ad_ratio }
        # A shorter gts means at least one sample was rejected above.
        next unless gts.size == samples.size

        good_snps += 1

        ActiveRecord::Base.transaction do
          s = Snp.new
          s.ref_pos = ref_pos
          s.qual = snp_qual
          s.save

          ref_allele = Allele.new
          ref_allele.base = ref_base
          ref_allele.snp = s
          ref_allele.save

          s.reference_allele = ref_allele
          s.save

          # Key alleles by genotype index (0 = reference, n = n-th ALT) so a
          # genotype always resolves to its own allele, whatever order or
          # subset of ALT indices occurs among the samples.
          alleles_by_gt = { 0 => ref_allele }
          gts.uniq.select { |gt| gt > 0 }.each do |gt|
            snp_allele = Allele.new
            snp_allele.base = snp_bases[gt - 1]
            snp_allele.snp = s
            snp_allele.save
            alleles_by_gt[gt] = snp_allele
          end

          genos = []
          gts.each_with_index do |gt, index|
            genotype = Genotype.new
            genotype.strain = strains[index]
            genotype.geno_qual = gqs[index]
            genotype.allele = alleles_by_gt[gt]
            genos << genotype
          end

          # Using activerecord-import to speed up importing
          Genotype.import genos, :validate => false
          puts "Total SNPs added so far: #{good_snps}" if good_snps % 100 == 0
        end
      end
    end
  end

  # Link each SNP to every feature whose coordinates span its position.
  puts "Linking features to SNPs"
  ActiveRecord::Base.transaction do
    Snp.all.each_with_index do |snp, index|
      puts "Total SNPs linked to features added so far: #{index}" if index % 100 == 0
      features = Feature.where("features.start <= ? AND features.end >= ?", snp.ref_pos, snp.ref_pos)
      features.each do |feature|
        snp.features << feature
      end
    end
  end
end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
|
2
|
+
# require "/Volumes/NGS2_DataRAID/projects/ali/GAS/snp-search-2.0.0/lib/information_methods.rb"
|
3
|
+
# This method performs several queries to ignore elements of the data for fasta or tabular output.
|
4
|
+
# It is called in lib/snp-search.rb
|
5
|
+
|
6
|
+
require 'output_information_methods'
|
7
|
+
|
8
|
+
# Selects the SNPs that pass the user's filters and writes the result.
#
# SNPs are dropped when they fall in an ignored annotation or position range,
# when their quality is below cuttoff_snp, when any allele is longer than one
# base (indel), or when any genotype quality is below cuttoff_genotype.
# With tabular_output the surviving SNPs are handed to
# output_information_methods; otherwise, with fasta_output, a FASTA alignment
# is written to `out`, a tab-delimited table to "<basename>_snps.tsv" and an
# index => reference-position map to "<basename>_snps_positions.txt" (both
# side files are created in the current directory). With `tree`, FastTree is
# then run on `out` and its output redirected to "<basename>.nwk".
#
# @param out [String] path of the main output file
# @param ignore_snps_on_annotation [String, nil] comma-separated terms; SNPs
#   in features having an annotation value containing a term are dropped
# @param ignore_snps_in_range [String, nil] comma-separated "start..end"
#   reference-position ranges to drop
# @param ignore_strains [String, nil] comma-separated strain names to omit
# @param remove_non_informative_snps [Boolean] drop SNPs at which all kept
#   strains carry the same base
# @param fasta_output [Boolean] write FASTA (ignored when tabular_output)
# @param tabular_output [Boolean] write the tabular report
# @param cuttoff_genotype [Numeric, String] minimum genotype quality
# @param cuttoff_snp [Numeric, String] minimum SNP quality
# @param tree [Boolean] run FastTree afterwards
# @param fasttree_path [String] FastTree executable
# @return [nil]
def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cuttoff_genotype, cuttoff_snp, tree, fasttree_path)
  all_strains = Strain.all.to_a

  sequence_hash = { "ref" => [] }
  all_strains.each { |strain| sequence_hash[strain.name] = [] }

  snps_array = []
  snp_positions = []

  base_name = File.basename(out, File.extname(out))
  min_genotype_quality = cuttoff_genotype.to_f
  min_snp_quality = cuttoff_snp.to_i

  output = tab_delim_file = position_map_file = nil
  begin
    output = File.open(out, "w")
    tab_delim_file = File.open(base_name + "_snps.tsv", "w")
    position_map_file = File.open(base_name + "_snps_positions.txt", "w")

    # Exclude SNPs that lie in features matching any of the annotation terms.
    snps = Snp.all
    if ignore_snps_on_annotation
      terms = ignore_snps_on_annotation.split(",")
      unless terms.empty?
        # Bind the user-supplied terms instead of interpolating them into SQL.
        like_clause = (["annotations.value LIKE ?"] * terms.size).join(" OR ")
        like_values = terms.map { |term| "%#{term}%" }
        features_with_annotation = Feature.joins(:annotations).where(like_clause, *like_values)
        snps_to_exclude = Snp.joins(:features).where("features.id IN (?)", features_with_annotation.collect { |feature| feature.id })
        excluded_ids = snps_to_exclude.collect { |snp| snp.id }
        snps = Snp.where("snps.id NOT IN (?)", excluded_ids) unless excluded_ids.empty?
      end
    end

    # Keep ignored regions as ranges; expanding them to arrays is wasteful
    # for genome-sized intervals.
    ranges_to_ignore = []
    if ignore_snps_in_range
      ranges_to_ignore = ignore_snps_in_range.split(",").map do |range|
        start_position, end_position = range.split("..")
        (start_position.to_i..end_position.to_i)
      end
    end

    puts "Your Query is submitted and is being processed......."

    strains = all_strains
    if ignore_strains
      strains_to_ignore = ignore_strains.split(",")
      strains = all_strains.reject { |strain| strains_to_ignore.include?(strain.name) }
    end

    snps.each do |snp|
      ActiveRecord::Base.transaction do
        next if ranges_to_ignore.any? { |range| range.include?(snp.ref_pos) } # user-specified positions

        alleles = snp.alleles
        genotypes = alleles.collect { |allele| allele.genotypes }.flatten

        next if snp.qual < min_snp_quality                      # SNP quality below cutoff
        next if alleles.any? { |allele| allele.base.length > 1 } # indel
        next unless genotypes.all? { |genotype| genotype.geno_qual >= min_genotype_quality }

        # Base carried by each kept strain at this SNP.
        strain_alleles = {}
        strains.each do |strain|
          strain_genotype = genotypes.detect { |genotype| genotype.strain_id == strain.id }
          strain_allele = alleles.select { |allele| allele.id == strain_genotype.allele_id }.first
          strain_alleles[strain.name] = strain_allele.base
        end

        next if remove_non_informative_snps && strain_alleles.values.uniq.size == 1

        snp_positions << snp.ref_pos
        snps_array << snp
        strain_alleles.each do |strain_name, allele_base|
          sequence_hash[strain_name] << allele_base
        end
        sequence_hash["ref"] << snp.reference_allele.base
      end
    end

    if tabular_output
      output_information_methods(snps_array, output, cuttoff_genotype, cuttoff_snp, true)
    elsif fasta_output
      # FASTA alignment plus the matching tab-delimited table.
      output.puts ">ref\n#{sequence_hash["ref"].join("")}"
      tab_delim_file.puts "\t#{snp_positions.join("\t")}"
      tab_delim_file.puts "ref\t#{sequence_hash["ref"].join("\t")}"
      strains.each do |strain|
        output.puts ">#{strain.name}\n#{sequence_hash[strain.name].join("")}"
        tab_delim_file.puts "#{strain.name}\t#{sequence_hash[strain.name].join("\t")}"
      end

      # Alignment column (1-based) => position in the reference.
      snp_positions.each_with_index do |snp_position, index|
        position_map_file.puts "#{index+1} => #{snp_position}"
      end
    end

    if tree
      nwk_out_file_name = base_name + ".nwk"
      puts "running phylogeny"
      # Close first so everything written above is on disk before FastTree
      # reads it, and pass the PATH (`out`), not the File object.
      output.close
      # NOTE(review): the command goes through the shell unquoted, so paths
      # containing spaces or shell metacharacters will break it.
      `#{fasttree_path} -fastest -nt #{out} > #{nwk_out_file_name}`
    end
  ensure
    [output, tab_delim_file, position_map_file].each do |file|
      file.close if file && !file.closed?
    end
  end
  nil
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
|
2
|
+
# Writes a tab-separated report with one row per (SNP, allele, feature)
# combination, stating whether substituting the allele into the feature's
# sequence changes the translated protein (synonymous / non-synonymous),
# whether the mutated translation contains an internal stop (possible
# pseudogene) and, for non-synonymous changes, the first differing amino
# acids and "Yes" flags for hydrophobicity, polarity and size.
# Non-synonymous rows are also echoed to standard output.
#
# NOTE(review): `out`, `genome_sequence`, `all_seqs_original` and `variant`
# are read here but never assigned in this file. They are left as written;
# they must be provided by the surrounding context before this method can
# run -- confirm where they are meant to come from (`variant` is presumably
# the current feature, and `all_seqs_original` the unmodified genome
# sequence -- TODO confirm).
def information()
  # Amino acid property groups (one-letter codes).
  hydrophobic = ["I", "L", "V", "C", "A", "G", "M", "F", "Y", "W", "H", "T"]
  non_hydrophobic = ["K", "E", "Q", "D", "N", "S", "P", "B"]

  polar = ["Y", "W", "H", "K", "R", "E", "Q", "D", "N", "S", "P", "B"]
  non_polar = ["I", "L", "V", "C", "A", "G", "M", "F", "T"]

  small = ["V","C","A","G","D","N","S","T","P"]
  non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"]

  strains = Strain.all

  output = File.open("#{out}", "w")
  begin
    output.puts "start_cds_in_ref\tend_cds_in_ref\tpos_of_SNP_in_ref\tSNP_base\tsynonymous or non-synonymous\tpossible_pseudogene?\tamino_acid_original\tamino_acid_change\tchange_in_hydrophobicity_of_AA?\tchange_in_polarisation_of_AA?\tchange_in_size_of_AA?\t#{strains.map{|strain| strain.name}.join("\t")}"

    snp_info = 0

    snps = Snp.find_by_sql("SELECT snps.* from snps inner join alleles on snps.id = alleles.snp_id")

    snps.each do |snp|
      snp.alleles.each do |allele|
        features = Feature.joins(:snps).where("snps.id = ?", snp.id)

        features.each do |feature|
          # The whole genome sequence is used because ref_pos is a genome
          # coordinate. Work on a copy so one substitution does not leak
          # into the sequence used for the next SNP.
          all_seqs_mutated = genome_sequence.seq.dup
          all_seqs_mutated[snp.ref_pos.to_i-1] = allele.base

          mutated_seq = Bio::Sequence.auto(all_seqs_mutated[feature.start-1..feature.end-1])
          original_seq = Bio::Sequence.auto(all_seqs_original[feature.start-1..feature.end-1])

          # Features on the negative strand are reverse-complemented first.
          if feature.strand == -1
            mutated_translated = mutated_seq.reverse_complement.translate
            original_translated = original_seq.reverse_complement.translate
          else
            mutated_translated = mutated_seq.translate
            original_translated = original_seq.translate
          end

          # Remove the terminal stop (*) so only internal stops remain.
          mutated_seq_translated_clean = mutated_translated.gsub(/\*$/,"")
          original_seq_translated_clean = original_translated.gsub(/\*$/,"")

          # Base carried by each strain at this SNP, in strain order.
          alleles_array = []
          strains.each do |strain|
            strain_allele = Allele.joins(:genotypes => :strain).where("strains.id = ? AND alleles.snp_id = ?", strain.id, snp.id).first
            alleles_array << strain_allele.base
          end

          snp_info +=1

          possible_pseudogene = (mutated_seq_translated_clean =~ /\*/) ? "Yes" : ""
          leading_columns = [variant.start, variant.end, snp.ref_pos, Allele.find(snp.reference_allele_id).base]

          if original_seq_translated_clean == mutated_seq_translated_clean
            row = leading_columns + ["synonymous", possible_pseudogene, "", "", "", "", "", alleles_array.join("\t")]
            output.puts row.join("\t")
          else
            diffs = Diff::LCS.diff(original_seq_translated_clean, mutated_seq_translated_clean)
            aa_original = diffs[0][0].element
            aa_changed = diffs[0][1].element

            # nil entries become empty columns when joined.
            row = leading_columns + [
              "non-synonymous",
              possible_pseudogene,
              aa_original,
              aa_changed,
              ('Yes' if hydrophobic.include?(aa_original) == non_hydrophobic.include?(aa_changed)),
              ('Yes' if polar.include?(aa_original) == non_polar.include?(aa_changed)),
              ('Yes' if small.include?(aa_original) == non_small.include?(aa_changed)),
              alleles_array.join("\t")
            ]
            line = row.join("\t")
            output.puts line
            puts line
          end

          puts "Total SNPs outputted so far: #{snp_info}" if snp_info % 50 == 0
        end
      end
    end
  ensure
    output.close
  end
end
|
102
|
+
|
103
|
+
#Take all SNP positions in ref genome
|
104
|
+
# snp_positions = Feature.find_by_sql("select snps.ref_pos from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|snp| snp.ref_pos}
|
105
|
+
|
106
|
+
# # Take all SNP nucleotide
|
107
|
+
# snps = Feature.find_by_sql("select alleles.base from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|allele| allele.base}
|
108
|
+
|
109
|
+
# # Mutate (substitute) the original sequence with the SNPs
|
110
|
+
|
111
|
+
# # Here all_seqs_original are all the nucelotide sequences but with the snps subsituted in them
|
112
|
+
|
113
|
+
# #Get start position of CDS with SNP
|
114
|
+
# coordinates_start = Feature.find_by_sql("select start from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where features.name = 'CDS' and alleles.id <> snps.reference_allele_id").map{|feature| feature.start}
|
115
|
+
|
116
|
+
# #Get end position of CDS with SNP
|
117
|
+
# coordinates_end = Feature.find_by_sql("select end from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where features.name = 'CDS' and alleles.id <> snps.reference_allele_id").map{|feature| feature.end}
|
@@ -0,0 +1,131 @@
|
|
1
|
+
|
2
|
+
# Writes the tabular SNP report to an already-open output stream.
#
# For every non-reference allele of every SNP (indels and SNPs failing the
# quality cutoffs are skipped) one row is written:
#   * SNP outside any feature: position, reference base, SNP base;
#   * SNP in a CDS without a 'product' annotation: the same three columns,
#     strand-corrected;
#   * SNP in an annotated CDS: additionally synonymous/non-synonymous, the
#     product, whether the mutated protein has an internal stop (possible
#     pseudogene), the first differing amino acids and whether
#     hydrophobicity, polarity and size change, followed by each strain's
#     base when `info` is true.
# Features other than CDS produce no row. Totals are printed to standard
# output and appended to the stream at the end.
#
# @param snps [Enumerable] Snp records to report
# @param outfile [IO] stream the report is written to (not closed here)
# @param cuttoff_genotype [Numeric, String] minimum genotype quality (to_i)
# @param cuttoff_snp [Numeric, String] minimum SNP quality (to_i)
# @param info [Boolean] include the per-strain base columns
def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, info = true)
  # Amino acid property groups (one-letter codes).
  hydrophobic = ["I", "L", "V", "C", "A", "G", "M", "F", "Y", "W", "H", "T"]
  non_hydrophobic = ["K", "E", "Q", "D", "N", "S", "P", "B"]

  polar = ["Y", "W", "H", "K", "R", "E", "Q", "D", "N", "S", "P", "B"]
  non_polar = ["I", "L", "V", "C", "A", "G", "M", "F", "T"]

  small = ["V","C","A","G","D","N","S","T","P"]
  non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"]

  # 'Yes' when membership of the original residue in `group` equals
  # membership of the new residue in `complement`, otherwise 'No'.
  property_changed = lambda do |group, complement, aa_from, aa_to|
    group.include?(aa_from) == complement.include?(aa_to) ? 'Yes' : 'No'
  end

  strains = Strain.all

  outfile.puts "pos_of_SNP_in_ref\tref_base\tSNP_base\tsynonymous or non-synonymous\tGene_annotation\tpossible_pseudogene?\tamino_acid_original\tamino_acid_change\tchange_in_hydrophobicity_of_AA?\tchange_in_polarisation_of_AA?\tchange_in_size_of_AA?\t#{strains.map{|strain| strain.name}.join("\t") if info}"

  snps_counter = 0
  total_number_of_syn_snps = 0
  total_number_of_non_syn_snps = 0
  total_number_of_pseudo = 0

  snps.each do |snp|
    ActiveRecord::Base.transaction do
      snps_counter +=1
      snp_alleles = snp.alleles
      has_indel = snp_alleles.any? { |candidate| candidate.base.length > 1 }

      snp_alleles.each do |allele|
        next if has_indel                                # indel
        next if allele.id == snp.reference_allele_id     # only report variant alleles

        # Features (if any) that contain this SNP.
        features = Feature.joins(:snps).where("snps.id = ?", snp.id)

        # Ignore the SNP if its quality is below the cutoff.
        snp_qual = Snp.find_by_sql(["select qual from snps where snps.id = ?", snp.id])
        next if snp_qual.any? { |snps_quality| snps_quality.qual < cuttoff_snp.to_i }

        # Ignore the SNP if any of its genotype qualities is below the cutoff.
        gqs = Genotype.find_by_sql(["select geno_qual from genotypes inner join alleles on alleles.id = genotypes.allele_id inner join snps on snps.id = alleles.snp_id where snps.id = ?", snp.id])
        next if gqs.any? { |genotype_quality| genotype_quality.geno_qual < cuttoff_genotype.to_i }

        ref_base = Bio::Sequence.auto(Allele.find(snp.reference_allele_id).base)
        snp_base = Bio::Sequence.auto(allele.base)

        # No feature: there is no strand to correct for, print the bases as is.
        if features.empty?
          outfile.puts "#{snp.ref_pos}\t#{ref_base.upcase}\t#{snp_base.upcase}"
          next
        end

        features.each do |feature|
          next unless feature.name == "CDS"

          # Report bases as read on the feature's own strand.
          if feature.strand == 1
            ref_column = "#{ref_base.upcase}"
            snp_column = "#{snp_base.upcase}"
          else
            ref_column = "#{ref_base.reverse_complement.upcase}"
            snp_column = "#{snp_base.reverse_complement.upcase}"
          end
          leading_columns = [snp.ref_pos, ref_column, snp_column]

          annotation = Annotation.where("annotations.qualifier = 'product' and annotations.feature_id = ?", feature.id).first
          if annotation.nil?
            outfile.puts leading_columns.join("\t")
            next
          end

          feature_sequence_bio = Bio::Sequence::NA.new(feature.sequence)

          # Substitute the SNP into a COPY of the feature sequence so the
          # record's own sequence string is left untouched.
          feature_sequence_mutated = feature.sequence.dup
          feature_sequence_snp_pos = (snp.ref_pos-1) - (feature.start-1)
          feature_sequence_mutated[feature_sequence_snp_pos] = allele.base
          feature_sequence_mutated_bio = Bio::Sequence::NA.new(feature_sequence_mutated)

          # Translate; negative-strand features are reverse-complemented first.
          if feature.strand == -1
            mutated_seq_translated = feature_sequence_mutated_bio.reverse_complement.translate
            original_seq_translated = feature_sequence_bio.reverse_complement.translate
          else
            mutated_seq_translated = feature_sequence_mutated_bio.translate
            original_seq_translated = feature_sequence_bio.translate
          end

          # Remove the terminal stop (*) so only internal stops remain.
          mutated_seq_translated_clean = mutated_seq_translated.gsub(/\*$/,"")
          original_seq_translated_clean = original_seq_translated.gsub(/\*$/,"")

          # Base carried by each strain at this SNP, in strain order.
          alleles_array = []
          strains.each do |strain|
            allele_for_strains = Allele.joins(:genotypes => :strain).where("strains.id = ? AND alleles.snp_id = ?", strain.id, snp.id).first
            alleles_array << allele_for_strains.base
          end
          strain_columns = info ? alleles_array.join("\t") : ""

          # An internal stop in the mutated protein marks a possible pseudogene.
          possible_pseudogene = !(mutated_seq_translated_clean =~ /\*/).nil?
          total_number_of_pseudo +=1 if possible_pseudogene
          pseudogene_column = possible_pseudogene ? "Yes" : "No"

          if original_seq_translated_clean == mutated_seq_translated_clean
            # Identical proteins: synonymous.
            total_number_of_syn_snps +=1
            detail_columns = ["synonymous", annotation.value, pseudogene_column, "N/A", "N/A", "N/A", "N/A", "N/A"]
          else
            # Proteins differ: non-synonymous; report the first difference.
            total_number_of_non_syn_snps +=1
            diffs = Diff::LCS.diff(original_seq_translated_clean, mutated_seq_translated_clean)
            aa_original = diffs[0][0].element
            aa_changed = diffs[0][1].element
            detail_columns = [
              "non-synonymous",
              annotation.value,
              pseudogene_column,
              aa_original,
              aa_changed,
              property_changed.call(hydrophobic, non_hydrophobic, aa_original, aa_changed),
              property_changed.call(polar, non_polar, aa_original, aa_changed),
              property_changed.call(small, non_small, aa_original, aa_changed)
            ]
          end

          outfile.puts((leading_columns + detail_columns + [strain_columns]).join("\t"))
        end
      end
      puts "Total SNPs added so far: #{snps_counter}" if snps_counter % 100 == 0
    end
  end

  puts "Total number of snps: #{snps_counter}"
  puts "Total number of synonymous SNPs #{total_number_of_syn_snps}"
  puts "Total number of non-synonymous SNPs #{total_number_of_non_syn_snps}"
  puts "Total number of pseudogenes #{total_number_of_pseudo}"
  outfile.puts "Total number of snps: #{snps_counter}"
  outfile.puts "Total number of synonymous SNPs: #{total_number_of_syn_snps}"
  outfile.puts "Total number of non-synonymous SNPs: #{total_number_of_non_syn_snps}"
  outfile.puts "Total number of possible pseudogenes: #{total_number_of_pseudo}"
end
|