snp-search 2.10.8 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manual/snp-search_user_manual.pdf +0 -0
- data/README.rdoc +3 -3
- data/VERSION +1 -1
- data/bin/snp-search +23 -20
- data/lib/create_methods.rb +53 -23
- data/lib/filter_ignore_snps_methods.rb +7 -6
- data/lib/output_information_methods.rb +18 -15
- data/lib/snp-search.rb +7 -6
- data/snp-search.gemspec +6 -3
- data/test_data/Ecoli_test_set.vcf +1851 -0
- data/test_data/Reference_file_Ecoli_test_set.gbk +4122 -0
- metadata +6 -3
|
Binary file
|
data/README.rdoc
CHANGED
|
@@ -106,13 +106,13 @@ Alternatively, you may download a SQL tool to view your database (e.g. SQLite so
|
|
|
106
106
|
== Contact
|
|
107
107
|
|
|
108
108
|
If you have any comments, questions or suggestions, please email
|
|
109
|
-
ali.al-shahib@
|
|
109
|
+
ali.al-shahib@phe.gov.uk
|
|
110
110
|
or
|
|
111
|
-
anthony.underwood@
|
|
111
|
+
anthony.underwood@phe.gov.uk
|
|
112
112
|
|
|
113
113
|
Have fun snp-searching!
|
|
114
114
|
|
|
115
115
|
== Copyright
|
|
116
116
|
|
|
117
117
|
Copyright (c) 2012 Ali Al-Shahib. See LICENSE.txt for
|
|
118
|
-
further details.
|
|
118
|
+
further details.
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
2.10.8
|
|
1
|
+
2.11.0
|
data/bin/snp-search
CHANGED
|
@@ -3,6 +3,8 @@ require 'snp_db_connection'
|
|
|
3
3
|
require 'snp_db_models'
|
|
4
4
|
require 'snp_db_schema'
|
|
5
5
|
require 'output_information_methods'
|
|
6
|
+
require 'create_methods'
|
|
7
|
+
require 'filter_ignore_snps_methods.rb'
|
|
6
8
|
require 'activerecord-import'
|
|
7
9
|
require 'slop'
|
|
8
10
|
|
|
@@ -24,7 +26,7 @@ opts = Slop.parse do
|
|
|
24
26
|
on :r, :reference_file=, 'Reference genome file, in gbk or embl file format, Required', true
|
|
25
27
|
on :v, :vcf_file=, 'variant call format (vcf) file, Required', true
|
|
26
28
|
on :d, :name_of_database=, 'Name of database, Required'
|
|
27
|
-
on :A, :
|
|
29
|
+
on :A, :cutoff_ad=, 'AD ratio cutoff (default 0.9)', :as => :int, :default => 0.9
|
|
28
30
|
|
|
29
31
|
separator ''
|
|
30
32
|
|
|
@@ -33,8 +35,8 @@ opts = Slop.parse do
|
|
|
33
35
|
on :f, :all_or_filtered_snps, 'ignore SNPs from specified features in the database (if you do not want to ignore any SNPs, just use this option with -F/T -o)'
|
|
34
36
|
on :F, :fasta, 'output fasta file format (default)'
|
|
35
37
|
on :T, :tabular, 'output tabular file format'
|
|
36
|
-
on :c, :
|
|
37
|
-
on :g, :
|
|
38
|
+
on :c, :cutoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
|
|
39
|
+
on :g, :cutoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
|
|
38
40
|
on :R, :remove_non_informative_snps, 'Only output informative SNPs.'
|
|
39
41
|
on :e, :ignore_snps_in_range=, 'A list of position ranges to ignore e.g 10..500,2000..2500.'
|
|
40
42
|
on :a, :ignore_strains=, 'A list of strains to ignore (seperate by comma e.g. S1,S4,S8 ).'
|
|
@@ -47,8 +49,8 @@ opts = Slop.parse do
|
|
|
47
49
|
separator '-output -unique_snps -d db.sqlite3 -s strains.txt -o unique_snps.txt [options]'
|
|
48
50
|
separator 'e.g. snp-search -O -u -d ecoli.sqlite3 -s strains_list_for_unique_snps.txt -o ecoli_unique_snps_strains.txt'
|
|
49
51
|
on :u, :unique_snps, 'Query for unique snps in the database'
|
|
50
|
-
on :c, :
|
|
51
|
-
on :g, :
|
|
52
|
+
on :c, :cutoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
|
|
53
|
+
on :g, :cutoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
|
|
52
54
|
on :s, :strain=, 'The strains/samples you like to query (only used with -unique_snps flag)'
|
|
53
55
|
on :o, :out=, 'Name of output file, Required'
|
|
54
56
|
|
|
@@ -57,8 +59,8 @@ opts = Slop.parse do
|
|
|
57
59
|
separator '-output -info -d db.sqlite3 -o info.txt [options]'
|
|
58
60
|
separator ''
|
|
59
61
|
on :i, :info, 'Output various information about SNPs'
|
|
60
|
-
on :c, :
|
|
61
|
-
on :g, :
|
|
62
|
+
on :c, :cutoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
|
|
63
|
+
on :g, :cutoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
|
|
62
64
|
on :o, :out=, 'Name of output file, Required'
|
|
63
65
|
end
|
|
64
66
|
|
|
@@ -68,7 +70,7 @@ end
|
|
|
68
70
|
if opts[:create]
|
|
69
71
|
|
|
70
72
|
# raise "Please provide a database file name" if opts[:reference_file].empty?
|
|
71
|
-
# puts opts[:
|
|
73
|
+
# puts opts[:cutoff_snp_qual].to_i
|
|
72
74
|
|
|
73
75
|
error_msg = ""
|
|
74
76
|
|
|
@@ -122,7 +124,7 @@ if opts[:create]
|
|
|
122
124
|
|
|
123
125
|
#The populate_snps_alleles_genotypes method populates the snps, alleles and genotypes. It uses the vcf file, and if specified, the SNP quality cutoff and genotype quality cutoff
|
|
124
126
|
|
|
125
|
-
populate_snps_alleles_genotypes(opts[:vcf_file], opts[:
|
|
127
|
+
populate_snps_alleles_genotypes(opts[:vcf_file], opts[:cutoff_ad])
|
|
126
128
|
|
|
127
129
|
###########################################################
|
|
128
130
|
|
|
@@ -143,6 +145,7 @@ elsif opts[:output]
|
|
|
143
145
|
|
|
144
146
|
error_msg = ""
|
|
145
147
|
|
|
148
|
+
|
|
146
149
|
error_msg += "-d: \t Name of your database\n" unless opts[:name_of_database]
|
|
147
150
|
error_msg += "-o: \t name of your output file\n" unless opts[:out]
|
|
148
151
|
error_msg += "-F: \t Fasta output OR\n-T: \t Tabular output" unless opts[:fasta] || opts[:tabular]
|
|
@@ -152,8 +155,8 @@ elsif opts[:output]
|
|
|
152
155
|
error_msg_optional += "-I,\t --ignore_snps_on_annotation: The name of the feature(s) to ignore. Features should be seperated by comma (e.g. phages,inserstion,transposons)\n" unless opts[:ignore_snps_on_annotation]
|
|
153
156
|
error_msg_optional += "-a,\t --ignore_strains: A list of strains to ignore\n" unless opts[:ignore_strains]
|
|
154
157
|
error_msg_optional += "-e,\t --ignore_snps_in_range: A list of position ranges to ignore e.g 10..500,2000..2500\n" unless opts[:ignore_snps_in_range]
|
|
155
|
-
error_msg_optional += "-c,\t --
|
|
156
|
-
error_msg_optional += "-g,\t --
|
|
158
|
+
error_msg_optional += "-c,\t --cutoff_snp_qual: cutoff for SNP Quality\n" unless opts[:cutoff_snp_qual]
|
|
159
|
+
error_msg_optional += "-g,\t --cutoff_genotype: cutoff for Genotype Quality\n" unless opts[:cutoff_genotype]
|
|
157
160
|
error_msg_optional += "-R,\t --remove_non_informative_snps: Only output informative SNPs\n" unless opts[:remove_non_informative_snps]
|
|
158
161
|
error_msg_optional += "-t,\t --tree: Construct tree from output\n" unless opts[:tree]
|
|
159
162
|
|
|
@@ -163,8 +166,8 @@ elsif opts[:output]
|
|
|
163
166
|
puts "Optional fields:"
|
|
164
167
|
puts error_msg_optional
|
|
165
168
|
# Added this here as it wont appear here in error_msg_optional as its set as default.
|
|
166
|
-
puts "-c,\t --
|
|
167
|
-
puts "-g,\t --
|
|
169
|
+
puts "-c,\t --cutoff_snp_qual: cutoff for SNP Quality (default 90)\n"
|
|
170
|
+
puts "-g,\t --cutoff_genotype: cutoff for Genotype Quality (default 30)\n"
|
|
168
171
|
# puts opts.help unless opts.empty?
|
|
169
172
|
exit
|
|
170
173
|
end
|
|
@@ -173,7 +176,7 @@ elsif opts[:output]
|
|
|
173
176
|
|
|
174
177
|
establish_connection(opts[:name_of_database])
|
|
175
178
|
|
|
176
|
-
get_snps(opts[:out], opts[:ignore_snps_on_annotation], opts[:ignore_snps_in_range], opts[:ignore_strains], opts[:remove_non_informative_snps], opts[:fasta], opts[:tabular], opts[:
|
|
179
|
+
get_snps(opts[:out], opts[:ignore_snps_on_annotation], opts[:ignore_snps_in_range], opts[:ignore_strains], opts[:remove_non_informative_snps], opts[:fasta], opts[:tabular], opts[:cutoff_genotype], opts[:cutoff_snp_qual], opts[:tree], opts[:fasttree_path])
|
|
177
180
|
end
|
|
178
181
|
|
|
179
182
|
####################################################################################################
|
|
@@ -191,8 +194,8 @@ elsif opts[:output]
|
|
|
191
194
|
puts error_msg
|
|
192
195
|
puts "Optional fields:"
|
|
193
196
|
# Added this here as it wont appear here in error_msg_optional as its set as default.
|
|
194
|
-
puts "-c,\t --
|
|
195
|
-
puts "-g,\t --
|
|
197
|
+
puts "-c,\t --cutoff_snp_qual: cutoff for SNP Quality (default 90)\n"
|
|
198
|
+
puts "-g,\t --cutoff_genotype: cutoff for Genotype Quality (default 30)\n"
|
|
196
199
|
# puts opts.help unless opts.empty?
|
|
197
200
|
exit
|
|
198
201
|
end
|
|
@@ -207,7 +210,7 @@ elsif opts[:output]
|
|
|
207
210
|
strains << line.chop
|
|
208
211
|
end
|
|
209
212
|
# find_unique_snps defined in bin/snp-search.rb
|
|
210
|
-
find_unqiue_snps(strains, opts[:out], opts[:
|
|
213
|
+
find_unqiue_snps(strains, opts[:out], opts[:cutoff_genotype], opts[:cutoff_snp_qual])
|
|
211
214
|
end
|
|
212
215
|
|
|
213
216
|
##############################################################
|
|
@@ -223,8 +226,8 @@ elsif opts[:output]
|
|
|
223
226
|
puts error_msg
|
|
224
227
|
puts "Optional fields:"
|
|
225
228
|
# Added this here as it wont appear here in error_msg_optional as its set as default.
|
|
226
|
-
puts "-c,\t --
|
|
227
|
-
puts "-g,\t --
|
|
229
|
+
puts "-c,\t --cutoff_snp_qual: cutoff for SNP Quality (default 90)\n"
|
|
230
|
+
puts "-g,\t --cutoff_genotype: cutoff for Genotype Quality (default 30)\n"
|
|
228
231
|
# puts opts.help unless opts.empty?
|
|
229
232
|
exit
|
|
230
233
|
end
|
|
@@ -234,7 +237,7 @@ elsif opts[:output]
|
|
|
234
237
|
establish_connection(opts[:name_of_database])
|
|
235
238
|
|
|
236
239
|
#information defined in bin/snp-search.rb
|
|
237
|
-
information(opts[:out], opts[:
|
|
240
|
+
information(opts[:out], opts[:cutoff_genotype], opts[:cutoff_snp_qual])
|
|
238
241
|
|
|
239
242
|
end
|
|
240
243
|
|
data/lib/create_methods.rb
CHANGED
|
@@ -50,9 +50,10 @@ def populate_features_and_annotations(sequence_file)
|
|
|
50
50
|
end
|
|
51
51
|
|
|
52
52
|
#This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
|
|
53
|
-
def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
|
|
53
|
+
def populate_snps_alleles_genotypes(vcf_file, cutoff_ad)
|
|
54
54
|
|
|
55
55
|
puts "Adding SNPs........"
|
|
56
|
+
|
|
56
57
|
# open vcf file and parse each line
|
|
57
58
|
File.open(vcf_file) do |f|
|
|
58
59
|
# header names
|
|
@@ -60,20 +61,23 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
|
|
|
60
61
|
if line =~ /CHROM/
|
|
61
62
|
line.chomp!
|
|
62
63
|
column_headings = line.split("\t")
|
|
63
|
-
|
|
64
|
-
|
|
64
|
+
potential_strain_names = column_headings[9..-1]
|
|
65
|
+
potential_strain_names.map!{|name| name.sub(/\..*/, '')}
|
|
66
|
+
# strain_names = column_headings[9..-1]
|
|
67
|
+
# strain_names.map!{|name| name.sub(/\..*/, '')}
|
|
68
|
+
strains = Array.new
|
|
65
69
|
|
|
66
|
-
strain_names.each do |str|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
end
|
|
70
|
+
# strain_names.each do |str|
|
|
71
|
+
# ss = Strain.new
|
|
72
|
+
# ss.name = str
|
|
73
|
+
# ss.save
|
|
74
|
+
# end
|
|
71
75
|
|
|
72
|
-
strains = Array.new
|
|
73
|
-
strain_names.each do |strain_name|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
end
|
|
76
|
+
# strains = Array.new
|
|
77
|
+
# strain_names.each do |strain_name|
|
|
78
|
+
# strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
|
|
79
|
+
# strains << strain
|
|
80
|
+
# end
|
|
77
81
|
|
|
78
82
|
good_snps = 0
|
|
79
83
|
# start parsing snps
|
|
@@ -86,28 +90,53 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
|
|
|
86
90
|
ref_base = details[3]
|
|
87
91
|
snp_bases = details[4].split(",")
|
|
88
92
|
snp_qual = details [5]
|
|
93
|
+
number_of_colons_in_format = details[8].count ":"
|
|
89
94
|
format = details[8].split(":")
|
|
95
|
+
|
|
90
96
|
gt_array_position = format.index("GT")
|
|
91
97
|
gq_array_position = format.index("GQ")
|
|
92
98
|
ad_array_position = format.index("AD")
|
|
93
99
|
# dp = format.index("DP")
|
|
100
|
+
# columns_after_format = details[9..-1]
|
|
101
|
+
# samples = columns_after_format.select{|column_after_format| column_after_format.count(":") == number_of_colons_in_format}
|
|
102
|
+
# puts samples
|
|
94
103
|
samples = details[9..-1]
|
|
104
|
+
if strains.empty?
|
|
105
|
+
samples.zip(potential_strain_names).each do |column_after_format, potential_strain_name|
|
|
106
|
+
if column_after_format.count(":") == number_of_colons_in_format
|
|
107
|
+
ss = Strain.new
|
|
108
|
+
ss.name = potential_strain_name
|
|
109
|
+
ss.save
|
|
110
|
+
strains << ss
|
|
111
|
+
end
|
|
112
|
+
end
|
|
113
|
+
end
|
|
95
114
|
|
|
96
115
|
gts = []
|
|
97
116
|
gqs = []
|
|
98
117
|
ad_ratios = []
|
|
99
|
-
|
|
100
118
|
|
|
101
119
|
next if samples.any?{|sample| sample =~ /\.\/\./} # no coverage in at least one sample
|
|
120
|
+
|
|
102
121
|
samples.map do |sample|
|
|
103
122
|
format_values = sample.split(":") # output (e.g.): ["0/0 ", "0,255,209", "99"]
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
123
|
+
if gt_array_position
|
|
124
|
+
gt = format_values[gt_array_position] # e.g.
|
|
125
|
+
gt = gt.split("/")
|
|
126
|
+
next if gt.size > 1 && (gt.first != gt.last) # if its 0/1, 1/2 etc then ignore
|
|
127
|
+
next if gt.first == "." # no coverage
|
|
128
|
+
gt = gt.first.to_i
|
|
129
|
+
else
|
|
130
|
+
puts "GT field in the FORMAT section of the vcf file is missing....snp-search requires this field to assess the SNP quality.....sorry aborting"
|
|
131
|
+
exit
|
|
132
|
+
end
|
|
109
133
|
|
|
110
|
-
|
|
134
|
+
if gq_array_position
|
|
135
|
+
gq = format_values[gq_array_position].to_f
|
|
136
|
+
else
|
|
137
|
+
puts "GQ field in the FORMAT section of the vcf file is missing....snp-search requires this field to assess the SNP quality.....sorry aborting"
|
|
138
|
+
exit
|
|
139
|
+
end
|
|
111
140
|
|
|
112
141
|
if ad_array_position
|
|
113
142
|
# If there is AD in vcf. Typically AD is Allele specific depth. i.e. if ref is 'A' and alt is 'G' and AD is '6,9' you got 6 A reads and 9 G reads.
|
|
@@ -117,12 +146,12 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
|
|
|
117
146
|
sum_of_ad = ad.inject{|sum,x| sum + x }
|
|
118
147
|
ad_ratios << ad[gt]/sum_of_ad.to_f
|
|
119
148
|
end
|
|
120
|
-
|
|
149
|
+
|
|
121
150
|
gqs << gq
|
|
122
151
|
gts << gt
|
|
123
152
|
end
|
|
124
|
-
|
|
125
|
-
next if ad_ratios.any?{|ad_ratio| ad_ratio <
|
|
153
|
+
|
|
154
|
+
next if ad_ratios.any?{|ad_ratio| ad_ratio < cutoff_ad.to_i} # exclude if any samples have a call ratio of less than a cutoff set by user
|
|
126
155
|
if gts.size == samples.size # if some gts have been rejected due to heterozygote or no coverage
|
|
127
156
|
good_snps +=1
|
|
128
157
|
|
|
@@ -143,6 +172,7 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
|
|
|
143
172
|
s.reference_allele = ref_allele
|
|
144
173
|
s.save
|
|
145
174
|
|
|
175
|
+
|
|
146
176
|
snp_alleles = Array.new
|
|
147
177
|
gts.uniq.select{|gt| gt > 0}.each do |gt|
|
|
148
178
|
# create snp allele
|
|
data/lib/filter_ignore_snps_methods.rb
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
require 'output_information_methods'
|
|
6
6
|
|
|
7
|
-
def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output,
|
|
7
|
+
def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cutoff_genotype, cutoff_snp, tree, fasttree_path)
|
|
8
8
|
|
|
9
9
|
strains = Strain.all
|
|
10
10
|
|
|
@@ -72,17 +72,18 @@ def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strain
|
|
|
72
72
|
genotypes = snp.alleles.collect{|allele| allele.genotypes}.flatten
|
|
73
73
|
|
|
74
74
|
snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}")
|
|
75
|
-
# ignore snp if the snp qual is less than
|
|
76
|
-
next if snp_qual.any?{|snps_quality| snps_quality.qual <
|
|
75
|
+
# ignore snp if the snp qual is less than cutoff.
|
|
76
|
+
next if snp_qual.any?{|snps_quality| snps_quality.qual < cutoff_snp.to_i}
|
|
77
77
|
|
|
78
78
|
next if alleles.any?{|allele| allele.base.length > 1} # indel
|
|
79
|
-
next unless genotypes.all?{|genotype| genotype.geno_qual >=
|
|
79
|
+
next unless genotypes.all?{|genotype| genotype.geno_qual >= cutoff_genotype.to_f} # all geno quals > cutoff
|
|
80
80
|
# puts "#{i} SNPs processed so far" if i % 100 == 0
|
|
81
81
|
strain_alleles = Hash.new
|
|
82
82
|
strains.each do |strain|
|
|
83
83
|
strain_genotype = genotypes.select{|genotype| genotype.strain_id == strain.id}.first
|
|
84
|
+
# next if strain_genotype == nil
|
|
85
|
+
puts strain_genotype.inspect
|
|
84
86
|
strain_allele = alleles.select{|allele| allele.id == strain_genotype.allele_id}.first
|
|
85
|
-
|
|
86
87
|
strain_alleles[strain.name] = strain_allele.base
|
|
87
88
|
end
|
|
88
89
|
|
|
@@ -101,7 +102,7 @@ def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strain
|
|
|
101
102
|
|
|
102
103
|
# If user has specified a tabular output
|
|
103
104
|
if tabular_output
|
|
104
|
-
output_information_methods(snps_array, output,
|
|
105
|
+
output_information_methods(snps_array, output, cutoff_genotype, cutoff_snp, true)
|
|
105
106
|
# If user has specified a fasta output
|
|
106
107
|
elsif fasta_output
|
|
107
108
|
# generate FASTA file
|
|
data/lib/output_information_methods.rb
CHANGED
|
@@ -13,21 +13,21 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
|
|
|
13
13
|
total_number_of_syn_snps = 0
|
|
14
14
|
total_number_of_non_syn_snps = 0
|
|
15
15
|
total_number_of_pseudo = 0
|
|
16
|
-
snps.each do |snp|
|
|
17
16
|
|
|
17
|
+
snps.each do |snp|
|
|
18
18
|
ActiveRecord::Base.transaction do
|
|
19
19
|
snp.alleles.each do |allele|
|
|
20
20
|
next if snp.alleles.any?{|allele| allele.base.length > 1} # indel
|
|
21
21
|
if allele.id != snp.reference_allele_id
|
|
22
|
-
|
|
22
|
+
|
|
23
23
|
# get annotation (if there is any) for each SNP
|
|
24
24
|
features = Feature.joins(:snps).where("snps.id = ?", snp.id)
|
|
25
25
|
|
|
26
26
|
# get snp quality for each snp
|
|
27
27
|
snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}")
|
|
28
|
-
next if snp_qual.any?{|snps_quality| snps_quality.qual < cuttoff_snp.to_i}
|
|
29
28
|
# ignore snp if the snp qual is less than cuttoff.
|
|
30
|
-
|
|
29
|
+
next if snp_qual.any?{|snps_quality| snps_quality.qual < cuttoff_snp.to_i}
|
|
30
|
+
|
|
31
31
|
|
|
32
32
|
# get all genotype qualities for each snp.
|
|
33
33
|
gqs = Genotype.find_by_sql("select geno_qual from genotypes inner join alleles on alleles.id = genotypes.allele_id inner join snps on snps.id = alleles.snp_id where snps.id = #{snp.id}")
|
|
@@ -37,13 +37,12 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
|
|
|
37
37
|
ref_base = Bio::Sequence.auto(Allele.find(snp.reference_allele_id).base)
|
|
38
38
|
snp_base = Bio::Sequence.auto(allele.base)
|
|
39
39
|
# count snps now: after you have selected the snps with gqs and snp_qual greater than the threshold.
|
|
40
|
-
snps_counter += 1
|
|
40
|
+
snps_counter += 1
|
|
41
41
|
# If the feature is empty then just output basic information about the snp.
|
|
42
42
|
|
|
43
43
|
if features.empty?
|
|
44
44
|
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}"
|
|
45
|
-
|
|
46
|
-
else
|
|
45
|
+
else
|
|
47
46
|
features.each do |feature|
|
|
48
47
|
if feature.name == "CDS"
|
|
49
48
|
|
|
@@ -91,10 +90,14 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
|
|
|
91
90
|
non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"]
|
|
92
91
|
|
|
93
92
|
# Get alleles for each strain
|
|
94
|
-
|
|
93
|
+
bases_from_alleles = []
|
|
95
94
|
strains.each do |strain|
|
|
95
|
+
|
|
96
96
|
allele_for_strains = Allele.joins(:genotypes => :strain).where("strains.id = ? AND alleles.snp_id = ?", strain.id, snp.id).first
|
|
97
|
-
|
|
97
|
+
# allele_for_strains = Allele.find_by_sql("select * from alleles inner join genotypes on genotypes.allele_id = alleles.id inner join strains on strains.id = genotypes.strain_id where strains.id = #{strain.id} and alleles.snp_id = #{snp.id}")
|
|
98
|
+
puts allele_for_strains.inspect
|
|
99
|
+
# next if bases_from_alleles.empty?
|
|
100
|
+
bases_from_alleles << allele_for_strains.base
|
|
98
101
|
end
|
|
99
102
|
|
|
100
103
|
# If no difference between the amino acids then its synonymous SNP, if different then its non-synonymous.
|
|
@@ -102,9 +105,9 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
|
|
|
102
105
|
total_number_of_syn_snps +=1
|
|
103
106
|
if mutated_seq_translated_clean =~ /\*/
|
|
104
107
|
total_number_of_pseudo +=1
|
|
105
|
-
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tYes\tN/A\tN/A\tN/A\tN/A\tN/A\t#{
|
|
108
|
+
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tYes\tN/A\tN/A\tN/A\tN/A\tN/A\t#{bases_from_alleles.join("\t") if info}"
|
|
106
109
|
else
|
|
107
|
-
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tNo\tN/A\tN/A\tN/A\tN/A\tN/A\t#{
|
|
110
|
+
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tNo\tN/A\tN/A\tN/A\tN/A\tN/A\t#{bases_from_alleles.join("\t") if info}"
|
|
108
111
|
end
|
|
109
112
|
else
|
|
110
113
|
total_number_of_non_syn_snps +=1
|
|
@@ -112,21 +115,21 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
|
|
|
112
115
|
|
|
113
116
|
if mutated_seq_translated_clean =~ /\*/
|
|
114
117
|
total_number_of_pseudo +=1
|
|
115
|
-
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{
|
|
118
|
+
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{bases_from_alleles.join("\t") if info}"
|
|
116
119
|
else
|
|
117
|
-
outfile.puts "#{snp.ref_pos-1}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tNo\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{
|
|
120
|
+
outfile.puts "#{snp.ref_pos-1}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tNo\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{bases_from_alleles.join("\t") if info}"
|
|
118
121
|
end
|
|
119
122
|
end
|
|
120
123
|
end
|
|
121
124
|
end
|
|
122
125
|
end
|
|
123
126
|
end
|
|
127
|
+
puts "Total SNPs added so far: #{snps_counter}" if snps_counter % 100 == 0
|
|
124
128
|
end
|
|
125
129
|
end
|
|
126
|
-
puts "Total SNPs added so far: #{snps_counter}" if snps_counter % 100 == 0
|
|
127
130
|
end
|
|
128
131
|
end
|
|
129
|
-
puts "Total number of snps: #{snps_counter}"
|
|
132
|
+
puts "Total number of snps: #{snps_counter} with Genotype quality cutoff at #{cuttoff_genotype} and SNP quality cutoff at #{cuttoff_snp}"
|
|
130
133
|
puts "Total number of snps in CDS region: #{cds_snps_counter}"
|
|
131
134
|
puts "Total number of synonymous SNPs: #{total_number_of_syn_snps}"
|
|
132
135
|
puts "Total number of non-synonymous SNPs: #{total_number_of_non_syn_snps}"
|
data/lib/snp-search.rb
CHANGED
|
@@ -7,7 +7,7 @@ require 'create_methods'
|
|
|
7
7
|
require 'filter_ignore_snps_methods'
|
|
8
8
|
require 'output_information_methods'
|
|
9
9
|
|
|
10
|
-
def find_unqiue_snps(strain_names, out, cuttoff_genotype, cuttoff_snp)
|
|
10
|
+
def find_unqiue_snps(strain_names, out, cutoff_genotype, cutoff_snp)
|
|
11
11
|
|
|
12
12
|
*strain_names = strain_names
|
|
13
13
|
|
|
@@ -15,24 +15,25 @@ def find_unqiue_snps(strain_names, out, cuttoff_genotype, cuttoff_snp)
|
|
|
15
15
|
|
|
16
16
|
outfile = File.open(out, "w")
|
|
17
17
|
|
|
18
|
-
snps = Snp.find_by_sql("SELECT snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND genotypes.geno_qual >= #{
|
|
18
|
+
snps = Snp.find_by_sql("SELECT snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND genotypes.geno_qual >= #{cutoff_genotype} AND snps.qual >= #{cutoff_snp} AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}")
|
|
19
19
|
# puts "The number of unique snps are #{snps.size}"
|
|
20
20
|
|
|
21
|
-
output_information_methods(snps, outfile,
|
|
21
|
+
output_information_methods(snps, outfile, cutoff_genotype, cutoff_snp, false)
|
|
22
22
|
end
|
|
23
23
|
|
|
24
24
|
|
|
25
|
-
def information(out,
|
|
25
|
+
def information(out, cutoff_genotype, cutoff_snp)
|
|
26
26
|
|
|
27
27
|
puts "outputting SNP info....."
|
|
28
28
|
|
|
29
29
|
strains = Strain.all
|
|
30
30
|
|
|
31
|
-
snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id where alleles.id <> snps.reference_allele_id")
|
|
31
|
+
# snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id where alleles.id <> snps.reference_allele_id")
|
|
32
32
|
|
|
33
|
+
snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id")
|
|
33
34
|
outfile = File.open(out, "w")
|
|
34
35
|
|
|
35
|
-
output_information_methods(snps, outfile,
|
|
36
|
+
output_information_methods(snps, outfile, cutoff_genotype, cutoff_snp, true)
|
|
36
37
|
|
|
37
38
|
end
|
|
38
39
|
|