snp-search 2.10.8 → 2.11.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Manual/snp-search_user_manual.pdf +0 -0
- data/README.rdoc +3 -3
- data/VERSION +1 -1
- data/bin/snp-search +23 -20
- data/lib/create_methods.rb +53 -23
- data/lib/filter_ignore_snps_methods.rb +7 -6
- data/lib/output_information_methods.rb +18 -15
- data/lib/snp-search.rb +7 -6
- data/snp-search.gemspec +6 -3
- data/test_data/Ecoli_test_set.vcf +1851 -0
- data/test_data/Reference_file_Ecoli_test_set.gbk +4122 -0
- metadata +6 -3
Binary file
|
data/README.rdoc
CHANGED
@@ -106,13 +106,13 @@ Alternatively, you may download a SQL tool to view your database (e.g. SQLite so
|
|
106
106
|
== Contact
|
107
107
|
|
108
108
|
If you have any comments, questions or suggestions, please email
|
109
|
-
ali.al-shahib@
|
109
|
+
ali.al-shahib@phe.gov.uk
|
110
110
|
or
|
111
|
-
anthony.underwood@
|
111
|
+
anthony.underwood@phe.gov.uk
|
112
112
|
|
113
113
|
Have fun snp-searching!
|
114
114
|
|
115
115
|
== Copyright
|
116
116
|
|
117
117
|
Copyright (c) 2012 Ali Al-Shahib. See LICENSE.txt for
|
118
|
-
further details.
|
118
|
+
further details.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.11.0
|
data/bin/snp-search
CHANGED
@@ -3,6 +3,8 @@ require 'snp_db_connection'
|
|
3
3
|
require 'snp_db_models'
|
4
4
|
require 'snp_db_schema'
|
5
5
|
require 'output_information_methods'
|
6
|
+
require 'create_methods'
|
7
|
+
require 'filter_ignore_snps_methods.rb'
|
6
8
|
require 'activerecord-import'
|
7
9
|
require 'slop'
|
8
10
|
|
@@ -24,7 +26,7 @@ opts = Slop.parse do
|
|
24
26
|
on :r, :reference_file=, 'Reference genome file, in gbk or embl file format, Required', true
|
25
27
|
on :v, :vcf_file=, 'variant call format (vcf) file, Required', true
|
26
28
|
on :d, :name_of_database=, 'Name of database, Required'
|
27
|
-
on :A, :
|
29
|
+
on :A, :cutoff_ad=, 'AD ratio cutoff (default 0.9)', :as => :int, :default => 0.9
|
28
30
|
|
29
31
|
separator ''
|
30
32
|
|
@@ -33,8 +35,8 @@ opts = Slop.parse do
|
|
33
35
|
on :f, :all_or_filtered_snps, 'ignore SNPs from specified features in the database (if you do not want to ignore any SNPs, just use this option with -F/T -o)'
|
34
36
|
on :F, :fasta, 'output fasta file format (default)'
|
35
37
|
on :T, :tabular, 'output tabular file format'
|
36
|
-
on :c, :
|
37
|
-
on :g, :
|
38
|
+
on :c, :cutoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
|
39
|
+
on :g, :cutoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
|
38
40
|
on :R, :remove_non_informative_snps, 'Only output informative SNPs.'
|
39
41
|
on :e, :ignore_snps_in_range=, 'A list of position ranges to ignore e.g 10..500,2000..2500.'
|
40
42
|
on :a, :ignore_strains=, 'A list of strains to ignore (seperate by comma e.g. S1,S4,S8 ).'
|
@@ -47,8 +49,8 @@ opts = Slop.parse do
|
|
47
49
|
separator '-output -unique_snps -d db.sqlite3 -s strains.txt -o unique_snps.txt [options]'
|
48
50
|
separator 'e.g. snp-search -O -u -d ecoli.sqlite3 -s strains_list_for_unique_snps.txt -o ecoli_unique_snps_strains.txt'
|
49
51
|
on :u, :unique_snps, 'Query for unique snps in the database'
|
50
|
-
on :c, :
|
51
|
-
on :g, :
|
52
|
+
on :c, :cutoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
|
53
|
+
on :g, :cutoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
|
52
54
|
on :s, :strain=, 'The strains/samples you like to query (only used with -unique_snps flag)'
|
53
55
|
on :o, :out=, 'Name of output file, Required'
|
54
56
|
|
@@ -57,8 +59,8 @@ opts = Slop.parse do
|
|
57
59
|
separator '-output -info -d db.sqlite3 -o info.txt [options]'
|
58
60
|
separator ''
|
59
61
|
on :i, :info, 'Output various information about SNPs'
|
60
|
-
on :c, :
|
61
|
-
on :g, :
|
62
|
+
on :c, :cutoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
|
63
|
+
on :g, :cutoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
|
62
64
|
on :o, :out=, 'Name of output file, Required'
|
63
65
|
end
|
64
66
|
|
@@ -68,7 +70,7 @@ end
|
|
68
70
|
if opts[:create]
|
69
71
|
|
70
72
|
# raise "Please provide a database file name" if opts[:reference_file].empty?
|
71
|
-
# puts opts[:
|
73
|
+
# puts opts[:cutoff_snp_qual].to_i
|
72
74
|
|
73
75
|
error_msg = ""
|
74
76
|
|
@@ -122,7 +124,7 @@ if opts[:create]
|
|
122
124
|
|
123
125
|
#The populate_snps_alleles_genotypes method populates the snps, alleles and genotypes. It uses the vcf file, and if specified, the SNP quality cutoff and genotype quality cutoff
|
124
126
|
|
125
|
-
populate_snps_alleles_genotypes(opts[:vcf_file], opts[:
|
127
|
+
populate_snps_alleles_genotypes(opts[:vcf_file], opts[:cutoff_ad])
|
126
128
|
|
127
129
|
###########################################################
|
128
130
|
|
@@ -143,6 +145,7 @@ elsif opts[:output]
|
|
143
145
|
|
144
146
|
error_msg = ""
|
145
147
|
|
148
|
+
|
146
149
|
error_msg += "-d: \t Name of your database\n" unless opts[:name_of_database]
|
147
150
|
error_msg += "-o: \t name of your output file\n" unless opts[:out]
|
148
151
|
error_msg += "-F: \t Fasta output OR\n-T: \t Tabular output" unless opts[:fasta] || opts[:tabular]
|
@@ -152,8 +155,8 @@ elsif opts[:output]
|
|
152
155
|
error_msg_optional += "-I,\t --ignore_snps_on_annotation: The name of the feature(s) to ignore. Features should be seperated by comma (e.g. phages,inserstion,transposons)\n" unless opts[:ignore_snps_on_annotation]
|
153
156
|
error_msg_optional += "-a,\t --ignore_strains: A list of strains to ignore\n" unless opts[:ignore_strains]
|
154
157
|
error_msg_optional += "-e,\t --ignore_snps_in_range: A list of position ranges to ignore e.g 10..500,2000..2500\n" unless opts[:ignore_snps_in_range]
|
155
|
-
error_msg_optional += "-c,\t --
|
156
|
-
error_msg_optional += "-g,\t --
|
158
|
+
error_msg_optional += "-c,\t --cutoff_snp_qual: cutoff for SNP Quality\n" unless opts[:cutoff_snp_qual]
|
159
|
+
error_msg_optional += "-g,\t --cutoff_genotype: cutoff for Genotype Quality\n" unless opts[:cutoff_genotype]
|
157
160
|
error_msg_optional += "-R,\t --remove_non_informative_snps: Only output informative SNPs\n" unless opts[:remove_non_informative_snps]
|
158
161
|
error_msg_optional += "-t,\t --tree: Construct tree from output\n" unless opts[:tree]
|
159
162
|
|
@@ -163,8 +166,8 @@ elsif opts[:output]
|
|
163
166
|
puts "Optional fields:"
|
164
167
|
puts error_msg_optional
|
165
168
|
# Added this here as it wont appear here in error_msg_optional as its set as default.
|
166
|
-
puts "-c,\t --
|
167
|
-
puts "-g,\t --
|
169
|
+
puts "-c,\t --cutoff_snp_qual: cutoff for SNP Quality (default 90)\n"
|
170
|
+
puts "-g,\t --cutoff_genotype: cutoff for Genotype Quality (default 30)\n"
|
168
171
|
# puts opts.help unless opts.empty?
|
169
172
|
exit
|
170
173
|
end
|
@@ -173,7 +176,7 @@ elsif opts[:output]
|
|
173
176
|
|
174
177
|
establish_connection(opts[:name_of_database])
|
175
178
|
|
176
|
-
get_snps(opts[:out], opts[:ignore_snps_on_annotation], opts[:ignore_snps_in_range], opts[:ignore_strains], opts[:remove_non_informative_snps], opts[:fasta], opts[:tabular], opts[:
|
179
|
+
get_snps(opts[:out], opts[:ignore_snps_on_annotation], opts[:ignore_snps_in_range], opts[:ignore_strains], opts[:remove_non_informative_snps], opts[:fasta], opts[:tabular], opts[:cutoff_genotype], opts[:cutoff_snp_qual], opts[:tree], opts[:fasttree_path])
|
177
180
|
end
|
178
181
|
|
179
182
|
####################################################################################################
|
@@ -191,8 +194,8 @@ elsif opts[:output]
|
|
191
194
|
puts error_msg
|
192
195
|
puts "Optional fields:"
|
193
196
|
# Added this here as it wont appear here in error_msg_optional as its set as default.
|
194
|
-
puts "-c,\t --
|
195
|
-
puts "-g,\t --
|
197
|
+
puts "-c,\t --cutoff_snp_qual: cutoff for SNP Quality (default 90)\n"
|
198
|
+
puts "-g,\t --cutoff_genotype: cutoff for Genotype Quality (default 30)\n"
|
196
199
|
# puts opts.help unless opts.empty?
|
197
200
|
exit
|
198
201
|
end
|
@@ -207,7 +210,7 @@ elsif opts[:output]
|
|
207
210
|
strains << line.chop
|
208
211
|
end
|
209
212
|
# find_unique_snps defined in bin/snp-search.rb
|
210
|
-
find_unqiue_snps(strains, opts[:out], opts[:
|
213
|
+
find_unqiue_snps(strains, opts[:out], opts[:cutoff_genotype], opts[:cutoff_snp_qual])
|
211
214
|
end
|
212
215
|
|
213
216
|
##############################################################
|
@@ -223,8 +226,8 @@ elsif opts[:output]
|
|
223
226
|
puts error_msg
|
224
227
|
puts "Optional fields:"
|
225
228
|
# Added this here as it wont appear here in error_msg_optional as its set as default.
|
226
|
-
puts "-c,\t --
|
227
|
-
puts "-g,\t --
|
229
|
+
puts "-c,\t --cutoff_snp_qual: cutoff for SNP Quality (default 90)\n"
|
230
|
+
puts "-g,\t --cutoff_genotype: cutoff for Genotype Quality (default 30)\n"
|
228
231
|
# puts opts.help unless opts.empty?
|
229
232
|
exit
|
230
233
|
end
|
@@ -234,7 +237,7 @@ elsif opts[:output]
|
|
234
237
|
establish_connection(opts[:name_of_database])
|
235
238
|
|
236
239
|
#information defined in bin/snp-search.rb
|
237
|
-
information(opts[:out], opts[:
|
240
|
+
information(opts[:out], opts[:cutoff_genotype], opts[:cutoff_snp_qual])
|
238
241
|
|
239
242
|
end
|
240
243
|
|
data/lib/create_methods.rb
CHANGED
@@ -50,9 +50,10 @@ def populate_features_and_annotations(sequence_file)
|
|
50
50
|
end
|
51
51
|
|
52
52
|
#This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
|
53
|
-
def populate_snps_alleles_genotypes(vcf_file,
|
53
|
+
def populate_snps_alleles_genotypes(vcf_file, cutoff_ad)
|
54
54
|
|
55
55
|
puts "Adding SNPs........"
|
56
|
+
|
56
57
|
# open vcf file and parse each line
|
57
58
|
File.open(vcf_file) do |f|
|
58
59
|
# header names
|
@@ -60,20 +61,23 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
|
|
60
61
|
if line =~ /CHROM/
|
61
62
|
line.chomp!
|
62
63
|
column_headings = line.split("\t")
|
63
|
-
|
64
|
-
|
64
|
+
potential_strain_names = column_headings[9..-1]
|
65
|
+
potential_strain_names.map!{|name| name.sub(/\..*/, '')}
|
66
|
+
# strain_names = column_headings[9..-1]
|
67
|
+
# strain_names.map!{|name| name.sub(/\..*/, '')}
|
68
|
+
strains = Array.new
|
65
69
|
|
66
|
-
strain_names.each do |str|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
end
|
70
|
+
# strain_names.each do |str|
|
71
|
+
# ss = Strain.new
|
72
|
+
# ss.name = str
|
73
|
+
# ss.save
|
74
|
+
# end
|
71
75
|
|
72
|
-
strains = Array.new
|
73
|
-
strain_names.each do |strain_name|
|
74
|
-
|
75
|
-
|
76
|
-
end
|
76
|
+
# strains = Array.new
|
77
|
+
# strain_names.each do |strain_name|
|
78
|
+
# strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
|
79
|
+
# strains << strain
|
80
|
+
# end
|
77
81
|
|
78
82
|
good_snps = 0
|
79
83
|
# start parsing snps
|
@@ -86,28 +90,53 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
|
|
86
90
|
ref_base = details[3]
|
87
91
|
snp_bases = details[4].split(",")
|
88
92
|
snp_qual = details [5]
|
93
|
+
number_of_colons_in_format = details[8].count ":"
|
89
94
|
format = details[8].split(":")
|
95
|
+
|
90
96
|
gt_array_position = format.index("GT")
|
91
97
|
gq_array_position = format.index("GQ")
|
92
98
|
ad_array_position = format.index("AD")
|
93
99
|
# dp = format.index("DP")
|
100
|
+
# columns_after_format = details[9..-1]
|
101
|
+
# samples = columns_after_format.select{|column_after_format| column_after_format.count(":") == number_of_colons_in_format}
|
102
|
+
# puts samples
|
94
103
|
samples = details[9..-1]
|
104
|
+
if strains.empty?
|
105
|
+
samples.zip(potential_strain_names).each do |column_after_format, potential_strain_name|
|
106
|
+
if column_after_format.count(":") == number_of_colons_in_format
|
107
|
+
ss = Strain.new
|
108
|
+
ss.name = potential_strain_name
|
109
|
+
ss.save
|
110
|
+
strains << ss
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
95
114
|
|
96
115
|
gts = []
|
97
116
|
gqs = []
|
98
117
|
ad_ratios = []
|
99
|
-
|
100
118
|
|
101
119
|
next if samples.any?{|sample| sample =~ /\.\/\./} # no coverage in at least one sample
|
120
|
+
|
102
121
|
samples.map do |sample|
|
103
122
|
format_values = sample.split(":") # output (e.g.): ["0/0 ", "0,255,209", "99"]
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
123
|
+
if gt_array_position
|
124
|
+
gt = format_values[gt_array_position] # e.g.
|
125
|
+
gt = gt.split("/")
|
126
|
+
next if gt.size > 1 && (gt.first != gt.last) # if its 0/1, 1/2 etc then ignore
|
127
|
+
next if gt.first == "." # no coverage
|
128
|
+
gt = gt.first.to_i
|
129
|
+
else
|
130
|
+
puts "GT field in the FORMAT section of the vcf file is missing....snp-search requires this field to assess the SNP quality.....sorry aborting"
|
131
|
+
exit
|
132
|
+
end
|
109
133
|
|
110
|
-
|
134
|
+
if gq_array_position
|
135
|
+
gq = format_values[gq_array_position].to_f
|
136
|
+
else
|
137
|
+
puts "GQ field in the FORMAT section of the vcf file is missing....snp-search requires this field to assess the SNP quality.....sorry aborting"
|
138
|
+
exit
|
139
|
+
end
|
111
140
|
|
112
141
|
if ad_array_position
|
113
142
|
# If there is AD in vcf. Typically AD is Allele specific depth. i.e. if ref is 'A' and alt is 'G' and AD is '6,9' you got 6 A reads and 9 G reads.
|
@@ -117,12 +146,12 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
|
|
117
146
|
sum_of_ad = ad.inject{|sum,x| sum + x }
|
118
147
|
ad_ratios << ad[gt]/sum_of_ad.to_f
|
119
148
|
end
|
120
|
-
|
149
|
+
|
121
150
|
gqs << gq
|
122
151
|
gts << gt
|
123
152
|
end
|
124
|
-
|
125
|
-
next if ad_ratios.any?{|ad_ratio| ad_ratio <
|
153
|
+
|
154
|
+
next if ad_ratios.any?{|ad_ratio| ad_ratio < cutoff_ad.to_i} # exclude if any samples have a call ratio of less than a cutoff set by user
|
126
155
|
if gts.size == samples.size # if some gts have been rejected due to heterozygote or no coverage
|
127
156
|
good_snps +=1
|
128
157
|
|
@@ -143,6 +172,7 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
|
|
143
172
|
s.reference_allele = ref_allele
|
144
173
|
s.save
|
145
174
|
|
175
|
+
|
146
176
|
snp_alleles = Array.new
|
147
177
|
gts.uniq.select{|gt| gt > 0}.each do |gt|
|
148
178
|
# create snp allele
|
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
require 'output_information_methods'
|
6
6
|
|
7
|
-
def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output,
|
7
|
+
def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cutoff_genotype, cutoff_snp, tree, fasttree_path)
|
8
8
|
|
9
9
|
strains = Strain.all
|
10
10
|
|
@@ -72,17 +72,18 @@ def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strain
|
|
72
72
|
genotypes = snp.alleles.collect{|allele| allele.genotypes}.flatten
|
73
73
|
|
74
74
|
snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}")
|
75
|
-
# ignore snp if the snp qual is less than
|
76
|
-
next if snp_qual.any?{|snps_quality| snps_quality.qual <
|
75
|
+
# ignore snp if the snp qual is less than cutoff.
|
76
|
+
next if snp_qual.any?{|snps_quality| snps_quality.qual < cutoff_snp.to_i}
|
77
77
|
|
78
78
|
next if alleles.any?{|allele| allele.base.length > 1} # indel
|
79
|
-
next unless genotypes.all?{|genotype| genotype.geno_qual >=
|
79
|
+
next unless genotypes.all?{|genotype| genotype.geno_qual >= cutoff_genotype.to_f} # all geno quals > cutoff
|
80
80
|
# puts "#{i} SNPs processed so far" if i % 100 == 0
|
81
81
|
strain_alleles = Hash.new
|
82
82
|
strains.each do |strain|
|
83
83
|
strain_genotype = genotypes.select{|genotype| genotype.strain_id == strain.id}.first
|
84
|
+
# next if strain_genotype == nil
|
85
|
+
puts strain_genotype.inspect
|
84
86
|
strain_allele = alleles.select{|allele| allele.id == strain_genotype.allele_id}.first
|
85
|
-
|
86
87
|
strain_alleles[strain.name] = strain_allele.base
|
87
88
|
end
|
88
89
|
|
@@ -101,7 +102,7 @@ def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strain
|
|
101
102
|
|
102
103
|
# If user has specified a tabular output
|
103
104
|
if tabular_output
|
104
|
-
output_information_methods(snps_array, output,
|
105
|
+
output_information_methods(snps_array, output, cutoff_genotype, cutoff_snp, true)
|
105
106
|
# If user has specified a fasta output
|
106
107
|
elsif fasta_output
|
107
108
|
# generate FASTA file
|
@@ -13,21 +13,21 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
|
|
13
13
|
total_number_of_syn_snps = 0
|
14
14
|
total_number_of_non_syn_snps = 0
|
15
15
|
total_number_of_pseudo = 0
|
16
|
-
snps.each do |snp|
|
17
16
|
|
17
|
+
snps.each do |snp|
|
18
18
|
ActiveRecord::Base.transaction do
|
19
19
|
snp.alleles.each do |allele|
|
20
20
|
next if snp.alleles.any?{|allele| allele.base.length > 1} # indel
|
21
21
|
if allele.id != snp.reference_allele_id
|
22
|
-
|
22
|
+
|
23
23
|
# get annotation (if there is any) for each SNP
|
24
24
|
features = Feature.joins(:snps).where("snps.id = ?", snp.id)
|
25
25
|
|
26
26
|
# get snp quality for each snp
|
27
27
|
snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}")
|
28
|
-
next if snp_qual.any?{|snps_quality| snps_quality.qual < cuttoff_snp.to_i}
|
29
28
|
# ignore snp if the snp qual is less than cuttoff.
|
30
|
-
|
29
|
+
next if snp_qual.any?{|snps_quality| snps_quality.qual < cuttoff_snp.to_i}
|
30
|
+
|
31
31
|
|
32
32
|
# get all genotype qualities for each snp.
|
33
33
|
gqs = Genotype.find_by_sql("select geno_qual from genotypes inner join alleles on alleles.id = genotypes.allele_id inner join snps on snps.id = alleles.snp_id where snps.id = #{snp.id}")
|
@@ -37,13 +37,12 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
|
|
37
37
|
ref_base = Bio::Sequence.auto(Allele.find(snp.reference_allele_id).base)
|
38
38
|
snp_base = Bio::Sequence.auto(allele.base)
|
39
39
|
# count snps now: after you have selected the snps with gqs and snp_qual greater than the threshold.
|
40
|
-
snps_counter += 1
|
40
|
+
snps_counter += 1
|
41
41
|
# If the feature is empty then just output basic information about the snp.
|
42
42
|
|
43
43
|
if features.empty?
|
44
44
|
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}"
|
45
|
-
|
46
|
-
else
|
45
|
+
else
|
47
46
|
features.each do |feature|
|
48
47
|
if feature.name == "CDS"
|
49
48
|
|
@@ -91,10 +90,14 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
|
|
91
90
|
non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"]
|
92
91
|
|
93
92
|
# Get alleles for each strain
|
94
|
-
|
93
|
+
bases_from_alleles = []
|
95
94
|
strains.each do |strain|
|
95
|
+
|
96
96
|
allele_for_strains = Allele.joins(:genotypes => :strain).where("strains.id = ? AND alleles.snp_id = ?", strain.id, snp.id).first
|
97
|
-
|
97
|
+
# allele_for_strains = Allele.find_by_sql("select * from alleles inner join genotypes on genotypes.allele_id = alleles.id inner join strains on strains.id = genotypes.strain_id where strains.id = #{strain.id} and alleles.snp_id = #{snp.id}")
|
98
|
+
puts allele_for_strains.inspect
|
99
|
+
# next if bases_from_alleles.empty?
|
100
|
+
bases_from_alleles << allele_for_strains.base
|
98
101
|
end
|
99
102
|
|
100
103
|
# If no difference between the amino acids then its synonymous SNP, if different then its non-synonymous.
|
@@ -102,9 +105,9 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
|
|
102
105
|
total_number_of_syn_snps +=1
|
103
106
|
if mutated_seq_translated_clean =~ /\*/
|
104
107
|
total_number_of_pseudo +=1
|
105
|
-
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tYes\tN/A\tN/A\tN/A\tN/A\tN/A\t#{
|
108
|
+
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tYes\tN/A\tN/A\tN/A\tN/A\tN/A\t#{bases_from_alleles.join("\t") if info}"
|
106
109
|
else
|
107
|
-
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tNo\tN/A\tN/A\tN/A\tN/A\tN/A\t#{
|
110
|
+
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tNo\tN/A\tN/A\tN/A\tN/A\tN/A\t#{bases_from_alleles.join("\t") if info}"
|
108
111
|
end
|
109
112
|
else
|
110
113
|
total_number_of_non_syn_snps +=1
|
@@ -112,21 +115,21 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
|
|
112
115
|
|
113
116
|
if mutated_seq_translated_clean =~ /\*/
|
114
117
|
total_number_of_pseudo +=1
|
115
|
-
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{
|
118
|
+
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{bases_from_alleles.join("\t") if info}"
|
116
119
|
else
|
117
|
-
outfile.puts "#{snp.ref_pos-1}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tNo\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{
|
120
|
+
outfile.puts "#{snp.ref_pos-1}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tNo\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{bases_from_alleles.join("\t") if info}"
|
118
121
|
end
|
119
122
|
end
|
120
123
|
end
|
121
124
|
end
|
122
125
|
end
|
123
126
|
end
|
127
|
+
puts "Total SNPs added so far: #{snps_counter}" if snps_counter % 100 == 0
|
124
128
|
end
|
125
129
|
end
|
126
|
-
puts "Total SNPs added so far: #{snps_counter}" if snps_counter % 100 == 0
|
127
130
|
end
|
128
131
|
end
|
129
|
-
puts "Total number of snps: #{snps_counter}"
|
132
|
+
puts "Total number of snps: #{snps_counter} with Genotype quality cutoff at #{cuttoff_genotype} and SNP quality cutoff at #{cuttoff_snp}"
|
130
133
|
puts "Total number of snps in CDS region: #{cds_snps_counter}"
|
131
134
|
puts "Total number of synonymous SNPs: #{total_number_of_syn_snps}"
|
132
135
|
puts "Total number of non-synonymous SNPs: #{total_number_of_non_syn_snps}"
|
data/lib/snp-search.rb
CHANGED
@@ -7,7 +7,7 @@ require 'create_methods'
|
|
7
7
|
require 'filter_ignore_snps_methods'
|
8
8
|
require 'output_information_methods'
|
9
9
|
|
10
|
-
def find_unqiue_snps(strain_names, out,
|
10
|
+
def find_unqiue_snps(strain_names, out, cutoff_genotype, cutoff_snp)
|
11
11
|
|
12
12
|
*strain_names = strain_names
|
13
13
|
|
@@ -15,24 +15,25 @@ def find_unqiue_snps(strain_names, out, cuttoff_genotype, cuttoff_snp)
|
|
15
15
|
|
16
16
|
outfile = File.open(out, "w")
|
17
17
|
|
18
|
-
snps = Snp.find_by_sql("SELECT snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND genotypes.geno_qual >= #{
|
18
|
+
snps = Snp.find_by_sql("SELECT snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND genotypes.geno_qual >= #{cutoff_genotype} AND snps.qual >= #{cutoff_snp} AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}")
|
19
19
|
# puts "The number of unique snps are #{snps.size}"
|
20
20
|
|
21
|
-
output_information_methods(snps, outfile,
|
21
|
+
output_information_methods(snps, outfile, cutoff_genotype, cutoff_snp, false)
|
22
22
|
end
|
23
23
|
|
24
24
|
|
25
|
-
def information(out,
|
25
|
+
def information(out, cutoff_genotype, cutoff_snp)
|
26
26
|
|
27
27
|
puts "outputting SNP info....."
|
28
28
|
|
29
29
|
strains = Strain.all
|
30
30
|
|
31
|
-
snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id where alleles.id <> snps.reference_allele_id")
|
31
|
+
# snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id where alleles.id <> snps.reference_allele_id")
|
32
32
|
|
33
|
+
snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id")
|
33
34
|
outfile = File.open(out, "w")
|
34
35
|
|
35
|
-
output_information_methods(snps, outfile,
|
36
|
+
output_information_methods(snps, outfile, cutoff_genotype, cutoff_snp, true)
|
36
37
|
|
37
38
|
end
|
38
39
|
|