snp-search 2.10.8 → 2.11.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -106,13 +106,13 @@ Alternatively, you may download a SQL tool to view your database (e.g. SQLite so
106
106
  == Contact
107
107
 
108
108
  If you have any comments, questions or suggestions, please email
109
- ali.al-shahib@hpa.org.uk
109
+ ali.al-shahib@phe.gov.uk
110
110
  or
111
- anthony.underwood@hpa.org.uk
111
+ anthony.underwood@phe.gov.uk
112
112
 
113
113
  Have fun snp-searching!
114
114
 
115
115
  == Copyright
116
116
 
117
117
  Copyright (c) 2012 Ali Al-Shahib. See LICENSE.txt for
118
- further details.
118
+ further details.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.10.8
1
+ 2.11.0
@@ -3,6 +3,8 @@ require 'snp_db_connection'
3
3
  require 'snp_db_models'
4
4
  require 'snp_db_schema'
5
5
  require 'output_information_methods'
6
+ require 'create_methods'
7
+ require 'filter_ignore_snps_methods.rb'
6
8
  require 'activerecord-import'
7
9
  require 'slop'
8
10
 
@@ -24,7 +26,7 @@ opts = Slop.parse do
24
26
  on :r, :reference_file=, 'Reference genome file, in gbk or embl file format, Required', true
25
27
  on :v, :vcf_file=, 'variant call format (vcf) file, Required', true
26
28
  on :d, :name_of_database=, 'Name of database, Required'
27
- on :A, :cuttoff_ad=, 'AD ratio cutoff (default 0.9)', :as => :int, :default => 0.9
29
+ on :A, :cutoff_ad=, 'AD ratio cutoff (default 0.9)', :as => :int, :default => 0.9
28
30
 
29
31
  separator ''
30
32
 
@@ -33,8 +35,8 @@ opts = Slop.parse do
33
35
  on :f, :all_or_filtered_snps, 'ignore SNPs from specified features in the database (if you do not want to ignore any SNPs, just use this option with -F/T -o)'
34
36
  on :F, :fasta, 'output fasta file format (default)'
35
37
  on :T, :tabular, 'output tabular file format'
36
- on :c, :cuttoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
37
- on :g, :cuttoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
38
+ on :c, :cutoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
39
+ on :g, :cutoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
38
40
  on :R, :remove_non_informative_snps, 'Only output informative SNPs.'
39
41
  on :e, :ignore_snps_in_range=, 'A list of position ranges to ignore e.g 10..500,2000..2500.'
40
42
  on :a, :ignore_strains=, 'A list of strains to ignore (seperate by comma e.g. S1,S4,S8 ).'
@@ -47,8 +49,8 @@ opts = Slop.parse do
47
49
  separator '-output -unique_snps -d db.sqlite3 -s strains.txt -o unique_snps.txt [options]'
48
50
  separator 'e.g. snp-search -O -u -d ecoli.sqlite3 -s strains_list_for_unique_snps.txt -o ecoli_unique_snps_strains.txt'
49
51
  on :u, :unique_snps, 'Query for unique snps in the database'
50
- on :c, :cuttoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
51
- on :g, :cuttoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
52
+ on :c, :cutoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
53
+ on :g, :cutoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
52
54
  on :s, :strain=, 'The strains/samples you like to query (only used with -unique_snps flag)'
53
55
  on :o, :out=, 'Name of output file, Required'
54
56
 
@@ -57,8 +59,8 @@ opts = Slop.parse do
57
59
  separator '-output -info -d db.sqlite3 -o info.txt [options]'
58
60
  separator ''
59
61
  on :i, :info, 'Output various information about SNPs'
60
- on :c, :cuttoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
61
- on :g, :cuttoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
62
+ on :c, :cutoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
63
+ on :g, :cutoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
62
64
  on :o, :out=, 'Name of output file, Required'
63
65
  end
64
66
 
@@ -68,7 +70,7 @@ end
68
70
  if opts[:create]
69
71
 
70
72
  # raise "Please provide a database file name" if opts[:reference_file].empty?
71
- # puts opts[:cuttoff_snp_qual].to_i
73
+ # puts opts[:cutoff_snp_qual].to_i
72
74
 
73
75
  error_msg = ""
74
76
 
@@ -122,7 +124,7 @@ if opts[:create]
122
124
 
123
125
  #The populate_snps_alleles_genotypes method populates the snps, alleles and genotypes. It uses the vcf file and the AD ratio cutoff (the -A option).
124
126
 
125
- populate_snps_alleles_genotypes(opts[:vcf_file], opts[:cuttoff_ad])
127
+ populate_snps_alleles_genotypes(opts[:vcf_file], opts[:cutoff_ad])
126
128
 
127
129
  ###########################################################
128
130
 
@@ -143,6 +145,7 @@ elsif opts[:output]
143
145
 
144
146
  error_msg = ""
145
147
 
148
+
146
149
  error_msg += "-d: \t Name of your database\n" unless opts[:name_of_database]
147
150
  error_msg += "-o: \t name of your output file\n" unless opts[:out]
148
151
  error_msg += "-F: \t Fasta output OR\n-T: \t Tabular output" unless opts[:fasta] || opts[:tabular]
@@ -152,8 +155,8 @@ elsif opts[:output]
152
155
  error_msg_optional += "-I,\t --ignore_snps_on_annotation: The name of the feature(s) to ignore. Features should be seperated by comma (e.g. phages,inserstion,transposons)\n" unless opts[:ignore_snps_on_annotation]
153
156
  error_msg_optional += "-a,\t --ignore_strains: A list of strains to ignore\n" unless opts[:ignore_strains]
154
157
  error_msg_optional += "-e,\t --ignore_snps_in_range: A list of position ranges to ignore e.g 10..500,2000..2500\n" unless opts[:ignore_snps_in_range]
155
- error_msg_optional += "-c,\t --cuttoff_snp_qual: cuttoff for SNP Quality\n" unless opts[:cuttoff_snp_qual]
156
- error_msg_optional += "-g,\t --cuttoff_genotype: cuttoff for Genotype Quality\n" unless opts[:cuttoff_genotype]
158
+ error_msg_optional += "-c,\t --cutoff_snp_qual: cutoff for SNP Quality\n" unless opts[:cutoff_snp_qual]
159
+ error_msg_optional += "-g,\t --cutoff_genotype: cutoff for Genotype Quality\n" unless opts[:cutoff_genotype]
157
160
  error_msg_optional += "-R,\t --remove_non_informative_snps: Only output informative SNPs\n" unless opts[:remove_non_informative_snps]
158
161
  error_msg_optional += "-t,\t --tree: Construct tree from output\n" unless opts[:tree]
159
162
 
@@ -163,8 +166,8 @@ elsif opts[:output]
163
166
  puts "Optional fields:"
164
167
  puts error_msg_optional
165
168
  # Added this here as it won't appear in error_msg_optional, since it is set by default.
166
- puts "-c,\t --cuttoff_snp_qual: cuttoff for SNP Quality (default 90)\n"
167
- puts "-g,\t --cuttoff_genotype: cuttoff for Genotype Quality (default 30)\n"
169
+ puts "-c,\t --cutoff_snp_qual: cutoff for SNP Quality (default 90)\n"
170
+ puts "-g,\t --cutoff_genotype: cutoff for Genotype Quality (default 30)\n"
168
171
  # puts opts.help unless opts.empty?
169
172
  exit
170
173
  end
@@ -173,7 +176,7 @@ elsif opts[:output]
173
176
 
174
177
  establish_connection(opts[:name_of_database])
175
178
 
176
- get_snps(opts[:out], opts[:ignore_snps_on_annotation], opts[:ignore_snps_in_range], opts[:ignore_strains], opts[:remove_non_informative_snps], opts[:fasta], opts[:tabular], opts[:cuttoff_genotype], opts[:cuttoff_snp_qual], opts[:tree], opts[:fasttree_path])
179
+ get_snps(opts[:out], opts[:ignore_snps_on_annotation], opts[:ignore_snps_in_range], opts[:ignore_strains], opts[:remove_non_informative_snps], opts[:fasta], opts[:tabular], opts[:cutoff_genotype], opts[:cutoff_snp_qual], opts[:tree], opts[:fasttree_path])
177
180
  end
178
181
 
179
182
  ####################################################################################################
@@ -191,8 +194,8 @@ elsif opts[:output]
191
194
  puts error_msg
192
195
  puts "Optional fields:"
193
196
  # Added this here as it won't appear in error_msg_optional, since it is set by default.
194
- puts "-c,\t --cuttoff_snp_qual: cuttoff for SNP Quality (default 90)\n"
195
- puts "-g,\t --cuttoff_genotype: cuttoff for Genotype Quality (default 30)\n"
197
+ puts "-c,\t --cutoff_snp_qual: cutoff for SNP Quality (default 90)\n"
198
+ puts "-g,\t --cutoff_genotype: cutoff for Genotype Quality (default 30)\n"
196
199
  # puts opts.help unless opts.empty?
197
200
  exit
198
201
  end
@@ -207,7 +210,7 @@ elsif opts[:output]
207
210
  strains << line.chop
208
211
  end
209
212
  # find_unique_snps defined in bin/snp-search.rb
210
- find_unqiue_snps(strains, opts[:out], opts[:cuttoff_genotype], opts[:cuttoff_snp_qual])
213
+ find_unqiue_snps(strains, opts[:out], opts[:cutoff_genotype], opts[:cutoff_snp_qual])
211
214
  end
212
215
 
213
216
  ##############################################################
@@ -223,8 +226,8 @@ elsif opts[:output]
223
226
  puts error_msg
224
227
  puts "Optional fields:"
225
228
  # Added this here as it won't appear in error_msg_optional, since it is set by default.
226
- puts "-c,\t --cuttoff_snp_qual: cuttoff for SNP Quality (default 90)\n"
227
- puts "-g,\t --cuttoff_genotype: cuttoff for Genotype Quality (default 30)\n"
229
+ puts "-c,\t --cutoff_snp_qual: cutoff for SNP Quality (default 90)\n"
230
+ puts "-g,\t --cutoff_genotype: cutoff for Genotype Quality (default 30)\n"
228
231
  # puts opts.help unless opts.empty?
229
232
  exit
230
233
  end
@@ -234,7 +237,7 @@ elsif opts[:output]
234
237
  establish_connection(opts[:name_of_database])
235
238
 
236
239
  #information defined in bin/snp-search.rb
237
- information(opts[:out], opts[:cuttoff_genotype], opts[:cuttoff_snp_qual])
240
+ information(opts[:out], opts[:cutoff_genotype], opts[:cutoff_snp_qual])
238
241
 
239
242
  end
240
243
 
@@ -50,9 +50,10 @@ def populate_features_and_annotations(sequence_file)
50
50
  end
51
51
 
52
52
  #This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
53
- def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
53
+ def populate_snps_alleles_genotypes(vcf_file, cutoff_ad)
54
54
 
55
55
  puts "Adding SNPs........"
56
+
56
57
  # open vcf file and parse each line
57
58
  File.open(vcf_file) do |f|
58
59
  # header names
@@ -60,20 +61,23 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
60
61
  if line =~ /CHROM/
61
62
  line.chomp!
62
63
  column_headings = line.split("\t")
63
- strain_names = column_headings[9..-1]
64
- strain_names.map!{|name| name.sub(/\..*/, '')}
64
+ potential_strain_names = column_headings[9..-1]
65
+ potential_strain_names.map!{|name| name.sub(/\..*/, '')}
66
+ # strain_names = column_headings[9..-1]
67
+ # strain_names.map!{|name| name.sub(/\..*/, '')}
68
+ strains = Array.new
65
69
 
66
- strain_names.each do |str|
67
- ss = Strain.new
68
- ss.name = str
69
- ss.save
70
- end
70
+ # strain_names.each do |str|
71
+ # ss = Strain.new
72
+ # ss.name = str
73
+ # ss.save
74
+ # end
71
75
 
72
- strains = Array.new
73
- strain_names.each do |strain_name|
74
- strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
75
- strains << strain
76
- end
76
+ # strains = Array.new
77
+ # strain_names.each do |strain_name|
78
+ # strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
79
+ # strains << strain
80
+ # end
77
81
 
78
82
  good_snps = 0
79
83
  # start parsing snps
@@ -86,28 +90,53 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
86
90
  ref_base = details[3]
87
91
  snp_bases = details[4].split(",")
88
92
  snp_qual = details [5]
93
+ number_of_colons_in_format = details[8].count ":"
89
94
  format = details[8].split(":")
95
+
90
96
  gt_array_position = format.index("GT")
91
97
  gq_array_position = format.index("GQ")
92
98
  ad_array_position = format.index("AD")
93
99
  # dp = format.index("DP")
100
+ # columns_after_format = details[9..-1]
101
+ # samples = columns_after_format.select{|column_after_format| column_after_format.count(":") == number_of_colons_in_format}
102
+ # puts samples
94
103
  samples = details[9..-1]
104
+ if strains.empty?
105
+ samples.zip(potential_strain_names).each do |column_after_format, potential_strain_name|
106
+ if column_after_format.count(":") == number_of_colons_in_format
107
+ ss = Strain.new
108
+ ss.name = potential_strain_name
109
+ ss.save
110
+ strains << ss
111
+ end
112
+ end
113
+ end
95
114
 
96
115
  gts = []
97
116
  gqs = []
98
117
  ad_ratios = []
99
-
100
118
 
101
119
  next if samples.any?{|sample| sample =~ /\.\/\./} # no coverage in at least one sample
120
+
102
121
  samples.map do |sample|
103
122
  format_values = sample.split(":") # output (e.g.): ["0/0 ", "0,255,209", "99"]
104
- gt = format_values[gt_array_position] # e.g.
105
- gt = gt.split("/")
106
- next if gt.size > 1 && (gt.first != gt.last) # if its 0/1, 1/2 etc then ignore
107
- next if gt.first == "." # no coverage
108
- gt = gt.first.to_i
123
+ if gt_array_position
124
+ gt = format_values[gt_array_position] # e.g.
125
+ gt = gt.split("/")
126
+ next if gt.size > 1 && (gt.first != gt.last) # if its 0/1, 1/2 etc then ignore
127
+ next if gt.first == "." # no coverage
128
+ gt = gt.first.to_i
129
+ else
130
+ puts "GT field in the FORMAT section of the vcf file is missing....snp-search requires this field to assess the SNP quality.....sorry aborting"
131
+ exit
132
+ end
109
133
 
110
- gq = format_values[gq_array_position].to_f
134
+ if gq_array_position
135
+ gq = format_values[gq_array_position].to_f
136
+ else
137
+ puts "GQ field in the FORMAT section of the vcf file is missing....snp-search requires this field to assess the SNP quality.....sorry aborting"
138
+ exit
139
+ end
111
140
 
112
141
  if ad_array_position
113
142
  # If there is AD in vcf. Typically AD is Allele specific depth. i.e. if ref is 'A' and alt is 'G' and AD is '6,9' you got 6 A reads and 9 G reads.
@@ -117,12 +146,12 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
117
146
  sum_of_ad = ad.inject{|sum,x| sum + x }
118
147
  ad_ratios << ad[gt]/sum_of_ad.to_f
119
148
  end
120
-
149
+
121
150
  gqs << gq
122
151
  gts << gt
123
152
  end
124
-
125
- next if ad_ratios.any?{|ad_ratio| ad_ratio < cuttoff_ad.to_i} # exclude if any samples have a call ratio of less than a cuttoff set by user
153
+
154
+ next if ad_ratios.any?{|ad_ratio| ad_ratio < cutoff_ad.to_i} # exclude if any samples have a call ratio of less than a cutoff set by user
126
155
  if gts.size == samples.size # if some gts have been rejected due to heterozygote or no coverage
127
156
  good_snps +=1
128
157
 
@@ -143,6 +172,7 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
143
172
  s.reference_allele = ref_allele
144
173
  s.save
145
174
 
175
+
146
176
  snp_alleles = Array.new
147
177
  gts.uniq.select{|gt| gt > 0}.each do |gt|
148
178
  # create snp allele
@@ -4,7 +4,7 @@
4
4
 
5
5
  require 'output_information_methods'
6
6
 
7
- def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cuttoff_genotype, cuttoff_snp, tree, fasttree_path)
7
+ def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cutoff_genotype, cutoff_snp, tree, fasttree_path)
8
8
 
9
9
  strains = Strain.all
10
10
 
@@ -72,17 +72,18 @@ def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strain
72
72
  genotypes = snp.alleles.collect{|allele| allele.genotypes}.flatten
73
73
 
74
74
  snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}")
75
- # ignore snp if the snp qual is less than cuttoff.
76
- next if snp_qual.any?{|snps_quality| snps_quality.qual < cuttoff_snp.to_i}
75
+ # ignore snp if the snp qual is less than cutoff.
76
+ next if snp_qual.any?{|snps_quality| snps_quality.qual < cutoff_snp.to_i}
77
77
 
78
78
  next if alleles.any?{|allele| allele.base.length > 1} # indel
79
- next unless genotypes.all?{|genotype| genotype.geno_qual >= cuttoff_genotype} # all geno quals > cutoff
79
+ next unless genotypes.all?{|genotype| genotype.geno_qual >= cutoff_genotype.to_f} # all geno quals > cutoff
80
80
  # puts "#{i} SNPs processed so far" if i % 100 == 0
81
81
  strain_alleles = Hash.new
82
82
  strains.each do |strain|
83
83
  strain_genotype = genotypes.select{|genotype| genotype.strain_id == strain.id}.first
84
+ # next if strain_genotype == nil
85
+ puts strain_genotype.inspect
84
86
  strain_allele = alleles.select{|allele| allele.id == strain_genotype.allele_id}.first
85
-
86
87
  strain_alleles[strain.name] = strain_allele.base
87
88
  end
88
89
 
@@ -101,7 +102,7 @@ def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strain
101
102
 
102
103
  # If user has specified a tabular output
103
104
  if tabular_output
104
- output_information_methods(snps_array, output, cuttoff_genotype, cuttoff_snp, true)
105
+ output_information_methods(snps_array, output, cutoff_genotype, cutoff_snp, true)
105
106
  # If user has specified a fasta output
106
107
  elsif fasta_output
107
108
  # generate FASTA file
@@ -13,21 +13,21 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
13
13
  total_number_of_syn_snps = 0
14
14
  total_number_of_non_syn_snps = 0
15
15
  total_number_of_pseudo = 0
16
- snps.each do |snp|
17
16
 
17
+ snps.each do |snp|
18
18
  ActiveRecord::Base.transaction do
19
19
  snp.alleles.each do |allele|
20
20
  next if snp.alleles.any?{|allele| allele.base.length > 1} # indel
21
21
  if allele.id != snp.reference_allele_id
22
-
22
+
23
23
  # get annotation (if there is any) for each SNP
24
24
  features = Feature.joins(:snps).where("snps.id = ?", snp.id)
25
25
 
26
26
  # get snp quality for each snp
27
27
  snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}")
28
- next if snp_qual.any?{|snps_quality| snps_quality.qual < cuttoff_snp.to_i}
29
28
  # ignore snp if the snp qual is less than cutoff.
30
- # next if snp.snp_qual < cuttoff_snp.to_i
29
+ next if snp_qual.any?{|snps_quality| snps_quality.qual < cuttoff_snp.to_i}
30
+
31
31
 
32
32
  # get all genotype qualities for each snp.
33
33
  gqs = Genotype.find_by_sql("select geno_qual from genotypes inner join alleles on alleles.id = genotypes.allele_id inner join snps on snps.id = alleles.snp_id where snps.id = #{snp.id}")
@@ -37,13 +37,12 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
37
37
  ref_base = Bio::Sequence.auto(Allele.find(snp.reference_allele_id).base)
38
38
  snp_base = Bio::Sequence.auto(allele.base)
39
39
  # count snps now: after you have selected the snps with gqs and snp_qual greater than the threshold.
40
- snps_counter += 1
40
+ snps_counter += 1
41
41
  # If the feature is empty then just output basic information about the snp.
42
42
 
43
43
  if features.empty?
44
44
  outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}"
45
-
46
- else
45
+ else
47
46
  features.each do |feature|
48
47
  if feature.name == "CDS"
49
48
 
@@ -91,10 +90,14 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
91
90
  non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"]
92
91
 
93
92
  # Get alleles for each strain
94
- alleles_array = []
93
+ bases_from_alleles = []
95
94
  strains.each do |strain|
95
+
96
96
  allele_for_strains = Allele.joins(:genotypes => :strain).where("strains.id = ? AND alleles.snp_id = ?", strain.id, snp.id).first
97
- alleles_array << allele_for_strains.base
97
+ # allele_for_strains = Allele.find_by_sql("select * from alleles inner join genotypes on genotypes.allele_id = alleles.id inner join strains on strains.id = genotypes.strain_id where strains.id = #{strain.id} and alleles.snp_id = #{snp.id}")
98
+ puts allele_for_strains.inspect
99
+ # next if bases_from_alleles.empty?
100
+ bases_from_alleles << allele_for_strains.base
98
101
  end
99
102
 
100
103
  # If there is no difference between the amino acids then it's a synonymous SNP; if they differ then it's non-synonymous.
@@ -102,9 +105,9 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
102
105
  total_number_of_syn_snps +=1
103
106
  if mutated_seq_translated_clean =~ /\*/
104
107
  total_number_of_pseudo +=1
105
- outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tYes\tN/A\tN/A\tN/A\tN/A\tN/A\t#{alleles_array.join("\t") if info}"
108
+ outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tYes\tN/A\tN/A\tN/A\tN/A\tN/A\t#{bases_from_alleles.join("\t") if info}"
106
109
  else
107
- outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tNo\tN/A\tN/A\tN/A\tN/A\tN/A\t#{alleles_array.join("\t") if info}"
110
+ outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tNo\tN/A\tN/A\tN/A\tN/A\tN/A\t#{bases_from_alleles.join("\t") if info}"
108
111
  end
109
112
  else
110
113
  total_number_of_non_syn_snps +=1
@@ -112,21 +115,21 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
112
115
 
113
116
  if mutated_seq_translated_clean =~ /\*/
114
117
  total_number_of_pseudo +=1
115
- outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t") if info}"
118
+ outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{bases_from_alleles.join("\t") if info}"
116
119
  else
117
- outfile.puts "#{snp.ref_pos-1}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tNo\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t") if info}"
120
+ outfile.puts "#{snp.ref_pos-1}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tNo\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{bases_from_alleles.join("\t") if info}"
118
121
  end
119
122
  end
120
123
  end
121
124
  end
122
125
  end
123
126
  end
127
+ puts "Total SNPs added so far: #{snps_counter}" if snps_counter % 100 == 0
124
128
  end
125
129
  end
126
- puts "Total SNPs added so far: #{snps_counter}" if snps_counter % 100 == 0
127
130
  end
128
131
  end
129
- puts "Total number of snps: #{snps_counter}"
132
+ puts "Total number of snps: #{snps_counter} with Genotype quality cutoff at #{cuttoff_genotype} and SNP quality cutoff at #{cuttoff_snp}"
130
133
  puts "Total number of snps in CDS region: #{cds_snps_counter}"
131
134
  puts "Total number of synonymous SNPs: #{total_number_of_syn_snps}"
132
135
  puts "Total number of non-synonymous SNPs: #{total_number_of_non_syn_snps}"
@@ -7,7 +7,7 @@ require 'create_methods'
7
7
  require 'filter_ignore_snps_methods'
8
8
  require 'output_information_methods'
9
9
 
10
- def find_unqiue_snps(strain_names, out, cuttoff_genotype, cuttoff_snp)
10
+ def find_unqiue_snps(strain_names, out, cutoff_genotype, cutoff_snp)
11
11
 
12
12
  *strain_names = strain_names
13
13
 
@@ -15,24 +15,25 @@ def find_unqiue_snps(strain_names, out, cuttoff_genotype, cuttoff_snp)
15
15
 
16
16
  outfile = File.open(out, "w")
17
17
 
18
- snps = Snp.find_by_sql("SELECT snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND genotypes.geno_qual >= #{cuttoff_genotype} AND snps.qual >= #{cuttoff_snp} AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}")
18
+ snps = Snp.find_by_sql("SELECT snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND genotypes.geno_qual >= #{cutoff_genotype} AND snps.qual >= #{cutoff_snp} AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}")
19
19
  # puts "The number of unique snps are #{snps.size}"
20
20
 
21
- output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, false)
21
+ output_information_methods(snps, outfile, cutoff_genotype, cutoff_snp, false)
22
22
  end
23
23
 
24
24
 
25
- def information(out, cuttoff_genotype, cuttoff_snp)
25
+ def information(out, cutoff_genotype, cutoff_snp)
26
26
 
27
27
  puts "outputting SNP info....."
28
28
 
29
29
  strains = Strain.all
30
30
 
31
- snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id where alleles.id <> snps.reference_allele_id")
31
+ # snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id where alleles.id <> snps.reference_allele_id")
32
32
 
33
+ snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id")
33
34
  outfile = File.open(out, "w")
34
35
 
35
- output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, true)
36
+ output_information_methods(snps, outfile, cutoff_genotype, cutoff_snp, true)
36
37
 
37
38
  end
38
39