snp-search 2.10.8 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -106,13 +106,13 @@ Alternatively, you may download a SQL tool to view your database (e.g. SQLite so
106
106
  == Contact
107
107
 
108
108
  If you have any comments, questions or suggestions, please email
109
- ali.al-shahib@hpa.org.uk
109
+ ali.al-shahib@phe.gov.uk
110
110
  or
111
- anthony.underwood@hpa.org.uk
111
+ anthony.underwood@phe.gov.uk
112
112
 
113
113
  Have fun snp-searching!
114
114
 
115
115
  == Copyright
116
116
 
117
117
  Copyright (c) 2012 Ali Al-Shahib. See LICENSE.txt for
118
- further details.
118
+ further details.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.10.8
1
+ 2.11.0
@@ -3,6 +3,8 @@ require 'snp_db_connection'
3
3
  require 'snp_db_models'
4
4
  require 'snp_db_schema'
5
5
  require 'output_information_methods'
6
+ require 'create_methods'
7
+ require 'filter_ignore_snps_methods.rb'
6
8
  require 'activerecord-import'
7
9
  require 'slop'
8
10
 
@@ -24,7 +26,7 @@ opts = Slop.parse do
24
26
  on :r, :reference_file=, 'Reference genome file, in gbk or embl file format, Required', true
25
27
  on :v, :vcf_file=, 'variant call format (vcf) file, Required', true
26
28
  on :d, :name_of_database=, 'Name of database, Required'
27
- on :A, :cuttoff_ad=, 'AD ratio cutoff (default 0.9)', :as => :int, :default => 0.9
29
+ on :A, :cutoff_ad=, 'AD ratio cutoff (default 0.9)', :as => :int, :default => 0.9
28
30
 
29
31
  separator ''
30
32
 
@@ -33,8 +35,8 @@ opts = Slop.parse do
33
35
  on :f, :all_or_filtered_snps, 'ignore SNPs from specified features in the database (if you do not want to ignore any SNPs, just use this option with -F/T -o)'
34
36
  on :F, :fasta, 'output fasta file format (default)'
35
37
  on :T, :tabular, 'output tabular file format'
36
- on :c, :cuttoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
37
- on :g, :cuttoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
38
+ on :c, :cutoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
39
+ on :g, :cutoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
38
40
  on :R, :remove_non_informative_snps, 'Only output informative SNPs.'
39
41
  on :e, :ignore_snps_in_range=, 'A list of position ranges to ignore e.g 10..500,2000..2500.'
40
42
  on :a, :ignore_strains=, 'A list of strains to ignore (seperate by comma e.g. S1,S4,S8 ).'
@@ -47,8 +49,8 @@ opts = Slop.parse do
47
49
  separator '-output -unique_snps -d db.sqlite3 -s strains.txt -o unique_snps.txt [options]'
48
50
  separator 'e.g. snp-search -O -u -d ecoli.sqlite3 -s strains_list_for_unique_snps.txt -o ecoli_unique_snps_strains.txt'
49
51
  on :u, :unique_snps, 'Query for unique snps in the database'
50
- on :c, :cuttoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
51
- on :g, :cuttoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
52
+ on :c, :cutoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
53
+ on :g, :cutoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
52
54
  on :s, :strain=, 'The strains/samples you like to query (only used with -unique_snps flag)'
53
55
  on :o, :out=, 'Name of output file, Required'
54
56
 
@@ -57,8 +59,8 @@ opts = Slop.parse do
57
59
  separator '-output -info -d db.sqlite3 -o info.txt [options]'
58
60
  separator ''
59
61
  on :i, :info, 'Output various information about SNPs'
60
- on :c, :cuttoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
61
- on :g, :cuttoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
62
+ on :c, :cutoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
63
+ on :g, :cutoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
62
64
  on :o, :out=, 'Name of output file, Required'
63
65
  end
64
66
 
@@ -68,7 +70,7 @@ end
68
70
  if opts[:create]
69
71
 
70
72
  # raise "Please provide a database file name" if opts[:reference_file].empty?
71
- # puts opts[:cuttoff_snp_qual].to_i
73
+ # puts opts[:cutoff_snp_qual].to_i
72
74
 
73
75
  error_msg = ""
74
76
 
@@ -122,7 +124,7 @@ if opts[:create]
122
124
 
123
125
  # The populate_snps_alleles_genotypes method populates the snps, alleles and genotypes. It uses the vcf file and the AD ratio cutoff (-A option).
124
126
 
125
- populate_snps_alleles_genotypes(opts[:vcf_file], opts[:cuttoff_ad])
127
+ populate_snps_alleles_genotypes(opts[:vcf_file], opts[:cutoff_ad])
126
128
 
127
129
  ###########################################################
128
130
 
@@ -143,6 +145,7 @@ elsif opts[:output]
143
145
 
144
146
  error_msg = ""
145
147
 
148
+
146
149
  error_msg += "-d: \t Name of your database\n" unless opts[:name_of_database]
147
150
  error_msg += "-o: \t name of your output file\n" unless opts[:out]
148
151
  error_msg += "-F: \t Fasta output OR\n-T: \t Tabular output" unless opts[:fasta] || opts[:tabular]
@@ -152,8 +155,8 @@ elsif opts[:output]
152
155
  error_msg_optional += "-I,\t --ignore_snps_on_annotation: The name of the feature(s) to ignore. Features should be seperated by comma (e.g. phages,inserstion,transposons)\n" unless opts[:ignore_snps_on_annotation]
153
156
  error_msg_optional += "-a,\t --ignore_strains: A list of strains to ignore\n" unless opts[:ignore_strains]
154
157
  error_msg_optional += "-e,\t --ignore_snps_in_range: A list of position ranges to ignore e.g 10..500,2000..2500\n" unless opts[:ignore_snps_in_range]
155
- error_msg_optional += "-c,\t --cuttoff_snp_qual: cuttoff for SNP Quality\n" unless opts[:cuttoff_snp_qual]
156
- error_msg_optional += "-g,\t --cuttoff_genotype: cuttoff for Genotype Quality\n" unless opts[:cuttoff_genotype]
158
+ error_msg_optional += "-c,\t --cutoff_snp_qual: cutoff for SNP Quality\n" unless opts[:cutoff_snp_qual]
159
+ error_msg_optional += "-g,\t --cutoff_genotype: cutoff for Genotype Quality\n" unless opts[:cutoff_genotype]
157
160
  error_msg_optional += "-R,\t --remove_non_informative_snps: Only output informative SNPs\n" unless opts[:remove_non_informative_snps]
158
161
  error_msg_optional += "-t,\t --tree: Construct tree from output\n" unless opts[:tree]
159
162
 
@@ -163,8 +166,8 @@ elsif opts[:output]
163
166
  puts "Optional fields:"
164
167
  puts error_msg_optional
165
168
  # Added here because these options won't appear in error_msg_optional: they always have a default value set.
166
- puts "-c,\t --cuttoff_snp_qual: cuttoff for SNP Quality (default 90)\n"
167
- puts "-g,\t --cuttoff_genotype: cuttoff for Genotype Quality (default 30)\n"
169
+ puts "-c,\t --cutoff_snp_qual: cutoff for SNP Quality (default 90)\n"
170
+ puts "-g,\t --cutoff_genotype: cutoff for Genotype Quality (default 30)\n"
168
171
  # puts opts.help unless opts.empty?
169
172
  exit
170
173
  end
@@ -173,7 +176,7 @@ elsif opts[:output]
173
176
 
174
177
  establish_connection(opts[:name_of_database])
175
178
 
176
- get_snps(opts[:out], opts[:ignore_snps_on_annotation], opts[:ignore_snps_in_range], opts[:ignore_strains], opts[:remove_non_informative_snps], opts[:fasta], opts[:tabular], opts[:cuttoff_genotype], opts[:cuttoff_snp_qual], opts[:tree], opts[:fasttree_path])
179
+ get_snps(opts[:out], opts[:ignore_snps_on_annotation], opts[:ignore_snps_in_range], opts[:ignore_strains], opts[:remove_non_informative_snps], opts[:fasta], opts[:tabular], opts[:cutoff_genotype], opts[:cutoff_snp_qual], opts[:tree], opts[:fasttree_path])
177
180
  end
178
181
 
179
182
  ####################################################################################################
@@ -191,8 +194,8 @@ elsif opts[:output]
191
194
  puts error_msg
192
195
  puts "Optional fields:"
193
196
  # Added here because these options won't appear in error_msg_optional: they always have a default value set.
194
- puts "-c,\t --cuttoff_snp_qual: cuttoff for SNP Quality (default 90)\n"
195
- puts "-g,\t --cuttoff_genotype: cuttoff for Genotype Quality (default 30)\n"
197
+ puts "-c,\t --cutoff_snp_qual: cutoff for SNP Quality (default 90)\n"
198
+ puts "-g,\t --cutoff_genotype: cutoff for Genotype Quality (default 30)\n"
196
199
  # puts opts.help unless opts.empty?
197
200
  exit
198
201
  end
@@ -207,7 +210,7 @@ elsif opts[:output]
207
210
  strains << line.chop
208
211
  end
209
212
  # find_unique_snps defined in bin/snp-search.rb
210
- find_unqiue_snps(strains, opts[:out], opts[:cuttoff_genotype], opts[:cuttoff_snp_qual])
213
+ find_unqiue_snps(strains, opts[:out], opts[:cutoff_genotype], opts[:cutoff_snp_qual])
211
214
  end
212
215
 
213
216
  ##############################################################
@@ -223,8 +226,8 @@ elsif opts[:output]
223
226
  puts error_msg
224
227
  puts "Optional fields:"
225
228
  # Added here because these options won't appear in error_msg_optional: they always have a default value set.
226
- puts "-c,\t --cuttoff_snp_qual: cuttoff for SNP Quality (default 90)\n"
227
- puts "-g,\t --cuttoff_genotype: cuttoff for Genotype Quality (default 30)\n"
229
+ puts "-c,\t --cutoff_snp_qual: cutoff for SNP Quality (default 90)\n"
230
+ puts "-g,\t --cutoff_genotype: cutoff for Genotype Quality (default 30)\n"
228
231
  # puts opts.help unless opts.empty?
229
232
  exit
230
233
  end
@@ -234,7 +237,7 @@ elsif opts[:output]
234
237
  establish_connection(opts[:name_of_database])
235
238
 
236
239
  #information defined in bin/snp-search.rb
237
- information(opts[:out], opts[:cuttoff_genotype], opts[:cuttoff_snp_qual])
240
+ information(opts[:out], opts[:cutoff_genotype], opts[:cutoff_snp_qual])
238
241
 
239
242
  end
240
243
 
@@ -50,9 +50,10 @@ def populate_features_and_annotations(sequence_file)
50
50
  end
51
51
 
52
52
  #This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
53
- def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
53
+ def populate_snps_alleles_genotypes(vcf_file, cutoff_ad)
54
54
 
55
55
  puts "Adding SNPs........"
56
+
56
57
  # open vcf file and parse each line
57
58
  File.open(vcf_file) do |f|
58
59
  # header names
@@ -60,20 +61,23 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
60
61
  if line =~ /CHROM/
61
62
  line.chomp!
62
63
  column_headings = line.split("\t")
63
- strain_names = column_headings[9..-1]
64
- strain_names.map!{|name| name.sub(/\..*/, '')}
64
+ potential_strain_names = column_headings[9..-1]
65
+ potential_strain_names.map!{|name| name.sub(/\..*/, '')}
66
+ # strain_names = column_headings[9..-1]
67
+ # strain_names.map!{|name| name.sub(/\..*/, '')}
68
+ strains = Array.new
65
69
 
66
- strain_names.each do |str|
67
- ss = Strain.new
68
- ss.name = str
69
- ss.save
70
- end
70
+ # strain_names.each do |str|
71
+ # ss = Strain.new
72
+ # ss.name = str
73
+ # ss.save
74
+ # end
71
75
 
72
- strains = Array.new
73
- strain_names.each do |strain_name|
74
- strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
75
- strains << strain
76
- end
76
+ # strains = Array.new
77
+ # strain_names.each do |strain_name|
78
+ # strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
79
+ # strains << strain
80
+ # end
77
81
 
78
82
  good_snps = 0
79
83
  # start parsing snps
@@ -86,28 +90,53 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
86
90
  ref_base = details[3]
87
91
  snp_bases = details[4].split(",")
88
92
  snp_qual = details [5]
93
+ number_of_colons_in_format = details[8].count ":"
89
94
  format = details[8].split(":")
95
+
90
96
  gt_array_position = format.index("GT")
91
97
  gq_array_position = format.index("GQ")
92
98
  ad_array_position = format.index("AD")
93
99
  # dp = format.index("DP")
100
+ # columns_after_format = details[9..-1]
101
+ # samples = columns_after_format.select{|column_after_format| column_after_format.count(":") == number_of_colons_in_format}
102
+ # puts samples
94
103
  samples = details[9..-1]
104
+ if strains.empty?
105
+ samples.zip(potential_strain_names).each do |column_after_format, potential_strain_name|
106
+ if column_after_format.count(":") == number_of_colons_in_format
107
+ ss = Strain.new
108
+ ss.name = potential_strain_name
109
+ ss.save
110
+ strains << ss
111
+ end
112
+ end
113
+ end
95
114
 
96
115
  gts = []
97
116
  gqs = []
98
117
  ad_ratios = []
99
-
100
118
 
101
119
  next if samples.any?{|sample| sample =~ /\.\/\./} # no coverage in at least one sample
120
+
102
121
  samples.map do |sample|
103
122
  format_values = sample.split(":") # output (e.g.): ["0/0 ", "0,255,209", "99"]
104
- gt = format_values[gt_array_position] # e.g.
105
- gt = gt.split("/")
106
- next if gt.size > 1 && (gt.first != gt.last) # if its 0/1, 1/2 etc then ignore
107
- next if gt.first == "." # no coverage
108
- gt = gt.first.to_i
123
+ if gt_array_position
124
+ gt = format_values[gt_array_position] # e.g.
125
+ gt = gt.split("/")
126
+ next if gt.size > 1 && (gt.first != gt.last) # if its 0/1, 1/2 etc then ignore
127
+ next if gt.first == "." # no coverage
128
+ gt = gt.first.to_i
129
+ else
130
+ puts "GT field in the FORMAT section of the vcf file is missing....snp-search requires this field to assess the SNP quality.....sorry aborting"
131
+ exit
132
+ end
109
133
 
110
- gq = format_values[gq_array_position].to_f
134
+ if gq_array_position
135
+ gq = format_values[gq_array_position].to_f
136
+ else
137
+ puts "GQ field in the FORMAT section of the vcf file is missing....snp-search requires this field to assess the SNP quality.....sorry aborting"
138
+ exit
139
+ end
111
140
 
112
141
  if ad_array_position
113
142
  # If there is AD in vcf. Typically AD is Allele specific depth. i.e. if ref is 'A' and alt is 'G' and AD is '6,9' you got 6 A reads and 9 G reads.
@@ -117,12 +146,12 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
117
146
  sum_of_ad = ad.inject{|sum,x| sum + x }
118
147
  ad_ratios << ad[gt]/sum_of_ad.to_f
119
148
  end
120
-
149
+
121
150
  gqs << gq
122
151
  gts << gt
123
152
  end
124
-
125
- next if ad_ratios.any?{|ad_ratio| ad_ratio < cuttoff_ad.to_i} # exclude if any samples have a call ratio of less than a cuttoff set by user
153
+
154
+ next if ad_ratios.any?{|ad_ratio| ad_ratio < cutoff_ad.to_i} # exclude if any samples have a call ratio of less than a cutoff set by user
126
155
  if gts.size == samples.size # if some gts have been rejected due to heterozygote or no coverage
127
156
  good_snps +=1
128
157
 
@@ -143,6 +172,7 @@ def populate_snps_alleles_genotypes(vcf_file, cuttoff_ad)
143
172
  s.reference_allele = ref_allele
144
173
  s.save
145
174
 
175
+
146
176
  snp_alleles = Array.new
147
177
  gts.uniq.select{|gt| gt > 0}.each do |gt|
148
178
  # create snp allele
@@ -4,7 +4,7 @@
4
4
 
5
5
  require 'output_information_methods'
6
6
 
7
- def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cuttoff_genotype, cuttoff_snp, tree, fasttree_path)
7
+ def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cutoff_genotype, cutoff_snp, tree, fasttree_path)
8
8
 
9
9
  strains = Strain.all
10
10
 
@@ -72,17 +72,18 @@ def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strain
72
72
  genotypes = snp.alleles.collect{|allele| allele.genotypes}.flatten
73
73
 
74
74
  snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}")
75
- # ignore snp if the snp qual is less than cuttoff.
76
- next if snp_qual.any?{|snps_quality| snps_quality.qual < cuttoff_snp.to_i}
75
+ # ignore snp if the snp qual is less than cutoff.
76
+ next if snp_qual.any?{|snps_quality| snps_quality.qual < cutoff_snp.to_i}
77
77
 
78
78
  next if alleles.any?{|allele| allele.base.length > 1} # indel
79
- next unless genotypes.all?{|genotype| genotype.geno_qual >= cuttoff_genotype} # all geno quals > cutoff
79
+ next unless genotypes.all?{|genotype| genotype.geno_qual >= cutoff_genotype.to_f} # all geno quals > cutoff
80
80
  # puts "#{i} SNPs processed so far" if i % 100 == 0
81
81
  strain_alleles = Hash.new
82
82
  strains.each do |strain|
83
83
  strain_genotype = genotypes.select{|genotype| genotype.strain_id == strain.id}.first
84
+ # next if strain_genotype == nil
85
+ puts strain_genotype.inspect
84
86
  strain_allele = alleles.select{|allele| allele.id == strain_genotype.allele_id}.first
85
-
86
87
  strain_alleles[strain.name] = strain_allele.base
87
88
  end
88
89
 
@@ -101,7 +102,7 @@ def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strain
101
102
 
102
103
  # If user has specified a tabular output
103
104
  if tabular_output
104
- output_information_methods(snps_array, output, cuttoff_genotype, cuttoff_snp, true)
105
+ output_information_methods(snps_array, output, cutoff_genotype, cutoff_snp, true)
105
106
  # If user has specified a fasta output
106
107
  elsif fasta_output
107
108
  # generate FASTA file
@@ -13,21 +13,21 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
13
13
  total_number_of_syn_snps = 0
14
14
  total_number_of_non_syn_snps = 0
15
15
  total_number_of_pseudo = 0
16
- snps.each do |snp|
17
16
 
17
+ snps.each do |snp|
18
18
  ActiveRecord::Base.transaction do
19
19
  snp.alleles.each do |allele|
20
20
  next if snp.alleles.any?{|allele| allele.base.length > 1} # indel
21
21
  if allele.id != snp.reference_allele_id
22
-
22
+
23
23
  # get annotation (if there is any) for each SNP
24
24
  features = Feature.joins(:snps).where("snps.id = ?", snp.id)
25
25
 
26
26
  # get snp quality for each snp
27
27
  snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}")
28
- next if snp_qual.any?{|snps_quality| snps_quality.qual < cuttoff_snp.to_i}
29
28
  # ignore snp if the snp qual is less than cuttoff.
30
- # next if snp.snp_qual < cuttoff_snp.to_i
29
+ next if snp_qual.any?{|snps_quality| snps_quality.qual < cuttoff_snp.to_i}
30
+
31
31
 
32
32
  # get all genotype qualities for each snp.
33
33
  gqs = Genotype.find_by_sql("select geno_qual from genotypes inner join alleles on alleles.id = genotypes.allele_id inner join snps on snps.id = alleles.snp_id where snps.id = #{snp.id}")
@@ -37,13 +37,12 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
37
37
  ref_base = Bio::Sequence.auto(Allele.find(snp.reference_allele_id).base)
38
38
  snp_base = Bio::Sequence.auto(allele.base)
39
39
  # count snps now: after you have selected the snps with gqs and snp_qual greater than the threshold.
40
- snps_counter += 1
40
+ snps_counter += 1
41
41
  # If the feature is empty then just output basic information about the snp.
42
42
 
43
43
  if features.empty?
44
44
  outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}"
45
-
46
- else
45
+ else
47
46
  features.each do |feature|
48
47
  if feature.name == "CDS"
49
48
 
@@ -91,10 +90,14 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
91
90
  non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"]
92
91
 
93
92
  # Get alleles for each strain
94
- alleles_array = []
93
+ bases_from_alleles = []
95
94
  strains.each do |strain|
95
+
96
96
  allele_for_strains = Allele.joins(:genotypes => :strain).where("strains.id = ? AND alleles.snp_id = ?", strain.id, snp.id).first
97
- alleles_array << allele_for_strains.base
97
+ # allele_for_strains = Allele.find_by_sql("select * from alleles inner join genotypes on genotypes.allele_id = alleles.id inner join strains on strains.id = genotypes.strain_id where strains.id = #{strain.id} and alleles.snp_id = #{snp.id}")
98
+ puts allele_for_strains.inspect
99
+ # next if bases_from_alleles.empty?
100
+ bases_from_alleles << allele_for_strains.base
98
101
  end
99
102
 
100
103
  # If no difference between the amino acids then its synonymous SNP, if different then its non-synonymous.
@@ -102,9 +105,9 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
102
105
  total_number_of_syn_snps +=1
103
106
  if mutated_seq_translated_clean =~ /\*/
104
107
  total_number_of_pseudo +=1
105
- outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tYes\tN/A\tN/A\tN/A\tN/A\tN/A\t#{alleles_array.join("\t") if info}"
108
+ outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tYes\tN/A\tN/A\tN/A\tN/A\tN/A\t#{bases_from_alleles.join("\t") if info}"
106
109
  else
107
- outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tNo\tN/A\tN/A\tN/A\tN/A\tN/A\t#{alleles_array.join("\t") if info}"
110
+ outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tNo\tN/A\tN/A\tN/A\tN/A\tN/A\t#{bases_from_alleles.join("\t") if info}"
108
111
  end
109
112
  else
110
113
  total_number_of_non_syn_snps +=1
@@ -112,21 +115,21 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
112
115
 
113
116
  if mutated_seq_translated_clean =~ /\*/
114
117
  total_number_of_pseudo +=1
115
- outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t") if info}"
118
+ outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{bases_from_alleles.join("\t") if info}"
116
119
  else
117
- outfile.puts "#{snp.ref_pos-1}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tNo\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t") if info}"
120
+ outfile.puts "#{snp.ref_pos-1}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tNo\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{bases_from_alleles.join("\t") if info}"
118
121
  end
119
122
  end
120
123
  end
121
124
  end
122
125
  end
123
126
  end
127
+ puts "Total SNPs added so far: #{snps_counter}" if snps_counter % 100 == 0
124
128
  end
125
129
  end
126
- puts "Total SNPs added so far: #{snps_counter}" if snps_counter % 100 == 0
127
130
  end
128
131
  end
129
- puts "Total number of snps: #{snps_counter}"
132
+ puts "Total number of snps: #{snps_counter} with Genotype quality cutoff at #{cuttoff_genotype} and SNP quality cutoff at #{cuttoff_snp}"
130
133
  puts "Total number of snps in CDS region: #{cds_snps_counter}"
131
134
  puts "Total number of synonymous SNPs: #{total_number_of_syn_snps}"
132
135
  puts "Total number of non-synonymous SNPs: #{total_number_of_non_syn_snps}"
@@ -7,7 +7,7 @@ require 'create_methods'
7
7
  require 'filter_ignore_snps_methods'
8
8
  require 'output_information_methods'
9
9
 
10
- def find_unqiue_snps(strain_names, out, cuttoff_genotype, cuttoff_snp)
10
+ def find_unqiue_snps(strain_names, out, cutoff_genotype, cutoff_snp)
11
11
 
12
12
  *strain_names = strain_names
13
13
 
@@ -15,24 +15,25 @@ def find_unqiue_snps(strain_names, out, cuttoff_genotype, cuttoff_snp)
15
15
 
16
16
  outfile = File.open(out, "w")
17
17
 
18
- snps = Snp.find_by_sql("SELECT snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND genotypes.geno_qual >= #{cuttoff_genotype} AND snps.qual >= #{cuttoff_snp} AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}")
18
+ snps = Snp.find_by_sql("SELECT snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND genotypes.geno_qual >= #{cutoff_genotype} AND snps.qual >= #{cutoff_snp} AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}")
19
19
  # puts "The number of unique snps are #{snps.size}"
20
20
 
21
- output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, false)
21
+ output_information_methods(snps, outfile, cutoff_genotype, cutoff_snp, false)
22
22
  end
23
23
 
24
24
 
25
- def information(out, cuttoff_genotype, cuttoff_snp)
25
+ def information(out, cutoff_genotype, cutoff_snp)
26
26
 
27
27
  puts "outputting SNP info....."
28
28
 
29
29
  strains = Strain.all
30
30
 
31
- snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id where alleles.id <> snps.reference_allele_id")
31
+ # snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id where alleles.id <> snps.reference_allele_id")
32
32
 
33
+ snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id")
33
34
  outfile = File.open(out, "w")
34
35
 
35
- output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, true)
36
+ output_information_methods(snps, outfile, cutoff_genotype, cutoff_snp, true)
36
37
 
37
38
  end
38
39