snp-search 0.26.0 → 0.27.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -31,11 +31,10 @@ Once you have these files ready, you may run snp-search with the following optio
31
31
  -n Name of your database Optional, default = snp_db.sqlite3
32
32
  -v .vcf file Required
33
33
  -r Reference genome file (The same file that was used in generating the .vcf file). This should be in genbank or embl format. Required
34
- -c SNP quality cutoff. A phred-scaled quality score. High quality scores indicate high confidence calls. Optional, default = 90
35
- -t Genotype Quality cutoff. This is the probability that the genotype call is wrong under the condition that the site is being variant. Optional, default = 30
34
+ -c SNP quality score cutoff. A Phred-scaled quality score. High quality scores indicate high confidence calls. Optional, default = 90 (out of 100)
35
+ -t Genotype quality score cutoff. A Phred-scaled quality score indicating the confidence that the genotype call is correct. Optional, default = 30
36
36
  -h help message
37
37
 
38
-
39
38
  Usage:
40
39
  snp-search -n my_snp_db.sqlite3 -r my_ref.gbk -v my_vcf_file.vcf
41
40
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.26.0
1
+ 0.27.0
data/bin/snp-search CHANGED
@@ -6,6 +6,7 @@ require 'snp_db_schema'
6
6
  gem "slop", "~> 2.4.0"
7
7
  require 'slop'
8
8
 
9
+ begin
9
10
  opts = Slop.new :help do
10
11
  banner "ruby snp-search [OPTIONS]"
11
12
 
@@ -16,9 +17,6 @@ opts = Slop.new :help do
16
17
  on :c, :cuttoff_snp=, 'SNP quality cutoff, (default = 90)', :default => 90
17
18
  on :t, :cuttoff_genotype=, 'Genotype quality cutoff (default = 30)', :default => 30
18
19
 
19
- on_empty do
20
- puts help
21
- end
22
20
  end
23
21
  opts.parse
24
22
 
@@ -30,7 +28,7 @@ opts.parse
30
28
 
31
29
  unless error_msg == ""
32
30
  puts error_msg
33
- puts opts.help unless opts.empty?
31
+ puts opts.help unless opts.empty?
34
32
  exit
35
33
  end
36
34
 
@@ -44,12 +42,12 @@ opts.parse
44
42
  rescue
45
43
  end
46
44
 
47
- begin
45
+
48
46
  # Enter the name of your database
49
47
  establish_connection(opts[:name])
50
48
 
51
49
  # Schema will run here
52
- db_schema
50
+ #db_schema
53
51
 
54
52
  ref = opts[:reference_file]
55
53
 
@@ -70,7 +68,7 @@ sequence_format = guess_sequence_format(ref)
70
68
  vcf_mpileup_file = opts[:vcf_file]
71
69
 
72
70
  # The populate_features_and_annotations method populates the features and annotations. It uses the embl/gbk file.
73
- # populate_features_and_annotations(sequence_flatfile)
71
+ populate_features_and_annotations(sequence_flatfile)
74
72
 
75
73
  #The populate_snps_alleles_genotypes method populates the snps, alleles and genotypes. It uses the vcf file, and if specified, the SNP quality cutoff and genotype quality cutoff
76
74
  populate_snps_alleles_genotypes(vcf_mpileup_file, opts[:cuttoff_snp].to_i, opts[:cuttoff_genotype].to_i)
data/lib/snp-search.rb CHANGED
@@ -3,7 +3,6 @@ gem "bio", "~> 1.4.2"
3
3
  require 'bio'
4
4
  require 'snp_db_models'
5
5
  require 'activerecord-import'
6
- #establish_connection
7
6
 
8
7
  def guess_sequence_format(reference_genome)
9
8
  file_extension = File.extname(reference_genome).downcase
@@ -20,148 +19,142 @@ end
20
19
  # A method to populate the database with the features (genes etc) and the annotations from the embl file.
21
20
  # We include all features that are not 'source' or 'gene' as they are repetitive info. 'CDS' is the gene.
22
21
  # The annotation table also includes the start and end coordinates of the CDS, as well as the strand. The 'locations' method is defined in BioRuby under GenBank, so 'bio' must be required at the top.
23
- # Also, the qualifier and value are extracted from the embl file and added to the database.
24
- # def populate_features_and_annotations(sequence_file)
25
- # ActiveRecord::Base.transaction do
26
- # sequence_file.features.each do |feature|
27
- # unless feature.feature == "source" || feature.feature == "gene"
28
- # db_feature = Feature.new
29
- # db_feature.start = feature.locations.first.from
30
- # db_feature.end = feature.locations.first.to
31
- # db_feature.strand = feature.locations.first.strand
32
- # db_feature.name = feature.feature
33
- # db_feature.save
34
- # puts "populating #{db_feature.name}, start: #{db_feature.start}, end: #{db_feature.end}, strand: #{db_feature.strand} for feature: #{db_feature.id}"
35
- # # Populate the Annotation table with qualifier information from the genbank file
36
- # feature.qualifiers.each do |qualifier|
37
- # a = Annotation.new
38
- # a.qualifier = qualifier.qualifier
39
- # a.value = qualifier.value
40
- # a.save
41
- # db_feature.annotations << a
42
- # puts "populating #{a.qualifier} for feature: #{db_feature.id}"
43
- # end
44
- # end
45
- # end
46
- # end
47
- # end
22
+ #Also, the qualifier and value are extracted from the embl file and added to the database.
23
+ def populate_features_and_annotations(sequence_file)
24
+ puts "Adding features and their annotations...."
25
+ ActiveRecord::Base.transaction do
26
+ counter = 0
27
+ sequence_file.features.each do |feature|
28
+ counter += 1
29
+ puts "Total number of features and annotations added: #{counter}" if counter % 100 == 0
30
+ unless feature.feature == "source" || feature.feature == "gene"
31
+ db_feature = Feature.new
32
+ db_feature.start = feature.locations.first.from
33
+ db_feature.end = feature.locations.first.to
34
+ db_feature.strand = feature.locations.first.strand
35
+ db_feature.name = feature.feature
36
+ db_feature.save
37
+ # Populate the Annotation table with qualifier information from the genbank file
38
+ feature.qualifiers.each do |qualifier|
39
+ a = Annotation.new
40
+ a.qualifier = qualifier.qualifier
41
+ a.value = qualifier.value
42
+ a.save
43
+ db_feature.annotations << a
44
+ end
45
+ end
46
+ end
47
+ end
48
+ end
48
49
 
49
50
  #This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
50
51
  # It requires the strain names as an array and the output (VCF file) from the mpileup SNP identification algorithm.
51
-
52
52
  def populate_snps_alleles_genotypes(vcf_file, cuttoff_snp, cuttoff_genotype)
53
-
54
-
53
+ puts "Adding SNPs........"
55
54
  # open vcf file and parse each line
56
55
  File.open(vcf_file) do |f|
57
56
  # header names
58
- header = f.gets
59
- header2 = f.gets.chomp
60
- column_headings = header2.split("\t")
61
- strain_names = column_headings[9..-1]
62
- strain_names.map!{|name| name.sub(/\..*/, '')}
63
-
64
- strain_names.each do |str|
65
- ss = Strain.new
66
- ss.name = str
67
- ss.save
68
- end
69
-
70
- strains = Array.new
71
- strain_names.each do |strain_name|
72
- strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
73
- strains << strain
74
- end
75
-
76
-
77
- good_snps = 0
78
- # start parsing snps
79
57
  while line = f.gets
80
- details = line.split("\t")
81
- ref = details[0]
82
- ref_pos = details[1]
83
- ref_base = details[3]
84
- snp_base = details[4]
85
- snp_qual = details [5]
86
- samples = details[9..-1]
87
-
88
- genotypes = samples.map do |s|
89
- pl, gt, gq = s.chomp.split(":")
90
- gt
91
- end
92
-
93
- genotypes_qualities = samples.map do |s|
94
- pl, gt, gq = s.chomp.split(":")
95
- gq
96
- end
97
-
98
- high_quality_variant_genotypes = Array.new # this will be filled with the indicies of genotypes that are "1/1" and have a quality >= 30
99
- variant_genotypes = Array.new
100
- genotypes.each_with_index do |gt, index|
101
- if gt == "1/1"
102
- variant_genotypes << index
103
- if genotypes_qualities[index].to_i >= cuttoff_genotype
104
- high_quality_variant_genotypes << index
105
- end
106
- end
107
- end
108
-
109
- if snp_qual.to_i >= cuttoff_snp && genotypes.include?("1/1") && ! high_quality_variant_genotypes.empty? && high_quality_variant_genotypes.size == variant_genotypes.size # first condition checks the overall quality of the SNP is >=90, second checks that at least one genome has the 'homozygous' 1/1 variant type with quality >= 30 and informative SNP
110
- if genotypes.include?("0/0") && !genotypes.include?("0/1") # exclude SNPs which are all 1/1 i.e something strange about ref and those which have confusing heterozygote 0/1s
111
- good_snps +=1
112
- # puts good_snps
113
- #create snp
114
- s = Snp.new
115
- s.ref_pos = ref_pos
116
- s.save
117
- puts "Adding Reference SNP position: #{ref_pos}"
118
-
119
- # create ref allele
120
- ref_allele = Allele.new
121
- ref_allele.base = ref_base
122
- ref_allele.snp = s
123
- ref_allele.save
124
-
125
- puts "Adding Reference SNP base: #{ref_base}"
126
-
127
- s.reference_allele = ref_allele
128
- s.save
129
-
130
- # create snp allele
131
- snp_allele = Allele.new
132
- snp_allele.base = snp_base
133
- snp_allele.snp = s
134
- snp_allele.save
135
-
136
- puts "Adding SNP base: #{snp_base}"
137
-
138
- a = Time.now
139
-
140
- puts "Adding Genotype information..."
141
- ActiveRecord::Base.transaction do
142
- genotypes.each_with_index do |gt, index|
143
- genotype = Genotype.new
144
- genotype.strain = strains[index]
145
- puts index if strains[index].nil?
146
- # print "#{gt}(#{genotypes_qualities[index]}) "
147
- if gt == "0/0" # wild type
148
- genotype.allele = ref_allele
149
- elsif gt == "1/1" # snp type
150
- genotype.allele = snp_allele
151
- else
152
- puts "Strange SNP #{gt}"
153
- end
154
- #genotype.save
155
-
156
- end
58
+ if line =~ /CHROM/
59
+ #puts line
60
+ column_headings = line.split("\t")
61
+ strain_names = column_headings[9..-1]
62
+ strain_names.map!{|name| name.sub(/\..*/, '')}
63
+ #puts strain_names
64
+
65
+ strain_names.each do |str|
66
+ ss = Strain.new
67
+ ss.name = str
68
+ ss.save
69
+ end
70
+
71
+ strains = Array.new
72
+ strain_names.each do |strain_name|
73
+ strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
74
+ strains << strain
75
+ end
76
+
77
+ good_snps = 0
78
+ # start parsing snps
79
+ while line = f.gets
80
+ #puts line
81
+ details = line.split("\t")
82
+ ref = details[0]
83
+ ref_pos = details[1]
84
+ ref_base = details[3]
85
+ snp_base = details[4]
86
+ snp_qual = details [5]
87
+ samples = details[9..-1]
88
+
89
+ next if ref_base.size != 1 || snp_base.size != 1 # exclude indels
90
+ genotypes = samples.map do |s|
91
+ pl, gt, gq = s.chomp.split(":")
92
+ gt
93
+ end
94
+
95
+ genotypes_qualities = samples.map do |s|
96
+ pl, gt, gq = s.chomp.split(":")
97
+ gq
98
+ end
99
+
100
+ high_quality_variant_genotypes = Array.new # this will be filled with the indicies of genotypes that are "1/1" and have a quality >= 30
101
+ variant_genotypes = Array.new
102
+ genotypes.each_with_index do |gt, index|
103
+ if gt == "1/1"
104
+ variant_genotypes << index
105
+ if genotypes_qualities[index].to_i >= cuttoff_genotype
106
+ high_quality_variant_genotypes << index
107
+ end
108
+ end
157
109
  end
158
- b = Time.now
159
- mm, ss = (b-a).divmod(60)
160
- puts "Time taken: #{mm} minutes #{ss} seconds"
161
- a = Time.now
162
- end
163
- end
164
-
110
+
111
+ if snp_qual.to_i >= cuttoff_snp && genotypes.include?("1/1") && ! high_quality_variant_genotypes.empty? && high_quality_variant_genotypes.size == variant_genotypes.size # first condition checks the overall quality of the SNP is >=90, second checks that at least one genome has the 'homozygous' 1/1 variant type with quality >= 30 and informative SNP
112
+ if genotypes.include?("0/0") && !genotypes.include?("0/1") # exclude SNPs which are all 1/1 i.e something strange about ref and those which have confusing heterozygote 0/1s
113
+ good_snps +=1
114
+ # puts good_snps
115
+ #create snp
116
+ s = Snp.new
117
+ s.ref_pos = ref_pos
118
+ s.save
119
+
120
+ # create ref allele
121
+ ref_allele = Allele.new
122
+ ref_allele.base = ref_base
123
+ ref_allele.snp = s
124
+ ref_allele.save
125
+
126
+ s.reference_allele = ref_allele
127
+ s.save
128
+
129
+ # create snp allele
130
+ snp_allele = Allele.new
131
+ snp_allele.base = snp_base
132
+ snp_allele.snp = s
133
+ snp_allele.save
134
+
135
+ a = Time.now
136
+ # geno = [:ref_allele, :snp_allele]
137
+ ActiveRecord::Base.transaction do
138
+ genotypes.each_with_index do |gt, index|
139
+ genotype = Genotype.new
140
+ genotype.strain = strains[index]
141
+ puts index if strains[index].nil?
142
+ # print "#{gt}(#{genotypes_qualities[index]}) "
143
+ if gt == "0/0" # wild type
144
+ genotype.allele = ref_allele
145
+ elsif gt == "1/1" # snp type
146
+ genotype.allele = snp_allele
147
+ else
148
+ puts "Strange SNP #{gt}"
149
+ end
150
+ genotype.save
151
+ end
152
+ puts "Total SNPs added so far: #{good_snps}" if good_snps % 100 == 0
153
+ end
154
+ end
155
+ end
156
+ end
157
+ end
165
158
  end
166
159
  end
167
160
  #Here we link the features to snps.
data/snp-search.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "snp-search"
8
- s.version = "0.26.0"
8
+ s.version = "0.27.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Ali Al-Shahib", "Anthony Underwood"]
12
- s.date = "2011-12-13"
12
+ s.date = "2011-12-16"
13
13
  s.description = "Use the snp-search toolset to query the SNP database"
14
14
  s.email = "ali.al-shahib@hpa.org.uk"
15
15
  s.executables = ["snp-search"]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: snp-search
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.26.0
4
+ version: 0.27.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -10,11 +10,11 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2011-12-13 00:00:00.000000000Z
13
+ date: 2011-12-16 00:00:00.000000000Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: activerecord
17
- requirement: &2153046460 !ruby/object:Gem::Requirement
17
+ requirement: &2154719420 !ruby/object:Gem::Requirement
18
18
  none: false
19
19
  requirements:
20
20
  - - ~>
@@ -22,10 +22,10 @@ dependencies:
22
22
  version: 3.1.3
23
23
  type: :runtime
24
24
  prerelease: false
25
- version_requirements: *2153046460
25
+ version_requirements: *2154719420
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: bio
28
- requirement: &2153045960 !ruby/object:Gem::Requirement
28
+ requirement: &2154718940 !ruby/object:Gem::Requirement
29
29
  none: false
30
30
  requirements:
31
31
  - - ~>
@@ -33,10 +33,10 @@ dependencies:
33
33
  version: 1.4.2
34
34
  type: :runtime
35
35
  prerelease: false
36
- version_requirements: *2153045960
36
+ version_requirements: *2154718940
37
37
  - !ruby/object:Gem::Dependency
38
38
  name: slop
39
- requirement: &2153045420 !ruby/object:Gem::Requirement
39
+ requirement: &2154718420 !ruby/object:Gem::Requirement
40
40
  none: false
41
41
  requirements:
42
42
  - - ~>
@@ -44,10 +44,10 @@ dependencies:
44
44
  version: 2.4.0
45
45
  type: :runtime
46
46
  prerelease: false
47
- version_requirements: *2153045420
47
+ version_requirements: *2154718420
48
48
  - !ruby/object:Gem::Dependency
49
49
  name: sqlite3
50
- requirement: &2153044880 !ruby/object:Gem::Requirement
50
+ requirement: &2154717940 !ruby/object:Gem::Requirement
51
51
  none: false
52
52
  requirements:
53
53
  - - ~>
@@ -55,10 +55,10 @@ dependencies:
55
55
  version: 1.3.4
56
56
  type: :runtime
57
57
  prerelease: false
58
- version_requirements: *2153044880
58
+ version_requirements: *2154717940
59
59
  - !ruby/object:Gem::Dependency
60
60
  name: activerecord-import
61
- requirement: &2153044340 !ruby/object:Gem::Requirement
61
+ requirement: &2154717460 !ruby/object:Gem::Requirement
62
62
  none: false
63
63
  requirements:
64
64
  - - ~>
@@ -66,10 +66,10 @@ dependencies:
66
66
  version: 0.2.8
67
67
  type: :runtime
68
68
  prerelease: false
69
- version_requirements: *2153044340
69
+ version_requirements: *2154717460
70
70
  - !ruby/object:Gem::Dependency
71
71
  name: rspec
72
- requirement: &2153043800 !ruby/object:Gem::Requirement
72
+ requirement: &2154716980 !ruby/object:Gem::Requirement
73
73
  none: false
74
74
  requirements:
75
75
  - - ~>
@@ -77,10 +77,10 @@ dependencies:
77
77
  version: 2.3.0
78
78
  type: :development
79
79
  prerelease: false
80
- version_requirements: *2153043800
80
+ version_requirements: *2154716980
81
81
  - !ruby/object:Gem::Dependency
82
82
  name: bundler
83
- requirement: &2153043300 !ruby/object:Gem::Requirement
83
+ requirement: &2154716400 !ruby/object:Gem::Requirement
84
84
  none: false
85
85
  requirements:
86
86
  - - ~>
@@ -88,10 +88,10 @@ dependencies:
88
88
  version: 1.0.0
89
89
  type: :development
90
90
  prerelease: false
91
- version_requirements: *2153043300
91
+ version_requirements: *2154716400
92
92
  - !ruby/object:Gem::Dependency
93
93
  name: jeweler
94
- requirement: &2153042820 !ruby/object:Gem::Requirement
94
+ requirement: &2154715920 !ruby/object:Gem::Requirement
95
95
  none: false
96
96
  requirements:
97
97
  - - ~>
@@ -99,10 +99,10 @@ dependencies:
99
99
  version: 1.6.4
100
100
  type: :development
101
101
  prerelease: false
102
- version_requirements: *2153042820
102
+ version_requirements: *2154715920
103
103
  - !ruby/object:Gem::Dependency
104
104
  name: rcov
105
- requirement: &2153042340 !ruby/object:Gem::Requirement
105
+ requirement: &2154715440 !ruby/object:Gem::Requirement
106
106
  none: false
107
107
  requirements:
108
108
  - - ! '>='
@@ -110,7 +110,7 @@ dependencies:
110
110
  version: '0'
111
111
  type: :development
112
112
  prerelease: false
113
- version_requirements: *2153042340
113
+ version_requirements: *2154715440
114
114
  description: Use the snp-search toolset to query the SNP database
115
115
  email: ali.al-shahib@hpa.org.uk
116
116
  executables:
@@ -157,7 +157,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
157
157
  version: '0'
158
158
  segments:
159
159
  - 0
160
- hash: -2048575630589609763
160
+ hash: -2414192550927857417
161
161
  required_rubygems_version: !ruby/object:Gem::Requirement
162
162
  none: false
163
163
  requirements: