snp-search 0.26.0 → 0.27.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +2 -3
- data/VERSION +1 -1
- data/bin/snp-search +5 -7
- data/lib/snp-search.rb +127 -134
- data/snp-search.gemspec +2 -2
- metadata +21 -21
data/README.rdoc
CHANGED
@@ -31,11 +31,10 @@ Once you have these files ready, you may run snp-search with the following optio
|
|
31
31
|
-n Name of your database Optional, default = snp_db.sqlite3
|
32
32
|
-v .vcf file Required
|
33
33
|
-r Reference genome file (The same file that was used in generating the .vcf file). This should be in genbank or embl format. Required
|
34
|
-
-c SNP quality cutoff. A
|
35
|
-
-t Genotype Quality cutoff.
|
34
|
+
-c SNP quality score cutoff. A Phred-scaled quality score. High quality scores indicate high confidence calls. Optional, default = 90 (out of 100)
|
35
|
+
-t Genotype Quality score cutoff. Phred-scaled quality score that the genotype is true. Optional, default = 30
|
36
36
|
-h help message
|
37
37
|
|
38
|
-
|
39
38
|
Usage:
|
40
39
|
snp-search -n my_snp_db.sqlite3 -r my_ref.gbk -v my_vcf_file.vcf
|
41
40
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.27.0
|
data/bin/snp-search
CHANGED
@@ -6,6 +6,7 @@ require 'snp_db_schema'
|
|
6
6
|
gem "slop", "~> 2.4.0"
|
7
7
|
require 'slop'
|
8
8
|
|
9
|
+
begin
|
9
10
|
opts = Slop.new :help do
|
10
11
|
banner "ruby snp-search [OPTIONS]"
|
11
12
|
|
@@ -16,9 +17,6 @@ opts = Slop.new :help do
|
|
16
17
|
on :c, :cuttoff_snp=, 'SNP quality cutoff, (default = 90)', :default => 90
|
17
18
|
on :t, :cuttoff_genotype=, 'Genotype quality cutoff (default = 30)', :default => 30
|
18
19
|
|
19
|
-
on_empty do
|
20
|
-
puts help
|
21
|
-
end
|
22
20
|
end
|
23
21
|
opts.parse
|
24
22
|
|
@@ -30,7 +28,7 @@ opts.parse
|
|
30
28
|
|
31
29
|
unless error_msg == ""
|
32
30
|
puts error_msg
|
33
|
-
puts opts.help
|
31
|
+
puts opts.help unless opts.empty?
|
34
32
|
exit
|
35
33
|
end
|
36
34
|
|
@@ -44,12 +42,12 @@ opts.parse
|
|
44
42
|
rescue
|
45
43
|
end
|
46
44
|
|
47
|
-
|
45
|
+
|
48
46
|
# Enter the name of your database
|
49
47
|
establish_connection(opts[:name])
|
50
48
|
|
51
49
|
# Schema will run here
|
52
|
-
db_schema
|
50
|
+
#db_schema
|
53
51
|
|
54
52
|
ref = opts[:reference_file]
|
55
53
|
|
@@ -70,7 +68,7 @@ sequence_format = guess_sequence_format(ref)
|
|
70
68
|
vcf_mpileup_file = opts[:vcf_file]
|
71
69
|
|
72
70
|
# The populate_features_and_annotations method populates the features and annotations. It uses the embl/gbk file.
|
73
|
-
|
71
|
+
populate_features_and_annotations(sequence_flatfile)
|
74
72
|
|
75
73
|
# The populate_snps_alleles_genotypes method populates the snps, alleles and genotypes. It uses the vcf file and, if specified, the SNP quality cutoff and genotype quality cutoff.
|
76
74
|
populate_snps_alleles_genotypes(vcf_mpileup_file, opts[:cuttoff_snp].to_i, opts[:cuttoff_genotype].to_i)
|
data/lib/snp-search.rb
CHANGED
@@ -3,7 +3,6 @@ gem "bio", "~> 1.4.2"
|
|
3
3
|
require 'bio'
|
4
4
|
require 'snp_db_models'
|
5
5
|
require 'activerecord-import'
|
6
|
-
#establish_connection
|
7
6
|
|
8
7
|
def guess_sequence_format(reference_genome)
|
9
8
|
file_extension = File.extname(reference_genome).downcase
|
@@ -20,148 +19,142 @@ end
|
|
20
19
|
# A method to populate the database with the features (genes etc) and the annotations from the embl file.
|
21
20
|
# We include all features except 'source' and 'gene', as those two are repetitive info. 'CDS' is the gene.
|
22
21
|
# The annotation table also includes the start and end coordinates of the CDS. The strand is also included. The 'locations' method is defined in bioruby under genbank. It must be required at the top (bio).
|
23
|
-
#
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
#
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
#
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
22
|
+
# Also, the qualifier and value are extracted from the embl file and added to the database.
|
23
|
+
def populate_features_and_annotations(sequence_file)
|
24
|
+
puts "Adding features and their annotations...."
|
25
|
+
ActiveRecord::Base.transaction do
|
26
|
+
counter = 0
|
27
|
+
sequence_file.features.each do |feature|
|
28
|
+
counter += 1
|
29
|
+
puts "Total number of features and annotations added: #{counter}" if counter % 100 == 0
|
30
|
+
unless feature.feature == "source" || feature.feature == "gene"
|
31
|
+
db_feature = Feature.new
|
32
|
+
db_feature.start = feature.locations.first.from
|
33
|
+
db_feature.end = feature.locations.first.to
|
34
|
+
db_feature.strand = feature.locations.first.strand
|
35
|
+
db_feature.name = feature.feature
|
36
|
+
db_feature.save
|
37
|
+
# Populate the Annotation table with qualifier information from the genbank file
|
38
|
+
feature.qualifiers.each do |qualifier|
|
39
|
+
a = Annotation.new
|
40
|
+
a.qualifier = qualifier.qualifier
|
41
|
+
a.value = qualifier.value
|
42
|
+
a.save
|
43
|
+
db_feature.annotations << a
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
48
49
|
|
49
50
|
#This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
|
50
51
|
# It requires the strain_names as an array and the output (vcf file) from the mpileup-snp identification algorithm.
|
51
|
-
|
52
52
|
def populate_snps_alleles_genotypes(vcf_file, cuttoff_snp, cuttoff_genotype)
|
53
|
-
|
54
|
-
|
53
|
+
puts "Adding SNPs........"
|
55
54
|
# open vcf file and parse each line
|
56
55
|
File.open(vcf_file) do |f|
|
57
56
|
# header names
|
58
|
-
header = f.gets
|
59
|
-
header2 = f.gets.chomp
|
60
|
-
column_headings = header2.split("\t")
|
61
|
-
strain_names = column_headings[9..-1]
|
62
|
-
strain_names.map!{|name| name.sub(/\..*/, '')}
|
63
|
-
|
64
|
-
strain_names.each do |str|
|
65
|
-
ss = Strain.new
|
66
|
-
ss.name = str
|
67
|
-
ss.save
|
68
|
-
end
|
69
|
-
|
70
|
-
strains = Array.new
|
71
|
-
strain_names.each do |strain_name|
|
72
|
-
strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
|
73
|
-
strains << strain
|
74
|
-
end
|
75
|
-
|
76
|
-
|
77
|
-
good_snps = 0
|
78
|
-
# start parsing snps
|
79
57
|
while line = f.gets
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
snp_allele = Allele.new
|
132
|
-
snp_allele.base = snp_base
|
133
|
-
snp_allele.snp = s
|
134
|
-
snp_allele.save
|
135
|
-
|
136
|
-
puts "Adding SNP base: #{snp_base}"
|
137
|
-
|
138
|
-
a = Time.now
|
139
|
-
|
140
|
-
puts "Adding Genotype information..."
|
141
|
-
ActiveRecord::Base.transaction do
|
142
|
-
genotypes.each_with_index do |gt, index|
|
143
|
-
genotype = Genotype.new
|
144
|
-
genotype.strain = strains[index]
|
145
|
-
puts index if strains[index].nil?
|
146
|
-
# print "#{gt}(#{genotypes_qualities[index]}) "
|
147
|
-
if gt == "0/0" # wild type
|
148
|
-
genotype.allele = ref_allele
|
149
|
-
elsif gt == "1/1" # snp type
|
150
|
-
genotype.allele = snp_allele
|
151
|
-
else
|
152
|
-
puts "Strange SNP #{gt}"
|
153
|
-
end
|
154
|
-
#genotype.save
|
155
|
-
|
156
|
-
end
|
58
|
+
if line =~ /CHROM/
|
59
|
+
#puts line
|
60
|
+
column_headings = line.split("\t")
|
61
|
+
strain_names = column_headings[9..-1]
|
62
|
+
strain_names.map!{|name| name.sub(/\..*/, '')}
|
63
|
+
#puts strain_names
|
64
|
+
|
65
|
+
strain_names.each do |str|
|
66
|
+
ss = Strain.new
|
67
|
+
ss.name = str
|
68
|
+
ss.save
|
69
|
+
end
|
70
|
+
|
71
|
+
strains = Array.new
|
72
|
+
strain_names.each do |strain_name|
|
73
|
+
strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
|
74
|
+
strains << strain
|
75
|
+
end
|
76
|
+
|
77
|
+
good_snps = 0
|
78
|
+
# start parsing snps
|
79
|
+
while line = f.gets
|
80
|
+
#puts line
|
81
|
+
details = line.split("\t")
|
82
|
+
ref = details[0]
|
83
|
+
ref_pos = details[1]
|
84
|
+
ref_base = details[3]
|
85
|
+
snp_base = details[4]
|
86
|
+
snp_qual = details [5]
|
87
|
+
samples = details[9..-1]
|
88
|
+
|
89
|
+
next if ref_base.size != 1 || snp_base.size != 1 # exclude indels
|
90
|
+
genotypes = samples.map do |s|
|
91
|
+
pl, gt, gq = s.chomp.split(":")
|
92
|
+
gt
|
93
|
+
end
|
94
|
+
|
95
|
+
genotypes_qualities = samples.map do |s|
|
96
|
+
pl, gt, gq = s.chomp.split(":")
|
97
|
+
gq
|
98
|
+
end
|
99
|
+
|
100
|
+
high_quality_variant_genotypes = Array.new # this will be filled with the indicies of genotypes that are "1/1" and have a quality >= 30
|
101
|
+
variant_genotypes = Array.new
|
102
|
+
genotypes.each_with_index do |gt, index|
|
103
|
+
if gt == "1/1"
|
104
|
+
variant_genotypes << index
|
105
|
+
if genotypes_qualities[index].to_i >= cuttoff_genotype
|
106
|
+
high_quality_variant_genotypes << index
|
107
|
+
end
|
108
|
+
end
|
157
109
|
end
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
110
|
+
|
111
|
+
if snp_qual.to_i >= cuttoff_snp && genotypes.include?("1/1") && ! high_quality_variant_genotypes.empty? && high_quality_variant_genotypes.size == variant_genotypes.size # first condition checks the overall quality of the SNP is >=90, second checks that at least one genome has the 'homozygous' 1/1 variant type with quality >= 30 and informative SNP
|
112
|
+
if genotypes.include?("0/0") && !genotypes.include?("0/1") # exclude SNPs which are all 1/1 i.e something strange about ref and those which have confusing heterozygote 0/1s
|
113
|
+
good_snps +=1
|
114
|
+
# puts good_snps
|
115
|
+
#create snp
|
116
|
+
s = Snp.new
|
117
|
+
s.ref_pos = ref_pos
|
118
|
+
s.save
|
119
|
+
|
120
|
+
# create ref allele
|
121
|
+
ref_allele = Allele.new
|
122
|
+
ref_allele.base = ref_base
|
123
|
+
ref_allele.snp = s
|
124
|
+
ref_allele.save
|
125
|
+
|
126
|
+
s.reference_allele = ref_allele
|
127
|
+
s.save
|
128
|
+
|
129
|
+
# create snp allele
|
130
|
+
snp_allele = Allele.new
|
131
|
+
snp_allele.base = snp_base
|
132
|
+
snp_allele.snp = s
|
133
|
+
snp_allele.save
|
134
|
+
|
135
|
+
a = Time.now
|
136
|
+
# geno = [:ref_allele, :snp_allele]
|
137
|
+
ActiveRecord::Base.transaction do
|
138
|
+
genotypes.each_with_index do |gt, index|
|
139
|
+
genotype = Genotype.new
|
140
|
+
genotype.strain = strains[index]
|
141
|
+
puts index if strains[index].nil?
|
142
|
+
# print "#{gt}(#{genotypes_qualities[index]}) "
|
143
|
+
if gt == "0/0" # wild type
|
144
|
+
genotype.allele = ref_allele
|
145
|
+
elsif gt == "1/1" # snp type
|
146
|
+
genotype.allele = snp_allele
|
147
|
+
else
|
148
|
+
puts "Strange SNP #{gt}"
|
149
|
+
end
|
150
|
+
genotype.save
|
151
|
+
end
|
152
|
+
puts "Total SNPs added so far: #{good_snps}" if good_snps % 100 == 0
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
165
158
|
end
|
166
159
|
end
|
167
160
|
#Here we link the features to snps.
|
data/snp-search.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "snp-search"
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.27.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Ali Al-Shahib", "Anthony Underwood"]
|
12
|
-
s.date = "2011-12-
|
12
|
+
s.date = "2011-12-16"
|
13
13
|
s.description = "Use the snp-search toolset to query the SNP database"
|
14
14
|
s.email = "ali.al-shahib@hpa.org.uk"
|
15
15
|
s.executables = ["snp-search"]
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snp-search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.27.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,11 +10,11 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2011-12-
|
13
|
+
date: 2011-12-16 00:00:00.000000000Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: activerecord
|
17
|
-
requirement: &
|
17
|
+
requirement: &2154719420 !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
19
|
requirements:
|
20
20
|
- - ~>
|
@@ -22,10 +22,10 @@ dependencies:
|
|
22
22
|
version: 3.1.3
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
|
-
version_requirements: *
|
25
|
+
version_requirements: *2154719420
|
26
26
|
- !ruby/object:Gem::Dependency
|
27
27
|
name: bio
|
28
|
-
requirement: &
|
28
|
+
requirement: &2154718940 !ruby/object:Gem::Requirement
|
29
29
|
none: false
|
30
30
|
requirements:
|
31
31
|
- - ~>
|
@@ -33,10 +33,10 @@ dependencies:
|
|
33
33
|
version: 1.4.2
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
|
-
version_requirements: *
|
36
|
+
version_requirements: *2154718940
|
37
37
|
- !ruby/object:Gem::Dependency
|
38
38
|
name: slop
|
39
|
-
requirement: &
|
39
|
+
requirement: &2154718420 !ruby/object:Gem::Requirement
|
40
40
|
none: false
|
41
41
|
requirements:
|
42
42
|
- - ~>
|
@@ -44,10 +44,10 @@ dependencies:
|
|
44
44
|
version: 2.4.0
|
45
45
|
type: :runtime
|
46
46
|
prerelease: false
|
47
|
-
version_requirements: *
|
47
|
+
version_requirements: *2154718420
|
48
48
|
- !ruby/object:Gem::Dependency
|
49
49
|
name: sqlite3
|
50
|
-
requirement: &
|
50
|
+
requirement: &2154717940 !ruby/object:Gem::Requirement
|
51
51
|
none: false
|
52
52
|
requirements:
|
53
53
|
- - ~>
|
@@ -55,10 +55,10 @@ dependencies:
|
|
55
55
|
version: 1.3.4
|
56
56
|
type: :runtime
|
57
57
|
prerelease: false
|
58
|
-
version_requirements: *
|
58
|
+
version_requirements: *2154717940
|
59
59
|
- !ruby/object:Gem::Dependency
|
60
60
|
name: activerecord-import
|
61
|
-
requirement: &
|
61
|
+
requirement: &2154717460 !ruby/object:Gem::Requirement
|
62
62
|
none: false
|
63
63
|
requirements:
|
64
64
|
- - ~>
|
@@ -66,10 +66,10 @@ dependencies:
|
|
66
66
|
version: 0.2.8
|
67
67
|
type: :runtime
|
68
68
|
prerelease: false
|
69
|
-
version_requirements: *
|
69
|
+
version_requirements: *2154717460
|
70
70
|
- !ruby/object:Gem::Dependency
|
71
71
|
name: rspec
|
72
|
-
requirement: &
|
72
|
+
requirement: &2154716980 !ruby/object:Gem::Requirement
|
73
73
|
none: false
|
74
74
|
requirements:
|
75
75
|
- - ~>
|
@@ -77,10 +77,10 @@ dependencies:
|
|
77
77
|
version: 2.3.0
|
78
78
|
type: :development
|
79
79
|
prerelease: false
|
80
|
-
version_requirements: *
|
80
|
+
version_requirements: *2154716980
|
81
81
|
- !ruby/object:Gem::Dependency
|
82
82
|
name: bundler
|
83
|
-
requirement: &
|
83
|
+
requirement: &2154716400 !ruby/object:Gem::Requirement
|
84
84
|
none: false
|
85
85
|
requirements:
|
86
86
|
- - ~>
|
@@ -88,10 +88,10 @@ dependencies:
|
|
88
88
|
version: 1.0.0
|
89
89
|
type: :development
|
90
90
|
prerelease: false
|
91
|
-
version_requirements: *
|
91
|
+
version_requirements: *2154716400
|
92
92
|
- !ruby/object:Gem::Dependency
|
93
93
|
name: jeweler
|
94
|
-
requirement: &
|
94
|
+
requirement: &2154715920 !ruby/object:Gem::Requirement
|
95
95
|
none: false
|
96
96
|
requirements:
|
97
97
|
- - ~>
|
@@ -99,10 +99,10 @@ dependencies:
|
|
99
99
|
version: 1.6.4
|
100
100
|
type: :development
|
101
101
|
prerelease: false
|
102
|
-
version_requirements: *
|
102
|
+
version_requirements: *2154715920
|
103
103
|
- !ruby/object:Gem::Dependency
|
104
104
|
name: rcov
|
105
|
-
requirement: &
|
105
|
+
requirement: &2154715440 !ruby/object:Gem::Requirement
|
106
106
|
none: false
|
107
107
|
requirements:
|
108
108
|
- - ! '>='
|
@@ -110,7 +110,7 @@ dependencies:
|
|
110
110
|
version: '0'
|
111
111
|
type: :development
|
112
112
|
prerelease: false
|
113
|
-
version_requirements: *
|
113
|
+
version_requirements: *2154715440
|
114
114
|
description: Use the snp-search toolset to query the SNP database
|
115
115
|
email: ali.al-shahib@hpa.org.uk
|
116
116
|
executables:
|
@@ -157,7 +157,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
157
157
|
version: '0'
|
158
158
|
segments:
|
159
159
|
- 0
|
160
|
-
hash: -
|
160
|
+
hash: -2414192550927857417
|
161
161
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
162
162
|
none: false
|
163
163
|
requirements:
|