snp-search 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +21 -1
- data/VERSION +1 -1
- data/lib/snp-search.rb +150 -0
- data/lib/snp_db_connection.rb +9 -0
- data/lib/snp_db_models.rb +32 -0
- data/lib/snp_db_schema.rb +86 -0
- data/lib/user_entry_file.rb +21 -0
- data/snp-search.gemspec +5 -1
- metadata +14 -10
data/README.rdoc
CHANGED
@@ -1,6 +1,25 @@
|
|
1
1
|
= snp-search
|
2
2
|
|
3
|
-
|
3
|
+
snp-search is a set of tools that manages SNP data and allows for data importing, manipulating, editing and complex querying of SNP data. It can be used to evaluate the utility of SNPs for the assessment of genetic diversity between strains and the management of genotype and phenotype data. Once a query is performed, SNPsearch can be used to convert the selected SNP data into FASTA sequences. SNPsearch is particularly useful in the analysis of phylogenetic trees that are based on SNP differences across whole core genomes. Queries can be made to answer critical genomic questions such as the association of SNPs with particular phenotypes.
|
4
|
+
|
5
|
+
== Obtaining and installing the code
|
6
|
+
snp-search is written in Ruby and operates in a Unix environment. It is made available as a gem. See the github site for more information (https://github.com/hpa-bioinformatics/snp-search).
|
7
|
+
|
8
|
+
To install snp-search, do
|
9
|
+
gem install snp-search
|
10
|
+
|
11
|
+
== Requirements
|
12
|
+
|
13
|
+
* ActiveRecord: The snp-search API is based on ActiveRecord to get the data from the database. ActiveRecord is available as a gem:
|
14
|
+
gem install activerecord
|
15
|
+
|
16
|
+
* SQLite3: The SQL engine used is sqlite3. Most Linux operating systems come with sqlite3. However, if you do not have sqlite3, you can download it from http://www.sqlite.org/download.html. The installation instructions are available on the download page.
|
17
|
+
|
18
|
+
* bio gem.
|
19
|
+
|
20
|
+
== Running snp-search
|
21
|
+
|
22
|
+
Step 1:
|
4
23
|
|
5
24
|
== Contributing to snp-search
|
6
25
|
|
@@ -11,6 +30,7 @@ Description goes here.
|
|
11
30
|
* Commit and push until you are happy with your contribution
|
12
31
|
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
13
32
|
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
|
33
|
+
*
|
14
34
|
|
15
35
|
== Copyright
|
16
36
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/lib/snp-search.rb
CHANGED
@@ -0,0 +1,150 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bio'
|
3
|
+
require 'snp_db_models'
|
4
|
+
establish_connection
|
5
|
+
|
6
|
+
# Adds one row to the Strain table for every entry of strain_names.
#
# strain_names - Array of strain names.
#
# Returns strain_names.
def populate_strains(strain_names)
  strain_names.each do |strain_name|
    record = Strain.new
    record.name = strain_name
    record.save
  end
end
|
14
|
+
|
15
|
+
# Fills the Feature and Annotation tables from a parsed sequence entry.
#
# Features named 'source' or 'gene' are skipped as they repeat information
# (the gene itself is the 'CDS' feature). For every other feature the start,
# end and strand of its first location are stored, followed by one Annotation
# row per qualifier/value pair. The 'locations' method comes from bioruby,
# which must be required at the top of the file (bio).
#
# embl_ncbi_file - A parsed entry that responds to #features.
def populate_features_and_annotations(embl_ncbi_file)
  embl_ncbi_file.features.each do |feature|
    next if feature.feature == "source" || feature.feature == "gene"

    # Only the first location of the feature is recorded.
    location = feature.locations.first

    db_feature = Feature.new
    db_feature.start = location.from
    db_feature.end = location.to
    db_feature.strand = location.strand
    db_feature.name = feature.feature
    db_feature.save
    puts "populated #{db_feature.name}, start: #{db_feature.start}, end: #{db_feature.end}, strand: #{db_feature.strand} for feature: #{db_feature.id}"

    # One Annotation row per qualifier of the feature.
    feature.qualifiers.each do |qualifier|
      annotation = Annotation.new
      annotation.qualifier = qualifier.qualifier
      annotation.value = qualifier.value
      annotation.save
      db_feature.annotations << annotation
      puts "populated #{annotation.qualifier} for feature: #{db_feature.id}"
    end
  end
end
|
41
|
+
|
42
|
+
|
43
|
+
# Populates the Snp, Allele and Genotype tables from a multi-sample VCF file
# (the output of the mpileup SNP identification step) and then links every
# Snp to the first Feature whose start..end range contains its position.
#
# strain_names      - Array of strain names. Sample column i of the VCF file
#                     is assigned to strain_names[i], so the order must match
#                     the sample columns. The strains are looked up by name.
# vcf_file          - Path of the VCF file.
# min_snp_qual      - Minimum QUAL (6th column) for a site to be stored.
# min_genotype_qual - Minimum GQ that every 1/1 call of a site must reach.
#
# A site is stored only when its QUAL is high enough, at least one sample is
# 1/1, every 1/1 call is of high quality, at least one sample is 0/0 and no
# sample is 0/1.
#
# The first two lines of the file are always treated as headers. Any later
# line that starts with '#' or has no sample columns is skipped.
def populate_snps_alleles_genotypes(strain_names, vcf_file, min_snp_qual = 90, min_genotype_qual = 30)
  strains = strain_names.map { |strain_name| Strain.find_by_name(strain_name) }

  File.open(vcf_file) do |f|
    # header lines
    2.times { f.gets }

    f.each_line do |line|
      next if line.start_with?("#")

      details = line.chomp.split("\t")
      # Columns 0-8 are the fixed VCF columns; samples start at column 9.
      next if details.size < 10

      genotypes, genotype_qualities = vcf_genotype_calls(details[8], details[9..-1])
      next unless informative_snp?(details[5], genotypes, genotype_qualities, min_snp_qual, min_genotype_qual)

      store_snp_with_genotypes(details[1], details[3], details[4], genotypes, strains)
    end
  end

  # Here we link the features to snps.
  Snp.all.each do |snp|
    snp.feature = Feature.where("features.start <= ? AND features.end >= ?", snp.ref_pos, snp.ref_pos).first
    snp.save
  end
end

# Extracts the genotype (GT) and genotype quality (GQ) of every sample.
# The positions of GT and GQ are taken from the FORMAT column; when a key is
# not listed there the historical positions (PL:GT:GQ) are used.
#
# Returns [genotypes, genotype_qualities], both in sample-column order.
def vcf_genotype_calls(format, samples)
  keys = format.to_s.split(":")
  gt_index = keys.index("GT") || 1
  gq_index = keys.index("GQ") || 2

  fields = samples.map { |sample| sample.split(":") }
  [fields.map { |field| field[gt_index] }, fields.map { |field| field[gq_index] }]
end

# Decides whether a site passes the quality and informativeness filters
# described on populate_snps_alleles_genotypes.
def informative_snp?(snp_qual, genotypes, genotype_qualities, min_snp_qual, min_genotype_qual)
  return false unless snp_qual.to_i >= min_snp_qual

  variant_indices = genotypes.each_index.select { |index| genotypes[index] == "1/1" }
  return false if variant_indices.empty?
  return false unless variant_indices.all? { |index| genotype_qualities[index].to_i >= min_genotype_qual }

  # exclude SNPs which are all 1/1 i.e something strange about ref and those
  # which have confusing heterozygote 0/1s
  genotypes.include?("0/0") && !genotypes.include?("0/1")
end

# Saves one Snp, its reference and variant Allele, and one Genotype per
# sample. A genotype other than 0/0 or 1/1 is reported and saved without an
# allele.
def store_snp_with_genotypes(ref_pos, ref_base, snp_base, genotypes, strains)
  snp = Snp.new
  snp.ref_pos = ref_pos
  snp.save
  puts "Adding Reference SNP position: #{ref_pos}"

  ref_allele = Allele.new
  ref_allele.base = ref_base
  ref_allele.snp = snp
  ref_allele.save
  puts "Adding Reference SNP base: #{ref_base}"

  snp.reference_allele = ref_allele
  snp.save

  snp_allele = Allele.new
  snp_allele.base = snp_base
  snp_allele.snp = snp
  snp_allele.save
  puts "Adding SNP base: #{snp_base}"

  genotypes.each_with_index do |gt, index|
    genotype = Genotype.new
    genotype.strain = strains[index]
    # A nil strain means the name was not found in the Strain table.
    puts index if strains[index].nil?
    case gt
    when "0/0" # wild type
      genotype.allele = ref_allele
    when "1/1" # snp type
      genotype.allele = snp_allele
    else
      puts "Strange SNP #{gt}"
    end
    genotype.save
  end
end
|
150
|
+
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'snp_db_connection'
|
2
|
+
|
3
|
+
# A named strain. It owns genotypes and reaches its alleles through them.
class Strain < ActiveRecord::Base
  has_many :genotypes
  has_many :alleles, :through => :genotypes
end
|
7
|
+
|
8
|
+
# A sequence feature. It owns the SNPs linked to it and its annotations.
class Feature < ActiveRecord::Base
  has_many :snps
  has_many :annotations
end
|
12
|
+
|
13
|
+
# A SNP position. It belongs to a feature, owns its alleles and points at one
# of them as the reference allele through the reference_allele_id column.
class Snp < ActiveRecord::Base
  belongs_to :feature
  belongs_to :reference_allele, :class_name => "Allele", :foreign_key => "reference_allele_id"
  has_many :alleles
end
|
18
|
+
|
19
|
+
# One base observed at a SNP. It belongs to the SNP and reaches the strains
# that carry it through their genotypes.
class Allele < ActiveRecord::Base
  belongs_to :snp
  has_many :genotypes
  has_many :strains, :through => :genotypes
end
|
24
|
+
|
25
|
+
# Join record that ties a strain to an allele.
class Genotype < ActiveRecord::Base
  belongs_to :strain
  belongs_to :allele
end
|
29
|
+
|
30
|
+
# A qualifier/value pair that belongs to a feature.
class Annotation < ActiveRecord::Base
  belongs_to :feature
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'snp_db_connection'
|
2
|
+
establish_connection
|
3
|
+
|
4
|
+
# Creates the snp-search tables and indices on the current connection.
# Every table and index is guarded by an existence check, so nothing is
# created twice when the file is loaded against an existing database.
ActiveRecord::Schema.define do
  unless table_exists? :strains
    create_table :strains do |t|
      t.column :name, :string
      t.column :description, :string
    end
  end

  unless table_exists? :features
    create_table :features do |t|
      t.column :name, :string
      t.column :sequence, :string
      t.column :start, :integer
      t.column :end, :integer
      t.column :strand, :integer
    end
  end

  unless table_exists? :snps
    create_table :snps do |t|
      t.column :feature_id, :integer
      t.column :ref_pos, :integer
      t.column :reference_allele_id, :integer
    end
  end

  unless table_exists? :alleles
    create_table :alleles do |t|
      t.column :snp_id, :integer
      t.column :base, :string
    end
  end

  unless table_exists? :genotypes
    create_table :genotypes do |t|
      t.column :allele_id, :integer
      t.column :strain_id, :integer
    end
  end

  unless table_exists? :annotations
    create_table :annotations do |t|
      t.column :qualifier, :string
      t.column :value, :string
      t.column :feature_id, :integer
    end
  end

  # indices: one single-column index per [table, column] pair
  [
    [:features, :name],
    [:features, :start],
    [:features, :end],
    [:features, :strand],
    [:snps, :ref_pos],
    [:snps, :feature_id],
    [:alleles, :snp_id],
    [:alleles, :base],
    [:genotypes, :allele_id],
    [:genotypes, :strain_id],
    [:annotations, :feature_id]
  ].each do |table, column|
    add_index(table, column) unless index_exists?(table, column)
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'snp-search'
|
2
|
+
|
3
|
+
# Path for the embl file here.
# The block form closes the file once the first entry has been read.
genome_sequence = Bio::FlatFile.open(Bio::EMBL, "path_for_embl_file_here") { |flat_file| flat_file.next_entry }

# Path for the vcf file here.
vcf_mpileup_file = "path_for_vcf_file_here"

# Array of strain names here.
strains = ["STRAIN_NAME_1", "STRAIN_NAME_2"]

# That's it, your job is done here.

# The populate_strains method populates the strains in the db. It uses the strain names in array.
populate_strains(strains)

# The populate_features_and_annotations method populates the features and annotations. It uses the embl/gbk file.
populate_features_and_annotations(genome_sequence)

# The populate_snps_alleles_genotypes method populates the snps, alleles and genotypes. It uses the strain names (array) and vcf file.
populate_snps_alleles_genotypes(strains, vcf_mpileup_file)
|
data/snp-search.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "snp-search"
|
8
|
-
s.version = "0.1.0"
|
8
|
+
s.version = "0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Ali Al-Shahib", "Anthony Underwood"]
|
@@ -27,6 +27,10 @@ Gem::Specification.new do |s|
|
|
27
27
|
"Rakefile",
|
28
28
|
"VERSION",
|
29
29
|
"lib/snp-search.rb",
|
30
|
+
"lib/snp_db_connection.rb",
|
31
|
+
"lib/snp_db_models.rb",
|
32
|
+
"lib/snp_db_schema.rb",
|
33
|
+
"lib/user_entry_file.rb",
|
30
34
|
"snp-search.gemspec",
|
31
35
|
"spec/snp-search_spec.rb",
|
32
36
|
"spec/spec_helper.rb"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snp-search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -14,7 +14,7 @@ date: 2011-11-25 00:00:00.000000000Z
|
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: rspec
|
17
|
-
requirement: &
|
17
|
+
requirement: &2161702220 !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
19
|
requirements:
|
20
20
|
- - ~>
|
@@ -22,10 +22,10 @@ dependencies:
|
|
22
22
|
version: 2.3.0
|
23
23
|
type: :development
|
24
24
|
prerelease: false
|
25
|
-
version_requirements: *
|
25
|
+
version_requirements: *2161702220
|
26
26
|
- !ruby/object:Gem::Dependency
|
27
27
|
name: bundler
|
28
|
-
requirement: &
|
28
|
+
requirement: &2161701620 !ruby/object:Gem::Requirement
|
29
29
|
none: false
|
30
30
|
requirements:
|
31
31
|
- - ~>
|
@@ -33,10 +33,10 @@ dependencies:
|
|
33
33
|
version: 1.0.0
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
|
-
version_requirements: *
|
36
|
+
version_requirements: *2161701620
|
37
37
|
- !ruby/object:Gem::Dependency
|
38
38
|
name: jeweler
|
39
|
-
requirement: &
|
39
|
+
requirement: &2161701080 !ruby/object:Gem::Requirement
|
40
40
|
none: false
|
41
41
|
requirements:
|
42
42
|
- - ~>
|
@@ -44,10 +44,10 @@ dependencies:
|
|
44
44
|
version: 1.6.4
|
45
45
|
type: :development
|
46
46
|
prerelease: false
|
47
|
-
version_requirements: *
|
47
|
+
version_requirements: *2161701080
|
48
48
|
- !ruby/object:Gem::Dependency
|
49
49
|
name: rcov
|
50
|
-
requirement: &
|
50
|
+
requirement: &2161699820 !ruby/object:Gem::Requirement
|
51
51
|
none: false
|
52
52
|
requirements:
|
53
53
|
- - ! '>='
|
@@ -55,7 +55,7 @@ dependencies:
|
|
55
55
|
version: '0'
|
56
56
|
type: :development
|
57
57
|
prerelease: false
|
58
|
-
version_requirements: *
|
58
|
+
version_requirements: *2161699820
|
59
59
|
description: Use the snp-search toolset to query the SNP database
|
60
60
|
email: ali.al-shahib@hpa.org.uk
|
61
61
|
executables: []
|
@@ -74,6 +74,10 @@ files:
|
|
74
74
|
- Rakefile
|
75
75
|
- VERSION
|
76
76
|
- lib/snp-search.rb
|
77
|
+
- lib/snp_db_connection.rb
|
78
|
+
- lib/snp_db_models.rb
|
79
|
+
- lib/snp_db_schema.rb
|
80
|
+
- lib/user_entry_file.rb
|
77
81
|
- snp-search.gemspec
|
78
82
|
- spec/snp-search_spec.rb
|
79
83
|
- spec/spec_helper.rb
|
@@ -92,7 +96,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
92
96
|
version: '0'
|
93
97
|
segments:
|
94
98
|
- 0
|
95
|
-
hash:
|
99
|
+
hash: 3531722145808918413
|
96
100
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
101
|
none: false
|
98
102
|
requirements:
|