snp-search 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,25 @@
1
1
  = snp-search
2
2
 
3
- Description goes here.
3
+ snp-search is a set of tools that manages SNP data and allows for data importing, manipulating, editing and complex querying of SNP data. It can be used to evaluate the utility of SNPs for the assessment of genetic diversity between strains and the management of genotype and phenotype data. Once a query is performed, snp-search can be used to convert the selected SNP data into FASTA sequences. snp-search is particularly useful in the analysis of phylogenetic trees that are based on SNP differences across whole core genomes. Queries can be made to answer critical genomic questions such as the association of SNPs with particular phenotypes.
4
+
5
+ == Obtaining and installing the code
6
+ snp-search is written in Ruby and operates in a Unix environment. It is made available as a gem. See the GitHub site for more information (https://github.com/hpa-bioinformatics/snp-search).
7
+
8
+ To install snp-search, do
9
+ gem install snp-search
10
+
11
+ == Requirements
12
+
13
+ * ActiveRecord: The snp-search API is based on ActiveRecord to get the data from the database. ActiveRecord is available as a gem:
14
+ gem install activerecord
15
+
16
+ * SQLite3: The SQL engine used is sqlite3. Most Linux operating systems come with sqlite3. However, if you do not have sqlite3, you can download it from http://www.sqlite.org/download.html. The installation instructions are available on the download page.
17
+
18
+ * bio gem.
19
+
20
+ == Running snp-search
21
+
22
+ Step 1:
4
23
 
5
24
  == Contributing to snp-search
6
25
 
@@ -11,6 +30,7 @@ Description goes here.
11
30
  * Commit and push until you are happy with your contribution
12
31
  * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
13
32
  * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
33
+ *
14
34
 
15
35
  == Copyright
16
36
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.2.0
@@ -0,0 +1,150 @@
1
+ require 'rubygems'
2
+ require 'bio'
3
+ require 'snp_db_models'
4
+ establish_connection
5
+
6
+ # A method to populate the strain names in the Strain table. strain_names is an array of strain names.
7
+ def populate_strains(strain_names)
8
+ strain_names.each do |strain|
9
+ s = Strain.new
10
+ s.name = strain
11
+ s.save
12
+ end
13
+ end
14
+
15
+ # A method to populate the database with the features (genes etc) and the annotations from the embl file.
16
+ # We include all features except 'source' and 'gene', since those two carry repetitive information. 'CDS' is the gene.
17
+ # The feature table also stores the start and end coordinates and the strand of each feature. The 'locations' method is defined in bioruby under genbank, so 'bio' must be required at the top.
18
+ # Also, the qualifier and value are extracted from the embl file and added to the database.
19
+ def populate_features_and_annotations(embl_ncbi_file)
20
+ embl_ncbi_file.features.each do |feature|
21
+ unless feature.feature == "source" || feature.feature == "gene"
22
+ db_feature = Feature.new
23
+ db_feature.start = feature.locations.first.from
24
+ db_feature.end = feature.locations.first.to
25
+ db_feature.strand = feature.locations.first.strand
26
+ db_feature.name = feature.feature
27
+ db_feature.save
28
+ puts "populated #{db_feature.name}, start: #{db_feature.start}, end: #{db_feature.end}, strand: #{db_feature.strand} for feature: #{db_feature.id}"
29
+ # Populate the Annotation table with qualifier information from the genbank file
30
+ feature.qualifiers.each do |qualifier|
31
+ a = Annotation.new
32
+ a.qualifier = qualifier.qualifier
33
+ a.value = qualifier.value
34
+ a.save
35
+ db_feature.annotations << a
36
+ puts "populated #{a.qualifier} for feature: #{db_feature.id}"
37
+ end
38
+ end
39
+ end
40
+ end
41
+
42
+
43
+ #This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
44
+ # It requires the strain_names as array and the output (vcf file) from mpileup-snp identification algorithm.
45
+
46
+ def populate_snps_alleles_genotypes(strain_names, vcf_file)
47
+ strains = Array.new
48
+ strain_names.each do |strain_name|
49
+ strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
50
+ strains << strain
51
+ end
52
+
53
+ # open vcf file and parse each line
54
+ File.open(vcf_file) do |f|
55
+ # header names
56
+ header = f.gets
57
+ header2 = f.gets.chomp
58
+ column_headings = header2.split("\t")
59
+ sample_names = column_headings[9..-1]
60
+
61
+ good_snps = 0
62
+ # start parsing snps
63
+ while line = f.gets
64
+ details = line.split("\t")
65
+ ref = details[0]
66
+ ref_pos = details[1]
67
+ ref_base = details[3]
68
+ snp_base = details[4]
69
+ snp_qual = details [5]
70
+ samples = details[9..-1]
71
+
72
+ genotypes = samples.map do |s|
73
+ pl, gt, gq = s.chomp.split(":")
74
+ gt
75
+ end
76
+
77
+ genotypes_qualities = samples.map do |s|
78
+ pl, gt, gq = s.chomp.split(":")
79
+ gq
80
+ end
81
+
82
+ high_quality_variant_genotypes = Array.new # this will be filled with the indicies of genotypes that are "1/1" and have a quality >= 30
83
+ variant_genotypes = Array.new
84
+ genotypes.each_with_index do |gt, index|
85
+ if gt == "1/1"
86
+ variant_genotypes << index
87
+ if genotypes_qualities[index].to_i >= 30
88
+ high_quality_variant_genotypes << index
89
+ end
90
+ end
91
+ end
92
+
93
+ if snp_qual.to_i >= 90 && genotypes.include?("1/1") && ! high_quality_variant_genotypes.empty? && high_quality_variant_genotypes.size == variant_genotypes.size # first condition checks the overall quality of the SNP is >=90, second checks that at least one genome has the 'homozygous' 1/1 variant type with quality >= 30 and informative SNP
94
+ if genotypes.include?("0/0") && !genotypes.include?("0/1") # exclude SNPs which are all 1/1 i.e something strange about ref and those which have confusing heterozygote 0/1s
95
+ good_snps +=1
96
+ # puts good_snps
97
+ #create snp
98
+ s = Snp.new
99
+ s.ref_pos = ref_pos
100
+ s.save
101
+ puts "Adding Reference SNP position: #{ref_pos}"
102
+
103
+ # create ref allele
104
+ ref_allele = Allele.new
105
+ ref_allele.base = ref_base
106
+ ref_allele.snp = s
107
+ ref_allele.save
108
+
109
+ puts "Adding Reference SNP base: #{ref_base}"
110
+
111
+ s.reference_allele = ref_allele
112
+ s.save
113
+
114
+ # create snp allele
115
+ snp_allele = Allele.new
116
+ snp_allele.base = snp_base
117
+ snp_allele.snp = s
118
+ snp_allele.save
119
+
120
+ puts "Adding SNP base: #{snp_base}"
121
+
122
+
123
+
124
+ genotypes.each_with_index do |gt, index|
125
+ genotype = Genotype.new
126
+ genotype.strain = strains[index]
127
+ puts index if strains[index].nil?
128
+ # print "#{gt}(#{genotypes_qualities[index]}) "
129
+ if gt == "0/0" # wild type
130
+ genotype.allele = ref_allele
131
+ elsif gt == "1/1" # snp type
132
+ genotype.allele = snp_allele
133
+ else
134
+ puts "Strange SNP #{gt}"
135
+ end
136
+ genotype.save
137
+ end
138
+ end
139
+ end
140
+
141
+ end
142
+ end
143
+ #Here we link the features to snps.
144
+ Snp.all.each do |snp|
145
+ x = Feature.where("features.start <= ? AND features.end >= ?", snp.ref_pos, snp.ref_pos).first
146
+ snp.feature = x
147
+ snp.save
148
+ end
149
+ end
150
+
@@ -0,0 +1,9 @@
1
+ require 'active_record'
2
+ def establish_connection(db_location= "snp.db.sqlite3")
3
+ ActiveRecord::Base.establish_connection(
4
+ :adapter => "sqlite3",
5
+ :database => db_location,
6
+ :pool => 5,
7
+ :timeout => 5000
8
+ )
9
+ end
@@ -0,0 +1,32 @@
1
+ require 'snp_db_connection'
2
+
3
+ class Strain < ActiveRecord::Base
4
+ has_many :alleles, :through => :genotypes
5
+ has_many :genotypes
6
+ end
7
+
8
+ class Feature < ActiveRecord::Base
9
+ has_many :annotations
10
+ has_many :snps
11
+ end
12
+
13
+ class Snp < ActiveRecord::Base
14
+ belongs_to :feature
15
+ has_many :alleles
16
+ belongs_to :reference_allele, :class_name => "Allele", :foreign_key => "reference_allele_id"
17
+ end
18
+
19
+ class Allele < ActiveRecord::Base
20
+ has_many :genotypes
21
+ belongs_to :snp
22
+ has_many :strains, :through => :genotypes
23
+ end
24
+
25
+ class Genotype < ActiveRecord::Base
26
+ belongs_to :allele
27
+ belongs_to :strain
28
+ end
29
+
30
+ class Annotation < ActiveRecord::Base
31
+ belongs_to :feature
32
+ end
@@ -0,0 +1,86 @@
1
+ require 'snp_db_connection'
2
+ establish_connection
3
+
4
+ ActiveRecord::Schema.define do
5
+ unless table_exists? :strains
6
+ create_table :strains do |t|
7
+ t.column :name, :string
8
+ t.column :description, :string
9
+ end
10
+ end
11
+
12
+ unless table_exists? :features
13
+ create_table :features do |t|
14
+ t.column :name, :string
15
+ t.column :sequence, :string
16
+ t.column :start, :integer
17
+ t.column :end, :integer
18
+ t.column :strand, :integer
19
+ end
20
+ end
21
+
22
+ unless table_exists? :snps
23
+ create_table :snps do |t|
24
+ t.column :feature_id, :integer
25
+ t.column :ref_pos, :integer
26
+ t.column :reference_allele_id, :integer
27
+ end
28
+ end
29
+
30
+ unless table_exists? :alleles
31
+ create_table :alleles do |t|name
32
+ t.column :snp_id, :integer
33
+ t.column :base, :string
34
+ end
35
+ end
36
+
37
+ unless table_exists? :genotypes
38
+ create_table :genotypes do |t|
39
+ t.column :allele_id, :integer
40
+ t.column :strain_id, :integer
41
+ end
42
+ end
43
+
44
+ unless table_exists? :annotations
45
+ create_table :annotations do |t|
46
+ t.column :qualifier, :string
47
+ t.column :value, :string
48
+ t.column :feature_id, :integer
49
+ end
50
+ end
51
+
52
+ # indices
53
+ unless index_exists? :features, :name
54
+ add_index :features, :name
55
+ end
56
+ unless index_exists? :features, :start
57
+ add_index :features, :start
58
+ end
59
+ unless index_exists? :features, :end
60
+ add_index :features, :end
61
+ end
62
+ unless index_exists? :features, :strand
63
+ add_index :features, :strand
64
+ end
65
+ unless index_exists? :snps, :ref_pos
66
+ add_index :snps, :ref_pos
67
+ end
68
+ unless index_exists? :snps, :feature_id
69
+ add_index :snps, :feature_id
70
+ end
71
+ unless index_exists? :alleles, :snp_id
72
+ add_index :alleles, :snp_id
73
+ end
74
+ unless index_exists? :alleles, :base
75
+ add_index :alleles, :base
76
+ end
77
+ unless index_exists? :genotypes, :allele_id
78
+ add_index :genotypes, :allele_id
79
+ end
80
+ unless index_exists? :genotypes, :strain_id
81
+ add_index :genotypes, :strain_id
82
+ end
83
+ unless index_exists? :annotations, :feature_id
84
+ add_index :annotations, :feature_id
85
+ end
86
+ end
@@ -0,0 +1,21 @@
1
+ require 'snp-search'
2
+
3
+ #path for embl file here
4
+ genome_sequence = Bio::FlatFile.open(Bio::EMBL, "path_for_embl_file_here").next_entry
5
+
6
+ #path for vcf file here
7
+ vcf_mpileup_file = "path_for_vcf_file_here"
8
+
9
+ #array of strain names here
10
+ strains = ["STRAIN_NAME_1", "STRAIN_NAME_2"]
11
+
12
+ # That's it, your job is done here.
13
+
14
+ # The populate_strains method populates the strains in the db. It uses the strain names in array.
15
+ populate_strains(strains)
16
+
17
+ # The populate_features_and_annotations method populates the features and annotations. It uses the embl/gbk file.
18
+ populate_features_and_annotations(genome_sequence)
19
+
20
+ # The populate_snps_alleles_genotypes method populates the snps, alleles and genotypes. It uses the strain names (array) and vcf file.
21
+ populate_snps_alleles_genotypes(strains, vcf_mpileup_file)
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "snp-search"
8
- s.version = "0.1.0"
8
+ s.version = "0.2.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Ali Al-Shahib", "Anthony Underwood"]
@@ -27,6 +27,10 @@ Gem::Specification.new do |s|
27
27
  "Rakefile",
28
28
  "VERSION",
29
29
  "lib/snp-search.rb",
30
+ "lib/snp_db_connection.rb",
31
+ "lib/snp_db_models.rb",
32
+ "lib/snp_db_schema.rb",
33
+ "lib/user_entry_file.rb",
30
34
  "snp-search.gemspec",
31
35
  "spec/snp-search_spec.rb",
32
36
  "spec/spec_helper.rb"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: snp-search
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -14,7 +14,7 @@ date: 2011-11-25 00:00:00.000000000Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: rspec
17
- requirement: &2168567960 !ruby/object:Gem::Requirement
17
+ requirement: &2161702220 !ruby/object:Gem::Requirement
18
18
  none: false
19
19
  requirements:
20
20
  - - ~>
@@ -22,10 +22,10 @@ dependencies:
22
22
  version: 2.3.0
23
23
  type: :development
24
24
  prerelease: false
25
- version_requirements: *2168567960
25
+ version_requirements: *2161702220
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: bundler
28
- requirement: &2168545440 !ruby/object:Gem::Requirement
28
+ requirement: &2161701620 !ruby/object:Gem::Requirement
29
29
  none: false
30
30
  requirements:
31
31
  - - ~>
@@ -33,10 +33,10 @@ dependencies:
33
33
  version: 1.0.0
34
34
  type: :development
35
35
  prerelease: false
36
- version_requirements: *2168545440
36
+ version_requirements: *2161701620
37
37
  - !ruby/object:Gem::Dependency
38
38
  name: jeweler
39
- requirement: &2168544960 !ruby/object:Gem::Requirement
39
+ requirement: &2161701080 !ruby/object:Gem::Requirement
40
40
  none: false
41
41
  requirements:
42
42
  - - ~>
@@ -44,10 +44,10 @@ dependencies:
44
44
  version: 1.6.4
45
45
  type: :development
46
46
  prerelease: false
47
- version_requirements: *2168544960
47
+ version_requirements: *2161701080
48
48
  - !ruby/object:Gem::Dependency
49
49
  name: rcov
50
- requirement: &2168544400 !ruby/object:Gem::Requirement
50
+ requirement: &2161699820 !ruby/object:Gem::Requirement
51
51
  none: false
52
52
  requirements:
53
53
  - - ! '>='
@@ -55,7 +55,7 @@ dependencies:
55
55
  version: '0'
56
56
  type: :development
57
57
  prerelease: false
58
- version_requirements: *2168544400
58
+ version_requirements: *2161699820
59
59
  description: Use the snp-search toolset to query the SNP database
60
60
  email: ali.al-shahib@hpa.org.uk
61
61
  executables: []
@@ -74,6 +74,10 @@ files:
74
74
  - Rakefile
75
75
  - VERSION
76
76
  - lib/snp-search.rb
77
+ - lib/snp_db_connection.rb
78
+ - lib/snp_db_models.rb
79
+ - lib/snp_db_schema.rb
80
+ - lib/user_entry_file.rb
77
81
  - snp-search.gemspec
78
82
  - spec/snp-search_spec.rb
79
83
  - spec/spec_helper.rb
@@ -92,7 +96,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
92
96
  version: '0'
93
97
  segments:
94
98
  - 0
95
- hash: 1006490403685499848
99
+ hash: 3531722145808918413
96
100
  required_rubygems_version: !ruby/object:Gem::Requirement
97
101
  none: false
98
102
  requirements: