snp-search 2.5.2 → 2.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +46 -39
- data/VERSION +1 -1
- data/bin/snp-search +53 -53
- data/lib/information_methods.rb +2 -2
- data/lib/output_information_methods.rb +23 -15
- data/lib/snp-search.rb +1 -2
- data/pkg/snp-search-2.5.2.gem +0 -0
- data/pkg/snp-search-2.6.0.gem +0 -0
- data/snp-search.gemspec +4 -2
- metadata +5 -3
data/README.rdoc
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
= snp-search
|
2
2
|
|
3
|
-
|
3
|
+
an easy to use tool for management of SNPs generated from haploid next generation sequencing data. Given a vcf file, snp-search stores the SNPs generated by the variant calling algorithm into a sqlite database. snp-search can then be used to extract useful information from the database.
|
4
4
|
|
5
5
|
== Obtaining and installing the code
|
6
6
|
SNPsearch is written in Ruby and operates in a Unix environment. It is made available as a gem. See the github site for more information (https://github.com/hpa-bioinformatics/snp-search).
|
@@ -13,9 +13,10 @@ To install snp-search, do
|
|
13
13
|
Not much, you just need:
|
14
14
|
|
15
15
|
* Unix. Once snp-search is installed, all the necessary gems to run snp-search will also be installed from Rubygems (note that Rubygems requires admin privileges. If you do not have admin privileges then we suggest you install RVM: (http://beginrescueend.com/rvm/install/) and then gem install snp-search).
|
16
|
+
|
16
17
|
* ruby version 1.8.7 and above.
|
17
18
|
|
18
|
-
* Optional: FastTree. If you require a tree output in Newick format, you must install FastTree from http://www.microbesonline.org/fasttree/#Install.
|
19
|
+
* Optional: FastTree 2. If you require a tree output in Newick format, you must install FastTree from http://www.microbesonline.org/fasttree/#Install.
|
19
20
|
|
20
21
|
That's it!
|
21
22
|
|
@@ -29,65 +30,72 @@ Thats it!
|
|
29
30
|
|
30
31
|
1B- Your database reference genome that you used to generate your .vcf file (in genbank or embl format, the script will automatically detect the format).
|
31
32
|
|
32
|
-
You need the following parameters:
|
33
|
+
You need the following parameters:
|
33
34
|
|
34
|
-
-
|
35
|
+
-d Name of your database (note that this is a required field in all commands).
|
35
36
|
-v .vcf file
|
36
|
-
-
|
37
|
+
-r Database Reference genome (The same file that was used in generating the .vcf file). This should be in genbank or embl format.
|
37
38
|
|
38
|
-
|
39
|
-
-c SNP quality score cutoff. A Phred-scaled quality score. High quality scores indicate high confidence calls. Optional, default = 90 (out of 100)
|
40
|
-
-g Genotype Quality score cutoff. Phred-scaled quality score that the genotype is true. Optional, default = 30
|
41
|
-
-h help message
|
39
|
+
Optional: -A AD ratio cutoff (default 0.9)
|
42
40
|
|
43
41
|
Usage:
|
44
|
-
snp-search -create -
|
42
|
+
snp-search -create -d my_snp_db.sqlite3 -r my_ref.gbk -v my_vcf_file.vcf
|
45
43
|
|
46
44
|
Note: The strain names in your database will be taken from your vcf file so make sure they are named appropriately in your vcf file.
|
47
45
|
|
48
46
|
2- Now that you have created the database (my_snp_db.sqlite3) you can use snp-search to query it and output several kinds of results.
|
49
47
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
- Querying the database to output all SNPs without specified features in the database (e.g. phages). This is a way of
|
64
|
-
|
65
|
-
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
48
|
+
First, you need to tell snp-search what you want out. You have several options:
|
49
|
+
- Querying the database to select the unique SNPs within the list of the strains/samples provided (list_of_my_strains.txt). The output is a text file with a list of the unique SNPs and information about each SNP (e.g. whether it is a synonymous or non-synonymous SNP).
|
50
|
+
|
51
|
+
-output -unique_snps -d db.sqlite3 [options]
|
52
|
+
-u, --unique_snps Query for unique snps in the database
|
53
|
+
-c, --cuttoff_snp_qual SNP quality cutoff, (default = 90)
|
54
|
+
-g, --cuttoff_genotype Genotype quality cutoff (default = 30)
|
55
|
+
-s, --strain The strains/samples you like to query (only used with -unique_snps flag)
|
56
|
+
-o, --out Name of output file, Required
|
57
|
+
|
58
|
+
Usage:
|
59
|
+
snp-search -O -u -d my_snp_db.sqlite3 -s list_of_my_strains.txt -o unique_snps.out
|
60
|
+
|
61
|
+
- Querying the database to output all SNPs except those located in specified features in the database (e.g. phages). This is a way of ignoring SNPs in genes (likely to be mobile element genes) that are not needed for SNP analysis. The user has the option of generating a core SNP tree Newick file for SNP phylogeny (if the -F option was used to output a fasta file).
|
62
|
+
|
63
|
+
-output -all_or_filtered_snps -d db.sqlite3 [options]
|
64
|
+
-f, --all_or_filtered_snps SNPs from specified features in the database (if you do not want to ignore any SNPs, just use this option with -d -F/T -o)
|
65
|
+
-F, --fasta output fasta file format (default)
|
66
|
+
-T, --tabular output tabular file format
|
67
|
+
-c, --cuttoff_snp_qual SNP quality cutoff, (default = 90)
|
68
|
+
-g, --cuttoff_genotype Genotype quality cutoff (default = 30)
|
69
|
+
-R, --remove_non_informative_snps Only output informative SNPs. Only used with -f option
|
70
|
+
-e, --ignore_snps_in_range A list of position ranges to ignore e.g 10..500,2000..2500. Only used with -f option
|
71
|
+
-a, --ignore_strains A list of strains to ignore (separated by commas, e.g. S1,S4,S8). Only used with -f option
|
72
|
+
-I, --ignore_snps_on_annotation The name of the feature(s) to ignore. Features should be separated by commas (e.g. phages,insertion,transposons)
|
73
|
+
-o, --out Name of output file, Required
|
74
|
+
-t, --tree Generate SNP phylogeny (only used with -fasta option)
|
75
|
+
-p, --fasttree_path Full path to the FastTree tool (e.g. /usr/local/bin/FastTree. only used with -tree option)
|
71
76
|
|
72
77
|
Usage:
|
73
|
-
snp-search -O -F -
|
78
|
+
snp-search -O -F -f -d my_snp_db.sqlite3 -I phage,insertion,transposon -R -o snps_without_phages.fasta
|
74
79
|
|
75
80
|
- Optionally, you can add the following options to generate a phylogenetic tree from the resulting fasta file:
|
76
81
|
|
77
82
|
-t Generate SNP phylogeny
|
78
|
-
-
|
83
|
+
-p Full path to the FastTree tool (e.g. /usr/local/bin/FastTree. only used with -tree option)
|
79
84
|
Usage:
|
80
|
-
snp-search -O -F -e -n my_snp_db.sqlite3 -a phage,insertion,transposon -r -t -
|
85
|
+
snp-search -O -F -f -d my_snp_db.sqlite3 -I phage,insertion,transposon -R -t -p /usr/local/bin/FastTree -o snps_without_phages.fasta
|
81
86
|
|
82
87
|
The algorithm FastTree is used to generate the nwk file. FastTree can be downloaded from http://www.microbesonline.org/fasttree/#Install (see above)
|
83
88
|
|
84
89
|
- Output all SNPs with information. Information for each SNP includes whether the SNP is synonymous or non-synonymous, gene function, whether it is a pseudogene and other useful information. This information will be tab-separated.
|
85
90
|
|
86
|
-
-
|
87
|
-
|
91
|
+
-output -info -d db.sqlite3 [options]
|
92
|
+
-i, --info Output various information about SNPs
|
93
|
+
-c, --cuttoff_snp_qual SNP quality cutoff, (default = 90)
|
94
|
+
-g, --cuttoff_genotype Genotype quality cutoff (default = 30)
|
95
|
+
-o, --out Name of output file, Required
|
88
96
|
|
89
97
|
Usage:
|
90
|
-
snp-search -O -
|
98
|
+
snp-search -O -info -d my_snp_db.sqlite3 -o snps_all_with_info.txt
|
91
99
|
|
92
100
|
== View database in Unix or in a GUI
|
93
101
|
Your database will be in sqlite3 format. If you would like to view your table(s) and perform direct queries you can type
|
@@ -107,5 +115,4 @@ Have fun snp-searching!
|
|
107
115
|
== Copyright
|
108
116
|
|
109
117
|
Copyright (c) 2012 Ali Al-Shahib. See LICENSE.txt for
|
110
|
-
further details.
|
111
|
-
|
118
|
+
further details.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.7.0
|
data/bin/snp-search
CHANGED
@@ -1,14 +1,15 @@
|
|
1
|
-
require 'snp-search'
|
1
|
+
require '/Volumes/NGS2_DataRAID/projects/ali/GAS/snp-search/lib/snp-search'
|
2
2
|
require 'snp_db_connection.rb'
|
3
3
|
require 'snp_db_models.rb'
|
4
4
|
require 'snp_db_schema.rb'
|
5
|
-
require 'output_information_methods.rb'
|
5
|
+
require '/Volumes/NGS2_DataRAID/projects/ali/GAS/snp-search/lib/output_information_methods.rb'
|
6
6
|
require 'activerecord-import'
|
7
7
|
require 'slop'
|
8
8
|
|
9
9
|
opts = Slop.parse do
|
10
10
|
|
11
11
|
banner "\nruby snp-search [-create] [-output] [-n <sqlite3>] [options]*"
|
12
|
+
|
12
13
|
separator ''
|
13
14
|
|
14
15
|
on :C, :create, 'Create database'
|
@@ -17,49 +18,46 @@ opts = Slop.parse do
|
|
17
18
|
# separator ''
|
18
19
|
# # separator 'README file: https://github.com/hpa-bioinformatics/snp-search/blob/master/README.rdoc'
|
19
20
|
# # separator 'The following command must be used when using -create, or -query or -out_file'
|
20
|
-
# on :n, :
|
21
|
+
# on :n, :name_of_database=, 'Name of database, Required'
|
21
22
|
|
22
23
|
separator ''
|
23
24
|
|
24
|
-
separator '-create
|
25
|
-
on :
|
25
|
+
separator '-create -r reference_file.fasta -v vcf_file.vcf -d db.sqlite3'
|
26
|
+
on :r, :reference_file=, 'Reference genome file, in gbk or embl file format, Required', true
|
26
27
|
on :v, :vcf_file=, 'variant call format (vcf) file, Required', true
|
27
|
-
on :
|
28
|
+
on :d, :name_of_database=, 'Name of database, Required'
|
28
29
|
on :A, :cuttoff_ad=, 'AD ratio cutoff (default 0.9)', :as => :int, :default => 0.9
|
29
30
|
|
30
31
|
separator ''
|
31
32
|
|
32
|
-
separator '-output -
|
33
|
-
on :
|
33
|
+
separator '-output -all_or_filtered_snps -d db.sqlite3 [options]'
|
34
|
+
on :f, :all_or_filtered_snps, 'SNPs from specified features in the database (if you do not want to ignore any SNPs, just use this option with -n -F/T -o)'
|
35
|
+
on :F, :fasta, 'output fasta file format (default)'
|
34
36
|
on :T, :tabular, 'output tabular file format'
|
35
37
|
on :c, :cuttoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
|
36
38
|
on :g, :cuttoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
|
37
|
-
on :
|
38
|
-
on :
|
39
|
-
on :
|
40
|
-
on :
|
41
|
-
on :I, :ignore_snps_on_annotation=, 'The name of the feature to ignore.'
|
39
|
+
on :R, :remove_non_informative_snps, 'Only output informative SNPs.'
|
40
|
+
on :e, :ignore_snps_in_range=, 'A list of position ranges to ignore e.g 10..500,2000..2500.'
|
41
|
+
on :a, :ignore_strains=, 'A list of strains to ignore (seperate by comma e.g. S1,S4,S8 ).'
|
42
|
+
on :I, :ignore_snps_on_annotation=, 'The name of the feature(s) to ignore. Features should be seperated by comma (e.g. phages,inserstion,transposons)'
|
42
43
|
on :o, :out=, 'Name of output file, Required'
|
43
44
|
on :t, :tree, 'Generate SNP phylogeny (only used with -fasta option)'
|
44
45
|
on :p, :fasttree_path=, 'Full path to the FastTree tool (e.g. /usr/local/bin/FastTree. only used with -tree option)'
|
45
|
-
|
46
46
|
separator ''
|
47
47
|
|
48
|
-
separator '-output -unique_snps -
|
48
|
+
separator '-output -unique_snps -d db.sqlite3 [options]'
|
49
|
+
on :u, :unique_snps, 'Query for unique snps in the database'
|
49
50
|
on :c, :cuttoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
|
50
51
|
on :g, :cuttoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
|
51
|
-
on :u, :unique_snps, 'Query for unique snps in the database'
|
52
52
|
on :s, :strain=, 'The strains/samples you like to query (only used with -unique_snps flag)'
|
53
53
|
on :o, :out=, 'Name of output file, Required'
|
54
54
|
|
55
55
|
separator ''
|
56
56
|
|
57
|
-
separator '-output -info -
|
57
|
+
separator '-output -info -d db.sqlite3 [options]'
|
58
58
|
on :i, :info, 'Output various information about SNPs'
|
59
59
|
on :c, :cuttoff_snp_qual=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90
|
60
60
|
on :g, :cuttoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30
|
61
|
-
on :t, :tree, 'Generate SNP phylogeny (only used with -fasta option)'
|
62
|
-
on :w, :nwk_out=, 'Name of output tree in Newick format (only used with -tree option)'
|
63
61
|
on :o, :out=, 'Name of output file, Required'
|
64
62
|
end
|
65
63
|
|
@@ -67,49 +65,52 @@ end
|
|
67
65
|
|
68
66
|
# CREATING A DATABASE
|
69
67
|
if opts[:create]
|
70
|
-
|
68
|
+
|
69
|
+
# raise "Please provide a database file name" if opts[:reference_file].empty?
|
71
70
|
# puts opts[:cuttoff_snp_qual].to_i
|
72
71
|
|
73
72
|
error_msg = ""
|
74
73
|
|
75
|
-
error_msg += "-
|
76
|
-
error_msg += "-
|
74
|
+
error_msg += "-d: \t Name of your database\n" unless opts[:name_of_database]
|
75
|
+
error_msg += "-r: \t Reference genome file, in gbk or embl file format\n" unless opts[:reference_file]
|
77
76
|
error_msg += "-v: \t .vcf file\n" unless opts[:vcf_file]
|
78
77
|
|
79
|
-
error_msg_optional = ""
|
78
|
+
# error_msg_optional = ""
|
80
79
|
|
81
|
-
error_msg_optional += "-c: \tSNP quality cutoff, (default = 90)\n"
|
82
|
-
error_msg_optional += "-g: \tGenotype quality cutoff (default = 30)\n"
|
80
|
+
# error_msg_optional += "-c: \tSNP quality cutoff, (default = 90)\n"
|
81
|
+
# error_msg_optional += "-g: \tGenotype quality cutoff (default = 30)\n"
|
83
82
|
|
84
83
|
unless error_msg == ""
|
85
84
|
puts "Please provide the following required fields:"
|
86
85
|
puts error_msg
|
87
|
-
puts "Optional fields:"
|
88
|
-
puts error_msg_optional
|
89
|
-
|
86
|
+
# puts "Optional fields:"
|
87
|
+
# puts error_msg_optional
|
88
|
+
|
89
|
+
# puts "Please provide a database file name" if opts[:reference_file].empty?
|
90
|
+
# puts opts.help unless opts.empty?
|
90
91
|
exit
|
91
92
|
end
|
92
93
|
|
93
|
-
abort "#{opts[:
|
94
|
+
abort "#{opts[:reference_file]} file does not exist!" unless File.exist?(opts[:reference_file])
|
94
95
|
|
95
96
|
abort "#{opts[:vcf_file]} file does not exist!" unless File.exist?(opts[:vcf_file])
|
96
97
|
|
97
98
|
|
98
99
|
# Name of your database
|
99
|
-
establish_connection(opts[:
|
100
|
+
establish_connection(opts[:name_of_database])
|
100
101
|
|
101
102
|
# Schema will run here
|
102
103
|
db_schema
|
103
104
|
|
104
|
-
ref = opts[:
|
105
|
+
ref = opts[:reference_file]
|
105
106
|
|
106
107
|
sequence_format = guess_sequence_format(ref)
|
107
108
|
|
108
109
|
case sequence_format
|
109
110
|
when :genbank
|
110
|
-
sequence_flatfile = Bio::FlatFile.open(Bio::GenBank,opts[:
|
111
|
+
sequence_flatfile = Bio::FlatFile.open(Bio::GenBank,opts[:reference_file]).next_entry
|
111
112
|
when :embl
|
112
|
-
sequence_flatfile = Bio::FlatFile.open(Bio::EMBL,opts[:
|
113
|
+
sequence_flatfile = Bio::FlatFile.open(Bio::EMBL,opts[:reference_file]).next_entry
|
113
114
|
else
|
114
115
|
puts "All sequence files should be in genbank or embl format"
|
115
116
|
exit
|
@@ -128,33 +129,32 @@ if opts[:create]
|
|
128
129
|
elsif opts[:output]
|
129
130
|
|
130
131
|
error_msg = ""
|
131
|
-
error_msg += "-
|
132
|
+
error_msg += "-f: \t SNPs from specified features in the database OR\n-u: \t Query for unique snps in the database OR\n-i: \t Information on all SNPs\n" unless opts[:all_or_filtered_snps] || opts[:unique_snps] || opts[:info]
|
132
133
|
|
133
134
|
unless error_msg == ""
|
134
135
|
puts "Please provide the following required fields:"
|
135
136
|
puts error_msg
|
136
|
-
puts opts.help unless opts.empty?
|
137
|
+
# puts opts.help unless opts.empty?
|
137
138
|
exit
|
138
139
|
end
|
139
140
|
|
140
|
-
if opts[:
|
141
|
+
if opts[:all_or_filtered_snps]
|
141
142
|
|
142
143
|
error_msg = ""
|
143
144
|
|
144
|
-
error_msg += "-
|
145
|
+
error_msg += "-d: \t Name of your database\n" unless opts[:name_of_database]
|
145
146
|
error_msg += "-o: \t name of your output file\n" unless opts[:out]
|
146
147
|
error_msg += "-F: \t Fasta output OR\n-T: \t Tabular output" unless opts[:fasta] || opts[:tabular]
|
147
148
|
|
148
149
|
error_msg_optional = ""
|
149
150
|
|
150
|
-
error_msg_optional += "-I,\t --ignore_snps_on_annotation: ignore
|
151
|
-
error_msg_optional += "-
|
152
|
-
error_msg_optional += "-
|
151
|
+
error_msg_optional += "-I,\t --ignore_snps_on_annotation: The name of the feature(s) to ignore. Features should be seperated by comma (e.g. phages,inserstion,transposons)\n" unless opts[:ignore_snps_on_annotation]
|
152
|
+
error_msg_optional += "-a,\t --ignore_strains: A list of strains to ignore\n" unless opts[:ignore_strains]
|
153
|
+
error_msg_optional += "-e,\t --ignore_snps_in_range: A list of position ranges to ignore e.g 10..500,2000..2500\n" unless opts[:ignore_snps_in_range]
|
153
154
|
error_msg_optional += "-c,\t --cuttoff_snp_qual: cuttoff for SNP Quality\n" unless opts[:cuttoff_snp_qual]
|
154
155
|
error_msg_optional += "-g,\t --cuttoff_genotype: cuttoff for Genotype Quality\n" unless opts[:cuttoff_genotype]
|
155
|
-
error_msg_optional += "-
|
156
|
+
error_msg_optional += "-R,\t --remove_non_informative_snps: Only output informative SNPs\n" unless opts[:remove_non_informative_snps]
|
156
157
|
error_msg_optional += "-t,\t --tree: Construct tree from output\n" unless opts[:tree]
|
157
|
-
error_msg_optional += "-w,\t --nwk_out: Name of Newick output file(use only when-tree option used)\n" unless opts[:nwk_out]
|
158
158
|
|
159
159
|
unless error_msg == ""
|
160
160
|
puts "Please provide the following required fields:"
|
@@ -164,13 +164,13 @@ elsif opts[:output]
|
|
164
164
|
# Added this here as it wont appear here in error_msg_optional as its set as default.
|
165
165
|
puts "-c,\t --cuttoff_snp_qual: cuttoff for SNP Quality (default 90)\n"
|
166
166
|
puts "-g,\t --cuttoff_genotype: cuttoff for Genotype Quality (default 30)\n"
|
167
|
-
puts opts.help unless opts.empty?
|
167
|
+
# puts opts.help unless opts.empty?
|
168
168
|
exit
|
169
169
|
end
|
170
170
|
|
171
|
-
abort "#{opts[:
|
171
|
+
abort "#{opts[:name_of_database]} database does not exist!" unless File.exist?(opts[:name_of_database])
|
172
172
|
|
173
|
-
establish_connection(opts[:
|
173
|
+
establish_connection(opts[:name_of_database])
|
174
174
|
|
175
175
|
get_snps(opts[:out], opts[:ignore_snps_on_annotation], opts[:ignore_snps_in_range], opts[:ignore_strains], opts[:remove_non_informative_snps], opts[:fasta], opts[:tabular], opts[:cuttoff_genotype], opts[:cuttoff_snp_qual], opts[:tree], opts[:fasttree_path])
|
176
176
|
end
|
@@ -181,7 +181,7 @@ elsif opts[:output]
|
|
181
181
|
|
182
182
|
error_msg = ""
|
183
183
|
|
184
|
-
error_msg += "-
|
184
|
+
error_msg += "-d: \t Name of your database\n" unless opts[:name_of_database]
|
185
185
|
error_msg += "-s: \t List of strains you like to query\n" unless opts[:strain]
|
186
186
|
error_msg += "-o: \t Name of the output file\n" unless opts[:out]
|
187
187
|
|
@@ -192,14 +192,14 @@ elsif opts[:output]
|
|
192
192
|
# Added this here as it wont appear here in error_msg_optional as its set as default.
|
193
193
|
puts "-c,\t --cuttoff_snp_qual: cuttoff for SNP Quality (default 90)\n"
|
194
194
|
puts "-g,\t --cuttoff_genotype: cuttoff for Genotype Quality (default 30)\n"
|
195
|
-
puts opts.help unless opts.empty?
|
195
|
+
# puts opts.help unless opts.empty?
|
196
196
|
exit
|
197
197
|
end
|
198
198
|
|
199
|
-
abort "#{opts[:
|
199
|
+
abort "#{opts[:name_of_database]} database does not exist!" unless File.exist?(opts[:name_of_database])
|
200
200
|
abort "#{opts[:strain]} file does not exist!" unless File.exist?(opts[:strain])
|
201
201
|
|
202
|
-
establish_connection(opts[:
|
202
|
+
establish_connection(opts[:name_of_database])
|
203
203
|
|
204
204
|
strains = []
|
205
205
|
File.read(opts[:strain]).each_line do |line|
|
@@ -214,7 +214,7 @@ elsif opts[:output]
|
|
214
214
|
|
215
215
|
error_msg = ""
|
216
216
|
|
217
|
-
error_msg += "-
|
217
|
+
error_msg += "-d: \t the name of your database\n" unless opts[:name_of_database]
|
218
218
|
error_msg += "-o: \t name of your output file (in tab-delimited format)\n" unless opts[:out]
|
219
219
|
|
220
220
|
unless error_msg == ""
|
@@ -224,13 +224,13 @@ elsif opts[:output]
|
|
224
224
|
# Added this here as it wont appear here in error_msg_optional as its set as default.
|
225
225
|
puts "-c,\t --cuttoff_snp_qual: cuttoff for SNP Quality (default 90)\n"
|
226
226
|
puts "-g,\t --cuttoff_genotype: cuttoff for Genotype Quality (default 30)\n"
|
227
|
-
puts opts.help unless opts.empty?
|
227
|
+
# puts opts.help unless opts.empty?
|
228
228
|
exit
|
229
229
|
end
|
230
230
|
|
231
|
-
abort "#{opts[:
|
231
|
+
abort "#{opts[:name_of_database]} database does not exist!" unless File.exist?(opts[:name_of_database])
|
232
232
|
|
233
|
-
establish_connection(opts[:
|
233
|
+
establish_connection(opts[:name_of_database])
|
234
234
|
|
235
235
|
#information defined in bin/snp-search.rb
|
236
236
|
information(opts[:out], opts[:cuttoff_genotype], opts[:cuttoff_snp_qual])
|
data/lib/information_methods.rb
CHANGED
@@ -63,8 +63,8 @@ def information()
|
|
63
63
|
hydrophobic = ["I", "L", "V", "C", "A", "G", "M", "F", "Y", "W", "H", "T"]
|
64
64
|
non_hydrophobic = ["K", "E", "Q", "D", "N", "S", "P", "B"]
|
65
65
|
|
66
|
-
polar = ["
|
67
|
-
non_polar = ["
|
66
|
+
polar = ["R", "N", "D", "E", "Q", "H", "K", "S", "T", "Y"]
|
67
|
+
non_polar = ["A", "C", "G", "I", "L", "M", "F", "P", "W", "V"]
|
68
68
|
|
69
69
|
small = ["V","C","A","G","D","N","S","T","P"]
|
70
70
|
non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"]
|
@@ -9,17 +9,17 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
|
|
9
9
|
outfile.puts "pos_of_SNP_in_ref\tref_base\tSNP_base\tsynonymous or non-synonymous\tGene_annotation\tpossible_pseudogene?\tamino_acid_original\tamino_acid_change\tchange_in_hydrophobicity_of_AA?\tchange_in_polarisation_of_AA?\tchange_in_size_of_AA?\t#{strains.map{|strain| strain.name}.join("\t") if info}"
|
10
10
|
|
11
11
|
snps_counter = 0
|
12
|
+
cds_snps_counter = 0
|
12
13
|
total_number_of_syn_snps = 0
|
13
14
|
total_number_of_non_syn_snps = 0
|
14
15
|
total_number_of_pseudo = 0
|
15
16
|
snps.each do |snp|
|
16
17
|
|
17
18
|
ActiveRecord::Base.transaction do
|
18
|
-
snps_counter +=1
|
19
19
|
snp.alleles.each do |allele|
|
20
20
|
next if snp.alleles.any?{|allele| allele.base.length > 1} # indel
|
21
21
|
if allele.id != snp.reference_allele_id
|
22
|
-
|
22
|
+
|
23
23
|
# get annotation (if there is any) for each SNP
|
24
24
|
features = Feature.joins(:snps).where("snps.id = ?", snp.id)
|
25
25
|
|
@@ -36,20 +36,25 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
|
|
36
36
|
|
37
37
|
ref_base = Bio::Sequence.auto(Allele.find(snp.reference_allele_id).base)
|
38
38
|
snp_base = Bio::Sequence.auto(allele.base)
|
39
|
-
|
39
|
+
# count snps now: after you have selected the snps with gqs and snp_qual greater than the threshold.
|
40
|
+
snps_counter += 1
|
40
41
|
# If the feature is empty then just output basic information about the snp.
|
42
|
+
|
41
43
|
if features.empty?
|
42
44
|
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}"
|
43
|
-
|
45
|
+
|
46
|
+
else
|
44
47
|
features.each do |feature|
|
45
|
-
|
46
48
|
if feature.name == "CDS"
|
47
49
|
|
50
|
+
cds_snps_counter +=1
|
51
|
+
|
48
52
|
annotation = Annotation.where("annotations.qualifier = 'product' and annotations.feature_id = ?", feature.id).first
|
49
53
|
#if annotation is nil, or empty
|
50
54
|
if annotation.nil?
|
51
55
|
outfile.puts "#{snp.ref_pos}\t#{feature.strand == 1 ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{feature.strand == 1 ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}"
|
52
56
|
else
|
57
|
+
|
53
58
|
feature_sequence = feature.sequence
|
54
59
|
|
55
60
|
feature_sequence_bio = Bio::Sequence::NA.new(feature_sequence)
|
@@ -91,24 +96,25 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
|
|
91
96
|
allele_for_strains = Allele.joins(:genotypes => :strain).where("strains.id = ? AND alleles.snp_id = ?", strain.id, snp.id).first
|
92
97
|
alleles_array << allele_for_strains.base
|
93
98
|
end
|
99
|
+
|
94
100
|
# If no difference between the amino acids then its synonymous SNP, if different then its non-synonymous.
|
95
101
|
if original_seq_translated_clean == mutated_seq_translated_clean
|
96
|
-
|
102
|
+
total_number_of_syn_snps +=1
|
97
103
|
if mutated_seq_translated_clean =~ /\*/
|
98
104
|
total_number_of_pseudo +=1
|
99
|
-
outfile.puts "#{snp.ref_pos}\t#{feature.strand == 1 ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{feature.strand == 1 ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tYes\tN/A\tN/A\tN/A\tN/A\tN/A\t#{alleles_array.join("\t") if info}"
|
105
|
+
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tYes\tN/A\tN/A\tN/A\tN/A\tN/A\t#{alleles_array.join("\t") if info}"
|
100
106
|
else
|
101
|
-
outfile.puts "#{snp.ref_pos}\t#{feature.strand == 1 ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{feature.strand == 1 ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tNo\tN/A\tN/A\tN/A\tN/A\tN/A\t#{alleles_array.join("\t") if info}"
|
107
|
+
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tNo\tN/A\tN/A\tN/A\tN/A\tN/A\t#{alleles_array.join("\t") if info}"
|
102
108
|
end
|
103
109
|
else
|
104
|
-
|
110
|
+
total_number_of_non_syn_snps +=1
|
105
111
|
diffs = Diff::LCS.diff(original_seq_translated_clean, mutated_seq_translated_clean)
|
106
112
|
|
107
113
|
if mutated_seq_translated_clean =~ /\*/
|
108
114
|
total_number_of_pseudo +=1
|
109
|
-
outfile.puts "#{snp.ref_pos}\t#{feature.strand == 1 ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{feature.strand == 1 ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t") if info}"
|
115
|
+
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t") if info}"
|
110
116
|
else
|
111
|
-
outfile.puts "#{snp.ref_pos}\t#{feature.strand == 1 ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{feature.strand == 1 ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tNo\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t") if info}"
|
117
|
+
outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tNo\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t") if info}"
|
112
118
|
end
|
113
119
|
end
|
114
120
|
end
|
@@ -117,14 +123,16 @@ def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, inf
|
|
117
123
|
end
|
118
124
|
end
|
119
125
|
end
|
120
|
-
puts "Total SNPs added so far: #{
|
126
|
+
puts "Total SNPs added so far: #{cds_snps_counter}" if snps_counter % 100 == 0
|
121
127
|
end
|
122
128
|
end
|
123
129
|
puts "Total number of snps: #{snps_counter}"
|
124
|
-
puts "Total number of
|
125
|
-
puts "Total number of
|
126
|
-
puts "Total number of
|
130
|
+
puts "Total number of snps in CDS region: #{cds_snps_counter}"
|
131
|
+
puts "Total number of synonymous SNPs: #{total_number_of_syn_snps}"
|
132
|
+
puts "Total number of non-synonymous SNPs: #{total_number_of_non_syn_snps}"
|
133
|
+
puts "Total number of pseudogenes: #{total_number_of_pseudo}"
|
127
134
|
outfile.puts "Total number of snps: #{snps_counter}"
|
135
|
+
outfile.puts "Total number of snps in CDS region: #{cds_snps_counter}"
|
128
136
|
outfile.puts "Total number of synonymous SNPs: #{total_number_of_syn_snps}"
|
129
137
|
outfile.puts "Total number of non-synonymous SNPs: #{total_number_of_non_syn_snps}"
|
130
138
|
outfile.puts "Total number of possible pseudogenes: #{total_number_of_pseudo}"
|
data/lib/snp-search.rb
CHANGED
@@ -16,8 +16,7 @@ def find_unqiue_snps(strain_names, out, cuttoff_genotype, cuttoff_snp)
|
|
16
16
|
outfile = File.open(out, "w")
|
17
17
|
|
18
18
|
snps = Snp.find_by_sql("SELECT snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND genotypes.geno_qual >= #{cuttoff_genotype} AND snps.qual >= #{cuttoff_snp} AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}")
|
19
|
-
|
20
|
-
puts "The number of unique snps are #{snps.size}"
|
19
|
+
# puts "The number of unique snps are #{snps.size}"
|
21
20
|
|
22
21
|
output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, false)
|
23
22
|
end
|
Binary file
|
Binary file
|
data/snp-search.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "snp-search"
|
8
|
-
s.version = "2.
|
8
|
+
s.version = "2.7.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Ali Al-Shahib", "Anthony Underwood"]
|
12
|
-
s.date = "2013-
|
12
|
+
s.date = "2013-08-02"
|
13
13
|
s.description = "Use the snp-search tool to create, import, manipulate and query your SNP database"
|
14
14
|
s.email = "ali.al-shahib@phe.gov.uk"
|
15
15
|
s.executables = ["snp-search"]
|
@@ -40,6 +40,8 @@ Gem::Specification.new do |s|
|
|
40
40
|
"pkg/snp-search-2.3.0.gem",
|
41
41
|
"pkg/snp-search-2.4.0.gem",
|
42
42
|
"pkg/snp-search-2.5.0.gem",
|
43
|
+
"pkg/snp-search-2.5.2.gem",
|
44
|
+
"pkg/snp-search-2.6.0.gem",
|
43
45
|
"snp-search.gemspec",
|
44
46
|
"spec/snp-search_spec.rb",
|
45
47
|
"spec/spec_helper.rb"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snp-search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.7.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2013-
|
13
|
+
date: 2013-08-02 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: activerecord
|
@@ -188,6 +188,8 @@ files:
|
|
188
188
|
- pkg/snp-search-2.3.0.gem
|
189
189
|
- pkg/snp-search-2.4.0.gem
|
190
190
|
- pkg/snp-search-2.5.0.gem
|
191
|
+
- pkg/snp-search-2.5.2.gem
|
192
|
+
- pkg/snp-search-2.6.0.gem
|
191
193
|
- snp-search.gemspec
|
192
194
|
- spec/snp-search_spec.rb
|
193
195
|
- spec/spec_helper.rb
|
@@ -206,7 +208,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
206
208
|
version: '0'
|
207
209
|
segments:
|
208
210
|
- 0
|
209
|
-
hash: -
|
211
|
+
hash: -258043406808362242
|
210
212
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
211
213
|
none: false
|
212
214
|
requirements:
|