snp-search 0.21.0 → 0.22.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -58,7 +58,7 @@ Alternatively, you may download a SQL tool to see a GUI of your database (e.g. S
58
58
 
59
59
  We have included two example queries that you may find useful:
60
60
 
61
- * Example1: This script queries the database and selects all genes except the phage genes. The output is a FASTA file of the genes. This is a way of removing a set of genes that are not needed for the SNP analysis. You may use this script to do other SQL queries that result in a FASTA output.
61
+ * Example1: This script queries the database and selects only those SNPs that are not found in phage-related genes. These SNPs are then used to build a concatenated SNP multiple alignment file (FASTA format). This is a way of removing a set of genes that are not needed for the SNP analysis. You may use this script to do other SQL queries that result in a FASTA output.
62
62
 
63
63
  Usage:
64
64
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.21.0
1
+ 0.22.0
data/bin/snp-search CHANGED
@@ -10,10 +10,9 @@ opts = Slop.new :help do
10
10
  banner "ruby snp-search [OPTIONS]"
11
11
 
12
12
  on :V, :verbose, 'Enable verbose mode'
13
- on :n, :name=, 'Name of database, (default: snp_db.sqlite3)', :default => 'snp_db.sqlite3'
13
+ on :n, :name=, 'Name of database, Required', true
14
14
  on :r, :reference_file=, 'Reference genome file, in gbk or embl file format, Required', true
15
15
  on :v, :vcf_file=, '.vcf file, Required', true
16
- on :s, :strain=, 'text file with a list of strains/samples, Required', true
17
16
  on :c, :cuttoff_snp=, 'SNP quality cutoff, (default = 90)', :default => 90
18
17
  on :t, :cuttoff_genotype=, 'Genotype quality cutoff (default = 30)', :default => 30
19
18
 
@@ -23,9 +22,9 @@ opts = Slop.new :help do
23
22
  end
24
23
  opts.parse
25
24
 
25
+ puts "You must supply the -n option, it's a required field" and exit unless opts[:name]
26
26
  puts "You must supply the -r option, it's a required field" and exit unless opts[:reference_file]
27
27
  puts "You must supply the -v option, it's a required field" and exit unless opts[:vcf_file]
28
- puts "You must supply the -s option, it's a required field" and exit unless opts[:strain]
29
28
 
30
29
  begin
31
30
  puts "#{opts[:reference_file]} file does not exist!" and exit unless File.exist?(opts[:reference_file])
@@ -37,19 +36,7 @@ opts.parse
37
36
  rescue
38
37
  end
39
38
 
40
- begin
41
- puts "#{opts[:strain]} file does not exist!" and exit unless File.exist?(opts[:strain])
42
- rescue
43
- end
44
-
45
-
46
39
  begin
47
- strains = []
48
- File.read(opts[:strain]).each_line do |line|
49
- strains << line.chop
50
- end
51
-
52
-
53
40
  # Enter the name of your database
54
41
  establish_connection(opts[:name])
55
42
 
@@ -74,15 +61,11 @@ sequence_format = guess_sequence_format(ref)
74
61
  # path for vcf file here
75
62
  vcf_mpileup_file = opts[:vcf_file]
76
63
 
77
-
78
- # The populate_strains method populates the strains in the db. It uses the strain names in array.
79
- populate_strains(strains)
80
-
81
64
  # The populate_features_and_annotations method populates the features and annotations. It uses the embl/gbk file.
82
65
  populate_features_and_annotations(sequence_flatfile)
83
66
 
84
- #The populate_snps_alleles_genotypes method populates the snps, alleles and genotypes. It uses the strain names (array) and vcf file.
85
- populate_snps_alleles_genotypes(strains, vcf_mpileup_file, opts[:cuttoff_snp].to_i, opts[:cuttoff_genotype].to_i)
67
+ # The populate_snps_alleles_genotypes method populates the strains, snps, alleles and genotypes. It uses the vcf file and, if specified, the SNP quality cutoff and genotype quality cutoff.
68
+ populate_snps_alleles_genotypes(vcf_mpileup_file, opts[:cuttoff_snp].to_i, opts[:cuttoff_genotype].to_i)
86
69
 
87
70
  rescue
88
71
  end
data/examples/ali.txt ADDED
File without changes
data/examples/example1.rb CHANGED
@@ -2,9 +2,7 @@
2
2
  # Only use this script once your database has been fully populated.
3
3
  # Usage: ruby example1.rb -d your_db_name.sqlite3 -s list_of_your_species.txt -a phage -o output.fasta
4
4
  # You may use this script to do other SQL queries that result in a fasta output. Just change the 'snps' SQL query below with your query.
5
- require 'snp_db_connection'
6
5
  require 'snp_db_models'
7
- require 'snp_db_schema'
8
6
  gem "slop", "~> 2.4.0"
9
7
  require 'slop'
10
8
 
@@ -15,7 +13,8 @@ opts = Slop.new :help do
15
13
  on :d, :database=, 'The name of the database you like to query', true
16
14
  on :o, :outfile=, 'output file, in fasta format', true
17
15
  on :s, :strain=, 'The strains/samples you like to query', true
18
-
16
+ on :a, :annotation=, 'The gene you like to remove from analysis', true
17
+
19
18
  on_empty do
20
19
  puts help
21
20
  end
@@ -35,6 +34,7 @@ begin
35
34
  rescue
36
35
  end
37
36
 
37
+ annotation = opts[:annotation]
38
38
  establish_connection(opts[:database])
39
39
 
40
40
  begin
@@ -68,7 +68,7 @@ snps = Snp.find_by_sql("SELECT snps.* FROM snps
68
68
  (select distinct features.id FROM features
69
69
  INNER JOIN annotations ON
70
70
  annotations.feature_id = features.id
71
- WHERE annotations.value LIKE '%phage%'))")
71
+ WHERE annotations.value LIKE '%(#{annotation})%'))")
72
72
 
73
73
 
74
74
  #puts snps.size
@@ -0,0 +1,212 @@
1
+ H041200152
2
+ H041260144
3
+ H041320325
4
+ H041380342
5
+ H041520010
6
+ H041620019
7
+ H041680416
8
+ H041740036
9
+ H041860019
10
+ H042140018
11
+ H040300050
12
+ H042220216
13
+ H041400347
14
+ H041460313
15
+ H042320017
16
+ H040360231
17
+ H040640029
18
+ H040660409
19
+ H040680243
20
+ H040700032
21
+ H040960438
22
+ H041080566
23
+ H041120010
24
+ H044140024
25
+ H044300140
26
+ H044600024
27
+ H044760104
28
+ H045220067
29
+ H045220068
30
+ H050200647
31
+ H050620079
32
+ H080540148
33
+ H080580153
34
+ H042380354
35
+ H080680108
36
+ H080700014
37
+ H080920125
38
+ H081100032
39
+ H042560017
40
+ H042660021
41
+ H042880341
42
+ H043080575
43
+ H043280056
44
+ H043820282
45
+ H043920025
46
+ H044020657
47
+ H081880240
48
+ H081940295
49
+ H081980138
50
+ H082060034
51
+ H082140058
52
+ H082160009
53
+ H082160010
54
+ H082240367
55
+ H082320060
56
+ H082340085
57
+ H081120076
58
+ H082340087
59
+ H082400032
60
+ H082420086
61
+ H081180049
62
+ H081200095
63
+ H081220201
64
+ H081380045
65
+ H081420209
66
+ H081520265
67
+ H081520277
68
+ H081860326
69
+ H085140304
70
+ H085180220
71
+ H085180222
72
+ H090140403
73
+ H090140410
74
+ H090220294
75
+ H090240476
76
+ H090340417
77
+ H090360446
78
+ H090400355
79
+ H082500121
80
+ H090460251
81
+ H090480157
82
+ H090500230
83
+ H082660048
84
+ H082780120
85
+ H082800043
86
+ H082800044
87
+ H082980048
88
+ H083140517
89
+ H083200136
90
+ H084740200
91
+ H091280140
92
+ H091340198
93
+ H091300415
94
+ H091360124
95
+ H091380126
96
+ H091540320
97
+ H091680457
98
+ H091740414
99
+ H091760233
100
+ H091760238
101
+ H090580174
102
+ H091960112
103
+ H091960708
104
+ H091980013
105
+ H090640283
106
+ H090700200
107
+ H090780401
108
+ H090800100
109
+ H090920456
110
+ H091200143
111
+ H091220182
112
+ H091220183
113
+ H090480158
114
+ H090580172
115
+ H090600241
116
+ H090600242
117
+ H090640289
118
+ H090740100
119
+ H090940181
120
+ H090960223
121
+ H091300411
122
+ H091560205
123
+ H092120149
124
+ H091640755
125
+ H091680438
126
+ H091760235
127
+ H092260222
128
+ H092480114
129
+ H092520139
130
+ H092780164
131
+ H093980210
132
+ H095060138
133
+ H095160155
134
+ H090200297
135
+ H094540354
136
+ H094760078
137
+ H095080492
138
+ H095100188
139
+ H095240140
140
+ H100180477
141
+ H095260498
142
+ H094680245
143
+ H094560504
144
+ H094360202
145
+ H091780500
146
+ H094180239
147
+ H094160182
148
+ H093840091
149
+ H091820284
150
+ H092080099
151
+ H093040584
152
+ H093080338
153
+ H093180215
154
+ H093340223
155
+ H093420437
156
+ H093640539
157
+ H093420432
158
+ H093380123
159
+ H093360238
160
+ H093260335
161
+ H093200228
162
+ H093180214
163
+ H093120266
164
+ H093100592
165
+ H093080653
166
+ H093060566
167
+ H093780119
168
+ H092940206
169
+ H092940205
170
+ H092920374
171
+ H093700446
172
+ H093640534
173
+ H093560718
174
+ H093560202
175
+ H093520331
176
+ H093500353
177
+ H093440288
178
+ H093420433
179
+ H092560200
180
+ H092300123
181
+ H092300122
182
+ H092280040
183
+ H092260221
184
+ H091940171
185
+ H090200284
186
+ H091820288
187
+ H091780492
188
+ H091640750
189
+ H092920373
190
+ H090980124
191
+ H090900473
192
+ H090300377
193
+ H092920369
194
+ H092880154
195
+ H092880153
196
+ H092860554
197
+ H092860549
198
+ H092840305
199
+ H092800528
200
+ H092600318
201
+ MGAS10270
202
+ MGAS10394
203
+ MGAS10750
204
+ MGAS2096
205
+ MGAS315
206
+ MGAS5005
207
+ MGAS6180
208
+ MGAS8232
209
+ MGAS9429
210
+ Manfredo
211
+ NZ131
212
+ SSI
@@ -0,0 +1,32 @@
1
+ require 'snp_db_connection'
2
+
3
+ class Strain < ActiveRecord::Base
4
+ has_many :alleles, :through => :genotypes
5
+ has_many :genotypes
6
+ end
7
+
8
+ class Feature < ActiveRecord::Base
9
+ has_many :annotations
10
+ has_many :snps
11
+ end
12
+
13
+ class Snp < ActiveRecord::Base
14
+ belongs_to :feature
15
+ has_many :alleles
16
+ belongs_to :reference_allele, :class_name => "Allele", :foreign_key => "reference_allele_id"
17
+ end
18
+
19
+ class Allele < ActiveRecord::Base
20
+ has_many :genotypes
21
+ belongs_to :snp
22
+ has_many :strains, :through => :genotypes
23
+ end
24
+
25
+ class Genotype < ActiveRecord::Base
26
+ belongs_to :allele
27
+ belongs_to :strain
28
+ end
29
+
30
+ class Annotation < ActiveRecord::Base
31
+ belongs_to :feature
32
+ end
data/lib/snp-search.rb CHANGED
@@ -16,15 +16,6 @@ def guess_sequence_format(reference_genome)
16
16
  return file_format
17
17
  end
18
18
 
19
- # A method to populate the strain names in the Strain table. strain_names is an array of strain names.
20
- def populate_strains(strain_names)
21
- strain_names.each do |strain|
22
- s = Strain.new
23
- s.name = strain
24
- s.save
25
- end
26
- end
27
-
28
19
  # A method to populate the database with the features (genes etc) and the annotations from the embl file.
29
20
  # We include all features that are not 'source' or 'gene' as they are repetitive info. 'CDS' is the gene.
30
21
  # The annotation table includes also the start and end coordinates of the CDS. The strand is also included. the 'locations' method is defined in bioruby under genbank. It must be required at the top (bio).
@@ -56,12 +47,8 @@ end
56
47
  #This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
57
48
  # It requires the output (vcf file) from the mpileup SNP identification algorithm; the strain names are read from the column headings of the vcf file.
58
49
 
59
- def populate_snps_alleles_genotypes(strain_names, vcf_file, cuttoff_snp, cuttoff_genotype)
60
- strains = Array.new
61
- strain_names.each do |strain_name|
62
- strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
63
- strains << strain
64
- end
50
+ def populate_snps_alleles_genotypes(vcf_file, cuttoff_snp, cuttoff_genotype)
51
+
65
52
 
66
53
  # open vcf file and parse each line
67
54
  File.open(vcf_file) do |f|
@@ -69,8 +56,22 @@ def populate_snps_alleles_genotypes(strain_names, vcf_file, cuttoff_snp, cuttoff
69
56
  header = f.gets
70
57
  header2 = f.gets.chomp
71
58
  column_headings = header2.split("\t")
72
- sample_names = column_headings[9..-1]
73
-
59
+ strain_names = column_headings[9..-1]
60
+ strain_names.map!{|name| name.sub(/\..*/, '')}
61
+
62
+ strain_names.each do |str|
63
+ ss = Strain.new
64
+ ss.name = str
65
+ ss.save
66
+ end
67
+
68
+ strains = Array.new
69
+ strain_names.each do |strain_name|
70
+ strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
71
+ strains << strain
72
+ end
73
+
74
+
74
75
  good_snps = 0
75
76
  # start parsing snps
76
77
  while line = f.gets
data/snp-search.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "snp-search"
8
- s.version = "0.21.0"
8
+ s.version = "0.22.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Ali Al-Shahib", "Anthony Underwood"]
@@ -28,8 +28,11 @@ Gem::Specification.new do |s|
28
28
  "Rakefile",
29
29
  "VERSION",
30
30
  "bin/snp-search",
31
+ "examples/ali.txt",
31
32
  "examples/example1.rb",
32
33
  "examples/example2.rb",
34
+ "examples/list_of_GAS_strains.txt",
35
+ "examples/snp_db_models.rb",
33
36
  "lib/snp-search.rb",
34
37
  "lib/snp_db_connection.rb",
35
38
  "lib/snp_db_models.rb",
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: snp-search
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.21.0
4
+ version: 0.22.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -14,7 +14,7 @@ date: 2011-12-07 00:00:00.000000000Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: activerecord
17
- requirement: &2158662860 !ruby/object:Gem::Requirement
17
+ requirement: &2158657440 !ruby/object:Gem::Requirement
18
18
  none: false
19
19
  requirements:
20
20
  - - ~>
@@ -22,10 +22,10 @@ dependencies:
22
22
  version: 3.1.3
23
23
  type: :runtime
24
24
  prerelease: false
25
- version_requirements: *2158662860
25
+ version_requirements: *2158657440
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: bio
28
- requirement: &2158662200 !ruby/object:Gem::Requirement
28
+ requirement: &2158656840 !ruby/object:Gem::Requirement
29
29
  none: false
30
30
  requirements:
31
31
  - - ~>
@@ -33,10 +33,10 @@ dependencies:
33
33
  version: 1.4.2
34
34
  type: :runtime
35
35
  prerelease: false
36
- version_requirements: *2158662200
36
+ version_requirements: *2158656840
37
37
  - !ruby/object:Gem::Dependency
38
38
  name: slop
39
- requirement: &2158661540 !ruby/object:Gem::Requirement
39
+ requirement: &2158645460 !ruby/object:Gem::Requirement
40
40
  none: false
41
41
  requirements:
42
42
  - - ~>
@@ -44,10 +44,10 @@ dependencies:
44
44
  version: 2.4.0
45
45
  type: :runtime
46
46
  prerelease: false
47
- version_requirements: *2158661540
47
+ version_requirements: *2158645460
48
48
  - !ruby/object:Gem::Dependency
49
49
  name: sqlite3
50
- requirement: &2158660960 !ruby/object:Gem::Requirement
50
+ requirement: &2158644880 !ruby/object:Gem::Requirement
51
51
  none: false
52
52
  requirements:
53
53
  - - ~>
@@ -55,10 +55,10 @@ dependencies:
55
55
  version: 1.3.4
56
56
  type: :runtime
57
57
  prerelease: false
58
- version_requirements: *2158660960
58
+ version_requirements: *2158644880
59
59
  - !ruby/object:Gem::Dependency
60
60
  name: rspec
61
- requirement: &2158660360 !ruby/object:Gem::Requirement
61
+ requirement: &2158644320 !ruby/object:Gem::Requirement
62
62
  none: false
63
63
  requirements:
64
64
  - - ~>
@@ -66,10 +66,10 @@ dependencies:
66
66
  version: 2.3.0
67
67
  type: :development
68
68
  prerelease: false
69
- version_requirements: *2158660360
69
+ version_requirements: *2158644320
70
70
  - !ruby/object:Gem::Dependency
71
71
  name: bundler
72
- requirement: &2158659780 !ruby/object:Gem::Requirement
72
+ requirement: &2158643800 !ruby/object:Gem::Requirement
73
73
  none: false
74
74
  requirements:
75
75
  - - ~>
@@ -77,10 +77,10 @@ dependencies:
77
77
  version: 1.0.0
78
78
  type: :development
79
79
  prerelease: false
80
- version_requirements: *2158659780
80
+ version_requirements: *2158643800
81
81
  - !ruby/object:Gem::Dependency
82
82
  name: jeweler
83
- requirement: &2158659200 !ruby/object:Gem::Requirement
83
+ requirement: &2158643320 !ruby/object:Gem::Requirement
84
84
  none: false
85
85
  requirements:
86
86
  - - ~>
@@ -88,10 +88,10 @@ dependencies:
88
88
  version: 1.6.4
89
89
  type: :development
90
90
  prerelease: false
91
- version_requirements: *2158659200
91
+ version_requirements: *2158643320
92
92
  - !ruby/object:Gem::Dependency
93
93
  name: rcov
94
- requirement: &2158658680 !ruby/object:Gem::Requirement
94
+ requirement: &2158642840 !ruby/object:Gem::Requirement
95
95
  none: false
96
96
  requirements:
97
97
  - - ! '>='
@@ -99,7 +99,7 @@ dependencies:
99
99
  version: '0'
100
100
  type: :development
101
101
  prerelease: false
102
- version_requirements: *2158658680
102
+ version_requirements: *2158642840
103
103
  description: Use the snp-search toolset to query the SNP database
104
104
  email: ali.al-shahib@hpa.org.uk
105
105
  executables:
@@ -119,8 +119,11 @@ files:
119
119
  - Rakefile
120
120
  - VERSION
121
121
  - bin/snp-search
122
+ - examples/ali.txt
122
123
  - examples/example1.rb
123
124
  - examples/example2.rb
125
+ - examples/list_of_GAS_strains.txt
126
+ - examples/snp_db_models.rb
124
127
  - lib/snp-search.rb
125
128
  - lib/snp_db_connection.rb
126
129
  - lib/snp_db_models.rb
@@ -143,7 +146,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
143
146
  version: '0'
144
147
  segments:
145
148
  - 0
146
- hash: 2333300887326022643
149
+ hash: 534829390528662828
147
150
  required_rubygems_version: !ruby/object:Gem::Requirement
148
151
  none: false
149
152
  requirements: