reubypathdb 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +13 -0
- data/Gemfile.lock +20 -0
- data/README.rdoc +3 -1
- data/Rakefile +28 -28
- data/VERSION +1 -1
- data/lib/eupathdb_gff.rb +2 -1
- data/lib/eupathdb_species_data.rb +413 -0
- data/lib/reubypathdb.rb +1 -0
- data/reubypathdb.gemspec +21 -11
- data/test/test_eupathdb_species_data.rb +113 -0
- metadata +62 -13
data/Gemfile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
# Add dependencies to develop your gem here.
|
7
|
+
# Include everything needed to run rake, tests, features, etc.
|
8
|
+
group :development do
|
9
|
+
gem "shoulda", ">= 0"
|
10
|
+
gem "bundler", "~> 1.0.0"
|
11
|
+
gem "jeweler", "~> 1.6.4"
|
12
|
+
gem "rcov", ">= 0"
|
13
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
git (1.2.5)
|
5
|
+
jeweler (1.6.4)
|
6
|
+
bundler (~> 1.0)
|
7
|
+
git (>= 1.2.5)
|
8
|
+
rake
|
9
|
+
rake (0.9.2)
|
10
|
+
rcov (0.9.10)
|
11
|
+
shoulda (2.11.3)
|
12
|
+
|
13
|
+
PLATFORMS
|
14
|
+
ruby
|
15
|
+
|
16
|
+
DEPENDENCIES
|
17
|
+
bundler (~> 1.0.0)
|
18
|
+
jeweler (~> 1.6.4)
|
19
|
+
rcov
|
20
|
+
shoulda
|
data/README.rdoc
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
= eupathdb
|
2
2
|
|
3
|
-
|
3
|
+
ALPHA software! Most likely the interface to the methods will change, and often.
|
4
|
+
|
5
|
+
Reubypathdb is a collection of Ruby methods associated with EuPathDB(.org) databases. Reubypathdb focuses on using files downloaded from the downloads sections of different databases, e.g. the GFF file and the gene information file for each species.
|
4
6
|
|
5
7
|
== Note on Patches/Pull Requests
|
6
8
|
|
data/Rakefile
CHANGED
@@ -1,22 +1,29 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
require 'rubygems'
|
4
|
+
require 'bundler'
|
5
|
+
begin
|
6
|
+
Bundler.setup(:default, :development)
|
7
|
+
rescue Bundler::BundlerError => e
|
8
|
+
$stderr.puts e.message
|
9
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
+
exit e.status_code
|
11
|
+
end
|
2
12
|
require 'rake'
|
3
13
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
end
|
16
|
-
Jeweler::GemcutterTasks.new
|
17
|
-
rescue LoadError
|
18
|
-
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
14
|
+
require 'jeweler'
|
15
|
+
Jeweler::Tasks.new do |gem|
|
16
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
+
gem.name = "reubypathdb"
|
18
|
+
gem.homepage = "http://github.com/wwood/reubypathdb"
|
19
|
+
gem.license = "MIT"
|
20
|
+
gem.summary = %Q{Classes to help parsing EuPathDB data files}
|
21
|
+
gem.description = %Q{Classes to help parsing EuPathDB data files}
|
22
|
+
gem.email = "donttrustben near gmail.com"
|
23
|
+
gem.authors = ["Ben J Woodcroft"]
|
24
|
+
# dependencies defined in Gemfile
|
19
25
|
end
|
26
|
+
Jeweler::RubygemsDotOrgTasks.new
|
20
27
|
|
21
28
|
require 'rake/testtask'
|
22
29
|
Rake::TestTask.new(:test) do |test|
|
@@ -25,21 +32,14 @@ Rake::TestTask.new(:test) do |test|
|
|
25
32
|
test.verbose = true
|
26
33
|
end
|
27
34
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
end
|
35
|
-
rescue LoadError
|
36
|
-
task :rcov do
|
37
|
-
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
38
|
-
end
|
35
|
+
require 'rcov/rcovtask'
|
36
|
+
Rcov::RcovTask.new do |test|
|
37
|
+
test.libs << 'test'
|
38
|
+
test.pattern = 'test/**/test_*.rb'
|
39
|
+
test.verbose = true
|
40
|
+
test.rcov_opts << '--exclude "gems/*"'
|
39
41
|
end
|
40
42
|
|
41
|
-
task :test => :check_dependencies
|
42
|
-
|
43
43
|
task :default => :test
|
44
44
|
|
45
45
|
require 'rake/rdoctask'
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.3.0
|
data/lib/eupathdb_gff.rb
CHANGED
@@ -0,0 +1,413 @@
|
|
1
|
+
|
2
|
+
# A class dedicated to recording 'administrative' data about the databases,
|
3
|
+
# answering questions such as "which species are recorded in ToxoDB?" for instance.
|
4
|
+
#
|
5
|
+
# It is also meant for dealing with locally cached version of the files, where
|
6
|
+
# all the data is stored in a base directory with a specified structure.
|
7
|
+
#
|
8
|
+
# TODO: functions for the info and the local caching should probably be separated
|
9
|
+
# into separate classes, and the directory structure of the local versions shouldn't
|
10
|
+
# be forced on the user.
|
11
|
+
class EuPathDBSpeciesData
|
12
|
+
@@data = {
|
13
|
+
## PlasmoDB
|
14
|
+
'Plasmodium falciparum' => {
|
15
|
+
:name => 'Plasmodium falciparum',
|
16
|
+
:source => 'PlasmoDB',
|
17
|
+
:fasta_file_species_name => 'Plasmodium_falciparum_3D7',
|
18
|
+
:sequencing_centre_abbreviation => 'psu',
|
19
|
+
:behind_usage_policy => true,
|
20
|
+
},
|
21
|
+
'Plasmodium yoelii' => {
|
22
|
+
:directory => 'yoelii',
|
23
|
+
:name => 'Plasmodium yoelii',
|
24
|
+
:sequencing_centre_abbreviation => 'TIGR',
|
25
|
+
:fasta_file_species_name => 'Plasmodium_yoelii_yoelii_str._17XNL',
|
26
|
+
:proteins_fasta_filename => lambda {|version| "PyoeliiAnnotatedProteins_PlasmoDB-#{version}.fasta"},
|
27
|
+
#:transcripts_fasta_filename => lambda {|version| "PyoeliiAllTranscripts_PlasmoDB-#{version}.fasta"},
|
28
|
+
:source => 'PlasmoDB'
|
29
|
+
},
|
30
|
+
'Plasmodium vivax' => {
|
31
|
+
:name => 'Plasmodium vivax',
|
32
|
+
:sequencing_centre_abbreviation => 'gb',
|
33
|
+
:fasta_file_species_name => 'Plasmodium_vivax_SaI-1',
|
34
|
+
:proteins_fasta_filename => lambda {|version| "PvivaxAnnotatedProteins_PlasmoDB-#{version}.fasta"},
|
35
|
+
:source => 'PlasmoDB'
|
36
|
+
},
|
37
|
+
'Plasmodium berghei' => {
|
38
|
+
:name => 'Plasmodium berghei',
|
39
|
+
:sequencing_centre_abbreviation => 'psu',
|
40
|
+
:fasta_file_species_name => 'Plasmodium_berghei_str._ANKA',
|
41
|
+
:proteins_fasta_filename => lambda {|version| "PbergheiAnnotatedProteins_PlasmoDB-#{version}.fasta"},
|
42
|
+
#:transcripts_fasta_filename => lambda {|version| "PbergheiAllTranscripts_PlasmoDB-#{version}.fasta"},
|
43
|
+
:source => 'PlasmoDB'
|
44
|
+
},
|
45
|
+
'Plasmodium chabaudi' => {
|
46
|
+
:name => 'Plasmodium chabaudi',
|
47
|
+
:sequencing_centre_abbreviation => 'psu',
|
48
|
+
:fasta_file_species_name => 'Plasmodium_chabaudi_chabaudi',
|
49
|
+
:proteins_fasta_filename => lambda {|version| "PchabaudiAnnotatedProteins_PlasmoDB-#{version}.fasta"},
|
50
|
+
:source => 'PlasmoDB',
|
51
|
+
:behind_usage_policy => true,
|
52
|
+
},
|
53
|
+
'Plasmodium knowlesi' => {
|
54
|
+
:name => 'Plasmodium knowlesi',
|
55
|
+
:sequencing_centre_abbreviation => 'psu',
|
56
|
+
:fasta_file_species_name => 'Plasmodium_knowlesi_strain_H',
|
57
|
+
:source => 'PlasmoDB',
|
58
|
+
:behind_usage_policy => true,
|
59
|
+
},
|
60
|
+
## ToxoDB
|
61
|
+
'Neospora caninum' => {
|
62
|
+
:name => 'Neospora caninum',
|
63
|
+
:sequencing_centre_abbreviation => 'psu',
|
64
|
+
:fasta_file_species_name => 'Neospora_caninum',
|
65
|
+
:database_download_folder => 'NeosporaCaninum',
|
66
|
+
:representative_strain_name => 'NeosporaCaninum',
|
67
|
+
:proteins_fasta_filename => lambda {|version| "NeosporaCaninumAnnotatedProteins_ToxoDB-#{version}.fasta"},
|
68
|
+
:transcripts_fasta_filename => lambda {|version| "NeosporaCaninumAnnotatedTranscripts_ToxoDB-#{version}.fasta"},
|
69
|
+
:source => 'ToxoDB',
|
70
|
+
:behind_usage_policy => true,
|
71
|
+
},
|
72
|
+
'Eimeria tenella' => {
|
73
|
+
:name => 'Eimeria tenella',
|
74
|
+
:sequencing_centre_abbreviation => 'GeneDB',
|
75
|
+
:fasta_file_species_name => 'EtenellaHoughton',
|
76
|
+
:source => 'ToxoDB',
|
77
|
+
:database_download_folder => 'EtenellaHoughton',
|
78
|
+
:behind_usage_policy => true,
|
79
|
+
},
|
80
|
+
'Toxoplasma gondii' => {
|
81
|
+
:name => 'Toxoplasma gondii',
|
82
|
+
:sequencing_centre_abbreviation => 'gb',
|
83
|
+
:fasta_file_species_name => 'Toxoplasma_gondii_ME49',
|
84
|
+
:database_download_folder => 'TgondiiME49',
|
85
|
+
:gene_information_filename => lambda {|version| "TgondiiME49Gene_ToxoDB-#{version}.txt"},
|
86
|
+
:proteins_fasta_filename => lambda {|version| "TgondiiME49AnnotatedProteins_ToxoDB-#{version}.fasta"},
|
87
|
+
:transcripts_fasta_filename => lambda {|version| "TgondiiME49AnnotatedTranscripts_ToxoDB-#{version}.fasta"},
|
88
|
+
:gff_filename => lambda {|version| "TgondiiME49_ToxoDB-#{version}.gff"},
|
89
|
+
:genomic_fasta_filename => lambda {|version| "TgondiiME49Genomic_ToxoDB-#{version}.fasta"},
|
90
|
+
:source => 'ToxoDB'
|
91
|
+
},
|
92
|
+
## CryptoDB
|
93
|
+
'Cryptosporidium parvum' => {
|
94
|
+
:name => 'Cryptosporidium parvum',
|
95
|
+
:sequencing_centre_abbreviation => 'gb',
|
96
|
+
:fasta_file_species_name => 'Cryptosporidium_parvum',
|
97
|
+
:proteins_fasta_filename => lambda {|version| "CparvumAnnotatedProteins_CryptoDB-#{version}.fasta"},
|
98
|
+
:transcripts_fasta_filename => lambda {|version| "CparvumAnnotatedTranscripts_CryptoDB-#{version}.fasta"},
|
99
|
+
#:gff_filename => lambda {|version| "c_parvum_iowa_ii.gff"}, #changed as of version 4.3
|
100
|
+
:source => 'CryptoDB'
|
101
|
+
},
|
102
|
+
'Cryptosporidium hominis' => {
|
103
|
+
:name => 'Cryptosporidium hominis',
|
104
|
+
:sequencing_centre_abbreviation => 'gb',
|
105
|
+
:fasta_file_species_name => 'Cryptosporidium_hominis',
|
106
|
+
:proteins_fasta_filename => lambda {|version| "ChominisAnnotatedProteins_CryptoDB-#{version}.fasta"},
|
107
|
+
:transcripts_fasta_filename => lambda {|version| "ChominisAnnotatedTranscripts_CryptoDB-#{version}.fasta"},
|
108
|
+
#:gff_filename => lambda {|version| "c_hominis_tu502.gff"}, #changed as of version 4.3
|
109
|
+
:source => 'CryptoDB'
|
110
|
+
},
|
111
|
+
'Cryptosporidium muris' => {
|
112
|
+
:name => 'Cryptosporidium muris',
|
113
|
+
:sequencing_centre_abbreviation => 'gb',
|
114
|
+
:fasta_file_species_name => 'Cryptosporidium_muris',
|
115
|
+
:proteins_fasta_filename => lambda {|version| "CmurisAnnotatedProteins_CryptoDB-#{version}.fasta"},
|
116
|
+
:transcripts_fasta_filename => lambda {|version| "CmurisAnnotatedTranscripts_CryptoDB-#{version}.fasta"},
|
117
|
+
#:gff_filename => lambda {|version| "c_muris.gff"}, #changed as of version 4.3
|
118
|
+
:source => 'CryptoDB'
|
119
|
+
},
|
120
|
+
## PiroplasmaDB
|
121
|
+
'Theileria annulata' => {
|
122
|
+
:name => 'Theileria annulata',
|
123
|
+
:database_download_folder => 'TannulataAnkara',
|
124
|
+
:sequencing_centre_abbreviation => 'Genbank',
|
125
|
+
:fasta_file_species_name => 'Theileria_annulata_strain_Ankara',
|
126
|
+
:source => 'PiroplasmaDB',
|
127
|
+
},
|
128
|
+
'Theileria parva' => {
|
129
|
+
:name => 'Theileria parva',
|
130
|
+
:database_download_folder => 'TparvaMuguga',
|
131
|
+
:sequencing_centre_abbreviation => 'Genbank',
|
132
|
+
:fasta_file_species_name => 'Theileria_parva_strain_Muguga',
|
133
|
+
:source => 'PiroplasmaDB',
|
134
|
+
},
|
135
|
+
'Babesia bovis' => {
|
136
|
+
:name => 'Babesia bovis',
|
137
|
+
:database_download_folder => 'BbovisT2Bo',
|
138
|
+
:representative_strain_name => 'BbovisT2Bo',
|
139
|
+
:sequencing_centre_abbreviation => 'Genbank',
|
140
|
+
:fasta_file_species_name => 'Babesia_bovis_T2Bo',
|
141
|
+
:source => 'PiroplasmaDB',
|
142
|
+
},
|
143
|
+
## FungiDB
|
144
|
+
'Candida albicans' => {
|
145
|
+
:name => 'Candida albicans',
|
146
|
+
:database_download_folder => 'Candida_albicans_SC5314',
|
147
|
+
:sequencing_centre_abbreviation => 'CGD',
|
148
|
+
:fasta_file_species_name => 'Candida_albicans_SC5314',
|
149
|
+
:source => 'FungiDB',
|
150
|
+
},
|
151
|
+
## TriTrypDB
|
152
|
+
'Trypanosoma brucei' => {
|
153
|
+
:name => 'Trypanosoma brucei',
|
154
|
+
:sequencing_centre_abbreviation => 'GeneDB',
|
155
|
+
:source => 'TriTrypDB',
|
156
|
+
:representative_strain_name => 'TbruceiTreu927',
|
157
|
+
:fasta_file_species_name => 'Trypanosoma_brucei_TREU927',
|
158
|
+
},
|
159
|
+
}
|
160
|
+
# Duplicate so both the species name and genus-species name work
|
161
|
+
@@data.keys.each do |key|
|
162
|
+
# name is full name of the species by default
|
163
|
+
@@data[key][:name] ||= key
|
164
|
+
|
165
|
+
# the species name without genus can also be used
|
166
|
+
splits = key.split(' ')
|
167
|
+
raise unless splits.length == 2
|
168
|
+
raise if @@data[splits[1]]
|
169
|
+
@@data[splits[1]] = @@data[key]
|
170
|
+
end
|
171
|
+
|
172
|
+
SOURCE_VERSIONS = {
|
173
|
+
'PlasmoDB' => '7.2',#
|
174
|
+
'ToxoDB' => '6.4',#'7.0',#
|
175
|
+
'CryptoDB' => '4.4',#'4.5',#
|
176
|
+
'PiroplasmaDB' => '1.0',#'1.1',#
|
177
|
+
'FungiDB' => '1.0',
|
178
|
+
'TriTrypDB' => '3.2',
|
179
|
+
}
|
180
|
+
DATABASES = SOURCE_VERSIONS.keys
|
181
|
+
|
182
|
+
# Create a new object about one particular species. The species can be specified
|
183
|
+
# by a nickname, which is either the full binomial name of the species, e.g.
|
184
|
+
# "Plasmodium falciparum", or by simply the second part (the species name without
|
185
|
+
# the genus name) e.g. 'falciparum'.
|
186
|
+
#
|
187
|
+
# base_data_directory is the directory where locally cached version of the downloaded
|
188
|
+
# files are stored.
|
189
|
+
def initialize(nickname, base_data_directory=nil)
|
190
|
+
@species_data = @@data[nickname] # try the full name
|
191
|
+
@species_data ||= @@data[nickname.capitalize.gsub('_',' ')] #try replacing underscores
|
192
|
+
if @species_data.nil? # try using just the second word
|
193
|
+
splits = nickname.split(' ')
|
194
|
+
if splits.length == 2
|
195
|
+
@species_data = @@data[splits[1]]
|
196
|
+
end
|
197
|
+
end
|
198
|
+
|
199
|
+
@base_data_directory = base_data_directory
|
200
|
+
|
201
|
+
raise Exception, "Couldn't find species data for #{nickname}" unless @species_data
|
202
|
+
end
|
203
|
+
|
204
|
+
# Expose the entries of the species data hash as reader methods, e.g.
# spd.directory returns @species_data[:directory].
#
# Only argument-less calls are answered from the hash. Anything else
# (unknown keys, keys whose value is nil, or calls made with arguments)
# is passed to the default implementation so that the usual NoMethodError
# is raised, rather than an ArgumentError from this method's own signature.
def method_missing(symbol, *args, &block)
  answer = @species_data[symbol] if @species_data && args.empty?
  return answer unless answer.nil?
  super
end

# Keep respond_to? consistent with method_missing: a name is responded to
# when the species data hash holds a non-nil value for it.
def respond_to_missing?(symbol, include_private = false)
  (@species_data && !@species_data[symbol].nil?) || super
end
|
209
|
+
|
210
|
+
# The path to the EuPathDB gene information table (stored as a gzip)
|
211
|
+
def gene_information_gzfile_path
|
212
|
+
"#{local_download_directory}/#{gene_information_gzfile_filename}"
|
213
|
+
end
|
214
|
+
|
215
|
+
# The filename of the EuPathDB gene information table (stored as a gzip)
|
216
|
+
def gene_information_gzfile_filename
|
217
|
+
"#{gene_information_filename}.gz"
|
218
|
+
end
|
219
|
+
|
220
|
+
def gene_information_path
|
221
|
+
"#{local_download_directory}/#{gene_information_filename}"
|
222
|
+
end
|
223
|
+
|
224
|
+
def representative_strain_name
|
225
|
+
return @species_data[:representative_strain_name] unless @species_data[:representative_strain_name].nil?
|
226
|
+
return one_word_name
|
227
|
+
end
|
228
|
+
|
229
|
+
def gene_information_filename
|
230
|
+
f = @species_data[:gene_information_filename]
|
231
|
+
if f
|
232
|
+
"#{f.call(version)}"
|
233
|
+
else # TgondiiME49Gene_ToxoDB-5.2.txt.gz
|
234
|
+
# PfalciparumGene_PlasmoDB-6.1.txt.gz
|
235
|
+
"#{representative_strain_name}Gene_#{database}-#{version}.txt"
|
236
|
+
end
|
237
|
+
end
|
238
|
+
|
239
|
+
def version
|
240
|
+
SOURCE_VERSIONS[@species_data[:source]]
|
241
|
+
end
|
242
|
+
|
243
|
+
def protein_fasta_filename
|
244
|
+
if @species_data[:proteins_fasta_filename]
|
245
|
+
return "#{@species_data[:proteins_fasta_filename].call(version)}"
|
246
|
+
else
|
247
|
+
return "#{representative_strain_name}AnnotatedProteins_#{database}-#{version}.fasta"
|
248
|
+
end
|
249
|
+
end
|
250
|
+
|
251
|
+
def protein_fasta_path
|
252
|
+
return File.join(local_download_directory,protein_fasta_filename)
|
253
|
+
end
|
254
|
+
|
255
|
+
def protein_blast_database_path
|
256
|
+
"/blastdb/#{protein_fasta_filename}"
|
257
|
+
end
|
258
|
+
|
259
|
+
def transcript_fasta_filename
|
260
|
+
if @species_data[:transcripts_fasta_filename]
|
261
|
+
return "#{@species_data[:transcripts_fasta_filename].call(version)}"
|
262
|
+
else
|
263
|
+
return "#{representative_strain_name}AnnotatedTranscripts_#{database}-#{version}.fasta"
|
264
|
+
end
|
265
|
+
end
|
266
|
+
|
267
|
+
def transcript_fasta_path
|
268
|
+
File.join(local_download_directory,transcript_fasta_filename)
|
269
|
+
end
|
270
|
+
|
271
|
+
def genomic_fasta_filename
|
272
|
+
genomic = @species_data[:genomic_fasta_filename]
|
273
|
+
if genomic
|
274
|
+
return "#{genomic.call(version)}"
|
275
|
+
else
|
276
|
+
return "#{representative_strain_name}Genomic_#{database}-#{version}.fasta"
|
277
|
+
end
|
278
|
+
end
|
279
|
+
|
280
|
+
def gff_filename
|
281
|
+
if @species_data[:gff_filename]
|
282
|
+
return @species_data[:gff_filename].call(version)
|
283
|
+
else
|
284
|
+
return "#{representative_strain_name}_#{database}-#{version}.gff"
|
285
|
+
end
|
286
|
+
end
|
287
|
+
|
288
|
+
def gff_path
|
289
|
+
File.join(local_download_directory,gff_filename)
|
290
|
+
end
|
291
|
+
|
292
|
+
def database
|
293
|
+
@species_data[:source]
|
294
|
+
end
|
295
|
+
|
296
|
+
def eu_path_db_download_directory
|
297
|
+
directories = {}
|
298
|
+
SOURCE_VERSIONS.each do |db, version|
|
299
|
+
# 'PlasmoDB' => "http://plasmodb.org/common/downloads/release-#{SOURCE_VERSIONS['PlasmoDB']}",
|
300
|
+
directories[db] = "http://#{db.downcase}.org/common/downloads/release-#{version}"
|
301
|
+
end
|
302
|
+
raise Exception, "Base URL for database '#{database}' not known" if directories[database].nil?
|
303
|
+
return "#{directories[database]}/#{one_word_name}"
|
304
|
+
end
|
305
|
+
|
306
|
+
def eu_path_db_fasta_download_directory
|
307
|
+
path = "#{eu_path_db_download_directory}/fasta"
|
308
|
+
path = "#{path}/data" if @species_data[:behind_usage_policy]
|
309
|
+
path
|
310
|
+
end
|
311
|
+
|
312
|
+
def eu_path_db_gff_download_directory
|
313
|
+
path = "#{eu_path_db_download_directory}/gff"
|
314
|
+
path = "#{path}/data" if @species_data[:behind_usage_policy]
|
315
|
+
path
|
316
|
+
end
|
317
|
+
|
318
|
+
def eu_path_db_txt_download_directory
|
319
|
+
path = "#{eu_path_db_download_directory}/txt"
|
320
|
+
path = "#{path}/data" if @species_data[:behind_usage_policy]
|
321
|
+
path
|
322
|
+
end
|
323
|
+
|
324
|
+
# Plasmodium chabaudi => Pchabaudi
|
325
|
+
def one_word_name
  # An explicitly configured download folder always wins.
  folder = @species_data[:database_download_folder]
  return folder unless folder.nil?

  # Otherwise abbreviate the binomial name: the first letter of the genus
  # followed by the species, e.g. 'Plasmodium chabaudi' => 'Pchabaudi'.
  name = @species_data[:name]
  splits = name.split(' ')
  unless splits.length == 2
    raise "Expected a two-word species name (genus and species), found #{name.inspect}"
  end
  "#{splits[0][0..0]}#{splits[1]}"
end
|
331
|
+
|
332
|
+
def local_download_directory
|
333
|
+
s = @species_data
|
334
|
+
"#{@base_data_directory}/#{s[:name]}/genome/#{s[:source]}/#{SOURCE_VERSIONS[s[:source]]}"
|
335
|
+
end
|
336
|
+
|
337
|
+
# an array of directory names. mkdir is called on each of them in order,
|
338
|
+
# otherwise mkdir throws errors because the parent folders do not exist yet
|
339
|
+
# to build on.
|
340
|
+
def directories_for_mkdir
|
341
|
+
if @base_data_directory.nil?
|
342
|
+
raise Exception, "Unable to generate directories when @base_data_directory is not set"
|
343
|
+
end
|
344
|
+
|
345
|
+
s = @species_data
|
346
|
+
components = [
|
347
|
+
@base_data_directory,
|
348
|
+
s[:name],
|
349
|
+
'genome',
|
350
|
+
s[:source],
|
351
|
+
SOURCE_VERSIONS[s[:source]]
|
352
|
+
]
|
353
|
+
|
354
|
+
(0..components.length-1).collect do |i|
|
355
|
+
components[0..i].join('/')
|
356
|
+
end
|
357
|
+
end
|
358
|
+
|
359
|
+
# Return an array of EuPathDBSpeciesData objects, one for each species included in the given EuPathDB database
|
360
|
+
def self.species_data_from_database(database_name, base_download_directory=nil)
  wanted = database_name.downcase

  # @@data holds every species twice: once under its full binomial name and
  # once under the species-only alias added when the class is loaded. Only
  # the full names contain a space, so restricting to those keys yields each
  # species exactly once instead of twice.
  full_names = @@data.keys.select do |name|
    name.include?(' ') && @@data[name][:source].downcase == wanted
  end

  full_names.collect do |name|
    EuPathDBSpeciesData.new(name, base_download_directory)
  end
end
|
368
|
+
|
369
|
+
# Download all the data files from all the EuPathDB databases, or just one single database.
|
370
|
+
# Requires wget to be available on the command line
|
371
|
+
def self.download(base_download_directory, database_name=nil)
  # by default, download everything: recurse once per known database
  if database_name.nil?
    EuPathDBSpeciesData::DATABASES.each do |d|
      download base_download_directory, d
    end
  else
    # Download the new files from the relevant database
    EuPathDBSpeciesData.species_data_from_database(database_name, base_download_directory).each do |spd|
      # Create the local directory tree one level at a time.
      # File.exist? is used because File.exists? is deprecated and no
      # longer available in recent Ruby versions.
      spd.directories_for_mkdir.each do |directory|
        Dir.mkdir(directory) unless File.exist?(directory)
      end

      Dir.chdir(spd.local_download_directory) do
        p spd.eu_path_db_fasta_download_directory

        # [remote directory, filename] pairs, fetched in this order:
        # protein, gff, transcripts, gene information table, genomic
        downloads = [
          [spd.eu_path_db_fasta_download_directory, spd.protein_fasta_filename],
          [spd.eu_path_db_gff_download_directory, spd.gff_filename],
          [spd.eu_path_db_fasta_download_directory, spd.transcript_fasta_filename],
          [spd.eu_path_db_txt_download_directory, spd.gene_information_filename],
          [spd.eu_path_db_fasta_download_directory, spd.genomic_fasta_filename],
        ]

        downloads.each do |remote_directory, filename|
          # Files already present in the local directory are not fetched again.
          next if File.exist?(filename)
          # The URL is quoted consistently so the shell passes it to wget verbatim.
          `wget '#{remote_directory}/#{filename}'`
        end
      end
    end
  end
end
|
413
|
+
end
|
data/lib/reubypathdb.rb
CHANGED
data/reubypathdb.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{reubypathdb}
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.3.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Ben J Woodcroft"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-08-26}
|
13
13
|
s.description = %q{Classes to help parsing EuPathDB data files}
|
14
14
|
s.email = %q{donttrustben near gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -18,38 +18,48 @@ Gem::Specification.new do |s|
|
|
18
18
|
]
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
|
+
"Gemfile",
|
22
|
+
"Gemfile.lock",
|
21
23
|
"LICENSE",
|
22
24
|
"README.rdoc",
|
23
25
|
"Rakefile",
|
24
26
|
"VERSION",
|
25
27
|
"lib/eupathdb_gene_information_table.rb",
|
26
28
|
"lib/eupathdb_gff.rb",
|
29
|
+
"lib/eupathdb_species_data.rb",
|
27
30
|
"lib/jgi_genes.rb",
|
28
31
|
"lib/reubypathdb.rb",
|
29
32
|
"reubypathdb.gemspec",
|
30
33
|
"test/data/eupathGeneInformation.txt",
|
31
34
|
"test/helper.rb",
|
32
|
-
"test/test_eupathdb_gene_information_table.rb"
|
35
|
+
"test/test_eupathdb_gene_information_table.rb",
|
36
|
+
"test/test_eupathdb_species_data.rb"
|
33
37
|
]
|
34
38
|
s.homepage = %q{http://github.com/wwood/reubypathdb}
|
39
|
+
s.licenses = ["MIT"]
|
35
40
|
s.require_paths = ["lib"]
|
36
|
-
s.rubygems_version = %q{1.6.
|
41
|
+
s.rubygems_version = %q{1.6.1}
|
37
42
|
s.summary = %q{Classes to help parsing EuPathDB data files}
|
38
|
-
s.test_files = [
|
39
|
-
"test/helper.rb",
|
40
|
-
"test/test_eupathdb_gene_information_table.rb"
|
41
|
-
]
|
42
43
|
|
43
44
|
if s.respond_to? :specification_version then
|
44
45
|
s.specification_version = 3
|
45
46
|
|
46
47
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
47
|
-
s.add_development_dependency(%q<
|
48
|
+
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
49
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
50
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
51
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
48
52
|
else
|
49
|
-
s.add_dependency(%q<
|
53
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
54
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
55
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
56
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
50
57
|
end
|
51
58
|
else
|
52
|
-
s.add_dependency(%q<
|
59
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
60
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
61
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
62
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
53
63
|
end
|
54
64
|
end
|
55
65
|
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require 'eupathdb_species_data'
|
3
|
+
|
4
|
+
class EuPathDBSpeciesDataTest < Test::Unit::TestCase
|
5
|
+
def base_dir
|
6
|
+
'/home/ben/phd/data'
|
7
|
+
end
|
8
|
+
|
9
|
+
def test_method_missing
|
10
|
+
spd = EuPathDBSpeciesData.new('Plasmodium yoelii')
|
11
|
+
assert_equal 'yoelii', spd.directory
|
12
|
+
end
|
13
|
+
|
14
|
+
def test_nickname
|
15
|
+
spd = EuPathDBSpeciesData.new('Plasmodium yoelii').fasta_file_species_name
|
16
|
+
assert_equal spd, EuPathDBSpeciesData.new('yoelii').fasta_file_species_name
|
17
|
+
assert_equal spd, EuPathDBSpeciesData.new('P. yoelii').fasta_file_species_name #check for not exactly the last name but close enough
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_protein_data_path
|
21
|
+
spd = EuPathDBSpeciesData.new('Plasmodium yoelii', base_dir)
|
22
|
+
assert_equal "/home/ben/phd/data/Plasmodium yoelii/genome/PlasmoDB/#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/PyoeliiAnnotatedProteins_PlasmoDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}.fasta",
|
23
|
+
spd.protein_fasta_path
|
24
|
+
end
|
25
|
+
|
26
|
+
def test_one_word_name
|
27
|
+
spd = EuPathDBSpeciesData.new('Plasmodium chabaudi')
|
28
|
+
assert_equal 'Pchabaudi', spd.one_word_name
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_download_directory
|
32
|
+
spd = EuPathDBSpeciesData.new('Plasmodium chabaudi')
|
33
|
+
assert_equal "http://plasmodb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/Pchabaudi", spd.eu_path_db_download_directory
|
34
|
+
end
|
35
|
+
|
36
|
+
def test_transcript_path_default
|
37
|
+
spd = EuPathDBSpeciesData.new('Plasmodium chabaudi', base_dir)
|
38
|
+
assert_equal "/home/ben/phd/data/Plasmodium chabaudi/genome/PlasmoDB/#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/PchabaudiAnnotatedTranscripts_PlasmoDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}.fasta",
|
39
|
+
spd.transcript_fasta_path
|
40
|
+
end
|
41
|
+
|
42
|
+
def test_transcript_fasta_filename
|
43
|
+
spd = EuPathDBSpeciesData.new('falciparum')
|
44
|
+
assert_equal "Pfalciparum_PlasmoDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}.gff",
|
45
|
+
spd.gff_filename
|
46
|
+
end
|
47
|
+
|
48
|
+
def test_gzfile_path_toxo
|
49
|
+
spd = EuPathDBSpeciesData.new('gondii', base_dir)
|
50
|
+
assert_equal "/home/ben/phd/data/Toxoplasma gondii/genome/ToxoDB/#{EuPathDBSpeciesData::SOURCE_VERSIONS['ToxoDB']}/TgondiiME49Gene_ToxoDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['ToxoDB']}.txt.gz",
|
51
|
+
spd.gene_information_gzfile_path
|
52
|
+
end
|
53
|
+
|
54
|
+
def test_gzfile_path_default
|
55
|
+
spd = EuPathDBSpeciesData.new('falciparum', base_dir)
|
56
|
+
assert_equal "/home/ben/phd/data/Plasmodium falciparum/genome/PlasmoDB/#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/PfalciparumGene_PlasmoDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}.txt.gz",
|
57
|
+
spd.gene_information_gzfile_path
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_gzfile_filename_default
|
61
|
+
spd = EuPathDBSpeciesData.new('falciparum')
|
62
|
+
assert_equal "PfalciparumGene_PlasmoDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}.txt.gz",
|
63
|
+
spd.gene_information_gzfile_filename
|
64
|
+
end
|
65
|
+
|
66
|
+
def test_directories_for_mkdir
|
67
|
+
spd = EuPathDBSpeciesData.new('gondii', base_dir)
|
68
|
+
assert_equal [
|
69
|
+
'/home/ben/phd/data',
|
70
|
+
'/home/ben/phd/data/Toxoplasma gondii',
|
71
|
+
'/home/ben/phd/data/Toxoplasma gondii/genome',
|
72
|
+
'/home/ben/phd/data/Toxoplasma gondii/genome/ToxoDB',
|
73
|
+
"/home/ben/phd/data/Toxoplasma gondii/genome/ToxoDB/#{EuPathDBSpeciesData::SOURCE_VERSIONS['ToxoDB']}"
|
74
|
+
],
|
75
|
+
spd.directories_for_mkdir
|
76
|
+
end
|
77
|
+
|
78
|
+
# Renamed from test_one_word_name: an earlier test in this class already has
# that name, and this later definition replaced it, so the earlier test was
# never run.
#
# Checks that a configured :database_download_folder is used as the one word
# name, and that the abbreviated binomial name is used otherwise.
def test_one_word_name_with_download_folder
  assert_equal 'NeosporaCaninum', EuPathDBSpeciesData.new('Neospora caninum').one_word_name
  spd = EuPathDBSpeciesData.new('Plasmodium falciparum')
  assert_equal 'Pfalciparum', spd.one_word_name
end
|
83
|
+
|
84
|
+
def test_genomic_filename
|
85
|
+
spd = EuPathDBSpeciesData.new('falciparum')
|
86
|
+
assert_equal "PfalciparumGenomic_PlasmoDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}.fasta",
|
87
|
+
spd.genomic_fasta_filename
|
88
|
+
end
|
89
|
+
|
90
|
+
def test_transcripts_name_without_block
|
91
|
+
spd = EuPathDBSpeciesData.new('Babesia bovis')
|
92
|
+
assert_equal "BbovisT2BoAnnotatedTranscripts_PiroplasmaDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PiroplasmaDB']}.fasta",
|
93
|
+
spd.transcript_fasta_filename
|
94
|
+
end
|
95
|
+
|
96
|
+
def test_behind_usage_policy
|
97
|
+
spd = EuPathDBSpeciesData.new('Plasmodium chabaudi')
|
98
|
+
assert_equal "http://plasmodb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/Pchabaudi/fasta/data",
|
99
|
+
spd.eu_path_db_fasta_download_directory
|
100
|
+
end
|
101
|
+
|
102
|
+
# Renamed from test_behind_usage_policy: an earlier test in this class already
# has that name, and this later definition replaced it, so the earlier test
# was never run.
#
# A species without :behind_usage_policy gets no trailing 'data' component in
# its fasta download directory.
def test_not_behind_usage_policy
  spd = EuPathDBSpeciesData.new('Plasmodium vivax')
  assert_equal "http://plasmodb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/Pvivax/fasta",
    spd.eu_path_db_fasta_download_directory
end
|
107
|
+
|
108
|
+
def test_representative_strain_name
|
109
|
+
spd = EuPathDBSpeciesData.new('Trypanosoma brucei')
|
110
|
+
assert_equal "http://tritrypdb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}/Tbrucei/fasta/TbruceiTreu927Genomic_TriTrypDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}.fasta",
|
111
|
+
"#{spd.eu_path_db_fasta_download_directory}/#{spd.genomic_fasta_filename}"
|
112
|
+
end
|
113
|
+
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reubypathdb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
8
|
+
- 3
|
9
9
|
- 0
|
10
|
-
version: 0.
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Ben J Woodcroft
|
@@ -15,12 +15,11 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
18
|
+
date: 2011-08-26 00:00:00 +10:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
|
-
|
23
|
-
prerelease: false
|
22
|
+
type: :development
|
24
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
24
|
none: false
|
26
25
|
requirements:
|
@@ -30,8 +29,55 @@ dependencies:
|
|
30
29
|
segments:
|
31
30
|
- 0
|
32
31
|
version: "0"
|
33
|
-
type: :development
|
34
32
|
version_requirements: *id001
|
33
|
+
name: shoulda
|
34
|
+
prerelease: false
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
type: :development
|
37
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
38
|
+
none: false
|
39
|
+
requirements:
|
40
|
+
- - ~>
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
hash: 23
|
43
|
+
segments:
|
44
|
+
- 1
|
45
|
+
- 0
|
46
|
+
- 0
|
47
|
+
version: 1.0.0
|
48
|
+
version_requirements: *id002
|
49
|
+
name: bundler
|
50
|
+
prerelease: false
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
type: :development
|
53
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
54
|
+
none: false
|
55
|
+
requirements:
|
56
|
+
- - ~>
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
hash: 7
|
59
|
+
segments:
|
60
|
+
- 1
|
61
|
+
- 6
|
62
|
+
- 4
|
63
|
+
version: 1.6.4
|
64
|
+
version_requirements: *id003
|
65
|
+
name: jeweler
|
66
|
+
prerelease: false
|
67
|
+
- !ruby/object:Gem::Dependency
|
68
|
+
type: :development
|
69
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
71
|
+
requirements:
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
hash: 3
|
75
|
+
segments:
|
76
|
+
- 0
|
77
|
+
version: "0"
|
78
|
+
version_requirements: *id004
|
79
|
+
name: rcov
|
80
|
+
prerelease: false
|
35
81
|
description: Classes to help parsing EuPathDB data files
|
36
82
|
email: donttrustben near gmail.com
|
37
83
|
executables: []
|
@@ -43,22 +89,26 @@ extra_rdoc_files:
|
|
43
89
|
- README.rdoc
|
44
90
|
files:
|
45
91
|
- .document
|
92
|
+
- Gemfile
|
93
|
+
- Gemfile.lock
|
46
94
|
- LICENSE
|
47
95
|
- README.rdoc
|
48
96
|
- Rakefile
|
49
97
|
- VERSION
|
50
98
|
- lib/eupathdb_gene_information_table.rb
|
51
99
|
- lib/eupathdb_gff.rb
|
100
|
+
- lib/eupathdb_species_data.rb
|
52
101
|
- lib/jgi_genes.rb
|
53
102
|
- lib/reubypathdb.rb
|
54
103
|
- reubypathdb.gemspec
|
55
104
|
- test/data/eupathGeneInformation.txt
|
56
105
|
- test/helper.rb
|
57
106
|
- test/test_eupathdb_gene_information_table.rb
|
107
|
+
- test/test_eupathdb_species_data.rb
|
58
108
|
has_rdoc: true
|
59
109
|
homepage: http://github.com/wwood/reubypathdb
|
60
|
-
licenses:
|
61
|
-
|
110
|
+
licenses:
|
111
|
+
- MIT
|
62
112
|
post_install_message:
|
63
113
|
rdoc_options: []
|
64
114
|
|
@@ -85,10 +135,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
85
135
|
requirements: []
|
86
136
|
|
87
137
|
rubyforge_project:
|
88
|
-
rubygems_version: 1.6.
|
138
|
+
rubygems_version: 1.6.1
|
89
139
|
signing_key:
|
90
140
|
specification_version: 3
|
91
141
|
summary: Classes to help parsing EuPathDB data files
|
92
|
-
test_files:
|
93
|
-
|
94
|
-
- test/test_eupathdb_gene_information_table.rb
|
142
|
+
test_files: []
|
143
|
+
|