reubypathdb 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/eupathdb_fasta.rb +70 -0
- data/lib/eupathdb_species_data.rb +21 -18
- data/lib/reubypathdb.rb +1 -0
- data/reubypathdb.gemspec +7 -7
- data/test/test_eupathdb_species_data.rb +22 -7
- metadata +6 -8
- data/Gemfile.lock +0 -20
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.0
|
1
|
+
0.3.1
|
@@ -0,0 +1,70 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
class EuPathDB
|
5
|
+
# Looks like EuPathDB databases have settled on something like
|
6
|
+
# >gb|TGME49_000380 | organism=Toxoplasma_gondii_ME49 | product=myb-like DNA binding domain-containing protein | location=TGME49_chrVIII:6835359-6840923(-) | length=1528
|
7
|
+
# where the species name differs but the rest is mostly constant
|
8
|
+
class FastaParser
|
9
|
+
attr_accessor :species_name
|
10
|
+
|
11
|
+
# The species name is what should show up in the 2nd bracket, so something
|
12
|
+
# like 'Toxoplasma_gondii_ME49' for
|
13
|
+
# >gb|TGME49_000380 | organism=Toxoplasma_gondii_ME49 | product=myb-like DNA binding domain-containing protein | location=TGME49_chrVIII:6835359-6840923(-) | length=1528
|
14
|
+
# for instance
|
15
|
+
def initialize(species_name, filename)
|
16
|
+
@species_name = species_name
|
17
|
+
@filename = filename
|
18
|
+
end
|
19
|
+
|
20
|
+
# Enumerate through fasta file entries
|
21
|
+
def each
|
22
|
+
@flat = Bio::FlatFile.open(Bio::FastaFormat, @filename)
|
23
|
+
n = next_entry
|
24
|
+
while !n.nil?
|
25
|
+
yield n
|
26
|
+
n = next_entry
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
# Return the entry in the fasta file, or nil if there is no more or the
|
31
|
+
# fasta file could not be opened correctly.
|
32
|
+
def next_entry
|
33
|
+
return nil if !@flat
|
34
|
+
n = @flat.next_entry
|
35
|
+
return nil if !n
|
36
|
+
|
37
|
+
s = parse_name(n.definition)
|
38
|
+
s.sequence = n.seq
|
39
|
+
return s
|
40
|
+
end
|
41
|
+
|
42
|
+
def parse_name(definition)
|
43
|
+
s = FastaAnnotation.new
|
44
|
+
|
45
|
+
regex = /^(\S+)\|(.*?) \| organism=#{@species_name} \| product=(.*?) \| location=(.*) \| length=\d+$/
|
46
|
+
matches = definition.match(regex)
|
47
|
+
|
48
|
+
if !matches
|
49
|
+
raise Exception, "Definition line has unexpected format: `#{definition}'. Trying to match this line to the regular expression `#{regex.inspect}'"
|
50
|
+
end
|
51
|
+
|
52
|
+
matches2 = matches[4].match(/^(.+?)\:/)
|
53
|
+
if !matches2
|
54
|
+
raise ParseException, "Definition line has unexpected scaffold format: #{matches[4]}"
|
55
|
+
end
|
56
|
+
s.sequencing_centre = matches[1]
|
57
|
+
s.scaffold = matches2[1]
|
58
|
+
s.gene_id = matches[2]
|
59
|
+
s.annotation = matches[3]
|
60
|
+
return s
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
class FastaAnnotation
|
65
|
+
attr_accessor :gene_id, :sequence, :annotation, :scaffold, :sequencing_centre
|
66
|
+
end
|
67
|
+
|
68
|
+
class ParseException < Exception; end
|
69
|
+
end
|
70
|
+
end
|
@@ -76,6 +76,7 @@ class EuPathDBSpeciesData
|
|
76
76
|
:source => 'ToxoDB',
|
77
77
|
:database_download_folder => 'EtenellaHoughton',
|
78
78
|
:behind_usage_policy => true,
|
79
|
+
:fasta_file_species_name => 'Eimeria_tenella_str._Houghton',
|
79
80
|
},
|
80
81
|
'Toxoplasma gondii' => {
|
81
82
|
:name => 'Toxoplasma gondii',
|
@@ -170,12 +171,12 @@ class EuPathDBSpeciesData
|
|
170
171
|
end
|
171
172
|
|
172
173
|
SOURCE_VERSIONS = {
|
173
|
-
'PlasmoDB' => '
|
174
|
-
'ToxoDB' => '
|
175
|
-
'CryptoDB' => '4.
|
176
|
-
'PiroplasmaDB' => '1.
|
174
|
+
'PlasmoDB' => '8.2',
|
175
|
+
'ToxoDB' => '7.2',
|
176
|
+
'CryptoDB' => '4.6',
|
177
|
+
'PiroplasmaDB' => '1.1',
|
177
178
|
'FungiDB' => '1.0',
|
178
|
-
'TriTrypDB' => '
|
179
|
+
'TriTrypDB' => '4.0',
|
179
180
|
}
|
180
181
|
DATABASES = SOURCE_VERSIONS.keys
|
181
182
|
|
@@ -186,7 +187,7 @@ class EuPathDBSpeciesData
|
|
186
187
|
#
|
187
188
|
# base_data_directory is the directory where locally cached version of the downloaded
|
188
189
|
# files are stored.
|
189
|
-
def initialize(nickname, base_data_directory=nil)
|
190
|
+
def initialize(nickname, base_data_directory=nil, database_version=nil)
|
190
191
|
@species_data = @@data[nickname] # try the full name
|
191
192
|
@species_data ||= @@data[nickname.capitalize.gsub('_',' ')] #try replacing underscores
|
192
193
|
if @species_data.nil? # try using just the second word
|
@@ -195,10 +196,13 @@ class EuPathDBSpeciesData
|
|
195
196
|
@species_data = @@data[splits[1]]
|
196
197
|
end
|
197
198
|
end
|
199
|
+
raise Exception, "Couldn't find species data for #{nickname}" unless @species_data
|
198
200
|
|
199
201
|
@base_data_directory = base_data_directory
|
200
202
|
|
201
|
-
|
203
|
+
# record out what version of the db we are looking at, otherwise default
|
204
|
+
@database_version = database_version
|
205
|
+
@database_version ||= SOURCE_VERSIONS[@species_data[:source]]
|
202
206
|
end
|
203
207
|
|
204
208
|
def method_missing(symbol)
|
@@ -237,7 +241,7 @@ class EuPathDBSpeciesData
|
|
237
241
|
end
|
238
242
|
|
239
243
|
def version
|
240
|
-
|
244
|
+
@database_version
|
241
245
|
end
|
242
246
|
|
243
247
|
def protein_fasta_filename
|
@@ -294,13 +298,7 @@ class EuPathDBSpeciesData
|
|
294
298
|
end
|
295
299
|
|
296
300
|
def eu_path_db_download_directory
|
297
|
-
|
298
|
-
SOURCE_VERSIONS.each do |db, version|
|
299
|
-
# 'PlasmoDB' => "http://plasmodb.org/common/downloads/release-#{SOURCE_VERSIONS['PlasmoDB']}",
|
300
|
-
directories[db] = "http://#{db.downcase}.org/common/downloads/release-#{version}"
|
301
|
-
end
|
302
|
-
raise Exception, "Base URL for database '#{database}' not known" if directories[database].nil?
|
303
|
-
return "#{directories[database]}/#{one_word_name}"
|
301
|
+
"http://#{database.downcase}.org/common/downloads/release-#{@database_version}/#{one_word_name}"
|
304
302
|
end
|
305
303
|
|
306
304
|
def eu_path_db_fasta_download_directory
|
@@ -331,7 +329,7 @@ class EuPathDBSpeciesData
|
|
331
329
|
|
332
330
|
def local_download_directory
|
333
331
|
s = @species_data
|
334
|
-
"#{@base_data_directory}/#{s[:name]}/genome/#{s[:source]}/#{
|
332
|
+
"#{@base_data_directory}/#{s[:name]}/genome/#{s[:source]}/#{@database_version}"
|
335
333
|
end
|
336
334
|
|
337
335
|
# an array of directory names. mkdir is called on each of them in order,
|
@@ -348,7 +346,7 @@ class EuPathDBSpeciesData
|
|
348
346
|
s[:name],
|
349
347
|
'genome',
|
350
348
|
s[:source],
|
351
|
-
|
349
|
+
@database_version,
|
352
350
|
]
|
353
351
|
|
354
352
|
(0..components.length-1).collect do |i|
|
@@ -359,7 +357,8 @@ class EuPathDBSpeciesData
|
|
359
357
|
# Return a list of the species names that are included in the EuPathDB database
|
360
358
|
def self.species_data_from_database(database_name, base_download_directory=nil)
|
361
359
|
species = @@data.select {|name, info|
|
362
|
-
|
360
|
+
info[:source].downcase == database_name.downcase and
|
361
|
+
name == info[:name] #only allow ones that are fully specified - not shortcut ones
|
363
362
|
}
|
364
363
|
species.collect do |name_info|
|
365
364
|
EuPathDBSpeciesData.new(name_info[0], base_download_directory)
|
@@ -410,4 +409,8 @@ class EuPathDBSpeciesData
|
|
410
409
|
end
|
411
410
|
end
|
412
411
|
end
|
412
|
+
|
413
|
+
def protein_fasta_file_iterator
|
414
|
+
Bio::EuPathDB::FastaParser.new(fasta_file_species_name, protein_fasta_path)
|
415
|
+
end
|
413
416
|
end
|
data/lib/reubypathdb.rb
CHANGED
data/reubypathdb.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{reubypathdb}
|
8
|
-
s.version = "0.3.0"
|
8
|
+
s.version = "0.3.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
-
s.authors = ["Ben J Woodcroft"]
|
12
|
-
s.date = %q{
|
11
|
+
s.authors = [%q{Ben J Woodcroft}]
|
12
|
+
s.date = %q{2012-01-14}
|
13
13
|
s.description = %q{Classes to help parsing EuPathDB data files}
|
14
14
|
s.email = %q{donttrustben near gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -19,11 +19,11 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
21
|
"Gemfile",
|
22
|
-
"Gemfile.lock",
|
23
22
|
"LICENSE",
|
24
23
|
"README.rdoc",
|
25
24
|
"Rakefile",
|
26
25
|
"VERSION",
|
26
|
+
"lib/eupathdb_fasta.rb",
|
27
27
|
"lib/eupathdb_gene_information_table.rb",
|
28
28
|
"lib/eupathdb_gff.rb",
|
29
29
|
"lib/eupathdb_species_data.rb",
|
@@ -36,9 +36,9 @@ Gem::Specification.new do |s|
|
|
36
36
|
"test/test_eupathdb_species_data.rb"
|
37
37
|
]
|
38
38
|
s.homepage = %q{http://github.com/wwood/reubypathdb}
|
39
|
-
s.licenses = ["MIT"]
|
40
|
-
s.require_paths = ["lib"]
|
41
|
-
s.rubygems_version = %q{1.6
|
39
|
+
s.licenses = [%q{MIT}]
|
40
|
+
s.require_paths = [%q{lib}]
|
41
|
+
s.rubygems_version = %q{1.8.6}
|
42
42
|
s.summary = %q{Classes to help parsing EuPathDB data files}
|
43
43
|
|
44
44
|
if s.respond_to? :specification_version then
|
@@ -97,17 +97,32 @@ class EuPathDBSpeciesDataTest < Test::Unit::TestCase
|
|
97
97
|
spd = EuPathDBSpeciesData.new('Plasmodium chabaudi')
|
98
98
|
assert_equal "http://plasmodb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/Pchabaudi/fasta/data",
|
99
99
|
spd.eu_path_db_fasta_download_directory
|
100
|
-
|
100
|
+
end
|
101
101
|
|
102
|
-
|
102
|
+
def test_behind_usage_policy
|
103
103
|
spd = EuPathDBSpeciesData.new('Plasmodium vivax')
|
104
104
|
assert_equal "http://plasmodb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/Pvivax/fasta",
|
105
105
|
spd.eu_path_db_fasta_download_directory
|
106
|
-
|
106
|
+
end
|
107
107
|
|
108
|
-
|
109
|
-
|
110
|
-
|
108
|
+
def test_representative_strain_name
|
109
|
+
spd = EuPathDBSpeciesData.new('Trypanosoma brucei')
|
110
|
+
assert_equal "http://tritrypdb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}/Tbrucei/fasta/TbruceiTreu927Genomic_TriTrypDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}.fasta",
|
111
111
|
"#{spd.eu_path_db_fasta_download_directory}/#{spd.genomic_fasta_filename}"
|
112
|
-
|
112
|
+
end
|
113
|
+
|
114
|
+
def test_non_default_db_version_of_protein_fasta
|
115
|
+
spd = EuPathDBSpeciesData.new('Plasmodium yoelii', base_dir,'100.9')
|
116
|
+
assert_equal "/home/ben/phd/data/Plasmodium yoelii/genome/PlasmoDB/100.9/PyoeliiAnnotatedProteins_PlasmoDB-100.9.fasta",
|
117
|
+
spd.protein_fasta_path
|
118
|
+
end
|
119
|
+
|
120
|
+
def test_species_data_from_database
|
121
|
+
plasmos = EuPathDBSpeciesData.species_data_from_database('PlasmoDB')
|
122
|
+
# make sure there is only 1 of each species being produced - there was a bug about that multiple species were being generated at one point
|
123
|
+
bergs = plasmos.select{|s|
|
124
|
+
EuPathDBSpeciesData.new(s.name).name == 'Plasmodium berghei'
|
125
|
+
}
|
126
|
+
assert_equal ['Plasmodium berghei'], bergs.collect{|d| d.name}
|
127
|
+
end
|
113
128
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reubypathdb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 17
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
- 0
|
10
|
-
version: 0.3.0
|
9
|
+
- 1
|
10
|
+
version: 0.3.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Ben J Woodcroft
|
@@ -15,8 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
19
|
-
default_executable:
|
18
|
+
date: 2012-01-14 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
type: :development
|
@@ -90,11 +89,11 @@ extra_rdoc_files:
|
|
90
89
|
files:
|
91
90
|
- .document
|
92
91
|
- Gemfile
|
93
|
-
- Gemfile.lock
|
94
92
|
- LICENSE
|
95
93
|
- README.rdoc
|
96
94
|
- Rakefile
|
97
95
|
- VERSION
|
96
|
+
- lib/eupathdb_fasta.rb
|
98
97
|
- lib/eupathdb_gene_information_table.rb
|
99
98
|
- lib/eupathdb_gff.rb
|
100
99
|
- lib/eupathdb_species_data.rb
|
@@ -105,7 +104,6 @@ files:
|
|
105
104
|
- test/helper.rb
|
106
105
|
- test/test_eupathdb_gene_information_table.rb
|
107
106
|
- test/test_eupathdb_species_data.rb
|
108
|
-
has_rdoc: true
|
109
107
|
homepage: http://github.com/wwood/reubypathdb
|
110
108
|
licenses:
|
111
109
|
- MIT
|
@@ -135,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
135
133
|
requirements: []
|
136
134
|
|
137
135
|
rubyforge_project:
|
138
|
-
rubygems_version: 1.6
|
136
|
+
rubygems_version: 1.8.6
|
139
137
|
signing_key:
|
140
138
|
specification_version: 3
|
141
139
|
summary: Classes to help parsing EuPathDB data files
|
data/Gemfile.lock
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
GEM
|
2
|
-
remote: http://rubygems.org/
|
3
|
-
specs:
|
4
|
-
git (1.2.5)
|
5
|
-
jeweler (1.6.4)
|
6
|
-
bundler (~> 1.0)
|
7
|
-
git (>= 1.2.5)
|
8
|
-
rake
|
9
|
-
rake (0.9.2)
|
10
|
-
rcov (0.9.10)
|
11
|
-
shoulda (2.11.3)
|
12
|
-
|
13
|
-
PLATFORMS
|
14
|
-
ruby
|
15
|
-
|
16
|
-
DEPENDENCIES
|
17
|
-
bundler (~> 1.0.0)
|
18
|
-
jeweler (~> 1.6.4)
|
19
|
-
rcov
|
20
|
-
shoulda
|