reubypathdb 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/eupathdb_fasta.rb +70 -0
- data/lib/eupathdb_species_data.rb +21 -18
- data/lib/reubypathdb.rb +1 -0
- data/reubypathdb.gemspec +7 -7
- data/test/test_eupathdb_species_data.rb +22 -7
- metadata +6 -8
- data/Gemfile.lock +0 -20
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.
|
1
|
+
0.3.1
|
@@ -0,0 +1,70 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module Bio
  class EuPathDB
    # Raised when a FASTA definition line does not follow the expected
    # EuPathDB layout. Derives from StandardError so that a plain `rescue`
    # catches it; `rescue Exception` and `rescue ParseException` still work.
    class ParseException < StandardError; end

    # Plain value holder for the fields extracted from one FASTA entry.
    class FastaAnnotation
      attr_accessor :gene_id, :sequence, :annotation, :scaffold, :sequencing_centre
    end

    # Looks like EuPathDB databases have settled on something like
    # >gb|TGME49_000380 | organism=Toxoplasma_gondii_ME49 | product=myb-like DNA binding domain-containing protein | location=TGME49_chrVIII:6835359-6840923(-) | length=1528
    # where the species name differs but the rest is mostly constant
    class FastaParser
      include Enumerable

      attr_accessor :species_name

      # species_name:: the text expected after 'organism=' in each definition
      #                line, e.g. 'Toxoplasma_gondii_ME49' for
      #                >gb|TGME49_000380 | organism=Toxoplasma_gondii_ME49 | product=... | location=... | length=1528
      # filename::     path of the FASTA file to read
      def initialize(species_name, filename)
        @species_name = species_name
        @filename = filename
      end

      # Enumerate through fasta file entries, yielding one FastaAnnotation per
      # entry. Returns an Enumerator when called without a block.
      #
      # The underlying flat file is closed once iteration finishes, including
      # when the block raises or breaks out early.
      def each
        return enum_for(:each) unless block_given?

        @flat = Bio::FlatFile.open(Bio::FastaFormat, @filename)
        begin
          while (entry = next_entry)
            yield entry
          end
        ensure
          @flat.close if @flat
          # Reset so that a later next_entry call returns nil instead of
          # touching a closed stream.
          @flat = nil
        end
      end

      # Return the entry in the fasta file, or nil if there is no more or the
      # fasta file could not be opened correctly.
      #
      # Raises ParseException when the entry's definition line is malformed.
      def next_entry
        return nil unless @flat

        raw = @flat.next_entry
        return nil unless raw

        annotation = parse_name(raw.definition)
        annotation.sequence = raw.seq
        annotation
      end

      # Parse a single definition line (without the leading '>') into a
      # FastaAnnotation. The sequence itself is not set here.
      #
      # Raises ParseException if the line does not match the expected layout,
      # or if the location field has no 'scaffold:' prefix.
      def parse_name(definition)
        # The species name is literal text, not a pattern: escape it so that
        # characters such as '.' in 'Eimeria_tenella_str._Houghton' only
        # match themselves.
        regex = /^(\S+)\|(.*?) \| organism=#{Regexp.escape(@species_name)} \| product=(.*?) \| location=(.*) \| length=\d+$/
        matches = definition.match(regex)
        unless matches
          raise ParseException, "Definition line has unexpected format: `#{definition}'. Trying to match this line to the regular expression `#{regex.inspect}'"
        end

        # The scaffold is everything in the location field before the first ':'
        scaffold_match = matches[4].match(/^(.+?)\:/)
        unless scaffold_match
          raise ParseException, "Definition line has unexpected scaffold format: #{matches[4]}"
        end

        s = FastaAnnotation.new
        s.sequencing_centre = matches[1]
        s.scaffold = scaffold_match[1]
        s.gene_id = matches[2]
        s.annotation = matches[3]
        s
      end
    end
  end
end
|
@@ -76,6 +76,7 @@ class EuPathDBSpeciesData
|
|
76
76
|
:source => 'ToxoDB',
|
77
77
|
:database_download_folder => 'EtenellaHoughton',
|
78
78
|
:behind_usage_policy => true,
|
79
|
+
:fasta_file_species_name => 'Eimeria_tenella_str._Houghton',
|
79
80
|
},
|
80
81
|
'Toxoplasma gondii' => {
|
81
82
|
:name => 'Toxoplasma gondii',
|
@@ -170,12 +171,12 @@ class EuPathDBSpeciesData
|
|
170
171
|
end
|
171
172
|
|
172
173
|
SOURCE_VERSIONS = {
|
173
|
-
'PlasmoDB' => '
|
174
|
-
'ToxoDB' => '
|
175
|
-
'CryptoDB' => '4.
|
176
|
-
'PiroplasmaDB' => '1.
|
174
|
+
'PlasmoDB' => '8.2',
|
175
|
+
'ToxoDB' => '7.2',
|
176
|
+
'CryptoDB' => '4.6',
|
177
|
+
'PiroplasmaDB' => '1.1',
|
177
178
|
'FungiDB' => '1.0',
|
178
|
-
'TriTrypDB' => '
|
179
|
+
'TriTrypDB' => '4.0',
|
179
180
|
}
|
180
181
|
DATABASES = SOURCE_VERSIONS.keys
|
181
182
|
|
@@ -186,7 +187,7 @@ class EuPathDBSpeciesData
|
|
186
187
|
#
|
187
188
|
# base_data_directory is the directory where locally cached version of the downloaded
|
188
189
|
# files are stored.
|
189
|
-
def initialize(nickname, base_data_directory=nil)
|
190
|
+
def initialize(nickname, base_data_directory=nil, database_version=nil)
|
190
191
|
@species_data = @@data[nickname] # try the full name
|
191
192
|
@species_data ||= @@data[nickname.capitalize.gsub('_',' ')] #try replacing underscores
|
192
193
|
if @species_data.nil? # try using just the second word
|
@@ -195,10 +196,13 @@ class EuPathDBSpeciesData
|
|
195
196
|
@species_data = @@data[splits[1]]
|
196
197
|
end
|
197
198
|
end
|
199
|
+
raise Exception, "Couldn't find species data for #{nickname}" unless @species_data
|
198
200
|
|
199
201
|
@base_data_directory = base_data_directory
|
200
202
|
|
201
|
-
|
203
|
+
# Record which version of the database we are using; default to the known version for this species' source when none is given
|
204
|
+
@database_version = database_version
|
205
|
+
@database_version ||= SOURCE_VERSIONS[@species_data[:source]]
|
202
206
|
end
|
203
207
|
|
204
208
|
def method_missing(symbol)
|
@@ -237,7 +241,7 @@ class EuPathDBSpeciesData
|
|
237
241
|
end
|
238
242
|
|
239
243
|
def version
|
240
|
-
|
244
|
+
@database_version
|
241
245
|
end
|
242
246
|
|
243
247
|
def protein_fasta_filename
|
@@ -294,13 +298,7 @@ class EuPathDBSpeciesData
|
|
294
298
|
end
|
295
299
|
|
296
300
|
def eu_path_db_download_directory
|
297
|
-
|
298
|
-
SOURCE_VERSIONS.each do |db, version|
|
299
|
-
# 'PlasmoDB' => "http://plasmodb.org/common/downloads/release-#{SOURCE_VERSIONS['PlasmoDB']}",
|
300
|
-
directories[db] = "http://#{db.downcase}.org/common/downloads/release-#{version}"
|
301
|
-
end
|
302
|
-
raise Exception, "Base URL for database '#{database}' not known" if directories[database].nil?
|
303
|
-
return "#{directories[database]}/#{one_word_name}"
|
301
|
+
"http://#{database.downcase}.org/common/downloads/release-#{@database_version}/#{one_word_name}"
|
304
302
|
end
|
305
303
|
|
306
304
|
def eu_path_db_fasta_download_directory
|
@@ -331,7 +329,7 @@ class EuPathDBSpeciesData
|
|
331
329
|
|
332
330
|
def local_download_directory
|
333
331
|
s = @species_data
|
334
|
-
"#{@base_data_directory}/#{s[:name]}/genome/#{s[:source]}/#{
|
332
|
+
"#{@base_data_directory}/#{s[:name]}/genome/#{s[:source]}/#{@database_version}"
|
335
333
|
end
|
336
334
|
|
337
335
|
# an array of directory names. mkdir is called on each of them in order,
|
@@ -348,7 +346,7 @@ class EuPathDBSpeciesData
|
|
348
346
|
s[:name],
|
349
347
|
'genome',
|
350
348
|
s[:source],
|
351
|
-
|
349
|
+
@database_version,
|
352
350
|
]
|
353
351
|
|
354
352
|
(0..components.length-1).collect do |i|
|
@@ -359,7 +357,8 @@ class EuPathDBSpeciesData
|
|
359
357
|
# Return a list of the species names that are included in the EuPathDB database
|
360
358
|
def self.species_data_from_database(database_name, base_download_directory=nil)
|
361
359
|
species = @@data.select {|name, info|
|
362
|
-
|
360
|
+
info[:source].downcase == database_name.downcase and
|
361
|
+
name == info[:name] #only allow ones that are fully specified - not shortcut ones
|
363
362
|
}
|
364
363
|
species.collect do |name_info|
|
365
364
|
EuPathDBSpeciesData.new(name_info[0], base_download_directory)
|
@@ -410,4 +409,8 @@ class EuPathDBSpeciesData
|
|
410
409
|
end
|
411
410
|
end
|
412
411
|
end
|
412
|
+
|
413
|
+
def protein_fasta_file_iterator
|
414
|
+
Bio::EuPathDB::FastaParser.new(fasta_file_species_name, protein_fasta_path)
|
415
|
+
end
|
413
416
|
end
|
data/lib/reubypathdb.rb
CHANGED
data/reubypathdb.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{reubypathdb}
|
8
|
-
s.version = "0.3.
|
8
|
+
s.version = "0.3.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
-
s.authors = [
|
12
|
-
s.date = %q{
|
11
|
+
s.authors = [%q{Ben J Woodcroft}]
|
12
|
+
s.date = %q{2012-01-14}
|
13
13
|
s.description = %q{Classes to help parsing EuPathDB data files}
|
14
14
|
s.email = %q{donttrustben near gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -19,11 +19,11 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
21
|
"Gemfile",
|
22
|
-
"Gemfile.lock",
|
23
22
|
"LICENSE",
|
24
23
|
"README.rdoc",
|
25
24
|
"Rakefile",
|
26
25
|
"VERSION",
|
26
|
+
"lib/eupathdb_fasta.rb",
|
27
27
|
"lib/eupathdb_gene_information_table.rb",
|
28
28
|
"lib/eupathdb_gff.rb",
|
29
29
|
"lib/eupathdb_species_data.rb",
|
@@ -36,9 +36,9 @@ Gem::Specification.new do |s|
|
|
36
36
|
"test/test_eupathdb_species_data.rb"
|
37
37
|
]
|
38
38
|
s.homepage = %q{http://github.com/wwood/reubypathdb}
|
39
|
-
s.licenses = [
|
40
|
-
s.require_paths = [
|
41
|
-
s.rubygems_version = %q{1.6
|
39
|
+
s.licenses = [%q{MIT}]
|
40
|
+
s.require_paths = [%q{lib}]
|
41
|
+
s.rubygems_version = %q{1.8.6}
|
42
42
|
s.summary = %q{Classes to help parsing EuPathDB data files}
|
43
43
|
|
44
44
|
if s.respond_to? :specification_version then
|
@@ -97,17 +97,32 @@ class EuPathDBSpeciesDataTest < Test::Unit::TestCase
|
|
97
97
|
spd = EuPathDBSpeciesData.new('Plasmodium chabaudi')
|
98
98
|
assert_equal "http://plasmodb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/Pchabaudi/fasta/data",
|
99
99
|
spd.eu_path_db_fasta_download_directory
|
100
|
-
|
100
|
+
end
|
101
101
|
|
102
|
-
|
102
|
+
def test_behind_usage_policy
|
103
103
|
spd = EuPathDBSpeciesData.new('Plasmodium vivax')
|
104
104
|
assert_equal "http://plasmodb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/Pvivax/fasta",
|
105
105
|
spd.eu_path_db_fasta_download_directory
|
106
|
-
|
106
|
+
end
|
107
107
|
|
108
|
-
|
109
|
-
|
110
|
-
|
108
|
+
def test_representative_strain_name
|
109
|
+
spd = EuPathDBSpeciesData.new('Trypanosoma brucei')
|
110
|
+
assert_equal "http://tritrypdb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}/Tbrucei/fasta/TbruceiTreu927Genomic_TriTrypDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}.fasta",
|
111
111
|
"#{spd.eu_path_db_fasta_download_directory}/#{spd.genomic_fasta_filename}"
|
112
|
-
|
112
|
+
end
|
113
|
+
|
114
|
+
def test_non_default_db_version_of_protein_fasta
|
115
|
+
spd = EuPathDBSpeciesData.new('Plasmodium yoelii', base_dir,'100.9')
|
116
|
+
assert_equal "/home/ben/phd/data/Plasmodium yoelii/genome/PlasmoDB/100.9/PyoeliiAnnotatedProteins_PlasmoDB-100.9.fasta",
|
117
|
+
spd.protein_fasta_path
|
118
|
+
end
|
119
|
+
|
120
|
+
def test_species_data_from_database
|
121
|
+
plasmos = EuPathDBSpeciesData.species_data_from_database('PlasmoDB')
|
122
|
+
# make sure each species is produced only once - at one point there was a bug where the same species was generated multiple times
|
123
|
+
bergs = plasmos.select{|s|
|
124
|
+
EuPathDBSpeciesData.new(s.name).name == 'Plasmodium berghei'
|
125
|
+
}
|
126
|
+
assert_equal ['Plasmodium berghei'], bergs.collect{|d| d.name}
|
127
|
+
end
|
113
128
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reubypathdb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 17
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
-
|
10
|
-
version: 0.3.
|
9
|
+
- 1
|
10
|
+
version: 0.3.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Ben J Woodcroft
|
@@ -15,8 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
19
|
-
default_executable:
|
18
|
+
date: 2012-01-14 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
type: :development
|
@@ -90,11 +89,11 @@ extra_rdoc_files:
|
|
90
89
|
files:
|
91
90
|
- .document
|
92
91
|
- Gemfile
|
93
|
-
- Gemfile.lock
|
94
92
|
- LICENSE
|
95
93
|
- README.rdoc
|
96
94
|
- Rakefile
|
97
95
|
- VERSION
|
96
|
+
- lib/eupathdb_fasta.rb
|
98
97
|
- lib/eupathdb_gene_information_table.rb
|
99
98
|
- lib/eupathdb_gff.rb
|
100
99
|
- lib/eupathdb_species_data.rb
|
@@ -105,7 +104,6 @@ files:
|
|
105
104
|
- test/helper.rb
|
106
105
|
- test/test_eupathdb_gene_information_table.rb
|
107
106
|
- test/test_eupathdb_species_data.rb
|
108
|
-
has_rdoc: true
|
109
107
|
homepage: http://github.com/wwood/reubypathdb
|
110
108
|
licenses:
|
111
109
|
- MIT
|
@@ -135,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
135
133
|
requirements: []
|
136
134
|
|
137
135
|
rubyforge_project:
|
138
|
-
rubygems_version: 1.6
|
136
|
+
rubygems_version: 1.8.6
|
139
137
|
signing_key:
|
140
138
|
specification_version: 3
|
141
139
|
summary: Classes to help parsing EuPathDB data files
|
data/Gemfile.lock
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
GEM
|
2
|
-
remote: http://rubygems.org/
|
3
|
-
specs:
|
4
|
-
git (1.2.5)
|
5
|
-
jeweler (1.6.4)
|
6
|
-
bundler (~> 1.0)
|
7
|
-
git (>= 1.2.5)
|
8
|
-
rake
|
9
|
-
rake (0.9.2)
|
10
|
-
rcov (0.9.10)
|
11
|
-
shoulda (2.11.3)
|
12
|
-
|
13
|
-
PLATFORMS
|
14
|
-
ruby
|
15
|
-
|
16
|
-
DEPENDENCIES
|
17
|
-
bundler (~> 1.0.0)
|
18
|
-
jeweler (~> 1.6.4)
|
19
|
-
rcov
|
20
|
-
shoulda
|