reubypathdb 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 0.3.1
@@ -0,0 +1,70 @@
1
+
2
+
3
+ module Bio
4
+ class EuPathDB
5
+ # Parser for EuPathDB fasta files, whose definition lines appear to have settled on a layout like
6
+ # >gb|TGME49_000380 | organism=Toxoplasma_gondii_ME49 | product=myb-like DNA binding domain-containing protein | location=TGME49_chrVIII:6835359-6840923(-) | length=1528
7
+ # where the organism (species) name differs between files but the rest of the layout is mostly constant
8
+ class FastaParser
9
+ attr_accessor :species_name
10
+
11
+ # species_name is the text expected after 'organism=' in every definition line, so something
12
+ # like 'Toxoplasma_gondii_ME49' for
13
+ # >gb|TGME49_000380 | organism=Toxoplasma_gondii_ME49 | product=myb-like DNA binding domain-containing protein | location=TGME49_chrVIII:6835359-6840923(-) | length=1528
14
+ # filename is the fasta file handed to Bio::FlatFile.open when #each is called
15
+ def initialize(species_name, filename)
16
+ @species_name = species_name
17
+ @filename = filename
18
+ end
19
+
20
+ # Open the fasta file and yield a FastaAnnotation for each entry until none remain
21
+ def each
22
+ @flat = Bio::FlatFile.open(Bio::FastaFormat, @filename)
23
+ n = next_entry
24
+ while !n.nil?
25
+ yield n
26
+ n = next_entry
27
+ end
28
+ end
29
+
30
+ # Return the next entry in the fasta file as a FastaAnnotation, or nil if there are no more entries or
31
+ # no fasta file is currently open (within this class @flat is only set by #each).
32
+ def next_entry
33
+ return nil if !@flat
34
+ n = @flat.next_entry
35
+ return nil if !n
36
+
37
+ s = parse_name(n.definition)
38
+ s.sequence = n.seq
39
+ return s
40
+ end
41
+ # Parse one definition line into a FastaAnnotation (sequencing_centre, gene_id, annotation and scaffold are set). Raises Exception if the line does not match the expected layout, or ParseException if the location has no 'scaffold:' prefix.
42
+ def parse_name(definition)
43
+ s = FastaAnnotation.new
44
+ # NOTE(review): @species_name is interpolated into the regex unescaped, so regex metacharacters in it (e.g. '.') are not matched literally
45
+ regex = /^(\S+)\|(.*?) \| organism=#{@species_name} \| product=(.*?) \| location=(.*) \| length=\d+$/
46
+ matches = definition.match(regex)
47
+
48
+ if !matches
49
+ raise Exception, "Definition line has unexpected format: `#{definition}'. Trying to match this line to the regular expression `#{regex.inspect}'"
50
+ end
51
+
52
+ matches2 = matches[4].match(/^(.+?)\:/)
53
+ if !matches2
54
+ raise ParseException, "Definition line has unexpected scaffold format: #{matches[4]}"
55
+ end
56
+ s.sequencing_centre = matches[1]
57
+ s.scaffold = matches2[1]
58
+ s.gene_id = matches[2]
59
+ s.annotation = matches[3]
60
+ return s
61
+ end
62
+ end
63
+ # Plain holder for the fields FastaParser extracts from one fasta entry
64
+ class FastaAnnotation
65
+ attr_accessor :gene_id, :sequence, :annotation, :scaffold, :sequencing_centre
66
+ end
67
+ # Raised by FastaParser#parse_name when the location field lacks a 'scaffold:' prefix. NOTE(review): subclasses Exception rather than StandardError, so a bare `rescue` will not catch it.
68
+ class ParseException < Exception; end
69
+ end
70
+ end
@@ -76,6 +76,7 @@ class EuPathDBSpeciesData
76
76
  :source => 'ToxoDB',
77
77
  :database_download_folder => 'EtenellaHoughton',
78
78
  :behind_usage_policy => true,
79
+ :fasta_file_species_name => 'Eimeria_tenella_str._Houghton',
79
80
  },
80
81
  'Toxoplasma gondii' => {
81
82
  :name => 'Toxoplasma gondii',
@@ -170,12 +171,12 @@ class EuPathDBSpeciesData
170
171
  end
171
172
 
172
173
  SOURCE_VERSIONS = {
173
- 'PlasmoDB' => '7.2',#
174
- 'ToxoDB' => '6.4',#'7.0',#
175
- 'CryptoDB' => '4.4',#'4.5',#
176
- 'PiroplasmaDB' => '1.0',#'1.1',#
174
+ 'PlasmoDB' => '8.2',
175
+ 'ToxoDB' => '7.2',
176
+ 'CryptoDB' => '4.6',
177
+ 'PiroplasmaDB' => '1.1',
177
178
  'FungiDB' => '1.0',
178
- 'TriTrypDB' => '3.2',
179
+ 'TriTrypDB' => '4.0',
179
180
  }
180
181
  DATABASES = SOURCE_VERSIONS.keys
181
182
 
@@ -186,7 +187,7 @@ class EuPathDBSpeciesData
186
187
  #
187
188
  # base_data_directory is the directory where locally cached version of the downloaded
188
189
  # files are stored.
189
- def initialize(nickname, base_data_directory=nil)
190
+ def initialize(nickname, base_data_directory=nil, database_version=nil)
190
191
  @species_data = @@data[nickname] # try the full name
191
192
  @species_data ||= @@data[nickname.capitalize.gsub('_',' ')] #try replacing underscores
192
193
  if @species_data.nil? # try using just the second word
@@ -195,10 +196,13 @@ class EuPathDBSpeciesData
195
196
  @species_data = @@data[splits[1]]
196
197
  end
197
198
  end
199
+ raise Exception, "Couldn't find species data for #{nickname}" unless @species_data
198
200
 
199
201
  @base_data_directory = base_data_directory
200
202
 
201
- raise Exception, "Couldn't find species data for #{nickname}" unless @species_data
203
+ # record which version of the db we are looking at, falling back to the default for this source
204
+ @database_version = database_version
205
+ @database_version ||= SOURCE_VERSIONS[@species_data[:source]]
202
206
  end
203
207
 
204
208
  def method_missing(symbol)
@@ -237,7 +241,7 @@ class EuPathDBSpeciesData
237
241
  end
238
242
 
239
243
  def version
240
- SOURCE_VERSIONS[@species_data[:source]]
244
+ @database_version
241
245
  end
242
246
 
243
247
  def protein_fasta_filename
@@ -294,13 +298,7 @@ class EuPathDBSpeciesData
294
298
  end
295
299
 
296
300
  def eu_path_db_download_directory
297
- directories = {}
298
- SOURCE_VERSIONS.each do |db, version|
299
- # 'PlasmoDB' => "http://plasmodb.org/common/downloads/release-#{SOURCE_VERSIONS['PlasmoDB']}",
300
- directories[db] = "http://#{db.downcase}.org/common/downloads/release-#{version}"
301
- end
302
- raise Exception, "Base URL for database '#{database}' not known" if directories[database].nil?
303
- return "#{directories[database]}/#{one_word_name}"
301
+ "http://#{database.downcase}.org/common/downloads/release-#{@database_version}/#{one_word_name}"
304
302
  end
305
303
 
306
304
  def eu_path_db_fasta_download_directory
@@ -331,7 +329,7 @@ class EuPathDBSpeciesData
331
329
 
332
330
  def local_download_directory
333
331
  s = @species_data
334
- "#{@base_data_directory}/#{s[:name]}/genome/#{s[:source]}/#{SOURCE_VERSIONS[s[:source]]}"
332
+ "#{@base_data_directory}/#{s[:name]}/genome/#{s[:source]}/#{@database_version}"
335
333
  end
336
334
 
337
335
  # an array of directory names. mkdir is called on each of them in order,
@@ -348,7 +346,7 @@ class EuPathDBSpeciesData
348
346
  s[:name],
349
347
  'genome',
350
348
  s[:source],
351
- SOURCE_VERSIONS[s[:source]]
349
+ @database_version,
352
350
  ]
353
351
 
354
352
  (0..components.length-1).collect do |i|
@@ -359,7 +357,8 @@ class EuPathDBSpeciesData
359
357
  # Return a list of the species names that are included in the EuPathDB database
360
358
  def self.species_data_from_database(database_name, base_download_directory=nil)
361
359
  species = @@data.select {|name, info|
362
- info[:source].downcase == database_name.downcase
360
+ info[:source].downcase == database_name.downcase and
361
+ name == info[:name] #only allow ones that are fully specified - not shortcut ones
363
362
  }
364
363
  species.collect do |name_info|
365
364
  EuPathDBSpeciesData.new(name_info[0], base_download_directory)
@@ -410,4 +409,8 @@ class EuPathDBSpeciesData
410
409
  end
411
410
  end
412
411
  end
412
+
413
+ def protein_fasta_file_iterator # returns a Bio::EuPathDB::FastaParser built from fasta_file_species_name and protein_fasta_path
414
+ Bio::EuPathDB::FastaParser.new(fasta_file_species_name, protein_fasta_path)
415
+ end
413
416
  end
@@ -1,3 +1,4 @@
1
1
  require 'eupathdb_gene_information_table'
2
2
  require 'eupathdb_gff'
3
3
  require 'eupathdb_species_data'
4
+ require 'eupathdb_fasta'
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{reubypathdb}
8
- s.version = "0.3.0"
8
+ s.version = "0.3.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
- s.authors = ["Ben J Woodcroft"]
12
- s.date = %q{2011-08-26}
11
+ s.authors = [%q{Ben J Woodcroft}]
12
+ s.date = %q{2012-01-14}
13
13
  s.description = %q{Classes to help parsing EuPathDB data files}
14
14
  s.email = %q{donttrustben near gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -19,11 +19,11 @@ Gem::Specification.new do |s|
19
19
  s.files = [
20
20
  ".document",
21
21
  "Gemfile",
22
- "Gemfile.lock",
23
22
  "LICENSE",
24
23
  "README.rdoc",
25
24
  "Rakefile",
26
25
  "VERSION",
26
+ "lib/eupathdb_fasta.rb",
27
27
  "lib/eupathdb_gene_information_table.rb",
28
28
  "lib/eupathdb_gff.rb",
29
29
  "lib/eupathdb_species_data.rb",
@@ -36,9 +36,9 @@ Gem::Specification.new do |s|
36
36
  "test/test_eupathdb_species_data.rb"
37
37
  ]
38
38
  s.homepage = %q{http://github.com/wwood/reubypathdb}
39
- s.licenses = ["MIT"]
40
- s.require_paths = ["lib"]
41
- s.rubygems_version = %q{1.6.1}
39
+ s.licenses = [%q{MIT}]
40
+ s.require_paths = [%q{lib}]
41
+ s.rubygems_version = %q{1.8.6}
42
42
  s.summary = %q{Classes to help parsing EuPathDB data files}
43
43
 
44
44
  if s.respond_to? :specification_version then
@@ -97,17 +97,32 @@ class EuPathDBSpeciesDataTest < Test::Unit::TestCase
97
97
  spd = EuPathDBSpeciesData.new('Plasmodium chabaudi')
98
98
  assert_equal "http://plasmodb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/Pchabaudi/fasta/data",
99
99
  spd.eu_path_db_fasta_download_directory
100
- end
100
+ end
101
101
 
102
- def test_behind_usage_policy
102
+ def test_behind_usage_policy
103
103
  spd = EuPathDBSpeciesData.new('Plasmodium vivax')
104
104
  assert_equal "http://plasmodb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/Pvivax/fasta",
105
105
  spd.eu_path_db_fasta_download_directory
106
- end
106
+ end
107
107
 
108
- def test_representative_strain_name
109
- spd = EuPathDBSpeciesData.new('Trypanosoma brucei')
110
- assert_equal "http://tritrypdb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}/Tbrucei/fasta/TbruceiTreu927Genomic_TriTrypDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}.fasta",
108
+ def test_representative_strain_name
109
+ spd = EuPathDBSpeciesData.new('Trypanosoma brucei')
110
+ assert_equal "http://tritrypdb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}/Tbrucei/fasta/TbruceiTreu927Genomic_TriTrypDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}.fasta",
111
111
  "#{spd.eu_path_db_fasta_download_directory}/#{spd.genomic_fasta_filename}"
112
- end
112
+ end
113
+
114
+ def test_non_default_db_version_of_protein_fasta # an explicit third constructor argument ('100.9') must show up in protein_fasta_path; NOTE(review): the expected path is hard-coded, presumably matching base_dir - confirm
115
+ spd = EuPathDBSpeciesData.new('Plasmodium yoelii', base_dir,'100.9')
116
+ assert_equal "/home/ben/phd/data/Plasmodium yoelii/genome/PlasmoDB/100.9/PyoeliiAnnotatedProteins_PlasmoDB-100.9.fasta",
117
+ spd.protein_fasta_path
118
+ end
119
+
120
+ def test_species_data_from_database
121
+ plasmos = EuPathDBSpeciesData.species_data_from_database('PlasmoDB')
122
+ # make sure each species is produced only once - at one point there was a bug where the same species was generated multiple times
123
+ bergs = plasmos.select{|s|
124
+ EuPathDBSpeciesData.new(s.name).name == 'Plasmodium berghei'
125
+ }
126
+ assert_equal ['Plasmodium berghei'], bergs.collect{|d| d.name}
127
+ end
113
128
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: reubypathdb
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 17
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 3
9
- - 0
10
- version: 0.3.0
9
+ - 1
10
+ version: 0.3.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Ben J Woodcroft
@@ -15,8 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-08-26 00:00:00 +10:00
19
- default_executable:
18
+ date: 2012-01-14 00:00:00 Z
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
22
21
  type: :development
@@ -90,11 +89,11 @@ extra_rdoc_files:
90
89
  files:
91
90
  - .document
92
91
  - Gemfile
93
- - Gemfile.lock
94
92
  - LICENSE
95
93
  - README.rdoc
96
94
  - Rakefile
97
95
  - VERSION
96
+ - lib/eupathdb_fasta.rb
98
97
  - lib/eupathdb_gene_information_table.rb
99
98
  - lib/eupathdb_gff.rb
100
99
  - lib/eupathdb_species_data.rb
@@ -105,7 +104,6 @@ files:
105
104
  - test/helper.rb
106
105
  - test/test_eupathdb_gene_information_table.rb
107
106
  - test/test_eupathdb_species_data.rb
108
- has_rdoc: true
109
107
  homepage: http://github.com/wwood/reubypathdb
110
108
  licenses:
111
109
  - MIT
@@ -135,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
135
133
  requirements: []
136
134
 
137
135
  rubyforge_project:
138
- rubygems_version: 1.6.1
136
+ rubygems_version: 1.8.6
139
137
  signing_key:
140
138
  specification_version: 3
141
139
  summary: Classes to help parsing EuPathDB data files
@@ -1,20 +0,0 @@
1
- GEM
2
- remote: http://rubygems.org/
3
- specs:
4
- git (1.2.5)
5
- jeweler (1.6.4)
6
- bundler (~> 1.0)
7
- git (>= 1.2.5)
8
- rake
9
- rake (0.9.2)
10
- rcov (0.9.10)
11
- shoulda (2.11.3)
12
-
13
- PLATFORMS
14
- ruby
15
-
16
- DEPENDENCIES
17
- bundler (~> 1.0.0)
18
- jeweler (~> 1.6.4)
19
- rcov
20
- shoulda