reubypathdb 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 0.3.1
@@ -0,0 +1,70 @@
1
+
2
+
3
module Bio
  class EuPathDB
    # Parser for protein/transcript FASTA files downloaded from EuPathDB.
    #
    # EuPathDB databases appear to have settled on definition lines like
    #   >gb|TGME49_000380 | organism=Toxoplasma_gondii_ME49 | product=myb-like DNA binding domain-containing protein | location=TGME49_chrVIII:6835359-6840923(-) | length=1528
    # where the species name differs between files but the rest of the layout
    # is mostly constant.
    class FastaParser
      # Since #each is defined, mix in Enumerable so callers can use map,
      # select, first, etc. directly on the parser.
      include Enumerable

      attr_accessor :species_name

      # species_name: the text expected after 'organism=' in each definition
      # line, e.g. 'Toxoplasma_gondii_ME49' for
      #   >gb|TGME49_000380 | organism=Toxoplasma_gondii_ME49 | product=... | location=... | length=1528
      # It is matched literally, not as a regular expression.
      #
      # filename: path of the FASTA file to read. The file is not opened
      # until #each is called.
      def initialize(species_name, filename)
        @species_name = species_name
        @filename = filename
      end

      # Enumerate through the fasta file entries, yielding a FastaAnnotation
      # for each one. Returns an Enumerator when called without a block.
      #
      # The underlying flat file is closed when iteration finishes, including
      # when the caller breaks out early or an exception is raised, so that
      # repeated iteration does not leak file handles.
      def each
        return enum_for(:each) unless block_given?

        @flat = Bio::FlatFile.open(Bio::FastaFormat, @filename)
        begin
          while (entry = next_entry)
            yield entry
          end
        ensure
          @flat.close if @flat
          # Reset so that a later call to next_entry returns nil rather than
          # attempting to read from a closed stream.
          @flat = nil
        end
      end

      # Return the next entry in the fasta file as a FastaAnnotation (with
      # its sequence filled in), or nil if there are no more entries or no
      # file is currently open (i.e. #each is not in progress).
      #
      # Raises ParseException if the definition line is not in the expected
      # format.
      def next_entry
        return nil unless @flat

        entry = @flat.next_entry
        return nil unless entry

        annotation = parse_name(entry.definition)
        annotation.sequence = entry.seq
        annotation
      end

      # Parse a definition line (without the leading '>') into a
      # FastaAnnotation holding the sequencing centre, gene ID, product
      # annotation and scaffold name. The sequence is not set here.
      #
      # Raises ParseException if the line does not follow the expected
      # layout, or if the location field has no 'scaffold:' prefix.
      def parse_name(definition)
        # Escape the species name so that regex metacharacters in it (e.g. the
        # '.' in 'Eimeria_tenella_str._Houghton') are matched literally.
        species = Regexp.escape(@species_name.to_s)
        regex = /^(\S+)\|(.*?) \| organism=#{species} \| product=(.*?) \| location=(.*) \| length=\d+$/
        matches = definition.match(regex)

        unless matches
          raise ParseException, "Definition line has unexpected format: `#{definition}'. Trying to match this line to the regular expression `#{regex.inspect}'"
        end

        # The scaffold is everything in the location field before the first colon
        scaffold_match = matches[4].match(/^(.+?)\:/)
        unless scaffold_match
          raise ParseException, "Definition line has unexpected scaffold format: #{matches[4]}"
        end

        s = FastaAnnotation.new
        s.sequencing_centre = matches[1]
        s.scaffold = scaffold_match[1]
        s.gene_id = matches[2]
        s.annotation = matches[3]
        s
      end
    end

    # Plain value holder for the information parsed out of one fasta entry.
    class FastaAnnotation
      attr_accessor :gene_id, :sequence, :annotation, :scaffold, :sequencing_centre
    end

    # Raised when a fasta definition line cannot be parsed.
    # NOTE: this inherits directly from Exception (not StandardError), so a
    # bare `rescue' will not catch it - rescue ParseException explicitly.
    class ParseException < Exception; end
  end
end
@@ -76,6 +76,7 @@ class EuPathDBSpeciesData
76
76
  :source => 'ToxoDB',
77
77
  :database_download_folder => 'EtenellaHoughton',
78
78
  :behind_usage_policy => true,
79
+ :fasta_file_species_name => 'Eimeria_tenella_str._Houghton',
79
80
  },
80
81
  'Toxoplasma gondii' => {
81
82
  :name => 'Toxoplasma gondii',
@@ -170,12 +171,12 @@ class EuPathDBSpeciesData
170
171
  end
171
172
 
172
173
  SOURCE_VERSIONS = {
173
- 'PlasmoDB' => '7.2',#
174
- 'ToxoDB' => '6.4',#'7.0',#
175
- 'CryptoDB' => '4.4',#'4.5',#
176
- 'PiroplasmaDB' => '1.0',#'1.1',#
174
+ 'PlasmoDB' => '8.2',
175
+ 'ToxoDB' => '7.2',
176
+ 'CryptoDB' => '4.6',
177
+ 'PiroplasmaDB' => '1.1',
177
178
  'FungiDB' => '1.0',
178
- 'TriTrypDB' => '3.2',
179
+ 'TriTrypDB' => '4.0',
179
180
  }
180
181
  DATABASES = SOURCE_VERSIONS.keys
181
182
 
@@ -186,7 +187,7 @@ class EuPathDBSpeciesData
186
187
  #
187
188
  # base_data_directory is the directory where locally cached version of the downloaded
188
189
  # files are stored.
189
- def initialize(nickname, base_data_directory=nil)
190
+ def initialize(nickname, base_data_directory=nil, database_version=nil)
190
191
  @species_data = @@data[nickname] # try the full name
191
192
  @species_data ||= @@data[nickname.capitalize.gsub('_',' ')] #try replacing underscores
192
193
  if @species_data.nil? # try using just the second word
@@ -195,10 +196,13 @@ class EuPathDBSpeciesData
195
196
  @species_data = @@data[splits[1]]
196
197
  end
197
198
  end
199
+ raise Exception, "Couldn't find species data for #{nickname}" unless @species_data
198
200
 
199
201
  @base_data_directory = base_data_directory
200
202
 
201
- raise Exception, "Couldn't find species data for #{nickname}" unless @species_data
203
+ # record which version of the db we are looking at, defaulting to the current version listed in SOURCE_VERSIONS for this species' source database
204
+ @database_version = database_version
205
+ @database_version ||= SOURCE_VERSIONS[@species_data[:source]]
202
206
  end
203
207
 
204
208
  def method_missing(symbol)
@@ -237,7 +241,7 @@ class EuPathDBSpeciesData
237
241
  end
238
242
 
239
243
  def version
240
- SOURCE_VERSIONS[@species_data[:source]]
244
+ @database_version
241
245
  end
242
246
 
243
247
  def protein_fasta_filename
@@ -294,13 +298,7 @@ class EuPathDBSpeciesData
294
298
  end
295
299
 
296
300
  def eu_path_db_download_directory
297
- directories = {}
298
- SOURCE_VERSIONS.each do |db, version|
299
- # 'PlasmoDB' => "http://plasmodb.org/common/downloads/release-#{SOURCE_VERSIONS['PlasmoDB']}",
300
- directories[db] = "http://#{db.downcase}.org/common/downloads/release-#{version}"
301
- end
302
- raise Exception, "Base URL for database '#{database}' not known" if directories[database].nil?
303
- return "#{directories[database]}/#{one_word_name}"
301
+ "http://#{database.downcase}.org/common/downloads/release-#{@database_version}/#{one_word_name}"
304
302
  end
305
303
 
306
304
  def eu_path_db_fasta_download_directory
@@ -331,7 +329,7 @@ class EuPathDBSpeciesData
331
329
 
332
330
  def local_download_directory
333
331
  s = @species_data
334
- "#{@base_data_directory}/#{s[:name]}/genome/#{s[:source]}/#{SOURCE_VERSIONS[s[:source]]}"
332
+ "#{@base_data_directory}/#{s[:name]}/genome/#{s[:source]}/#{@database_version}"
335
333
  end
336
334
 
337
335
  # an array of directory names. mkdir is called on each of them in order,
@@ -348,7 +346,7 @@ class EuPathDBSpeciesData
348
346
  s[:name],
349
347
  'genome',
350
348
  s[:source],
351
- SOURCE_VERSIONS[s[:source]]
349
+ @database_version,
352
350
  ]
353
351
 
354
352
  (0..components.length-1).collect do |i|
@@ -359,7 +357,8 @@ class EuPathDBSpeciesData
359
357
  # Return a list of EuPathDBSpeciesData objects, one for each species included in the given EuPathDB database
360
358
  def self.species_data_from_database(database_name, base_download_directory=nil)
361
359
  species = @@data.select {|name, info|
362
- info[:source].downcase == database_name.downcase
360
+ info[:source].downcase == database_name.downcase and
361
+ name == info[:name] #only allow ones that are fully specified - not shortcut ones
363
362
  }
364
363
  species.collect do |name_info|
365
364
  EuPathDBSpeciesData.new(name_info[0], base_download_directory)
@@ -410,4 +409,8 @@ class EuPathDBSpeciesData
410
409
  end
411
410
  end
412
411
  end
412
+
413
+ def protein_fasta_file_iterator
414
+ Bio::EuPathDB::FastaParser.new(fasta_file_species_name, protein_fasta_path)
415
+ end
413
416
  end
@@ -1,3 +1,4 @@
1
1
  require 'eupathdb_gene_information_table'
2
2
  require 'eupathdb_gff'
3
3
  require 'eupathdb_species_data'
4
+ require 'eupathdb_fasta'
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{reubypathdb}
8
- s.version = "0.3.0"
8
+ s.version = "0.3.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
- s.authors = ["Ben J Woodcroft"]
12
- s.date = %q{2011-08-26}
11
+ s.authors = [%q{Ben J Woodcroft}]
12
+ s.date = %q{2012-01-14}
13
13
  s.description = %q{Classes to help parsing EuPathDB data files}
14
14
  s.email = %q{donttrustben near gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -19,11 +19,11 @@ Gem::Specification.new do |s|
19
19
  s.files = [
20
20
  ".document",
21
21
  "Gemfile",
22
- "Gemfile.lock",
23
22
  "LICENSE",
24
23
  "README.rdoc",
25
24
  "Rakefile",
26
25
  "VERSION",
26
+ "lib/eupathdb_fasta.rb",
27
27
  "lib/eupathdb_gene_information_table.rb",
28
28
  "lib/eupathdb_gff.rb",
29
29
  "lib/eupathdb_species_data.rb",
@@ -36,9 +36,9 @@ Gem::Specification.new do |s|
36
36
  "test/test_eupathdb_species_data.rb"
37
37
  ]
38
38
  s.homepage = %q{http://github.com/wwood/reubypathdb}
39
- s.licenses = ["MIT"]
40
- s.require_paths = ["lib"]
41
- s.rubygems_version = %q{1.6.1}
39
+ s.licenses = [%q{MIT}]
40
+ s.require_paths = [%q{lib}]
41
+ s.rubygems_version = %q{1.8.6}
42
42
  s.summary = %q{Classes to help parsing EuPathDB data files}
43
43
 
44
44
  if s.respond_to? :specification_version then
@@ -97,17 +97,32 @@ class EuPathDBSpeciesDataTest < Test::Unit::TestCase
97
97
  spd = EuPathDBSpeciesData.new('Plasmodium chabaudi')
98
98
  assert_equal "http://plasmodb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/Pchabaudi/fasta/data",
99
99
  spd.eu_path_db_fasta_download_directory
100
- end
100
+ end
101
101
 
102
- def test_behind_usage_policy
102
+ def test_behind_usage_policy
103
103
  spd = EuPathDBSpeciesData.new('Plasmodium vivax')
104
104
  assert_equal "http://plasmodb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/Pvivax/fasta",
105
105
  spd.eu_path_db_fasta_download_directory
106
- end
106
+ end
107
107
 
108
- def test_representative_strain_name
109
- spd = EuPathDBSpeciesData.new('Trypanosoma brucei')
110
- assert_equal "http://tritrypdb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}/Tbrucei/fasta/TbruceiTreu927Genomic_TriTrypDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}.fasta",
108
+ def test_representative_strain_name
109
+ spd = EuPathDBSpeciesData.new('Trypanosoma brucei')
110
+ assert_equal "http://tritrypdb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}/Tbrucei/fasta/TbruceiTreu927Genomic_TriTrypDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}.fasta",
111
111
  "#{spd.eu_path_db_fasta_download_directory}/#{spd.genomic_fasta_filename}"
112
- end
112
+ end
113
+
114
+ def test_non_default_db_version_of_protein_fasta
115
+ spd = EuPathDBSpeciesData.new('Plasmodium yoelii', base_dir,'100.9')
116
+ assert_equal "/home/ben/phd/data/Plasmodium yoelii/genome/PlasmoDB/100.9/PyoeliiAnnotatedProteins_PlasmoDB-100.9.fasta",
117
+ spd.protein_fasta_path
118
+ end
119
+
120
+ def test_species_data_from_database
121
+ plasmos = EuPathDBSpeciesData.species_data_from_database('PlasmoDB')
122
+ # make sure each species is only produced once - at one point there was a bug where the same species was being generated multiple times
123
+ bergs = plasmos.select{|s|
124
+ EuPathDBSpeciesData.new(s.name).name == 'Plasmodium berghei'
125
+ }
126
+ assert_equal ['Plasmodium berghei'], bergs.collect{|d| d.name}
127
+ end
113
128
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: reubypathdb
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 17
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 3
9
- - 0
10
- version: 0.3.0
9
+ - 1
10
+ version: 0.3.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Ben J Woodcroft
@@ -15,8 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-08-26 00:00:00 +10:00
19
- default_executable:
18
+ date: 2012-01-14 00:00:00 Z
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
22
21
  type: :development
@@ -90,11 +89,11 @@ extra_rdoc_files:
90
89
  files:
91
90
  - .document
92
91
  - Gemfile
93
- - Gemfile.lock
94
92
  - LICENSE
95
93
  - README.rdoc
96
94
  - Rakefile
97
95
  - VERSION
96
+ - lib/eupathdb_fasta.rb
98
97
  - lib/eupathdb_gene_information_table.rb
99
98
  - lib/eupathdb_gff.rb
100
99
  - lib/eupathdb_species_data.rb
@@ -105,7 +104,6 @@ files:
105
104
  - test/helper.rb
106
105
  - test/test_eupathdb_gene_information_table.rb
107
106
  - test/test_eupathdb_species_data.rb
108
- has_rdoc: true
109
107
  homepage: http://github.com/wwood/reubypathdb
110
108
  licenses:
111
109
  - MIT
@@ -135,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
135
133
  requirements: []
136
134
 
137
135
  rubyforge_project:
138
- rubygems_version: 1.6.1
136
+ rubygems_version: 1.8.6
139
137
  signing_key:
140
138
  specification_version: 3
141
139
  summary: Classes to help parsing EuPathDB data files
@@ -1,20 +0,0 @@
1
- GEM
2
- remote: http://rubygems.org/
3
- specs:
4
- git (1.2.5)
5
- jeweler (1.6.4)
6
- bundler (~> 1.0)
7
- git (>= 1.2.5)
8
- rake
9
- rake (0.9.2)
10
- rcov (0.9.10)
11
- shoulda (2.11.3)
12
-
13
- PLATFORMS
14
- ruby
15
-
16
- DEPENDENCIES
17
- bundler (~> 1.0.0)
18
- jeweler (~> 1.6.4)
19
- rcov
20
- shoulda