RubyGems - reubypathdb - Versions diffs - 0.3.0 → 0.3.1 - Mend

reubypathdb 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

data/VERSION +1 -1
data/lib/eupathdb_fasta.rb +70 -0
data/lib/eupathdb_species_data.rb +21 -18
data/lib/reubypathdb.rb +1 -0
data/reubypathdb.gemspec +7 -7
data/test/test_eupathdb_species_data.rb +22 -7
metadata +6 -8
data/Gemfile.lock +0 -20

data/VERSION CHANGED

	@@ -1 +1 @@
1	- 0.3.0
1	+ 0.3.1

data/lib/eupathdb_fasta.rb ADDED

@@ -0,0 +1,70 @@
module Bio
  class EuPathDB
    # Parser for EuPathDB-style FASTA files. EuPathDB databases appear to have
    # settled on definition lines like
    #
    #   >gb|TGME49_000380 | organism=Toxoplasma_gondii_ME49 | product=myb-like DNA binding domain-containing protein | location=TGME49_chrVIII:6835359-6840923(-) | length=1528
    #
    # where the species (organism) name differs between files but the rest of
    # the layout is mostly constant.
    class FastaParser
      include Enumerable

      # The organism name expected in every definition line of the file.
      attr_accessor :species_name

      # species_name:: the text expected after "organism=" in each definition
      #                line, e.g. 'Toxoplasma_gondii_ME49' for the example
      #                line shown in the class comment.
      # filename::     path of the FASTA file to read. Nothing is opened until
      #                #each is called.
      def initialize(species_name, filename)
        @species_name = species_name
        @filename = filename
      end

      # Enumerate through the fasta file entries, yielding one FastaAnnotation
      # per entry. Without a block an Enumerator is returned instead.
      #
      # The underlying flat file is closed once iteration ends, whether it
      # finished normally, was left early (break) or raised.
      def each
        return enum_for(:each) unless block_given?

        @flat = Bio::FlatFile.open(Bio::FastaFormat, @filename)
        begin
          while (entry = next_entry)
            yield entry
          end
        ensure
          @flat.close if @flat
          # Reset so that a later #next_entry returns nil rather than reading
          # from a closed stream.
          @flat = nil
        end
      end

      # Return the next entry in the fasta file as a FastaAnnotation (with its
      # sequence filled in), or nil if there are no more entries or no fasta
      # file is currently open.
      #
      # Raises ParseException if the entry's definition line cannot be parsed.
      def next_entry
        return nil unless @flat

        entry = @flat.next_entry
        return nil unless entry

        annotation = parse_name(entry.definition)
        annotation.sequence = entry.seq
        annotation
      end

      # Parse a definition line (given without the leading '>') into a
      # FastaAnnotation carrying the sequencing centre, gene id, product
      # annotation and scaffold name. The sequence is not set here.
      #
      # Raises ParseException when the line does not follow the expected
      # layout, or when the location field has no "scaffold:" prefix.
      def parse_name(definition)
        # The species name is matched literally: it is escaped so that
        # characters such as '.' (e.g. 'Eimeria_tenella_str._Houghton') are
        # not interpreted as regular expression syntax.
        organism = Regexp.escape(@species_name.to_s)
        regex = /^(\S+)\|(.*?) \| organism=#{organism} \| product=(.*?) \| location=(.*) \| length=\d+$/
        matches = definition.match(regex)
        unless matches
          raise ParseException, "Definition line has unexpected format: `#{definition}'. Trying to match this line to the regular expression `#{regex.inspect}'"
        end

        # The scaffold is everything in the location field before the first ':'
        scaffold_match = matches[4].match(/^(.+?)\:/)
        unless scaffold_match
          raise ParseException, "Definition line has unexpected scaffold format: #{matches[4]}"
        end

        annotation = FastaAnnotation.new
        annotation.sequencing_centre = matches[1]
        annotation.scaffold = scaffold_match[1]
        annotation.gene_id = matches[2]
        annotation.annotation = matches[3]
        annotation
      end
    end

    # Plain value holder for the fields parsed out of one fasta entry.
    class FastaAnnotation
      attr_accessor :gene_id, :sequence, :annotation, :scaffold, :sequencing_centre
    end

    # Raised when a fasta definition line does not have the expected layout.
    class ParseException < Exception; end
  end
end

data/lib/eupathdb_species_data.rb CHANGED

@@ -76,6 +76,7 @@ class EuPathDBSpeciesData
       :source => 'ToxoDB',
       :database_download_folder => 'EtenellaHoughton',
       :behind_usage_policy => true,
+      :fasta_file_species_name => 'Eimeria_tenella_str._Houghton',
     },
     'Toxoplasma gondii' => {
       :name => 'Toxoplasma gondii',
@@ -170,12 +171,12 @@ class EuPathDBSpeciesData
   end
   SOURCE_VERSIONS = {
-    'PlasmoDB' => '7.2',#
-    'ToxoDB' => '6.4',#'7.0',#
-    'CryptoDB' => '4.4',#'4.5',#
-    'PiroplasmaDB' => '1.0',#'1.1',#
+    'PlasmoDB' => '8.2',
+    'ToxoDB' => '7.2',
+    'CryptoDB' => '4.6',
+    'PiroplasmaDB' => '1.1',
     'FungiDB' => '1.0',
-    'TriTrypDB' => '3.2',
+    'TriTrypDB' => '4.0',
   }
   DATABASES = SOURCE_VERSIONS.keys
@@ -186,7 +187,7 @@ class EuPathDBSpeciesData
   #
   # base_data_directory is the directory where locally cached version of the downloaded
   # files are stored.
-  def initialize(nickname, base_data_directory=nil)
+  def initialize(nickname, base_data_directory=nil, database_version=nil)
     @species_data = @@data[nickname] # try the full name
     @species_data ||= @@data[nickname.capitalize.gsub('_',' ')] #try replacing underscores
     if @species_data.nil? # try using just the second word
@@ -195,10 +196,13 @@ class EuPathDBSpeciesData
         @species_data = @@data[splits[1]]
       end
     end
+    raise Exception, "Couldn't find species data for #{nickname}" unless @species_data
     @base_data_directory = base_data_directory
-    raise Exception, "Couldn't find species data for #{nickname}" unless @species_data
+    # record out what version of the db we are looking at, otherwise default
+    @database_version = database_version
+    @database_version ||= SOURCE_VERSIONS[@species_data[:source]]
   end
   def method_missing(symbol)
@@ -237,7 +241,7 @@ class EuPathDBSpeciesData
   end
   def version
-    SOURCE_VERSIONS[@species_data[:source]]
+    @database_version
   end
   def protein_fasta_filename
@@ -294,13 +298,7 @@ class EuPathDBSpeciesData
   end
   def eu_path_db_download_directory
-    directories = {}
-    SOURCE_VERSIONS.each do |db, version|
-      # 'PlasmoDB' => "http://plasmodb.org/common/downloads/release-#{SOURCE_VERSIONS['PlasmoDB']}",
-      directories[db] = "http://#{db.downcase}.org/common/downloads/release-#{version}"
-    end
-    raise Exception, "Base URL for database '#{database}' not known" if directories[database].nil?
-    return "#{directories[database]}/#{one_word_name}"
+    "http://#{database.downcase}.org/common/downloads/release-#{@database_version}/#{one_word_name}"
   end
   def eu_path_db_fasta_download_directory
@@ -331,7 +329,7 @@ class EuPathDBSpeciesData
   def local_download_directory
     s = @species_data
-    "#{@base_data_directory}/#{s[:name]}/genome/#{s[:source]}/#{SOURCE_VERSIONS[s[:source]]}"
+    "#{@base_data_directory}/#{s[:name]}/genome/#{s[:source]}/#{@database_version}"
   end
   # an array of directory names. mkdir is called on each of them in order,
@@ -348,7 +346,7 @@ class EuPathDBSpeciesData
     s[:name],
       'genome',
     s[:source],
-    SOURCE_VERSIONS[s[:source]]
+    @database_version,
     ]
      (0..components.length-1).collect do |i|
@@ -359,7 +357,8 @@ class EuPathDBSpeciesData
   # Return a list of the species names that are included in the EuPathDB database
   def self.species_data_from_database(database_name, base_download_directory=nil)
     species = @@data.select {|name, info|
-    info[:source].downcase == database_name.downcase
+      info[:source].downcase == database_name.downcase and
+      name == info[:name] #only allow ones that are fully specified - not shortcut ones
     }
     species.collect do |name_info|
       EuPathDBSpeciesData.new(name_info[0], base_download_directory)
@@ -410,4 +409,8 @@ class EuPathDBSpeciesData
       end
     end
   end
+  def protein_fasta_file_iterator
+    Bio::EuPathDB::FastaParser.new(fasta_file_species_name, protein_fasta_path)
+  end
 end

data/lib/reubypathdb.rb CHANGED

@@ -1,3 +1,4 @@
 require 'eupathdb_gene_information_table'
 require 'eupathdb_gff'
 require 'eupathdb_species_data'
+require 'eupathdb_fasta'

data/reubypathdb.gemspec CHANGED

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{reubypathdb}
-  s.version = "0.3.0"
+  s.version = "0.3.1"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
-  s.authors = ["Ben J Woodcroft"]
-  s.date = %q{2011-08-26}
+  s.authors = [%q{Ben J Woodcroft}]
+  s.date = %q{2012-01-14}
   s.description = %q{Classes to help parsing EuPathDB data files}
   s.email = %q{donttrustben near gmail.com}
   s.extra_rdoc_files = [
@@ -19,11 +19,11 @@ Gem::Specification.new do |s|
   s.files = [
     ".document",
     "Gemfile",
-    "Gemfile.lock",
     "LICENSE",
     "README.rdoc",
     "Rakefile",
     "VERSION",
+    "lib/eupathdb_fasta.rb",
     "lib/eupathdb_gene_information_table.rb",
     "lib/eupathdb_gff.rb",
     "lib/eupathdb_species_data.rb",
@@ -36,9 +36,9 @@ Gem::Specification.new do |s|
     "test/test_eupathdb_species_data.rb"
   ]
   s.homepage = %q{http://github.com/wwood/reubypathdb}
-  s.licenses = ["MIT"]
-  s.require_paths = ["lib"]
-  s.rubygems_version = %q{1.6.1}
+  s.licenses = [%q{MIT}]
+  s.require_paths = [%q{lib}]
+  s.rubygems_version = %q{1.8.6}
   s.summary = %q{Classes to help parsing EuPathDB data files}
   if s.respond_to? :specification_version then

data/test/test_eupathdb_species_data.rb CHANGED

@@ -97,17 +97,32 @@ class EuPathDBSpeciesDataTest < Test::Unit::TestCase
     spd = EuPathDBSpeciesData.new('Plasmodium chabaudi')
     assert_equal "http://plasmodb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/Pchabaudi/fasta/data",
     spd.eu_path_db_fasta_download_directory
- end
+  end
- def test_behind_usage_policy
+  def test_behind_usage_policy
     spd = EuPathDBSpeciesData.new('Plasmodium vivax')
     assert_equal "http://plasmodb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['PlasmoDB']}/Pvivax/fasta",
     spd.eu_path_db_fasta_download_directory
- end
+  end
- def test_representative_strain_name
-   spd = EuPathDBSpeciesData.new('Trypanosoma brucei')
-   assert_equal "http://tritrypdb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}/Tbrucei/fasta/TbruceiTreu927Genomic_TriTrypDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}.fasta",
+  def test_representative_strain_name
+    spd = EuPathDBSpeciesData.new('Trypanosoma brucei')
+    assert_equal "http://tritrypdb.org/common/downloads/release-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}/Tbrucei/fasta/TbruceiTreu927Genomic_TriTrypDB-#{EuPathDBSpeciesData::SOURCE_VERSIONS['TriTrypDB']}.fasta",
     "#{spd.eu_path_db_fasta_download_directory}/#{spd.genomic_fasta_filename}"
- end
+  end
+  def test_non_default_db_version_of_protein_fasta
+    spd = EuPathDBSpeciesData.new('Plasmodium yoelii', base_dir,'100.9')
+    assert_equal "/home/ben/phd/data/Plasmodium yoelii/genome/PlasmoDB/100.9/PyoeliiAnnotatedProteins_PlasmoDB-100.9.fasta",
+    spd.protein_fasta_path
+  end
+  def test_species_data_from_database
+    plasmos = EuPathDBSpeciesData.species_data_from_database('PlasmoDB')
+    # make sure there is only 1 of each species being produced - there was a bug about that multiple species were being generated at one point
+    bergs = plasmos.select{|s|
+      EuPathDBSpeciesData.new(s.name).name == 'Plasmodium berghei'
+    }
+    assert_equal ['Plasmodium berghei'], bergs.collect{|d| d.name}
+  end
 end

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: reubypathdb
 version: !ruby/object:Gem::Version
-  hash: 19
+  hash: 17
   prerelease:
   segments:
   - 0
   - 3
-  - 0
-  version: 0.3.0
+  - 1
+  version: 0.3.1
 platform: ruby
 authors:
 - Ben J Woodcroft
@@ -15,8 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-08-26 00:00:00 +10:00
-default_executable:
+date: 2012-01-14 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   type: :development
@@ -90,11 +89,11 @@ extra_rdoc_files:
 files:
 - .document
 - Gemfile
-- Gemfile.lock
 - LICENSE
 - README.rdoc
 - Rakefile
 - VERSION
+- lib/eupathdb_fasta.rb
 - lib/eupathdb_gene_information_table.rb
 - lib/eupathdb_gff.rb
 - lib/eupathdb_species_data.rb
@@ -105,7 +104,6 @@ files:
 - test/helper.rb
 - test/test_eupathdb_gene_information_table.rb
 - test/test_eupathdb_species_data.rb
-has_rdoc: true
 homepage: http://github.com/wwood/reubypathdb
 licenses:
 - MIT
@@ -135,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project:
-rubygems_version: 1.6.1
+rubygems_version: 1.8.6
 signing_key:
 specification_version: 3
 summary: Classes to help parsing EuPathDB data files

data/Gemfile.lock DELETED

@@ -1,20 +0,0 @@
-GEM
-  remote: http://rubygems.org/
-  specs:
-    git (1.2.5)
-    jeweler (1.6.4)
-      bundler (~> 1.0)
-      git (>= 1.2.5)
-      rake
-    rake (0.9.2)
-    rcov (0.9.10)
-    shoulda (2.11.3)
-PLATFORMS
-  ruby
-DEPENDENCIES
-  bundler (~> 1.0.0)
-  jeweler (~> 1.6.4)
-  rcov
-  shoulda