RubyGems - reubypathdb - Versions diffs - 0.1.0 → 0.2.0 - Mend

reubypathdb 0.1.0 → 0.2.0

Files changed (8) hide show

data/VERSION +1 -1
data/lib/eupathdb_gene_information_table.rb +67 -12
data/lib/eupathdb_gff.rb +215 -0
data/lib/jgi_genes.rb +300 -0
data/lib/reubypathdb.rb +2 -0
data/reubypathdb.gemspec +55 -0
metadata +12 -9
data/.gitignore +0 -21

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.1.0
1	+ 0.2.0

data/lib/eupathdb_gene_information_table.rb CHANGED Viewed

@@ -1,29 +1,80 @@
+# Code for interacting with EuPathDB gene information files e.g. http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
+# These gene information files contain a large amount of information about individual genes/proteins in EuPathDBs.
+require 'tempfile'
+# Extracts the information recorded for one particular gene from a
+# EuPathDB gene information file.
+class EuPathDBGeneInformationFileExtractor
+  # A filename path to the gene information file
+  attr_accessor :filename
+  # filename: path to the gene information file. May be omitted and
+  # assigned later through #filename=
+  def initialize(filename = nil)
+    @filename = filename
+  end
+  # Returns a EuPathDBGeneInformation object corresponding to the wanted key. If
+  # there are multiple in the file, only the first is returned. If none are found, nil is returned.
+  #
+  # If grep_hack_lines is defined (as a non-zero integer), then a shortcut is
+  # applied to speed things up: instead of parsing the whole gene info file,
+  # grep that many lines after the "Gene Id: .." line and feed only those
+  # lines into the parser.
+  #
+  # Raises ArgumentError if grep_hack_lines is given but is not an Integer.
+  def extract_gene_info(wanted_gene_id, grep_hack_lines = nil)
+    if grep_hack_lines and grep_hack_lines.to_i != 0
+      raise ArgumentError, "grep_hack_lines should be an integer" unless grep_hack_lines.is_a?(Integer)
+      # grep however many lines from past the point. Rather dodgy, but faster.
+      # The argument-array form of IO.popen does not go through a shell, so
+      # unusual characters in the gene id or the path cannot be interpreted
+      # as shell syntax.
+      command = ['grep', '-A', grep_hack_lines.to_s, '-e', "Gene Id: #{wanted_gene_id}", '--', @filename]
+      IO.popen(command) do |grep_output|
+        begin
+          return find_gene(grep_output, wanted_gene_id)
+        rescue EOFError
+          # grep output is empty when nothing matched, and may stop part way
+          # through a record. Either way there is nothing more to find.
+          return nil
+        end
+      end
+    else
+      # no grep hack. Parse the whole gene information file
+      File.open(@filename) do |io|
+        return find_gene(io, wanted_gene_id)
+      end
+    end
+  end
+  private
+  # Parse genes from io and return the first one whose 'Gene Id' is
+  # wanted_gene_id, or nil if there is no such gene.
+  def find_gene(io, wanted_gene_id)
+    EuPathDBGeneInformationTable.new(io).find do |gene|
+      wanted_gene_id == gene.get_info('Gene Id')
+    end
+  end
+end
+# A class for parsing the 'gene information table' files from EuPathDB, such
+# as http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
+#
+# The usual way of interacting with these is the use of the each method,
+# which returns a EuPathDBGeneInformation object with all of the recorded
+# information in it.
 class EuPathDBGeneInformationTable
   include Enumerable
+  # Initialise using an IO object, say File.open('/path/to/CmurisGene_CryptoDB-4.3.txt'). After opening, the #each method can be used to iterate over the genes that are present in the file
   def initialize(io)
     @io = io
   end
+  # Return a EuPathDBGeneInformation object with
+  # the contained info in it, one at a time
   def each
     while g = next_gene
       yield g
     end
   end
   # Returns a EuPathDBGeneInformation object with all the data you could
   # possibly want.
   def next_gene
     info = EuPathDBGeneInformation.new
     # first, read the table, which should start with the ID column
     line = @io.readline.strip
     while line == ''
       return nil if @io.eof?
       line = @io.readline.strip
     end
     while line != ''
       if matches = line.match(/^(.*?)\: (.*)$/)
         info.add_information(matches[1], matches[2])
@@ -33,7 +84,7 @@ class EuPathDBGeneInformationTable
       line = @io.readline.strip
     end
     # now read each of the tables, which should start with the
     # 'TABLE: <name>' entry
     line = @io.readline.strip
@@ -44,7 +95,7 @@ class EuPathDBGeneInformationTable
       if line == ''
         # add it to the stack unless we are just starting out
         info.add_table(table_name, headers, data) unless table_name.nil?
         # reset things
         table_name = nil
         headers = nil
@@ -63,32 +114,36 @@ class EuPathDBGeneInformationTable
       end
       line = @io.readline.strip
     end
     # return the object that has been created
     return info
   end
 end
+# Each gene in the gene information table is represented
+# by 2 types of information - info and tables.
+# info are 1 line data, whereas tables are tables of
+# data with possibly multiple rows
 class EuPathDBGeneInformation
   def info
     @info
   end
   def get_info(key)
     @info[key]
   end
   alias_method :[], :get_info
   def get_table(table_name)
     @tables[table_name]
   end
   def add_information(key, value)
     @info ||= {}
     @info[key] = value
     "Added info #{key}, now is #{@info[key]}"
   end
   def add_table(name, headers, data)
     @tables ||= {}
     @tables[name] = []

data/lib/eupathdb_gff.rb ADDED Viewed

@@ -0,0 +1,215 @@
+require 'rubygems'
+require 'bio'
+require 'jgi_genes'
+require 'cgi'
+# Parser for EuPathDB (ApiDB) GFF files. These differ from JGI genes files:
+#  - genes on the reverse strand appear in order of their exons, and so
+#    the exons are not all in the correct order with respect to the underlying
+#    sequence.
+class EupathDBGFF < JgiGenesGff
+  # Feature types (the GFF feature column) whose records are skipped
+  attr_accessor :features_to_ignore
+  # Sequence names whose records are skipped by ignore_record?
+  IGNORED_SEQNAME_PATTERNS = [
+    /^apidb\|NC\_/,
+    /^apidb\|API_IRAB/,
+    /^apidb\|M76611/,
+    /^apidb\|X95276/
+    # /^apidb\|Pf/ (commented out in the original list, so not ignored)
+  ]
+  # Open the GFF file at path and read ahead to its first record
+  def initialize(path)
+    @file = File.open path, 'r'
+    @next_gff = read_record
+    @features_to_ignore = [
+      'rRNA',
+      'tRNA',
+      'snRNA',
+      'transcript'
+    ]
+  end
+  # Return the next gene in the file as a PositionedGeneWithOntology, or nil
+  # when there are no more genes. Records are expected in the order
+  # gene, mRNA, one or more CDS, one or more exon; an Exception is raised
+  # when that order is broken, including when the file ends part way
+  # through a gene.
+  def next_gene
+    cur = @next_gff
+    return nil if !cur
+    # Ignore the supercontigs at the start of the file
+    while ignore_line?(cur) or ignore_record?(cur)
+      @next_gff = read_record
+      cur = @next_gff
+      return nil if !cur
+    end
+    if cur.feature != 'gene'
+      raise Exception, "Badly parsed apidb line: #{cur}. Expected gene first."
+    end
+    # save the gene line so its values can be used once the
+    # following mRNA line has been checked
+    gene_line = cur
+    # First mRNA
+    cur = read_record
+    if cur.nil?
+      raise Exception, "Badly parsed apidb line: #{gene_line}. Expected mRNA next, but the file ended."
+    end
+    if cur.feature != 'mRNA'
+      # skip rRNA type genes because they are not relevant
+      if ignore_record?(cur)
+        # skip forward to the next gene
+        while cur.feature != 'gene'
+          cur = read_record
+          if cur.nil?
+            # we have reached the end on an ignored gene. Clear the
+            # look-ahead so that later calls also return nil
+            @next_gff = nil
+            return nil
+          end
+        end
+        @next_gff = cur
+        return next_gene
+      else
+        raise Exception, "Badly parsed apidb line: #{cur}. Expected mRNA next."
+      end
+    end
+    # Setup the gene in itself
+    gene = setup_gene_from_first_line gene_line
+    # setup stuff from mRNA line
+    ids = cur.attributes['Ontology_term']
+    if ids
+      gene.go_identifiers = ids.split ','
+    end
+    # Next CDS
+    cur = read_record
+    if cur.nil? or cur.feature != 'CDS'
+      raise Exception, "Badly parsed apidb line: #{cur}. Expected CDS next."
+    end
+    gene.cds = []
+    while cur and cur.feature == 'CDS'
+      gene.cds.push location_of(cur)
+      cur = read_record
+    end
+    #next exons
+    if cur.nil? or cur.feature != 'exon'
+      raise Exception, "Badly parsed apidb line: #{cur}. Expected exon next."
+    end
+    gene.exons = []
+    while cur and cur.feature == 'exon'
+      gene.exons.push location_of(cur)
+      cur = read_record
+    end
+    # whatever followed the last exon is the start of the next gene (or nil)
+    @next_gff = cur
+    return gene
+  end
+  # ignore this line when parsing the file
+  def ignore_line?(cur)
+    return ['supercontig', 'introgressed_chromosome_region'].include?(cur.feature)
+  end
+  # Certain things I don't want uploaded, like apicoplast genome, etc.
+  # Returns true for a missing record, a record without a seqname, a record
+  # whose feature is in features_to_ignore, or a record whose seqname
+  # matches one of IGNORED_SEQNAME_PATTERNS. Returns false otherwise.
+  def ignore_record?(record)
+    return true if !record or !record.seqname
+    return true if @features_to_ignore.include?(record.feature)
+    return IGNORED_SEQNAME_PATTERNS.any? {|pattern| record.seqname.match(pattern)}
+  end
+  private
+  # Read the next record from the file, or return nil at the end of the
+  # file or of the feature section
+  def read_record
+    line = ""
+    # while blank or comment lines, skip, except for ##Fasta, which
+    # means all the genes have already been defined
+    while line.strip.empty? or line.match(/^\#/)
+      line = @file.gets
+      if !line or line.match(/^\#\#FASTA/)
+        return nil
+      end
+    end
+    return EupathDBGFFRecord.new(line)
+  end
+  # Build a Bio::Location spanning the start and end of the given record
+  def location_of(record)
+    f = Bio::Location.new
+    f.from = record.start
+    f.to = record.end
+    return f
+  end
+  # Given a line describing a gene in an apidb gff file, setup all the
+  # stuff associated with the 'gene' line. Raises an Exception if the ID
+  # attribute is missing or is not of the form 'apidb|<name>'
+  def setup_gene_from_first_line(gene_line)
+    gene = PositionedGeneWithOntology.new
+    gene.start = gene_line.start
+    gene.strand = gene_line.strand
+    aliai = gene_line.attributes['Alias']
+    if aliai
+      aliai.chomp!
+      gene.alternate_ids = aliai.split ','
+    end
+    # make description proper. Genes without a description attribute
+    # are left with a nil description
+    description = gene_line.attributes['description']
+    gene.description = CGI::unescape(description) if description
+    # name - remove the 'apidb|' bit
+    id = gene_line.attributes['ID']
+    match = id ? id.match(/apidb\|(.*)/) : nil
+    if !match or !match[1] or match[1] == ''
+      raise Exception, "Badly parsed gene name: #{id}."
+    end
+    gene.name = match[1]
+    gene.seqname = gene_line.seqname
+    return gene
+  end
+end
+# A GFF record whose attributes column uses the EuPathDB
+# 'key=value;key=value' format
+class EupathDBGFFRecord < JgiGffRecord
+  # Parse the attributes column into the @attributes hash.
+  # eg. ID=apidb|X95275;Name=X95275;description=Plasmodium+falciparum+complete+gene+map+of+plastid-like+DNA+%28IR-A%29.
+  #
+  # Raises an Exception for an entry that is not a single key=value pair
+  # (except for empty values like 'Note=', which are skipped).
+  def parse_attributes(attributes_string)
+    @attributes = Hash.new
+    # The attributes column is missing on 8-column lines, which
+    # JgiGffRecord#initialize accepts. Leave the hash empty then.
+    return if attributes_string.nil?
+    attributes_string.split(';').each {|couple|
+      cs = couple.split '='
+      #deal with attributes like 'Note=;' by ignoring them
+      # I once found one of these in the yeast genome gff
+      next if cs.length == 1 and couple.match(/=/)
+      if cs.length != 2
+        raise Exception, "Badly handled attributes bit in api db gff: '#{cs}' from '#{attributes_string}'"
+      end
+      @attributes[cs[0]] = cs[1]
+    }
+  end
+end
+# A PositionedGene that also carries alternate ids, a description and
+# GO identifiers
+class PositionedGeneWithOntology < PositionedGene
+  attr_accessor :alternate_ids, :description
+  attr_writer :go_identifiers
+  # The assigned GO identifiers, sorted and with duplicates removed,
+  # or nil if none have been assigned
+  def go_identifiers
+    if @go_identifiers
+      @go_identifiers.sort.uniq
+    else
+      nil
+    end
+  end
+end

data/lib/jgi_genes.rb ADDED Viewed

@@ -0,0 +1,300 @@
+require 'bio'
+#
+# Reads a JGI genes GFF file, grouping consecutive records that share a
+# name into genes
+class JgiGenesGff
+  # Open the file at path and read ahead to its first record
+  def initialize(path)
+    @jgi_file = File.open(path, "r")
+    @next_gff = read_record
+  end
+  # Return the next gene as a PositionedGene, or nil if none exists.
+  # Raises an Exception if the gene does not start with an exon, if records
+  # of one gene disagree on strand, seqname or source, or if an unknown
+  # feature type is found.
+  def next_gene
+    first = @next_gff
+    return nil if !first
+    # Make sure the assumption that the first one is an exon is true
+    unless first.feature == 'exon'
+      p first.feature
+      # I'm not sure if this is detrimental or not, but to be safe..
+      raise Exception, "Assumption failed: exon is not first feature in the gene"
+    end
+    seqname = first.seqname
+    strand = first.strand
+    source = first.source
+    name = parse_name(first.attributes)
+    first_exon = Bio::Location.new
+    first_exon.from = first.start
+    first_exon.to = first.end
+    exons = [first_exon]
+    cds = []
+    protein_id = nil #Unknown until we have a CDS line in the file
+    # Keep reading until a record of another gene, or the end of the file
+    following = nil
+    while (following = read_record)
+      break unless parse_name(following.attributes) == name
+      if following.strand != strand or following.seqname != seqname or following.source != source
+        puts "EXCEPTION !!!!!!!!!!!!!!!!!!!"
+        raise Exception, 'Data bug in JGI file or parsing is being done incorrectly'
+      end
+      location = Bio::Location.new
+      location.from = following.start
+      location.to = following.end
+      case following.feature
+      when 'exon'
+        exons.push location
+      when 'CDS'
+        cds.push location
+        protein_id = parse_protein_id(following.attributes)
+      when 'start_codon', 'stop_codon'
+        # recognised, but nothing is recorded for these
+      else
+        puts "EXCEPTION !!!!!!!!!!!!!!!!!!!"
+        raise Exception, "Unknown feature type #{following.feature} found."
+      end
+    end
+    #make ready for the next gene
+    @next_gff = following
+    #create a new positioned gene with the useful characteristics
+    gene = PositionedGene.new
+    gene.seqname = seqname
+    gene.name = name
+    gene.strand = strand
+    gene.start = exons[0].from
+    gene.exons = exons
+    gene.cds = cds
+    gene.protein_id = protein_id
+    return gene
+  end
+  # Return a JgiGenesIterator over the genes of this file
+  def distance_iterator
+    JgiGenesIterator.new(self)
+  end
+  private
+  # Read lines from the file until a non-blank one is found, and create
+  # the next gff object from it, or return nil if none exists
+  def read_record
+    while (line = @jgi_file.gets)
+      return JgiGffRecord.new(line) unless line.strip.empty?
+    end
+    return nil
+  end
+  # Return the name of the gene, given the attributes hash, with any
+  # double quotes removed
+  def parse_name(attributes)
+    attributes['name'].delete('"')
+  end
+  # Return the proteinId attribute as an integer
+  def parse_protein_id(attributes)
+    attributes['proteinId'].to_i
+  end
+end
+# A gene as read from the gff file.
+# cds and exons are assumed to be in increasing order in terms of their
+# positions along the positive strand.
+class PositionedGene
+  attr_accessor :seqname, :name, :strand, :start, :exons, :cds, :protein_id
+  # Return the position of the cds start (the 'from' of the first cds),
+  # or nil if the gene has no coding regions (cds unset or empty)
+  def cds_start
+    first = @cds && @cds.first
+    return first ? first.from : nil
+  end
+  # Return the position of the cds end (the 'to' of the last cds),
+  # or nil if the gene has no coding regions (cds unset or empty)
+  def cds_end
+    last = @cds && @cds.last
+    return last ? last.to : nil
+  end
+  # True if the strand of this gene is '+'
+  def positive_strand?
+    return @strand == '+'
+  end
+end
+# Fixes up JGI to GFF problems. I don't mean to blame anyone but
+# they just don't seem to go together
+class JgiGffRecord < Bio::GFF::Record
+  # Indices of the tab-separated columns of a GFF line
+  SEQNAME_COL = 0
+  SOURCE_COL = 1
+  FEATURE_COL = 2
+  START_COL = 3
+  END_COL = 4
+  SCORE_COL = 5
+  STRAND_COL = 6
+  FRAME_COL = 7
+  ATTRIBUTES_COL = 8
+  # Parse one tab-separated line of a GFF file. The line must have 8 or 9
+  # columns (the attributes column is optional), otherwise an Exception
+  # is raised.
+  def initialize(line)
+    @line = line
+    parts = line.split("\t")
+    if parts.length != 9 and parts.length != 8
+      raise Exception, "Badly formatted GFF line - doesn't have correct number of components '#{line}'"
+    end
+    parse_mandatory_columns(parts)
+    # nil when the line has only 8 columns
+    parse_attributes(parts[ATTRIBUTES_COL])
+  end
+  # Given an array of at least 8 strings, parse the columns into something
+  # that can be understood by this object. Values are stored as the
+  # strings found in the line.
+  def parse_mandatory_columns(parts)
+    @seqname = parts[SEQNAME_COL]
+    @source = parts[SOURCE_COL]
+    @feature = parts[FEATURE_COL]
+    @start = parts[START_COL]
+    @end = parts[END_COL]
+    @score = parts[SCORE_COL]
+    @strand = parts[STRAND_COL]
+    @frame = parts[FRAME_COL]
+  end
+  # parse the last part of a line ('key value; key value' pairs) into a
+  # hash stored in the @attributes instance variable. Double quotes and
+  # surrounding whitespace are removed from the values. Raises an
+  # Exception for an entry that is not exactly one key and one value.
+  def parse_attributes(attribute_string)
+    @attributes = Hash.new #define empty attributes even if there are none
+    if attribute_string
+      #let the fancy parsing begin
+      aparts = attribute_string.split '; '
+      aparts.each do |bit|
+        hbits = bit.split ' '
+        if !hbits or hbits.length != 2
+          raise Exception, "Failed to parse attributes in line: #{@line}"
+        end
+        str = hbits[1].gsub(/\"/, '').strip
+        @attributes[hbits[0]] = str
+      end
+    end
+  end
+  # The line this record was parsed from, unmodified
+  def to_s
+    @line
+  end
+end
+# Steps through the genes of a gene source (any object with a next_gene
+# method that returns nil when exhausted), reporting the upstream distance
+# of each gene in turn
+class JgiGenesIterator
+  def initialize(jgiGenesGffObj)
+    @genbank = jgiGenesGffObj
+    # Setup cycle for iterator
+    @prev_gene = nil
+    @cur_gene = @genbank.next_gene
+    @next_gene = @genbank.next_gene
+    @next_is_first = true
+  end
+  # True if next_distance can be called without overrunning the iterator
+  def has_next_distance
+    # Before the first call to next_distance the gene that would be worked
+    # on is the current one; after that it is the one following it.
+    upcoming = @next_is_first ? @cur_gene : @next_gene
+    return !upcoming.nil?
+  end
+  # Return the current gene: the one the most recent call to next_distance
+  # worked on, or the first gene if next_distance has not been called yet
+  def next_gene
+    return @cur_gene
+  end
+  # Advance to the next gene and return its upstream distance: for a gene
+  # on the positive strand, its cds start minus the cds end of the previous
+  # gene; otherwise the cds start of the following gene minus its cds end.
+  # Returns nil when that neighbouring gene is missing or has a different
+  # seqname. Raises an Exception when there are no more genes.
+  def next_distance
+    # if the first gene in the list
+    if @next_is_first
+      # cycle has already been setup in initialisation
+      @next_is_first = false
+    else
+      #cycle through things
+      if !@cur_gene #if nothing is found
+        raise Exception, 'Unexpected nil cur_gene - a software coding error?'
+      end
+      @prev_gene = @cur_gene
+      @cur_gene = @next_gene
+      @next_gene = @genbank.next_gene
+    end
+    if !@cur_gene
+      raise Exception, 'Overrun iterator - no more genes available. Use has_next_distance'
+    end
+    # We look at the current gene, and return its upstream distance
+    if @cur_gene.positive_strand?
+      # so we want the distance between cur and last then
+      # if last gene undefined or on a different scaffold, return nothing
+      if !@prev_gene or @prev_gene.seqname != @cur_gene.seqname
+        return nil
+      end
+      return @cur_gene.cds_start.to_i - @prev_gene.cds_end.to_i
+    else
+      if !@next_gene or @next_gene.seqname != @cur_gene.seqname
+        return nil
+      end
+      return @next_gene.cds_start.to_i - @cur_gene.cds_end.to_i
+    end
+  end
+end

data/lib/reubypathdb.rb ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ require 'eupathdb_gene_information_table'
2	+ require 'eupathdb_gff'

data/reubypathdb.gemspec ADDED Viewed

@@ -0,0 +1,55 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s|
+  s.name = %q{reubypathdb}
+  s.version = "0.2.0"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Ben J Woodcroft"]
+  s.date = %q{2011-04-19}
+  s.description = %q{Classes to help parsing EuPathDB data files}
+  s.email = %q{donttrustben near gmail.com}
+  s.extra_rdoc_files = [
+    "LICENSE",
+    "README.rdoc"
+  ]
+  s.files = [
+    ".document",
+    "LICENSE",
+    "README.rdoc",
+    "Rakefile",
+    "VERSION",
+    "lib/eupathdb_gene_information_table.rb",
+    "lib/eupathdb_gff.rb",
+    "lib/jgi_genes.rb",
+    "lib/reubypathdb.rb",
+    "reubypathdb.gemspec",
+    "test/data/eupathGeneInformation.txt",
+    "test/helper.rb",
+    "test/test_eupathdb_gene_information_table.rb"
+  ]
+  s.homepage = %q{http://github.com/wwood/reubypathdb}
+  s.require_paths = ["lib"]
+  s.rubygems_version = %q{1.6.2}
+  s.summary = %q{Classes to help parsing EuPathDB data files}
+  s.test_files = [
+    "test/helper.rb",
+    "test/test_eupathdb_gene_information_table.rb"
+  ]
+  if s.respond_to? :specification_version then
+    s.specification_version = 3
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+      s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 0"])
+    else
+      s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
+    end
+  else
+    s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
+  end
+end

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: reubypathdb
 version: !ruby/object:Gem::Version
-  hash: 27
-  prerelease: false
+  hash: 23
+  prerelease:
   segments:
   - 0
-  - 1
+  - 2
   - 0
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Ben J Woodcroft
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-11-01 00:00:00 +11:00
+date: 2011-04-19 00:00:00 +10:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -43,12 +43,15 @@ extra_rdoc_files:
 - README.rdoc
 files:
 - .document
-- .gitignore
 - LICENSE
 - README.rdoc
 - Rakefile
 - VERSION
 - lib/eupathdb_gene_information_table.rb
+- lib/eupathdb_gff.rb
+- lib/jgi_genes.rb
+- lib/reubypathdb.rb
+- reubypathdb.gemspec
 - test/data/eupathGeneInformation.txt
 - test/helper.rb
 - test/test_eupathdb_gene_information_table.rb
@@ -57,8 +60,8 @@ homepage: http://github.com/wwood/reubypathdb
 licenses: []
 post_install_message:
-rdoc_options:
-- --charset=UTF-8
+rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
@@ -82,7 +85,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project:
-rubygems_version: 1.3.7
+rubygems_version: 1.6.2
 signing_key:
 specification_version: 3
 summary: Classes to help parsing EuPathDB data files

data/.gitignore DELETED Viewed

@@ -1,21 +0,0 @@
-## MAC OS
-.DS_Store
-## TEXTMATE
-*.tmproj
-tmtags
-## EMACS
-*~
-\#*
-.\#*
-## VIM
-*.swp
-## PROJECT::GENERAL
-coverage
-rdoc
-pkg
-## PROJECT::SPECIFIC