RubyGems - rbbt - Versions diffs - 1.0.0 - Mend

rbbt 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

data/LICENSE +20 -0
data/README.rdoc +17 -0
data/bin/rbbt_config +180 -0
data/install_scripts/classifier/R/classify.R +36 -0
data/install_scripts/classifier/Rakefile +140 -0
data/install_scripts/get_abner.sh +2 -0
data/install_scripts/get_banner.sh +25 -0
data/install_scripts/get_biocreative.sh +72 -0
data/install_scripts/get_crf++.sh +26 -0
data/install_scripts/get_entrez.sh +4 -0
data/install_scripts/get_go.sh +4 -0
data/install_scripts/get_polysearch.sh +8 -0
data/install_scripts/ner/Rakefile +206 -0
data/install_scripts/ner/config/default.rb +52 -0
data/install_scripts/norm/Rakefile +218 -0
data/install_scripts/norm/config/cue_default.rb +10 -0
data/install_scripts/norm/config/tokens_default.rb +79 -0
data/install_scripts/norm/functions.sh +21 -0
data/install_scripts/organisms/Rakefile +25 -0
data/install_scripts/organisms/cgd.Rakefile +84 -0
data/install_scripts/organisms/human.Rakefile +145 -0
data/install_scripts/organisms/mgi.Rakefile +77 -0
data/install_scripts/organisms/pombe.Rakefile +40 -0
data/install_scripts/organisms/rake-include.rb +258 -0
data/install_scripts/organisms/rgd.Rakefile +88 -0
data/install_scripts/organisms/sgd.Rakefile +66 -0
data/install_scripts/organisms/tair.Rakefile +54 -0
data/install_scripts/organisms/worm.Rakefile +109 -0
data/install_scripts/stopwords +1 -0
data/install_scripts/wordlists/consonants +897 -0
data/install_scripts/wordlists/stopwords +1 -0
data/lib/rbbt/bow/bow.rb +87 -0
data/lib/rbbt/bow/classifier.rb +118 -0
data/lib/rbbt/bow/dictionary.rb +218 -0
data/lib/rbbt/ner/abner.rb +34 -0
data/lib/rbbt/ner/banner.rb +73 -0
data/lib/rbbt/ner/regexpNER.rb +62 -0
data/lib/rbbt/ner/rner.rb +227 -0
data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
data/lib/rbbt/ner/rnorm.rb +142 -0
data/lib/rbbt/sources/biocreative.rb +75 -0
data/lib/rbbt/sources/biomart.rb +106 -0
data/lib/rbbt/sources/entrez.rb +211 -0
data/lib/rbbt/sources/go.rb +40 -0
data/lib/rbbt/sources/organism.rb +197 -0
data/lib/rbbt/sources/polysearch.rb +88 -0
data/lib/rbbt/sources/pubmed.rb +111 -0
data/lib/rbbt/util/arrayHash.rb +255 -0
data/lib/rbbt/util/filecache.rb +72 -0
data/lib/rbbt/util/index.rb +69 -0
data/lib/rbbt/util/misc.rb +101 -0
data/lib/rbbt/util/open.rb +207 -0
data/lib/rbbt/util/simpleDSL.rb +87 -0
data/lib/rbbt/util/tmpfile.rb +19 -0
data/lib/rbbt/version.rb +10 -0
data/lib/rbbt.rb +86 -0
data/tasks/install.rake +123 -0
metadata +114 -0

data/lib/rbbt/util/open.rb ADDED Viewed

@@ -0,0 +1,207 @@
+require 'rbbt/util/tmpfile'
+require 'rbbt'
+# Provides a few helper functions to read and write files, as well as
+# to access remote files. It supports caching the downloaded files.
+module Open
+  class DirectoryNotFoundError < StandardError; end
+  private
+  @@remote_cachedir =  File.join(Rbbt.cachedir, 'open-remote/')
+  FileUtils.mkdir @@remote_cachedir unless File.exist? @@remote_cachedir
+  # If no data is specified and the url is found in the cache the saved
+  # contents are returned, if not found, the url is opened and the contents of
+  # that are returned. If +data+ is specified then it is saved in the
+  # cache under +url+. To match +url+ in the cache a MD5 digest is used.
+  # The location of the cache directory is bu default
+  # File.join(Rbbt.cachedir, 'open-remote/').
+  def self.cache(url, data = nil)
+    require 'digest/md5'
+    digest = Digest::MD5.hexdigest(url)
+    if data
+      Open.write(File.join(@@remote_cachedir, digest), data)
+      return nil
+    else
+      if File.exist? File.join(@@remote_cachedir, digest)
+        return File.open(File.join(@@remote_cachedir, digest)){|file| file.read }
+      else
+        return nil
+      end
+    end
+  end
+  # Checks if +url+ is a remote file.
+  def self.remote(url)
+    url =~ /^(?:http|ssh|https|ftp):\/\//
+  end
+  # Checks if +url+ is a gzip file.
+  def self.gziped(url)
+    if remote(url)
+      return url =~ /\.gz$/ || url =~ /\.gz\?.*$/
+    else
+      return url =~ /\.gz$/
+    end
+  end
+  public
+  # Reads the file specified by url. If the url es local it just opens
+  # the file, if it is remote if checks the cache first. In any case, it
+  # unzips gzip files automatically.
+  #
+  # Options:
+  # * :quiet => Do not print the progress of downloads
+  # * :nocache  => do not use the cache.
+  #
+  def self.read(url, options = {})
+    case
+    when remote(url)
+      if !options[:nocache] && data = cache(url)
+        return data
+      end
+      tmp = TmpFile.tmp_file("open-")
+      `wget -O #{tmp} '#{url}' #{options[:quiet] ? '-q' : '' }`
+      if gziped(url)
+        `mv #{tmp} #{tmp}.gz; gunzip #{tmp}`
+      end
+      cache(url, File.open(tmp){|file| file.read}) unless options[:nocache]
+      data = File.open(tmp){|file| file.read}
+      FileUtils.rm tmp
+      return data
+    when IO === url
+      url.read
+    else
+      return  File.open(url){|file| file.read}
+    end
+  end
+  # Writes the contents on the path specified by filename
+  #
+  # Options:
+  # * :force => Create directories if missing.
+  def self.write(filename, content, options = {})
+    if !File.exist? File.dirname(filename)
+      if options[:force]
+        FileUtils.makedirs(File.dirname(filename))
+      else
+        raise Open::DirectoryNotFoundError, "Directory #{File.dirname(filename)} was not found"
+      end
+    end
+    File.open(filename,'w'){|f|
+      f.write content
+    }
+    nil
+  end
+  # Writes the contents on the path specified by filename. If the file
+  # is present it appends the contents.
+  #
+  # Options:
+  # * :force => Create directories if missing.
+  def self.append(filename, content, options ={})
+    if !File.exist? File.dirname(filename)
+      if options[:force]
+        FileUtils.makedirs(File.dirname(filename))
+      else
+        raise Open::DirectoryNotFoundError, "Directory #{File.dirname(filename)} was not found"
+      end
+    end
+    f = File.open(filename,'a')
+    f.write content
+    f.close
+    nil
+  end
+  # Reads a file with rows with elementes separated by a given pattern
+  # and builds a hash with it. The keys of the hash are the elements in
+  # the :native positions, by default the first (0). The value for each
+  # key is an array with one position for each of the rest possible
+  # positions specified in :extra, by default all but the :native. Since
+  # the native key may be repeated, each of the positions of the values
+  # is in itself an array. There are a number of options to change this
+  # behaviour.
+  #
+  # Options:
+  # * :native => position of the elements that will constitute the keys. By default 0.
+  # * :extra => positions of the rest of elements. By default all but :native. It can be an array of positions or a single position.
+  # * :sep =>  pattern to use in splitting the lines into elements, by default "\t"
+  # * :flatten => flatten the array of arrays that hold the values for each key into a simple array.
+  # * :single => for each key select only the first of the values, instead of the complete array.
+  # * :fix  => A Proc that is called to pre-process the line
+  # * :exclude => A Proc that is called to check if the line must be excluded from the process.
+  def self.to_hash(filename, options = {})
+    native  = options[:native]  || 0
+    extra   = options[:extra]
+    exclude = options[:exclude]
+    fix     = options[:fix]
+    sep     = options[:sep]     || "\t"
+    single  = options[:single]
+    single  = false if single.nil?
+    flatten = options[:flatten] || single
+    flatten = single if flatten.nil?
+    extra = [extra] if extra && ! extra.is_a?( Array)
+    data = {}
+    Open.read(filename).each{|l|
+      l = fix.call(l) if fix
+      next if exclude and exclude.call(l)
+      parts = l.chomp.split(/#{sep}/)
+      id = parts[native]
+      next if id.nil? || id == ""
+      data[id] ||= []
+      if extra
+        fields = extra
+      else
+        fields = (0..(parts.length - 1)).to_a - [native]
+      end
+      fields.each_with_index{|pos,i|
+        data[id][i] ||= []
+        data[id][i] << parts[pos]
+      }
+    }
+    if flatten
+      data.each{|key, values|
+        if values
+          values.flatten!
+          values.collect!{|v|
+            if v != ""
+              v
+            else
+              nil
+            end
+          }
+          values.compact!
+        else
+          nil
+        end
+      }
+    end
+    data = Hash[*(data.collect{|key,values| [key, values.first]}).flatten] if single
+    data
+  end
+end

data/lib/rbbt/util/simpleDSL.rb ADDED Viewed

@@ -0,0 +1,87 @@
+require 'parse_tree_extensions'
+require 'parse_tree'
+require 'ruby2ruby'
+# This class helps design DSLs in Ruby based on method_missing. The class
+# is initialized with a block of code or a file with the code, and it is
+# given a method to be invoked instead of method_missing. This class
+# deals simply with making the method_missing alias, removing it, and
+# executing the block or file with the code.
+class SimpleDSL
+  class ConfigFileMissingError < StandardError; end
+  private
+  def hook_method(method = nil)
+    method ||= :DSL_action
+    @@restore_name = ("restore_DSL_" + method.to_s).to_sym
+    @@method_name = method.to_sym
+    class << self
+      @restore_stack ||= []
+      @restore_stack << @@restore_name
+      alias_method(@@restore_name, :method_missing)
+      alias_method(:method_missing, @@method_name)
+    end
+  end
+  def unhook_method
+    class << self
+      alias_method(:method_missing, @restore_stack.pop)
+    end
+  end
+  public
+  def parse(method = nil, actions = nil, &block)
+    actions ||= block
+    hook_method(method)
+    # Execute
+    if actions.is_a? Proc
+      @config[@@method_name] = actions.to_ruby.collect[1..-2].join
+      instance_eval &actions
+    elsif File.exists?(actions)
+      @config[@@method_name] = File.open(actions).read
+      eval File.open(actions).read
+    end
+    unhook_method
+  end
+  # Processes a DSL. +method+ is the name of the method executed instead
+  # of method_missing. The code to be evaluated as a DSL is either
+  # specified in +&block+ or in the file pointed by +file+.
+  def initialize(method = nil, file = nil, &block)
+    @config = {}
+    if file
+      raise ConfigFileMissingError.new "File '#{ file }' is missing. Have you installed the config files? (rbbt_config install norm)." unless File.exists? file
+      parse(method, file)
+    end
+    if block
+      parse(method, block)
+    end
+  end
+  # Returns the code with the DSL that was executed. If it came from a
+  # block it was turned to string using ruby2ruby.
+  def config(action = nil)
+    if action
+      @config[action.to_sym]
+    else
+      @config[:DSL_action]
+    end
+  end
+end

data/lib/rbbt/util/tmpfile.rb ADDED Viewed

@@ -0,0 +1,19 @@
+require 'fileutils'
+require 'rbbt'
+module TmpFile
+  # Creates a random file name, with the given suffix and a random number
+  # up to +max+
+  def self.random_name( s="",max=10000000)
+    n = rand(max)
+    s << n.to_s
+    s
+  end
+  # Creates a random filename in the temporary directory
+  def self.tmp_file(s = "",max=10000000)
+    File.join(Rbbt.tmpdir,random_name(s,max))
+  end
+end

data/lib/rbbt/version.rb ADDED Viewed

@@ -0,0 +1,10 @@
+module Rbbt
+  module VERSION #:nodoc:
+    MAJOR = 1
+    MINOR = 0
+    TINY  = 0
+    STRING = [MAJOR, MINOR, TINY].join('.')
+    self
+  end
+end

data/lib/rbbt.rb ADDED Viewed

@@ -0,0 +1,86 @@
+$:.unshift(File.dirname(__FILE__)) unless
+  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
+require 'fileutils'
+require 'yaml'
+# This module implements a number of utilities aimed at performing Text
+# Mining of BioMedical data. It includes the following:
+#
+# * Multi-purpose Named Entity Recognition and Normalization. And training data for
+#   Gene Mention from the BioCreative competition.
+# * Document Classification
+# * Interfaces to Gene Ontology, Entrez Gene, BioMart and PubMed
+#
+# There are a number of classes to help gather and integrate the
+# information from all the sources. It is designed to be very flexible,
+# but with a sensible set of defaults.
+#
+module Rbbt
+  class NoConfig < Exception; end
+  @@rootdir = File.dirname(File.dirname(__FILE__))
+  @@datadir = @@cachedir = @@tmpdir = nil
+  def self.load_config
+    if File.exist?(File.join(@@rootdir, 'rbbt.config'))
+      config = YAML.load_file(File.join(@@rootdir, 'rbbt.config'))
+      if config.is_a? Hash
+        @@datadir  = config['datadir'] if config['datadir']
+        @@cachedir = config['cachedir'] if config['cachedir']
+        @@tmpdir   = config['tmpdir'] if config['tmpdir']
+      end
+    end
+    if File.exist?(File.join(ENV['HOME'], '.rbbt'))
+      config = YAML.load_file(File.join(ENV['HOME'], '.rbbt') )
+      if config.is_a? Hash
+        @@datadir  = config['datadir'] if config['datadir']
+        @@cachedir = config['cachedir'] if config['cachedir']
+        @@tmpdir   = config['tmpdir'] if config['tmpdir']
+      end
+    end
+    if @@datadir.nil?  || @@cachedir.nil? || @@tmpdir.nil?
+      raise NoConfig, "rbbt not configured. Edit #{File.join(@@rootdir, 'rbbt.config')} or $HOME/.rbbt"
+    end
+    FileUtils.mkdir_p @@datadir  unless File.exist? @@datadir
+    FileUtils.mkdir_p @@cachedir unless File.exist? @@cachedir
+    FileUtils.mkdir_p @@tmpdir   unless File.exist? @@tmpdir
+    # For some reason banner.jar must be loaded before abner.jar
+    ENV['CLASSPATH'] ||= ""
+    ENV['CLASSPATH'] += ":" + %w(banner abner).collect{|pkg| File.join(datadir, "third_party/#{pkg}/#{ pkg }.jar")}.join(":")
+  end
+  def self.rootdir
+    @@rootdir
+  end
+  def self.datadir
+    @@datadir
+  end
+  def self.cachedir
+    @@cachedir
+  end
+  def self.tmpdir
+    @@tmpdir
+  end
+  self.load_config
+end

data/tasks/install.rake ADDED Viewed

@@ -0,0 +1,123 @@
+require 'rbbt'
+$datadir = Rbbt.datadir
+$scriptdir = File.join(Rbbt.rootdir, '/install_scripts')
+task 'abner' do
+  directory = "#{$datadir}/third_party/abner/"
+  if !File.exists?(File.join(directory, 'abner.jar')) || $force
+    FileUtils.mkdir_p directory
+    `cd #{directory};rm -Rf *; #{$scriptdir}/get_abner.sh;cd -`
+  end
+end
+task 'banner' do
+  directory = "#{$datadir}/third_party/banner/"
+  if !File.exists?(File.join(directory, 'banner.jar')) || $force
+    FileUtils.mkdir_p directory
+    `cd #{directory};rm -Rf *; #{$scriptdir}/get_banner.sh;cd -`
+  end
+end
+task 'crf++' do
+  directory = "#{$datadir}/third_party/crf++/"
+  if !File.exists?(File.join(directory, 'ruby/CRFPP.so')) || $force
+    FileUtils.mkdir_p directory
+    `cd #{directory};rm -Rf *; #{$scriptdir}/get_crf++.sh;cd -`
+  end
+end
+task 'wordlists' do
+  FileUtils.cp_r File.join($scriptdir, 'wordlists/'), $datadir
+end
+task 'polysearch' do
+  directory = "#{$datadir}/dbs/polysearch/"
+  if !File.exists?(File.join(directory,'disease.txt')) || $force
+    FileUtils.mkdir_p directory
+    `cd #{directory}/; rm * -Rf; #{$scriptdir}/get_polysearch.sh;cd -`
+  end
+end
+task '3party' => %w(abner banner crf++)
+task 'entrez' do
+  directory = "#{$datadir}/dbs/entrez/"
+  if !File.exists?(File.join(directory,'gene_info')) || $force
+    FileUtils.mkdir_p directory
+    `cd #{directory}/; rm * -Rf; #{$scriptdir}/get_entrez.sh;cd -`
+  end
+end
+task 'go' do
+  directory = "#{$datadir}/dbs/go/"
+  if !File.exists?(File.join(directory,'gene_ontology.obo')) || $force
+    FileUtils.mkdir_p directory
+    `cd #{directory}/; rm * -Rf; #{$scriptdir}/get_go.sh;cd -`
+  end
+end
+task 'biocreative' do
+  directory = "#{$datadir}/biocreative/"
+  if !File.exists?(File.join(directory, 'BC2GN')) || $force
+    FileUtils.mkdir_p directory
+    `cd #{directory};rm -Rf *; #{$scriptdir}/get_biocreative.sh;cd -`
+  end
+end
+task 'datasets' => %w(entrez biocreative)
+task 'organisms' do
+  directory = "#{$datadir}/organisms"
+  FileUtils.mkdir_p directory
+  %w(Rakefile rake-include.rb).each{|f|
+    FileUtils.cp_r File.join($scriptdir, "organisms/#{ f }"), directory
+  }
+  Dir.glob(File.join($scriptdir, "organisms/*.Rakefile")).each{|f|
+    org = File.basename(f).sub(/.Rakefile/,'')
+    if !File.exists?(File.join(directory, org))
+      FileUtils.mkdir_p File.join(directory, org)
+    end
+    FileUtils.cp f , File.join(directory, "#{ org }/Rakefile")
+  }
+end
+task 'ner' do
+  directory = "#{$datadir}/ner"
+  FileUtils.mkdir_p directory
+  %w(Rakefile config).each{|f|
+    FileUtils.cp_r File.join($scriptdir, "ner/#{ f }"), directory
+  }
+  %w(data model results).each{|d|
+    FileUtils.mkdir_p File.join(directory, d)
+  }
+end
+task 'norm' do
+  directory = "#{$datadir}/norm"
+  FileUtils.mkdir_p directory
+  %w(Rakefile config).each{|f|
+    FileUtils.cp_r File.join($scriptdir, "norm/#{ f }"), directory
+  }
+ %w(results).each{|d|
+  FileUtils.mkdir_p File.join(directory, d)
+  }
+end
+task 'classifier' do
+  directory = "#{$datadir}/classifier"
+  FileUtils.mkdir_p directory
+  %w(Rakefile R).each{|f|
+    FileUtils.cp_r File.join($scriptdir, "classifier/#{ f }"), directory
+  }
+  %w(data model results).each{|d|
+    FileUtils.mkdir_p File.join(directory, d)
+  }
+end

metadata ADDED Viewed

@@ -0,0 +1,114 @@
+--- !ruby/object:Gem::Specification
+name: rbbt
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+platform: ruby
+authors:
+- Miguel Vazquez
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-10-29 00:00:00 +01:00
+default_executable: rbbt_config
+dependencies: []
+description: |-
+  This toolbox includes modules for text-mining, like Named Entity Recognition and Normalization and document
+      classification, as well as data integration modules that interface with PubMed, Entrez Gene, BioMart.
+email: miguel.vazquez@fdi.ucm.es
+executables:
+- rbbt_config
+extensions: []
+extra_rdoc_files:
+- LICENSE
+- README.rdoc
+files:
+- install_scripts/classifier/R/classify.R
+- install_scripts/classifier/Rakefile
+- install_scripts/get_abner.sh
+- install_scripts/get_banner.sh
+- install_scripts/get_biocreative.sh
+- install_scripts/get_crf++.sh
+- install_scripts/get_entrez.sh
+- install_scripts/get_go.sh
+- install_scripts/get_polysearch.sh
+- install_scripts/ner/Rakefile
+- install_scripts/ner/config/default.rb
+- install_scripts/norm/Rakefile
+- install_scripts/norm/config/cue_default.rb
+- install_scripts/norm/config/tokens_default.rb
+- install_scripts/norm/functions.sh
+- install_scripts/organisms/Rakefile
+- install_scripts/organisms/cgd.Rakefile
+- install_scripts/organisms/human.Rakefile
+- install_scripts/organisms/mgi.Rakefile
+- install_scripts/organisms/pombe.Rakefile
+- install_scripts/organisms/rake-include.rb
+- install_scripts/organisms/rgd.Rakefile
+- install_scripts/organisms/sgd.Rakefile
+- install_scripts/organisms/tair.Rakefile
+- install_scripts/organisms/worm.Rakefile
+- install_scripts/stopwords
+- install_scripts/wordlists/consonants
+- install_scripts/wordlists/stopwords
+- lib/rbbt.rb
+- lib/rbbt/bow/bow.rb
+- lib/rbbt/bow/classifier.rb
+- lib/rbbt/bow/dictionary.rb
+- lib/rbbt/ner/abner.rb
+- lib/rbbt/ner/banner.rb
+- lib/rbbt/ner/regexpNER.rb
+- lib/rbbt/ner/rner.rb
+- lib/rbbt/ner/rnorm.rb
+- lib/rbbt/ner/rnorm/cue_index.rb
+- lib/rbbt/ner/rnorm/tokens.rb
+- lib/rbbt/sources/biocreative.rb
+- lib/rbbt/sources/biomart.rb
+- lib/rbbt/sources/entrez.rb
+- lib/rbbt/sources/go.rb
+- lib/rbbt/sources/organism.rb
+- lib/rbbt/sources/polysearch.rb
+- lib/rbbt/sources/pubmed.rb
+- lib/rbbt/util/arrayHash.rb
+- lib/rbbt/util/filecache.rb
+- lib/rbbt/util/index.rb
+- lib/rbbt/util/misc.rb
+- lib/rbbt/util/open.rb
+- lib/rbbt/util/simpleDSL.rb
+- lib/rbbt/util/tmpfile.rb
+- lib/rbbt/version.rb
+- tasks/install.rake
+- LICENSE
+- README.rdoc
+has_rdoc: true
+homepage: http://github.com/mikisvaz/rbbt
+licenses: []
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Bioinformatics and text mining toolbox
+test_files: []