RubyGems - rbbt - Versions diffs - 1.2.5 → 2.0.0 - Mend

rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88)

checksums.yaml +7 -0
data/README.rdoc +2 -138
metadata +69 -214
data/LICENSE +0 -20
data/bin/rbbt_config +0 -245
data/install_scripts/classifier/R/classify.R +0 -36
data/install_scripts/classifier/Rakefile +0 -140
data/install_scripts/get_abner.sh +0 -2
data/install_scripts/get_banner.sh +0 -25
data/install_scripts/get_biocreative.sh +0 -72
data/install_scripts/get_crf++.sh +0 -26
data/install_scripts/get_entrez.sh +0 -4
data/install_scripts/get_go.sh +0 -4
data/install_scripts/get_polysearch.sh +0 -8
data/install_scripts/ner/Rakefile +0 -206
data/install_scripts/ner/config/default.rb +0 -52
data/install_scripts/norm/Rakefile +0 -219
data/install_scripts/norm/config/cue_default.rb +0 -10
data/install_scripts/norm/config/tokens_default.rb +0 -86
data/install_scripts/norm/functions.sh +0 -23
data/install_scripts/organisms/Ath.Rakefile +0 -55
data/install_scripts/organisms/Cal.Rakefile +0 -84
data/install_scripts/organisms/Cel.Rakefile +0 -109
data/install_scripts/organisms/Hsa.Rakefile +0 -140
data/install_scripts/organisms/Mmu.Rakefile +0 -77
data/install_scripts/organisms/Rakefile +0 -43
data/install_scripts/organisms/Rno.Rakefile +0 -88
data/install_scripts/organisms/Sce.Rakefile +0 -66
data/install_scripts/organisms/Spo.Rakefile +0 -40
data/install_scripts/organisms/rake-include.rb +0 -252
data/install_scripts/wordlists/consonants +0 -897
data/install_scripts/wordlists/stopwords +0 -1
data/lib/rbbt.rb +0 -83
data/lib/rbbt/bow/bow.rb +0 -88
data/lib/rbbt/bow/classifier.rb +0 -116
data/lib/rbbt/bow/dictionary.rb +0 -187
data/lib/rbbt/ner/abner.rb +0 -34
data/lib/rbbt/ner/banner.rb +0 -73
data/lib/rbbt/ner/dictionaryNER.rb +0 -98
data/lib/rbbt/ner/regexpNER.rb +0 -70
data/lib/rbbt/ner/rner.rb +0 -227
data/lib/rbbt/ner/rnorm.rb +0 -143
data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
data/lib/rbbt/sources/biocreative.rb +0 -75
data/lib/rbbt/sources/biomart.rb +0 -105
data/lib/rbbt/sources/entrez.rb +0 -211
data/lib/rbbt/sources/go.rb +0 -85
data/lib/rbbt/sources/gscholar.rb +0 -74
data/lib/rbbt/sources/organism.rb +0 -241
data/lib/rbbt/sources/polysearch.rb +0 -117
data/lib/rbbt/sources/pubmed.rb +0 -248
data/lib/rbbt/util/arrayHash.rb +0 -266
data/lib/rbbt/util/filecache.rb +0 -72
data/lib/rbbt/util/index.rb +0 -47
data/lib/rbbt/util/misc.rb +0 -106
data/lib/rbbt/util/open.rb +0 -251
data/lib/rbbt/util/rake.rb +0 -183
data/lib/rbbt/util/simpleDSL.rb +0 -87
data/lib/rbbt/util/tmpfile.rb +0 -35
data/tasks/install.rake +0 -124
data/test/rbbt/bow/test_bow.rb +0 -33
data/test/rbbt/bow/test_classifier.rb +0 -72
data/test/rbbt/bow/test_dictionary.rb +0 -91
data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
data/test/rbbt/ner/test_abner.rb +0 -17
data/test/rbbt/ner/test_banner.rb +0 -17
data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
data/test/rbbt/ner/test_regexpNER.rb +0 -33
data/test/rbbt/ner/test_rner.rb +0 -126
data/test/rbbt/ner/test_rnorm.rb +0 -47
data/test/rbbt/sources/test_biocreative.rb +0 -38
data/test/rbbt/sources/test_biomart.rb +0 -31
data/test/rbbt/sources/test_entrez.rb +0 -49
data/test/rbbt/sources/test_go.rb +0 -24
data/test/rbbt/sources/test_organism.rb +0 -59
data/test/rbbt/sources/test_polysearch.rb +0 -27
data/test/rbbt/sources/test_pubmed.rb +0 -39
data/test/rbbt/util/test_arrayHash.rb +0 -257
data/test/rbbt/util/test_filecache.rb +0 -37
data/test/rbbt/util/test_index.rb +0 -31
data/test/rbbt/util/test_misc.rb +0 -20
data/test/rbbt/util/test_open.rb +0 -110
data/test/rbbt/util/test_simpleDSL.rb +0 -57
data/test/rbbt/util/test_tmpfile.rb +0 -21
data/test/test_helper.rb +0 -4
data/test/test_rbbt.rb +0 -11

data/lib/rbbt/ner/abner.rb DELETED

@@ -1,34 +0,0 @@
-require 'rbbt'
-require 'rjb'
-# Offers a Ruby interface to the Abner Named Entity Recognition Package
-# in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
-class Abner
-  @@JFile = Rjb::import('java.io.File')
-  @@Tagger = Rjb::import('abner.Tagger')
-  @@Trainer = Rjb::import('abner.Trainer')
-  # If modelfile is present a custom trained model can be used,
-  # otherwise, the default BioCreative model is used.
-  def initialize(modelfile=nil)
-    if modelfile == nil
-      @tagger = @@Tagger.new(@@Tagger.BIOCREATIVE)
-    else
-      @tagger = @@Tagger.new(@@JFile.new(modelfile))
-    end
-  end
-  # Given a chunk of text, it finds all the mentions appearing in it. It
-  # returns all the mentions found, regardless of type, to be coherent
-  # with the rest of NER packages in Rbbt.
-  def extract(text)
-    res = @tagger.getEntities(text)
-    types = res[1]
-    strings = res[0]
-    return strings.collect{|s| s.to_s}
-  end
-end

data/lib/rbbt/ner/banner.rb DELETED

@@ -1,73 +0,0 @@
-require 'rbbt'
-require 'rjb'
-# Offers a Ruby interface to the Banner Named Entity Recognition Package
-# in Java. Banner[http://banner.sourceforge.net/].
-class Banner
-  @@JFile = Rjb::import('java.io.File')
-  @@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
-  @@CRFTagger = Rjb::import('banner.tagging.CRFTagger')
-  @@ParenthesisPostProcessor = Rjb::import('banner.processing.ParenthesisPostProcessor')
-  @@HeppleTagger = Rjb::import('dragon.nlp.tool.HeppleTagger')
-  @@Sentence = Rjb::import('banner.Sentence')
-  @@EngLemmatiser = Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')
-  # The parameters are set to default values, the only one that one
-  # might want to change is the modelfile to point to a custom trained
-  # one.
-  def initialize(modelfile = File.join(Rbbt.datadir, 'third_party/banner/gene_model.bin'),
-                 lemmadir  = File.join(Rbbt.datadir, 'third_party/banner/nlpdata/lemmatiser'),
-                 taggerdir = File.join(Rbbt.datadir, 'third_party/banner/nlpdata/tagger')
-                )
-    @tokenizer = @@SimpleTokenizer.new
-    model = @@JFile.new(modelfile)
-    lemma =  @@EngLemmatiser.new(lemmadir,false,true)
-    helper =  @@HeppleTagger.new(taggerdir)
-    # The next lines are needed to avoid colisions with
-    # metraprograming that could define load (activesupport in
-    # particular :@ ). RJB seems to call java on method missing
-    class << @@CRFTagger
-      if method_defined? :load
-        undef_method :load
-      end
-    end
-    @tagger    = @@CRFTagger.load( model, lemma, helper)
-    @parenPP   = @@ParenthesisPostProcessor.new()
-  end
-  # Returns an array with the mention found in the provided piece of
-  # text.
-  def extract(text)
-    text.gsub!(/\n/,' ')
-    text.gsub!(/\|/,'/') # Character | gives an error
-    sentence = @@Sentence.new(text)
-    @tokenizer.tokenize(sentence)
-    @tagger.tag(sentence)
-    @parenPP.postProcess(sentence)
-    tagged = sentence.getSGML
-    res = tagged.scan(/<GENE>.*?<\/GENE>/).
-      collect{|r|
-      r.match(/<GENE>(.*?)<\/GENE>/)
-      mention = $1
-      mention.sub!(/^\s*/,'')
-      mention.sub!(/\s*$/,'')
-      mention
-    }
-    res
-  end
-end

data/lib/rbbt/ner/dictionaryNER.rb DELETED

@@ -1,98 +0,0 @@
-# This class loads a dictionary of codes with associated names, it then can
-# find those names in a string of text. It works word-wise.
-class DictionaryNER
-  A_INT   = "a"[0]
-  DOWNCASE_OFFSET = "A"[0].bytes.first - "a"[0].bytes.first
-  require 'rbbt/bow/bow'
-  # Divides a string of text into words. A slash separates words, only if the
-  # second one begins with a letter.
-  def self.chunk(text)
-    text.split(/(?:[\s.,]|-(?=[a-zA-Z]))+/)
-  end
-  # Simplify the text to widen the matches. Currently only downcases the keys
-  def self.simplify(text)
-    if text.length > 2 && text[0] < A_INT && text[1] > A_INT
-      text = (text[0] - DOWNCASE_OFFSET).chr + text[1..-1]
-    else
-      return text
-    end
-  end
-  # Given a dictionary structure, find the matches in the text.
-  def self.match(dict, text) #:nodoc:
-    if Array === text
-      words = text
-    else
-      words = chunk(text)
-    end
-    result = {}
-    words.each_with_index{|word, pos|
-      key = simplify(word)
-      next if dict[key].nil?
-      dict[key].each{|entrie|
-        case
-        when String === entrie
-          result[word] ||= []
-          result[word] << entrie unless result[word].include? entrie
-        when Hash === entrie
-          rec_words  = words[(pos + 1)..-1]
-          rec_result = match(entrie, rec_words)
-          rec_result.each{|rec_key, rec_list|
-            composite_key = word + ' ' + rec_key
-            result[composite_key] ||= []
-            result[composite_key] += rec_list
-            result[composite_key].uniq!
-          }
-        end
-      }
-    }
-    result
-  end
-  # Add a name to a structure
-  def self.add_name(dict, name, code)
-    if Array === name
-      words = name
-    else
-      words = chunk(name)
-    end
-    key = simplify(words.shift)
-    if words.empty?
-      dict[key] ||= []
-      dict[key] << code unless dict[key].include? code
-    else
-      rec_dict = {}
-      add_name(rec_dict, words , code)
-      dict[key] ||= []
-      dict[key] << rec_dict
-    end
-  end
-  def self.load(dictionary)
-    dict = {}
-    dictionary = File.open(dictionary).read if File.exists? dictionary
-    dictionary.each_line{|l|
-      names = l.chomp.split(/\t/)
-      code  = names.shift
-      names.each{|name| add_name(dict, name, code) }
-    }
-    dict
-  end
-  def initialize(dictionary)
-    @dict = DictionaryNER.load(dictionary)
-  end
-  def match(text)
-    DictionaryNER.match(@dict, text)
-  end
-end

data/lib/rbbt/ner/regexpNER.rb DELETED

@@ -1,70 +0,0 @@
-require 'rbbt/util/open'
-require 'rbbt/util/misc'
-class RegExpNER
-  def self.match_re(text, res)
-    res = [res] unless Array === res
-    res.collect{|re|
-      text.scan(re)
-    }.flatten
-  end
-  def self.build_re_old(names, ignorecase=true)
-    names.compact.select{|n| n != ""}.
-      sort{|a,b| b.length <=> a.length}.
-      collect{|n|
-        re = Regexp.quote(n).gsub(/\\?\s/,'\s+')
-      }
-  end
-  def self.build_re(names, ignorecase=true)
-    res = names.compact.select{|n| n != ""}.
-      sort{|a,b| b.length <=> a.length}.
-      collect{|n|
-        Regexp.quote(n)
-      }
-    /\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/
-  end
-  def initialize(lexicon, options = {})
-    options = {:flatten => true, :ignorecase => true, :stopwords => nil}.merge options
-    options[:stopwords] = $stopwords if $stopwords && (options[:stopwords].nil? || options[:stopwords] == true)
-    options[:stopwords] ||= []
-    data = Open.to_hash(lexicon, options)
-    @index = {}
-    data.collect{|code, names|
-      next if code.nil? || code == ""
-      if options[:stopwords].any?
-        names = names.select{|n|
-          ! options[:stopwords].include?(options[:ignorecase] ? n.downcase : n)
-        }
-      end
-      @index[code] = RegExpNER.build_re(names, options[:ignorecase])
-   }
-  end
-  def match_hash(text)
-    return {} if text.nil? || text == ""
-    matches = {}
-    @index.each{|code, re|
-      RegExpNER.match_re(text, re).each{|match|
-         matches[code] ||= []
-         matches[code] << match
-      }
-    }
-    matches
-  end
-  def match(text)
-    match_hash(text)
-  end
-end

data/lib/rbbt/ner/rner.rb DELETED

@@ -1,227 +0,0 @@
-require 'rbbt'
-require 'rbbt/util/open'
-require 'rbbt/util/misc'
-require 'rbbt/util/simpleDSL'
-class NERFeatures < SimpleDSL
-  def self.tokens(text)
-    text.scan(/
-              \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
-              \w-\w*|
-              \w+-[A-Z](?!\w)|
-              \w+|
-              [.,()\/\[\]{}'"+-]
-              /x)
-  end
-  def self.reverse(text)
-    tokens(text).reverse.join(" ")
-  end
-  def define(name, *args, &block)
-    action = *args[0] || block ||  /#{name.to_s}s?/i
-    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
-    @types[name.to_s] = action
-    @order.push name.to_s
-    name.to_s
-  end
-  attr_accessor :reverse
-  def initialize(file = nil, reverse = false, &block)
-    @types   = {}
-    @order   = []
-    @context = []
-    @reverse = reverse
-    file ||= File.join(Rbbt.datadir,'ner/config/default.rb') if !file && !block
-    super(:define,file, &block)
-  end
-  def config
-    @config[:define]
-  end
-  def window(positions)
-    @window = positions
-  end
-  def context(name, &block)
-    if name.is_a? Array
-      @context += name
-    else
-      @context.push name
-      # The block might be wrongly assigned to this function
-      # instead of the actual definition, fix that.
-      if block
-        @types[name] = block
-      end
-    end
-  end
-  def direction(dir)
-    if dir.to_sym == :reverse
-      @reverse = true
-    end
-  end
-  def features(word)
-    values = [word]
-    @order.each{|features|
-      action = @types[features]
-      if action.is_a?(Proc)
-        values.push(action.call(word))
-      else
-        m = action.match(word)
-        if m
-          if m[1]
-            values.push(m[1])
-          else
-            values.push(m != nil)
-          end
-        else
-          values.push(false)
-        end
-      end
-    }
-    values
-  end
-  def template(window=nil)
-    window ||= @window || [1,-1]
-    template = ""
-    i = 1
-    @order.each{|feat|
-      template += "U#{ feat }: %x[0,#{ i }]\n"
-      if @context.include?(feat)
-        window.each{|p|
-          template += "U#{ feat }##{ p}: %x[#{ p },#{ i }]\n"
-        }
-      end
-      i += 1
-    }
-    template += "B\n"
-    template
-  end
-  def text_features(text, positive = nil)
-    text = self.class.reverse(text) if @reverse
-    initial = true
-    self.class.tokens(text).collect{|token|
-      features = features(token)
-      if !positive.nil?
-        features << (positive ? (initial ? 1 : 2) : 0)
-        initial = false
-      end
-      features
-    }
-  end
-  def tagged_features(text, mentions)
-    mentions ||= []
-    mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
-    re = mentions.collect{|mention|
-      Regexp.quote(mention.gsub(/\s+/,' ')).sub(/\\s/,'\s+')
-    }.join("|")
-    positive = false
-    features = []
-    chunks = text.split(/(#{re})/)
-    chunks.each{|t|
-      chunk_features = text_features(t, positive)
-      positive = !positive
-      if @reverse
-        features = chunk_features + features
-      else
-        features = features + chunk_features
-      end
-    }
-    features
-  end
-  def train(features, model)
-    tmp_template = TmpFile.tmp_file("template-")
-    Open.write(tmp_template,template)
-    cmd = "#{File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')} '#{tmp_template}'  '#{features}' '#{model}'"
-    system cmd
-    Open.write(model + '.config',config)
-    FileUtils.rm tmp_template
-  end
-end
-class NER
-  def initialize(model = nil)
-    begin
-      require 'CRFPP'
-    rescue Exception
-      require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
-    end
-    model ||= File.join(Rbbt.datadir, + 'ner/model/BC2')
-    @parser = NERFeatures.new(model + '.config')
-    @reverse = @parser.reverse
-    @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
-  end
-  def extract(text)
-    features = @parser.text_features(text)
-    @tagger.clear
-    features.each{|feats|
-      @tagger.add(feats.join(" "))
-    }
-    @tagger.parse
-    found = []
-    mention = []
-    @tagger.size.times{|i|
-      label = @tagger.y(i)
-      word  = @tagger.x(i,0)
-      if word == ')'
-        mention.push(')') if mention.join =~ /\(/
-        next
-      end
-      case label
-      when 1
-        if mention.any? && ( mention.join(" ").is_special? || mention.select{|m| m.is_special?}.any?)
-          found.push(mention)
-          mention = []
-        end
-        mention.push(word)
-      when 2
-        mention.push(word)
-      when 0
-        found.push(mention) if mention.any?
-        mention = []
-      end
-    }
-    found << mention if mention.any?
-    found.collect{|list|
-      list = list.reverse if @reverse
-      list.join(" ")
-    }
-  end
-end