rbbt-text 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,87 @@
1
+ require 'rbbt'
2
+ require 'rbbt/bow/misc'
3
+ require 'stemmer'
4
+
5
+ # This module provides methods to extract a bag of words (or bag of bigrams)
6
+ # representation for strings of text, and to produce a vector representations
7
+ # of that bag of words for a given list of terms. This BOW representations of
8
+ # the texts is usually first used to build a Dictionary, and then, with the
9
+ # best selection of terms as determined by the Dictionary::TF_IDF.best of
10
+ # Dictionary::KL.best methods, determine the vector representations for that
11
+ # text.
12
# This module provides methods to extract a bag-of-words (or bag-of-bigrams)
# representation for strings of text, and to produce vector representations
# of that bag of words for a given list of terms. The BOW representation of
# the texts is usually first used to build a Dictionary, and then, with the
# best selection of terms as determined by the Dictionary::TF_IDF.best or
# Dictionary::KL.best methods, determine the vector representations for that
# text.
module BagOfWords
  # Divide the input string into an array of words (sequences of \w
  # characters). Words are downcased and stemmed (String#stem from the
  # 'stemmer' gem), then filtered to remove stopwords, words of 2 characters
  # or fewer, and words with no lowercase letter. The stopword list is the
  # global $stopwords, populated by 'rbbt/bow/misc'.
  #
  # Returns [] for nil input; raises if the stopword list was never loaded.
  def self.words(text)
    return [] if text.nil?
    raise "Stopword list not loaded. Have you installed the wordlists? (rbbt_config prepare wordlists)" if $stopwords.nil?

    text.scan(/\w+/).
      collect{|word| word.downcase.stem}.
      select{|word|
        ! $stopwords.include?(word) &&
        word.length > 2 &&
        word =~ /[a-z]/
      }
  end

  # Take the array of words for the text and form all consecutive-word
  # bigrams. Returns the words plus the bigrams (each bigram as
  # "word1 word2").
  def self.bigrams(text)
    words = words(text)
    bigrams = []
    lastword = nil

    words.each{|word|
      bigrams << "#{lastword} #{word}" if lastword
      lastword = word
    }

    words + bigrams
  end

  # Given an array of terms, return a hash with the number of appearances of
  # each term. The hash has a default value of 0 for unseen terms.
  def self.count(terms)
    count = Hash.new(0)
    terms.each{|word| count[word] += 1}
    count
  end

  # Given a string of text, find all the words (or words plus bigrams, when
  # the bigrams flag is true) and return a hash with their counts.
  def self.terms(text, bigrams = true)
    if bigrams
      count(bigrams(text))
    else
      count(words(text))
    end
  end

  # Given a string of text and a list of terms, which may or may not contain
  # bigrams, return an array with one entry per term holding the number of
  # occurrences of that term in the text. When the bigrams argument is nil it
  # is inferred: bigrams are extracted iff any term contains a space.
  def self.features(text, terms, bigrams = nil)
    bigrams ||= terms.select{|term| term =~ / /}.any?
    count = bigrams ? count(bigrams(text)) : count(words(text))
    count.values_at(*terms)
  end
end
74
+
75
# Convenience shortcuts so any string responds to the BagOfWords
# tokenization methods directly.
class String
  # Shortcut for BagOfWords.words(self)
  def words
    BagOfWords.words(self)
  end

  # Shortcut for BagOfWords.bigrams(self)
  def bigrams
    BagOfWords.bigrams(self)
  end
end
86
+
87
+
@@ -0,0 +1,187 @@
1
# Simple accumulating dictionary: sums the occurrence counts of each term
# across all the documents added to it.
class Dictionary
  # Hash of term => total count, with a default of 0 for unseen terms.
  attr_reader :terms

  def initialize
    @terms = Hash.new(0)
  end

  # Merge a document, given as a {term => count} hash, into the accumulated
  # counts. (The block parameter is accepted for interface compatibility but
  # is unused.)
  def add(terms, &block)
    terms.each{|term, count|
      @terms[term] += count
    }
  end
end
13
+
14
class Dictionary
  # Dictionary that tracks term counts, document counts, and corpus totals so
  # that document frequency (df), term frequency (tf), inverse document
  # frequency (idf) and tf-idf statistics can be computed over the documents
  # added to it.
  class TF_IDF
    attr_reader :terms, :docs, :total_terms, :num_docs

    # Options:
    #   :limit -- maximum number of distinct terms to track (default 500_000).
    #             Once exceeded, terms not already known are ignored.
    def initialize(options = {})
      @term_limit = {
        :limit => 500_000,
      }.merge(options)[:limit]

      @terms = Hash.new(0)
      @docs = Hash.new(0)
      @num_docs = 0
      @total_terms = 0
    end

    # Add one document, given as a {term => count} hash.
    def add(terms)
      if @term_limit && @terms.length > @term_limit
        # BUGFIX: use the non-mutating reject instead of delete_if, which
        # destructively removed entries from the caller's hash.
        terms = terms.reject{|term, count| !@terms.include?(term) }
      end

      terms.each{|term, count|
        @terms[term] += count
        @total_terms += count
        @docs[term] += 1
      }
      @num_docs += 1
    end

    # Document frequency: fraction of documents containing each term.
    def df
      df = Hash.new(0)
      @docs.each{|term, count|
        df[term] = count.to_f / @num_docs
      }
      df
    end

    # Term frequency: fraction of all term occurrences belonging to each term.
    def tf
      tf = Hash.new(0)
      @terms.each{|term, count|
        tf[term] = count.to_f / @total_terms
      }
      tf
    end

    # Inverse document frequency: log(num_docs / docs containing the term).
    def idf
      idf = Hash.new(0)
      num_docs = @num_docs.to_f
      @docs.each{|term, count|
        idf[term] = Math::log(num_docs / count)
      }
      idf
    end

    # tf * idf for every known term.
    def tf_idf
      tf_idf = Hash.new(0)
      num_docs = @num_docs.to_f
      @docs.each{|term, count|
        tf_idf[term] = @terms[term].to_f / @total_terms * Math::log(num_docs / count)
      }
      tf_idf
    end

    # Score the terms whose document frequency lies in [low, hi] and return a
    # {term => score} hash, truncated to the :limit highest-scoring terms when
    # :limit is given. The score is (count / num_docs) * log(1 / df).
    def best(options = {})
      hi, low, limit = {
        :low => 0,
        :hi => 1,
      }.merge(options).
        values_at(:hi, :low, :limit)

      num_docs = @num_docs.to_f
      best = df.select{|term, value|
        value >= low && value <= hi
      }.collect{|p|
        term = p.first
        df_value = p.last
        [term,
          @terms[term].to_f / num_docs * Math::log(1.0 / df_value)
        ]
      }
      if limit
        Hash[*best.sort{|a, b| b[1] <=> a[1]}.slice(0, limit).flatten]
      else
        Hash[*best.flatten]
      end
    end

    # idf weights for the terms selected by best(options).
    def weights(options = {})
      best_terms = best(options).keys
      weights = {}

      num_docs = @num_docs.to_f
      best_terms.each{|term|
        weights[term] = Math::log(num_docs / @docs[term])
      }
      weights
    end
  end
end
112
+
113
class Dictionary
  # Dictionary that scores terms by the symmetrised Kullback-Leibler
  # divergence between their document frequencies in a positive and a
  # negative corpus, each tracked by a Dictionary::TF_IDF.
  class KL
    attr_reader :pos_dict, :neg_dict

    def initialize(options = {})
      @pos_dict = Dictionary::TF_IDF.new(options)
      @neg_dict = Dictionary::TF_IDF.new(options)
    end

    # Union of the terms seen in either corpus.
    def terms
      (pos_dict.terms.keys + neg_dict.terms.keys).uniq
    end

    # Add a document's {term => count} hash to the positive corpus when c is
    # :+ or '+', otherwise to the negative corpus.
    def add(terms, c)
      dict = (c == :+ || c == '+' ? @pos_dict : @neg_dict)
      dict.add(terms)
    end

    # Symmetrised KL divergence for every known term.
    def kl
      kl = {}
      pos_df = @pos_dict.df
      neg_df = @neg_dict.df

      terms.each{|term|
        kl[term] = divergence(pos_df[term], neg_df[term])
      }
      kl
    end

    # Score the terms whose positive or negative document frequency lies in
    # [low, hi] and return a {term => divergence} hash, truncated to the
    # :limit highest-scoring terms when :limit is given.
    def best(options = {})
      hi, low, limit = {
        :low => 0,
        :hi => 1,
      }.merge(options).
        values_at(:hi, :low, :limit)

      pos_df = @pos_dict.df
      neg_df = @neg_dict.df

      best = {}
      terms.select{|term|
        pos_df[term] >= low && pos_df[term] <= hi ||
        neg_df[term] >= low && neg_df[term] <= hi
      }.each{|term|
        best[term] = divergence(pos_df[term], neg_df[term])
      }
      if limit
        Hash[*best.sort{|a, b| b[1] <=> a[1]}.slice(0, limit).flatten]
      else
        Hash[*best.flatten]
      end
    end

    # Term weights are just the KL scores from best(options).
    def weights(options = {})
      best(options)
    end

    private

    # Clamp a document frequency away from 0 and 1 so the logs stay finite.
    def smooth(value)
      return 0.000001 if value == 0
      return 0.999999 if value == 1
      value
    end

    # pos * log(pos/neg) + neg * log(neg/pos), with smoothing applied.
    def divergence(pos, neg)
      pos = smooth(pos)
      neg = smooth(neg)
      pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
    end
  end
end
@@ -0,0 +1,7 @@
1
require 'rbbt'
require 'rbbt/util/open'

Rbbt.add_datafiles 'stopwords' => ['wordlists', 'stopwords']

# Populate the global $stopwords list (used by BagOfWords and RegExpNER)
# from the installed 'stopwords' datafile, when present.
# NOTE: File.exist? replaces the deprecated File.exists? (removed in Ruby 3.2).
$stopwords = Open.read(Rbbt.find_datafile 'stopwords').scan(/\w+/) if File.exist?(Rbbt.find_datafile 'stopwords')
7
+
@@ -0,0 +1,61 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/bow/misc'
3
+
4
# Dictionary-based named entity recognition: builds one regular expression
# per entity code from a lexicon of names, and reports which codes match a
# given text.
class RegExpNER

  # Build a single regexp that matches any of the given names as a whole
  # word. Names are matched longest-first, runs of whitespace inside a name
  # match any whitespace (\s+), and nil/empty names are dropped.
  #
  # BUGFIX: the ignorecase parameter was previously accepted but never used;
  # it now controls Regexp::IGNORECASE as its name (and the default of true)
  # implies.
  def self.build_re(names, ignorecase = true)
    res = names.compact.reject{|n| n.empty?}.
      sort_by{|a| a.length}.reverse.collect{|n| Regexp.quote(n) }

    body = res.join("|").gsub(/\\?\s/, '\s+')
    Regexp.new("\\b(#{ body })\\b", ignorecase ? Regexp::IGNORECASE : nil)
  end

  # Build the code => regexp index from a lexicon file (parsed with TSV).
  # Options (see Misc.add_defaults): :flatten, :case_insensitive, :stopwords.
  # Names that are stopwords are excluded; $stopwords is used when
  # :stopwords is nil or true.
  def initialize(lexicon, options = {})
    options = Misc.add_defaults options, :flatten => true, :case_insensitive => true, :stopwords => nil

    if $stopwords and (options[:stopwords].nil? || options[:stopwords] == true)
      options[:stopwords] = $stopwords
    else
      options[:stopwords] = []
    end

    data = TSV.new(lexicon, options)

    @index = {}
    data.each{|code, names|
      next if code.nil? || code == ""
      if options[:stopwords].any?
        names = names.select{|n|
          ! options[:stopwords].include?(options[:case_insensitive] ? n.downcase : n)
        }
      end
      @index[code] = RegExpNER.build_re(names, options[:case_insensitive])
    }
  end

  # Scan the text with one regexp, or each of an array of regexps, and
  # return all matched substrings in a flat array.
  def self.match_re(text, res)
    res = [res] unless Array === res

    res.collect{|re|
      text.scan(re)
    }.flatten
  end

  # Return a {code => [matched strings]} hash for all lexicon entries found
  # in the text. Codes with no match are absent from the result.
  def match_hash(text)
    return {} if text.nil? or text.empty?
    matches = {}
    @index.each{|code, re|
      RegExpNER.match_re(text, re).each{|match|
        matches[code] ||= []
        matches[code] << match
      }
    }
    matches
  end

  # Alias-style entry point for match_hash.
  def match(text)
    match_hash(text)
  end

end
61
+
@@ -0,0 +1,30 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
2
+ require 'rbbt/bow/bow'
3
+ require 'test/unit'
4
+
5
# Unit tests for BagOfWords (lib/rbbt/bow/bow.rb).
class TestBow < Test::Unit::TestCase

  def test_words
    assert_equal(["hello", "world"], "Hello World".words)
  end

  def test_terms
    text = "Hello World"

    assert_equal(["hello", "world"], BagOfWords.terms(text, false).keys.sort)
    assert_equal(["hello", "hello world", "world"], BagOfWords.terms(text, true).keys.sort)
  end

  def test_features
    text = "Hello world!"
    text += "Hello World Again!"

    assert_equal([2, 2], BagOfWords.features(text, "Hello World".words.uniq.sort))
  end

  # Stemming should reduce plurals to the stem ("Proteins" -> "protein").
  def test_stem
    assert_equal(["protein"], "Proteins".words)
  end
end
29
+
30
+
@@ -0,0 +1,91 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/bow/dictionary'
3
+ require 'rbbt/bow/bow'
4
+ require 'test/unit'
5
+
6
# Unit tests for Dictionary, Dictionary::TF_IDF and Dictionary::KL
# (lib/rbbt/bow/dictionary.rb).
class TestDictionary < Test::Unit::TestCase

  def test_standard
    docs = []
    docs << BagOfWords.terms("Hello World", false)
    docs << BagOfWords.terms("Hello Yin Yin", false)

    dict = Dictionary.new
    docs.each{|doc| dict.add doc}

    assert_equal(2, dict.terms["hello"])
    assert_equal(2, dict.terms["yin"])
    assert_equal(0, dict.terms["bye"])
    assert_equal(1, dict.terms["world"])
  end

  def test_tf_idf
    docs = []
    docs << BagOfWords.terms("Hello World", false)
    docs << BagOfWords.terms("Hello Yin Yin", false)

    dict = Dictionary::TF_IDF.new
    docs.each{|doc| dict.add doc}

    assert_equal(2, dict.terms["hello"])
    assert_equal(2, dict.terms["yin"])
    assert_equal(0, dict.terms["bye"])
    assert_equal(1, dict.terms["world"])

    assert_equal(1, dict.df["hello"])
    assert_equal(0.5, dict.df["yin"])
    assert_equal(0, dict.df["bye"])
    assert_equal(0.5, dict.df["world"])

    assert_equal(2.0/5, dict.tf["hello"])
    assert_equal(2.0/5, dict.tf["yin"])
    assert_equal(0, dict.tf["bye"])
    assert_equal(1.0/5, dict.tf["world"])

    assert_equal(Math::log(1), dict.idf["hello"])
    assert_equal(Math::log(2), dict.idf["yin"])
    assert_equal(0, dict.idf["bye"])
    assert_equal(Math::log(2), dict.idf["world"])

    assert_equal(2.0/5 * Math::log(1), dict.tf_idf["hello"])
    assert_equal(2.0/5 * Math::log(2), dict.tf_idf["yin"])
    assert_equal(0, dict.tf_idf["bye"])
    assert_equal(1.0/5 * Math::log(2), dict.tf_idf["world"])
  end

  def test_best
    docs = []
    docs << BagOfWords.terms("Hello World", false)
    docs << BagOfWords.terms("Hello Yin Yin", false)

    dict = Dictionary::TF_IDF.new
    docs.each{|doc| dict.add doc}

    assert_equal(1, dict.best(:limit => 1).length)
    assert(dict.best(:limit => 1).include? "yin")
  end

  def test_kl
    docs = []
    docs << [BagOfWords.terms("Hello World", false), :+]
    docs << [BagOfWords.terms("Hello Cruel World", false), :+]
    docs << [BagOfWords.terms("Hello Yan Yan", false), :-]
    docs << [BagOfWords.terms("Hello Yin Yin", false), :-]

    dict = Dictionary::KL.new
    docs.each{|doc| dict.add *doc}

    # "hello" appears in every document of both corpora, so its divergence
    # vanishes; "yan"/"yin" are symmetric in the negative corpus.
    assert_equal(0, dict.kl["hello"])
    assert_equal(dict.kl['yan'], dict.kl['yin'])
    assert_in_delta(1 * Math::log(1 / 0.000001), dict.kl["world"], 0.01)
    assert_in_delta(0.5 * Math::log(0.5 / 0.000001), dict.kl["cruel"], 0.01)
  end

end
90
+
91
+
@@ -0,0 +1,9 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
2
+ require 'rbbt/bow/misc'
3
+ require 'test/unit'
4
+
5
# Checks that requiring rbbt/bow/misc populated the $stopwords global
# (i.e. that the wordlists datafiles are installed).
class TestBase < Test::Unit::TestCase
  def test_url
    assert_not_nil($stopwords)
  end
end
@@ -0,0 +1,32 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt-util'
3
+ require 'rbbt/ner/regexpNER'
4
+ require 'test/unit'
5
+
6
# Unit tests for RegExpNER (lib/rbbt/ner/regexpNER.rb) using a small
# comma-separated lexicon written to a temporary file.
class TestRegExpNER < Test::Unit::TestCase

  def test_class
    text = "a bc d e f g h i j k l m n o p q one two"

    lexicon =<<-EOF
C1,a,x,xx,xxx
C2,bc,y,yy,yyy
C3,i,z,zz,zzz,m,one two
    EOF

    file = TmpFile.tmp_file
    File.open(file, 'w'){|f| f.write lexicon}

    # Without stopword filtering every lexicon name present in the text
    # should be reported.
    r = RegExpNER.new(file, :sep => ',', :stopwords => false)
    assert_equal(['a', 'bc', 'i', 'm', 'one two'].sort, r.match_hash(text).values.flatten.sort)

    # With stopword filtering, single-letter stopwords ('a', 'i') drop out.
    r = RegExpNER.new(file, :sep => ',', :stopwords => true)
    assert_equal(['bc', 'm', 'one two'].sort, r.match_hash(text).values.flatten.sort)

    FileUtils.rm file
  end

end
31
+
32
+
@@ -0,0 +1,4 @@
1
require 'test/unit'

# Make lib/ and the test directory itself loadable from any test file.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
4
+
metadata ADDED
@@ -0,0 +1,92 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rbbt-text
3
+ version: !ruby/object:Gem::Version
4
+ hash: 23
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 4
10
+ version: 0.0.4
11
+ platform: ruby
12
+ authors:
13
+ - Miguel Vazquez
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-12-01 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: rbbt-util
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ description: "Text mining tools: named entity recognition and normalization, document classification, bag-of-words, dictionaries, etc"
36
+ email: miguel.vazquez@fdi.ucm.es
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - lib/rbbt/bow/bow.rb
45
+ - lib/rbbt/bow/dictionary.rb
46
+ - lib/rbbt/bow/misc.rb
47
+ - lib/rbbt/ner/regexpNER.rb
48
+ - test/rbbt/bow/test_bow.rb
49
+ - test/rbbt/bow/test_dictionary.rb
50
+ - test/rbbt/bow/test_misc.rb
51
+ - test/rbbt/ner/test_regexpNER.rb
52
+ - test/test_helper.rb
53
+ has_rdoc: true
54
+ homepage: http://github.com/mikisvaz/rbbt-util
55
+ licenses: []
56
+
57
+ post_install_message:
58
+ rdoc_options: []
59
+
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ hash: 3
68
+ segments:
69
+ - 0
70
+ version: "0"
71
+ required_rubygems_version: !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ hash: 3
77
+ segments:
78
+ - 0
79
+ version: "0"
80
+ requirements: []
81
+
82
+ rubyforge_project:
83
+ rubygems_version: 1.3.7
84
+ signing_key:
85
+ specification_version: 3
86
+ summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
87
+ test_files:
88
+ - test/rbbt/bow/test_bow.rb
89
+ - test/rbbt/bow/test_dictionary.rb
90
+ - test/rbbt/bow/test_misc.rb
91
+ - test/rbbt/ner/test_regexpNER.rb
92
+ - test/test_helper.rb