rbbt-text 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. data/lib/rbbt/corpus/corpus.rb +15 -6
  2. data/lib/rbbt/corpus/document.rb +100 -127
  3. data/lib/rbbt/corpus/document_repo.rb +72 -51
  4. data/lib/rbbt/ner/NER.rb +4 -4
  5. data/lib/rbbt/ner/abner.rb +5 -4
  6. data/lib/rbbt/ner/banner.rb +3 -3
  7. data/lib/rbbt/ner/chemical_tagger.rb +3 -3
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
  9. data/lib/rbbt/ner/oscar3.rb +3 -3
  10. data/lib/rbbt/ner/oscar4.rb +3 -3
  11. data/lib/rbbt/ner/patterns.rb +15 -13
  12. data/lib/rbbt/ner/regexpNER.rb +3 -2
  13. data/lib/rbbt/ner/rnorm.rb +2 -2
  14. data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
  15. data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
  16. data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
  17. data/lib/rbbt/ner/segment/relationship.rb +20 -0
  18. data/lib/rbbt/ner/segment/segmented.rb +13 -0
  19. data/lib/rbbt/ner/segment/token.rb +24 -0
  20. data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
  21. data/lib/rbbt/ner/token_trieNER.rb +30 -22
  22. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  23. data/lib/rbbt/nlp/nlp.rb +23 -37
  24. data/test/rbbt/corpus/test_document.rb +39 -37
  25. data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
  26. data/test/rbbt/ner/segment/test_segmented.rb +23 -0
  27. data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
  28. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
  29. data/test/rbbt/ner/test_patterns.rb +11 -12
  30. data/test/rbbt/ner/test_regexpNER.rb +5 -4
  31. data/test/rbbt/ner/test_segment.rb +101 -0
  32. data/test/rbbt/ner/test_token_trieNER.rb +8 -9
  33. data/test/test_helper.rb +6 -6
  34. metadata +40 -22
  35. data/lib/rbbt/ner/annotations/annotated.rb +0 -15
  36. data/lib/rbbt/ner/annotations/relations.rb +0 -25
  37. data/lib/rbbt/ner/annotations/token.rb +0 -28
  38. data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
  39. data/test/rbbt/ner/test_annotations.rb +0 -70

data/lib/rbbt/ner/{annotations → segment}/named_entity.rb RENAMED
@@ -1,17 +1,9 @@
-require 'rbbt/ner/annotations'
+require 'rbbt/ner/segment'
 
 module NamedEntity
-  attr_accessor :type, :code, :score, :segment_types
+  extend Annotation
   include Segment
-
-  def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
-    string.extend NamedEntity
-    string.offset = offset unless offset.nil?
-    string.type = type unless type.nil?
-    string.code = code unless code.nil?
-    string.score = score unless score.nil?
-    string
-  end
+  self.annotation :type, :code, :score
 
   def report
     <<-EOF
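
Note: 0.6.0 replaces the hand-written annotate class methods with the generic Annotation mixin from rbbt-util: extending Annotation and declaring self.annotation generates both the accessors and a setup class method. A minimal sketch of the new call style; the argument order (offset from Segment, then the declared annotations) is inferred from the NamedEntity.setup(match, offset, type, codes) call in token_trieNER.rb below:

    require 'rbbt/ner/segment/named_entity'

    # offset comes from Segment; :type, :code and :score are the declared annotations
    ne = NamedEntity.setup("TP53", 10, "Gene", "TP53:7157", 0.9)
    ne.offset # => 10
    ne.type   # => "Gene"
    ne.code   # => "TP53:7157"
    ne.report # multi-line summary of the entity
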

data/lib/rbbt/ner/segment/relationship.rb ADDED
@@ -0,0 +1,20 @@
+require 'rbbt/ner/segment'
+
+module Relationship
+  extend Annotation
+  include Segment
+  self.annotation :terms
+
+  def html
+    text = <<-EOF
+<span class='Relationship'\
+>#{ self }</span>
+    EOF
+    text.chomp
+  end
+
+  def html_with_entities(*types)
+    annotations.values_at(*types).each do |segments|
+    end
+  end
+end
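
Note: Relationship is new in this release. A short usage sketch, assuming the same setup argument order as NamedEntity (offset first, then the declared :terms); html_with_entities is still a stub that iterates without producing output:

    require 'rbbt/ner/segment/relationship'

    rel = Relationship.setup("TP53 inhibits MDM2", 0, %w(TP53 MDM2))
    rel.terms # => ["TP53", "MDM2"]
    rel.html  # => "<span class='Relationship'>TP53 inhibits MDM2</span>"
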

data/lib/rbbt/ner/segment/segmented.rb ADDED
@@ -0,0 +1,13 @@
+require 'rbbt/annotations'
+require 'rbbt/ner/segment'
+
+module Segmented
+  extend Annotation
+  self.annotation :segments
+
+  def split_segments(skip_segments = false)
+    Segment.split(self, @segments, skip_segments)
+  end
+end
+
+
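
Note: Segmented attaches a list of segments to a string and delegates splitting to Segment.split. A sketch, assuming Segment.split (from rbbt/ner/segment) returns the stretches of text around the attached segments:

    require 'rbbt/ner/segment/segmented'
    require 'rbbt/ner/segment/named_entity'

    text = "TP53 is a tumor suppressor"
    gene = NamedEntity.setup("TP53", 0, "Gene")
    Segmented.setup(text, [gene])
    text.split_segments # => the text split around the TP53 segment
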

data/lib/rbbt/ner/segment/token.rb ADDED
@@ -0,0 +1,24 @@
+require 'rbbt/annotations'
+require 'rbbt/ner/segment'
+
+module Token
+  extend Annotation
+  include Segment
+  self.annotation :original
+
+  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
+
+    tokens = []
+    while matchdata = text.match(split_at)
+      tokens << Token.setup(matchdata.pre_match, start) unless matchdata.pre_match.empty?
+      tokens << Token.setup(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      start += matchdata.end(0)
+      text = matchdata.post_match
+    end
+
+    tokens << Token.setup(text, start) unless text.empty?
+
+    tokens
+  end
+end
+
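
Note: Token.tokenize above is self-contained: it splits on whitespace while keeping parentheses and punctuation as tokens of their own, each set up with its offset into the original string. Tracing the loop by hand:

    require 'rbbt/ner/segment/token'

    tokens = Token.tokenize("BRCA1 (breast cancer 1)")
    tokens              # => ["BRCA1", "(", "breast", "cancer", "1", ")"]
    tokens.first.offset # => 0
    tokens[1].offset    # => 6, the position of "(" in the input
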

data/lib/rbbt/ner/{annotations → segment}/transformed.rb RENAMED
@@ -1,7 +1,16 @@
-require 'rbbt/ner/annotations'
+require 'rbbt/ner/segment'
 module Transformed
   attr_accessor :transformation_offset_differences, :transformation_original
 
+  def self.transform(text, segments, replacement = nil, &block)
+    require 'rbbt/util/misc'
+
+    text.extend Transformed
+    text.replace(segments, replacement, &block)
+
+    text
+  end
+
   def self.with_transform(text, segments, replacement)
     require 'rbbt/util/misc'
 
@@ -14,16 +23,7 @@ module Transformed
 
     text.restore(segments, true)
   end
-
-  def self.transform(text, segments, replacement = nil, &block)
-    require 'rbbt/util/misc'
 
-    text.extend Transformed
-    text.replace(segments, replacement, &block)
-
-    text
-  end
-
   def transform_pos(pos)
     return pos if transformation_offset_differences.nil?
     # tranformation_offset_differences are assumed to be sorted in reverse
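
Note: with_transform masks the given segments while a block runs and then restores the original text (the text.restore(segments, true) call above); transform, now moved before it, applies the replacement without restoring. A hedged sketch, assuming the block receives the transformed text:

    require 'rbbt/ner/segment/transformed'
    require 'rbbt/ner/segment/named_entity'

    text = "TP53 is a tumor suppressor"
    gene = NamedEntity.setup("TP53", 0, "Gene")

    Transformed.with_transform(text, [gene], "[GENE]") do |masked|
      # here the segment is replaced: "[GENE] is a tumor suppressor"
    end
    # after the block the text reads "TP53 is a tumor suppressor" again
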

data/lib/rbbt/ner/token_trieNER.rb CHANGED
@@ -1,7 +1,7 @@
-require 'rbbt-util'
-require 'rbbt/util/tsv'
-require 'rbbt/ner/annotations'
-require 'rbbt/ner/annotations/token'
+require 'rbbt'
+require 'rbbt/tsv'
+require 'rbbt/ner/segment'
+require 'rbbt/ner/segment/token'
 require 'rbbt/ner/NER'
 
 class TokenTrieNER < NER
@@ -16,15 +16,15 @@ class TokenTrieNER < NER
   def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
     if no_clean
       if extend_to_token
-        Token.annotate(clean(token), start, token)
+        Token.setup(clean(token), start, token)
       else
-        clean(token)
+        token
       end
     else
      if extend_to_token
-        Token.annotate(clean(token), start, token)
+        Token.setup(clean(token), start, token)
      else
-        token
+        clean(token)
      end
    end
  end
@@ -137,6 +137,11 @@ class TokenTrieNER < NER
    hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
      names = Array === names ? names : [names]
      names.flatten! if Array === names.first and not Token === names.first.first
+
+      if names.empty?
+        names.unshift code unless TSV === hash and not (hash.fields.nil? or hash.fields.empty?)
+      end
+
      names.each do |name|
        next if name.empty? or (String === name and name.length < 2)
 
@@ -167,7 +172,7 @@ class TokenTrieNER < NER
      return index[head]
    end
 
-    return nil unless (not TCHash === index ) and index.include? :PROCS
+    return nil unless (not TokyoCabinet::HDB === index ) and index.include? :PROCS
 
    index[:PROCS].each do |key,value|
      return value if key.call(head)
@@ -225,16 +230,16 @@ class TokenTrieNER < NER
    match_offset = match_tokens.first.offset
    match_tokens.each{|t|
      match << " " * (t.offset - (match_offset + match.length)) if t.offset > (match_offset + match.length)
-      match << (t.respond_to?(:original) ? t.original : t)
+      match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
    }
 
-    NamedEntity.annotate(match, match_tokens.first.offset, type, codes)
+    NamedEntity.setup(match, match_tokens.first.offset, type, codes)
  end
 
  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
  def initialize(type = nil, file = nil, options = {})
    options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
-      :persistence => false
+      :persist => false
    @slack = slack
    @longest_match = options.delete :longest_match
    @split_at = options.delete :split_at
@@ -242,16 +247,15 @@ class TokenTrieNER < NER
 
    file = [] if file.nil?
    file = [file] unless Array === file
-    @index = Persistence.persist(file, :TokenTRIE, :tsv, options) do |file, options, filename, persistecen_file|
-      if persistecen_file.nil?
-        @index = {}
-      else
-        FileUtils.mkdir_p File.dirname(persistecen_file) unless File.exists? File.dirname(persistecen_file)
-        @index = TCHash.get persistecen_file, true, :marshal
-      end
+    persist_options = Misc.pull_keys options, :persist
+    @index = Persist.persist_tsv(file, options, persist_options) do |data|
+      data.serializer = :marshal if data.respond_to? :serializer and data.serializer == :type
+
+      @index = data
      file.each do |f|
        merge(f, type)
      end
+
      @index
    end
  end
@@ -259,10 +263,10 @@ class TokenTrieNER < NER
  def merge(new, type = nil)
    case
    when TokenTrieNER === new
+      Log.debug "TokenTrieNER merging other TokenTrieNER"
      TokenTrieNER.merge(@index, new.index)
-    when Hash === new
-      TokenTrieNER.merge(@index, new)
    when TSV === new
+      Log.debug "TokenTrieNER merging TSV"
      old_unnamed = new.unnamed
      old_monitor = new.monitor
      new.unnamed = true
@@ -270,8 +274,12 @@ class TokenTrieNER < NER
      TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
      new.unnamed = old_unnamed
      new.monitor = old_monitor
+    when Hash === new
+      Log.debug "TokenTrieNER merging Hash"
+      TokenTrieNER.merge(@index, new)
    when String === new
-      new = TSV.new(new, :flat)
+      Log.debug "TokenTrieNER merging file: #{ new }"
+      new = TSV.open(new, :flat)
      new.unnamed = true
      new.monitor = {:step => 1000, :desc => "Processing TSV into TokenTrieNER"}
      TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
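
Note: with the new persistence API the index is built through Persist.persist_tsv, and dictionaries can be merged from another TokenTrieNER, a TSV, a plain Hash, or a file path (now loaded with TSV.open instead of TSV.new). A hedged usage sketch; the dictionary file is hypothetical and the match call assumes the usual NER interface, which this diff does not show:

    require 'rbbt/ner/token_trieNER'

    ner = TokenTrieNER.new("Gene", nil, :persist => false, :longest_match => true)
    ner.merge('gene_synonyms.tsv')  # hypothetical flat TSV: code followed by synonyms
    matches = ner.match("TP53 and MDM2 interact")
    matches.each do |m|
      puts [m, m.offset, m.type, m.code].inspect  # matches are NamedEntity segments
    end
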

data/lib/rbbt/nlp/genia/sentence_splitter.rb CHANGED
@@ -1,3 +1,4 @@
+require 'rbbt/ner/segment'
 module NLP
   def self.returnFeatures(prevWord, delimiter, nextWord)
     if nextWord.match(/__ss__/)
@@ -206,7 +207,7 @@ module NLP
     offsets.collect do |s,e|
       sentence = text[s..e]
       next if sentence.nil?
-      Segment.annotate sentence, s
+      Segment.setup sentence, s
       sentence
     end
 
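
Note: Segment.setup replaces Segment.annotate throughout the gem; it attaches an offset, and Segment derives an inclusive end position from it, as the chunking code in nlp.rb below relies on (cstart..cend ranges). A minimal sketch:

    sentence = "This is a sentence."
    Segment.setup(sentence, 120)
    sentence.offset # => 120
    sentence.end    # => 138, assuming end is offset + length - 1
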
data/lib/rbbt/nlp/nlp.rb CHANGED
@@ -1,9 +1,9 @@
 require 'rbbt'
 require 'rbbt/util/tmpfile'
-require 'rbbt/util/persistence'
-require 'rbbt/util/resource'
-require 'rbbt/ner/annotations'
-require 'rbbt/ner/annotations/annotated'
+require 'rbbt/persist'
+require 'rbbt/resource'
+require 'rbbt/ner/segment'
+require 'rbbt/ner/segment/segmented'
 require 'rbbt/nlp/genia/sentence_splitter'
 require 'digest/md5'
 
@@ -11,7 +11,7 @@ require 'digest/md5'
 module NLP
 
   extend LocalPersist
-  self.local_persistence_dir = '/tmp/crap'
+  self.local_persist_dir = '/tmp/crap'
 
   #Rbbt.software.opt.StanfordParser.define_as_install Rbbt.share.install.software.StanfordParser.find
   #Rbbt.software.opt.StanfordParser.produce
@@ -81,44 +81,21 @@ module NLP
       sentence = text[s..e]
       next if sentence.nil?
       #sentence.gsub!(NEW_LINE_MASK, "\n")
-      Segment.annotate sentence, s
+      Segment.setup sentence, s
       sentence
     end
   end
 
   module GdepToken
-    attr_accessor :num, :token, :lemma, :chunk, :pos, :bio, :link, :dep
+    extend Annotation
     include Segment
-
-    def self.annotate(token, offset = nil, num = nil, lemma = nil, chunk = nil, pos = nil, bio = nil, link = nil, dep = nil)
-      token.extend GdepToken
-
-      token.offset = offset
-      token.num = num
-      token.lemma = lemma
-      token.chunk = chunk
-      token.pos = pos
-      token.bio = bio
-      token.link = link
-      token.dep = dep
-
-      token
-    end
+    self.annotation :num, :lemma, :chunk, :pos, :bio, :link, :dep
   end
 
   module GdepChunk
-    attr_accessor :type, :parts, :segment_types
+    extend Annotation
     include Segment
-
-    def self.annotate(string, offset = nil, type = nil, parts = nil)
-      string.extend GdepChunk
-
-      string.offset = offset
-      string.type = type
-      string.parts = parts
-
-      string
-    end
+    self.annotation :type, :parts
   end
 
   def self.merge_vp_chunks(chunk_list)
@@ -148,7 +125,7 @@ module NLP
     chunk_start = "B"[0]
     chunk_inside = "I"[0]
 
-    last = GdepToken.annotate("LW")
+    last = GdepToken.setup("LW")
 
     chunk_segments = []
     segment_list.each do |segment|
@@ -159,7 +136,7 @@ module NLP
         cstart = chunk_segments.first.offset
         cend = chunk_segments.last.end
         chunk = sentence[cstart..cend]
-        GdepChunk.annotate(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
+        GdepChunk.setup(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
         chunks << chunk
       end
 
@@ -172,6 +149,15 @@ module NLP
       last = segment
     end
 
+    if chunk_segments.any?
+      cstart = chunk_segments.first.offset
+      cend = chunk_segments.last.end
+      chunk = sentence[cstart..cend]
+      GdepChunk.setup(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
+      chunks << chunk
+    end
+
+
     chunks
   end
 
@@ -188,7 +174,7 @@ module NLP
     tokens = sentence.split(/\n/).collect do |line|
       next if line.empty?
       num, token, lemma, chunk, pos, bio, link, dep = line.split(/\t/)
-      GdepToken.annotate(token, nil, num, lemma, chunk, pos, bio, link, dep)
+      GdepToken.setup(token, nil, num, lemma, chunk, pos, bio, link, dep)
     end.compact
   end
@@ -214,7 +200,7 @@ module NLP
     Gdep.new.tag(sentence).split(/\n/).collect do |line|
       next if line.empty?
       token, lemma, pos, chunk = line.split(/\t/)
-      GdepToken.annotate(token, nil, nil, lemma, chunk, pos)
+      GdepToken.setup(token, nil, nil, lemma, chunk, pos)
       token
     end.compact
   }
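
Note: GdepToken and GdepChunk now follow the same Annotation pattern as NamedEntity. A sketch of setting up a token from one tab-separated Gdep output line, mirroring the parsing code above (the sample line is made up):

    line = "1\tTP53\tTP53\tB-NP\tNN\tB-GENE\t2\tSUB"
    num, token, lemma, chunk, pos, bio, link, dep = line.split(/\t/)
    GdepToken.setup(token, nil, num, lemma, chunk, pos, bio, link, dep)
    token.pos   # => "NN"
    token.chunk # => "B-NP"
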

data/test/rbbt/corpus/test_document.rb CHANGED
@@ -2,11 +2,6 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.r
 require 'rbbt/corpus/document'
 require 'test/unit'
 
-$persistence = TSV.new({})
-$tchash_persistence = TCHash.get(Rbbt.tmp.test.document.persistence.find(:user), true, Persistence::TSV::TSVSerializer)
-$global_persistence = TSV.new({}, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
-$tchash_global_persistence = TSV.new(TCHash.get(Rbbt.tmp.test.global.persistence.find(:user), true, Persistence::TSV::StringArraySerializer), :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
-
 class Document
   define :sentences do
     require 'rbbt/nlp/nlp'
@@ -14,22 +9,22 @@ class Document
   end
 
   define :tokens do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text)
   end
 
   define :long_words do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length > 5}
   end
 
   define :short_words do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length < 5}
   end
 
   define :even_words do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length % 2 == 0}
   end
 
@@ -40,17 +35,30 @@
   define :tokens_again do
     raise "This should be here already"
   end
-
-  persist :sentences
-  persist_in_tsv :tokens
-  persist_in_tsv :long_words, $tchash_persistence, :Literal
-  persist_in_global_tsv :short_words, $global_persistence
-  persist_in_global_tsv :even_words, $tchash_global_persistence
-  persist_in_global_tsv :missing, $tchash_global_persistence
 end
 
 class TestDocument < Test::Unit::TestCase
 
+  def setup
+    global_fields = ["Start", "End", "JSON", "Document ID", "Entity Type"]
+    $persistence = TSV.setup({})
+    $tchash_persistence = Persist.open_tokyocabinet(Rbbt.tmp.test.document.persistence.find(:user), true, :tsv)
+    $global_persistence = TSV.setup({}, :key => "ID", :fields => global_fields)
+    $tchash_global_persistence = TSV.setup(Persist.open_tokyocabinet(Rbbt.tmp.test.global.persistence.find(:user), true, :list), :key => "ID", :fields => global_fields + ["Document ID", "Entity Type"])
+    $tchash_global_persistence.read
+    $tchash_global_persistence.write
+
+    Document.class_eval do
+
+      persist :sentences
+      persist_in_tsv :tokens, :literal
+      persist_in_tsv :long_words, $tchash_persistence, :literal
+      persist_in_global_tsv :short_words, $global_persistence
+      persist_in_global_tsv :even_words, $tchash_global_persistence
+      persist_in_global_tsv :missing, $tchash_global_persistence
+    end
+  end
+
   def test_annotations
 
     text =<<-EOF
@@ -127,7 +135,7 @@ another sentence.
     doc.text = text
 
     sentence = doc.sentences.last
-    Misc.benchmark(10) do
+    Misc.benchmark(1) do
      doc = Document.new(dir)
      doc.text = text
 
@@ -166,6 +174,15 @@ another sentence.
     assert_equal "another", sentence.tokens[2]
     assert_equal sentence.offset + 0, sentence.tokens[0].offset
 
+    assert_equal 2, sentence.long_words.length
+    doc = Document.new(dir)
+    doc.text = text * 10
+    doc.sentences
+    assert_equal sentence, doc.sentences.last
+
+    sentence = doc.sentences.last
+    doc.load_into sentence, :tokens, :long_words
+
     assert_equal 2, sentence.long_words.length
     assert_equal %w(another sentence), sentence.long_words
     assert_equal sentence.offset + "This is ".length, sentence.long_words[0].offset
@@ -183,15 +200,16 @@ another sentence.
     FileUtils.mkdir_p dir
 
 
-    doc = Document.new(dir)
+    global_persistence = TSV.setup({}, :fields => %w(Start End annotation_types JSON) + ["Document ID", "Entity Type"])
+    doc = Document.new(dir, nil, nil, global_persistence)
     doc.text = text * 10
-    doc.docid = "FOOF"
-    doc.short_words
+    doc.docid = "TEST"
+
     doc.sentences
 
     doc = Document.new(dir)
     doc.text = text * 10
-    doc.docid = "FOOF"
+    doc.docid = "TEST"
 
     sentence = doc.sentences.last
 
@@ -201,22 +219,6 @@ another sentence.
       assert_equal 3, sentence.even_words.length
     end
   end
-
-  def test_dump
-    text =<<-EOF
-    This is a
-    sentence. This is
-    another sentence.
-    EOF
-
-    TmpFile.with_file do |dir|
-      FileUtils.mkdir_p dir
-
-      doc = Document.new(dir)
-      doc.text = text * 10
-      tsv = Document.tsv(doc.sentences, ["Literal"])
-    end
-  end
 end
 