RubyGems - rbbt-text - Versions diffs - 1.3.1 → 1.3.2 - Mend

rbbt-text 1.3.1 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

checksums.yaml +4 -4
data/lib/rbbt/bow/bow.rb +0 -2
data/lib/rbbt/bow/dictionary.rb +2 -2
data/lib/rbbt/document.rb +15 -5
data/lib/rbbt/document/annotation.rb +5 -2
data/lib/rbbt/document/corpus.rb +26 -3
data/lib/rbbt/document/corpus/pubmed.rb +1 -1
data/lib/rbbt/ner/g_norm_plus.rb +7 -1
data/lib/rbbt/ner/patterns.rb +0 -1
data/lib/rbbt/ner/token_trieNER.rb +28 -15
data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
data/lib/rbbt/nlp/spaCy.rb +52 -0
data/lib/rbbt/segment.rb +6 -4
data/lib/rbbt/segment/annotation.rb +3 -3
data/lib/rbbt/segment/relationship.rb +7 -0
data/lib/rbbt/segment/transformed.rb +2 -2
data/share/install/software/OpenNLP +1 -1
data/test/rbbt/document/test_annotation.rb +5 -5
data/test/rbbt/document/test_corpus.rb +1 -1
data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
data/test/rbbt/segment/test_annotation.rb +3 -4
data/test/rbbt/segment/test_encoding.rb +1 -1
data/test/rbbt/segment/test_named_entity.rb +5 -4
data/test/rbbt/segment/test_range_index.rb +1 -2
data/test/rbbt/test_segment.rb +5 -10
data/test/test_spaCy.rb +32 -0
metadata +6 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: a6965ecde1b38d5bc93d4836ee6d757e2add39a51d64c2f06142bbbd303e22d7
-  data.tar.gz: a5c32ea03ea8214dd8c94ef6e884b59e459e3a7a8e3d26065a0a046b5b9b4778
+  metadata.gz: 05b1cf1981e955652598dd3db811cf8e6a7d64b68535e21834012abe90efe388
+  data.tar.gz: 67017f8a10cbfae51664999218336d638ea6be7c29b5ec305872473672977a41
 SHA512:
-  metadata.gz: 756d240a796e5ac88b4b55368e0e4e3af14b3dd2d8b8b55e49839c3cdc3fa45ee807d648cf86b45b62e7f2f4d9e7fc15567ab21d3356e37a5c3c4316cbcaa841
-  data.tar.gz: 6caa03ec51185cac00cc436bac999b063fccfcc1dbf0e2c09359dad7171c0eea37f80436cc860038a2c1ad17eb9b67a03e88d1ae8ef406ce1c5c874d375d1abd
+  metadata.gz: 03b02dcea1040edfa653e976d9f2f808ed25f9e0164add2fc85afa4417cf8e10ff8dfb27e1927c457f0ff6c6ee90311765ac364b7c5d7c8d9fd51cfff4ab9434
+  data.tar.gz: a5c44f475241da67863ac33ea446e7dbc64283ca53d6642c7c67c1c6e2e34a5d28b1ad678d2a5a44bf3316ed3c063f2d084628b8ff4d79aee8be04db3f8a6ab1

data/lib/rbbt/bow/bow.rb CHANGED

@@ -87,5 +87,3 @@ class String
     BagOfWords.bigrams(self)
   end
 end

data/lib/rbbt/bow/dictionary.rb CHANGED

@@ -95,7 +95,7 @@ class Dictionary::TF_IDF
                      }
                      if limit
-                       Hash[*best.sort{|a,b| b[1] <=>  a[1]}.slice(0, limit).flatten]
+                       Hash[*best.sort{|a,b| b[1] <=>  a[1]}.slice(0, limit-1).flatten]
                      else
                        Hash[*best.flatten]
                      end
@@ -177,7 +177,7 @@ class Dictionary::KL
       best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
     }
     if limit
-      Hash[*best.sort{|a,b| b[1] <=>  a[1]}.slice(0, limit).flatten]
+      Hash[*best.sort{|a,b| b[1] <=>  a[1]}.slice(0, limit-1).flatten]
     else
       best
     end

data/lib/rbbt/document.rb CHANGED

@@ -1,6 +1,5 @@
 require 'rbbt-util'
 require 'rbbt/entity'
-require 'rbbt/document/annotation'
 module DocID
   extend Entity
@@ -19,10 +18,21 @@ module DocID
     DocID.setup([namespace, code, "title"] * ":", :corpus => corpus)
   end
-  def document
-    text = self.corpus[self]
-    namespace, id, type = self.split(":")
-    Document.setup(text, namespace, id, type, :corpus => corpus)
+  property :document => :both do
+    if Array === self
+      namespace, id, type = nil, nil, nil
+      docs = self.collect do |docid|
+        text = self.corpus[docid]
+        namespace, id, type = docid.split(":")
+        #Document.setup(text, namespace, id, type, :corpus => corpus)
+        text
+      end
+      Document.setup(docs, :corpus => corpus)
+    else
+      text = self.corpus[self]
+      namespace, id, type = self.split(":")
+      Document.setup(text, :namespace => namespace, :code => id, :type => type, :corpus => corpus)
+    end
   end
 end

data/lib/rbbt/document/annotation.rb CHANGED

@@ -1,3 +1,4 @@
+require 'rbbt/segment'
 require 'rbbt/segment/annotation'
 module Document
@@ -22,17 +23,19 @@ module Document
     send :property, type => :multiple do |list|
       doc_segments = self.instance_exec list, &block
-      doc_segments = doc_segments.chunked_values_at(self) if Hash === doc_segments
+      doc_segments = doc_segments.chunked_values_at(list) if Hash === doc_segments
       doc_segments.each_with_index do |segments,i|
+        next if segments.nil?
         document = list[i]
-        Segment.align(document, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
+        Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
         segments.each do |segment|
           SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
         end
         docid = document.docid
         segments.each{|s| s.docid = docid if s.docid.nil? }
         segments

data/lib/rbbt/document/corpus.rb CHANGED

@@ -3,17 +3,40 @@ require 'rbbt-util'
 module Document::Corpus
   def self.setup(corpus)
-    corpus.extend Document::Corpus
+    corpus.extend Document::Corpus unless Document::Corpus === corpus
+    corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
+    corpus
   end
   def add_document(document)
-    self[document.docid] = document
+    docid = document.docid
+    return document if self.include?(docid)
+    self.write_and_close do
+      self[docid] = document
+    end
+  end
+  def docids(prefix)
+    prefix += ":" unless prefix[-1] == ":"
+    docids = self.read_and_close do
+      self.prefix(prefix)
+    end
+    DocID.setup(docids, :corpus => self)
+  end
+  def documents(prefix)
+    self.docids(prefix).document
   end
   def [](*args)
     docid, *rest = args
-    res = super(*args)
+    res = self.read_and_close do
+      super(*args)
+    end
     return res if args.length > 1
     namespace, id, type  = docid.split(":")
     if res.nil?

data/lib/rbbt/document/corpus/pubmed.rb CHANGED

@@ -6,7 +6,6 @@ module Document::Corpus
     type = nil if String === type and type.empty?
     res = PubMed.get_article(pmids).collect do |pmid, article|
-      Log.debug "Loading pmid #{pmid}"
       document = if type.nil? || type.to_sym == :abstract
                    Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
                  elsif type.to_sym == :title
@@ -15,6 +14,7 @@ module Document::Corpus
                    raise "No FullText available for #{ pmid }" if article.full_text.nil?
                    Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
                  end
+      Log.debug "Loading pmid #{pmid}"
       add_document(document)
     end

data/lib/rbbt/ner/g_norm_plus.rb CHANGED

@@ -55,11 +55,16 @@ EOF
         Open.mkdir 'tmp'
         texts.each do |name,text|
+          text = Misc.fixutf8(text)
+          text = text.gsub('|', '#').gsub("\n", " ").gsub(/\t/,' ')
           Open.write("input/#{name}.txt") do |f|
-            f.puts "#{name}|a|" << text.gsub("\n\n", "\n·")
+            f.puts "#{name}|a|" << text
             f.puts
           end
         end
         Open.write('config', CONFIG)
         CMD.cmd_log("java -Xmx20G -Xms20G  -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
@@ -95,6 +100,7 @@ EOF
       res[name] = segments
     end
+    res
   end
 end

data/lib/rbbt/ner/patterns.rb CHANGED

@@ -15,7 +15,6 @@ class PatternRelExt
     segments = sentence.segments
     segments = segments.values.flatten if Hash === segments
     Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
-      ppp sentence
       regexpNER.entities(sentence)
     end
   end

data/lib/rbbt/ner/token_trieNER.rb CHANGED

@@ -5,15 +5,27 @@ require 'rbbt/ner/NER'
 require 'rbbt/segment/token'
 class TokenTrieNER < NER
-  def self.clean(token)
+  def self.clean(token, stem = false)
     if token.length > 3
-      token.downcase.sub(/-/,'')
+      upcase = token !~ /[a-z]/
+      token = token.downcase.sub(/-/,'')
+      if stem && ! upcase
+        require 'stemmer'
+        if stem == :double
+          token = token.stem.stem
+        else
+          token = token.stem
+        end
+      end
+      token
     else
       token
     end
   end
-  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
+  def self.prepare_token(token, start, extend_to_token = true, no_clean = false, stem = false)
     if no_clean
       if extend_to_token
         Token.setup(token, :offset => start, :original => token)
@@ -22,25 +34,25 @@ class TokenTrieNER < NER
       end
     else
       if extend_to_token
-        Token.setup(clean(token), :offset => start, :original => token)
+        Token.setup(clean(token, stem), :offset => start, :original => token)
       else
-        clean(token)
+        clean(token, stem)
       end
     end
   end
-  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
+  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, stem = false, start = 0)
     split_at = /\s|(\(|\)|[-."':,])/ if split_at.nil?
     tokens = []
     while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean, stem) unless matchdata.pre_match.empty?
+      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean, stem) if matchdata.captures.any? and not matchdata.captures.first.empty?
       start += matchdata.end(0)
       text = matchdata.post_match
     end
-    tokens << prepare_token(text, start, extend_to_token) unless text.empty?
+    tokens << prepare_token(text, start, extend_to_token, no_clean, stem) unless text.empty?
     tokens
   end
@@ -130,7 +142,7 @@ class TokenTrieNER < NER
     index1
   end
-  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
+  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false, stem = false)
     chunk_size = hash.size / 100
     items_in_chunk = 0
@@ -146,7 +158,7 @@ class TokenTrieNER < NER
       names.each do |name|
         next if name.empty? or (String === name and name.length < 2)
-        tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
+        tokens = Array === name ? name : tokenize(name, false, split_at, no_clean, stem)
         tokens.extend EnumeratedArray
         token_index = index_for_tokens(tokens, code, type, slack)
@@ -240,7 +252,7 @@ class TokenTrieNER < NER
     NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
   end
-  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
+  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
   def initialize(type = nil, file = nil, options = {})
     options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
       :persist => false
@@ -248,6 +260,7 @@ class TokenTrieNER < NER
     @longest_match = options.delete :longest_match
     @split_at = options.delete :split_at
     @no_clean = options.delete :no_clean
+    @stem = options.delete :stem
     file = [] if file.nil?
     file = [file] unless Array === file
@@ -273,7 +286,7 @@ class TokenTrieNER < NER
       Log.debug "TokenTrieNER merging TSV"
       new.with_unnamed do
         new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
         end
       end
     when Hash === new
@@ -284,14 +297,14 @@ class TokenTrieNER < NER
       new = TSV.open(new, :flat)
       new.with_unnamed do
         new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
         end
       end
     end
   end
   def match(text)
-    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
+    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean, stem)
     tokens.extend EnumeratedArray
     tokens.pos = 0

data/lib/rbbt/nlp/genia/sentence_splitter.rb CHANGED

@@ -239,6 +239,7 @@ module NLP
   end
   def self.geniass_sentence_splitter(text)
+    Rbbt.software.opt.Geniass.produce
     offsets = []
     cleaned = text.gsub("\n",NEW_LINE_MASK)
@@ -294,7 +295,7 @@ module NLP
     offsets.collect do |s,e|
       sentence = text[s..e]
       next if sentence.nil?
-      #sentence.gsub!(NEW_LINE_MASK, "\n")
+      sentence.gsub!(NEW_LINE_MASK, "\n")
       Segment.setup sentence, s
       sentence
     end

data/lib/rbbt/nlp/spaCy.rb ADDED

@@ -0,0 +1,52 @@
+require 'rbbt/segment'
+require 'rbbt/document'
+require 'rbbt/segment/annotation'
+require 'rbbt/util/python'
+module SpaCy
+  PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
+  def self.tokens(text, lang = 'en')
+    tokens = []
+    RbbtPython.run 'spacy' do
+      nlp = spacy.load(lang)
+      doc = nlp.call(text)
+      doc.__len__.times do |i|
+        tokens << doc.__getitem__(i)
+      end
+    end
+    tokens
+  end
+  def self.segments(text, lang = 'en')
+    docid = text.docid if Document === text
+    corpus = text.corpus if Document === text
+    tokens = self.tokens(text, lang).collect do |token|
+      info = {}
+      PROPERTIES.each do |p|
+        info[p] = token.instance_eval(p.to_s)
+      end
+      info[:type] = "SpaCy"
+      info[:offset] = token.idx
+      info[:dep] = token.dep_ + "->" + token.head.idx.to_s
+      info[:docid] = docid if docid
+      info[:corpus] = corpus if corpus
+      SpaCyToken.setup(token.text, info)
+    end
+    SpaCyToken.setup(tokens, :corpus => corpus)
+  end
+end
+module SpaCyToken
+  extend Entity
+  include SegmentAnnotation
+  self.annotation *SpaCy::PROPERTIES
+  self.annotation :dep
+end
+if __FILE__ == $0
+  ppp Annotated.tsv(SpaCy.segments("I tell a story"), :all)
+end

data/lib/rbbt/segment.rb CHANGED

@@ -1,5 +1,6 @@
 require 'rbbt-util'
 require 'rbbt/entity'
+require 'rbbt/document'
 module SegID
   extend Entity
@@ -10,11 +11,11 @@ module SegID
   end
   def range
-    @range ||= Range.new(*_parts.last.split("..").map(&:to_i))
+    @range ||= Range.new(*_parts[4].split("..").map(&:to_i))
   end
   def docid
-    @docid ||= _parts[0..3] * ":"
+    @docid ||= DocID.setup(_parts[0..3] * ":")
   end
   def offset
@@ -25,12 +26,13 @@ module SegID
     range.end - range.begin + 1
   end
-  property :segment do
+  property :segment => :single do
+    docid = self.docid
     document = DocID.setup(docid, :corpus => corpus).document
     text = document[range]
-    Segment.setup(text, docid)
+    Segment.setup(text, :docid => docid, :offset => offset)
   end
   property :segid do

data/lib/rbbt/segment/annotation.rb CHANGED

@@ -1,6 +1,6 @@
 require 'rbbt-util'
-require 'rbbt/entity'
 require 'rbbt/segment'
+require 'rbbt/entity'
 module AnnotID
   extend Entity
@@ -32,7 +32,7 @@ end
 module SegmentAnnotation
   extend Entity
-  include Segment
+  include Object::Segment
   self.annotation :type
   property :segid do
@@ -47,7 +47,7 @@ module SegmentAnnotation
   end
   property :annotid do |corpus=nil|
-    AnnotID.setup([segid, type] * ":", :corpus => corpus)
+    AnnotID.setup([segid, type, Misc.obj2digest(self.info)] * ":", :corpus => corpus)
   end
   alias id annotid

data/lib/rbbt/segment/relationship.rb ADDED

@@ -0,0 +1,7 @@
+module Relationship
+  extend Entity
+  self.annotation :segments
+  self.annotation :type
+end

data/lib/rbbt/segment/transformed.rb CHANGED

@@ -69,8 +69,8 @@ module Transformed
     segments = [segments] unless Array === segments
     orig_length = self.length
-    offset = self.respond_to?(:offset) ? self.offset : 0
-    segments = segments.select{|s| s.offset >= offset && s.offset <= offset + self.length - 1 }
+    offset = self.respond_to?(:offset) ? self.offset.to_i : 0
+    segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
     Segment.clean_sort(segments).each do |segment|
       next if segment.offset.nil?

data/share/install/software/OpenNLP CHANGED

@@ -1,7 +1,7 @@
 #!/bin/bash
 name='OpenNLP'
-url="http://apache.rediris.es/opennlp/opennlp-1.9.1/apache-opennlp-1.9.1-bin.tar.gz"
+url="http://apache.rediris.es/opennlp/opennlp-1.9.2/apache-opennlp-1.9.2-bin.tar.gz"
 get_src "$name" "$url"
 move_opt "$name"

data/test/rbbt/document/test_annotation.rb CHANGED

@@ -36,7 +36,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
     corpus.add_document(text)
@@ -50,7 +50,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text2, "TEST", "test_doc2", nil)
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
     corpus.add_document(text1)
     corpus.add_document(text2)
@@ -68,7 +68,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
     corpus.add_document(text)
@@ -95,7 +95,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
     corpus.add_document(text)
@@ -122,7 +122,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
     corpus.add_document(text)

data/test/rbbt/document/test_corpus.rb CHANGED

@@ -26,7 +26,7 @@ class TestDocumentCorpus < Test::Unit::TestCase
       corpus.add_document(text)
-      assert corpus.prefix("TEST:").include?(text.docid)
+      assert corpus.docids("TEST:").include?(text.docid)
     end
   end
 end

data/test/rbbt/ner/test_g_norm_plus.rb CHANGED

@@ -5,12 +5,17 @@ Log.severity = 0
 class TestGNormPlus < Test::Unit::TestCase
   def test_match
     text =<<-EOF
-We found that TP53 is regulated by MDM2 in Homo sapiens
+Introduction
+We found that TP53 is regulated by MDM2 in Homo
+sapiens
     EOF
     mentions = GNormPlus.process({:file => text})
     assert_equal 1, mentions.length
-    assert_equal 2, mentions["file"].length
+    assert_equal 3, mentions["file"].length
   end
   def test_entities
@@ -19,7 +24,10 @@ We found that TP53 is regulated by MDM2 in Homo sapiens
     EOF
     mentions = GNormPlus.entities({:file => text})
-    mentions["file"].include? "TP53"
+    assert mentions["file"].include?("TP53")
+    mentions["file"].each do |mention|
+      assert_equal mention, text[mention.range].sub("\n", ' ')
+    end
   end
 end

data/test/rbbt/nlp/genia/test_sentence_splitter.rb CHANGED

@@ -7,13 +7,37 @@ class TestNLP < Test::Unit::TestCase
 This is a sentence.
 A funky character ™ in a sentence.
 This is a sentence.
-This is a
+This is a broken
 sentence. This is
-another sentence.
+another broken sentence.
     EOF
-    assert_equal "This is a \nsentence.", NLP.geniass_sentence_splitter(text)[3]
+    iii NLP.geniass_sentence_splitter(text)
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
   end
+  def test_sentences_2
+    text =<<-EOF
+This is a sentence.
+This is a sentence.
+This is a broken
+sentence. This is
+another broken sentence.
+    EOF
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
+  end
+  def test_sentences_ext
+    text =<<-EOF
+This is a sentence.
+This is a sentence.
+This is a broken
+sentence. This is
+another broken sentence.
+    EOF
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
+  end
 end

data/test/rbbt/segment/test_annotation.rb CHANGED

@@ -12,18 +12,17 @@ class TestAnnotation < Test::Unit::TestCase
     segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
     annotation = SegmentAnnotation.setup(segment, :type => :verb)
-    assert_equal 'verb', annotation.annotid.split(":").last
+    assert_equal 'verb', annotation.annotid.split(":")[5]
     annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
-    assert_equal 'verb', annotation.annotid.split(":").last
+    assert_equal 'verb', annotation.annotid.split(":")[5]
   end
   def test_annotid
     text = "This is a document"
     Document.setup(text, "TEST", "test_doc1", nil)
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
     corpus.add_document(text)

data/test/rbbt/segment/test_encoding.rb CHANGED

@@ -2,7 +2,7 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helpe
 require 'rbbt/segment/encoding'
 class TestEncoding < Test::Unit::TestCase
-  def _test_bad_chars
+  def test_bad_chars
     text = "A funky character ™ in a sentence."
     assert_equal ["™"], Segment.bad_chars(text)

data/test/rbbt/segment/test_named_entity.rb CHANGED

@@ -22,12 +22,13 @@ class TestClass < Test::Unit::TestCase
     assert_equal "SCORE", a.score
   end
-  def __test_tsv
+  def test_tsv
     a = "test"
     NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
-    assert Segment.tsv([a]).fields.include? "code"
-    assert Segment.tsv([a], nil).fields.include? "code"
-    assert Segment.tsv([a], "literal").fields.include? "code"
+    assert Annotated.tsv([a]).fields.include? "code"
+    assert Annotated.tsv([a], nil).fields.include? "code"
+    assert Annotated.tsv([a], :all).fields.include? "code"
+    assert Annotated.tsv([a], :all).fields.include? "literal"
   end
   def __test_segment_brat

data/test/rbbt/segment/test_range_index.rb CHANGED

@@ -9,8 +9,7 @@ class TestRangeIndex < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
     corpus.add_document(text)

data/test/rbbt/test_segment.rb CHANGED

@@ -17,8 +17,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This is a document"
     Document.setup(text, "TEST", "test_doc1", nil)
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
     corpus.add_document(text)
@@ -41,8 +40,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
     corpus.add_document(text)
@@ -65,8 +63,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
     corpus.add_document(text)
@@ -94,8 +91,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
     corpus.add_document(text)
@@ -142,8 +138,7 @@ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
     corpus.add_document(text)

data/test/test_spaCy.rb ADDED

@@ -0,0 +1,32 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '', 'test_helper.rb')
+require 'rbbt/nlp/spaCy'
+require 'rbbt/document/corpus'
+class TestSpaCy < Test::Unit::TestCase
+  def _test_tokens
+    text = "I tell a story"
+    tokens = SpaCy.tokens(text)
+    assert_equal 4, tokens.length
+    assert_equal "tell", tokens[1].to_s
+  end
+  def test_segments
+    text = "I tell a story. It's a very good story."
+    corpus = Document::Corpus.setup({})
+    Document.setup(text, "TEST", "test_doc1", "simple_sentence")
+    corpus.add_document text
+    text.corpus = corpus
+    segments = SpaCy.segments(text)
+    segments.each do |segment|
+      assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
+    end
+  end
+end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-text
 version: !ruby/object:Gem::Version
-  version: 1.3.1
+  version: 1.3.2
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-04-19 00:00:00.000000000 Z
+date: 2020-05-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -102,12 +102,14 @@ files:
 - lib/rbbt/nlp/genia/sentence_splitter.rb
 - lib/rbbt/nlp/nlp.rb
 - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
+- lib/rbbt/nlp/spaCy.rb
 - lib/rbbt/segment.rb
 - lib/rbbt/segment/annotation.rb
 - lib/rbbt/segment/encoding.rb
 - lib/rbbt/segment/named_entity.rb
 - lib/rbbt/segment/overlaps.rb
 - lib/rbbt/segment/range_index.rb
+- lib/rbbt/segment/relationship.rb
 - lib/rbbt/segment/segmented.rb
 - lib/rbbt/segment/token.rb
 - lib/rbbt/segment/transformed.rb
@@ -161,6 +163,7 @@ files:
 - test/rbbt/test_document.rb
 - test/rbbt/test_segment.rb
 - test/test_helper.rb
+- test/test_spaCy.rb
 homepage: http://github.com/mikisvaz/rbbt-util
 licenses: []
 metadata: {}
@@ -217,4 +220,5 @@ test_files:
 - test/rbbt/segment/test_encoding.rb
 - test/rbbt/segment/test_range_index.rb
 - test/rbbt/segment/test_corpus.rb
+- test/test_spaCy.rb
 - test/test_helper.rb