RubyGems - rbbt-text - Versions diffs - 1.3.8 → 1.3.10 - Mend

rbbt-text 1.3.8 → 1.3.10

Files changed (21) hide show

checksums.yaml +4 -4
data/lib/rbbt/document/annotation.rb +67 -2
data/lib/rbbt/document/corpus/pubmed.rb +6 -4
data/lib/rbbt/document/corpus.rb +1 -1
data/lib/rbbt/document.rb +4 -0
data/lib/rbbt/ner/g_norm_plus.rb +2 -1
data/lib/rbbt/ner/regexpNER.rb +30 -23
data/lib/rbbt/nlp/genia/sentence_splitter.rb +12 -4
data/lib/rbbt/segment/annotation.rb +1 -0
data/lib/rbbt/segment/named_entity.rb +2 -1
data/lib/rbbt/segment/overlaps.rb +9 -1
data/lib/rbbt/segment/transformed.rb +1 -1
data/lib/rbbt/segment.rb +4 -0
data/share/install/software/Geniass +21 -12
data/share/rnorm/tokens_default +3 -0
data/test/rbbt/document/test_annotation.rb +21 -0
data/test/rbbt/ner/test_g_norm_plus.rb +2 -0
data/test/rbbt/ner/test_regexpNER.rb +17 -0
data/test/rbbt/nlp/genia/test_sentence_splitter.rb +11 -2
data/test/rbbt/segment/test_transformed.rb +11 -5
metadata +27 -27

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e1b07b2646ecdc51599e2a2356fd18708e88d819944910a07930f67ec3fc012d
-  data.tar.gz: 03bcbe61f41d830668b50fcfc253fa2b43285774040f61fb3fb0a58f80e9dfd3
+  metadata.gz: 2f10312d9b6598ddc9b6fa98b38909afdd575b33a497ae1ff3f17c7a9c6e37bf
+  data.tar.gz: f79c61c7e34dd113a2c5002342c0c2df92a4a28c770394bf2c456a34a2730cc7
 SHA512:
-  metadata.gz: ae6de2dd809642ca38276ff82e243efeb193cf432bc78aea92e772ab21ff489f23224b9e93de726dcacdb06910716f1107171433cc39e7b022ba14ee4ed284f6
-  data.tar.gz: 82768060a28248d459031030b6ba49b500b63a9d3ae2199ccdf1417fd3b1f66ce0d962db17875615ee36bb3b5879d8ccbbdec892942f544fa08481b4551a1003
+  metadata.gz: a9fb4dc49c538a58a8aa04e81947df212668c5ef9097434fa7d3eff54dd17a8657f581451b64e6b247cb64428436823a305dd64ae6a5fed2126b92285c02ad81
+  data.tar.gz: 0d31423660cd232102aa2b9914dab61ff929cf02a37b5094bd58481cac733c167d0e4fcdb4b3025e41a4775bd8033566ed3f402f66c317b3955406d1a3d3eb6f

data/lib/rbbt/document/annotation.rb CHANGED Viewed

@@ -6,7 +6,9 @@ module Document
     send :property, type do
       segments = self.instance_exec &block
-      Segment.align(self, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
+      Segment.align(self, segments) unless segments.empty? ||
+          (Segment === segments && segments.offset) ||
+          (Array === segments && Segment === segments.first && segments.first.offset)
       segments.each do |segment|
         SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
@@ -17,6 +19,36 @@ module Document
       segments
     end
+    DocID.property type do
+      self.document.send(type)
+    end
+    SegID.property type do
+      self.overlaps(self.docid.send(type))
+    end
+    Segment.property type do
+      self.overlaps(self.docid.send(type))
+    end
+    seg_type = "segids_for_" + type.to_s
+    send :property, seg_type do
+      SegID.setup(self.send(type).collect{|s| s.segid })
+    end
+    DocID.property seg_type do
+      self.document.send(seg_type)
+    end
+    SegID.property seg_type do
+      self.overlaps(self.docid.send(seg_type))
+    end
+    Segment.property seg_type do
+      self.overlaps(self.docid.send(seg_type))
+    end
   end
   def self.define_multiple(type, &block)
@@ -28,7 +60,10 @@ module Document
       doc_segments.each_with_index do |segments,i|
         next if segments.nil?
         document = list[i]
-        Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
+        Segment.align(document, segments) unless segments.nil? ||
+          segments.empty? ||
+          (Segment === segments && segments.offset) ||
+          (Array === segments && Segment === segments.first && segments.first.offset)
         segments.each do |segment|
           SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
@@ -41,5 +76,35 @@ module Document
         segments
       end
     end
+    DocID.property type do
+      self.document.send(type)
+    end
+    SegID.property type do
+      self.overlaps(self.docid.send(type))
+    end
+    Segment.property type do
+      self.overlaps(self.docid.send(type))
+    end
+    seg_type = "segids_for_" + type.to_s
+    send :property, seg_type do
+      SegID.setup(self.send(type).collect{|s| s.segid })
+    end
+    DocID.property seg_type do
+      self.document.send(seg_type)
+    end
+    SegID.property seg_type do
+      self.overlaps(self.docid.send(seg_type))
+    end
+    Segment.property seg_type do
+      self.overlaps(self.docid.send(seg_type))
+    end
   end
 end

data/lib/rbbt/document/corpus/pubmed.rb CHANGED Viewed

@@ -2,8 +2,8 @@ require 'rbbt/sources/pubmed'
 module Document::Corpus
   PUBMED_NAMESPACE="PMID"
-  def add_pmid(pmid, type = nil, update = false)
-    type = :abstract if type.nil?
+  def add_pmid(pmid, type = :title_and_abstract, update = false)
+    type = :title_and_abstract if type.nil?
     if ! (update || Array === pmid)
       id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
@@ -16,9 +16,11 @@ module Document::Corpus
     res = PubMed.get_article(pmids).collect do |pmid, article|
       document = if type.to_sym == :abstract
-                   Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, :abstract, self, :corpus => self)
+                   Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, type.to_sym , self, :corpus => self)
                  elsif type.to_sym == :title
-                   Document.setup(article.title, PUBMED_NAMESPACE, pmid, :title, self)
+                   Document.setup(article.title || "", PUBMED_NAMESPACE, pmid, type.to_sym, self)
+                 elsif type.to_sym == :title_and_abstract
+                   Document.setup((article.title || "") + "\n\n" + (article.abstract || ""), PUBMED_NAMESPACE, pmid, type.to_sym, self)
                  else
                    raise "No FullText available for #{ pmid }" if article.full_text.nil?
                    Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)

data/lib/rbbt/document/corpus.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require 'rbbt-util'
 module Document::Corpus
   def self.setup(corpus)
-    corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
+    corpus = Persist.open_tokyocabinet(corpus, false, :single, "BDB") if String === corpus
     corpus.extend Document::Corpus unless Document::Corpus === corpus
     corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
     corpus.close

data/lib/rbbt/document.rb CHANGED Viewed

@@ -9,6 +9,10 @@ module DocID
     attr_accessor :default_corpus
   end
+  def id
+    self
+  end
   def corpus
     annotation_values[:corpus] || DocID.default_corpus
   end

data/lib/rbbt/ner/g_norm_plus.rb CHANGED Viewed

@@ -66,7 +66,8 @@ EOF
         end
         Open.write('config', CONFIG)
-        CMD.cmd_log("java -Xmx20G -Xms20G  -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
+        mem = Rbbt::Config.get(:java_mem, :GNormPlus, :g_norm_plus, :gnormplus, :gnp, :default => "2G")
+        CMD.cmd_log("java -Xmx#{mem} -Xms#{mem}  -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
         if texts.respond_to? :key_field
           key_field = texts.key_field

data/lib/rbbt/ner/regexpNER.rb CHANGED Viewed

@@ -10,34 +10,41 @@ class RegExpNER < NER
     while matchdata = text.match(regexp)
       pre   = matchdata.pre_match
       post  = matchdata.post_match
-      match = matchdata[0]
-      if matchdata.captures.any?
-        capture = matchdata.captures.first
-        more_pre, more_post = match.split(/#{capture}/)
-        match = capture
-        pre << more_pre if more_pre
-        post = more_post << post if more_post
-      end
-      if match and not match.empty?
-        NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
+      if matchdata.named_captures.any?
+        match = matchdata[0]
+        code = matchdata.named_captures.collect{|k,v| [k,v] * "=" } * ";"
+        NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type, :code => code)
         matches << match
+        eend = match.length + pre.length
+        text = text[eend..-1]
+        start += match.length + pre.length
+      elsif matchdata.captures.any?
+        match = matchdata.captures.first
+        offset, eend = matchdata.offset(1)
+        NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
+        matches << match
+        start += offset + match.length
+        text = text[eend..-1]
+      else
+        match = matchdata[0]
+        NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type)
+        matches << match
+        eend = match.length + pre.length
+        text = text[eend..-1]
+        start += match.length + pre.length
       end
-      start += pre.length + match.length
-      text = post
     end
     matches
   end
-  def self.match_regexp_list(text, regexp_list, type = nil)
+  def self.match_regexp_list(text, regexp_list, type = nil, split_on_matches = false)
     matches = []
     regexp_list.each do |regexp|
-      chunks = Segment.split(text, matches)
+      chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
+      chunks = Segment.split(text, [])
       chunks.each do |chunk|
         new_matches = match_regexp(chunk, regexp, type)
         new_matches.each do |match| match.offset += chunk.offset; matches << match end
@@ -47,15 +54,15 @@ class RegExpNER < NER
     matches
   end
-  def self.match_regexp_hash(text, regexp_hash)
+  def self.match_regexp_hash(text, regexp_hash, split_on_matches = false)
     matches = []
     regexp_hash.each do |type, regexp_list|
       regexp_list = [regexp_list] unless Array === regexp_list
-      chunks = Segment.split(text, matches)
+      chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
       chunks.each do |chunk|
         chunk_offset = chunk.offset
-        match_regexp_list(chunk, regexp_list, type).each do |match|
+        match_regexp_list(chunk, regexp_list, type, split_on_matches).each do |match|
           match.offset = match.offset + chunk_offset;
           matches << match
         end
@@ -65,7 +72,7 @@ class RegExpNER < NER
     matches
   end
-  attr_accessor :regexps
+  attr_accessor :regexps, :split_on_matches
   def initialize(regexps = {})
     @regexps = regexps.collect{|p| p }
   end
@@ -87,9 +94,9 @@ class RegExpNER < NER
   end
   def match(text)
-    matches = RegExpNER.match_regexp_hash(text, @regexps)
+    matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
     matches.collect do |m|
-      NamedEntity.setup(m, :offset => m.offset, :type =>  m.type, :code => m)
+      NamedEntity.setup(m, :offset => m.offset, :type =>  m.type, :code => m.code || m)
     end
   end

data/lib/rbbt/nlp/genia/sentence_splitter.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 require 'rbbt/nlp/nlp'
 require 'rbbt/segment'
 module NLP
   Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
   def self.returnFeatures(prevWord, delimiter, nextWord)
@@ -136,6 +137,7 @@ module NLP
   end
   def self.process_labels(marked_text, labels)
+    return "" if marked_text.empty? || labels.empty?
     out = ""
     count = 0
@@ -171,8 +173,17 @@ module NLP
   end
   def self.geniass_sentence_splitter_extension(text)
+    cleaned = text.gsub("\n",NEW_LINE_MASK)
+    events, marks = event_extraction(cleaned)
     Rbbt.software.opt.Geniass.produce
-    require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
+    begin
+      ENV["LD_LIBRARY_PATH"] = Rbbt.software.opt.Geniass.lib.find + ":" + ENV["LD_LIBRARY_PATH"]
+      require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
+    rescue LoadError
+      raise LoadError, "Geniass ruby module needs to be able to find #{Rbbt.software.opt.Geniass.lib.find} in LD_LIBRARY_PATH"
+    end
     geniass = Geniass.new
     if not geniass.geniass_is_loaded
       Misc.in_dir Rbbt.software.opt.Geniass.find do
@@ -180,9 +191,6 @@ module NLP
       end
     end
-    cleaned = text.gsub("\n",NEW_LINE_MASK)
-    events, marks = event_extraction(cleaned)
     labels = events.split(/\n/).collect{|line|
       geniass.label(line)
     }

data/lib/rbbt/segment/annotation.rb CHANGED Viewed

@@ -4,6 +4,7 @@ require 'rbbt/entity'
 module AnnotID
   extend Entity
+  include SegID
   self.annotation :corpus
   def _parts

data/lib/rbbt/segment/named_entity.rb CHANGED Viewed

@@ -23,13 +23,14 @@ Score: #{score.inspect}
   end
   def html
-    title = code.nil? ? entity_type : [entity_type, code].compact * ":"
+    title = code.nil? ? entity_type : [entity_type, code].compact * " - "
     text = <<-EOF
 <span class='Entity'\
 #{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
 #{code.nil?  ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
 #{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
+#{segid.nil? ? "" : " attr-segid='#{segid}'"}\
 #{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
 >#{ self }</span>
     EOF

data/lib/rbbt/segment/overlaps.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-module Segment
+module SegmentRanges
   def pull(offset)
     if self.offset.nil? or offset.nil?
       self.offset = nil
@@ -61,3 +61,11 @@ module Segment
     end
   end
 end
+module Segment
+  include SegmentRanges
+end
+module SegID
+  include SegmentRanges
+end

data/lib/rbbt/segment/transformed.rb CHANGED Viewed

@@ -17,7 +17,7 @@ module Transformed
     segments = yield text
-    segments = nil unless Array === segments && Segment === segments.first
+    segments = [] unless Array === segments && Segment === segments.first
     text.restore(segments, true)
   end

data/lib/rbbt/segment.rb CHANGED Viewed

@@ -22,6 +22,10 @@ module SegID
     range.begin
   end
+  def eend
+    offset.to_i + length - 1
+  end
   def segment_length
     range.end - range.begin + 1
   end

data/share/install/software/Geniass CHANGED Viewed

@@ -27,11 +27,11 @@ mv /tmp/clean.Makefile Makefile
 make geniass
 make libgeniass.so
+move_opt "$name" "$extra"
 mkdir lib
 mv libgeniass.so lib
-build "$name" "$extra"
 cd "$OPT_BUILD_DIR/$name"
 mkdir ruby
@@ -43,8 +43,7 @@ create_makefile('Geniass')
 EOF
 cat > ruby/Geniass.cpp <<'EOF'
-#include "rice/Class.hpp"
-#include "rice/String.hpp"
+#include "rice/rice.hpp"
 #include <iostream>
 #include <iomanip>
@@ -59,10 +58,12 @@ cat > ruby/Geniass.cpp <<'EOF'
 using namespace Rice;
 using namespace std;
-ME_Model model;
 bool geniass_loaded = false;
-void load_geniass(){
+bool geniass_is_loaded(Object self){ return(geniass_loaded); };
+ME_Model model;
+void load_geniass(Object self){
     printf("loading model");
     string modelFile = "model1-1.0";
     model.load_from_file(modelFile.c_str());
@@ -70,8 +71,6 @@ void load_geniass(){
     printf("..done\n");
 }
-bool geniass_is_loaded(){ return(geniass_loaded); };
 void split(string& str, vector<string>& tokens)
 {
     istringstream in(str);
@@ -85,17 +84,23 @@ void split(string& str, vector<string>& tokens)
     }
 }
-string label(string line){
+Object label(Object self, String rb_line){
     vector<string> tokens;
-    split(line, tokens);
     ME_Sample s;
+    string line = rb_line.c_str();
+    split(line, tokens);
     for(vector<string>::const_iterator token = tokens.begin() + 1;
         token != tokens.end(); ++token){
         s.add_feature(*token);
     }
     (void) model.classify(s);
-    return(s.label);
+    string label = s.label;
+    VALUE x;
+    x = rb_str_new_cstr(label.c_str());
+    return(x);
 }
 extern "C"
@@ -103,16 +108,20 @@ void Init_Geniass()
 {
   Class rb_cGeniass =
     define_class("Geniass")
+    .define_method("geniass_is_loaded", &geniass_is_loaded)
     .define_method("load_geniass", &load_geniass)
     .define_method("label", &label)
-    .define_method("geniass_is_loaded", &geniass_is_loaded);
+    ;
 }
 EOF
 cd ruby
 ruby extconf.rb --with-geniass-dir="$OPT_DIR/$name"
 make
+setup "$name" "$extra"

data/share/rnorm/tokens_default CHANGED Viewed

@@ -6,6 +6,7 @@ tokens do
   # Some (possible) single letters first
   receptor     /^(?:receptor|r)s?$/i
+  activator      /^(?:activator|p)s?$/i
   protein      /^(?:protein|p)s?$/i
   roman        /^[IV]+$/
   greek_letter do |w| $inverse_greek[w.downcase] != nil end
@@ -58,6 +59,8 @@ comparisons do
   diff.promoter   -10
   diff.receptor   -10
+  diff.activator   -10
   diff.similar    -10
   diff.capital    -10

data/test/rbbt/document/test_annotation.rb CHANGED Viewed

@@ -13,6 +13,10 @@ class TestAnnotation < Test::Unit::TestCase
       self.split(" ")
     end
+    Document.define :lines do
+      self.split("\n")
+    end
     $called_once = false
     Document.define :persisted_words do
       raise CalledOnce if $called_once
@@ -145,5 +149,22 @@ class TestAnnotation < Test::Unit::TestCase
     assert  text.ner.first.segid.include?("TEST:")
   end
+  def test_sentence_words
+    text =<<-EOF
+This is sentence 1
+This is sentence 2
+    EOF
+    Document.setup(text)
+    words = text.words
+    numbers = words.select{|w| w =~ /\d/}
+    text.lines.each do |sentence|
+      Transformed.with_transform(sentence, numbers, "[NUM]") do
+        puts sentence
+      end
+    end
+  end
 end

data/test/rbbt/ner/test_g_norm_plus.rb CHANGED Viewed

@@ -12,6 +12,7 @@ We found that TP53 is regulated by MDM2 in Homo
 sapiens
     EOF
+    Rbbt::Config.add_entry :java_mem, "2G", :gnp
     mentions = GNormPlus.process({:file => text})
     assert_equal 1, mentions.length
@@ -23,6 +24,7 @@ sapiens
 We found that TP53 is regulated by MDM2 in Homo sapiens
     EOF
+    Rbbt::Config.add_entry :java_mem, "2G", :gnp
     mentions = GNormPlus.entities({:file => text})
     assert mentions["file"].include?("TP53")
     mentions["file"].each do |mention|

data/test/rbbt/ner/test_regexpNER.rb CHANGED Viewed

@@ -79,6 +79,23 @@ class TestRegExpNER < Test::Unit::TestCase
     assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
   end
+  def test_entities_captures_repeat
+    sentence = "In a sentence I should find not this but this"
+    ner = RegExpNER.new({:this => /not this but (this)/})
+    matches = ner.entities(sentence)
+    assert sentence[0..matches.first.offset-1].include?('this')
+  end
+  def test_entities_named_captures
+    sentence = "In a sentence I should find not this but this"
+    ner = RegExpNER.new({:this => /(?<who>I) should find not this but (this)/})
+    matches = ner.entities(sentence)
+  end
   def test_regexp_order
     text =<<-EOF

data/test/rbbt/nlp/genia/test_sentence_splitter.rb CHANGED Viewed

@@ -12,7 +12,6 @@ sentence. This is
 another broken sentence.
     EOF
-    iii NLP.geniass_sentence_splitter(text)
     assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
   end
@@ -37,7 +36,17 @@ sentence. This is
 another broken sentence.
     EOF
-    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
+    Log.with_severity 0 do
+      assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
+    end
+  end
+  def test_sentence_cmi
+    text =<<-EOF
+The COVID-19 infection was reported as the main cause of death and patients with a higher mortality risk were those aged ≥65 years [adjusted HR = 3.40 (95% CI 2.20-5.24)], with a higher disease severity [adjusted HR = 1.87 (95%CI 1.43-2.45)].
+    EOF
+    iii NLP.geniass_sentence_splitter(text)
   end
 end

data/test/rbbt/segment/test_transformed.rb CHANGED Viewed

@@ -73,6 +73,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
       assert_equal original.gsub(/TP53/, 'GN'), a
     end
+    Transformed.with_transform(a, [gene2], "GN") do
+      Transformed.with_transform(a, [gene1], "GN") do
+        assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
+      end
+      assert_equal original.gsub(/CDK5R1/, 'GN'), a
+    end
     Transformed.with_transform(a, [gene1], "GN") do
       Transformed.with_transform(a, [gene2], "LONG_GENE_PLACEHOLDER") do
         assert_equal original.gsub(/TP53/, 'GN').sub('CDK5R1', "LONG_GENE_PLACEHOLDER"), a
@@ -144,7 +151,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     gene2.entity_type = "Protein"
     Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':45..50' title='Protein'>CDK5R1</span> protein", a
     end
   end
@@ -165,7 +172,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     gene2.entity_type = "Protein"
     Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':37..40' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':55..60' title='Protein'>CDK5R1</span> protein", a
     end
   end
@@ -185,9 +192,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
     Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
       Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
-        assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
+        assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' attr-segid=':27..121' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
       end
     end
   end
@@ -415,6 +422,5 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
     end
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-text
 version: !ruby/object:Gem::Version
-  version: 1.3.8
+  version: 1.3.10
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-11-08 00:00:00.000000000 Z
+date: 2023-02-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -175,45 +175,45 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.1.4
+rubygems_version: 3.1.2
 signing_key:
 specification_version: 4
 summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
 test_files:
-- test/rbbt/nlp/test_nlp.rb
-- test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
-- test/rbbt/nlp/genia/test_sentence_splitter.rb
+- test/test_spaCy.rb
+- test/test_helper.rb
+- test/rbbt/bow/test_dictionary.rb
 - test/rbbt/bow/test_bow.rb
 - test/rbbt/bow/test_misc.rb
-- test/rbbt/bow/test_dictionary.rb
-- test/rbbt/test_document.rb
-- test/rbbt/document/test_annotation.rb
+- test/rbbt/segment/test_encoding.rb
+- test/rbbt/segment/test_transformed.rb
+- test/rbbt/segment/test_overlaps.rb
+- test/rbbt/segment/test_named_entity.rb
+- test/rbbt/segment/test_corpus.rb
+- test/rbbt/segment/test_range_index.rb
+- test/rbbt/segment/test_annotation.rb
+- test/rbbt/entity/test_document.rb
 - test/rbbt/document/corpus/test_pubmed.rb
 - test/rbbt/document/test_corpus.rb
-- test/rbbt/entity/test_document.rb
+- test/rbbt/document/test_annotation.rb
+- test/rbbt/test_document.rb
 - test/rbbt/ner/test_patterns.rb
-- test/rbbt/ner/test_NER.rb
-- test/rbbt/ner/test_abner.rb
 - test/rbbt/ner/rnorm/test_tokens.rb
-- test/rbbt/ner/test_rnorm.rb
-- test/rbbt/ner/test_regexpNER.rb
 - test/rbbt/ner/test_ngram_prefix_dictionary.rb
+- test/rbbt/ner/test_token_trieNER.rb
+- test/rbbt/ner/test_finder.rb
 - test/rbbt/ner/test_brat.rb
+- test/rbbt/ner/test_regexpNER.rb
 - test/rbbt/ner/test_g_norm_plus.rb
+- test/rbbt/ner/test_rnorm.rb
+- test/rbbt/ner/test_linnaeus.rb
 - test/rbbt/ner/test_chemical_tagger.rb
-- test/rbbt/ner/test_banner.rb
-- test/rbbt/ner/test_token_trieNER.rb
-- test/rbbt/ner/test_finder.rb
+- test/rbbt/ner/test_NER.rb
+- test/rbbt/ner/test_abner.rb
 - test/rbbt/ner/test_rner.rb
-- test/rbbt/ner/test_linnaeus.rb
 - test/rbbt/ner/test_oscar4.rb
+- test/rbbt/ner/test_banner.rb
 - test/rbbt/test_segment.rb
-- test/rbbt/segment/test_transformed.rb
-- test/rbbt/segment/test_overlaps.rb
-- test/rbbt/segment/test_annotation.rb
-- test/rbbt/segment/test_named_entity.rb
-- test/rbbt/segment/test_encoding.rb
-- test/rbbt/segment/test_range_index.rb
-- test/rbbt/segment/test_corpus.rb
-- test/test_spaCy.rb
-- test/test_helper.rb
+- test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
+- test/rbbt/nlp/test_nlp.rb
+- test/rbbt/nlp/genia/test_sentence_splitter.rb