RubyGems - rbbt-text - Versions diffs - 1.3.8 → 1.3.9 - Mend

rbbt-text 1.3.8 → 1.3.9

Files changed (12) hide show

checksums.yaml +4 -4
data/lib/rbbt/document/corpus.rb +1 -1
data/lib/rbbt/ner/regexpNER.rb +20 -21
data/lib/rbbt/nlp/genia/sentence_splitter.rb +12 -4
data/lib/rbbt/segment/named_entity.rb +2 -1
data/lib/rbbt/segment/transformed.rb +1 -1
data/share/install/software/Geniass +21 -12
data/share/rnorm/tokens_default +3 -0
data/test/rbbt/ner/test_regexpNER.rb +9 -0
data/test/rbbt/nlp/genia/test_sentence_splitter.rb +11 -2
data/test/rbbt/segment/test_transformed.rb +11 -5
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e1b07b2646ecdc51599e2a2356fd18708e88d819944910a07930f67ec3fc012d
-  data.tar.gz: 03bcbe61f41d830668b50fcfc253fa2b43285774040f61fb3fb0a58f80e9dfd3
+  metadata.gz: 92c6b4b9d3452c6b495fc9f291b551a73c8c150faee05053b7ecadc62ccbbd53
+  data.tar.gz: 70e341cf31466628c42b9947c882b64ff592e4703e06c8629ab56d513fe0a975
 SHA512:
-  metadata.gz: ae6de2dd809642ca38276ff82e243efeb193cf432bc78aea92e772ab21ff489f23224b9e93de726dcacdb06910716f1107171433cc39e7b022ba14ee4ed284f6
-  data.tar.gz: 82768060a28248d459031030b6ba49b500b63a9d3ae2199ccdf1417fd3b1f66ce0d962db17875615ee36bb3b5879d8ccbbdec892942f544fa08481b4551a1003
+  metadata.gz: 2e8fdff40dd93072c3c377c59ff02d7374f3f81961dfc0f2596386776408c623543eb2e1f0da0112b3a8384d865c8331c659c650f2f2288a3d6282eca80e804e
+  data.tar.gz: d7b335d138eb48de51af8922d80d01715c4c61c025b525a9f63fcae789de329eedc2557cb10ba369f5987205ef87ffd85fc407fc3a29f6a75ef0b41951e4962b

data/lib/rbbt/document/corpus.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require 'rbbt-util'
 module Document::Corpus
   def self.setup(corpus)
-    corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
+    corpus = Persist.open_tokyocabinet(corpus, false, :single, "BDB") if String === corpus
     corpus.extend Document::Corpus unless Document::Corpus === corpus
     corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
     corpus.close

data/lib/rbbt/ner/regexpNER.rb CHANGED Viewed

@@ -10,34 +10,33 @@ class RegExpNER < NER
     while matchdata = text.match(regexp)
       pre   = matchdata.pre_match
       post  = matchdata.post_match
-      match = matchdata[0]
       if matchdata.captures.any?
-        capture = matchdata.captures.first
-        more_pre, more_post = match.split(/#{capture}/)
-        match = capture
-        pre << more_pre if more_pre
-        post = more_post << post if more_post
-      end
-      if match and not match.empty?
-        NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
+        match = matchdata.captures.first
+        offset, eend = matchdata.offset(1)
+        NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
         matches << match
+        start += offset + match.length
+        text = text[eend..-1]
+      else
+        match = matchdata[0]
+        NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type)
+        matches << match
+        eend = match.length + pre.length
+        text = text[eend..-1]
+        start += match.length + pre.length
       end
-      start += pre.length + match.length
-      text = post
     end
     matches
   end
-  def self.match_regexp_list(text, regexp_list, type = nil)
+  def self.match_regexp_list(text, regexp_list, type = nil, split_on_matches = false)
     matches = []
     regexp_list.each do |regexp|
-      chunks = Segment.split(text, matches)
+      chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
+      chunks = Segment.split(text, [])
       chunks.each do |chunk|
         new_matches = match_regexp(chunk, regexp, type)
         new_matches.each do |match| match.offset += chunk.offset; matches << match end
@@ -47,15 +46,15 @@ class RegExpNER < NER
     matches
   end
-  def self.match_regexp_hash(text, regexp_hash)
+  def self.match_regexp_hash(text, regexp_hash, split_on_matches = false)
     matches = []
     regexp_hash.each do |type, regexp_list|
       regexp_list = [regexp_list] unless Array === regexp_list
-      chunks = Segment.split(text, matches)
+      chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
       chunks.each do |chunk|
         chunk_offset = chunk.offset
-        match_regexp_list(chunk, regexp_list, type).each do |match|
+        match_regexp_list(chunk, regexp_list, type, split_on_matches).each do |match|
           match.offset = match.offset + chunk_offset;
           matches << match
         end
@@ -65,7 +64,7 @@ class RegExpNER < NER
     matches
   end
-  attr_accessor :regexps
+  attr_accessor :regexps, :split_on_matches
   def initialize(regexps = {})
     @regexps = regexps.collect{|p| p }
   end
@@ -87,7 +86,7 @@ class RegExpNER < NER
   end
   def match(text)
-    matches = RegExpNER.match_regexp_hash(text, @regexps)
+    matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
     matches.collect do |m|
       NamedEntity.setup(m, :offset => m.offset, :type =>  m.type, :code => m)
     end

data/lib/rbbt/nlp/genia/sentence_splitter.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 require 'rbbt/nlp/nlp'
 require 'rbbt/segment'
 module NLP
   Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
   def self.returnFeatures(prevWord, delimiter, nextWord)
@@ -136,6 +137,7 @@ module NLP
   end
   def self.process_labels(marked_text, labels)
+    return "" if marked_text.empty? || labels.empty?
     out = ""
     count = 0
@@ -171,8 +173,17 @@ module NLP
   end
   def self.geniass_sentence_splitter_extension(text)
+    cleaned = text.gsub("\n",NEW_LINE_MASK)
+    events, marks = event_extraction(cleaned)
     Rbbt.software.opt.Geniass.produce
-    require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
+    begin
+      ENV["LD_LIBRARY_PATH"] = Rbbt.software.opt.Geniass.lib.find + ":" + ENV["LD_LIBRARY_PATH"]
+      require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
+    rescue LoadError
+      raise LoadError, "Geniass ruby module needs to be able to find #{Rbbt.software.opt.Geniass.lib.find} in LD_LIBRARY_PATH"
+    end
     geniass = Geniass.new
     if not geniass.geniass_is_loaded
       Misc.in_dir Rbbt.software.opt.Geniass.find do
@@ -180,9 +191,6 @@ module NLP
       end
     end
-    cleaned = text.gsub("\n",NEW_LINE_MASK)
-    events, marks = event_extraction(cleaned)
     labels = events.split(/\n/).collect{|line|
       geniass.label(line)
     }

data/lib/rbbt/segment/named_entity.rb CHANGED Viewed

@@ -23,13 +23,14 @@ Score: #{score.inspect}
   end
   def html
-    title = code.nil? ? entity_type : [entity_type, code].compact * ":"
+    title = code.nil? ? entity_type : [entity_type, code].compact * " - "
     text = <<-EOF
 <span class='Entity'\
 #{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
 #{code.nil?  ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
 #{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
+#{segid.nil? ? "" : " attr-segid='#{segid}'"}\
 #{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
 >#{ self }</span>
     EOF

data/lib/rbbt/segment/transformed.rb CHANGED Viewed

@@ -17,7 +17,7 @@ module Transformed
     segments = yield text
-    segments = nil unless Array === segments && Segment === segments.first
+    segments = [] unless Array === segments && Segment === segments.first
     text.restore(segments, true)
   end

data/share/install/software/Geniass CHANGED Viewed

@@ -27,11 +27,11 @@ mv /tmp/clean.Makefile Makefile
 make geniass
 make libgeniass.so
+move_opt "$name" "$extra"
 mkdir lib
 mv libgeniass.so lib
-build "$name" "$extra"
 cd "$OPT_BUILD_DIR/$name"
 mkdir ruby
@@ -43,8 +43,7 @@ create_makefile('Geniass')
 EOF
 cat > ruby/Geniass.cpp <<'EOF'
-#include "rice/Class.hpp"
-#include "rice/String.hpp"
+#include "rice/rice.hpp"
 #include <iostream>
 #include <iomanip>
@@ -59,10 +58,12 @@ cat > ruby/Geniass.cpp <<'EOF'
 using namespace Rice;
 using namespace std;
-ME_Model model;
 bool geniass_loaded = false;
-void load_geniass(){
+bool geniass_is_loaded(Object self){ return(geniass_loaded); };
+ME_Model model;
+void load_geniass(Object self){
     printf("loading model");
     string modelFile = "model1-1.0";
     model.load_from_file(modelFile.c_str());
@@ -70,8 +71,6 @@ void load_geniass(){
     printf("..done\n");
 }
-bool geniass_is_loaded(){ return(geniass_loaded); };
 void split(string& str, vector<string>& tokens)
 {
     istringstream in(str);
@@ -85,17 +84,23 @@ void split(string& str, vector<string>& tokens)
     }
 }
-string label(string line){
+Object label(Object self, String rb_line){
     vector<string> tokens;
-    split(line, tokens);
     ME_Sample s;
+    string line = rb_line.c_str();
+    split(line, tokens);
     for(vector<string>::const_iterator token = tokens.begin() + 1;
         token != tokens.end(); ++token){
         s.add_feature(*token);
     }
     (void) model.classify(s);
-    return(s.label);
+    string label = s.label;
+    VALUE x;
+    x = rb_str_new_cstr(label.c_str());
+    return(x);
 }
 extern "C"
@@ -103,16 +108,20 @@ void Init_Geniass()
 {
   Class rb_cGeniass =
     define_class("Geniass")
+    .define_method("geniass_is_loaded", &geniass_is_loaded)
     .define_method("load_geniass", &load_geniass)
     .define_method("label", &label)
-    .define_method("geniass_is_loaded", &geniass_is_loaded);
+    ;
 }
 EOF
 cd ruby
 ruby extconf.rb --with-geniass-dir="$OPT_DIR/$name"
 make
+setup "$name" "$extra"

data/share/rnorm/tokens_default CHANGED Viewed

@@ -6,6 +6,7 @@ tokens do
   # Some (possible) single letters first
   receptor     /^(?:receptor|r)s?$/i
+  activator      /^(?:activator|p)s?$/i
   protein      /^(?:protein|p)s?$/i
   roman        /^[IV]+$/
   greek_letter do |w| $inverse_greek[w.downcase] != nil end
@@ -58,6 +59,8 @@ comparisons do
   diff.promoter   -10
   diff.receptor   -10
+  diff.activator   -10
   diff.similar    -10
   diff.capital    -10

data/test/rbbt/ner/test_regexpNER.rb CHANGED Viewed

@@ -79,6 +79,15 @@ class TestRegExpNER < Test::Unit::TestCase
     assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
   end
+  def test_entities_captures_repeat
+    sentence = "In a sentence I should find not this but this"
+    ner = RegExpNER.new({:this => /not this but (this)/})
+    matches = ner.entities(sentence)
+    assert sentence[0..matches.first.offset-1].include?('this')
+  end
   def test_regexp_order
     text =<<-EOF

data/test/rbbt/nlp/genia/test_sentence_splitter.rb CHANGED Viewed

@@ -12,7 +12,6 @@ sentence. This is
 another broken sentence.
     EOF
-    iii NLP.geniass_sentence_splitter(text)
     assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
   end
@@ -37,7 +36,17 @@ sentence. This is
 another broken sentence.
     EOF
-    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
+    Log.with_severity 0 do
+      assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
+    end
+  end
+  def test_sentence_cmi
+    text =<<-EOF
+The COVID-19 infection was reported as the main cause of death and patients with a higher mortality risk were those aged ≥65 years [adjusted HR = 3.40 (95% CI 2.20-5.24)], with a higher disease severity [adjusted HR = 1.87 (95%CI 1.43-2.45)].
+    EOF
+    iii NLP.geniass_sentence_splitter(text)
   end
 end

data/test/rbbt/segment/test_transformed.rb CHANGED Viewed

@@ -73,6 +73,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
       assert_equal original.gsub(/TP53/, 'GN'), a
     end
+    Transformed.with_transform(a, [gene2], "GN") do
+      Transformed.with_transform(a, [gene1], "GN") do
+        assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
+      end
+      assert_equal original.gsub(/CDK5R1/, 'GN'), a
+    end
     Transformed.with_transform(a, [gene1], "GN") do
       Transformed.with_transform(a, [gene2], "LONG_GENE_PLACEHOLDER") do
         assert_equal original.gsub(/TP53/, 'GN').sub('CDK5R1', "LONG_GENE_PLACEHOLDER"), a
@@ -144,7 +151,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     gene2.entity_type = "Protein"
     Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':45..50' title='Protein'>CDK5R1</span> protein", a
     end
   end
@@ -165,7 +172,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     gene2.entity_type = "Protein"
     Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':37..40' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':55..60' title='Protein'>CDK5R1</span> protein", a
     end
   end
@@ -185,9 +192,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
     Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
       Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
-        assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
+        assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' attr-segid=':27..121' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
       end
     end
   end
@@ -415,6 +422,5 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
     end
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-text
 version: !ruby/object:Gem::Version
-  version: 1.3.8
+  version: 1.3.9
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-11-08 00:00:00.000000000 Z
+date: 2023-01-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util