rbbt-text 1.3.8 → 1.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e1b07b2646ecdc51599e2a2356fd18708e88d819944910a07930f67ec3fc012d
4
- data.tar.gz: 03bcbe61f41d830668b50fcfc253fa2b43285774040f61fb3fb0a58f80e9dfd3
3
+ metadata.gz: 92c6b4b9d3452c6b495fc9f291b551a73c8c150faee05053b7ecadc62ccbbd53
4
+ data.tar.gz: 70e341cf31466628c42b9947c882b64ff592e4703e06c8629ab56d513fe0a975
5
5
  SHA512:
6
- metadata.gz: ae6de2dd809642ca38276ff82e243efeb193cf432bc78aea92e772ab21ff489f23224b9e93de726dcacdb06910716f1107171433cc39e7b022ba14ee4ed284f6
7
- data.tar.gz: 82768060a28248d459031030b6ba49b500b63a9d3ae2199ccdf1417fd3b1f66ce0d962db17875615ee36bb3b5879d8ccbbdec892942f544fa08481b4551a1003
6
+ metadata.gz: 2e8fdff40dd93072c3c377c59ff02d7374f3f81961dfc0f2596386776408c623543eb2e1f0da0112b3a8384d865c8331c659c650f2f2288a3d6282eca80e804e
7
+ data.tar.gz: d7b335d138eb48de51af8922d80d01715c4c61c025b525a9f63fcae789de329eedc2557cb10ba369f5987205ef87ffd85fc407fc3a29f6a75ef0b41951e4962b
@@ -3,7 +3,7 @@ require 'rbbt-util'
3
3
  module Document::Corpus
4
4
 
5
5
  def self.setup(corpus)
6
- corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
6
+ corpus = Persist.open_tokyocabinet(corpus, false, :single, "BDB") if String === corpus
7
7
  corpus.extend Document::Corpus unless Document::Corpus === corpus
8
8
  corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
9
9
  corpus.close
@@ -10,34 +10,33 @@ class RegExpNER < NER
10
10
  while matchdata = text.match(regexp)
11
11
  pre = matchdata.pre_match
12
12
  post = matchdata.post_match
13
- match = matchdata[0]
14
13
 
15
14
  if matchdata.captures.any?
16
- capture = matchdata.captures.first
17
- more_pre, more_post = match.split(/#{capture}/)
18
-
19
- match = capture
20
- pre << more_pre if more_pre
21
- post = more_post << post if more_post
22
- end
23
-
24
- if match and not match.empty?
25
- NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
15
+ match = matchdata.captures.first
16
+ offset, eend = matchdata.offset(1)
17
+ NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
26
18
  matches << match
19
+ start += offset + match.length
20
+ text = text[eend..-1]
21
+ else
22
+ match = matchdata[0]
23
+ NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type)
24
+ matches << match
25
+ eend = match.length + pre.length
26
+ text = text[eend..-1]
27
+ start += match.length + pre.length
27
28
  end
28
-
29
- start += pre.length + match.length
30
- text = post
31
29
  end
32
30
 
33
31
  matches
34
32
  end
35
33
 
36
- def self.match_regexp_list(text, regexp_list, type = nil)
34
+ def self.match_regexp_list(text, regexp_list, type = nil, split_on_matches = false)
37
35
  matches = []
38
36
 
39
37
  regexp_list.each do |regexp|
40
- chunks = Segment.split(text, matches)
38
+ chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
39
+ chunks = Segment.split(text, [])
41
40
  chunks.each do |chunk|
42
41
  new_matches = match_regexp(chunk, regexp, type)
43
42
  new_matches.each do |match| match.offset += chunk.offset; matches << match end
@@ -47,15 +46,15 @@ class RegExpNER < NER
47
46
  matches
48
47
  end
49
48
 
50
- def self.match_regexp_hash(text, regexp_hash)
49
+ def self.match_regexp_hash(text, regexp_hash, split_on_matches = false)
51
50
  matches = []
52
51
 
53
52
  regexp_hash.each do |type, regexp_list|
54
53
  regexp_list = [regexp_list] unless Array === regexp_list
55
- chunks = Segment.split(text, matches)
54
+ chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
56
55
  chunks.each do |chunk|
57
56
  chunk_offset = chunk.offset
58
- match_regexp_list(chunk, regexp_list, type).each do |match|
57
+ match_regexp_list(chunk, regexp_list, type, split_on_matches).each do |match|
59
58
  match.offset = match.offset + chunk_offset;
60
59
  matches << match
61
60
  end
@@ -65,7 +64,7 @@ class RegExpNER < NER
65
64
  matches
66
65
  end
67
66
 
68
- attr_accessor :regexps
67
+ attr_accessor :regexps, :split_on_matches
69
68
  def initialize(regexps = {})
70
69
  @regexps = regexps.collect{|p| p }
71
70
  end
@@ -87,7 +86,7 @@ class RegExpNER < NER
87
86
  end
88
87
 
89
88
  def match(text)
90
- matches = RegExpNER.match_regexp_hash(text, @regexps)
89
+ matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
91
90
  matches.collect do |m|
92
91
  NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
93
92
  end
@@ -1,6 +1,7 @@
1
1
  require 'rbbt/nlp/nlp'
2
2
  require 'rbbt/segment'
3
3
  module NLP
4
+
4
5
  Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
5
6
 
6
7
  def self.returnFeatures(prevWord, delimiter, nextWord)
@@ -136,6 +137,7 @@ module NLP
136
137
  end
137
138
 
138
139
  def self.process_labels(marked_text, labels)
140
+ return "" if marked_text.empty? || labels.empty?
139
141
  out = ""
140
142
 
141
143
  count = 0
@@ -171,8 +173,17 @@ module NLP
171
173
  end
172
174
 
173
175
  def self.geniass_sentence_splitter_extension(text)
176
+ cleaned = text.gsub("\n",NEW_LINE_MASK)
177
+ events, marks = event_extraction(cleaned)
178
+
174
179
  Rbbt.software.opt.Geniass.produce
175
- require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
180
+ begin
181
+ ENV["LD_LIBRARY_PATH"] = Rbbt.software.opt.Geniass.lib.find + ":" + ENV["LD_LIBRARY_PATH"]
182
+ require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
183
+ rescue LoadError
184
+ raise LoadError, "Geniass ruby module needs to be able to find #{Rbbt.software.opt.Geniass.lib.find} in LD_LIBRARY_PATH"
185
+ end
186
+
176
187
  geniass = Geniass.new
177
188
  if not geniass.geniass_is_loaded
178
189
  Misc.in_dir Rbbt.software.opt.Geniass.find do
@@ -180,9 +191,6 @@ module NLP
180
191
  end
181
192
  end
182
193
 
183
- cleaned = text.gsub("\n",NEW_LINE_MASK)
184
- events, marks = event_extraction(cleaned)
185
-
186
194
  labels = events.split(/\n/).collect{|line|
187
195
  geniass.label(line)
188
196
  }
@@ -23,13 +23,14 @@ Score: #{score.inspect}
23
23
  end
24
24
 
25
25
  def html
26
- title = code.nil? ? entity_type : [entity_type, code].compact * ":"
26
+ title = code.nil? ? entity_type : [entity_type, code].compact * " - "
27
27
 
28
28
  text = <<-EOF
29
29
  <span class='Entity'\
30
30
  #{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
31
31
  #{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
32
32
  #{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
33
+ #{segid.nil? ? "" : " attr-segid='#{segid}'"}\
33
34
  #{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
34
35
  >#{ self }</span>
35
36
  EOF
@@ -17,7 +17,7 @@ module Transformed
17
17
 
18
18
  segments = yield text
19
19
 
20
- segments = nil unless Array === segments && Segment === segments.first
20
+ segments = [] unless Array === segments && Segment === segments.first
21
21
 
22
22
  text.restore(segments, true)
23
23
  end
@@ -27,11 +27,11 @@ mv /tmp/clean.Makefile Makefile
27
27
  make geniass
28
28
  make libgeniass.so
29
29
 
30
+ move_opt "$name" "$extra"
31
+
30
32
  mkdir lib
31
33
  mv libgeniass.so lib
32
34
 
33
- build "$name" "$extra"
34
-
35
35
  cd "$OPT_BUILD_DIR/$name"
36
36
 
37
37
  mkdir ruby
@@ -43,8 +43,7 @@ create_makefile('Geniass')
43
43
  EOF
44
44
 
45
45
  cat > ruby/Geniass.cpp <<'EOF'
46
- #include "rice/Class.hpp"
47
- #include "rice/String.hpp"
46
+ #include "rice/rice.hpp"
48
47
 
49
48
  #include <iostream>
50
49
  #include <iomanip>
@@ -59,10 +58,12 @@ cat > ruby/Geniass.cpp <<'EOF'
59
58
  using namespace Rice;
60
59
  using namespace std;
61
60
 
62
- ME_Model model;
63
61
  bool geniass_loaded = false;
64
62
 
65
- void load_geniass(){
63
+ bool geniass_is_loaded(Object self){ return(geniass_loaded); };
64
+
65
+ ME_Model model;
66
+ void load_geniass(Object self){
66
67
  printf("loading model");
67
68
  string modelFile = "model1-1.0";
68
69
  model.load_from_file(modelFile.c_str());
@@ -70,8 +71,6 @@ void load_geniass(){
70
71
  printf("..done\n");
71
72
  }
72
73
 
73
- bool geniass_is_loaded(){ return(geniass_loaded); };
74
-
75
74
  void split(string& str, vector<string>& tokens)
76
75
  {
77
76
  istringstream in(str);
@@ -85,17 +84,23 @@ void split(string& str, vector<string>& tokens)
85
84
  }
86
85
  }
87
86
 
88
- string label(string line){
87
+ Object label(Object self, String rb_line){
89
88
  vector<string> tokens;
90
- split(line, tokens);
91
89
  ME_Sample s;
92
90
 
91
+ string line = rb_line.c_str();
92
+ split(line, tokens);
93
+
93
94
  for(vector<string>::const_iterator token = tokens.begin() + 1;
94
95
  token != tokens.end(); ++token){
95
96
  s.add_feature(*token);
96
97
  }
98
+
97
99
  (void) model.classify(s);
98
- return(s.label);
100
+ string label = s.label;
101
+ VALUE x;
102
+ x = rb_str_new_cstr(label.c_str());
103
+ return(x);
99
104
  }
100
105
 
101
106
  extern "C"
@@ -103,16 +108,20 @@ void Init_Geniass()
103
108
  {
104
109
  Class rb_cGeniass =
105
110
  define_class("Geniass")
111
+ .define_method("geniass_is_loaded", &geniass_is_loaded)
106
112
  .define_method("load_geniass", &load_geniass)
107
113
  .define_method("label", &label)
108
- .define_method("geniass_is_loaded", &geniass_is_loaded);
114
+ ;
109
115
  }
110
116
 
117
+
111
118
  EOF
112
119
 
113
120
  cd ruby
114
121
  ruby extconf.rb --with-geniass-dir="$OPT_DIR/$name"
115
122
  make
116
123
 
124
+ setup "$name" "$extra"
125
+
117
126
 
118
127
 
@@ -6,6 +6,7 @@ tokens do
6
6
 
7
7
  # Some (possible) single letters first
8
8
  receptor /^(?:receptor|r)s?$/i
9
+ activator /^(?:activator|p)s?$/i
9
10
  protein /^(?:protein|p)s?$/i
10
11
  roman /^[IV]+$/
11
12
  greek_letter do |w| $inverse_greek[w.downcase] != nil end
@@ -58,6 +59,8 @@ comparisons do
58
59
 
59
60
  diff.promoter -10
60
61
  diff.receptor -10
62
+ diff.activator -10
63
+
61
64
  diff.similar -10
62
65
  diff.capital -10
63
66
 
@@ -79,6 +79,15 @@ class TestRegExpNER < Test::Unit::TestCase
79
79
  assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
80
80
  end
81
81
 
82
+ def test_entities_captures_repeat
83
+ sentence = "In a sentence I should find not this but this"
84
+
85
+ ner = RegExpNER.new({:this => /not this but (this)/})
86
+ matches = ner.entities(sentence)
87
+ assert sentence[0..matches.first.offset-1].include?('this')
88
+ end
89
+
90
+
82
91
 
83
92
  def test_regexp_order
84
93
  text =<<-EOF
@@ -12,7 +12,6 @@ sentence. This is
12
12
  another broken sentence.
13
13
  EOF
14
14
 
15
- iii NLP.geniass_sentence_splitter(text)
16
15
  assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
17
16
  end
18
17
 
@@ -37,7 +36,17 @@ sentence. This is
37
36
  another broken sentence.
38
37
  EOF
39
38
 
40
- assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
39
+ Log.with_severity 0 do
40
+ assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
41
+ end
42
+ end
43
+
44
+ def test_sentence_cmi
45
+ text =<<-EOF
46
+ The COVID-19 infection was reported as the main cause of death and patients with a higher mortality risk were those aged ≥65 years [adjusted HR = 3.40 (95% CI 2.20-5.24)], with a higher disease severity [adjusted HR = 1.87 (95%CI 1.43-2.45)].
47
+ EOF
48
+
49
+ iii NLP.geniass_sentence_splitter(text)
41
50
  end
42
51
  end
43
52
 
@@ -73,6 +73,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
73
73
  assert_equal original.gsub(/TP53/, 'GN'), a
74
74
  end
75
75
 
76
+ Transformed.with_transform(a, [gene2], "GN") do
77
+ Transformed.with_transform(a, [gene1], "GN") do
78
+ assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
79
+ end
80
+ assert_equal original.gsub(/CDK5R1/, 'GN'), a
81
+ end
82
+
76
83
  Transformed.with_transform(a, [gene1], "GN") do
77
84
  Transformed.with_transform(a, [gene2], "LONG_GENE_PLACEHOLDER") do
78
85
  assert_equal original.gsub(/TP53/, 'GN').sub('CDK5R1', "LONG_GENE_PLACEHOLDER"), a
@@ -144,7 +151,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
144
151
  gene2.entity_type = "Protein"
145
152
 
146
153
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
147
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
154
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':45..50' title='Protein'>CDK5R1</span> protein", a
148
155
  end
149
156
  end
150
157
 
@@ -165,7 +172,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
165
172
  gene2.entity_type = "Protein"
166
173
 
167
174
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
168
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
175
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':37..40' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':55..60' title='Protein'>CDK5R1</span> protein", a
169
176
  end
170
177
  end
171
178
 
@@ -185,9 +192,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
185
192
  assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
186
193
 
187
194
  Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
188
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
195
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
189
196
  Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
190
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
197
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' attr-segid=':27..121' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
191
198
  end
192
199
  end
193
200
  end
@@ -415,6 +422,5 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
415
422
  end
416
423
 
417
424
  end
418
-
419
425
  end
420
426
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.8
4
+ version: 1.3.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-11-08 00:00:00.000000000 Z
11
+ date: 2023-01-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util