rbbt-text 1.3.8 → 1.3.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e1b07b2646ecdc51599e2a2356fd18708e88d819944910a07930f67ec3fc012d
4
- data.tar.gz: 03bcbe61f41d830668b50fcfc253fa2b43285774040f61fb3fb0a58f80e9dfd3
3
+ metadata.gz: 92c6b4b9d3452c6b495fc9f291b551a73c8c150faee05053b7ecadc62ccbbd53
4
+ data.tar.gz: 70e341cf31466628c42b9947c882b64ff592e4703e06c8629ab56d513fe0a975
5
5
  SHA512:
6
- metadata.gz: ae6de2dd809642ca38276ff82e243efeb193cf432bc78aea92e772ab21ff489f23224b9e93de726dcacdb06910716f1107171433cc39e7b022ba14ee4ed284f6
7
- data.tar.gz: 82768060a28248d459031030b6ba49b500b63a9d3ae2199ccdf1417fd3b1f66ce0d962db17875615ee36bb3b5879d8ccbbdec892942f544fa08481b4551a1003
6
+ metadata.gz: 2e8fdff40dd93072c3c377c59ff02d7374f3f81961dfc0f2596386776408c623543eb2e1f0da0112b3a8384d865c8331c659c650f2f2288a3d6282eca80e804e
7
+ data.tar.gz: d7b335d138eb48de51af8922d80d01715c4c61c025b525a9f63fcae789de329eedc2557cb10ba369f5987205ef87ffd85fc407fc3a29f6a75ef0b41951e4962b
@@ -3,7 +3,7 @@ require 'rbbt-util'
3
3
  module Document::Corpus
4
4
 
5
5
  def self.setup(corpus)
6
- corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
6
+ corpus = Persist.open_tokyocabinet(corpus, false, :single, "BDB") if String === corpus
7
7
  corpus.extend Document::Corpus unless Document::Corpus === corpus
8
8
  corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
9
9
  corpus.close
@@ -10,34 +10,33 @@ class RegExpNER < NER
10
10
  while matchdata = text.match(regexp)
11
11
  pre = matchdata.pre_match
12
12
  post = matchdata.post_match
13
- match = matchdata[0]
14
13
 
15
14
  if matchdata.captures.any?
16
- capture = matchdata.captures.first
17
- more_pre, more_post = match.split(/#{capture}/)
18
-
19
- match = capture
20
- pre << more_pre if more_pre
21
- post = more_post << post if more_post
22
- end
23
-
24
- if match and not match.empty?
25
- NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
15
+ match = matchdata.captures.first
16
+ offset, eend = matchdata.offset(1)
17
+ NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
26
18
  matches << match
19
+ start += offset + match.length
20
+ text = text[eend..-1]
21
+ else
22
+ match = matchdata[0]
23
+ NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type)
24
+ matches << match
25
+ eend = match.length + pre.length
26
+ text = text[eend..-1]
27
+ start += match.length + pre.length
27
28
  end
28
-
29
- start += pre.length + match.length
30
- text = post
31
29
  end
32
30
 
33
31
  matches
34
32
  end
35
33
 
36
- def self.match_regexp_list(text, regexp_list, type = nil)
34
+ def self.match_regexp_list(text, regexp_list, type = nil, split_on_matches = false)
37
35
  matches = []
38
36
 
39
37
  regexp_list.each do |regexp|
40
- chunks = Segment.split(text, matches)
38
+ chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
39
+ chunks = Segment.split(text, [])
41
40
  chunks.each do |chunk|
42
41
  new_matches = match_regexp(chunk, regexp, type)
43
42
  new_matches.each do |match| match.offset += chunk.offset; matches << match end
@@ -47,15 +46,15 @@ class RegExpNER < NER
47
46
  matches
48
47
  end
49
48
 
50
- def self.match_regexp_hash(text, regexp_hash)
49
+ def self.match_regexp_hash(text, regexp_hash, split_on_matches = false)
51
50
  matches = []
52
51
 
53
52
  regexp_hash.each do |type, regexp_list|
54
53
  regexp_list = [regexp_list] unless Array === regexp_list
55
- chunks = Segment.split(text, matches)
54
+ chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
56
55
  chunks.each do |chunk|
57
56
  chunk_offset = chunk.offset
58
- match_regexp_list(chunk, regexp_list, type).each do |match|
57
+ match_regexp_list(chunk, regexp_list, type, split_on_matches).each do |match|
59
58
  match.offset = match.offset + chunk_offset;
60
59
  matches << match
61
60
  end
@@ -65,7 +64,7 @@ class RegExpNER < NER
65
64
  matches
66
65
  end
67
66
 
68
- attr_accessor :regexps
67
+ attr_accessor :regexps, :split_on_matches
69
68
  def initialize(regexps = {})
70
69
  @regexps = regexps.collect{|p| p }
71
70
  end
@@ -87,7 +86,7 @@ class RegExpNER < NER
87
86
  end
88
87
 
89
88
  def match(text)
90
- matches = RegExpNER.match_regexp_hash(text, @regexps)
89
+ matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
91
90
  matches.collect do |m|
92
91
  NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
93
92
  end
@@ -1,6 +1,7 @@
1
1
  require 'rbbt/nlp/nlp'
2
2
  require 'rbbt/segment'
3
3
  module NLP
4
+
4
5
  Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
5
6
 
6
7
  def self.returnFeatures(prevWord, delimiter, nextWord)
@@ -136,6 +137,7 @@ module NLP
136
137
  end
137
138
 
138
139
  def self.process_labels(marked_text, labels)
140
+ return "" if marked_text.empty? || labels.empty?
139
141
  out = ""
140
142
 
141
143
  count = 0
@@ -171,8 +173,17 @@ module NLP
171
173
  end
172
174
 
173
175
  def self.geniass_sentence_splitter_extension(text)
176
+ cleaned = text.gsub("\n",NEW_LINE_MASK)
177
+ events, marks = event_extraction(cleaned)
178
+
174
179
  Rbbt.software.opt.Geniass.produce
175
- require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
180
+ begin
181
+ ENV["LD_LIBRARY_PATH"] = Rbbt.software.opt.Geniass.lib.find + ":" + ENV["LD_LIBRARY_PATH"]
182
+ require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
183
+ rescue LoadError
184
+ raise LoadError, "Geniass ruby module needs to be able to find #{Rbbt.software.opt.Geniass.lib.find} in LD_LIBRARY_PATH"
185
+ end
186
+
176
187
  geniass = Geniass.new
177
188
  if not geniass.geniass_is_loaded
178
189
  Misc.in_dir Rbbt.software.opt.Geniass.find do
@@ -180,9 +191,6 @@ module NLP
180
191
  end
181
192
  end
182
193
 
183
- cleaned = text.gsub("\n",NEW_LINE_MASK)
184
- events, marks = event_extraction(cleaned)
185
-
186
194
  labels = events.split(/\n/).collect{|line|
187
195
  geniass.label(line)
188
196
  }
@@ -23,13 +23,14 @@ Score: #{score.inspect}
23
23
  end
24
24
 
25
25
  def html
26
- title = code.nil? ? entity_type : [entity_type, code].compact * ":"
26
+ title = code.nil? ? entity_type : [entity_type, code].compact * " - "
27
27
 
28
28
  text = <<-EOF
29
29
  <span class='Entity'\
30
30
  #{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
31
31
  #{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
32
32
  #{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
33
+ #{segid.nil? ? "" : " attr-segid='#{segid}'"}\
33
34
  #{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
34
35
  >#{ self }</span>
35
36
  EOF
@@ -17,7 +17,7 @@ module Transformed
17
17
 
18
18
  segments = yield text
19
19
 
20
- segments = nil unless Array === segments && Segment === segments.first
20
+ segments = [] unless Array === segments && Segment === segments.first
21
21
 
22
22
  text.restore(segments, true)
23
23
  end
@@ -27,11 +27,11 @@ mv /tmp/clean.Makefile Makefile
27
27
  make geniass
28
28
  make libgeniass.so
29
29
 
30
+ move_opt "$name" "$extra"
31
+
30
32
  mkdir lib
31
33
  mv libgeniass.so lib
32
34
 
33
- build "$name" "$extra"
34
-
35
35
  cd "$OPT_BUILD_DIR/$name"
36
36
 
37
37
  mkdir ruby
@@ -43,8 +43,7 @@ create_makefile('Geniass')
43
43
  EOF
44
44
 
45
45
  cat > ruby/Geniass.cpp <<'EOF'
46
- #include "rice/Class.hpp"
47
- #include "rice/String.hpp"
46
+ #include "rice/rice.hpp"
48
47
 
49
48
  #include <iostream>
50
49
  #include <iomanip>
@@ -59,10 +58,12 @@ cat > ruby/Geniass.cpp <<'EOF'
59
58
  using namespace Rice;
60
59
  using namespace std;
61
60
 
62
- ME_Model model;
63
61
  bool geniass_loaded = false;
64
62
 
65
- void load_geniass(){
63
+ bool geniass_is_loaded(Object self){ return(geniass_loaded); };
64
+
65
+ ME_Model model;
66
+ void load_geniass(Object self){
66
67
  printf("loading model");
67
68
  string modelFile = "model1-1.0";
68
69
  model.load_from_file(modelFile.c_str());
@@ -70,8 +71,6 @@ void load_geniass(){
70
71
  printf("..done\n");
71
72
  }
72
73
 
73
- bool geniass_is_loaded(){ return(geniass_loaded); };
74
-
75
74
  void split(string& str, vector<string>& tokens)
76
75
  {
77
76
  istringstream in(str);
@@ -85,17 +84,23 @@ void split(string& str, vector<string>& tokens)
85
84
  }
86
85
  }
87
86
 
88
- string label(string line){
87
+ Object label(Object self, String rb_line){
89
88
  vector<string> tokens;
90
- split(line, tokens);
91
89
  ME_Sample s;
92
90
 
91
+ string line = rb_line.c_str();
92
+ split(line, tokens);
93
+
93
94
  for(vector<string>::const_iterator token = tokens.begin() + 1;
94
95
  token != tokens.end(); ++token){
95
96
  s.add_feature(*token);
96
97
  }
98
+
97
99
  (void) model.classify(s);
98
- return(s.label);
100
+ string label = s.label;
101
+ VALUE x;
102
+ x = rb_str_new_cstr(label.c_str());
103
+ return(x);
99
104
  }
100
105
 
101
106
  extern "C"
@@ -103,16 +108,20 @@ void Init_Geniass()
103
108
  {
104
109
  Class rb_cGeniass =
105
110
  define_class("Geniass")
111
+ .define_method("geniass_is_loaded", &geniass_is_loaded)
106
112
  .define_method("load_geniass", &load_geniass)
107
113
  .define_method("label", &label)
108
- .define_method("geniass_is_loaded", &geniass_is_loaded);
114
+ ;
109
115
  }
110
116
 
117
+
111
118
  EOF
112
119
 
113
120
  cd ruby
114
121
  ruby extconf.rb --with-geniass-dir="$OPT_DIR/$name"
115
122
  make
116
123
 
124
+ setup "$name" "$extra"
125
+
117
126
 
118
127
 
@@ -6,6 +6,7 @@ tokens do
6
6
 
7
7
  # Some (possible) single letters first
8
8
  receptor /^(?:receptor|r)s?$/i
9
+ activator /^(?:activator|p)s?$/i
9
10
  protein /^(?:protein|p)s?$/i
10
11
  roman /^[IV]+$/
11
12
  greek_letter do |w| $inverse_greek[w.downcase] != nil end
@@ -58,6 +59,8 @@ comparisons do
58
59
 
59
60
  diff.promoter -10
60
61
  diff.receptor -10
62
+ diff.activator -10
63
+
61
64
  diff.similar -10
62
65
  diff.capital -10
63
66
 
@@ -79,6 +79,15 @@ class TestRegExpNER < Test::Unit::TestCase
79
79
  assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
80
80
  end
81
81
 
82
+ def test_entities_captures_repeat
83
+ sentence = "In a sentence I should find not this but this"
84
+
85
+ ner = RegExpNER.new({:this => /not this but (this)/})
86
+ matches = ner.entities(sentence)
87
+ assert sentence[0..matches.first.offset-1].include?('this')
88
+ end
89
+
90
+
82
91
 
83
92
  def test_regexp_order
84
93
  text =<<-EOF
@@ -12,7 +12,6 @@ sentence. This is
12
12
  another broken sentence.
13
13
  EOF
14
14
 
15
- iii NLP.geniass_sentence_splitter(text)
16
15
  assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
17
16
  end
18
17
 
@@ -37,7 +36,17 @@ sentence. This is
37
36
  another broken sentence.
38
37
  EOF
39
38
 
40
- assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
39
+ Log.with_severity 0 do
40
+ assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
41
+ end
42
+ end
43
+
44
+ def test_sentence_cmi
45
+ text =<<-EOF
46
+ The COVID-19 infection was reported as the main cause of death and patients with a higher mortality risk were those aged ≥65 years [adjusted HR = 3.40 (95% CI 2.20-5.24)], with a higher disease severity [adjusted HR = 1.87 (95%CI 1.43-2.45)].
47
+ EOF
48
+
49
+ iii NLP.geniass_sentence_splitter(text)
41
50
  end
42
51
  end
43
52
 
@@ -73,6 +73,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
73
73
  assert_equal original.gsub(/TP53/, 'GN'), a
74
74
  end
75
75
 
76
+ Transformed.with_transform(a, [gene2], "GN") do
77
+ Transformed.with_transform(a, [gene1], "GN") do
78
+ assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
79
+ end
80
+ assert_equal original.gsub(/CDK5R1/, 'GN'), a
81
+ end
82
+
76
83
  Transformed.with_transform(a, [gene1], "GN") do
77
84
  Transformed.with_transform(a, [gene2], "LONG_GENE_PLACEHOLDER") do
78
85
  assert_equal original.gsub(/TP53/, 'GN').sub('CDK5R1', "LONG_GENE_PLACEHOLDER"), a
@@ -144,7 +151,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
144
151
  gene2.entity_type = "Protein"
145
152
 
146
153
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
147
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
154
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':45..50' title='Protein'>CDK5R1</span> protein", a
148
155
  end
149
156
  end
150
157
 
@@ -165,7 +172,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
165
172
  gene2.entity_type = "Protein"
166
173
 
167
174
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
168
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
175
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':37..40' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':55..60' title='Protein'>CDK5R1</span> protein", a
169
176
  end
170
177
  end
171
178
 
@@ -185,9 +192,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
185
192
  assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
186
193
 
187
194
  Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
188
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
195
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
189
196
  Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
190
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
197
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' attr-segid=':27..121' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
191
198
  end
192
199
  end
193
200
  end
@@ -415,6 +422,5 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
415
422
  end
416
423
 
417
424
  end
418
-
419
425
  end
420
426
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.8
4
+ version: 1.3.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-11-08 00:00:00.000000000 Z
11
+ date: 2023-01-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util