rbbt-text 1.3.8 → 1.3.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/document/corpus.rb +1 -1
- data/lib/rbbt/ner/regexpNER.rb +20 -21
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +12 -4
- data/lib/rbbt/segment/named_entity.rb +2 -1
- data/lib/rbbt/segment/transformed.rb +1 -1
- data/share/install/software/Geniass +21 -12
- data/share/rnorm/tokens_default +3 -0
- data/test/rbbt/ner/test_regexpNER.rb +9 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +11 -2
- data/test/rbbt/segment/test_transformed.rb +11 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 92c6b4b9d3452c6b495fc9f291b551a73c8c150faee05053b7ecadc62ccbbd53
|
4
|
+
data.tar.gz: 70e341cf31466628c42b9947c882b64ff592e4703e06c8629ab56d513fe0a975
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2e8fdff40dd93072c3c377c59ff02d7374f3f81961dfc0f2596386776408c623543eb2e1f0da0112b3a8384d865c8331c659c650f2f2288a3d6282eca80e804e
|
7
|
+
data.tar.gz: d7b335d138eb48de51af8922d80d01715c4c61c025b525a9f63fcae789de329eedc2557cb10ba369f5987205ef87ffd85fc407fc3a29f6a75ef0b41951e4962b
|
data/lib/rbbt/document/corpus.rb
CHANGED
@@ -3,7 +3,7 @@ require 'rbbt-util'
|
|
3
3
|
module Document::Corpus
|
4
4
|
|
5
5
|
def self.setup(corpus)
|
6
|
-
corpus = Persist.open_tokyocabinet(corpus,
|
6
|
+
corpus = Persist.open_tokyocabinet(corpus, false, :single, "BDB") if String === corpus
|
7
7
|
corpus.extend Document::Corpus unless Document::Corpus === corpus
|
8
8
|
corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
|
9
9
|
corpus.close
|
data/lib/rbbt/ner/regexpNER.rb
CHANGED
@@ -10,34 +10,33 @@ class RegExpNER < NER
|
|
10
10
|
while matchdata = text.match(regexp)
|
11
11
|
pre = matchdata.pre_match
|
12
12
|
post = matchdata.post_match
|
13
|
-
match = matchdata[0]
|
14
13
|
|
15
14
|
if matchdata.captures.any?
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
match = capture
|
20
|
-
pre << more_pre if more_pre
|
21
|
-
post = more_post << post if more_post
|
22
|
-
end
|
23
|
-
|
24
|
-
if match and not match.empty?
|
25
|
-
NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
|
15
|
+
match = matchdata.captures.first
|
16
|
+
offset, eend = matchdata.offset(1)
|
17
|
+
NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
|
26
18
|
matches << match
|
19
|
+
start += offset + match.length
|
20
|
+
text = text[eend..-1]
|
21
|
+
else
|
22
|
+
match = matchdata[0]
|
23
|
+
NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type)
|
24
|
+
matches << match
|
25
|
+
eend = match.length + pre.length
|
26
|
+
text = text[eend..-1]
|
27
|
+
start += match.length + pre.length
|
27
28
|
end
|
28
|
-
|
29
|
-
start += pre.length + match.length
|
30
|
-
text = post
|
31
29
|
end
|
32
30
|
|
33
31
|
matches
|
34
32
|
end
|
35
33
|
|
36
|
-
def self.match_regexp_list(text, regexp_list, type = nil)
|
34
|
+
def self.match_regexp_list(text, regexp_list, type = nil, split_on_matches = false)
|
37
35
|
matches = []
|
38
36
|
|
39
37
|
regexp_list.each do |regexp|
|
40
|
-
chunks = Segment.split(text, matches)
|
38
|
+
chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
|
39
|
+
chunks = Segment.split(text, [])
|
41
40
|
chunks.each do |chunk|
|
42
41
|
new_matches = match_regexp(chunk, regexp, type)
|
43
42
|
new_matches.each do |match| match.offset += chunk.offset; matches << match end
|
@@ -47,15 +46,15 @@ class RegExpNER < NER
|
|
47
46
|
matches
|
48
47
|
end
|
49
48
|
|
50
|
-
def self.match_regexp_hash(text, regexp_hash)
|
49
|
+
def self.match_regexp_hash(text, regexp_hash, split_on_matches = false)
|
51
50
|
matches = []
|
52
51
|
|
53
52
|
regexp_hash.each do |type, regexp_list|
|
54
53
|
regexp_list = [regexp_list] unless Array === regexp_list
|
55
|
-
chunks = Segment.split(text, matches)
|
54
|
+
chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
|
56
55
|
chunks.each do |chunk|
|
57
56
|
chunk_offset = chunk.offset
|
58
|
-
match_regexp_list(chunk, regexp_list, type).each do |match|
|
57
|
+
match_regexp_list(chunk, regexp_list, type, split_on_matches).each do |match|
|
59
58
|
match.offset = match.offset + chunk_offset;
|
60
59
|
matches << match
|
61
60
|
end
|
@@ -65,7 +64,7 @@ class RegExpNER < NER
|
|
65
64
|
matches
|
66
65
|
end
|
67
66
|
|
68
|
-
attr_accessor :regexps
|
67
|
+
attr_accessor :regexps, :split_on_matches
|
69
68
|
def initialize(regexps = {})
|
70
69
|
@regexps = regexps.collect{|p| p }
|
71
70
|
end
|
@@ -87,7 +86,7 @@ class RegExpNER < NER
|
|
87
86
|
end
|
88
87
|
|
89
88
|
def match(text)
|
90
|
-
matches = RegExpNER.match_regexp_hash(text, @regexps)
|
89
|
+
matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
|
91
90
|
matches.collect do |m|
|
92
91
|
NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
|
93
92
|
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'rbbt/nlp/nlp'
|
2
2
|
require 'rbbt/segment'
|
3
3
|
module NLP
|
4
|
+
|
4
5
|
Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
|
5
6
|
|
6
7
|
def self.returnFeatures(prevWord, delimiter, nextWord)
|
@@ -136,6 +137,7 @@ module NLP
|
|
136
137
|
end
|
137
138
|
|
138
139
|
def self.process_labels(marked_text, labels)
|
140
|
+
return "" if marked_text.empty? || labels.empty?
|
139
141
|
out = ""
|
140
142
|
|
141
143
|
count = 0
|
@@ -171,8 +173,17 @@ module NLP
|
|
171
173
|
end
|
172
174
|
|
173
175
|
def self.geniass_sentence_splitter_extension(text)
|
176
|
+
cleaned = text.gsub("\n",NEW_LINE_MASK)
|
177
|
+
events, marks = event_extraction(cleaned)
|
178
|
+
|
174
179
|
Rbbt.software.opt.Geniass.produce
|
175
|
-
|
180
|
+
begin
|
181
|
+
ENV["LD_LIBRARY_PATH"] = Rbbt.software.opt.Geniass.lib.find + ":" + ENV["LD_LIBRARY_PATH"]
|
182
|
+
require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
|
183
|
+
rescue LoadError
|
184
|
+
raise LoadError, "Geniass ruby module needs to be able to find #{Rbbt.software.opt.Geniass.lib.find} in LD_LIBRARY_PATH"
|
185
|
+
end
|
186
|
+
|
176
187
|
geniass = Geniass.new
|
177
188
|
if not geniass.geniass_is_loaded
|
178
189
|
Misc.in_dir Rbbt.software.opt.Geniass.find do
|
@@ -180,9 +191,6 @@ module NLP
|
|
180
191
|
end
|
181
192
|
end
|
182
193
|
|
183
|
-
cleaned = text.gsub("\n",NEW_LINE_MASK)
|
184
|
-
events, marks = event_extraction(cleaned)
|
185
|
-
|
186
194
|
labels = events.split(/\n/).collect{|line|
|
187
195
|
geniass.label(line)
|
188
196
|
}
|
@@ -23,13 +23,14 @@ Score: #{score.inspect}
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def html
|
26
|
-
title = code.nil? ? entity_type : [entity_type, code].compact * "
|
26
|
+
title = code.nil? ? entity_type : [entity_type, code].compact * " - "
|
27
27
|
|
28
28
|
text = <<-EOF
|
29
29
|
<span class='Entity'\
|
30
30
|
#{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
|
31
31
|
#{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
|
32
32
|
#{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
|
33
|
+
#{segid.nil? ? "" : " attr-segid='#{segid}'"}\
|
33
34
|
#{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
|
34
35
|
>#{ self }</span>
|
35
36
|
EOF
|
@@ -27,11 +27,11 @@ mv /tmp/clean.Makefile Makefile
|
|
27
27
|
make geniass
|
28
28
|
make libgeniass.so
|
29
29
|
|
30
|
+
move_opt "$name" "$extra"
|
31
|
+
|
30
32
|
mkdir lib
|
31
33
|
mv libgeniass.so lib
|
32
34
|
|
33
|
-
build "$name" "$extra"
|
34
|
-
|
35
35
|
cd "$OPT_BUILD_DIR/$name"
|
36
36
|
|
37
37
|
mkdir ruby
|
@@ -43,8 +43,7 @@ create_makefile('Geniass')
|
|
43
43
|
EOF
|
44
44
|
|
45
45
|
cat > ruby/Geniass.cpp <<'EOF'
|
46
|
-
#include "rice/
|
47
|
-
#include "rice/String.hpp"
|
46
|
+
#include "rice/rice.hpp"
|
48
47
|
|
49
48
|
#include <iostream>
|
50
49
|
#include <iomanip>
|
@@ -59,10 +58,12 @@ cat > ruby/Geniass.cpp <<'EOF'
|
|
59
58
|
using namespace Rice;
|
60
59
|
using namespace std;
|
61
60
|
|
62
|
-
ME_Model model;
|
63
61
|
bool geniass_loaded = false;
|
64
62
|
|
65
|
-
|
63
|
+
bool geniass_is_loaded(Object self){ return(geniass_loaded); };
|
64
|
+
|
65
|
+
ME_Model model;
|
66
|
+
void load_geniass(Object self){
|
66
67
|
printf("loading model");
|
67
68
|
string modelFile = "model1-1.0";
|
68
69
|
model.load_from_file(modelFile.c_str());
|
@@ -70,8 +71,6 @@ void load_geniass(){
|
|
70
71
|
printf("..done\n");
|
71
72
|
}
|
72
73
|
|
73
|
-
bool geniass_is_loaded(){ return(geniass_loaded); };
|
74
|
-
|
75
74
|
void split(string& str, vector<string>& tokens)
|
76
75
|
{
|
77
76
|
istringstream in(str);
|
@@ -85,17 +84,23 @@ void split(string& str, vector<string>& tokens)
|
|
85
84
|
}
|
86
85
|
}
|
87
86
|
|
88
|
-
|
87
|
+
Object label(Object self, String rb_line){
|
89
88
|
vector<string> tokens;
|
90
|
-
split(line, tokens);
|
91
89
|
ME_Sample s;
|
92
90
|
|
91
|
+
string line = rb_line.c_str();
|
92
|
+
split(line, tokens);
|
93
|
+
|
93
94
|
for(vector<string>::const_iterator token = tokens.begin() + 1;
|
94
95
|
token != tokens.end(); ++token){
|
95
96
|
s.add_feature(*token);
|
96
97
|
}
|
98
|
+
|
97
99
|
(void) model.classify(s);
|
98
|
-
|
100
|
+
string label = s.label;
|
101
|
+
VALUE x;
|
102
|
+
x = rb_str_new_cstr(label.c_str());
|
103
|
+
return(x);
|
99
104
|
}
|
100
105
|
|
101
106
|
extern "C"
|
@@ -103,16 +108,20 @@ void Init_Geniass()
|
|
103
108
|
{
|
104
109
|
Class rb_cGeniass =
|
105
110
|
define_class("Geniass")
|
111
|
+
.define_method("geniass_is_loaded", &geniass_is_loaded)
|
106
112
|
.define_method("load_geniass", &load_geniass)
|
107
113
|
.define_method("label", &label)
|
108
|
-
|
114
|
+
;
|
109
115
|
}
|
110
116
|
|
117
|
+
|
111
118
|
EOF
|
112
119
|
|
113
120
|
cd ruby
|
114
121
|
ruby extconf.rb --with-geniass-dir="$OPT_DIR/$name"
|
115
122
|
make
|
116
123
|
|
124
|
+
setup "$name" "$extra"
|
125
|
+
|
117
126
|
|
118
127
|
|
data/share/rnorm/tokens_default
CHANGED
@@ -6,6 +6,7 @@ tokens do
|
|
6
6
|
|
7
7
|
# Some (possible) single letters first
|
8
8
|
receptor /^(?:receptor|r)s?$/i
|
9
|
+
activator /^(?:activator|p)s?$/i
|
9
10
|
protein /^(?:protein|p)s?$/i
|
10
11
|
roman /^[IV]+$/
|
11
12
|
greek_letter do |w| $inverse_greek[w.downcase] != nil end
|
@@ -58,6 +59,8 @@ comparisons do
|
|
58
59
|
|
59
60
|
diff.promoter -10
|
60
61
|
diff.receptor -10
|
62
|
+
diff.activator -10
|
63
|
+
|
61
64
|
diff.similar -10
|
62
65
|
diff.capital -10
|
63
66
|
|
@@ -79,6 +79,15 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
79
79
|
assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
|
80
80
|
end
|
81
81
|
|
82
|
+
def test_entities_captures_repeat
|
83
|
+
sentence = "In a sentence I should find not this but this"
|
84
|
+
|
85
|
+
ner = RegExpNER.new({:this => /not this but (this)/})
|
86
|
+
matches = ner.entities(sentence)
|
87
|
+
assert sentence[0..matches.first.offset-1].include?('this')
|
88
|
+
end
|
89
|
+
|
90
|
+
|
82
91
|
|
83
92
|
def test_regexp_order
|
84
93
|
text =<<-EOF
|
@@ -12,7 +12,6 @@ sentence. This is
|
|
12
12
|
another broken sentence.
|
13
13
|
EOF
|
14
14
|
|
15
|
-
iii NLP.geniass_sentence_splitter(text)
|
16
15
|
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
17
16
|
end
|
18
17
|
|
@@ -37,7 +36,17 @@ sentence. This is
|
|
37
36
|
another broken sentence.
|
38
37
|
EOF
|
39
38
|
|
40
|
-
|
39
|
+
Log.with_severity 0 do
|
40
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
def test_sentence_cmi
|
45
|
+
text =<<-EOF
|
46
|
+
The COVID-19 infection was reported as the main cause of death and patients with a higher mortality risk were those aged ≥65 years [adjusted HR = 3.40 (95% CI 2.20-5.24)], with a higher disease severity [adjusted HR = 1.87 (95%CI 1.43-2.45)].
|
47
|
+
EOF
|
48
|
+
|
49
|
+
iii NLP.geniass_sentence_splitter(text)
|
41
50
|
end
|
42
51
|
end
|
43
52
|
|
@@ -73,6 +73,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
73
73
|
assert_equal original.gsub(/TP53/, 'GN'), a
|
74
74
|
end
|
75
75
|
|
76
|
+
Transformed.with_transform(a, [gene2], "GN") do
|
77
|
+
Transformed.with_transform(a, [gene1], "GN") do
|
78
|
+
assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
|
79
|
+
end
|
80
|
+
assert_equal original.gsub(/CDK5R1/, 'GN'), a
|
81
|
+
end
|
82
|
+
|
76
83
|
Transformed.with_transform(a, [gene1], "GN") do
|
77
84
|
Transformed.with_transform(a, [gene2], "LONG_GENE_PLACEHOLDER") do
|
78
85
|
assert_equal original.gsub(/TP53/, 'GN').sub('CDK5R1', "LONG_GENE_PLACEHOLDER"), a
|
@@ -144,7 +151,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
144
151
|
gene2.entity_type = "Protein"
|
145
152
|
|
146
153
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
147
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
|
154
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':45..50' title='Protein'>CDK5R1</span> protein", a
|
148
155
|
end
|
149
156
|
end
|
150
157
|
|
@@ -165,7 +172,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
165
172
|
gene2.entity_type = "Protein"
|
166
173
|
|
167
174
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
168
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
|
175
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':37..40' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':55..60' title='Protein'>CDK5R1</span> protein", a
|
169
176
|
end
|
170
177
|
end
|
171
178
|
|
@@ -185,9 +192,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
185
192
|
assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
|
186
193
|
|
187
194
|
Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
|
188
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
|
195
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
|
189
196
|
Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
|
190
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
|
197
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' attr-segid=':27..121' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
|
191
198
|
end
|
192
199
|
end
|
193
200
|
end
|
@@ -415,6 +422,5 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
|
|
415
422
|
end
|
416
423
|
|
417
424
|
end
|
418
|
-
|
419
425
|
end
|
420
426
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.8
|
4
|
+
version: 1.3.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-01-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|