rbbt-text 1.3.8 → 1.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/document/corpus.rb +1 -1
- data/lib/rbbt/ner/regexpNER.rb +20 -21
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +12 -4
- data/lib/rbbt/segment/named_entity.rb +2 -1
- data/lib/rbbt/segment/transformed.rb +1 -1
- data/share/install/software/Geniass +21 -12
- data/share/rnorm/tokens_default +3 -0
- data/test/rbbt/ner/test_regexpNER.rb +9 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +11 -2
- data/test/rbbt/segment/test_transformed.rb +11 -5
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 92c6b4b9d3452c6b495fc9f291b551a73c8c150faee05053b7ecadc62ccbbd53
|
|
4
|
+
data.tar.gz: 70e341cf31466628c42b9947c882b64ff592e4703e06c8629ab56d513fe0a975
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2e8fdff40dd93072c3c377c59ff02d7374f3f81961dfc0f2596386776408c623543eb2e1f0da0112b3a8384d865c8331c659c650f2f2288a3d6282eca80e804e
|
|
7
|
+
data.tar.gz: d7b335d138eb48de51af8922d80d01715c4c61c025b525a9f63fcae789de329eedc2557cb10ba369f5987205ef87ffd85fc407fc3a29f6a75ef0b41951e4962b
|
data/lib/rbbt/document/corpus.rb
CHANGED
|
@@ -3,7 +3,7 @@ require 'rbbt-util'
|
|
|
3
3
|
module Document::Corpus
|
|
4
4
|
|
|
5
5
|
def self.setup(corpus)
|
|
6
|
-
corpus = Persist.open_tokyocabinet(corpus,
|
|
6
|
+
corpus = Persist.open_tokyocabinet(corpus, false, :single, "BDB") if String === corpus
|
|
7
7
|
corpus.extend Document::Corpus unless Document::Corpus === corpus
|
|
8
8
|
corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
|
|
9
9
|
corpus.close
|
data/lib/rbbt/ner/regexpNER.rb
CHANGED
|
@@ -10,34 +10,33 @@ class RegExpNER < NER
|
|
|
10
10
|
while matchdata = text.match(regexp)
|
|
11
11
|
pre = matchdata.pre_match
|
|
12
12
|
post = matchdata.post_match
|
|
13
|
-
match = matchdata[0]
|
|
14
13
|
|
|
15
14
|
if matchdata.captures.any?
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
match = capture
|
|
20
|
-
pre << more_pre if more_pre
|
|
21
|
-
post = more_post << post if more_post
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
if match and not match.empty?
|
|
25
|
-
NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
|
|
15
|
+
match = matchdata.captures.first
|
|
16
|
+
offset, eend = matchdata.offset(1)
|
|
17
|
+
NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
|
|
26
18
|
matches << match
|
|
19
|
+
start += offset + match.length
|
|
20
|
+
text = text[eend..-1]
|
|
21
|
+
else
|
|
22
|
+
match = matchdata[0]
|
|
23
|
+
NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type)
|
|
24
|
+
matches << match
|
|
25
|
+
eend = match.length + pre.length
|
|
26
|
+
text = text[eend..-1]
|
|
27
|
+
start += match.length + pre.length
|
|
27
28
|
end
|
|
28
|
-
|
|
29
|
-
start += pre.length + match.length
|
|
30
|
-
text = post
|
|
31
29
|
end
|
|
32
30
|
|
|
33
31
|
matches
|
|
34
32
|
end
|
|
35
33
|
|
|
36
|
-
def self.match_regexp_list(text, regexp_list, type = nil)
|
|
34
|
+
def self.match_regexp_list(text, regexp_list, type = nil, split_on_matches = false)
|
|
37
35
|
matches = []
|
|
38
36
|
|
|
39
37
|
regexp_list.each do |regexp|
|
|
40
|
-
chunks = Segment.split(text, matches)
|
|
38
|
+
chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
|
|
39
|
+
chunks = Segment.split(text, [])
|
|
41
40
|
chunks.each do |chunk|
|
|
42
41
|
new_matches = match_regexp(chunk, regexp, type)
|
|
43
42
|
new_matches.each do |match| match.offset += chunk.offset; matches << match end
|
|
@@ -47,15 +46,15 @@ class RegExpNER < NER
|
|
|
47
46
|
matches
|
|
48
47
|
end
|
|
49
48
|
|
|
50
|
-
def self.match_regexp_hash(text, regexp_hash)
|
|
49
|
+
def self.match_regexp_hash(text, regexp_hash, split_on_matches = false)
|
|
51
50
|
matches = []
|
|
52
51
|
|
|
53
52
|
regexp_hash.each do |type, regexp_list|
|
|
54
53
|
regexp_list = [regexp_list] unless Array === regexp_list
|
|
55
|
-
chunks = Segment.split(text, matches)
|
|
54
|
+
chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
|
|
56
55
|
chunks.each do |chunk|
|
|
57
56
|
chunk_offset = chunk.offset
|
|
58
|
-
match_regexp_list(chunk, regexp_list, type).each do |match|
|
|
57
|
+
match_regexp_list(chunk, regexp_list, type, split_on_matches).each do |match|
|
|
59
58
|
match.offset = match.offset + chunk_offset;
|
|
60
59
|
matches << match
|
|
61
60
|
end
|
|
@@ -65,7 +64,7 @@ class RegExpNER < NER
|
|
|
65
64
|
matches
|
|
66
65
|
end
|
|
67
66
|
|
|
68
|
-
attr_accessor :regexps
|
|
67
|
+
attr_accessor :regexps, :split_on_matches
|
|
69
68
|
def initialize(regexps = {})
|
|
70
69
|
@regexps = regexps.collect{|p| p }
|
|
71
70
|
end
|
|
@@ -87,7 +86,7 @@ class RegExpNER < NER
|
|
|
87
86
|
end
|
|
88
87
|
|
|
89
88
|
def match(text)
|
|
90
|
-
matches = RegExpNER.match_regexp_hash(text, @regexps)
|
|
89
|
+
matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
|
|
91
90
|
matches.collect do |m|
|
|
92
91
|
NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
|
|
93
92
|
end
|
|
data/lib/rbbt/nlp/genia/sentence_splitter.rb
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
require 'rbbt/nlp/nlp'
|
|
2
2
|
require 'rbbt/segment'
|
|
3
3
|
module NLP
|
|
4
|
+
|
|
4
5
|
Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
|
|
5
6
|
|
|
6
7
|
def self.returnFeatures(prevWord, delimiter, nextWord)
|
|
@@ -136,6 +137,7 @@ module NLP
|
|
|
136
137
|
end
|
|
137
138
|
|
|
138
139
|
def self.process_labels(marked_text, labels)
|
|
140
|
+
return "" if marked_text.empty? || labels.empty?
|
|
139
141
|
out = ""
|
|
140
142
|
|
|
141
143
|
count = 0
|
|
@@ -171,8 +173,17 @@ module NLP
|
|
|
171
173
|
end
|
|
172
174
|
|
|
173
175
|
def self.geniass_sentence_splitter_extension(text)
|
|
176
|
+
cleaned = text.gsub("\n",NEW_LINE_MASK)
|
|
177
|
+
events, marks = event_extraction(cleaned)
|
|
178
|
+
|
|
174
179
|
Rbbt.software.opt.Geniass.produce
|
|
175
|
-
|
|
180
|
+
begin
|
|
181
|
+
ENV["LD_LIBRARY_PATH"] = Rbbt.software.opt.Geniass.lib.find + ":" + ENV["LD_LIBRARY_PATH"]
|
|
182
|
+
require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
|
|
183
|
+
rescue LoadError
|
|
184
|
+
raise LoadError, "Geniass ruby module needs to be able to find #{Rbbt.software.opt.Geniass.lib.find} in LD_LIBRARY_PATH"
|
|
185
|
+
end
|
|
186
|
+
|
|
176
187
|
geniass = Geniass.new
|
|
177
188
|
if not geniass.geniass_is_loaded
|
|
178
189
|
Misc.in_dir Rbbt.software.opt.Geniass.find do
|
|
@@ -180,9 +191,6 @@ module NLP
|
|
|
180
191
|
end
|
|
181
192
|
end
|
|
182
193
|
|
|
183
|
-
cleaned = text.gsub("\n",NEW_LINE_MASK)
|
|
184
|
-
events, marks = event_extraction(cleaned)
|
|
185
|
-
|
|
186
194
|
labels = events.split(/\n/).collect{|line|
|
|
187
195
|
geniass.label(line)
|
|
188
196
|
}
|
|
data/lib/rbbt/segment/named_entity.rb
CHANGED
|
@@ -23,13 +23,14 @@ Score: #{score.inspect}
|
|
|
23
23
|
end
|
|
24
24
|
|
|
25
25
|
def html
|
|
26
|
-
title = code.nil? ? entity_type : [entity_type, code].compact * "
|
|
26
|
+
title = code.nil? ? entity_type : [entity_type, code].compact * " - "
|
|
27
27
|
|
|
28
28
|
text = <<-EOF
|
|
29
29
|
<span class='Entity'\
|
|
30
30
|
#{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
|
|
31
31
|
#{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
|
|
32
32
|
#{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
|
|
33
|
+
#{segid.nil? ? "" : " attr-segid='#{segid}'"}\
|
|
33
34
|
#{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
|
|
34
35
|
>#{ self }</span>
|
|
35
36
|
EOF
|
|
data/share/install/software/Geniass
CHANGED
|
@@ -27,11 +27,11 @@ mv /tmp/clean.Makefile Makefile
|
|
|
27
27
|
make geniass
|
|
28
28
|
make libgeniass.so
|
|
29
29
|
|
|
30
|
+
move_opt "$name" "$extra"
|
|
31
|
+
|
|
30
32
|
mkdir lib
|
|
31
33
|
mv libgeniass.so lib
|
|
32
34
|
|
|
33
|
-
build "$name" "$extra"
|
|
34
|
-
|
|
35
35
|
cd "$OPT_BUILD_DIR/$name"
|
|
36
36
|
|
|
37
37
|
mkdir ruby
|
|
@@ -43,8 +43,7 @@ create_makefile('Geniass')
|
|
|
43
43
|
EOF
|
|
44
44
|
|
|
45
45
|
cat > ruby/Geniass.cpp <<'EOF'
|
|
46
|
-
#include "rice/
|
|
47
|
-
#include "rice/String.hpp"
|
|
46
|
+
#include "rice/rice.hpp"
|
|
48
47
|
|
|
49
48
|
#include <iostream>
|
|
50
49
|
#include <iomanip>
|
|
@@ -59,10 +58,12 @@ cat > ruby/Geniass.cpp <<'EOF'
|
|
|
59
58
|
using namespace Rice;
|
|
60
59
|
using namespace std;
|
|
61
60
|
|
|
62
|
-
ME_Model model;
|
|
63
61
|
bool geniass_loaded = false;
|
|
64
62
|
|
|
65
|
-
|
|
63
|
+
bool geniass_is_loaded(Object self){ return(geniass_loaded); };
|
|
64
|
+
|
|
65
|
+
ME_Model model;
|
|
66
|
+
void load_geniass(Object self){
|
|
66
67
|
printf("loading model");
|
|
67
68
|
string modelFile = "model1-1.0";
|
|
68
69
|
model.load_from_file(modelFile.c_str());
|
|
@@ -70,8 +71,6 @@ void load_geniass(){
|
|
|
70
71
|
printf("..done\n");
|
|
71
72
|
}
|
|
72
73
|
|
|
73
|
-
bool geniass_is_loaded(){ return(geniass_loaded); };
|
|
74
|
-
|
|
75
74
|
void split(string& str, vector<string>& tokens)
|
|
76
75
|
{
|
|
77
76
|
istringstream in(str);
|
|
@@ -85,17 +84,23 @@ void split(string& str, vector<string>& tokens)
|
|
|
85
84
|
}
|
|
86
85
|
}
|
|
87
86
|
|
|
88
|
-
|
|
87
|
+
Object label(Object self, String rb_line){
|
|
89
88
|
vector<string> tokens;
|
|
90
|
-
split(line, tokens);
|
|
91
89
|
ME_Sample s;
|
|
92
90
|
|
|
91
|
+
string line = rb_line.c_str();
|
|
92
|
+
split(line, tokens);
|
|
93
|
+
|
|
93
94
|
for(vector<string>::const_iterator token = tokens.begin() + 1;
|
|
94
95
|
token != tokens.end(); ++token){
|
|
95
96
|
s.add_feature(*token);
|
|
96
97
|
}
|
|
98
|
+
|
|
97
99
|
(void) model.classify(s);
|
|
98
|
-
|
|
100
|
+
string label = s.label;
|
|
101
|
+
VALUE x;
|
|
102
|
+
x = rb_str_new_cstr(label.c_str());
|
|
103
|
+
return(x);
|
|
99
104
|
}
|
|
100
105
|
|
|
101
106
|
extern "C"
|
|
@@ -103,16 +108,20 @@ void Init_Geniass()
|
|
|
103
108
|
{
|
|
104
109
|
Class rb_cGeniass =
|
|
105
110
|
define_class("Geniass")
|
|
111
|
+
.define_method("geniass_is_loaded", &geniass_is_loaded)
|
|
106
112
|
.define_method("load_geniass", &load_geniass)
|
|
107
113
|
.define_method("label", &label)
|
|
108
|
-
|
|
114
|
+
;
|
|
109
115
|
}
|
|
110
116
|
|
|
117
|
+
|
|
111
118
|
EOF
|
|
112
119
|
|
|
113
120
|
cd ruby
|
|
114
121
|
ruby extconf.rb --with-geniass-dir="$OPT_DIR/$name"
|
|
115
122
|
make
|
|
116
123
|
|
|
124
|
+
setup "$name" "$extra"
|
|
125
|
+
|
|
117
126
|
|
|
118
127
|
|
data/share/rnorm/tokens_default
CHANGED
|
@@ -6,6 +6,7 @@ tokens do
|
|
|
6
6
|
|
|
7
7
|
# Some (possible) single letters first
|
|
8
8
|
receptor /^(?:receptor|r)s?$/i
|
|
9
|
+
activator /^(?:activator|p)s?$/i
|
|
9
10
|
protein /^(?:protein|p)s?$/i
|
|
10
11
|
roman /^[IV]+$/
|
|
11
12
|
greek_letter do |w| $inverse_greek[w.downcase] != nil end
|
|
@@ -58,6 +59,8 @@ comparisons do
|
|
|
58
59
|
|
|
59
60
|
diff.promoter -10
|
|
60
61
|
diff.receptor -10
|
|
62
|
+
diff.activator -10
|
|
63
|
+
|
|
61
64
|
diff.similar -10
|
|
62
65
|
diff.capital -10
|
|
63
66
|
|
|
data/test/rbbt/ner/test_regexpNER.rb
CHANGED
|
@@ -79,6 +79,15 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
|
79
79
|
assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
|
|
80
80
|
end
|
|
81
81
|
|
|
82
|
+
def test_entities_captures_repeat
|
|
83
|
+
sentence = "In a sentence I should find not this but this"
|
|
84
|
+
|
|
85
|
+
ner = RegExpNER.new({:this => /not this but (this)/})
|
|
86
|
+
matches = ner.entities(sentence)
|
|
87
|
+
assert sentence[0..matches.first.offset-1].include?('this')
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
|
|
82
91
|
|
|
83
92
|
def test_regexp_order
|
|
84
93
|
text =<<-EOF
|
|
data/test/rbbt/nlp/genia/test_sentence_splitter.rb
CHANGED
|
@@ -12,7 +12,6 @@ sentence. This is
|
|
|
12
12
|
another broken sentence.
|
|
13
13
|
EOF
|
|
14
14
|
|
|
15
|
-
iii NLP.geniass_sentence_splitter(text)
|
|
16
15
|
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
|
17
16
|
end
|
|
18
17
|
|
|
@@ -37,7 +36,17 @@ sentence. This is
|
|
|
37
36
|
another broken sentence.
|
|
38
37
|
EOF
|
|
39
38
|
|
|
40
|
-
|
|
39
|
+
Log.with_severity 0 do
|
|
40
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def test_sentence_cmi
|
|
45
|
+
text =<<-EOF
|
|
46
|
+
The COVID-19 infection was reported as the main cause of death and patients with a higher mortality risk were those aged ≥65 years [adjusted HR = 3.40 (95% CI 2.20-5.24)], with a higher disease severity [adjusted HR = 1.87 (95%CI 1.43-2.45)].
|
|
47
|
+
EOF
|
|
48
|
+
|
|
49
|
+
iii NLP.geniass_sentence_splitter(text)
|
|
41
50
|
end
|
|
42
51
|
end
|
|
43
52
|
|
|
data/test/rbbt/segment/test_transformed.rb
CHANGED
|
@@ -73,6 +73,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
|
73
73
|
assert_equal original.gsub(/TP53/, 'GN'), a
|
|
74
74
|
end
|
|
75
75
|
|
|
76
|
+
Transformed.with_transform(a, [gene2], "GN") do
|
|
77
|
+
Transformed.with_transform(a, [gene1], "GN") do
|
|
78
|
+
assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
|
|
79
|
+
end
|
|
80
|
+
assert_equal original.gsub(/CDK5R1/, 'GN'), a
|
|
81
|
+
end
|
|
82
|
+
|
|
76
83
|
Transformed.with_transform(a, [gene1], "GN") do
|
|
77
84
|
Transformed.with_transform(a, [gene2], "LONG_GENE_PLACEHOLDER") do
|
|
78
85
|
assert_equal original.gsub(/TP53/, 'GN').sub('CDK5R1', "LONG_GENE_PLACEHOLDER"), a
|
|
@@ -144,7 +151,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
|
144
151
|
gene2.entity_type = "Protein"
|
|
145
152
|
|
|
146
153
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
|
147
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
|
|
154
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':45..50' title='Protein'>CDK5R1</span> protein", a
|
|
148
155
|
end
|
|
149
156
|
end
|
|
150
157
|
|
|
@@ -165,7 +172,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
|
165
172
|
gene2.entity_type = "Protein"
|
|
166
173
|
|
|
167
174
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
|
168
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
|
|
175
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':37..40' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':55..60' title='Protein'>CDK5R1</span> protein", a
|
|
169
176
|
end
|
|
170
177
|
end
|
|
171
178
|
|
|
@@ -185,9 +192,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
|
185
192
|
assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
|
|
186
193
|
|
|
187
194
|
Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
|
|
188
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
|
|
195
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
|
|
189
196
|
Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
|
|
190
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
|
|
197
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' attr-segid=':27..121' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
|
|
191
198
|
end
|
|
192
199
|
end
|
|
193
200
|
end
|
|
@@ -415,6 +422,5 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
|
|
|
415
422
|
end
|
|
416
423
|
|
|
417
424
|
end
|
|
418
|
-
|
|
419
425
|
end
|
|
420
426
|
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: rbbt-text
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.3.8
|
|
4
|
+
version: 1.3.9
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Miguel Vazquez
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2023-01-12 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rbbt-util
|