rbbt-text 1.3.8 → 1.3.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/document/annotation.rb +67 -2
- data/lib/rbbt/document/corpus/pubmed.rb +6 -4
- data/lib/rbbt/document/corpus.rb +1 -1
- data/lib/rbbt/document.rb +4 -0
- data/lib/rbbt/ner/g_norm_plus.rb +2 -1
- data/lib/rbbt/ner/regexpNER.rb +30 -23
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +12 -4
- data/lib/rbbt/segment/annotation.rb +1 -0
- data/lib/rbbt/segment/named_entity.rb +2 -1
- data/lib/rbbt/segment/overlaps.rb +9 -1
- data/lib/rbbt/segment/transformed.rb +1 -1
- data/lib/rbbt/segment.rb +4 -0
- data/share/install/software/Geniass +21 -12
- data/share/rnorm/tokens_default +3 -0
- data/test/rbbt/document/test_annotation.rb +21 -0
- data/test/rbbt/ner/test_g_norm_plus.rb +2 -0
- data/test/rbbt/ner/test_regexpNER.rb +17 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +11 -2
- data/test/rbbt/segment/test_transformed.rb +11 -5
- metadata +27 -27
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2f10312d9b6598ddc9b6fa98b38909afdd575b33a497ae1ff3f17c7a9c6e37bf
|
4
|
+
data.tar.gz: f79c61c7e34dd113a2c5002342c0c2df92a4a28c770394bf2c456a34a2730cc7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a9fb4dc49c538a58a8aa04e81947df212668c5ef9097434fa7d3eff54dd17a8657f581451b64e6b247cb64428436823a305dd64ae6a5fed2126b92285c02ad81
|
7
|
+
data.tar.gz: 0d31423660cd232102aa2b9914dab61ff929cf02a37b5094bd58481cac733c167d0e4fcdb4b3025e41a4775bd8033566ed3f402f66c317b3955406d1a3d3eb6f
|
@@ -6,7 +6,9 @@ module Document
|
|
6
6
|
send :property, type do
|
7
7
|
segments = self.instance_exec &block
|
8
8
|
|
9
|
-
Segment.align(self, segments) unless segments.empty? ||
|
9
|
+
Segment.align(self, segments) unless segments.empty? ||
|
10
|
+
(Segment === segments && segments.offset) ||
|
11
|
+
(Array === segments && Segment === segments.first && segments.first.offset)
|
10
12
|
|
11
13
|
segments.each do |segment|
|
12
14
|
SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
|
@@ -17,6 +19,36 @@ module Document
|
|
17
19
|
|
18
20
|
segments
|
19
21
|
end
|
22
|
+
|
23
|
+
DocID.property type do
|
24
|
+
self.document.send(type)
|
25
|
+
end
|
26
|
+
|
27
|
+
SegID.property type do
|
28
|
+
self.overlaps(self.docid.send(type))
|
29
|
+
end
|
30
|
+
|
31
|
+
Segment.property type do
|
32
|
+
self.overlaps(self.docid.send(type))
|
33
|
+
end
|
34
|
+
|
35
|
+
seg_type = "segids_for_" + type.to_s
|
36
|
+
|
37
|
+
send :property, seg_type do
|
38
|
+
SegID.setup(self.send(type).collect{|s| s.segid })
|
39
|
+
end
|
40
|
+
|
41
|
+
DocID.property seg_type do
|
42
|
+
self.document.send(seg_type)
|
43
|
+
end
|
44
|
+
|
45
|
+
SegID.property seg_type do
|
46
|
+
self.overlaps(self.docid.send(seg_type))
|
47
|
+
end
|
48
|
+
|
49
|
+
Segment.property seg_type do
|
50
|
+
self.overlaps(self.docid.send(seg_type))
|
51
|
+
end
|
20
52
|
end
|
21
53
|
|
22
54
|
def self.define_multiple(type, &block)
|
@@ -28,7 +60,10 @@ module Document
|
|
28
60
|
doc_segments.each_with_index do |segments,i|
|
29
61
|
next if segments.nil?
|
30
62
|
document = list[i]
|
31
|
-
Segment.align(document, segments) unless segments.nil? ||
|
63
|
+
Segment.align(document, segments) unless segments.nil? ||
|
64
|
+
segments.empty? ||
|
65
|
+
(Segment === segments && segments.offset) ||
|
66
|
+
(Array === segments && Segment === segments.first && segments.first.offset)
|
32
67
|
|
33
68
|
segments.each do |segment|
|
34
69
|
SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
|
@@ -41,5 +76,35 @@ module Document
|
|
41
76
|
segments
|
42
77
|
end
|
43
78
|
end
|
79
|
+
|
80
|
+
DocID.property type do
|
81
|
+
self.document.send(type)
|
82
|
+
end
|
83
|
+
|
84
|
+
SegID.property type do
|
85
|
+
self.overlaps(self.docid.send(type))
|
86
|
+
end
|
87
|
+
|
88
|
+
Segment.property type do
|
89
|
+
self.overlaps(self.docid.send(type))
|
90
|
+
end
|
91
|
+
|
92
|
+
seg_type = "segids_for_" + type.to_s
|
93
|
+
|
94
|
+
send :property, seg_type do
|
95
|
+
SegID.setup(self.send(type).collect{|s| s.segid })
|
96
|
+
end
|
97
|
+
|
98
|
+
DocID.property seg_type do
|
99
|
+
self.document.send(seg_type)
|
100
|
+
end
|
101
|
+
|
102
|
+
SegID.property seg_type do
|
103
|
+
self.overlaps(self.docid.send(seg_type))
|
104
|
+
end
|
105
|
+
|
106
|
+
Segment.property seg_type do
|
107
|
+
self.overlaps(self.docid.send(seg_type))
|
108
|
+
end
|
44
109
|
end
|
45
110
|
end
|
@@ -2,8 +2,8 @@ require 'rbbt/sources/pubmed'
|
|
2
2
|
|
3
3
|
module Document::Corpus
|
4
4
|
PUBMED_NAMESPACE="PMID"
|
5
|
-
def add_pmid(pmid, type =
|
6
|
-
type = :
|
5
|
+
def add_pmid(pmid, type = :title_and_abstract, update = false)
|
6
|
+
type = :title_and_abstract if type.nil?
|
7
7
|
|
8
8
|
if ! (update || Array === pmid)
|
9
9
|
id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
|
@@ -16,9 +16,11 @@ module Document::Corpus
|
|
16
16
|
|
17
17
|
res = PubMed.get_article(pmids).collect do |pmid, article|
|
18
18
|
document = if type.to_sym == :abstract
|
19
|
-
Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid,
|
19
|
+
Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, type.to_sym , self, :corpus => self)
|
20
20
|
elsif type.to_sym == :title
|
21
|
-
Document.setup(article.title, PUBMED_NAMESPACE, pmid,
|
21
|
+
Document.setup(article.title || "", PUBMED_NAMESPACE, pmid, type.to_sym, self)
|
22
|
+
elsif type.to_sym == :title_and_abstract
|
23
|
+
Document.setup((article.title || "") + "\n\n" + (article.abstract || ""), PUBMED_NAMESPACE, pmid, type.to_sym, self)
|
22
24
|
else
|
23
25
|
raise "No FullText available for #{ pmid }" if article.full_text.nil?
|
24
26
|
Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)
|
data/lib/rbbt/document/corpus.rb
CHANGED
@@ -3,7 +3,7 @@ require 'rbbt-util'
|
|
3
3
|
module Document::Corpus
|
4
4
|
|
5
5
|
def self.setup(corpus)
|
6
|
-
corpus = Persist.open_tokyocabinet(corpus,
|
6
|
+
corpus = Persist.open_tokyocabinet(corpus, false, :single, "BDB") if String === corpus
|
7
7
|
corpus.extend Document::Corpus unless Document::Corpus === corpus
|
8
8
|
corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
|
9
9
|
corpus.close
|
data/lib/rbbt/document.rb
CHANGED
data/lib/rbbt/ner/g_norm_plus.rb
CHANGED
@@ -66,7 +66,8 @@ EOF
|
|
66
66
|
end
|
67
67
|
|
68
68
|
Open.write('config', CONFIG)
|
69
|
-
|
69
|
+
mem = Rbbt::Config.get(:java_mem, :GNormPlus, :g_norm_plus, :gnormplus, :gnp, :default => "2G")
|
70
|
+
CMD.cmd_log("java -Xmx#{mem} -Xms#{mem} -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
|
70
71
|
|
71
72
|
if texts.respond_to? :key_field
|
72
73
|
key_field = texts.key_field
|
data/lib/rbbt/ner/regexpNER.rb
CHANGED
@@ -10,34 +10,41 @@ class RegExpNER < NER
|
|
10
10
|
while matchdata = text.match(regexp)
|
11
11
|
pre = matchdata.pre_match
|
12
12
|
post = matchdata.post_match
|
13
|
-
match = matchdata[0]
|
14
13
|
|
15
|
-
if matchdata.
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
match = capture
|
20
|
-
pre << more_pre if more_pre
|
21
|
-
post = more_post << post if more_post
|
22
|
-
end
|
23
|
-
|
24
|
-
if match and not match.empty?
|
25
|
-
NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
|
14
|
+
if matchdata.named_captures.any?
|
15
|
+
match = matchdata[0]
|
16
|
+
code = matchdata.named_captures.collect{|k,v| [k,v] * "=" } * ";"
|
17
|
+
NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type, :code => code)
|
26
18
|
matches << match
|
19
|
+
eend = match.length + pre.length
|
20
|
+
text = text[eend..-1]
|
21
|
+
start += match.length + pre.length
|
22
|
+
elsif matchdata.captures.any?
|
23
|
+
match = matchdata.captures.first
|
24
|
+
offset, eend = matchdata.offset(1)
|
25
|
+
NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
|
26
|
+
matches << match
|
27
|
+
start += offset + match.length
|
28
|
+
text = text[eend..-1]
|
29
|
+
else
|
30
|
+
match = matchdata[0]
|
31
|
+
NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type)
|
32
|
+
matches << match
|
33
|
+
eend = match.length + pre.length
|
34
|
+
text = text[eend..-1]
|
35
|
+
start += match.length + pre.length
|
27
36
|
end
|
28
|
-
|
29
|
-
start += pre.length + match.length
|
30
|
-
text = post
|
31
37
|
end
|
32
38
|
|
33
39
|
matches
|
34
40
|
end
|
35
41
|
|
36
|
-
def self.match_regexp_list(text, regexp_list, type = nil)
|
42
|
+
def self.match_regexp_list(text, regexp_list, type = nil, split_on_matches = false)
|
37
43
|
matches = []
|
38
44
|
|
39
45
|
regexp_list.each do |regexp|
|
40
|
-
chunks = Segment.split(text, matches)
|
46
|
+
chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
|
47
|
+
chunks = Segment.split(text, [])
|
41
48
|
chunks.each do |chunk|
|
42
49
|
new_matches = match_regexp(chunk, regexp, type)
|
43
50
|
new_matches.each do |match| match.offset += chunk.offset; matches << match end
|
@@ -47,15 +54,15 @@ class RegExpNER < NER
|
|
47
54
|
matches
|
48
55
|
end
|
49
56
|
|
50
|
-
def self.match_regexp_hash(text, regexp_hash)
|
57
|
+
def self.match_regexp_hash(text, regexp_hash, split_on_matches = false)
|
51
58
|
matches = []
|
52
59
|
|
53
60
|
regexp_hash.each do |type, regexp_list|
|
54
61
|
regexp_list = [regexp_list] unless Array === regexp_list
|
55
|
-
chunks = Segment.split(text, matches)
|
62
|
+
chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
|
56
63
|
chunks.each do |chunk|
|
57
64
|
chunk_offset = chunk.offset
|
58
|
-
match_regexp_list(chunk, regexp_list, type).each do |match|
|
65
|
+
match_regexp_list(chunk, regexp_list, type, split_on_matches).each do |match|
|
59
66
|
match.offset = match.offset + chunk_offset;
|
60
67
|
matches << match
|
61
68
|
end
|
@@ -65,7 +72,7 @@ class RegExpNER < NER
|
|
65
72
|
matches
|
66
73
|
end
|
67
74
|
|
68
|
-
attr_accessor :regexps
|
75
|
+
attr_accessor :regexps, :split_on_matches
|
69
76
|
def initialize(regexps = {})
|
70
77
|
@regexps = regexps.collect{|p| p }
|
71
78
|
end
|
@@ -87,9 +94,9 @@ class RegExpNER < NER
|
|
87
94
|
end
|
88
95
|
|
89
96
|
def match(text)
|
90
|
-
matches = RegExpNER.match_regexp_hash(text, @regexps)
|
97
|
+
matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
|
91
98
|
matches.collect do |m|
|
92
|
-
NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
|
99
|
+
NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m.code || m)
|
93
100
|
end
|
94
101
|
end
|
95
102
|
|
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'rbbt/nlp/nlp'
|
2
2
|
require 'rbbt/segment'
|
3
3
|
module NLP
|
4
|
+
|
4
5
|
Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
|
5
6
|
|
6
7
|
def self.returnFeatures(prevWord, delimiter, nextWord)
|
@@ -136,6 +137,7 @@ module NLP
|
|
136
137
|
end
|
137
138
|
|
138
139
|
def self.process_labels(marked_text, labels)
|
140
|
+
return "" if marked_text.empty? || labels.empty?
|
139
141
|
out = ""
|
140
142
|
|
141
143
|
count = 0
|
@@ -171,8 +173,17 @@ module NLP
|
|
171
173
|
end
|
172
174
|
|
173
175
|
def self.geniass_sentence_splitter_extension(text)
|
176
|
+
cleaned = text.gsub("\n",NEW_LINE_MASK)
|
177
|
+
events, marks = event_extraction(cleaned)
|
178
|
+
|
174
179
|
Rbbt.software.opt.Geniass.produce
|
175
|
-
|
180
|
+
begin
|
181
|
+
ENV["LD_LIBRARY_PATH"] = Rbbt.software.opt.Geniass.lib.find + ":" + ENV["LD_LIBRARY_PATH"]
|
182
|
+
require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
|
183
|
+
rescue LoadError
|
184
|
+
raise LoadError, "Geniass ruby module needs to be able to find #{Rbbt.software.opt.Geniass.lib.find} in LD_LIBRARY_PATH"
|
185
|
+
end
|
186
|
+
|
176
187
|
geniass = Geniass.new
|
177
188
|
if not geniass.geniass_is_loaded
|
178
189
|
Misc.in_dir Rbbt.software.opt.Geniass.find do
|
@@ -180,9 +191,6 @@ module NLP
|
|
180
191
|
end
|
181
192
|
end
|
182
193
|
|
183
|
-
cleaned = text.gsub("\n",NEW_LINE_MASK)
|
184
|
-
events, marks = event_extraction(cleaned)
|
185
|
-
|
186
194
|
labels = events.split(/\n/).collect{|line|
|
187
195
|
geniass.label(line)
|
188
196
|
}
|
@@ -23,13 +23,14 @@ Score: #{score.inspect}
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def html
|
26
|
-
title = code.nil? ? entity_type : [entity_type, code].compact * "
|
26
|
+
title = code.nil? ? entity_type : [entity_type, code].compact * " - "
|
27
27
|
|
28
28
|
text = <<-EOF
|
29
29
|
<span class='Entity'\
|
30
30
|
#{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
|
31
31
|
#{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
|
32
32
|
#{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
|
33
|
+
#{segid.nil? ? "" : " attr-segid='#{segid}'"}\
|
33
34
|
#{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
|
34
35
|
>#{ self }</span>
|
35
36
|
EOF
|
@@ -1,4 +1,4 @@
|
|
1
|
-
module
|
1
|
+
module SegmentRanges
|
2
2
|
def pull(offset)
|
3
3
|
if self.offset.nil? or offset.nil?
|
4
4
|
self.offset = nil
|
@@ -61,3 +61,11 @@ module Segment
|
|
61
61
|
end
|
62
62
|
end
|
63
63
|
end
|
64
|
+
|
65
|
+
module Segment
|
66
|
+
include SegmentRanges
|
67
|
+
end
|
68
|
+
|
69
|
+
module SegID
|
70
|
+
include SegmentRanges
|
71
|
+
end
|
data/lib/rbbt/segment.rb
CHANGED
@@ -27,11 +27,11 @@ mv /tmp/clean.Makefile Makefile
|
|
27
27
|
make geniass
|
28
28
|
make libgeniass.so
|
29
29
|
|
30
|
+
move_opt "$name" "$extra"
|
31
|
+
|
30
32
|
mkdir lib
|
31
33
|
mv libgeniass.so lib
|
32
34
|
|
33
|
-
build "$name" "$extra"
|
34
|
-
|
35
35
|
cd "$OPT_BUILD_DIR/$name"
|
36
36
|
|
37
37
|
mkdir ruby
|
@@ -43,8 +43,7 @@ create_makefile('Geniass')
|
|
43
43
|
EOF
|
44
44
|
|
45
45
|
cat > ruby/Geniass.cpp <<'EOF'
|
46
|
-
#include "rice/
|
47
|
-
#include "rice/String.hpp"
|
46
|
+
#include "rice/rice.hpp"
|
48
47
|
|
49
48
|
#include <iostream>
|
50
49
|
#include <iomanip>
|
@@ -59,10 +58,12 @@ cat > ruby/Geniass.cpp <<'EOF'
|
|
59
58
|
using namespace Rice;
|
60
59
|
using namespace std;
|
61
60
|
|
62
|
-
ME_Model model;
|
63
61
|
bool geniass_loaded = false;
|
64
62
|
|
65
|
-
|
63
|
+
bool geniass_is_loaded(Object self){ return(geniass_loaded); };
|
64
|
+
|
65
|
+
ME_Model model;
|
66
|
+
void load_geniass(Object self){
|
66
67
|
printf("loading model");
|
67
68
|
string modelFile = "model1-1.0";
|
68
69
|
model.load_from_file(modelFile.c_str());
|
@@ -70,8 +71,6 @@ void load_geniass(){
|
|
70
71
|
printf("..done\n");
|
71
72
|
}
|
72
73
|
|
73
|
-
bool geniass_is_loaded(){ return(geniass_loaded); };
|
74
|
-
|
75
74
|
void split(string& str, vector<string>& tokens)
|
76
75
|
{
|
77
76
|
istringstream in(str);
|
@@ -85,17 +84,23 @@ void split(string& str, vector<string>& tokens)
|
|
85
84
|
}
|
86
85
|
}
|
87
86
|
|
88
|
-
|
87
|
+
Object label(Object self, String rb_line){
|
89
88
|
vector<string> tokens;
|
90
|
-
split(line, tokens);
|
91
89
|
ME_Sample s;
|
92
90
|
|
91
|
+
string line = rb_line.c_str();
|
92
|
+
split(line, tokens);
|
93
|
+
|
93
94
|
for(vector<string>::const_iterator token = tokens.begin() + 1;
|
94
95
|
token != tokens.end(); ++token){
|
95
96
|
s.add_feature(*token);
|
96
97
|
}
|
98
|
+
|
97
99
|
(void) model.classify(s);
|
98
|
-
|
100
|
+
string label = s.label;
|
101
|
+
VALUE x;
|
102
|
+
x = rb_str_new_cstr(label.c_str());
|
103
|
+
return(x);
|
99
104
|
}
|
100
105
|
|
101
106
|
extern "C"
|
@@ -103,16 +108,20 @@ void Init_Geniass()
|
|
103
108
|
{
|
104
109
|
Class rb_cGeniass =
|
105
110
|
define_class("Geniass")
|
111
|
+
.define_method("geniass_is_loaded", &geniass_is_loaded)
|
106
112
|
.define_method("load_geniass", &load_geniass)
|
107
113
|
.define_method("label", &label)
|
108
|
-
|
114
|
+
;
|
109
115
|
}
|
110
116
|
|
117
|
+
|
111
118
|
EOF
|
112
119
|
|
113
120
|
cd ruby
|
114
121
|
ruby extconf.rb --with-geniass-dir="$OPT_DIR/$name"
|
115
122
|
make
|
116
123
|
|
124
|
+
setup "$name" "$extra"
|
125
|
+
|
117
126
|
|
118
127
|
|
data/share/rnorm/tokens_default
CHANGED
@@ -6,6 +6,7 @@ tokens do
|
|
6
6
|
|
7
7
|
# Some (possible) single letters first
|
8
8
|
receptor /^(?:receptor|r)s?$/i
|
9
|
+
activator /^(?:activator|p)s?$/i
|
9
10
|
protein /^(?:protein|p)s?$/i
|
10
11
|
roman /^[IV]+$/
|
11
12
|
greek_letter do |w| $inverse_greek[w.downcase] != nil end
|
@@ -58,6 +59,8 @@ comparisons do
|
|
58
59
|
|
59
60
|
diff.promoter -10
|
60
61
|
diff.receptor -10
|
62
|
+
diff.activator -10
|
63
|
+
|
61
64
|
diff.similar -10
|
62
65
|
diff.capital -10
|
63
66
|
|
@@ -13,6 +13,10 @@ class TestAnnotation < Test::Unit::TestCase
|
|
13
13
|
self.split(" ")
|
14
14
|
end
|
15
15
|
|
16
|
+
Document.define :lines do
|
17
|
+
self.split("\n")
|
18
|
+
end
|
19
|
+
|
16
20
|
$called_once = false
|
17
21
|
Document.define :persisted_words do
|
18
22
|
raise CalledOnce if $called_once
|
@@ -145,5 +149,22 @@ class TestAnnotation < Test::Unit::TestCase
|
|
145
149
|
|
146
150
|
assert text.ner.first.segid.include?("TEST:")
|
147
151
|
end
|
152
|
+
|
153
|
+
def test_sentence_words
|
154
|
+
text =<<-EOF
|
155
|
+
This is sentence 1
|
156
|
+
This is sentence 2
|
157
|
+
EOF
|
158
|
+
|
159
|
+
Document.setup(text)
|
160
|
+
|
161
|
+
words = text.words
|
162
|
+
numbers = words.select{|w| w =~ /\d/}
|
163
|
+
text.lines.each do |sentence|
|
164
|
+
Transformed.with_transform(sentence, numbers, "[NUM]") do
|
165
|
+
puts sentence
|
166
|
+
end
|
167
|
+
end
|
168
|
+
end
|
148
169
|
end
|
149
170
|
|
@@ -12,6 +12,7 @@ We found that TP53 is regulated by MDM2 in Homo
|
|
12
12
|
sapiens
|
13
13
|
EOF
|
14
14
|
|
15
|
+
Rbbt::Config.add_entry :java_mem, "2G", :gnp
|
15
16
|
mentions = GNormPlus.process({:file => text})
|
16
17
|
|
17
18
|
assert_equal 1, mentions.length
|
@@ -23,6 +24,7 @@ sapiens
|
|
23
24
|
We found that TP53 is regulated by MDM2 in Homo sapiens
|
24
25
|
EOF
|
25
26
|
|
27
|
+
Rbbt::Config.add_entry :java_mem, "2G", :gnp
|
26
28
|
mentions = GNormPlus.entities({:file => text})
|
27
29
|
assert mentions["file"].include?("TP53")
|
28
30
|
mentions["file"].each do |mention|
|
@@ -79,6 +79,23 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
79
79
|
assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
|
80
80
|
end
|
81
81
|
|
82
|
+
def test_entities_captures_repeat
|
83
|
+
sentence = "In a sentence I should find not this but this"
|
84
|
+
|
85
|
+
ner = RegExpNER.new({:this => /not this but (this)/})
|
86
|
+
matches = ner.entities(sentence)
|
87
|
+
assert sentence[0..matches.first.offset-1].include?('this')
|
88
|
+
end
|
89
|
+
|
90
|
+
|
91
|
+
def test_entities_named_captures
|
92
|
+
sentence = "In a sentence I should find not this but this"
|
93
|
+
|
94
|
+
ner = RegExpNER.new({:this => /(?<who>I) should find not this but (this)/})
|
95
|
+
matches = ner.entities(sentence)
|
96
|
+
end
|
97
|
+
|
98
|
+
|
82
99
|
|
83
100
|
def test_regexp_order
|
84
101
|
text =<<-EOF
|
@@ -12,7 +12,6 @@ sentence. This is
|
|
12
12
|
another broken sentence.
|
13
13
|
EOF
|
14
14
|
|
15
|
-
iii NLP.geniass_sentence_splitter(text)
|
16
15
|
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
17
16
|
end
|
18
17
|
|
@@ -37,7 +36,17 @@ sentence. This is
|
|
37
36
|
another broken sentence.
|
38
37
|
EOF
|
39
38
|
|
40
|
-
|
39
|
+
Log.with_severity 0 do
|
40
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
def test_sentence_cmi
|
45
|
+
text =<<-EOF
|
46
|
+
The COVID-19 infection was reported as the main cause of death and patients with a higher mortality risk were those aged ≥65 years [adjusted HR = 3.40 (95% CI 2.20-5.24)], with a higher disease severity [adjusted HR = 1.87 (95%CI 1.43-2.45)].
|
47
|
+
EOF
|
48
|
+
|
49
|
+
iii NLP.geniass_sentence_splitter(text)
|
41
50
|
end
|
42
51
|
end
|
43
52
|
|
@@ -73,6 +73,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
73
73
|
assert_equal original.gsub(/TP53/, 'GN'), a
|
74
74
|
end
|
75
75
|
|
76
|
+
Transformed.with_transform(a, [gene2], "GN") do
|
77
|
+
Transformed.with_transform(a, [gene1], "GN") do
|
78
|
+
assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
|
79
|
+
end
|
80
|
+
assert_equal original.gsub(/CDK5R1/, 'GN'), a
|
81
|
+
end
|
82
|
+
|
76
83
|
Transformed.with_transform(a, [gene1], "GN") do
|
77
84
|
Transformed.with_transform(a, [gene2], "LONG_GENE_PLACEHOLDER") do
|
78
85
|
assert_equal original.gsub(/TP53/, 'GN').sub('CDK5R1', "LONG_GENE_PLACEHOLDER"), a
|
@@ -144,7 +151,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
144
151
|
gene2.entity_type = "Protein"
|
145
152
|
|
146
153
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
147
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
|
154
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':45..50' title='Protein'>CDK5R1</span> protein", a
|
148
155
|
end
|
149
156
|
end
|
150
157
|
|
@@ -165,7 +172,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
165
172
|
gene2.entity_type = "Protein"
|
166
173
|
|
167
174
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
168
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
|
175
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':37..40' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':55..60' title='Protein'>CDK5R1</span> protein", a
|
169
176
|
end
|
170
177
|
end
|
171
178
|
|
@@ -185,9 +192,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
185
192
|
assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
|
186
193
|
|
187
194
|
Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
|
188
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
|
195
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
|
189
196
|
Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
|
190
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
|
197
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' attr-segid=':27..121' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
|
191
198
|
end
|
192
199
|
end
|
193
200
|
end
|
@@ -415,6 +422,5 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
|
|
415
422
|
end
|
416
423
|
|
417
424
|
end
|
418
|
-
|
419
425
|
end
|
420
426
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.
|
4
|
+
version: 1.3.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-02-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -175,45 +175,45 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
175
175
|
- !ruby/object:Gem::Version
|
176
176
|
version: '0'
|
177
177
|
requirements: []
|
178
|
-
rubygems_version: 3.1.
|
178
|
+
rubygems_version: 3.1.2
|
179
179
|
signing_key:
|
180
180
|
specification_version: 4
|
181
181
|
summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
|
182
182
|
test_files:
|
183
|
-
- test/
|
184
|
-
- test/
|
185
|
-
- test/rbbt/
|
183
|
+
- test/test_spaCy.rb
|
184
|
+
- test/test_helper.rb
|
185
|
+
- test/rbbt/bow/test_dictionary.rb
|
186
186
|
- test/rbbt/bow/test_bow.rb
|
187
187
|
- test/rbbt/bow/test_misc.rb
|
188
|
-
- test/rbbt/
|
189
|
-
- test/rbbt/
|
190
|
-
- test/rbbt/
|
188
|
+
- test/rbbt/segment/test_encoding.rb
|
189
|
+
- test/rbbt/segment/test_transformed.rb
|
190
|
+
- test/rbbt/segment/test_overlaps.rb
|
191
|
+
- test/rbbt/segment/test_named_entity.rb
|
192
|
+
- test/rbbt/segment/test_corpus.rb
|
193
|
+
- test/rbbt/segment/test_range_index.rb
|
194
|
+
- test/rbbt/segment/test_annotation.rb
|
195
|
+
- test/rbbt/entity/test_document.rb
|
191
196
|
- test/rbbt/document/corpus/test_pubmed.rb
|
192
197
|
- test/rbbt/document/test_corpus.rb
|
193
|
-
- test/rbbt/
|
198
|
+
- test/rbbt/document/test_annotation.rb
|
199
|
+
- test/rbbt/test_document.rb
|
194
200
|
- test/rbbt/ner/test_patterns.rb
|
195
|
-
- test/rbbt/ner/test_NER.rb
|
196
|
-
- test/rbbt/ner/test_abner.rb
|
197
201
|
- test/rbbt/ner/rnorm/test_tokens.rb
|
198
|
-
- test/rbbt/ner/test_rnorm.rb
|
199
|
-
- test/rbbt/ner/test_regexpNER.rb
|
200
202
|
- test/rbbt/ner/test_ngram_prefix_dictionary.rb
|
203
|
+
- test/rbbt/ner/test_token_trieNER.rb
|
204
|
+
- test/rbbt/ner/test_finder.rb
|
201
205
|
- test/rbbt/ner/test_brat.rb
|
206
|
+
- test/rbbt/ner/test_regexpNER.rb
|
202
207
|
- test/rbbt/ner/test_g_norm_plus.rb
|
208
|
+
- test/rbbt/ner/test_rnorm.rb
|
209
|
+
- test/rbbt/ner/test_linnaeus.rb
|
203
210
|
- test/rbbt/ner/test_chemical_tagger.rb
|
204
|
-
- test/rbbt/ner/
|
205
|
-
- test/rbbt/ner/
|
206
|
-
- test/rbbt/ner/test_finder.rb
|
211
|
+
- test/rbbt/ner/test_NER.rb
|
212
|
+
- test/rbbt/ner/test_abner.rb
|
207
213
|
- test/rbbt/ner/test_rner.rb
|
208
|
-
- test/rbbt/ner/test_linnaeus.rb
|
209
214
|
- test/rbbt/ner/test_oscar4.rb
|
215
|
+
- test/rbbt/ner/test_banner.rb
|
210
216
|
- test/rbbt/test_segment.rb
|
211
|
-
- test/rbbt/
|
212
|
-
- test/rbbt/
|
213
|
-
- test/rbbt/
|
214
|
-
- test/rbbt/segment/test_named_entity.rb
|
215
|
-
- test/rbbt/segment/test_encoding.rb
|
216
|
-
- test/rbbt/segment/test_range_index.rb
|
217
|
-
- test/rbbt/segment/test_corpus.rb
|
218
|
-
- test/test_spaCy.rb
|
219
|
-
- test/test_helper.rb
|
217
|
+
- test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
|
218
|
+
- test/rbbt/nlp/test_nlp.rb
|
219
|
+
- test/rbbt/nlp/genia/test_sentence_splitter.rb
|