rbbt-text 1.3.8 → 1.3.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/document/annotation.rb +67 -2
- data/lib/rbbt/document/corpus/pubmed.rb +6 -4
- data/lib/rbbt/document/corpus.rb +1 -1
- data/lib/rbbt/document.rb +4 -0
- data/lib/rbbt/ner/g_norm_plus.rb +2 -1
- data/lib/rbbt/ner/regexpNER.rb +30 -23
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +12 -4
- data/lib/rbbt/segment/annotation.rb +1 -0
- data/lib/rbbt/segment/named_entity.rb +2 -1
- data/lib/rbbt/segment/overlaps.rb +9 -1
- data/lib/rbbt/segment/transformed.rb +1 -1
- data/lib/rbbt/segment.rb +4 -0
- data/share/install/software/Geniass +21 -12
- data/share/rnorm/tokens_default +3 -0
- data/test/rbbt/document/test_annotation.rb +21 -0
- data/test/rbbt/ner/test_g_norm_plus.rb +2 -0
- data/test/rbbt/ner/test_regexpNER.rb +17 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +11 -2
- data/test/rbbt/segment/test_transformed.rb +11 -5
- metadata +27 -27
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2f10312d9b6598ddc9b6fa98b38909afdd575b33a497ae1ff3f17c7a9c6e37bf
|
|
4
|
+
data.tar.gz: f79c61c7e34dd113a2c5002342c0c2df92a4a28c770394bf2c456a34a2730cc7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a9fb4dc49c538a58a8aa04e81947df212668c5ef9097434fa7d3eff54dd17a8657f581451b64e6b247cb64428436823a305dd64ae6a5fed2126b92285c02ad81
|
|
7
|
+
data.tar.gz: 0d31423660cd232102aa2b9914dab61ff929cf02a37b5094bd58481cac733c167d0e4fcdb4b3025e41a4775bd8033566ed3f402f66c317b3955406d1a3d3eb6f
|
|
@@ -6,7 +6,9 @@ module Document
|
|
|
6
6
|
send :property, type do
|
|
7
7
|
segments = self.instance_exec &block
|
|
8
8
|
|
|
9
|
-
Segment.align(self, segments) unless segments.empty? ||
|
|
9
|
+
Segment.align(self, segments) unless segments.empty? ||
|
|
10
|
+
(Segment === segments && segments.offset) ||
|
|
11
|
+
(Array === segments && Segment === segments.first && segments.first.offset)
|
|
10
12
|
|
|
11
13
|
segments.each do |segment|
|
|
12
14
|
SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
|
|
@@ -17,6 +19,36 @@ module Document
|
|
|
17
19
|
|
|
18
20
|
segments
|
|
19
21
|
end
|
|
22
|
+
|
|
23
|
+
DocID.property type do
|
|
24
|
+
self.document.send(type)
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
SegID.property type do
|
|
28
|
+
self.overlaps(self.docid.send(type))
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
Segment.property type do
|
|
32
|
+
self.overlaps(self.docid.send(type))
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
seg_type = "segids_for_" + type.to_s
|
|
36
|
+
|
|
37
|
+
send :property, seg_type do
|
|
38
|
+
SegID.setup(self.send(type).collect{|s| s.segid })
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
DocID.property seg_type do
|
|
42
|
+
self.document.send(seg_type)
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
SegID.property seg_type do
|
|
46
|
+
self.overlaps(self.docid.send(seg_type))
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
Segment.property seg_type do
|
|
50
|
+
self.overlaps(self.docid.send(seg_type))
|
|
51
|
+
end
|
|
20
52
|
end
|
|
21
53
|
|
|
22
54
|
def self.define_multiple(type, &block)
|
|
@@ -28,7 +60,10 @@ module Document
|
|
|
28
60
|
doc_segments.each_with_index do |segments,i|
|
|
29
61
|
next if segments.nil?
|
|
30
62
|
document = list[i]
|
|
31
|
-
Segment.align(document, segments) unless segments.nil? ||
|
|
63
|
+
Segment.align(document, segments) unless segments.nil? ||
|
|
64
|
+
segments.empty? ||
|
|
65
|
+
(Segment === segments && segments.offset) ||
|
|
66
|
+
(Array === segments && Segment === segments.first && segments.first.offset)
|
|
32
67
|
|
|
33
68
|
segments.each do |segment|
|
|
34
69
|
SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
|
|
@@ -41,5 +76,35 @@ module Document
|
|
|
41
76
|
segments
|
|
42
77
|
end
|
|
43
78
|
end
|
|
79
|
+
|
|
80
|
+
DocID.property type do
|
|
81
|
+
self.document.send(type)
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
SegID.property type do
|
|
85
|
+
self.overlaps(self.docid.send(type))
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
Segment.property type do
|
|
89
|
+
self.overlaps(self.docid.send(type))
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
seg_type = "segids_for_" + type.to_s
|
|
93
|
+
|
|
94
|
+
send :property, seg_type do
|
|
95
|
+
SegID.setup(self.send(type).collect{|s| s.segid })
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
DocID.property seg_type do
|
|
99
|
+
self.document.send(seg_type)
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
SegID.property seg_type do
|
|
103
|
+
self.overlaps(self.docid.send(seg_type))
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
Segment.property seg_type do
|
|
107
|
+
self.overlaps(self.docid.send(seg_type))
|
|
108
|
+
end
|
|
44
109
|
end
|
|
45
110
|
end
|
|
@@ -2,8 +2,8 @@ require 'rbbt/sources/pubmed'
|
|
|
2
2
|
|
|
3
3
|
module Document::Corpus
|
|
4
4
|
PUBMED_NAMESPACE="PMID"
|
|
5
|
-
def add_pmid(pmid, type =
|
|
6
|
-
type = :
|
|
5
|
+
def add_pmid(pmid, type = :title_and_abstract, update = false)
|
|
6
|
+
type = :title_and_abstract if type.nil?
|
|
7
7
|
|
|
8
8
|
if ! (update || Array === pmid)
|
|
9
9
|
id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
|
|
@@ -16,9 +16,11 @@ module Document::Corpus
|
|
|
16
16
|
|
|
17
17
|
res = PubMed.get_article(pmids).collect do |pmid, article|
|
|
18
18
|
document = if type.to_sym == :abstract
|
|
19
|
-
Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid,
|
|
19
|
+
Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, type.to_sym , self, :corpus => self)
|
|
20
20
|
elsif type.to_sym == :title
|
|
21
|
-
Document.setup(article.title, PUBMED_NAMESPACE, pmid,
|
|
21
|
+
Document.setup(article.title || "", PUBMED_NAMESPACE, pmid, type.to_sym, self)
|
|
22
|
+
elsif type.to_sym == :title_and_abstract
|
|
23
|
+
Document.setup((article.title || "") + "\n\n" + (article.abstract || ""), PUBMED_NAMESPACE, pmid, type.to_sym, self)
|
|
22
24
|
else
|
|
23
25
|
raise "No FullText available for #{ pmid }" if article.full_text.nil?
|
|
24
26
|
Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)
|
data/lib/rbbt/document/corpus.rb
CHANGED
|
@@ -3,7 +3,7 @@ require 'rbbt-util'
|
|
|
3
3
|
module Document::Corpus
|
|
4
4
|
|
|
5
5
|
def self.setup(corpus)
|
|
6
|
-
corpus = Persist.open_tokyocabinet(corpus,
|
|
6
|
+
corpus = Persist.open_tokyocabinet(corpus, false, :single, "BDB") if String === corpus
|
|
7
7
|
corpus.extend Document::Corpus unless Document::Corpus === corpus
|
|
8
8
|
corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
|
|
9
9
|
corpus.close
|
data/lib/rbbt/document.rb
CHANGED
data/lib/rbbt/ner/g_norm_plus.rb
CHANGED
|
@@ -66,7 +66,8 @@ EOF
|
|
|
66
66
|
end
|
|
67
67
|
|
|
68
68
|
Open.write('config', CONFIG)
|
|
69
|
-
|
|
69
|
+
mem = Rbbt::Config.get(:java_mem, :GNormPlus, :g_norm_plus, :gnormplus, :gnp, :default => "2G")
|
|
70
|
+
CMD.cmd_log("java -Xmx#{mem} -Xms#{mem} -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
|
|
70
71
|
|
|
71
72
|
if texts.respond_to? :key_field
|
|
72
73
|
key_field = texts.key_field
|
data/lib/rbbt/ner/regexpNER.rb
CHANGED
|
@@ -10,34 +10,41 @@ class RegExpNER < NER
|
|
|
10
10
|
while matchdata = text.match(regexp)
|
|
11
11
|
pre = matchdata.pre_match
|
|
12
12
|
post = matchdata.post_match
|
|
13
|
-
match = matchdata[0]
|
|
14
13
|
|
|
15
|
-
if matchdata.
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
match = capture
|
|
20
|
-
pre << more_pre if more_pre
|
|
21
|
-
post = more_post << post if more_post
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
if match and not match.empty?
|
|
25
|
-
NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
|
|
14
|
+
if matchdata.named_captures.any?
|
|
15
|
+
match = matchdata[0]
|
|
16
|
+
code = matchdata.named_captures.collect{|k,v| [k,v] * "=" } * ";"
|
|
17
|
+
NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type, :code => code)
|
|
26
18
|
matches << match
|
|
19
|
+
eend = match.length + pre.length
|
|
20
|
+
text = text[eend..-1]
|
|
21
|
+
start += match.length + pre.length
|
|
22
|
+
elsif matchdata.captures.any?
|
|
23
|
+
match = matchdata.captures.first
|
|
24
|
+
offset, eend = matchdata.offset(1)
|
|
25
|
+
NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
|
|
26
|
+
matches << match
|
|
27
|
+
start += offset + match.length
|
|
28
|
+
text = text[eend..-1]
|
|
29
|
+
else
|
|
30
|
+
match = matchdata[0]
|
|
31
|
+
NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type)
|
|
32
|
+
matches << match
|
|
33
|
+
eend = match.length + pre.length
|
|
34
|
+
text = text[eend..-1]
|
|
35
|
+
start += match.length + pre.length
|
|
27
36
|
end
|
|
28
|
-
|
|
29
|
-
start += pre.length + match.length
|
|
30
|
-
text = post
|
|
31
37
|
end
|
|
32
38
|
|
|
33
39
|
matches
|
|
34
40
|
end
|
|
35
41
|
|
|
36
|
-
def self.match_regexp_list(text, regexp_list, type = nil)
|
|
42
|
+
def self.match_regexp_list(text, regexp_list, type = nil, split_on_matches = false)
|
|
37
43
|
matches = []
|
|
38
44
|
|
|
39
45
|
regexp_list.each do |regexp|
|
|
40
|
-
chunks = Segment.split(text, matches)
|
|
46
|
+
chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
|
|
47
|
+
chunks = Segment.split(text, [])
|
|
41
48
|
chunks.each do |chunk|
|
|
42
49
|
new_matches = match_regexp(chunk, regexp, type)
|
|
43
50
|
new_matches.each do |match| match.offset += chunk.offset; matches << match end
|
|
@@ -47,15 +54,15 @@ class RegExpNER < NER
|
|
|
47
54
|
matches
|
|
48
55
|
end
|
|
49
56
|
|
|
50
|
-
def self.match_regexp_hash(text, regexp_hash)
|
|
57
|
+
def self.match_regexp_hash(text, regexp_hash, split_on_matches = false)
|
|
51
58
|
matches = []
|
|
52
59
|
|
|
53
60
|
regexp_hash.each do |type, regexp_list|
|
|
54
61
|
regexp_list = [regexp_list] unless Array === regexp_list
|
|
55
|
-
chunks = Segment.split(text, matches)
|
|
62
|
+
chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
|
|
56
63
|
chunks.each do |chunk|
|
|
57
64
|
chunk_offset = chunk.offset
|
|
58
|
-
match_regexp_list(chunk, regexp_list, type).each do |match|
|
|
65
|
+
match_regexp_list(chunk, regexp_list, type, split_on_matches).each do |match|
|
|
59
66
|
match.offset = match.offset + chunk_offset;
|
|
60
67
|
matches << match
|
|
61
68
|
end
|
|
@@ -65,7 +72,7 @@ class RegExpNER < NER
|
|
|
65
72
|
matches
|
|
66
73
|
end
|
|
67
74
|
|
|
68
|
-
attr_accessor :regexps
|
|
75
|
+
attr_accessor :regexps, :split_on_matches
|
|
69
76
|
def initialize(regexps = {})
|
|
70
77
|
@regexps = regexps.collect{|p| p }
|
|
71
78
|
end
|
|
@@ -87,9 +94,9 @@ class RegExpNER < NER
|
|
|
87
94
|
end
|
|
88
95
|
|
|
89
96
|
def match(text)
|
|
90
|
-
matches = RegExpNER.match_regexp_hash(text, @regexps)
|
|
97
|
+
matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
|
|
91
98
|
matches.collect do |m|
|
|
92
|
-
NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
|
|
99
|
+
NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m.code || m)
|
|
93
100
|
end
|
|
94
101
|
end
|
|
95
102
|
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
require 'rbbt/nlp/nlp'
|
|
2
2
|
require 'rbbt/segment'
|
|
3
3
|
module NLP
|
|
4
|
+
|
|
4
5
|
Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
|
|
5
6
|
|
|
6
7
|
def self.returnFeatures(prevWord, delimiter, nextWord)
|
|
@@ -136,6 +137,7 @@ module NLP
|
|
|
136
137
|
end
|
|
137
138
|
|
|
138
139
|
def self.process_labels(marked_text, labels)
|
|
140
|
+
return "" if marked_text.empty? || labels.empty?
|
|
139
141
|
out = ""
|
|
140
142
|
|
|
141
143
|
count = 0
|
|
@@ -171,8 +173,17 @@ module NLP
|
|
|
171
173
|
end
|
|
172
174
|
|
|
173
175
|
def self.geniass_sentence_splitter_extension(text)
|
|
176
|
+
cleaned = text.gsub("\n",NEW_LINE_MASK)
|
|
177
|
+
events, marks = event_extraction(cleaned)
|
|
178
|
+
|
|
174
179
|
Rbbt.software.opt.Geniass.produce
|
|
175
|
-
|
|
180
|
+
begin
|
|
181
|
+
ENV["LD_LIBRARY_PATH"] = Rbbt.software.opt.Geniass.lib.find + ":" + ENV["LD_LIBRARY_PATH"]
|
|
182
|
+
require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
|
|
183
|
+
rescue LoadError
|
|
184
|
+
raise LoadError, "Geniass ruby module needs to be able to find #{Rbbt.software.opt.Geniass.lib.find} in LD_LIBRARY_PATH"
|
|
185
|
+
end
|
|
186
|
+
|
|
176
187
|
geniass = Geniass.new
|
|
177
188
|
if not geniass.geniass_is_loaded
|
|
178
189
|
Misc.in_dir Rbbt.software.opt.Geniass.find do
|
|
@@ -180,9 +191,6 @@ module NLP
|
|
|
180
191
|
end
|
|
181
192
|
end
|
|
182
193
|
|
|
183
|
-
cleaned = text.gsub("\n",NEW_LINE_MASK)
|
|
184
|
-
events, marks = event_extraction(cleaned)
|
|
185
|
-
|
|
186
194
|
labels = events.split(/\n/).collect{|line|
|
|
187
195
|
geniass.label(line)
|
|
188
196
|
}
|
|
@@ -23,13 +23,14 @@ Score: #{score.inspect}
|
|
|
23
23
|
end
|
|
24
24
|
|
|
25
25
|
def html
|
|
26
|
-
title = code.nil? ? entity_type : [entity_type, code].compact * "
|
|
26
|
+
title = code.nil? ? entity_type : [entity_type, code].compact * " - "
|
|
27
27
|
|
|
28
28
|
text = <<-EOF
|
|
29
29
|
<span class='Entity'\
|
|
30
30
|
#{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
|
|
31
31
|
#{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
|
|
32
32
|
#{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
|
|
33
|
+
#{segid.nil? ? "" : " attr-segid='#{segid}'"}\
|
|
33
34
|
#{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
|
|
34
35
|
>#{ self }</span>
|
|
35
36
|
EOF
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
module Segment
|
|
1
|
+
module SegmentRanges
|
|
2
2
|
def pull(offset)
|
|
3
3
|
if self.offset.nil? or offset.nil?
|
|
4
4
|
self.offset = nil
|
|
@@ -61,3 +61,11 @@ module Segment
|
|
|
61
61
|
end
|
|
62
62
|
end
|
|
63
63
|
end
|
|
64
|
+
|
|
65
|
+
module Segment
|
|
66
|
+
include SegmentRanges
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
module SegID
|
|
70
|
+
include SegmentRanges
|
|
71
|
+
end
|
data/lib/rbbt/segment.rb
CHANGED
|
@@ -27,11 +27,11 @@ mv /tmp/clean.Makefile Makefile
|
|
|
27
27
|
make geniass
|
|
28
28
|
make libgeniass.so
|
|
29
29
|
|
|
30
|
+
move_opt "$name" "$extra"
|
|
31
|
+
|
|
30
32
|
mkdir lib
|
|
31
33
|
mv libgeniass.so lib
|
|
32
34
|
|
|
33
|
-
build "$name" "$extra"
|
|
34
|
-
|
|
35
35
|
cd "$OPT_BUILD_DIR/$name"
|
|
36
36
|
|
|
37
37
|
mkdir ruby
|
|
@@ -43,8 +43,7 @@ create_makefile('Geniass')
|
|
|
43
43
|
EOF
|
|
44
44
|
|
|
45
45
|
cat > ruby/Geniass.cpp <<'EOF'
|
|
46
|
-
#include "rice/
|
|
47
|
-
#include "rice/String.hpp"
|
|
46
|
+
#include "rice/rice.hpp"
|
|
48
47
|
|
|
49
48
|
#include <iostream>
|
|
50
49
|
#include <iomanip>
|
|
@@ -59,10 +58,12 @@ cat > ruby/Geniass.cpp <<'EOF'
|
|
|
59
58
|
using namespace Rice;
|
|
60
59
|
using namespace std;
|
|
61
60
|
|
|
62
|
-
ME_Model model;
|
|
63
61
|
bool geniass_loaded = false;
|
|
64
62
|
|
|
65
|
-
|
|
63
|
+
bool geniass_is_loaded(Object self){ return(geniass_loaded); };
|
|
64
|
+
|
|
65
|
+
ME_Model model;
|
|
66
|
+
void load_geniass(Object self){
|
|
66
67
|
printf("loading model");
|
|
67
68
|
string modelFile = "model1-1.0";
|
|
68
69
|
model.load_from_file(modelFile.c_str());
|
|
@@ -70,8 +71,6 @@ void load_geniass(){
|
|
|
70
71
|
printf("..done\n");
|
|
71
72
|
}
|
|
72
73
|
|
|
73
|
-
bool geniass_is_loaded(){ return(geniass_loaded); };
|
|
74
|
-
|
|
75
74
|
void split(string& str, vector<string>& tokens)
|
|
76
75
|
{
|
|
77
76
|
istringstream in(str);
|
|
@@ -85,17 +84,23 @@ void split(string& str, vector<string>& tokens)
|
|
|
85
84
|
}
|
|
86
85
|
}
|
|
87
86
|
|
|
88
|
-
|
|
87
|
+
Object label(Object self, String rb_line){
|
|
89
88
|
vector<string> tokens;
|
|
90
|
-
split(line, tokens);
|
|
91
89
|
ME_Sample s;
|
|
92
90
|
|
|
91
|
+
string line = rb_line.c_str();
|
|
92
|
+
split(line, tokens);
|
|
93
|
+
|
|
93
94
|
for(vector<string>::const_iterator token = tokens.begin() + 1;
|
|
94
95
|
token != tokens.end(); ++token){
|
|
95
96
|
s.add_feature(*token);
|
|
96
97
|
}
|
|
98
|
+
|
|
97
99
|
(void) model.classify(s);
|
|
98
|
-
|
|
100
|
+
string label = s.label;
|
|
101
|
+
VALUE x;
|
|
102
|
+
x = rb_str_new_cstr(label.c_str());
|
|
103
|
+
return(x);
|
|
99
104
|
}
|
|
100
105
|
|
|
101
106
|
extern "C"
|
|
@@ -103,16 +108,20 @@ void Init_Geniass()
|
|
|
103
108
|
{
|
|
104
109
|
Class rb_cGeniass =
|
|
105
110
|
define_class("Geniass")
|
|
111
|
+
.define_method("geniass_is_loaded", &geniass_is_loaded)
|
|
106
112
|
.define_method("load_geniass", &load_geniass)
|
|
107
113
|
.define_method("label", &label)
|
|
108
|
-
|
|
114
|
+
;
|
|
109
115
|
}
|
|
110
116
|
|
|
117
|
+
|
|
111
118
|
EOF
|
|
112
119
|
|
|
113
120
|
cd ruby
|
|
114
121
|
ruby extconf.rb --with-geniass-dir="$OPT_DIR/$name"
|
|
115
122
|
make
|
|
116
123
|
|
|
124
|
+
setup "$name" "$extra"
|
|
125
|
+
|
|
117
126
|
|
|
118
127
|
|
data/share/rnorm/tokens_default
CHANGED
|
@@ -6,6 +6,7 @@ tokens do
|
|
|
6
6
|
|
|
7
7
|
# Some (possible) single letters first
|
|
8
8
|
receptor /^(?:receptor|r)s?$/i
|
|
9
|
+
activator /^(?:activator|p)s?$/i
|
|
9
10
|
protein /^(?:protein|p)s?$/i
|
|
10
11
|
roman /^[IV]+$/
|
|
11
12
|
greek_letter do |w| $inverse_greek[w.downcase] != nil end
|
|
@@ -58,6 +59,8 @@ comparisons do
|
|
|
58
59
|
|
|
59
60
|
diff.promoter -10
|
|
60
61
|
diff.receptor -10
|
|
62
|
+
diff.activator -10
|
|
63
|
+
|
|
61
64
|
diff.similar -10
|
|
62
65
|
diff.capital -10
|
|
63
66
|
|
|
@@ -13,6 +13,10 @@ class TestAnnotation < Test::Unit::TestCase
|
|
|
13
13
|
self.split(" ")
|
|
14
14
|
end
|
|
15
15
|
|
|
16
|
+
Document.define :lines do
|
|
17
|
+
self.split("\n")
|
|
18
|
+
end
|
|
19
|
+
|
|
16
20
|
$called_once = false
|
|
17
21
|
Document.define :persisted_words do
|
|
18
22
|
raise CalledOnce if $called_once
|
|
@@ -145,5 +149,22 @@ class TestAnnotation < Test::Unit::TestCase
|
|
|
145
149
|
|
|
146
150
|
assert text.ner.first.segid.include?("TEST:")
|
|
147
151
|
end
|
|
152
|
+
|
|
153
|
+
def test_sentence_words
|
|
154
|
+
text =<<-EOF
|
|
155
|
+
This is sentence 1
|
|
156
|
+
This is sentence 2
|
|
157
|
+
EOF
|
|
158
|
+
|
|
159
|
+
Document.setup(text)
|
|
160
|
+
|
|
161
|
+
words = text.words
|
|
162
|
+
numbers = words.select{|w| w =~ /\d/}
|
|
163
|
+
text.lines.each do |sentence|
|
|
164
|
+
Transformed.with_transform(sentence, numbers, "[NUM]") do
|
|
165
|
+
puts sentence
|
|
166
|
+
end
|
|
167
|
+
end
|
|
168
|
+
end
|
|
148
169
|
end
|
|
149
170
|
|
|
@@ -12,6 +12,7 @@ We found that TP53 is regulated by MDM2 in Homo
|
|
|
12
12
|
sapiens
|
|
13
13
|
EOF
|
|
14
14
|
|
|
15
|
+
Rbbt::Config.add_entry :java_mem, "2G", :gnp
|
|
15
16
|
mentions = GNormPlus.process({:file => text})
|
|
16
17
|
|
|
17
18
|
assert_equal 1, mentions.length
|
|
@@ -23,6 +24,7 @@ sapiens
|
|
|
23
24
|
We found that TP53 is regulated by MDM2 in Homo sapiens
|
|
24
25
|
EOF
|
|
25
26
|
|
|
27
|
+
Rbbt::Config.add_entry :java_mem, "2G", :gnp
|
|
26
28
|
mentions = GNormPlus.entities({:file => text})
|
|
27
29
|
assert mentions["file"].include?("TP53")
|
|
28
30
|
mentions["file"].each do |mention|
|
|
@@ -79,6 +79,23 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
|
79
79
|
assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
|
|
80
80
|
end
|
|
81
81
|
|
|
82
|
+
def test_entities_captures_repeat
|
|
83
|
+
sentence = "In a sentence I should find not this but this"
|
|
84
|
+
|
|
85
|
+
ner = RegExpNER.new({:this => /not this but (this)/})
|
|
86
|
+
matches = ner.entities(sentence)
|
|
87
|
+
assert sentence[0..matches.first.offset-1].include?('this')
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def test_entities_named_captures
|
|
92
|
+
sentence = "In a sentence I should find not this but this"
|
|
93
|
+
|
|
94
|
+
ner = RegExpNER.new({:this => /(?<who>I) should find not this but (this)/})
|
|
95
|
+
matches = ner.entities(sentence)
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
|
|
82
99
|
|
|
83
100
|
def test_regexp_order
|
|
84
101
|
text =<<-EOF
|
|
@@ -12,7 +12,6 @@ sentence. This is
|
|
|
12
12
|
another broken sentence.
|
|
13
13
|
EOF
|
|
14
14
|
|
|
15
|
-
iii NLP.geniass_sentence_splitter(text)
|
|
16
15
|
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
|
17
16
|
end
|
|
18
17
|
|
|
@@ -37,7 +36,17 @@ sentence. This is
|
|
|
37
36
|
another broken sentence.
|
|
38
37
|
EOF
|
|
39
38
|
|
|
40
|
-
|
|
39
|
+
Log.with_severity 0 do
|
|
40
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def test_sentence_cmi
|
|
45
|
+
text =<<-EOF
|
|
46
|
+
The COVID-19 infection was reported as the main cause of death and patients with a higher mortality risk were those aged ≥65 years [adjusted HR = 3.40 (95% CI 2.20-5.24)], with a higher disease severity [adjusted HR = 1.87 (95%CI 1.43-2.45)].
|
|
47
|
+
EOF
|
|
48
|
+
|
|
49
|
+
iii NLP.geniass_sentence_splitter(text)
|
|
41
50
|
end
|
|
42
51
|
end
|
|
43
52
|
|
|
@@ -73,6 +73,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
|
73
73
|
assert_equal original.gsub(/TP53/, 'GN'), a
|
|
74
74
|
end
|
|
75
75
|
|
|
76
|
+
Transformed.with_transform(a, [gene2], "GN") do
|
|
77
|
+
Transformed.with_transform(a, [gene1], "GN") do
|
|
78
|
+
assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
|
|
79
|
+
end
|
|
80
|
+
assert_equal original.gsub(/CDK5R1/, 'GN'), a
|
|
81
|
+
end
|
|
82
|
+
|
|
76
83
|
Transformed.with_transform(a, [gene1], "GN") do
|
|
77
84
|
Transformed.with_transform(a, [gene2], "LONG_GENE_PLACEHOLDER") do
|
|
78
85
|
assert_equal original.gsub(/TP53/, 'GN').sub('CDK5R1', "LONG_GENE_PLACEHOLDER"), a
|
|
@@ -144,7 +151,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
|
144
151
|
gene2.entity_type = "Protein"
|
|
145
152
|
|
|
146
153
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
|
147
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
|
|
154
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':45..50' title='Protein'>CDK5R1</span> protein", a
|
|
148
155
|
end
|
|
149
156
|
end
|
|
150
157
|
|
|
@@ -165,7 +172,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
|
165
172
|
gene2.entity_type = "Protein"
|
|
166
173
|
|
|
167
174
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
|
168
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
|
|
175
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':37..40' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':55..60' title='Protein'>CDK5R1</span> protein", a
|
|
169
176
|
end
|
|
170
177
|
end
|
|
171
178
|
|
|
@@ -185,9 +192,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
|
185
192
|
assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
|
|
186
193
|
|
|
187
194
|
Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
|
|
188
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
|
|
195
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
|
|
189
196
|
Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
|
|
190
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
|
|
197
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' attr-segid=':27..121' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
|
|
191
198
|
end
|
|
192
199
|
end
|
|
193
200
|
end
|
|
@@ -415,6 +422,5 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
|
|
|
415
422
|
end
|
|
416
423
|
|
|
417
424
|
end
|
|
418
|
-
|
|
419
425
|
end
|
|
420
426
|
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: rbbt-text
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.3.8
|
|
4
|
+
version: 1.3.10
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Miguel Vazquez
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2023-02-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rbbt-util
|
|
@@ -175,45 +175,45 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
175
175
|
- !ruby/object:Gem::Version
|
|
176
176
|
version: '0'
|
|
177
177
|
requirements: []
|
|
178
|
-
rubygems_version: 3.1.
|
|
178
|
+
rubygems_version: 3.1.2
|
|
179
179
|
signing_key:
|
|
180
180
|
specification_version: 4
|
|
181
181
|
summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
|
|
182
182
|
test_files:
|
|
183
|
-
- test/
|
|
184
|
-
- test/
|
|
185
|
-
- test/rbbt/
|
|
183
|
+
- test/test_spaCy.rb
|
|
184
|
+
- test/test_helper.rb
|
|
185
|
+
- test/rbbt/bow/test_dictionary.rb
|
|
186
186
|
- test/rbbt/bow/test_bow.rb
|
|
187
187
|
- test/rbbt/bow/test_misc.rb
|
|
188
|
-
- test/rbbt/
|
|
189
|
-
- test/rbbt/
|
|
190
|
-
- test/rbbt/
|
|
188
|
+
- test/rbbt/segment/test_encoding.rb
|
|
189
|
+
- test/rbbt/segment/test_transformed.rb
|
|
190
|
+
- test/rbbt/segment/test_overlaps.rb
|
|
191
|
+
- test/rbbt/segment/test_named_entity.rb
|
|
192
|
+
- test/rbbt/segment/test_corpus.rb
|
|
193
|
+
- test/rbbt/segment/test_range_index.rb
|
|
194
|
+
- test/rbbt/segment/test_annotation.rb
|
|
195
|
+
- test/rbbt/entity/test_document.rb
|
|
191
196
|
- test/rbbt/document/corpus/test_pubmed.rb
|
|
192
197
|
- test/rbbt/document/test_corpus.rb
|
|
193
|
-
- test/rbbt/
|
|
198
|
+
- test/rbbt/document/test_annotation.rb
|
|
199
|
+
- test/rbbt/test_document.rb
|
|
194
200
|
- test/rbbt/ner/test_patterns.rb
|
|
195
|
-
- test/rbbt/ner/test_NER.rb
|
|
196
|
-
- test/rbbt/ner/test_abner.rb
|
|
197
201
|
- test/rbbt/ner/rnorm/test_tokens.rb
|
|
198
|
-
- test/rbbt/ner/test_rnorm.rb
|
|
199
|
-
- test/rbbt/ner/test_regexpNER.rb
|
|
200
202
|
- test/rbbt/ner/test_ngram_prefix_dictionary.rb
|
|
203
|
+
- test/rbbt/ner/test_token_trieNER.rb
|
|
204
|
+
- test/rbbt/ner/test_finder.rb
|
|
201
205
|
- test/rbbt/ner/test_brat.rb
|
|
206
|
+
- test/rbbt/ner/test_regexpNER.rb
|
|
202
207
|
- test/rbbt/ner/test_g_norm_plus.rb
|
|
208
|
+
- test/rbbt/ner/test_rnorm.rb
|
|
209
|
+
- test/rbbt/ner/test_linnaeus.rb
|
|
203
210
|
- test/rbbt/ner/test_chemical_tagger.rb
|
|
204
|
-
- test/rbbt/ner/
|
|
205
|
-
- test/rbbt/ner/
|
|
206
|
-
- test/rbbt/ner/test_finder.rb
|
|
211
|
+
- test/rbbt/ner/test_NER.rb
|
|
212
|
+
- test/rbbt/ner/test_abner.rb
|
|
207
213
|
- test/rbbt/ner/test_rner.rb
|
|
208
|
-
- test/rbbt/ner/test_linnaeus.rb
|
|
209
214
|
- test/rbbt/ner/test_oscar4.rb
|
|
215
|
+
- test/rbbt/ner/test_banner.rb
|
|
210
216
|
- test/rbbt/test_segment.rb
|
|
211
|
-
- test/rbbt/
|
|
212
|
-
- test/rbbt/
|
|
213
|
-
- test/rbbt/
|
|
214
|
-
- test/rbbt/segment/test_named_entity.rb
|
|
215
|
-
- test/rbbt/segment/test_encoding.rb
|
|
216
|
-
- test/rbbt/segment/test_range_index.rb
|
|
217
|
-
- test/rbbt/segment/test_corpus.rb
|
|
218
|
-
- test/test_spaCy.rb
|
|
219
|
-
- test/test_helper.rb
|
|
217
|
+
- test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
|
|
218
|
+
- test/rbbt/nlp/test_nlp.rb
|
|
219
|
+
- test/rbbt/nlp/genia/test_sentence_splitter.rb
|