rbbt-text 1.3.4 → 1.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/document.rb +6 -0
- data/lib/rbbt/document/annotation.rb +2 -2
- data/lib/rbbt/document/corpus.rb +5 -3
- data/lib/rbbt/document/corpus/pubmed.rb +1 -0
- data/lib/rbbt/ner/abner.rb +3 -2
- data/lib/rbbt/ner/banner.rb +3 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/linnaeus.rb +2 -1
- data/lib/rbbt/ner/rner.rb +229 -0
- data/lib/rbbt/ner/token_trieNER.rb +2 -1
- data/lib/rbbt/nlp/spaCy.rb +158 -15
- data/lib/rbbt/relationship.rb +24 -0
- data/lib/rbbt/segment.rb +3 -0
- data/lib/rbbt/segment/named_entity.rb +4 -0
- data/lib/rbbt/segment/range_index.rb +1 -1
- data/share/rner/config.rb +51 -0
- data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
- data/test/rbbt/document/test_annotation.rb +10 -1
- data/test/rbbt/document/test_corpus.rb +14 -0
- data/test/rbbt/ner/test_rner.rb +132 -0
- data/test/rbbt/segment/test_named_entity.rb +2 -1
- data/test/rbbt/segment/test_transformed.rb +4 -4
- data/test/test_spaCy.rb +113 -1
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '0846f900d745dd27df8006eecbc9d294f9f38a23dd76001de2a5dc0313db7e22'
|
4
|
+
data.tar.gz: 675985882a6c8b9813f620d7ef0a555efa5c148c7c2fe36e0030f84f3fd88cf0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dfd9c333b94181496134b825c63d6e93a0390f81d426526f79c00cf12556021b60004b29b57ca9b0b274141937027f7bc780552a60de007e5f790b19910354c0
|
7
|
+
data.tar.gz: 205beeb8829c8358fd29c0a18351522e566106e24220af3d7bec3676694d37d682b92243e4fd4cd495b542f9945a28cf8585e587342672d31779d0b21b53ae4e
|
data/lib/rbbt/document.rb
CHANGED
@@ -13,7 +13,7 @@ module Document
|
|
13
13
|
end
|
14
14
|
|
15
15
|
docid = self.docid
|
16
|
-
segments.each{|s| s.docid = docid
|
16
|
+
segments.each{|s| s.docid = docid }
|
17
17
|
|
18
18
|
segments
|
19
19
|
end
|
@@ -36,7 +36,7 @@ module Document
|
|
36
36
|
|
37
37
|
docid = document.docid
|
38
38
|
|
39
|
-
segments.each{|s| s.docid = docid
|
39
|
+
segments.each{|s| s.docid = docid }
|
40
40
|
|
41
41
|
segments
|
42
42
|
end
|
data/lib/rbbt/document/corpus.rb
CHANGED
@@ -3,6 +3,7 @@ require 'rbbt-util'
|
|
3
3
|
module Document::Corpus
|
4
4
|
|
5
5
|
def self.setup(corpus)
|
6
|
+
corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
|
6
7
|
corpus.extend Document::Corpus unless Document::Corpus === corpus
|
7
8
|
corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
|
8
9
|
corpus
|
@@ -16,7 +17,8 @@ module Document::Corpus
|
|
16
17
|
end
|
17
18
|
end
|
18
19
|
|
19
|
-
def docids(prefix)
|
20
|
+
def docids(*prefix)
|
21
|
+
prefix = prefix * ":"
|
20
22
|
prefix += ":" unless prefix == :all || prefix[-1] == ":"
|
21
23
|
docids = self.read_and_close do
|
22
24
|
prefix == :all ? self.keys : self.prefix(prefix)
|
@@ -24,8 +26,8 @@ module Document::Corpus
|
|
24
26
|
DocID.setup(docids, :corpus => self)
|
25
27
|
end
|
26
28
|
|
27
|
-
def documents(prefix)
|
28
|
-
self.docids(prefix).document
|
29
|
+
def documents(*prefix)
|
30
|
+
self.docids(*prefix).document
|
29
31
|
end
|
30
32
|
|
31
33
|
def [](*args)
|
data/lib/rbbt/ner/abner.rb
CHANGED
@@ -39,14 +39,15 @@ class Abner < NER
|
|
39
39
|
types = res[1]
|
40
40
|
strings = res[0]
|
41
41
|
|
42
|
+
docid = Misc.digest(text)
|
42
43
|
global_offset = 0
|
43
44
|
strings.zip(types).collect do |mention, type|
|
44
45
|
mention = mention.to_s;
|
45
46
|
offset = text.index(mention)
|
46
47
|
if offset.nil?
|
47
|
-
NamedEntity.setup(mention,
|
48
|
+
NamedEntity.setup(mention, :docid => docid, :entity_type => type)
|
48
49
|
else
|
49
|
-
NamedEntity.setup(mention, offset + global_offset, type.to_s)
|
50
|
+
NamedEntity.setup(mention, :offset => offset + global_offset, :docid => docid, :entity_type => type.to_s)
|
50
51
|
text = text[offset + mention.length..-1]
|
51
52
|
global_offset += offset + mention.length
|
52
53
|
end
|
data/lib/rbbt/ner/banner.rb
CHANGED
@@ -55,6 +55,7 @@ class Banner < NER
|
|
55
55
|
# text.
|
56
56
|
def match(text)
|
57
57
|
return [] if text.nil?
|
58
|
+
text = text.dup if text.frozen?
|
58
59
|
text.gsub!(/\n/,' ')
|
59
60
|
text.gsub!(/\|/,'/') # Character | gives an error
|
60
61
|
return [] if text.strip.empty?
|
@@ -66,6 +67,7 @@ class Banner < NER
|
|
66
67
|
@parenPP.postProcess(sentence)
|
67
68
|
tagged = sentence.getSGML
|
68
69
|
|
70
|
+
docid = Misc.digest text
|
69
71
|
res = tagged.scan(/<GENE>.*?<\/GENE>/).
|
70
72
|
collect{|r|
|
71
73
|
r.match(/<GENE>(.*?)<\/GENE>/)
|
@@ -73,7 +75,7 @@ class Banner < NER
|
|
73
75
|
mention.sub!(/^\s*/,'')
|
74
76
|
mention.sub!(/\s*$/,'')
|
75
77
|
offset = text.index(mention)
|
76
|
-
NamedEntity.setup(mention, offset, 'GENE')
|
78
|
+
NamedEntity.setup(mention, :offset => offset, :docid => docid, :entity_type => 'GENE')
|
77
79
|
mention
|
78
80
|
}
|
79
81
|
res
|
data/lib/rbbt/ner/brat.rb
CHANGED
data/lib/rbbt/ner/linnaeus.rb
CHANGED
@@ -31,7 +31,8 @@ module Linnaeus
|
|
31
31
|
init unless defined? @@Matcher
|
32
32
|
|
33
33
|
@@Matcher.match(text).toArray().collect do |mention|
|
34
|
-
|
34
|
+
best_id, best_prob = mention.ids().zip(mention.probabilities()).sort_by{|i,p| p.to_f }.last
|
35
|
+
NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => best_id, :score => best_prob)
|
35
36
|
end
|
36
37
|
end
|
37
38
|
end
|
@@ -0,0 +1,229 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/open'
|
3
|
+
require 'rbbt/util/misc'
|
4
|
+
require 'rbbt/util/simpleDSL'
|
5
|
+
|
6
|
+
class NERFeatures
|
7
|
+
include SimpleDSL
|
8
|
+
|
9
|
+
def self.tokens(text)
|
10
|
+
text.scan(/
|
11
|
+
\w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
|
12
|
+
\w-\w*|
|
13
|
+
\w+-[A-Z](?!\w)|
|
14
|
+
\w+|
|
15
|
+
[.,()\/\[\]{}'"+-]
|
16
|
+
/x)
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.reverse(text)
|
20
|
+
tokens(text).reverse.join(" ")
|
21
|
+
end
|
22
|
+
|
23
|
+
def define(name, *args, &block)
|
24
|
+
action = args[0] || block || /#{name.to_s}s?/i
|
25
|
+
raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
|
26
|
+
|
27
|
+
@types[name.to_s] = action
|
28
|
+
@order.push name.to_s
|
29
|
+
|
30
|
+
name.to_s
|
31
|
+
end
|
32
|
+
|
33
|
+
attr_accessor :reverse
|
34
|
+
# Builds a feature extractor from a feature-definition DSL.
#
# file    - path to a configuration file written in the DSL. When neither
#           a file nor a block is given, the bundled share/rner/config.rb
#           is used.
# reverse - when true, text_features/tagged_features process the tokens of
#           the text back to front.
# block   - inline DSL definition, used instead of a file.
def initialize(file = nil, reverse = false, &block)
  @types = {}     # feature name => Regexp or Proc
  @order = []     # feature names in definition order
  @context = []   # features that are also emitted for neighbouring tokens
  @reverse = reverse

  # The default configuration ships under share/rner (not share/ner).
  file = Rbbt.share.rner['config.rb'].find unless file || block

  # `parse` comes from SimpleDSL; every DSL statement is routed to #define.
  parse(:define, file, &block)
end
|
44
|
+
|
45
|
+
def config
|
46
|
+
@config[:define]
|
47
|
+
end
|
48
|
+
|
49
|
+
def window(positions)
|
50
|
+
@window = positions
|
51
|
+
end
|
52
|
+
|
53
|
+
def context(name, &block)
|
54
|
+
if name.is_a? Array
|
55
|
+
@context += name
|
56
|
+
else
|
57
|
+
@context.push name
|
58
|
+
|
59
|
+
# The block might be wrongly assigned to this function
|
60
|
+
# instead of the actual definition, fix that.
|
61
|
+
if block
|
62
|
+
@types[name] = block
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
def direction(dir)
|
68
|
+
if dir.to_sym == :reverse
|
69
|
+
@reverse = true
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Computes the feature vector of a single token.
#
# word - the token text.
#
# Returns an Array whose first element is the word itself, followed by one
# value per defined feature, in definition order:
# * Proc features contribute whatever the proc returns;
# * Regexp features contribute the first capture group when it matched,
#   true when the pattern matched without a capture, and false otherwise.
def features(word)
  @order.each_with_object([word]) do |name, values|
    action = @types[name]

    value =
      if action.is_a?(Proc)
        action.call(word)
      else
        match = action.match(word)
        match ? (match[1] || true) : false
      end

    values.push(value)
  end
end
|
95
|
+
|
96
|
+
# Renders the feature template text that #train writes out for crf_learn.
#
# window - relative token positions for which context features are also
#          emitted; defaults to the configured window, or [1, -1].
#
# Every feature produces a unigram line for the current token. Features
# registered through `context` additionally produce one line per window
# position. The template always ends with a single "B" line.
#
# Returns the template as a String, one entry per line.
def template(window=nil)
  window ||= @window || [1,-1]

  lines = []
  @order.each_with_index do |feat, index|
    column = index + 1 # column 0 of a feature row is the token itself
    lines << "U#{ feat }: %x[0,#{ column }]"

    next unless @context.include?(feat)

    window.each do |p|
      lines << "U#{ feat }##{ p }: %x[#{ p },#{ column }]"
    end
  end
  lines << "B"

  lines.join("\n") + "\n"
end
|
116
|
+
|
117
|
+
|
118
|
+
def text_features(text, positive = nil)
|
119
|
+
text = self.class.reverse(text) if @reverse
|
120
|
+
initial = true
|
121
|
+
self.class.tokens(text).collect{|token|
|
122
|
+
features = features(token)
|
123
|
+
if !positive.nil?
|
124
|
+
features << (positive ? (initial ? 1 : 2) : 0)
|
125
|
+
initial = false
|
126
|
+
end
|
127
|
+
features
|
128
|
+
}
|
129
|
+
end
|
130
|
+
|
131
|
+
# Computes labelled features for a text in which some mentions are known.
#
# text     - the text to process.
# mentions - Array of mention strings occurring in the text (nil or empty
#            means no mention).
#
# The text is split around the mentions; chunks that are mentions are
# passed to #text_features as positive, the rest as negative. With
# @reverse set, chunk results are prepended instead of appended, so the
# overall token order is reversed.
#
# Returns the concatenated Array of per-token feature rows.
def tagged_features(text, mentions)
  mentions ||= []
  # Placeholder that is not expected to occur, so the split below still
  # has a valid pattern when there are no mentions.
  mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?

  # Words of a mention may be separated by any run of whitespace in the
  # text (line breaks, repeated spaces). Each word is escaped on its own:
  # Regexp.quote turns a space into "\ ", so substituting "\s" on the
  # quoted mention would never match.
  re = mentions.collect{|mention|
    mention.split(/\s+/, -1).collect{|part| Regexp.quote(part) }.join('\s+')
  }.join("|")

  positive = false
  features = []
  # Splitting on a captured group keeps the mentions in the result, so
  # chunks alternate between non-mention and mention text.
  text.split(/(#{re})/).each do |chunk|
    chunk_features = text_features(chunk, positive)
    positive = !positive
    features = @reverse ? chunk_features + features : features + chunk_features
  end
  features
end
|
152
|
+
|
153
|
+
# Trains a CRF++ model with the bundled crf_learn binary.
#
# features - path of the training file holding the feature rows.
# model    - path where the model is written; the DSL configuration used
#            is saved next to it as "<model>.config".
#
# The template is written to a temporary file that is removed even when
# training or writing the configuration raises.
def train(features, model)
  tmp_template = TmpFile.tmp_file("template-")
  Open.write(tmp_template,template)

  # Arguments are passed as a list so no shell is involved; paths holding
  # quotes, spaces or shell metacharacters can no longer break the command.
  crf_learn = File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')
  system(crf_learn, tmp_template.to_s, features.to_s, model.to_s)

  Open.write(model + '.config',config)
ensure
  FileUtils.rm_f(tmp_template) if tmp_template
end
|
162
|
+
|
163
|
+
end
|
164
|
+
|
165
|
+
class NER
|
166
|
+
|
167
|
+
# Loads the CRF++ bindings and prepares a tagger for the given model.
#
# model - path to a trained model; its feature configuration is read from
#         "<model>.config". Defaults to ner/model/BC2 under Rbbt.datadir.
def initialize(model = nil)
  # Prefer an installed CRFPP binding and fall back to the copy bundled
  # under third_party. Only a failed load triggers the fallback, so
  # interrupts and exit requests are not swallowed.
  begin
    require 'CRFPP'
  rescue LoadError
    require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
  end

  model ||= File.join(Rbbt.datadir, 'ner/model/BC2')

  @parser = NERFeatures.new(model + '.config')
  @reverse = @parser.reverse
  @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
end
|
180
|
+
|
181
|
+
def extract(text)
|
182
|
+
features = @parser.text_features(text)
|
183
|
+
|
184
|
+
@tagger.clear
|
185
|
+
features.each{|feats|
|
186
|
+
@tagger.add(feats.join(" "))
|
187
|
+
}
|
188
|
+
|
189
|
+
@tagger.parse
|
190
|
+
|
191
|
+
found = []
|
192
|
+
mention = []
|
193
|
+
|
194
|
+
@tagger.size.times{|i|
|
195
|
+
label = @tagger.y(i)
|
196
|
+
word = @tagger.x(i,0)
|
197
|
+
|
198
|
+
if word == ')'
|
199
|
+
mention.push(')') if mention.join =~ /\(/
|
200
|
+
next
|
201
|
+
end
|
202
|
+
|
203
|
+
case label
|
204
|
+
when 1
|
205
|
+
if mention.any? && ( mention.join(" ").is_special? || mention.select{|m| m.is_special?}.any?)
|
206
|
+
found.push(mention)
|
207
|
+
mention = []
|
208
|
+
end
|
209
|
+
mention.push(word)
|
210
|
+
when 2
|
211
|
+
mention.push(word)
|
212
|
+
when 0
|
213
|
+
found.push(mention) if mention.any?
|
214
|
+
mention = []
|
215
|
+
end
|
216
|
+
}
|
217
|
+
|
218
|
+
found << mention if mention.any?
|
219
|
+
|
220
|
+
found.collect{|list|
|
221
|
+
list = list.reverse if @reverse
|
222
|
+
list.join(" ")
|
223
|
+
}
|
224
|
+
end
|
225
|
+
|
226
|
+
end
|
227
|
+
|
228
|
+
|
229
|
+
|
@@ -249,7 +249,8 @@ class TokenTrieNER < NER
|
|
249
249
|
match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
|
250
250
|
}
|
251
251
|
|
252
|
-
|
252
|
+
type = type.first
|
253
|
+
NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes, :type => type)
|
253
254
|
end
|
254
255
|
|
255
256
|
attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
|
data/lib/rbbt/nlp/spaCy.rb
CHANGED
@@ -2,30 +2,55 @@ require 'rbbt/segment'
|
|
2
2
|
require 'rbbt/document'
|
3
3
|
require 'rbbt/segment/annotation'
|
4
4
|
require 'rbbt/util/python'
|
5
|
+
require 'rbbt/network/paths'
|
5
6
|
|
6
7
|
module SpaCy
|
7
8
|
|
8
|
-
|
9
|
+
TOKEN_PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
|
10
|
+
CHUNK_PROPERTIES = %w(lemma_)
|
9
11
|
|
10
|
-
def self.
|
12
|
+
def self.nlp(lang = 'en_core_web_md')
|
13
|
+
@@nlp ||= {}
|
14
|
+
@@nlp[lang] ||= RbbtPython.run :spacy do
|
15
|
+
spacy.load(lang)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.tokens(text, lang = 'en_core_web_sm')
|
11
20
|
|
12
21
|
tokens = []
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
22
|
+
|
23
|
+
nlp = nlp(lang)
|
24
|
+
doc = nlp.call(text)
|
25
|
+
|
26
|
+
doc.__len__.times do |i|
|
27
|
+
tokens << doc.__getitem__(i)
|
28
|
+
end
|
29
|
+
|
30
|
+
tokens
|
31
|
+
end
|
32
|
+
|
33
|
+
def self.chunks(text, lang = 'en_core_web_sm')
|
34
|
+
|
35
|
+
tokens = []
|
36
|
+
nlp = nlp(lang)
|
37
|
+
|
38
|
+
doc = nlp.call(text)
|
39
|
+
chunks = doc.noun_chunks.__iter__
|
40
|
+
|
41
|
+
RbbtPython.iterate chunks do |item|
|
42
|
+
tokens << item
|
19
43
|
end
|
44
|
+
|
20
45
|
tokens
|
21
46
|
end
|
22
47
|
|
23
|
-
def self.segments(text, lang = '
|
24
|
-
docid = text.docid if Document === text
|
48
|
+
def self.segments(text, lang = 'en_core_web_sm')
|
49
|
+
docid = text.docid if Document === text
|
25
50
|
corpus = text.corpus if Document === text
|
26
51
|
tokens = self.tokens(text, lang).collect do |token|
|
27
52
|
info = {}
|
28
|
-
|
53
|
+
TOKEN_PROPERTIES.each do |p|
|
29
54
|
info[p] = token.instance_eval(p.to_s)
|
30
55
|
end
|
31
56
|
info[:type] = "SpaCy"
|
@@ -35,7 +60,120 @@ module SpaCy
|
|
35
60
|
info[:corpus] = corpus if corpus
|
36
61
|
SpaCyToken.setup(token.text, info)
|
37
62
|
end
|
38
|
-
|
63
|
+
|
64
|
+
tokens
|
65
|
+
end
|
66
|
+
|
67
|
+
def self.chunk_segments(text, lang = 'en_core_web_sm')
|
68
|
+
docid = text.docid if Document === text
|
69
|
+
corpus = text.corpus if Document === text
|
70
|
+
chunks = self.chunks(text, lang).collect do |chunk|
|
71
|
+
info = {}
|
72
|
+
CHUNK_PROPERTIES.each do |p|
|
73
|
+
info[p] = chunk.instance_eval(p.to_s)
|
74
|
+
end
|
75
|
+
start = eend = nil
|
76
|
+
deps = []
|
77
|
+
RbbtPython.iterate chunk.__iter__ do |token|
|
78
|
+
start = token.idx if start.nil?
|
79
|
+
eend = start + chunk.text.length if eend.nil?
|
80
|
+
deps << token.idx.to_s + ":" + token.dep_ + "->" + token.head.idx.to_s if token.head.idx < start || token.head.idx > eend
|
81
|
+
end
|
82
|
+
info[:type] = "SpaCy"
|
83
|
+
info[:offset] = chunk.__iter__.__next__.idx
|
84
|
+
info[:dep] = deps * ";"
|
85
|
+
info[:docid] = docid if docid
|
86
|
+
info[:corpus] = corpus if corpus
|
87
|
+
SpaCySpan.setup(chunk.text, info)
|
88
|
+
end
|
89
|
+
|
90
|
+
chunks
|
91
|
+
end
|
92
|
+
|
93
|
+
def self.dep_graph(text, reverse = false, lang = 'en_core_web_md')
|
94
|
+
tokens = self.segments(text, lang)
|
95
|
+
index = Segment.index(tokens)
|
96
|
+
associations = {}
|
97
|
+
tokens.each do |token|
|
98
|
+
type, target_pos = token.dep.split("->")
|
99
|
+
target_tokens = index[target_pos.to_i]
|
100
|
+
associations[token.segid] = target_tokens
|
101
|
+
end
|
102
|
+
|
103
|
+
if reverse
|
104
|
+
old = associations.dup
|
105
|
+
old.each do |s,ts|
|
106
|
+
ts.each do |t|
|
107
|
+
associations[t] ||= []
|
108
|
+
associations[t] += [s] unless associations[t].include?(s)
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
associations
|
114
|
+
end
|
115
|
+
|
116
|
+
def self.chunk_dep_graph(text, reverse = false, lang = 'en_core_web_md')
|
117
|
+
associations = dep_graph(text, false, lang)
|
118
|
+
|
119
|
+
chunks = self.chunk_segments(text, lang)
|
120
|
+
tokens = self.segments(text, lang)
|
121
|
+
index = Segment.index(tokens + chunks)
|
122
|
+
|
123
|
+
chunks.each do |chunk|
|
124
|
+
target_token_ids = chunk.dep.split(";").collect do|dep|
|
125
|
+
type, target_pos = dep.split("->")
|
126
|
+
index[target_pos.to_i]
|
127
|
+
end.flatten
|
128
|
+
|
129
|
+
target_tokens = target_token_ids.collect do |target_token_id|
|
130
|
+
range = Range.new(*target_token_id.split(":").last.split("..").map(&:to_i))
|
131
|
+
range.collect do |pos|
|
132
|
+
index[pos]
|
133
|
+
end.uniq
|
134
|
+
end.flatten
|
135
|
+
associations[chunk.segid] = target_tokens
|
136
|
+
end
|
137
|
+
|
138
|
+
if reverse
|
139
|
+
old = associations.dup
|
140
|
+
old.each do |s,ts|
|
141
|
+
ts.each do |t|
|
142
|
+
associations[t] ||= []
|
143
|
+
associations[t] += [s] unless associations[t].include?(s)
|
144
|
+
end
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
associations
|
149
|
+
end
|
150
|
+
|
151
|
+
def self.paths(text, source, target, reverse = true, lang = 'en_core_web_md')
|
152
|
+
graph = SpaCy.chunk_dep_graph(text, reverse, lang)
|
153
|
+
|
154
|
+
chunk_index = Segment.index(SpaCy.chunk_segments(text, lang))
|
155
|
+
|
156
|
+
source_id = chunk_index[source.offset].first || source.segid
|
157
|
+
target_id = chunk_index[target.offset].first || target.segid
|
158
|
+
|
159
|
+
path = Paths.dijkstra(graph, source_id, [target_id])
|
160
|
+
|
161
|
+
return nil if path.nil?
|
162
|
+
|
163
|
+
path.reverse
|
164
|
+
end
|
165
|
+
|
166
|
+
# Expands a base spaCy configuration with `spacy init fill-config`.
#
# base   - the base configuration, handed to TmpFile.with_file
#          (presumably its content is written to a temporary file;
#          confirm against TmpFile).
# target - optional path where the filled configuration is written.
#
# With a target, the command writes the result there. Without one, the
# result is written to a temporary file and its content is returned.
def self.config(base, target = nil)
  TmpFile.with_file(base) do |baseconfig|
    if target
      CMD.cmd(:spacy, "init fill-config #{baseconfig} #{target}")
    else
      TmpFile.with_file do |tmptarget|
        CMD.cmd(:spacy, "init fill-config #{baseconfig} #{tmptarget}")
        # Read back the file the command just wrote.
        Open.read(tmptarget)
      end
    end
  end
end
|
40
178
|
end
|
41
179
|
|
@@ -43,10 +181,15 @@ module SpaCyToken
|
|
43
181
|
extend Entity
|
44
182
|
include SegmentAnnotation
|
45
183
|
|
46
|
-
self.annotation *SpaCy::
|
184
|
+
self.annotation *SpaCy::TOKEN_PROPERTIES
|
47
185
|
self.annotation :dep
|
48
186
|
end
|
49
187
|
|
50
|
-
|
51
|
-
|
188
|
+
module SpaCySpan
|
189
|
+
extend Entity
|
190
|
+
include SegmentAnnotation
|
191
|
+
|
192
|
+
self.annotation *SpaCy::CHUNK_PROPERTIES
|
193
|
+
self.annotation :dep
|
52
194
|
end
|
195
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'rbbt/segment'
|
2
|
+
|
3
|
+
module Relationship
|
4
|
+
extend Annotation
|
5
|
+
self.annotation :segment
|
6
|
+
self.annotation :terms
|
7
|
+
self.annotation :type
|
8
|
+
|
9
|
+
def text
|
10
|
+
if segment
|
11
|
+
segment
|
12
|
+
else
|
13
|
+
type + ": " + terms * ", "
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def html
|
18
|
+
text = <<-EOF
|
19
|
+
<span class='Relationship'\
|
20
|
+
>#{ self.text }</span>
|
21
|
+
EOF
|
22
|
+
text.chomp
|
23
|
+
end
|
24
|
+
end
|
data/lib/rbbt/segment.rb
CHANGED
@@ -6,7 +6,7 @@ module Segment::RangeIndex
|
|
6
6
|
SegID.setup(res, :corpus => corpus)
|
7
7
|
end
|
8
8
|
|
9
|
-
def self.index(segments, corpus, persist_file = :memory)
|
9
|
+
def self.index(segments, corpus = nil, persist_file = :memory)
|
10
10
|
segments = segments.values.flatten if Hash === segments
|
11
11
|
|
12
12
|
annotation_index =
|
@@ -0,0 +1,51 @@
|
|
1
|
+
isLetters /^[A-Z]+$/i
|
2
|
+
isUpper /^[A-Z]+$/
|
3
|
+
isLower /^[a-z]+$/
|
4
|
+
isDigits /^[0-9]+$/i
|
5
|
+
isRoman /^[IVX]+$/
|
6
|
+
isGreek /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
|
7
|
+
isPunctuation /^[,.;]$/
|
8
|
+
isDelim /^[\/()\[\]{}\-]$/
|
9
|
+
isNonWord /^[^\w]+$/
|
10
|
+
# Group the alternatives so both anchors apply to each of them; the group is
# non-capturing so the feature value stays a boolean.
isConjunction /^(?:and|or|&|,)$/
|
11
|
+
|
12
|
+
hasLetters /[A-Z]/i
|
13
|
+
hasUpper /.[A-Z]/
|
14
|
+
hasLower /[a-z]/
|
15
|
+
hasDigits /[0-9]/i
|
16
|
+
hasGreek /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
|
17
|
+
hasPunctuation /[,.;]/
|
18
|
+
hasDelim /[\/()\[\]{}\-]/
|
19
|
+
hasNonWord /[^\w]/
|
20
|
+
caspMix /[a-z].[A-Z]/
|
21
|
+
keywords /(?:protein|gene|domain|ase)s?$/
|
22
|
+
hasSuffix /[a-z][A-Z0-9]$/
|
23
|
+
|
24
|
+
numLetters do |w| w.scan(/[A-Z]/i).length end
|
25
|
+
numDigits do |w| w.scan(/[0-9]/).length end
|
26
|
+
#
|
27
|
+
prefix_3 /^(...)/
|
28
|
+
prefix_4 /^(....)/
|
29
|
+
suffix_3 /(...)$/
|
30
|
+
suffix_4 /(....)$/
|
31
|
+
|
32
|
+
|
33
|
+
token1 do |w|
|
34
|
+
w.sub(/[A-Z]/,'A').
|
35
|
+
sub(/[a-z]/,'a').
|
36
|
+
sub(/[0-9]/,'0').
|
37
|
+
sub(/[^0-9a-z]/i,'x')
|
38
|
+
end
|
39
|
+
token2 do |w|
|
40
|
+
w.sub(/[A-Z]+/,'A').
|
41
|
+
sub(/[a-z]+/,'a').
|
42
|
+
sub(/[0-9]+/,'0').
|
43
|
+
sub(/[^0-9a-z]+/i,'x')
|
44
|
+
end
|
45
|
+
token3 do |w| w.downcase end
|
46
|
+
special do |w| w.is_special? end
|
47
|
+
|
48
|
+
context %w(special token2 isPunctuation isDelim)
|
49
|
+
window %w(1 2 3 -1 -2 -3)
|
50
|
+
#direction :reverse
|
51
|
+
|
@@ -7,7 +7,8 @@ class TestCorpusPubmed < Test::Unit::TestCase
|
|
7
7
|
def test_add_pmid
|
8
8
|
corpus = Document::Corpus.setup({})
|
9
9
|
|
10
|
-
document = corpus.add_pmid("
|
10
|
+
document = corpus.add_pmid("33359141", :abstract).first
|
11
|
+
iii document.docid
|
11
12
|
title = document.to(:title)
|
12
13
|
assert title.include?("COVID-19")
|
13
14
|
end
|
@@ -4,6 +4,7 @@ require 'rbbt/document/corpus'
|
|
4
4
|
require 'rbbt/segment'
|
5
5
|
require 'rbbt/document/annotation'
|
6
6
|
require 'rbbt/segment/named_entity'
|
7
|
+
require 'rbbt/ner/abner'
|
7
8
|
|
8
9
|
class TestAnnotation < Test::Unit::TestCase
|
9
10
|
class CalledOnce < Exception; end
|
@@ -28,6 +29,12 @@ class TestAnnotation < Test::Unit::TestCase
|
|
28
29
|
self.split(" ").collect{|e| NamedEntity.setup(e, :code => Misc.digest(e)) }
|
29
30
|
end
|
30
31
|
|
32
|
+
Document.define :abner do
|
33
|
+
$called_once = true
|
34
|
+
Abner.new.match(self)
|
35
|
+
end
|
36
|
+
|
37
|
+
|
31
38
|
Document.persist :ner
|
32
39
|
end
|
33
40
|
|
@@ -133,7 +140,9 @@ class TestAnnotation < Test::Unit::TestCase
|
|
133
140
|
text.ner
|
134
141
|
|
135
142
|
assert ! $called_once
|
136
|
-
|
143
|
+
|
144
|
+
assert_equal text.abner.first.docid, text.docid
|
145
|
+
|
137
146
|
assert text.ner.first.segid.include?("TEST:")
|
138
147
|
end
|
139
148
|
end
|
@@ -29,5 +29,19 @@ class TestDocumentCorpus < Test::Unit::TestCase
|
|
29
29
|
assert corpus.docids("TEST:").include?(text.docid)
|
30
30
|
end
|
31
31
|
end
|
32
|
+
|
33
|
+
def test_load
|
34
|
+
text = "This is a document"
|
35
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
36
|
+
|
37
|
+
TmpFile.with_file do |path|
|
38
|
+
corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
|
39
|
+
corpus.extend Document::Corpus
|
40
|
+
|
41
|
+
corpus.add_document(text)
|
42
|
+
|
43
|
+
assert corpus.docids("TEST:").include?(text.docid)
|
44
|
+
end
|
45
|
+
end
|
32
46
|
end
|
33
47
|
|
@@ -0,0 +1,132 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../test_helper'
|
2
|
+
require 'rbbt'
|
3
|
+
require 'rbbt/ner/rner'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
class TestRNer < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@parser = NERFeatures.new() do
|
10
|
+
isLetters /^[A-Z]+$/i
|
11
|
+
context prefix_3 /^(...)/
|
12
|
+
downcase do |w| w.downcase end
|
13
|
+
|
14
|
+
context %w(downcase)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_config
|
19
|
+
config = <<-EOC
|
20
|
+
isLetters /^[A-Z]+$/i
|
21
|
+
context prefix_3 /^(...)/
|
22
|
+
downcase do |w| w.downcase end
|
23
|
+
|
24
|
+
context %w(downcase)
|
25
|
+
EOC
|
26
|
+
|
27
|
+
assert_equal config.strip, @parser.config.strip
|
28
|
+
end
|
29
|
+
|
30
|
+
def test_reverse
|
31
|
+
assert_equal("protein P53", NERFeatures.reverse("P53 protein"))
|
32
|
+
assert_equal(
|
33
|
+
". LH of assay - radioimmuno serum the with compared was LH urinary for ) GONAVIS - HI ( test hemagglutination direct new A",
|
34
|
+
NERFeatures.reverse(
|
35
|
+
"A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
|
36
|
+
))
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_features
|
40
|
+
assert_equal @parser.features("abCdE"), ["abCdE",true,'abC','abcde']
|
41
|
+
end
|
42
|
+
|
43
|
+
def test_template
|
44
|
+
template =<<-EOT
|
45
|
+
UisLetters: %x[0,1]
|
46
|
+
Uprefix_3: %x[0,2]
|
47
|
+
Uprefix_3#1: %x[1,2]
|
48
|
+
Uprefix_3#-1: %x[-1,2]
|
49
|
+
Udowncase: %x[0,3]
|
50
|
+
Udowncase#1: %x[1,3]
|
51
|
+
Udowncase#-1: %x[-1,3]
|
52
|
+
B
|
53
|
+
EOT
|
54
|
+
|
55
|
+
assert(@parser.template == template)
|
56
|
+
end
|
57
|
+
|
58
|
+
def test_tokens
|
59
|
+
assert( NERFeatures.tokens("A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH.")==
|
60
|
+
["A", "new", "direct", "hemagglutination", "test", "(", "HI", "-", "GONAVIS", ")", "for", "urinary", "LH", "was", "compared", "with", "the", "serum", "radioimmuno", "-", "assay", "of", "LH", "."])
|
61
|
+
|
62
|
+
|
63
|
+
end
|
64
|
+
def test_text_features
|
65
|
+
|
66
|
+
assert(@parser.text_features("abCdE 1234") == [["abCdE",true, "abC", "abcde"], ["1234",false, "123", "1234"]])
|
67
|
+
assert(@parser.text_features("abCdE 1234",true) == [["abCdE",true, "abC", "abcde",1], ["1234",false, "123", "1234",2]])
|
68
|
+
assert(@parser.text_features("abCdE 1234",false) == [["abCdE",true, "abC", "abcde",0], ["1234",false, "123", "1234",0]])
|
69
|
+
|
70
|
+
end
|
71
|
+
|
72
|
+
def test_tagged_features
|
73
|
+
assert_equal(
|
74
|
+
[["phosphorilation",true, "pho", "phosphorilation", 0],
|
75
|
+
["of",true, false, "of", 0],
|
76
|
+
["GENE1",false, "GEN", "gene1", 1],
|
77
|
+
[".", false, false, ".", 0]],
|
78
|
+
@parser.tagged_features("phosphorilation of GENE1.",['GENE1']))
|
79
|
+
|
80
|
+
assert_equal(
|
81
|
+
[["GENE1",false, "GEN", "gene1", 1],
|
82
|
+
["phosphorilation",true, "pho", "phosphorilation", 0]],
|
83
|
+
@parser.tagged_features("GENE1 phosphorilation",['GENE1']))
|
84
|
+
|
85
|
+
|
86
|
+
assert_equal(
|
87
|
+
[["phosphorilation",true, "pho", "phosphorilation", 0],
|
88
|
+
["of",true, false, "of", 0],
|
89
|
+
["GENE",true, "GEN", "gene", 1],
|
90
|
+
["1",false, false, "1", 2],
|
91
|
+
[".", false, false, ".", 0]],
|
92
|
+
@parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
|
93
|
+
end
|
94
|
+
|
95
|
+
def test_tagged_features_reverse
|
96
|
+
@parser.reverse = true
|
97
|
+
assert_equal(
|
98
|
+
[
|
99
|
+
["GENE1",false, "GEN", "gene1", 1],
|
100
|
+
["of",true, false, "of", 0],
|
101
|
+
["phosphorilation",true, "pho", "phosphorilation", 0]
|
102
|
+
],
|
103
|
+
@parser.tagged_features("phosphorilation of GENE1",['GENE1']))
|
104
|
+
|
105
|
+
assert_equal(
|
106
|
+
[
|
107
|
+
[".", false, false, ".", 0],
|
108
|
+
["1",false, false, "1", 1],
|
109
|
+
["GENE",true, "GEN", "gene", 2],
|
110
|
+
["of",true, false, "of", 0],
|
111
|
+
["phosphorilation",true, "pho", "phosphorilation", 0]
|
112
|
+
],
|
113
|
+
@parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
|
114
|
+
end
|
115
|
+
|
116
|
+
def test_default_config
|
117
|
+
require 'rbbt/bow/misc'
|
118
|
+
text =<<-EOF
|
119
|
+
This text explains how MDM2 interacts with TP53.
|
120
|
+
EOF
|
121
|
+
@parser = NERFeatures.new Rbbt.share.rner["config.rb"].find
|
122
|
+
features = @parser.tagged_features text, %w(TP53 MDM2)
|
123
|
+
assert features.first.first == "This"
|
124
|
+
end
|
125
|
+
|
126
|
+
|
127
|
+
|
128
|
+
def __test_CRFPP_install
|
129
|
+
assert(require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP'))
|
130
|
+
end
|
131
|
+
|
132
|
+
end
|
@@ -24,7 +24,8 @@ class TestClass < Test::Unit::TestCase
|
|
24
24
|
|
25
25
|
def test_tsv
|
26
26
|
a = "test"
|
27
|
-
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
27
|
+
NamedEntity.setup a, 10, "DocID", "TYPE", "CODE", "SCORE"
|
28
|
+
ppp Annotated.tsv([a,a])
|
28
29
|
assert Annotated.tsv([a]).fields.include? "code"
|
29
30
|
assert Annotated.tsv([a], nil).fields.include? "code"
|
30
31
|
assert Annotated.tsv([a], :all).fields.include? "code"
|
@@ -144,7 +144,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
144
144
|
gene2.entity_type = "Protein"
|
145
145
|
|
146
146
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
147
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
147
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
|
148
148
|
end
|
149
149
|
end
|
150
150
|
|
@@ -165,7 +165,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
165
165
|
gene2.entity_type = "Protein"
|
166
166
|
|
167
167
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
168
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
168
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
|
169
169
|
end
|
170
170
|
end
|
171
171
|
|
@@ -185,9 +185,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
185
185
|
assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
|
186
186
|
|
187
187
|
Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
|
188
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the CDK5R1 protein", a
|
188
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
|
189
189
|
Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
|
190
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene'><span class='Entity' attr-entity-type='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
|
190
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
|
191
191
|
end
|
192
192
|
end
|
193
193
|
end
|
data/test/test_spaCy.rb
CHANGED
@@ -3,7 +3,7 @@ require 'rbbt/nlp/spaCy'
|
|
3
3
|
require 'rbbt/document/corpus'
|
4
4
|
|
5
5
|
class TestSpaCy < Test::Unit::TestCase
|
6
|
-
def
|
6
|
+
def test_tokens
|
7
7
|
text = "I tell a story"
|
8
8
|
|
9
9
|
tokens = SpaCy.tokens(text)
|
@@ -12,6 +12,16 @@ class TestSpaCy < Test::Unit::TestCase
|
|
12
12
|
assert_equal "tell", tokens[1].to_s
|
13
13
|
end
|
14
14
|
|
15
|
+
def test_chunks
|
16
|
+
text = "Miguel Vazquez tell a good story"
|
17
|
+
|
18
|
+
tokens = SpaCy.chunks(text)
|
19
|
+
|
20
|
+
assert_equal 2, tokens.length
|
21
|
+
assert_equal "Miguel Vazquez", tokens[0].to_s
|
22
|
+
end
|
23
|
+
|
24
|
+
|
15
25
|
def test_segments
|
16
26
|
text = "I tell a story. It's a very good story."
|
17
27
|
|
@@ -28,5 +38,107 @@ class TestSpaCy < Test::Unit::TestCase
|
|
28
38
|
assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
|
29
39
|
end
|
30
40
|
end
|
41
|
+
|
42
|
+
def test_chunk_segments
|
43
|
+
text = "I tell a story. It's a very good story."
|
44
|
+
|
45
|
+
corpus = Document::Corpus.setup({})
|
46
|
+
|
47
|
+
Document.setup(text, "TEST", "test_doc1", "simple_sentence")
|
48
|
+
|
49
|
+
corpus.add_document text
|
50
|
+
text.corpus = corpus
|
51
|
+
|
52
|
+
segments = SpaCy.chunk_segments(text)
|
53
|
+
|
54
|
+
segments.each do |segment|
|
55
|
+
assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
def test_dep_graph
|
60
|
+
text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
|
61
|
+
graph = SpaCy.dep_graph(text, true)
|
62
|
+
|
63
|
+
tokens = SpaCy.segments(text)
|
64
|
+
index = Segment.index tokens
|
65
|
+
tf_s = tokens.select{|t| t == "TF" }.first
|
66
|
+
tg_s = tokens.select{|t| t == "TG" }.first
|
67
|
+
|
68
|
+
require 'rbbt/network/paths'
|
69
|
+
|
70
|
+
path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
|
71
|
+
path_tokens = path.collect do |segid|
|
72
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
73
|
+
text[range]
|
74
|
+
end
|
75
|
+
|
76
|
+
assert path_tokens.include? 'increase'
|
77
|
+
|
78
|
+
end
|
79
|
+
|
80
|
+
def test_chunk_dep_graph
|
81
|
+
text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
|
82
|
+
graph = SpaCy.chunk_dep_graph(text, true)
|
83
|
+
|
84
|
+
tokens = SpaCy.chunk_segments(text)
|
85
|
+
index = Segment.index tokens
|
86
|
+
tf_s = tokens.select{|t| t.include? "TF" }.first
|
87
|
+
tg_s = tokens.select{|t| t.include? "TG" }.first
|
88
|
+
|
89
|
+
|
90
|
+
require 'rbbt/network/paths'
|
91
|
+
|
92
|
+
path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
|
93
|
+
path_tokens = path.collect do |segid|
|
94
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
95
|
+
text[range]
|
96
|
+
end
|
97
|
+
|
98
|
+
assert path_tokens.include? 'increase'
|
99
|
+
end
|
100
|
+
|
101
|
+
def test_paths
|
102
|
+
text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
|
103
|
+
path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
|
104
|
+
|
105
|
+
|
106
|
+
path_tokens = path.collect do |segid|
|
107
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
108
|
+
text[range]
|
109
|
+
end
|
110
|
+
|
111
|
+
ppp text
|
112
|
+
iii path_tokens
|
113
|
+
|
114
|
+
assert path_tokens.include? 'increase'
|
115
|
+
end
|
116
|
+
|
117
|
+
def test_paths2
|
118
|
+
text = "Deletion and domain swap experiments identified small, discreet positive and negative elements in A-Myb and TF that were required for the regulation of specific genes, such as DHRS2, TG, and mim-1"
|
119
|
+
path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
|
120
|
+
|
121
|
+
|
122
|
+
path_tokens = path.collect do |segid|
|
123
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
124
|
+
text[range]
|
125
|
+
end
|
126
|
+
|
127
|
+
iii path_tokens
|
128
|
+
|
129
|
+
|
130
|
+
assert path_tokens.include? 'regulation'
|
131
|
+
end
|
132
|
+
|
133
|
+
def test_paths3
|
134
|
+
text = "Therefore, we speculate that PEA3 factors may contribute to the up-regulation of COX-2 expression resulting from both APC mutation and Wnt1 expression"
|
135
|
+
path = SpaCy.paths(text, *Segment.align(text,["PEA3", "Wnt1"]))
|
136
|
+
|
137
|
+
path_tokens = path.collect do |segid|
|
138
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
139
|
+
text[range]
|
140
|
+
end
|
141
|
+
|
142
|
+
end
|
31
143
|
end
|
32
144
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.4
|
4
|
+
version: 1.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-06-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -95,6 +95,7 @@ files:
|
|
95
95
|
- lib/rbbt/ner/oscar4.rb
|
96
96
|
- lib/rbbt/ner/patterns.rb
|
97
97
|
- lib/rbbt/ner/regexpNER.rb
|
98
|
+
- lib/rbbt/ner/rner.rb
|
98
99
|
- lib/rbbt/ner/rnorm.rb
|
99
100
|
- lib/rbbt/ner/rnorm/cue_index.rb
|
100
101
|
- lib/rbbt/ner/rnorm/tokens.rb
|
@@ -103,6 +104,7 @@ files:
|
|
103
104
|
- lib/rbbt/nlp/nlp.rb
|
104
105
|
- lib/rbbt/nlp/open_nlp/sentence_splitter.rb
|
105
106
|
- lib/rbbt/nlp/spaCy.rb
|
107
|
+
- lib/rbbt/relationship.rb
|
106
108
|
- lib/rbbt/segment.rb
|
107
109
|
- lib/rbbt/segment/annotation.rb
|
108
110
|
- lib/rbbt/segment/encoding.rb
|
@@ -126,6 +128,7 @@ files:
|
|
126
128
|
- share/install/software/OpenNLP
|
127
129
|
- share/install/software/StanfordParser
|
128
130
|
- share/patterns/drug_induce_disease
|
131
|
+
- share/rner/config.rb
|
129
132
|
- share/rnorm/cue_default
|
130
133
|
- share/rnorm/tokens_default
|
131
134
|
- share/wordlists/stopwords
|
@@ -148,6 +151,7 @@ files:
|
|
148
151
|
- test/rbbt/ner/test_oscar4.rb
|
149
152
|
- test/rbbt/ner/test_patterns.rb
|
150
153
|
- test/rbbt/ner/test_regexpNER.rb
|
154
|
+
- test/rbbt/ner/test_rner.rb
|
151
155
|
- test/rbbt/ner/test_rnorm.rb
|
152
156
|
- test/rbbt/ner/test_token_trieNER.rb
|
153
157
|
- test/rbbt/nlp/genia/test_sentence_splitter.rb
|
@@ -182,7 +186,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
182
186
|
- !ruby/object:Gem::Version
|
183
187
|
version: '0'
|
184
188
|
requirements: []
|
185
|
-
rubygems_version: 3.
|
189
|
+
rubygems_version: 3.1.4
|
186
190
|
signing_key:
|
187
191
|
specification_version: 4
|
188
192
|
summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
|
@@ -210,6 +214,7 @@ test_files:
|
|
210
214
|
- test/rbbt/ner/test_banner.rb
|
211
215
|
- test/rbbt/ner/test_token_trieNER.rb
|
212
216
|
- test/rbbt/ner/test_finder.rb
|
217
|
+
- test/rbbt/ner/test_rner.rb
|
213
218
|
- test/rbbt/ner/test_linnaeus.rb
|
214
219
|
- test/rbbt/ner/test_oscar4.rb
|
215
220
|
- test/rbbt/test_segment.rb
|