rbbt-text 1.3.4 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/document.rb +6 -0
- data/lib/rbbt/document/annotation.rb +2 -2
- data/lib/rbbt/document/corpus.rb +5 -3
- data/lib/rbbt/document/corpus/pubmed.rb +1 -0
- data/lib/rbbt/ner/abner.rb +3 -2
- data/lib/rbbt/ner/banner.rb +3 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/linnaeus.rb +2 -1
- data/lib/rbbt/ner/rner.rb +229 -0
- data/lib/rbbt/ner/token_trieNER.rb +2 -1
- data/lib/rbbt/nlp/spaCy.rb +158 -15
- data/lib/rbbt/relationship.rb +24 -0
- data/lib/rbbt/segment.rb +3 -0
- data/lib/rbbt/segment/named_entity.rb +4 -0
- data/lib/rbbt/segment/range_index.rb +1 -1
- data/share/rner/config.rb +51 -0
- data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
- data/test/rbbt/document/test_annotation.rb +10 -1
- data/test/rbbt/document/test_corpus.rb +14 -0
- data/test/rbbt/ner/test_rner.rb +132 -0
- data/test/rbbt/segment/test_named_entity.rb +2 -1
- data/test/rbbt/segment/test_transformed.rb +4 -4
- data/test/test_spaCy.rb +113 -1
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: '0846f900d745dd27df8006eecbc9d294f9f38a23dd76001de2a5dc0313db7e22'
+  data.tar.gz: 675985882a6c8b9813f620d7ef0a555efa5c148c7c2fe36e0030f84f3fd88cf0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dfd9c333b94181496134b825c63d6e93a0390f81d426526f79c00cf12556021b60004b29b57ca9b0b274141937027f7bc780552a60de007e5f790b19910354c0
+  data.tar.gz: 205beeb8829c8358fd29c0a18351522e566106e24220af3d7bec3676694d37d682b92243e4fd4cd495b542f9945a28cf8585e587342672d31779d0b21b53ae4e
data/lib/rbbt/document.rb
CHANGED
@@ -13,7 +13,7 @@ module Document
     end
 
     docid = self.docid
-    segments.each{|s| s.docid = docid
+    segments.each{|s| s.docid = docid }
 
     segments
   end
@@ -36,7 +36,7 @@ module Document
 
     docid = document.docid
 
-    segments.each{|s| s.docid = docid
+    segments.each{|s| s.docid = docid }
 
     segments
   end
data/lib/rbbt/document/corpus.rb
CHANGED
@@ -3,6 +3,7 @@ require 'rbbt-util'
 module Document::Corpus
 
   def self.setup(corpus)
+    corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
     corpus.extend Document::Corpus unless Document::Corpus === corpus
     corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
     corpus
@@ -16,7 +17,8 @@ module Document::Corpus
     end
   end
 
-  def docids(prefix)
+  def docids(*prefix)
+    prefix = prefix * ":"
     prefix += ":" unless prefix == :all || prefix[-1] == ":"
     docids = self.read_and_close do
       prefix == :all ? self.keys : self.prefix(prefix)
@@ -24,8 +26,8 @@ module Document::Corpus
     DocID.setup(docids, :corpus => self)
   end
 
-  def documents(prefix)
-    self.docids(prefix).document
+  def documents(*prefix)
+    self.docids(*prefix).document
   end
 
   def [](*args)
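The corpus changes above let Document::Corpus.setup accept a String path and turn docids/documents into splat methods. A minimal usage sketch (the on-disk path and document IDs here are illustrative, not from the gem's documentation):

  require 'rbbt/document/corpus'

  # A String argument is now opened as a TokyoCabinet BDB before being extended
  corpus = Document::Corpus.setup("/tmp/corpus.tcb")

  # Prefix parts passed as separate arguments are joined with ":" internally
  corpus.docids("TEST", "test_doc1")   # same as corpus.docids("TEST:test_doc1")
  corpus.documents("TEST")             # documents for every docid under the "TEST:" prefix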
data/lib/rbbt/ner/abner.rb
CHANGED
@@ -39,14 +39,15 @@ class Abner < NER
     types = res[1]
     strings = res[0]
 
+    docid = Misc.digest(text)
     global_offset = 0
     strings.zip(types).collect do |mention, type|
       mention = mention.to_s;
       offset = text.index(mention)
       if offset.nil?
-        NamedEntity.setup(mention,
+        NamedEntity.setup(mention, :docid => docid, :entity_type => type)
       else
-        NamedEntity.setup(mention, offset + global_offset, type.to_s)
+        NamedEntity.setup(mention, :offset => offset + global_offset, :docid => docid, :entity_type => type.to_s)
         text = text[offset + mention.length..-1]
         global_offset += offset + mention.length
       end
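The Abner adapter (and Banner and Linnaeus below) now builds mentions with keyword-style NamedEntity.setup options and attaches the digest of the input text as :docid. A small sketch of the new call shape (values are illustrative):

  require 'rbbt/util/misc'
  require 'rbbt/segment/named_entity'

  text = "TP53 regulates MDM2"
  mention = "TP53"
  # Options replace the old positional arguments; :docid ties the mention to its source text
  NamedEntity.setup(mention,
    :offset => text.index(mention),
    :docid => Misc.digest(text),
    :entity_type => "GENE")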
data/lib/rbbt/ner/banner.rb
CHANGED
@@ -55,6 +55,7 @@ class Banner < NER
   # text.
   def match(text)
     return [] if text.nil?
+    text = text.dup if text.frozen?
     text.gsub!(/\n/,' ')
     text.gsub!(/\|/,'/') # Character | gives an error
     return [] if text.strip.empty?
@@ -66,6 +67,7 @@ class Banner < NER
     @parenPP.postProcess(sentence)
     tagged = sentence.getSGML
 
+    docid = Misc.digest text
     res = tagged.scan(/<GENE>.*?<\/GENE>/).
       collect{|r|
         r.match(/<GENE>(.*?)<\/GENE>/)
@@ -73,7 +75,7 @@ class Banner < NER
         mention.sub!(/^\s*/,'')
         mention.sub!(/\s*$/,'')
         offset = text.index(mention)
-        NamedEntity.setup(mention, offset, 'GENE')
+        NamedEntity.setup(mention, :offset => offset, :docid => docid, :entity_type => 'GENE')
         mention
       }
     res
data/lib/rbbt/ner/brat.rb
CHANGED
data/lib/rbbt/ner/linnaeus.rb
CHANGED
@@ -31,7 +31,8 @@ module Linnaeus
     init unless defined? @@Matcher
 
     @@Matcher.match(text).toArray().collect do |mention|
-
+      best_id, best_prob = mention.ids().zip(mention.probabilities()).sort_by{|i,p| p.to_f }.last
+      NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => best_id, :score => best_prob)
     end
   end
 end
data/lib/rbbt/ner/rner.rb
ADDED
@@ -0,0 +1,229 @@
+require 'rbbt'
+require 'rbbt/util/open'
+require 'rbbt/util/misc'
+require 'rbbt/util/simpleDSL'
+
+class NERFeatures
+  include SimpleDSL
+
+  def self.tokens(text)
+    text.scan(/
+      \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
+      \w-\w*|
+      \w+-[A-Z](?!\w)|
+      \w+|
+      [.,()\/\[\]{}'"+-]
+    /x)
+  end
+
+  def self.reverse(text)
+    tokens(text).reverse.join(" ")
+  end
+
+  def define(name, *args, &block)
+    action = args[0] || block || /#{name.to_s}s?/i
+    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
+
+    @types[name.to_s] = action
+    @order.push name.to_s
+
+    name.to_s
+  end
+
+  attr_accessor :reverse
+  def initialize(file = nil, reverse = false, &block)
+    @types = {}
+    @order = []
+    @context = []
+    @reverse = reverse
+
+    file ||= Rbbt.share.ner['config.rb'].find if !file && !block
+
+    parse(:define, file, &block)
+  end
+
+  def config
+    @config[:define]
+  end
+
+  def window(positions)
+    @window = positions
+  end
+
+  def context(name, &block)
+    if name.is_a? Array
+      @context += name
+    else
+      @context.push name
+
+      # The block might be wrongly assigned to this function
+      # instead of the actual definition, fix that.
+      if block
+        @types[name] = block
+      end
+    end
+  end
+
+  def direction(dir)
+    if dir.to_sym == :reverse
+      @reverse = true
+    end
+  end
+
+  def features(word)
+    values = [word]
+
+    @order.each{|features|
+      action = @types[features]
+      if action.is_a?(Proc)
+        values.push(action.call(word))
+      else
+        m = action.match(word)
+        if m
+          if m[1]
+            values.push(m[1])
+          else
+            values.push(m != nil)
+          end
+        else
+          values.push(false)
+        end
+      end
+    }
+    values
+  end
+
+  def template(window=nil)
+    window ||= @window || [1,-1]
+    template = ""
+
+    i = 1
+    @order.each{|feat|
+      template += "U#{ feat }: %x[0,#{ i }]\n"
+
+      if @context.include?(feat)
+        window.each{|p|
+          template += "U#{ feat }##{ p}: %x[#{ p },#{ i }]\n"
+        }
+      end
+      i += 1
+    }
+
+    template += "B\n"
+
+    template
+  end
+
+
+  def text_features(text, positive = nil)
+    text = self.class.reverse(text) if @reverse
+    initial = true
+    self.class.tokens(text).collect{|token|
+      features = features(token)
+      if !positive.nil?
+        features << (positive ? (initial ? 1 : 2) : 0)
+        initial = false
+      end
+      features
+    }
+  end
+
+  def tagged_features(text, mentions)
+    mentions ||= []
+    mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
+    re = mentions.collect{|mention|
+      Regexp.quote(mention.gsub(/\s+/,' ')).sub(/\\s/,'\s+')
+    }.join("|")
+
+    positive = false
+    features = []
+    chunks = text.split(/(#{re})/)
+    chunks.each{|t|
+      chunk_features = text_features(t, positive)
+      positive = !positive
+      if @reverse
+        features = chunk_features + features
+      else
+        features = features + chunk_features
+      end
+    }
+    features
+  end
+
+  def train(features, model)
+    tmp_template = TmpFile.tmp_file("template-")
+    Open.write(tmp_template,template)
+
+    cmd = "#{File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')} '#{tmp_template}' '#{features}' '#{model}'"
+    system cmd
+    Open.write(model + '.config',config)
+    FileUtils.rm tmp_template
+  end
+
+end
+
+class NER
+
+  def initialize(model = nil)
+    begin
+      require 'CRFPP'
+    rescue Exception
+      require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
+    end
+
+    model ||= File.join(Rbbt.datadir, + 'ner/model/BC2')
+
+    @parser = NERFeatures.new(model + '.config')
+    @reverse = @parser.reverse
+    @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
+  end
+
+  def extract(text)
+    features = @parser.text_features(text)
+
+    @tagger.clear
+    features.each{|feats|
+      @tagger.add(feats.join(" "))
+    }
+
+    @tagger.parse
+
+    found = []
+    mention = []
+
+    @tagger.size.times{|i|
+      label = @tagger.y(i)
+      word = @tagger.x(i,0)
+
+      if word == ')'
+        mention.push(')') if mention.join =~ /\(/
+        next
+      end
+
+      case label
+      when 1
+        if mention.any? && ( mention.join(" ").is_special? || mention.select{|m| m.is_special?}.any?)
+          found.push(mention)
+          mention = []
+        end
+        mention.push(word)
+      when 2
+        mention.push(word)
+      when 0
+        found.push(mention) if mention.any?
+        mention = []
+      end
+    }
+
+    found << mention if mention.any?
+
+    found.collect{|list|
+      list = list.reverse if @reverse
+      list.join(" ")
+    }
+  end
+
+end
+
+
+
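NERFeatures above is a SimpleDSL-based feature extractor for CRF++: each declaration in a config block (or file) adds one feature column per token, and template builds the matching CRF++ unigram template. A minimal sketch mirroring the new test file (assumes the gem is installed; training additionally needs the bundled CRF++):

  require 'rbbt/ner/rner'

  parser = NERFeatures.new do
    isUpper /^[A-Z]+$/              # regexp features yield true/false (or the captured group)
    downcase do |w| w.downcase end  # proc features yield the returned value
    context %w(downcase)            # context features also get window columns in the template
  end

  parser.features("TP53")   # => ["TP53", true, "tp53"]
  puts parser.template      # CRF++ "U..." lines, one per feature, plus context windows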
data/lib/rbbt/ner/token_trieNER.rb
CHANGED
@@ -249,7 +249,8 @@ class TokenTrieNER < NER
       match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
     }
 
-
+    type = type.first
+    NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes, :type => type)
   end
 
   attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
data/lib/rbbt/nlp/spaCy.rb
CHANGED
@@ -2,30 +2,55 @@ require 'rbbt/segment'
 require 'rbbt/document'
 require 'rbbt/segment/annotation'
 require 'rbbt/util/python'
+require 'rbbt/network/paths'
 
 module SpaCy
 
-
+  TOKEN_PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
+  CHUNK_PROPERTIES = %w(lemma_)
 
-  def self.
+  def self.nlp(lang = 'en_core_web_md')
+    @@nlp ||= {}
+    @@nlp[lang] ||= RbbtPython.run :spacy do
+      spacy.load(lang)
+    end
+  end
+
+  def self.tokens(text, lang = 'en_core_web_sm')
 
     tokens = []
-
-
-
-
-
-
+
+    nlp = nlp(lang)
+    doc = nlp.call(text)
+
+    doc.__len__.times do |i|
+      tokens << doc.__getitem__(i)
+    end
+
+    tokens
+  end
+
+  def self.chunks(text, lang = 'en_core_web_sm')
+
+    tokens = []
+    nlp = nlp(lang)
+
+    doc = nlp.call(text)
+    chunks = doc.noun_chunks.__iter__
+
+    RbbtPython.iterate chunks do |item|
+      tokens << item
     end
+
     tokens
   end
 
-  def self.segments(text, lang = '
-    docid = text.docid if Document === text
+  def self.segments(text, lang = 'en_core_web_sm')
+    docid = text.docid if Document === text
     corpus = text.corpus if Document === text
     tokens = self.tokens(text, lang).collect do |token|
       info = {}
-
+      TOKEN_PROPERTIES.each do |p|
        info[p] = token.instance_eval(p.to_s)
       end
       info[:type] = "SpaCy"
@@ -35,7 +60,120 @@ module SpaCy
     info[:corpus] = corpus if corpus
     SpaCyToken.setup(token.text, info)
   end
-
+
+    tokens
+  end
+
+  def self.chunk_segments(text, lang = 'en_core_web_sm')
+    docid = text.docid if Document === text
+    corpus = text.corpus if Document === text
+    chunks = self.chunks(text, lang).collect do |chunk|
+      info = {}
+      CHUNK_PROPERTIES.each do |p|
+        info[p] = chunk.instance_eval(p.to_s)
+      end
+      start = eend = nil
+      deps = []
+      RbbtPython.iterate chunk.__iter__ do |token|
+        start = token.idx if start.nil?
+        eend = start + chunk.text.length if eend.nil?
+        deps << token.idx.to_s + ":" + token.dep_ + "->" + token.head.idx.to_s if token.head.idx < start || token.head.idx > eend
+      end
+      info[:type] = "SpaCy"
+      info[:offset] = chunk.__iter__.__next__.idx
+      info[:dep] = deps * ";"
+      info[:docid] = docid if docid
+      info[:corpus] = corpus if corpus
+      SpaCySpan.setup(chunk.text, info)
+    end
+
+    chunks
+  end
+
+  def self.dep_graph(text, reverse = false, lang = 'en_core_web_md')
+    tokens = self.segments(text, lang)
+    index = Segment.index(tokens)
+    associations = {}
+    tokens.each do |token|
+      type, target_pos = token.dep.split("->")
+      target_tokens = index[target_pos.to_i]
+      associations[token.segid] = target_tokens
+    end
+
+    if reverse
+      old = associations.dup
+      old.each do |s,ts|
+        ts.each do |t|
+          associations[t] ||= []
+          associations[t] += [s] unless associations[t].include?(s)
+        end
+      end
+    end
+
+    associations
+  end
+
+  def self.chunk_dep_graph(text, reverse = false, lang = 'en_core_web_md')
+    associations = dep_graph(text, false, lang)
+
+    chunks = self.chunk_segments(text, lang)
+    tokens = self.segments(text, lang)
+    index = Segment.index(tokens + chunks)
+
+    chunks.each do |chunk|
+      target_token_ids = chunk.dep.split(";").collect do|dep|
+        type, target_pos = dep.split("->")
+        index[target_pos.to_i]
+      end.flatten
+
+      target_tokens = target_token_ids.collect do |target_token_id|
+        range = Range.new(*target_token_id.split(":").last.split("..").map(&:to_i))
+        range.collect do |pos|
+          index[pos]
+        end.uniq
+      end.flatten
+      associations[chunk.segid] = target_tokens
+    end
+
+    if reverse
+      old = associations.dup
+      old.each do |s,ts|
+        ts.each do |t|
+          associations[t] ||= []
+          associations[t] += [s] unless associations[t].include?(s)
+        end
+      end
+    end
+
+    associations
+  end
+
+  def self.paths(text, source, target, reverse = true, lang = 'en_core_web_md')
+    graph = SpaCy.chunk_dep_graph(text, reverse, lang)
+
+    chunk_index = Segment.index(SpaCy.chunk_segments(text, lang))
+
+    source_id = chunk_index[source.offset].first || source.segid
+    target_id = chunk_index[target.offset].first || target.segid
+
+    path = Paths.dijkstra(graph, source_id, [target_id])
+
+    return nil if path.nil?
+
+    path.reverse
+  end
+
+  def self.config(base, target = nil)
+    TmpFile.with_file(base) do |baseconfig|
+      if target
+        CMD.cmd(:spacy, "init fill-config #{baseconfig} #{target}")
+      else
+        TmpFile.with_file do |tmptarget|
+          CMD.cmd(:spacy, "init fill-config #{baseconfig} #{tmptarget}")
+          Open.read(targetconfig)
+        end
+      end
+    end
   end
 end
 
@@ -43,10 +181,15 @@ module SpaCyToken
   extend Entity
   include SegmentAnnotation
 
-  self.annotation *SpaCy::
+  self.annotation *SpaCy::TOKEN_PROPERTIES
   self.annotation :dep
 end
 
-
-
+module SpaCySpan
+  extend Entity
+  include SegmentAnnotation
+
+  self.annotation *SpaCy::CHUNK_PROPERTIES
+  self.annotation :dep
 end
+
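The SpaCy module now wraps a cached spaCy pipeline (SpaCy.nlp) and adds noun-chunk and dependency-graph helpers. A hedged usage sketch (requires a working spaCy install reachable through rbbt's Python bridge plus the en_core_web_sm/md models; the call shapes come from the methods above, not from separate documentation):

  require 'rbbt/nlp/spaCy'

  text = "Miguel Vazquez tells a good story"
  SpaCy.tokens(text).collect(&:to_s)   # raw spaCy tokens from the cached pipeline
  SpaCy.chunk_segments(text)           # noun chunks as SpaCySpan segments carrying :dep links
  SpaCy.dep_graph(text, true)          # segid => [segid, ...] dependency adjacency, reverse edges included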
data/lib/rbbt/relationship.rb
ADDED
@@ -0,0 +1,24 @@
+require 'rbbt/segment'
+
+module Relationship
+  extend Annotation
+  self.annotation :segment
+  self.annotation :terms
+  self.annotation :type
+
+  def text
+    if segment
+      segment
+    else
+      type + ": " + terms * ", "
+    end
+  end
+
+  def html
+    text = <<-EOF
+<span class='Relationship'\
+>#{ self.text }</span>
+    EOF
+    text.chomp
+  end
+end
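Relationship is a small Annotation-based wrapper for relations between terms. A minimal sketch of how its fields combine (the concrete values are made up for illustration):

  require 'rbbt/relationship'

  rel = Relationship.setup("", :type => "interaction", :terms => ["MDM2", "TP53"])
  rel.text   # => "interaction: MDM2, TP53"  (used when no :segment is given)
  rel.html   # wraps rel.text in a <span class='Relationship'> element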
data/lib/rbbt/segment.rb
CHANGED
data/lib/rbbt/segment/range_index.rb
CHANGED
@@ -6,7 +6,7 @@ module Segment::RangeIndex
     SegID.setup(res, :corpus => corpus)
   end
 
-  def self.index(segments, corpus, persist_file = :memory)
+  def self.index(segments, corpus = nil, persist_file = :memory)
     segments = segments.values.flatten if Hash === segments
 
     annotation_index =
data/share/rner/config.rb
ADDED
@@ -0,0 +1,51 @@
+isLetters /^[A-Z]+$/i
+isUpper /^[A-Z]+$/
+isLower /^[a-z]+$/
+isDigits /^[0-9]+$/i
+isRoman /^[IVX]+$/
+isGreek /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
+isPunctuation /^[,.;]$/
+isDelim /^[\/()\[\]{}\-]$/
+isNonWord /^[^\w]+$/
+isConjunction /^and|or|&|,$/
+
+hasLetters /[A-Z]/i
+hasUpper /.[A-Z]/
+hasLower /[a-z]/
+hasDigits /[0-9]/i
+hasGreek /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
+hasPunctuation /[,.;]/
+hasDelim /[\/()\[\]{}\-]/
+hasNonWord /[^\w]/
+caspMix /[a-z].[A-Z]/
+keywords /(?:protein|gene|domain|ase)s?$/
+hasSuffix /[a-z][A-Z0-9]$/
+
+numLetters do |w| w.scan(/[A-Z]/i).length end
+numDigits do |w| w.scan(/[0-9]/).length end
+#
+prefix_3 /^(...)/
+prefix_4 /^(....)/
+suffix_3 /(...)$/
+suffix_4 /(....)$/
+
+
+token1 do |w|
+  w.sub(/[A-Z]/,'A').
+    sub(/[a-z]/,'a').
+    sub(/[0-9]/,'0').
+    sub(/[^0-9a-z]/i,'x')
+end
+token2 do |w|
+  w.sub(/[A-Z]+/,'A').
+    sub(/[a-z]+/,'a').
+    sub(/[0-9]+/,'0').
+    sub(/[^0-9a-z]+/i,'x')
+end
+token3 do |w| w.downcase end
+special do |w| w.is_special? end
+
+context %w(special token2 isPunctuation isDelim)
+window %w(1 2 3 -1 -2 -3)
+#direction :reverse
+
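This share/rner/config.rb file is the feature definition shipped with the gem: every declaration above becomes one feature column per token, and the context/window lines control which columns get neighbouring-token entries in the CRF++ template. A sketch mirroring the new test_default_config test:

  require 'rbbt/ner/rner'

  parser = NERFeatures.new Rbbt.share.rner["config.rb"].find
  parser.features("TP53").first   # => "TP53"; the remaining entries follow the declarations above, in order
  puts parser.template            # context features get %x[p,i] rows for windows 1..3 and -1..-3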
data/test/rbbt/document/corpus/test_pubmed.rb
CHANGED
@@ -7,7 +7,8 @@ class TestCorpusPubmed < Test::Unit::TestCase
   def test_add_pmid
     corpus = Document::Corpus.setup({})
 
-    document = corpus.add_pmid("
+    document = corpus.add_pmid("33359141", :abstract).first
+    iii document.docid
     title = document.to(:title)
     assert title.include?("COVID-19")
   end
data/test/rbbt/document/test_annotation.rb
CHANGED
@@ -4,6 +4,7 @@ require 'rbbt/document/corpus'
 require 'rbbt/segment'
 require 'rbbt/document/annotation'
 require 'rbbt/segment/named_entity'
+require 'rbbt/ner/abner'
 
 class TestAnnotation < Test::Unit::TestCase
   class CalledOnce < Exception; end
@@ -28,6 +29,12 @@ class TestAnnotation < Test::Unit::TestCase
       self.split(" ").collect{|e| NamedEntity.setup(e, :code => Misc.digest(e)) }
     end
 
+    Document.define :abner do
+      $called_once = true
+      Abner.new.match(self)
+    end
+
+
     Document.persist :ner
   end
 
@@ -133,7 +140,9 @@ class TestAnnotation < Test::Unit::TestCase
     text.ner
 
     assert ! $called_once
-
+
+    assert_equal text.abner.first.docid, text.docid
+
     assert text.ner.first.segid.include?("TEST:")
   end
 end
data/test/rbbt/document/test_corpus.rb
CHANGED
@@ -29,5 +29,19 @@ class TestDocumentCorpus < Test::Unit::TestCase
       assert corpus.docids("TEST:").include?(text.docid)
     end
   end
+
+  def test_load
+    text = "This is a document"
+    Document.setup(text, "TEST", "test_doc1", nil)
+
+    TmpFile.with_file do |path|
+      corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
+      corpus.extend Document::Corpus
+
+      corpus.add_document(text)
+
+      assert corpus.docids("TEST:").include?(text.docid)
+    end
+  end
 end
 
data/test/rbbt/ner/test_rner.rb
ADDED
@@ -0,0 +1,132 @@
+require File.dirname(__FILE__) + '/../../test_helper'
+require 'rbbt'
+require 'rbbt/ner/rner'
+require 'test/unit'
+
+class TestRNer < Test::Unit::TestCase
+
+  def setup
+    @parser = NERFeatures.new() do
+      isLetters /^[A-Z]+$/i
+      context prefix_3 /^(...)/
+      downcase do |w| w.downcase end
+
+      context %w(downcase)
+    end
+  end
+
+  def test_config
+    config = <<-EOC
+    isLetters /^[A-Z]+$/i
+    context prefix_3 /^(...)/
+    downcase do |w| w.downcase end
+
+    context %w(downcase)
+    EOC
+
+    assert_equal config.strip, @parser.config.strip
+  end
+
+  def test_reverse
+    assert_equal("protein P53", NERFeatures.reverse("P53 protein"))
+    assert_equal(
+      ". LH of assay - radioimmuno serum the with compared was LH urinary for ) GONAVIS - HI ( test hemagglutination direct new A",
+      NERFeatures.reverse(
+        "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
+    ))
+  end
+
+  def test_features
+    assert_equal @parser.features("abCdE"), ["abCdE",true,'abC','abcde']
+  end
+
+  def test_template
+    template =<<-EOT
+UisLetters: %x[0,1]
+Uprefix_3: %x[0,2]
+Uprefix_3#1: %x[1,2]
+Uprefix_3#-1: %x[-1,2]
+Udowncase: %x[0,3]
+Udowncase#1: %x[1,3]
+Udowncase#-1: %x[-1,3]
+B
+    EOT
+
+    assert(@parser.template == template)
+  end
+
+  def test_tokens
+    assert( NERFeatures.tokens("A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH.")==
+      ["A", "new", "direct", "hemagglutination", "test", "(", "HI", "-", "GONAVIS", ")", "for", "urinary", "LH", "was", "compared", "with", "the", "serum", "radioimmuno", "-", "assay", "of", "LH", "."])
+
+
+  end
+  def test_text_features
+
+    assert(@parser.text_features("abCdE 1234") == [["abCdE",true, "abC", "abcde"], ["1234",false, "123", "1234"]])
+    assert(@parser.text_features("abCdE 1234",true) == [["abCdE",true, "abC", "abcde",1], ["1234",false, "123", "1234",2]])
+    assert(@parser.text_features("abCdE 1234",false) == [["abCdE",true, "abC", "abcde",0], ["1234",false, "123", "1234",0]])
+
+  end
+
+  def test_tagged_features
+    assert_equal(
+      [["phosphorilation",true, "pho", "phosphorilation", 0],
+       ["of",true, false, "of", 0],
+       ["GENE1",false, "GEN", "gene1", 1],
+       [".", false, false, ".", 0]],
+      @parser.tagged_features("phosphorilation of GENE1.",['GENE1']))
+
+    assert_equal(
+      [["GENE1",false, "GEN", "gene1", 1],
+       ["phosphorilation",true, "pho", "phosphorilation", 0]],
+      @parser.tagged_features("GENE1 phosphorilation",['GENE1']))
+
+
+    assert_equal(
+      [["phosphorilation",true, "pho", "phosphorilation", 0],
+       ["of",true, false, "of", 0],
+       ["GENE",true, "GEN", "gene", 1],
+       ["1",false, false, "1", 2],
+       [".", false, false, ".", 0]],
+      @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
+  end
+
+  def test_tagged_features_reverse
+    @parser.reverse = true
+    assert_equal(
+      [
+       ["GENE1",false, "GEN", "gene1", 1],
+       ["of",true, false, "of", 0],
+       ["phosphorilation",true, "pho", "phosphorilation", 0]
+      ],
+      @parser.tagged_features("phosphorilation of GENE1",['GENE1']))
+
+    assert_equal(
+      [
+       [".", false, false, ".", 0],
+       ["1",false, false, "1", 1],
+       ["GENE",true, "GEN", "gene", 2],
+       ["of",true, false, "of", 0],
+       ["phosphorilation",true, "pho", "phosphorilation", 0]
+      ],
+      @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
+  end
+
+  def test_default_config
+    require 'rbbt/bow/misc'
+    text =<<-EOF
+    This text explains how MDM2 interacts with TP53.
+    EOF
+    @parser = NERFeatures.new Rbbt.share.rner["config.rb"].find
+    features = @parser.tagged_features text, %w(TP53 MDM2)
+    assert features.first.first == "This"
+  end
+
+
+
+  def __test_CRFPP_install
+    assert(require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP'))
+  end
+
+end
data/test/rbbt/segment/test_named_entity.rb
CHANGED
@@ -24,7 +24,8 @@ class TestClass < Test::Unit::TestCase
 
   def test_tsv
     a = "test"
-    NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
+    NamedEntity.setup a, 10, "DocID", "TYPE", "CODE", "SCORE"
+    ppp Annotated.tsv([a,a])
     assert Annotated.tsv([a]).fields.include? "code"
     assert Annotated.tsv([a], nil).fields.include? "code"
     assert Annotated.tsv([a], :all).fields.include? "code"
data/test/rbbt/segment/test_transformed.rb
CHANGED
@@ -144,7 +144,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     gene2.entity_type = "Protein"
 
     Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
     end
   end
 
@@ -165,7 +165,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     gene2.entity_type = "Protein"
 
     Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
     end
   end
 
@@ -185,9 +185,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
 
     Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the CDK5R1 protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
       Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
-        assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene'><span class='Entity' attr-entity-type='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
+        assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
       end
     end
   end
data/test/test_spaCy.rb
CHANGED
@@ -3,7 +3,7 @@ require 'rbbt/nlp/spaCy'
 require 'rbbt/document/corpus'
 
 class TestSpaCy < Test::Unit::TestCase
-  def
+  def test_tokens
     text = "I tell a story"
 
     tokens = SpaCy.tokens(text)
@@ -12,6 +12,16 @@ class TestSpaCy < Test::Unit::TestCase
     assert_equal "tell", tokens[1].to_s
   end
 
+  def test_chunks
+    text = "Miguel Vazquez tell a good story"
+
+    tokens = SpaCy.chunks(text)
+
+    assert_equal 2, tokens.length
+    assert_equal "Miguel Vazquez", tokens[0].to_s
+  end
+
+
   def test_segments
     text = "I tell a story. It's a very good story."
 
@@ -28,5 +38,107 @@ class TestSpaCy < Test::Unit::TestCase
       assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
     end
   end
+
+  def test_chunk_segments
+    text = "I tell a story. It's a very good story."
+
+    corpus = Document::Corpus.setup({})
+
+    Document.setup(text, "TEST", "test_doc1", "simple_sentence")
+
+    corpus.add_document text
+    text.corpus = corpus
+
+    segments = SpaCy.chunk_segments(text)
+
+    segments.each do |segment|
+      assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
+    end
+  end
+
+  def test_dep_graph
+    text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
+    graph = SpaCy.dep_graph(text, true)
+
+    tokens = SpaCy.segments(text)
+    index = Segment.index tokens
+    tf_s = tokens.select{|t| t == "TF" }.first
+    tg_s = tokens.select{|t| t == "TG" }.first
+
+    require 'rbbt/network/paths'
+
+    path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+
+    assert path_tokens.include? 'increase'
+
+  end
+
+  def test_chunk_dep_graph
+    text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
+    graph = SpaCy.chunk_dep_graph(text, true)
+
+    tokens = SpaCy.chunk_segments(text)
+    index = Segment.index tokens
+    tf_s = tokens.select{|t| t.include? "TF" }.first
+    tg_s = tokens.select{|t| t.include? "TG" }.first
+
+
+    require 'rbbt/network/paths'
+
+    path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+
+    assert path_tokens.include? 'increase'
+  end
+
+  def test_paths
+    text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
+    path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
+
+
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+
+    ppp text
+    iii path_tokens
+
+    assert path_tokens.include? 'increase'
+  end
+
+  def test_paths2
+    text = "Deletion and domain swap experiments identified small, discreet positive and negative elements in A-Myb and TF that were required for the regulation of specific genes, such as DHRS2, TG, and mim-1"
+    path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
+
+
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+
+    iii path_tokens
+
+
+    assert path_tokens.include? 'regulation'
+  end
+
+  def test_paths3
+    text = "Therefore, we speculate that PEA3 factors may contribute to the up-regulation of COX-2 expression resulting from both APC mutation and Wnt1 expression"
+    path = SpaCy.paths(text, *Segment.align(text,["PEA3", "Wnt1"]))
+
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+
+  end
 end
 
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-text
 version: !ruby/object:Gem::Version
-  version: 1.3.4
+  version: 1.3.5
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2021-06-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -95,6 +95,7 @@ files:
 - lib/rbbt/ner/oscar4.rb
 - lib/rbbt/ner/patterns.rb
 - lib/rbbt/ner/regexpNER.rb
+- lib/rbbt/ner/rner.rb
 - lib/rbbt/ner/rnorm.rb
 - lib/rbbt/ner/rnorm/cue_index.rb
 - lib/rbbt/ner/rnorm/tokens.rb
@@ -103,6 +104,7 @@ files:
 - lib/rbbt/nlp/nlp.rb
 - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
 - lib/rbbt/nlp/spaCy.rb
+- lib/rbbt/relationship.rb
 - lib/rbbt/segment.rb
 - lib/rbbt/segment/annotation.rb
 - lib/rbbt/segment/encoding.rb
@@ -126,6 +128,7 @@ files:
 - share/install/software/OpenNLP
 - share/install/software/StanfordParser
 - share/patterns/drug_induce_disease
+- share/rner/config.rb
 - share/rnorm/cue_default
 - share/rnorm/tokens_default
 - share/wordlists/stopwords
@@ -148,6 +151,7 @@ files:
 - test/rbbt/ner/test_oscar4.rb
 - test/rbbt/ner/test_patterns.rb
 - test/rbbt/ner/test_regexpNER.rb
+- test/rbbt/ner/test_rner.rb
 - test/rbbt/ner/test_rnorm.rb
 - test/rbbt/ner/test_token_trieNER.rb
 - test/rbbt/nlp/genia/test_sentence_splitter.rb
@@ -182,7 +186,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.1.4
 signing_key:
 specification_version: 4
 summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
@@ -210,6 +214,7 @@ test_files:
 - test/rbbt/ner/test_banner.rb
 - test/rbbt/ner/test_token_trieNER.rb
 - test/rbbt/ner/test_finder.rb
+- test/rbbt/ner/test_rner.rb
 - test/rbbt/ner/test_linnaeus.rb
 - test/rbbt/ner/test_oscar4.rb
 - test/rbbt/test_segment.rb