rbbt-text 1.3.4 → 1.3.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +20 -0
- data/lib/rbbt/document/annotation.rb +2 -2
- data/lib/rbbt/document/corpus/pubmed.rb +14 -5
- data/lib/rbbt/document/corpus.rb +10 -7
- data/lib/rbbt/document.rb +7 -3
- data/lib/rbbt/ner/abner.rb +3 -2
- data/lib/rbbt/ner/banner.rb +3 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/linnaeus.rb +2 -1
- data/lib/rbbt/ner/oscar3.rb +0 -1
- data/lib/rbbt/ner/oscar4.rb +0 -1
- data/lib/rbbt/ner/rner.rb +229 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +3 -1
- data/lib/rbbt/ner/rnorm.rb +5 -1
- data/lib/rbbt/ner/token_trieNER.rb +2 -1
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +1 -1
- data/lib/rbbt/nlp/spaCy.rb +158 -15
- data/lib/rbbt/relationship.rb +24 -0
- data/lib/rbbt/segment/named_entity.rb +4 -0
- data/lib/rbbt/segment/range_index.rb +1 -1
- data/lib/rbbt/segment/transformed.rb +9 -1
- data/lib/rbbt/segment.rb +3 -0
- data/share/install/software/OpenNLP +3 -8
- data/share/rner/config.rb +51 -0
- data/test/rbbt/document/corpus/test_pubmed.rb +1 -1
- data/test/rbbt/document/test_annotation.rb +10 -1
- data/test/rbbt/document/test_corpus.rb +14 -0
- data/test/rbbt/ner/rnorm/test_tokens.rb +11 -0
- data/test/rbbt/ner/test_rner.rb +132 -0
- data/test/rbbt/ner/test_rnorm.rb +5 -0
- data/test/rbbt/segment/test_named_entity.rb +2 -1
- data/test/rbbt/segment/test_transformed.rb +13 -30
- data/test/test_spaCy.rb +113 -1
- metadata +13 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8dfc374254fcbe88c8be6bfffd9a3cfabf6e23c953c11ecd2f61cf41027ff3d6
|
4
|
+
data.tar.gz: 3d3211f41cfecea05862505d1508a4b7b76eecb3c90b3b0000194eb08033715e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ed870e46bae2c113d0885697bfbade6064732a89477833c640eaf4ee8bdb2c0fbf52f69f456af5eb30a82e56a7f0aeb37e71127f884430c3d315202a07fa3cb
|
7
|
+
data.tar.gz: e31853e816321a5ead788036b5f67eecaca179c75168c0bb2804be1f18ae844031ab808a4e3c9d67e1f9a52f94ca478949798b8101e164eba32481c0182a1f58
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010-2022 Miguel Vázquez García
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
@@ -13,7 +13,7 @@ module Document
|
|
13
13
|
end
|
14
14
|
|
15
15
|
docid = self.docid
|
16
|
-
segments.each{|s| s.docid = docid
|
16
|
+
segments.each{|s| s.docid = docid }
|
17
17
|
|
18
18
|
segments
|
19
19
|
end
|
@@ -36,7 +36,7 @@ module Document
|
|
36
36
|
|
37
37
|
docid = document.docid
|
38
38
|
|
39
|
-
segments.each{|s| s.docid = docid
|
39
|
+
segments.each{|s| s.docid = docid }
|
40
40
|
|
41
41
|
segments
|
42
42
|
end
|
@@ -1,21 +1,30 @@
|
|
1
1
|
require 'rbbt/sources/pubmed'
|
2
2
|
|
3
3
|
module Document::Corpus
|
4
|
-
|
4
|
+
PUBMED_NAMESPACE="PMID"
|
5
|
+
def add_pmid(pmid, type = nil, update = false)
|
6
|
+
type = :abstract if type.nil?
|
7
|
+
if update == false
|
8
|
+
id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
|
9
|
+
documents = self.documents(id)
|
10
|
+
return documents if documents.any?
|
11
|
+
end
|
12
|
+
|
5
13
|
pmids = Array === pmid ? pmid : [pmid]
|
6
14
|
type = nil if String === type and type.empty?
|
7
15
|
|
8
16
|
res = PubMed.get_article(pmids).collect do |pmid, article|
|
9
|
-
document = if type.
|
10
|
-
Document.setup(article.abstract || "",
|
17
|
+
document = if type.to_sym == :abstract
|
18
|
+
Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, :abstract, self, :corpus => self)
|
11
19
|
elsif type.to_sym == :title
|
12
|
-
Document.setup(article.title,
|
20
|
+
Document.setup(article.title, PUBMED_NAMESPACE, pmid, :title, self)
|
13
21
|
else
|
14
22
|
raise "No FullText available for #{ pmid }" if article.full_text.nil?
|
15
|
-
Document.setup(article.full_text,
|
23
|
+
Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)
|
16
24
|
end
|
17
25
|
Log.debug "Loading pmid #{pmid}"
|
18
26
|
add_document(document)
|
27
|
+
document
|
19
28
|
end
|
20
29
|
|
21
30
|
Document.setup(res)
|
data/lib/rbbt/document/corpus.rb
CHANGED
@@ -3,8 +3,10 @@ require 'rbbt-util'
|
|
3
3
|
module Document::Corpus
|
4
4
|
|
5
5
|
def self.setup(corpus)
|
6
|
+
corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
|
6
7
|
corpus.extend Document::Corpus unless Document::Corpus === corpus
|
7
8
|
corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
|
9
|
+
corpus.close
|
8
10
|
corpus
|
9
11
|
end
|
10
12
|
|
@@ -16,22 +18,23 @@ module Document::Corpus
|
|
16
18
|
end
|
17
19
|
end
|
18
20
|
|
19
|
-
def docids(prefix)
|
20
|
-
prefix
|
21
|
+
def docids(*prefix)
|
22
|
+
prefix = prefix * ":"
|
23
|
+
prefix += ":" unless prefix == :all || prefix == "all" || prefix[-1] == ":"
|
21
24
|
docids = self.read_and_close do
|
22
|
-
prefix ==
|
25
|
+
prefix == "all" ? self.keys : self.prefix(prefix)
|
23
26
|
end
|
24
27
|
DocID.setup(docids, :corpus => self)
|
25
28
|
end
|
26
29
|
|
27
|
-
def documents(prefix)
|
28
|
-
self.docids(prefix).document
|
30
|
+
def documents(*prefix)
|
31
|
+
self.docids(*prefix).document
|
29
32
|
end
|
30
33
|
|
31
34
|
def [](*args)
|
32
35
|
docid, *rest = args
|
33
36
|
|
34
|
-
res = self.
|
37
|
+
res = self.with_read do
|
35
38
|
super(*args)
|
36
39
|
end
|
37
40
|
|
@@ -41,7 +44,7 @@ module Document::Corpus
|
|
41
44
|
namespace, id, type = docid.split(":")
|
42
45
|
|
43
46
|
if res.nil?
|
44
|
-
if Document::Corpus.claims.include?(namespace.to_s)
|
47
|
+
if Document::Corpus.claims && Document::Corpus.claims.include?(namespace.to_s)
|
45
48
|
res = self.instance_exec(id, type, &Document::Corpus.claims[namespace.to_s])
|
46
49
|
end
|
47
50
|
end
|
data/lib/rbbt/document.rb
CHANGED
@@ -22,9 +22,7 @@ module DocID
|
|
22
22
|
if Array === self
|
23
23
|
namespace, id, type = nil, nil, nil
|
24
24
|
docs = self.collect do |docid|
|
25
|
-
|
26
|
-
namespace, id, type = docid.split(":")
|
27
|
-
text
|
25
|
+
self.corpus[docid]
|
28
26
|
end
|
29
27
|
Document.setup(docs, :corpus => corpus)
|
30
28
|
else
|
@@ -53,3 +51,9 @@ module Document
|
|
53
51
|
alias id docid
|
54
52
|
end
|
55
53
|
|
54
|
+
#class String
|
55
|
+
# def docid
|
56
|
+
# digest = Misc.digest(self)
|
57
|
+
# ["STRING", digest, nil, nil] * ":"
|
58
|
+
# end
|
59
|
+
#end
|
data/lib/rbbt/ner/abner.rb
CHANGED
@@ -39,14 +39,15 @@ class Abner < NER
|
|
39
39
|
types = res[1]
|
40
40
|
strings = res[0]
|
41
41
|
|
42
|
+
docid = Misc.digest(text)
|
42
43
|
global_offset = 0
|
43
44
|
strings.zip(types).collect do |mention, type|
|
44
45
|
mention = mention.to_s;
|
45
46
|
offset = text.index(mention)
|
46
47
|
if offset.nil?
|
47
|
-
NamedEntity.setup(mention,
|
48
|
+
NamedEntity.setup(mention, :docid => docid, :entity_type => type)
|
48
49
|
else
|
49
|
-
NamedEntity.setup(mention, offset + global_offset, type.to_s)
|
50
|
+
NamedEntity.setup(mention, :offset => offset + global_offset, :docid => docid, :entity_type => type.to_s)
|
50
51
|
text = text[offset + mention.length..-1]
|
51
52
|
global_offset += offset + mention.length
|
52
53
|
end
|
data/lib/rbbt/ner/banner.rb
CHANGED
@@ -55,6 +55,7 @@ class Banner < NER
|
|
55
55
|
# text.
|
56
56
|
def match(text)
|
57
57
|
return [] if text.nil?
|
58
|
+
text = text.dup if text.frozen?
|
58
59
|
text.gsub!(/\n/,' ')
|
59
60
|
text.gsub!(/\|/,'/') # Character | gives an error
|
60
61
|
return [] if text.strip.empty?
|
@@ -66,6 +67,7 @@ class Banner < NER
|
|
66
67
|
@parenPP.postProcess(sentence)
|
67
68
|
tagged = sentence.getSGML
|
68
69
|
|
70
|
+
docid = Misc.digest text
|
69
71
|
res = tagged.scan(/<GENE>.*?<\/GENE>/).
|
70
72
|
collect{|r|
|
71
73
|
r.match(/<GENE>(.*?)<\/GENE>/)
|
@@ -73,7 +75,7 @@ class Banner < NER
|
|
73
75
|
mention.sub!(/^\s*/,'')
|
74
76
|
mention.sub!(/\s*$/,'')
|
75
77
|
offset = text.index(mention)
|
76
|
-
NamedEntity.setup(mention, offset, 'GENE')
|
78
|
+
NamedEntity.setup(mention, :offset => offset, :docid => docid, :entity_type => 'GENE')
|
77
79
|
mention
|
78
80
|
}
|
79
81
|
res
|
data/lib/rbbt/ner/brat.rb
CHANGED
data/lib/rbbt/ner/linnaeus.rb
CHANGED
@@ -31,7 +31,8 @@ module Linnaeus
|
|
31
31
|
init unless defined? @@Matcher
|
32
32
|
|
33
33
|
@@Matcher.match(text).toArray().collect do |mention|
|
34
|
-
|
34
|
+
best_id, best_prob = mention.ids().zip(mention.probabilities()).sort_by{|i,p| p.to_f }.last
|
35
|
+
NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => best_id, :score => best_prob)
|
35
36
|
end
|
36
37
|
end
|
37
38
|
end
|
data/lib/rbbt/ner/oscar3.rb
CHANGED
data/lib/rbbt/ner/oscar4.rb
CHANGED
@@ -0,0 +1,229 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/open'
|
3
|
+
require 'rbbt/util/misc'
|
4
|
+
require 'rbbt/util/simpleDSL'
|
5
|
+
|
6
|
+
# Computes per-token feature vectors and CRF++ templates for named entity
# recognition. Feature definitions are loaded through SimpleDSL, either from
# a configuration file or from a block; every feature is declared with
# +define+ and may additionally be marked as contextual with +context+.
class NERFeatures
  include SimpleDSL

  # Splits +text+ into tokens. In order of preference a token is: a chunk
  # containing a number (optionally prefixed by word characters and a hyphen),
  # a single character followed by a hyphen and word characters, a word
  # followed by a hyphen and one capital letter, a plain word, or one of the
  # punctuation characters listed in the last alternative.
  def self.tokens(text)
    text.scan(/
      \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
      \w-\w*|
      \w+-[A-Z](?!\w)|
      \w+|
      [.,()\/\[\]{}'"+-]
    /x)
  end

  # Returns the tokens of +text+ in reverse order, joined by single spaces.
  def self.reverse(text)
    tokens(text).reverse.join(" ")
  end

  # Registers the feature +name+. The action is the first extra argument, or
  # the block, or (when neither is given) a case-insensitive regexp built from
  # the name with an optional trailing "s". Returns the name as a String.
  #
  # Raises RuntimeError ("Wrong format") unless the action is a Proc or Regexp.
  def define(name, *args, &block)
    action = args[0] || block || /#{name.to_s}s?/i
    raise "Wrong format" unless action.is_a?(Proc) || action.is_a?(Regexp)

    key = name.to_s
    @types[key] = action
    @order.push key

    key
  end

  attr_accessor :reverse

  # file::    configuration file with the feature definitions; when neither a
  #           file nor a block is given the shared default is looked up.
  # reverse:: when true, texts are tokenized in reverse order.
  def initialize(file = nil, reverse = false, &block)
    @types = {}
    @order = []
    @context = []
    @reverse = reverse

    # NOTE(review): the gem's file list shows share/rner/config.rb, while this
    # lookup goes through Rbbt.share.ner -- confirm the path resolves.
    file = Rbbt.share.ner['config.rb'].find unless file || block

    parse(:define, file, &block)
  end

  # Returns the :define entry of @config (presumably populated by
  # SimpleDSL#parse -- not visible here).
  def config
    @config[:define]
  end

  # Sets the relative token positions used for contextual features.
  def window(positions)
    @window = positions
  end

  # Marks one feature name, or an Array of them, as contextual.
  def context(name, &block)
    if name.is_a? Array
      @context += name
    else
      @context.push name

      # The block might be wrongly assigned to this function
      # instead of the actual definition, fix that.
      @types[name] = block if block
    end
  end

  # Switches to reverse tokenization when +dir+ is :reverse (or "reverse").
  def direction(dir)
    @reverse = true if dir.to_sym == :reverse
  end

  # Returns the feature vector for +word+: the word itself followed by one
  # value per defined feature, in definition order. Proc features contribute
  # their return value; Regexp features contribute the first capture group if
  # it matched, +true+ for a match without one, and +false+ otherwise.
  def features(word)
    @order.each_with_object([word]) do |name, values|
      action = @types[name]
      if action.is_a?(Proc)
        values.push(action.call(word))
      else
        m = action.match(word)
        values.push(m ? (m[1] || true) : false)
      end
    end
  end

  # Builds the CRF++ template text: one unigram line per feature, extra lines
  # for each +window+ position of contextual features, and a final "B" line.
  # +window+ defaults to the configured window or [1, -1].
  def template(window = nil)
    window ||= @window || [1, -1]
    lines = []

    @order.each_with_index do |feat, idx|
      column = idx + 1 # column 0 of the feature rows holds the token itself
      lines << "U#{ feat }: %x[0,#{ column }]\n"

      next unless @context.include?(feat)

      window.each do |p|
        lines << "U#{ feat }##{ p}: %x[#{ p },#{ column }]\n"
      end
    end

    lines << "B\n"
    lines.join
  end

  # Returns one feature vector per token of +text+. When +positive+ is not
  # nil a label is appended to every vector: 0 for negative text, 1 for the
  # first token of a positive text and 2 for its remaining tokens.
  def text_features(text, positive = nil)
    text = self.class.reverse(text) if @reverse
    initial = true
    self.class.tokens(text).collect do |token|
      token_features = features(token)
      unless positive.nil?
        token_features << (positive ? (initial ? 1 : 2) : 0)
        initial = false
      end
      token_features
    end
  end

  # Returns labelled feature vectors for +text+, treating every occurrence of
  # one of +mentions+ as positive and everything else as negative.
  def tagged_features(text, mentions)
    mentions ||= []
    mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
    re = mentions.collect { |mention|
      # Regexp.quote escapes a space as "\ ", so that is the sequence to
      # relax into \s+; every occurrence is relaxed so that multi-word
      # mentions match whatever whitespace separates the words in the text.
      Regexp.quote(mention.gsub(/\s+/, ' ')).gsub('\\ ') { '\s+' }
    }.join("|")

    # Splitting on a captured group yields alternating non-mention and
    # mention chunks, starting with a (possibly empty) non-mention chunk.
    positive = false
    features = []
    text.split(/(#{re})/).each do |chunk|
      chunk_features = text_features(chunk, positive)
      positive = !positive
      features = @reverse ? chunk_features + features : features + chunk_features
    end
    features
  end

  # Runs crf_learn on the +features+ file to produce +model+, then writes the
  # feature configuration next to the model as "<model>.config".
  def train(features, model)
    tmp_template = TmpFile.tmp_file("template-")
    Open.write(tmp_template, template)

    begin
      # Arguments are passed separately so no shell is involved and paths
      # containing quotes or spaces cannot break the command line.
      crf_learn = File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')
      system(crf_learn, tmp_template.to_s, features.to_s, model.to_s)
      Open.write(model + '.config', config)
    ensure
      # Remove the temporary template even when training or writing fails.
      FileUtils.rm tmp_template
    end
  end

end
|
164
|
+
|
165
|
+
# Named entity recognizer that labels tokens with a CRF++ model and groups
# consecutively labelled tokens into mentions.
class NER

  # model:: path of the CRF++ model; a "<model>.config" file with the feature
  #         definitions is expected next to it. Defaults to the BC2 model
  #         under Rbbt.datadir.
  def initialize(model = nil)
    begin
      require 'CRFPP'
    rescue LoadError
      # Only a failed load should trigger the bundled fallback; rescuing
      # Exception would also swallow interrupts and exit requests.
      require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
    end

    model ||= File.join(Rbbt.datadir, 'ner/model/BC2')

    @parser = NERFeatures.new(model + '.config')
    @reverse = @parser.reverse
    @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
  end

  # Returns the mentions found in +text+ as an Array of Strings, each being
  # the tokens of one mention joined by single spaces.
  def extract(text)
    features = @parser.text_features(text)

    @tagger.clear
    features.each { |feats| @tagger.add(feats.join(" ")) }
    @tagger.parse

    found = []
    mention = []

    @tagger.size.times do |i|
      label = @tagger.y(i)
      word = @tagger.x(i, 0)

      # A closing parenthesis is kept only when the current mention has an
      # opening one; it never starts or ends a mention by itself.
      if word == ')'
        mention.push(')') if mention.join =~ /\(/
        next
      end

      case label
      when 1
        # Label 1 marks the first token of a mention. An already open mention
        # is closed first only when it, or one of its tokens, is "special"
        # (String#is_special? is defined outside this file).
        if mention.any? && (mention.join(" ").is_special? || mention.any? { |m| m.is_special? })
          found.push(mention)
          mention = []
        end
        mention.push(word)
      when 2
        # Label 2 marks the continuation of a mention.
        mention.push(word)
      when 0
        # Label 0 marks a token outside any mention.
        found.push(mention) if mention.any?
        mention = []
      end
    end

    found << mention if mention.any?

    found.collect do |list|
      # In reverse mode tokens were tagged back to front; restore text order.
      list = list.reverse if @reverse
      list.join(" ")
    end
  end

end
|
227
|
+
|
228
|
+
|
229
|
+
|
@@ -172,6 +172,7 @@ class Tokenizer
|
|
172
172
|
|
173
173
|
#{{{ Token Types
|
174
174
|
GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
|
175
|
+
GREEK_LETTER_RE = "(?:" + $inverse_greek.keys.select{|w| w.length == 1}.collect{|w| w.upcase}.join("|") + ")"
|
175
176
|
def tokenize(word)
|
176
177
|
return word.
|
177
178
|
gsub(/([^IVX])I$/,'\1|I|'). # Separate last roman number
|
@@ -180,6 +181,7 @@ class Tokenizer
|
|
180
181
|
gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
|
181
182
|
gsub(/^(#{GREEK_RE})/,'\1-').
|
182
183
|
gsub(/(#{GREEK_RE})$/,'-\1').
|
184
|
+
gsub(/(#{GREEK_LETTER_RE})$/,'-\1').
|
183
185
|
split( /[^\w.]+/). # Split by separator char
|
184
186
|
select{|t| !t.empty? }
|
185
187
|
end
|
@@ -204,7 +206,7 @@ class Tokenizer
|
|
204
206
|
end
|
205
207
|
|
206
208
|
#{{{ Comparisons
|
207
|
-
|
209
|
+
|
208
210
|
def evaluate_tokens(list1, list2)
|
209
211
|
@operations.inject(0){|acc, o|
|
210
212
|
acc + o.eval(list1, list2)
|
data/lib/rbbt/ner/rnorm.rb
CHANGED
@@ -18,6 +18,10 @@ class Normalizer
|
|
18
18
|
values.select{|p| p[1] == best}
|
19
19
|
end
|
20
20
|
|
21
|
+
def token_evaluate(mention, name)
|
22
|
+
@tokens.evaluate(mention, name)
|
23
|
+
end
|
24
|
+
|
21
25
|
# Compares the tokens and gives each candidate a score based on the
|
22
26
|
# commonalities and differences amongst the tokens.
|
23
27
|
def token_score(code, mention)
|
@@ -31,7 +35,7 @@ class Normalizer
|
|
31
35
|
when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
|
32
36
|
80
|
33
37
|
else
|
34
|
-
|
38
|
+
token_evaluate(mention, name)
|
35
39
|
end
|
36
40
|
[value, name]
|
37
41
|
}.sort_by{|value, name| value }.last
|
@@ -249,7 +249,8 @@ class TokenTrieNER < NER
|
|
249
249
|
match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
|
250
250
|
}
|
251
251
|
|
252
|
-
|
252
|
+
type = type.first
|
253
|
+
NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes, :type => type)
|
253
254
|
end
|
254
255
|
|
255
256
|
attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
|
@@ -7,7 +7,7 @@ module OpenNLP
|
|
7
7
|
Rbbt.claim Rbbt.software.opt.OpenNLP, :install, Rbbt.share.install.software.OpenNLP.find
|
8
8
|
|
9
9
|
|
10
|
-
Rbbt.claim Rbbt.software.opt.OpenNLP.models["da-sent.bin"], :url, "
|
10
|
+
Rbbt.claim Rbbt.software.opt.OpenNLP.models["da-sent.bin"], :url, "https://www.apache.org/dyn/closer.cgi/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"
|
11
11
|
|
12
12
|
MAX = 5
|
13
13
|
|