rbbt-text 1.3.0 → 1.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +20 -5
- data/lib/rbbt/document/annotation.rb +7 -4
- data/lib/rbbt/document/corpus.rb +30 -3
- data/lib/rbbt/document/corpus/pubmed.rb +2 -1
- data/lib/rbbt/ner/abner.rb +3 -2
- data/lib/rbbt/ner/banner.rb +3 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/g_norm_plus.rb +7 -1
- data/lib/rbbt/ner/linnaeus.rb +2 -1
- data/lib/rbbt/ner/patterns.rb +0 -1
- data/lib/rbbt/ner/rner.rb +229 -0
- data/lib/rbbt/ner/token_trieNER.rb +32 -18
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/spaCy.rb +195 -0
- data/lib/rbbt/relationship.rb +24 -0
- data/lib/rbbt/segment.rb +9 -4
- data/lib/rbbt/segment/annotation.rb +3 -3
- data/lib/rbbt/segment/named_entity.rb +7 -0
- data/lib/rbbt/segment/range_index.rb +1 -1
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/segment/transformed.rb +5 -1
- data/share/install/software/OpenNLP +1 -1
- data/share/rner/config.rb +51 -0
- data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
- data/test/rbbt/document/test_annotation.rb +15 -6
- data/test/rbbt/document/test_corpus.rb +15 -1
- data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
- data/test/rbbt/ner/test_rner.rb +132 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
- data/test/rbbt/segment/test_annotation.rb +3 -4
- data/test/rbbt/segment/test_encoding.rb +1 -1
- data/test/rbbt/segment/test_named_entity.rb +7 -5
- data/test/rbbt/segment/test_range_index.rb +1 -2
- data/test/rbbt/segment/test_transformed.rb +33 -4
- data/test/rbbt/test_segment.rb +5 -10
- data/test/test_spaCy.rb +144 -0
- metadata +12 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '0846f900d745dd27df8006eecbc9d294f9f38a23dd76001de2a5dc0313db7e22'
|
4
|
+
data.tar.gz: 675985882a6c8b9813f620d7ef0a555efa5c148c7c2fe36e0030f84f3fd88cf0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dfd9c333b94181496134b825c63d6e93a0390f81d426526f79c00cf12556021b60004b29b57ca9b0b274141937027f7bc780552a60de007e5f790b19910354c0
|
7
|
+
data.tar.gz: 205beeb8829c8358fd29c0a18351522e566106e24220af3d7bec3676694d37d682b92243e4fd4cd495b542f9945a28cf8585e587342672d31779d0b21b53ae4e
|
data/lib/rbbt/bow/bow.rb
CHANGED
@@ -69,6 +69,11 @@ module BagOfWords
|
|
69
69
|
count = bigrams ? count(bigrams(text)) : count(words(text))
|
70
70
|
count.values_at(*terms)
|
71
71
|
end
|
72
|
+
|
73
|
+
# Scores +text+ against a weighted vocabulary.
#
# text    - the text whose term features are computed.
# weights - Hash mapping each term to its numeric weight.
#
# Returns an Array with one entry per key of +weights+ (in key order): the
# value reported by +features+ for that term multiplied by the term's weight.
def self.weighted_features(text, weights)
  terms = weights.keys
  factors = weights.values
  counts = features(text, terms)
  counts.each_with_index.map { |count, position| count * factors[position] }
end
|
72
77
|
end
|
73
78
|
|
74
79
|
class String
|
@@ -82,5 +87,3 @@ class String
|
|
82
87
|
BagOfWords.bigrams(self)
|
83
88
|
end
|
84
89
|
end
|
85
|
-
|
86
|
-
|
data/lib/rbbt/bow/dictionary.rb
CHANGED
@@ -74,28 +74,32 @@ class Dictionary::TF_IDF
|
|
74
74
|
end
|
75
75
|
|
76
76
|
# Selects terms by document frequency and scores them.
#
# options - Hash of settings:
#           :low   - minimum document-frequency value to keep (default 0).
#           :high  - maximum document-frequency value to keep (default 1).
#           :limit - when given, keep only this many top-scoring terms.
#
# Each kept term is scored as
#   @terms[term] / @num_docs * log(1 / df_value)
# Results are memoized per distinct +options+ (keyed by Misc.obj2digest).
#
# Returns a Hash of term => score. With :limit the entries are ordered by
# descending score and exactly +limit+ of them are kept (fewer only when
# there are not enough candidates).
def best(options = {})
  key = Misc.obj2digest(options)
  @best ||= {}
  @best[key] ||= begin
    settings = { :low => 0, :high => 1 }.merge(options)
    high, low, limit = settings.values_at(:high, :low, :limit)

    num_docs = @num_docs.to_f
    scored = df.select { |_term, value|
      value >= low && value <= high
    }.collect { |term, df_value|
      [term, @terms[term].to_f / num_docs * Math::log(1.0 / df_value)]
    }

    # Array#slice(0, n) takes a *length*, so the previous `limit-1` dropped
    # one term too many; `first(limit)` keeps exactly `limit` entries and
    # also copes with a limit of 0.
    scored = scored.sort { |a, b| b[1] <=> a[1] }.first(limit) if limit

    # Build from pairs directly instead of splatting a flattened array.
    Hash[scored]
  end
end
|
100
104
|
|
101
105
|
def weights(options = {})
|
@@ -173,7 +177,7 @@ class Dictionary::KL
|
|
173
177
|
best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
|
174
178
|
}
|
175
179
|
if limit
|
176
|
-
Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
|
180
|
+
Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
|
177
181
|
else
|
178
182
|
best
|
179
183
|
end
|
data/lib/rbbt/document.rb
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
require 'rbbt-util'
|
2
2
|
require 'rbbt/entity'
|
3
|
-
require 'rbbt/document/annotation'
|
4
3
|
|
5
4
|
module DocID
|
6
5
|
extend Entity
|
@@ -19,10 +18,20 @@ module DocID
|
|
19
18
|
DocID.setup([namespace, code, "title"] * ":", :corpus => corpus)
|
20
19
|
end
|
21
20
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
21
|
+
# Resolves document ids to the documents stored in the associated corpus.
#
# For an Array of ids every id is looked up in the corpus and the resulting
# list is set up as a Document array tied to that corpus. For a single id
# ("namespace:code:type") the text is looked up and set up as a Document
# carrying the namespace, code and type parsed from the id.
property :document => :both do
  if Array === self
    # The id parts are not needed here: only the looked-up texts are used.
    docs = self.collect { |docid| self.corpus[docid] }
    Document.setup(docs, :corpus => corpus)
  else
    text = self.corpus[self]
    namespace, id, type = self.split(":")
    Document.setup(text, :namespace => namespace, :code => id, :type => type, :corpus => corpus)
  end
end
|
27
36
|
end
|
28
37
|
|
@@ -44,3 +53,9 @@ module Document
|
|
44
53
|
alias id docid
|
45
54
|
end
|
46
55
|
|
56
|
+
#class String
|
57
|
+
# def docid
|
58
|
+
# digest = Misc.digest(self)
|
59
|
+
# ["STRING", digest, nil, nil] * ":"
|
60
|
+
# end
|
61
|
+
#end
|
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'rbbt/segment'
|
1
2
|
require 'rbbt/segment/annotation'
|
2
3
|
|
3
4
|
module Document
|
@@ -12,7 +13,7 @@ module Document
|
|
12
13
|
end
|
13
14
|
|
14
15
|
docid = self.docid
|
15
|
-
segments.each{|s| s.docid = docid
|
16
|
+
segments.each{|s| s.docid = docid }
|
16
17
|
|
17
18
|
segments
|
18
19
|
end
|
@@ -22,18 +23,20 @@ module Document
|
|
22
23
|
send :property, type => :multiple do |list|
|
23
24
|
doc_segments = self.instance_exec list, &block
|
24
25
|
|
25
|
-
doc_segments = doc_segments.chunked_values_at(
|
26
|
+
doc_segments = doc_segments.chunked_values_at(list) if Hash === doc_segments
|
26
27
|
|
27
28
|
doc_segments.each_with_index do |segments,i|
|
29
|
+
next if segments.nil?
|
28
30
|
document = list[i]
|
29
|
-
Segment.align(document, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
|
31
|
+
Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
|
30
32
|
|
31
33
|
segments.each do |segment|
|
32
34
|
SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
|
33
35
|
end
|
34
36
|
|
35
37
|
docid = document.docid
|
36
|
-
|
38
|
+
|
39
|
+
segments.each{|s| s.docid = docid }
|
37
40
|
|
38
41
|
segments
|
39
42
|
end
|
data/lib/rbbt/document/corpus.rb
CHANGED
@@ -3,17 +3,43 @@ require 'rbbt-util'
|
|
3
3
|
module Document::Corpus
|
4
4
|
|
5
5
|
# Prepares +corpus+ to be used as a document corpus.
#
# corpus - either a String, which is handed to Persist.open_tokyocabinet to
#          obtain the backing store, or an already opened store object.
#
# The store is extended with Document::Corpus and Persist::TSVAdapter when
# it does not already include them.
#
# Returns the prepared store.
def self.setup(corpus)
  store = case corpus
          when String then Persist.open_tokyocabinet(corpus, true, :single, "BDB")
          else corpus
          end

  [Document::Corpus, Persist::TSVAdapter].each do |mixin|
    store.extend(mixin) unless mixin === store
  end

  store
end
|
8
11
|
|
9
12
|
# Stores +document+ in the corpus under its docid unless that docid is
# already present.
#
# Returns the entry already stored under the docid when there is one;
# otherwise the result of the write_and_close block that stores +document+.
def add_document(document)
  id = document.docid

  if self.include?(id)
    self[id]
  else
    self.write_and_close { self[id] = document }
  end
end
|
19
|
+
|
20
|
+
# Lists the document ids stored in the corpus.
#
# prefix - the leading parts of the ids to select (for example a namespace,
#          or a namespace and a code); they are joined with ":" and a
#          trailing ":" is appended when missing. Passing the single Symbol
#          :all selects every key instead.
#
# Returns the matching ids, set up as DocID with this corpus attached.
def docids(*prefix)
  # Decide about :all *before* joining: once joined the value is a String
  # and can never equal the Symbol, which made the "all" branch unreachable.
  select_all = prefix == [:all]

  unless select_all
    prefix = prefix * ":"
    prefix += ":" unless prefix[-1] == ":"
  end

  docids = self.read_and_close do
    select_all ? self.keys : self.prefix(prefix)
  end

  DocID.setup(docids, :corpus => self)
end
|
28
|
+
|
29
|
+
# Fetches the documents whose ids match +prefix+.
#
# prefix - forwarded unchanged to +docids+.
#
# Returns whatever the +document+ method of the id list returned by
# +docids+ yields.
def documents(*prefix)
  ids = self.docids(*prefix)
  ids.document
end
|
12
32
|
|
13
33
|
def [](*args)
|
14
34
|
docid, *rest = args
|
15
|
-
|
35
|
+
|
36
|
+
res = self.read_and_close do
|
37
|
+
super(*args)
|
38
|
+
end
|
39
|
+
|
40
|
+
res.force_encoding(Encoding.default_external) if res
|
16
41
|
return res if args.length > 1
|
42
|
+
|
17
43
|
namespace, id, type = docid.split(":")
|
18
44
|
|
19
45
|
if res.nil?
|
@@ -22,6 +48,7 @@ module Document::Corpus
|
|
22
48
|
end
|
23
49
|
end
|
24
50
|
|
51
|
+
res.force_encoding(Encoding.default_external) if res
|
25
52
|
Document.setup(res, namespace, id, type, self) unless res.nil?
|
26
53
|
|
27
54
|
res
|
@@ -6,7 +6,6 @@ module Document::Corpus
|
|
6
6
|
type = nil if String === type and type.empty?
|
7
7
|
|
8
8
|
res = PubMed.get_article(pmids).collect do |pmid, article|
|
9
|
-
Log.debug "Loading pmid #{pmid}"
|
10
9
|
document = if type.nil? || type.to_sym == :abstract
|
11
10
|
Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
|
12
11
|
elsif type.to_sym == :title
|
@@ -15,7 +14,9 @@ module Document::Corpus
|
|
15
14
|
raise "No FullText available for #{ pmid }" if article.full_text.nil?
|
16
15
|
Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
|
17
16
|
end
|
17
|
+
Log.debug "Loading pmid #{pmid}"
|
18
18
|
add_document(document)
|
19
|
+
document
|
19
20
|
end
|
20
21
|
|
21
22
|
Document.setup(res)
|
data/lib/rbbt/ner/abner.rb
CHANGED
@@ -39,14 +39,15 @@ class Abner < NER
|
|
39
39
|
types = res[1]
|
40
40
|
strings = res[0]
|
41
41
|
|
42
|
+
docid = Misc.digest(text)
|
42
43
|
global_offset = 0
|
43
44
|
strings.zip(types).collect do |mention, type|
|
44
45
|
mention = mention.to_s;
|
45
46
|
offset = text.index(mention)
|
46
47
|
if offset.nil?
|
47
|
-
NamedEntity.setup(mention,
|
48
|
+
NamedEntity.setup(mention, :docid => docid, :entity_type => type)
|
48
49
|
else
|
49
|
-
NamedEntity.setup(mention, offset + global_offset, type.to_s)
|
50
|
+
NamedEntity.setup(mention, :offset => offset + global_offset, :docid => docid, :entity_type => type.to_s)
|
50
51
|
text = text[offset + mention.length..-1]
|
51
52
|
global_offset += offset + mention.length
|
52
53
|
end
|
data/lib/rbbt/ner/banner.rb
CHANGED
@@ -55,6 +55,7 @@ class Banner < NER
|
|
55
55
|
# text.
|
56
56
|
def match(text)
|
57
57
|
return [] if text.nil?
|
58
|
+
text = text.dup if text.frozen?
|
58
59
|
text.gsub!(/\n/,' ')
|
59
60
|
text.gsub!(/\|/,'/') # Character | gives an error
|
60
61
|
return [] if text.strip.empty?
|
@@ -66,6 +67,7 @@ class Banner < NER
|
|
66
67
|
@parenPP.postProcess(sentence)
|
67
68
|
tagged = sentence.getSGML
|
68
69
|
|
70
|
+
docid = Misc.digest text
|
69
71
|
res = tagged.scan(/<GENE>.*?<\/GENE>/).
|
70
72
|
collect{|r|
|
71
73
|
r.match(/<GENE>(.*?)<\/GENE>/)
|
@@ -73,7 +75,7 @@ class Banner < NER
|
|
73
75
|
mention.sub!(/^\s*/,'')
|
74
76
|
mention.sub!(/\s*$/,'')
|
75
77
|
offset = text.index(mention)
|
76
|
-
NamedEntity.setup(mention, offset, 'GENE')
|
78
|
+
NamedEntity.setup(mention, :offset => offset, :docid => docid, :entity_type => 'GENE')
|
77
79
|
mention
|
78
80
|
}
|
79
81
|
res
|
data/lib/rbbt/ner/brat.rb
CHANGED
data/lib/rbbt/ner/g_norm_plus.rb
CHANGED
@@ -55,11 +55,16 @@ EOF
|
|
55
55
|
Open.mkdir 'tmp'
|
56
56
|
|
57
57
|
texts.each do |name,text|
|
58
|
+
text = Misc.fixutf8(text)
|
59
|
+
|
60
|
+
text = text.gsub('|', '#').gsub("\n", " ").gsub(/\t/,' ')
|
61
|
+
|
58
62
|
Open.write("input/#{name}.txt") do |f|
|
59
|
-
f.puts "#{name}|a|" << text
|
63
|
+
f.puts "#{name}|a|" << text
|
60
64
|
f.puts
|
61
65
|
end
|
62
66
|
end
|
67
|
+
|
63
68
|
Open.write('config', CONFIG)
|
64
69
|
CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
|
65
70
|
|
@@ -95,6 +100,7 @@ EOF
|
|
95
100
|
|
96
101
|
res[name] = segments
|
97
102
|
end
|
103
|
+
res
|
98
104
|
end
|
99
105
|
end
|
100
106
|
|
data/lib/rbbt/ner/linnaeus.rb
CHANGED
@@ -31,7 +31,8 @@ module Linnaeus
|
|
31
31
|
init unless defined? @@Matcher
|
32
32
|
|
33
33
|
@@Matcher.match(text).toArray().collect do |mention|
|
34
|
-
|
34
|
+
best_id, best_prob = mention.ids().zip(mention.probabilities()).sort_by{|i,p| p.to_f }.last
|
35
|
+
NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => best_id, :score => best_prob)
|
35
36
|
end
|
36
37
|
end
|
37
38
|
end
|
data/lib/rbbt/ner/patterns.rb
CHANGED
@@ -15,7 +15,6 @@ class PatternRelExt
|
|
15
15
|
segments = sentence.segments
|
16
16
|
segments = segments.values.flatten if Hash === segments
|
17
17
|
Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
|
18
|
-
ppp sentence
|
19
18
|
regexpNER.entities(sentence)
|
20
19
|
end
|
21
20
|
end
|
@@ -0,0 +1,229 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/open'
|
3
|
+
require 'rbbt/util/misc'
|
4
|
+
require 'rbbt/util/simpleDSL'
|
5
|
+
|
6
|
+
# Token feature extractor used to train and run the CRF++ based tagger.
#
# Feature definitions are loaded through SimpleDSL's +parse+ (from a
# configuration file or a block). Each definition is either a Regexp or a
# Proc applied to a token; +context+, +window+ and +direction+ tune how the
# CRF++ template is generated and in which order tokens are processed.
class NERFeatures
  include SimpleDSL

  # Splits +text+ into tokens: words (optionally hyphenated or containing
  # numbers) and a fixed set of punctuation characters.
  #
  # Returns an Array of Strings.
  def self.tokens(text)
    text.scan(/
      \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
      \w-\w*|
      \w+-[A-Z](?!\w)|
      \w+|
      [.,()\/\[\]{}'"+-]
      /x)
  end

  # Returns the tokens of +text+ in reverse order joined by single spaces.
  def self.reverse(text)
    tokens(text).reverse.join(" ")
  end

  # Registers the feature +name+.
  #
  # The action is the first extra argument, else the block, else a
  # case-insensitive Regexp built from the name (with an optional trailing
  # "s").
  #
  # Raises RuntimeError ("Wrong format") when the action is neither a Proc
  # nor a Regexp. Returns the feature name as a String.
  def define(name, *args, &block)
    action = args[0] || block || /#{name.to_s}s?/i
    raise "Wrong format" unless action.is_a?(Proc) || action.is_a?(Regexp)

    @types[name.to_s] = action
    @order.push name.to_s

    name.to_s
  end

  attr_accessor :reverse

  # file    - configuration file with the feature definitions; when neither
  #           a file nor a block is given the bundled default is used.
  # reverse - whether tokens are processed in reverse order.
  # block   - alternative way of providing the definitions.
  def initialize(file = nil, reverse = false, &block)
    @types = {}
    @order = []
    @context = []
    @reverse = reverse

    # NOTE(review): the gem ships its default configuration as
    # share/rner/config.rb, so the lookup uses `rner` rather than `ner`.
    file = Rbbt.share.rner['config.rb'].find if !file && !block

    parse(:define, file, &block)
  end

  # Returns the :define entry of @config (presumably filled in by
  # SimpleDSL#parse; verify against SimpleDSL).
  def config
    @config[:define]
  end

  # Sets the relative token positions used for context features.
  def window(positions)
    @window = positions
  end

  # Marks one feature (or an Array of features) as contextual, so that the
  # template also emits it for the neighbouring positions of the window.
  def context(name, &block)
    if name.is_a? Array
      @context += name
    else
      @context.push name

      # The block might be wrongly assigned to this function
      # instead of the actual definition, fix that.
      @types[name] = block if block
    end
  end

  # Switches to reverse processing when +dir+ is :reverse.
  def direction(dir)
    @reverse = true if dir.to_sym == :reverse
  end

  # Computes the feature values of a single token.
  #
  # Returns an Array starting with +word+ followed by one value per defined
  # feature, in definition order. Proc features contribute their return
  # value; Regexp features contribute the first capture group when it
  # matched, +true+ for a match without it, and +false+ otherwise.
  def features(word)
    @order.each_with_object([word]) do |name, values|
      action = @types[name]
      values << if action.is_a?(Proc)
                  action.call(word)
                else
                  match = action.match(word)
                  match ? (match[1] || true) : false
                end
    end
  end

  # Builds the CRF++ template text.
  #
  # window - relative positions for contextual features; defaults to the
  #          configured window or [1,-1].
  #
  # Returns a String with one unigram line per feature (plus one per window
  # position for contextual features) and a final "B" line.
  def template(window=nil)
    window ||= @window || [1,-1]
    template = ""

    @order.each_with_index do |feat, index|
      i = index + 1 # column 0 of the feature rows holds the token itself
      template += "U#{ feat }: %x[0,#{ i }]\n"

      next unless @context.include?(feat)

      window.each do |p|
        template += "U#{ feat }##{ p}: %x[#{ p },#{ i }]\n"
      end
    end

    template += "B\n"

    template
  end

  # Computes the feature rows of every token in +text+.
  #
  # positive - nil to emit plain feature rows; otherwise a label is appended
  #            to each row: 0 when false, and when true 1 for the first
  #            token and 2 for the following ones.
  #
  # Returns an Array of feature Arrays.
  def text_features(text, positive = nil)
    text = self.class.reverse(text) if @reverse
    initial = true
    self.class.tokens(text).collect do |token|
      features = features(token)
      unless positive.nil?
        features << (positive ? (initial ? 1 : 2) : 0)
        initial = false
      end
      features
    end
  end

  # Computes labelled feature rows for +text+, marking the tokens that
  # belong to any of the +mentions+.
  #
  # mentions - Array of mention Strings (nil or empty means no mentions).
  #
  # Returns an Array of feature Arrays, each ending with its label.
  def tagged_features(text, mentions)
    # Quote each word on its own and join the words with \s+ so that any run
    # of whitespace in the text matches. Quoting the whole mention escapes a
    # space as "\ ", which the former substitution of "\s" never matched.
    patterns = (mentions || []).collect { |mention|
      mention.strip.split(/\s+/).collect { |word| Regexp.quote(word) }.join('\s+')
    }.reject { |pattern| pattern.empty? }
    patterns = ['IMPOSSIBLE_MATCH'] if patterns.empty?
    re = patterns.join("|")

    # Splitting on a captured group alternates between plain text and
    # mention chunks, starting with plain text.
    positive = false
    features = []
    chunks = text.split(/(#{re})/)
    chunks.each do |t|
      chunk_features = text_features(t, positive)
      positive = !positive
      if @reverse
        features = chunk_features + features
      else
        features = features + chunk_features
      end
    end
    features
  end

  # Trains a CRF++ model.
  #
  # features - path of the file with the training feature rows.
  # model    - path where the model is written; the feature configuration
  #            is saved next to it as "<model>.config".
  def train(features, model)
    tmp_template = TmpFile.tmp_file("template-")
    Open.write(tmp_template,template)

    # Argument-list form: no shell is involved, so paths containing quotes
    # or other shell metacharacters are passed through untouched.
    crf_learn = File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')
    system(crf_learn, tmp_template, features, model)
    Open.write(model + '.config',config)
  ensure
    # Remove the temporary template even when training or writing fails.
    FileUtils.rm_f(tmp_template) if tmp_template
  end

end
|
164
|
+
|
165
|
+
# Named entity recognizer backed by a CRF++ model and the feature
# definitions stored next to it ("<model>.config").
class NER

  # model - path of the CRF++ model; defaults to 'ner/model/BC2' under
  #         Rbbt.datadir.
  def initialize(model = nil)
    begin
      require 'CRFPP'
    rescue LoadError
      # Only a failed load should trigger the fallback to the bundled
      # binding; anything else (interrupts, exits) must propagate.
      require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
    end

    model ||= File.join(Rbbt.datadir, 'ner/model/BC2')

    @parser = NERFeatures.new(model + '.config')
    @reverse = @parser.reverse
    @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
  end

  # Finds the mentions in +text+.
  #
  # Tokens are tagged by the CRF++ tagger; label 1 starts a mention, label 2
  # continues it and label 0 closes it.
  #
  # Returns an Array of Strings, each one the tokens of a mention joined by
  # single spaces.
  def extract(text)
    features = @parser.text_features(text)

    @tagger.clear
    features.each do |feats|
      @tagger.add(feats.join(" "))
    end

    @tagger.parse

    found = []
    mention = []

    @tagger.size.times do |i|
      label = @tagger.y(i)
      word = @tagger.x(i,0)

      # A closing parenthesis is only kept when the current mention has an
      # opening one; it never starts or closes a mention by itself.
      if word == ')'
        mention.push(')') if mention.join =~ /\(/
        next
      end

      case label
      when 1
        # A new mention begins: flush the current one first, but only when
        # it (or one of its tokens) looks special.
        if mention.any? && (mention.join(" ").is_special? || mention.any? { |m| m.is_special? })
          found.push(mention)
          mention = []
        end
        mention.push(word)
      when 2
        mention.push(word)
      when 0
        found.push(mention) if mention.any?
        mention = []
      end
    end

    found << mention if mention.any?

    found.collect do |list|
      list = list.reverse if @reverse
      list.join(" ")
    end
  end

end
|
227
|
+
|
228
|
+
|
229
|
+
|