rbbt 1.2.5 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +69 -214
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -245
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -140
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -86
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Ath.Rakefile +0 -55
- data/install_scripts/organisms/Cal.Rakefile +0 -84
- data/install_scripts/organisms/Cel.Rakefile +0 -109
- data/install_scripts/organisms/Hsa.Rakefile +0 -140
- data/install_scripts/organisms/Mmu.Rakefile +0 -77
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/Rno.Rakefile +0 -88
- data/install_scripts/organisms/Sce.Rakefile +0 -66
- data/install_scripts/organisms/Spo.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -252
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -83
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -85
- data/lib/rbbt/sources/gscholar.rb +0 -74
- data/lib/rbbt/sources/organism.rb +0 -241
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -248
- data/lib/rbbt/util/arrayHash.rb +0 -266
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -251
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -35
- data/tasks/install.rake +0 -124
- data/test/rbbt/bow/test_bow.rb +0 -33
- data/test/rbbt/bow/test_classifier.rb +0 -72
- data/test/rbbt/bow/test_dictionary.rb +0 -91
- data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
- data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
- data/test/rbbt/ner/test_abner.rb +0 -17
- data/test/rbbt/ner/test_banner.rb +0 -17
- data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
- data/test/rbbt/ner/test_regexpNER.rb +0 -33
- data/test/rbbt/ner/test_rner.rb +0 -126
- data/test/rbbt/ner/test_rnorm.rb +0 -47
- data/test/rbbt/sources/test_biocreative.rb +0 -38
- data/test/rbbt/sources/test_biomart.rb +0 -31
- data/test/rbbt/sources/test_entrez.rb +0 -49
- data/test/rbbt/sources/test_go.rb +0 -24
- data/test/rbbt/sources/test_organism.rb +0 -59
- data/test/rbbt/sources/test_polysearch.rb +0 -27
- data/test/rbbt/sources/test_pubmed.rb +0 -39
- data/test/rbbt/util/test_arrayHash.rb +0 -257
- data/test/rbbt/util/test_filecache.rb +0 -37
- data/test/rbbt/util/test_index.rb +0 -31
- data/test/rbbt/util/test_misc.rb +0 -20
- data/test/rbbt/util/test_open.rb +0 -110
- data/test/rbbt/util/test_simpleDSL.rb +0 -57
- data/test/rbbt/util/test_tmpfile.rb +0 -21
- data/test/test_helper.rb +0 -4
- data/test/test_rbbt.rb +0 -11
data/lib/rbbt/ner/abner.rb
DELETED
@@ -1,34 +0,0 @@
-require 'rbbt'
-require 'rjb'
-
-# Offers a Ruby interface to the Abner Named Entity Recognition Package
-# in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
-class Abner
-
-  @@JFile   = Rjb::import('java.io.File')
-  @@Tagger  = Rjb::import('abner.Tagger')
-  @@Trainer = Rjb::import('abner.Trainer')
-
-  # If modelfile is present a custom trained model can be used,
-  # otherwise, the default BioCreative model is used.
-  def initialize(modelfile=nil)
-    if modelfile == nil
-      @tagger = @@Tagger.new(@@Tagger.BIOCREATIVE)
-    else
-      @tagger = @@Tagger.new(@@JFile.new(modelfile))
-    end
-  end
-
-  # Given a chunk of text, it finds all the mentions appearing in it. It
-  # returns all the mentions found, regardless of type, to be coherent
-  # with the rest of NER packages in Rbbt.
-  def extract(text)
-
-    res = @tagger.getEntities(text)
-    types = res[1]
-    strings = res[0]
-
-    return strings.collect{|s| s.to_s}
-  end
-
-end
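
For orientation, a minimal usage sketch of the deleted Abner wrapper as it worked in rbbt 1.2.5 (it assumes the rjb gem and the ABNER jars set up by the removed get_abner.sh script; the class no longer exists in 2.0.0):

    require 'rbbt/ner/abner'

    ner = Abner.new                 # nil modelfile => default BioCreative model
    mentions = ner.extract("Some text with gene mentions.")
    # mentions is an array of mention strings; the entity types are discarded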
data/lib/rbbt/ner/banner.rb
DELETED
@@ -1,73 +0,0 @@
-require 'rbbt'
-require 'rjb'
-
-# Offers a Ruby interface to the Banner Named Entity Recognition Package
-# in Java. Banner[http://banner.sourceforge.net/].
-class Banner
-
-
-  @@JFile = Rjb::import('java.io.File')
-  @@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
-  @@CRFTagger = Rjb::import('banner.tagging.CRFTagger')
-  @@ParenthesisPostProcessor = Rjb::import('banner.processing.ParenthesisPostProcessor')
-  @@HeppleTagger = Rjb::import('dragon.nlp.tool.HeppleTagger')
-  @@Sentence = Rjb::import('banner.Sentence')
-  @@EngLemmatiser = Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')
-
-
-
-  # The parameters are set to default values, the only one that one
-  # might want to change is the modelfile to point to a custom trained
-  # one.
-  def initialize(modelfile = File.join(Rbbt.datadir, 'third_party/banner/gene_model.bin'),
-                 lemmadir = File.join(Rbbt.datadir, 'third_party/banner/nlpdata/lemmatiser'),
-                 taggerdir = File.join(Rbbt.datadir, 'third_party/banner/nlpdata/tagger')
-                )
-
-    @tokenizer = @@SimpleTokenizer.new
-
-    model = @@JFile.new(modelfile)
-    lemma = @@EngLemmatiser.new(lemmadir,false,true)
-    helper = @@HeppleTagger.new(taggerdir)
-
-    # The next lines are needed to avoid colisions with
-    # metraprograming that could define load (activesupport in
-    # particular :@ ). RJB seems to call java on method missing
-    class << @@CRFTagger
-      if method_defined? :load
-        undef_method :load
-      end
-    end
-
-    @tagger = @@CRFTagger.load( model, lemma, helper)
-    @parenPP = @@ParenthesisPostProcessor.new()
-  end
-
-
-  # Returns an array with the mention found in the provided piece of
-  # text.
-  def extract(text)
-    text.gsub!(/\n/,' ')
-    text.gsub!(/\|/,'/') # Character | gives an error
-    sentence = @@Sentence.new(text)
-    @tokenizer.tokenize(sentence)
-    @tagger.tag(sentence)
-    @parenPP.postProcess(sentence)
-    tagged = sentence.getSGML
-
-    res = tagged.scan(/<GENE>.*?<\/GENE>/).
-      collect{|r|
-        r.match(/<GENE>(.*?)<\/GENE>/)
-        mention = $1
-        mention.sub!(/^\s*/,'')
-        mention.sub!(/\s*$/,'')
-        mention
-      }
-    res
-  end
-
-
-end
-
-
-
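
Similarly, a sketch of how the removed Banner wrapper was driven in 1.2.5 (assuming the gene model and nlpdata directories installed under Rbbt.datadir by the removed get_banner.sh script):

    require 'rbbt/ner/banner'

    ner = Banner.new                # default model, lemmatiser and tagger paths
    genes = ner.extract("Text mentioning BRCA1 and TP53.")
    # genes is an array of the <GENE> mentions with the SGML tags and
    # surrounding whitespace stripped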
data/lib/rbbt/ner/dictionaryNER.rb
DELETED
@@ -1,98 +0,0 @@
-# This class loads a dictionary of codes with associated names, it then can
-# find those names in a string of text. It works word-wise.
-class DictionaryNER
-
-  A_INT = "a"[0]
-  DOWNCASE_OFFSET = "A"[0].bytes.first - "a"[0].bytes.first
-
-  require 'rbbt/bow/bow'
-  # Divides a string of text into words. A slash separates words, only if the
-  # second one begins with a letter.
-  def self.chunk(text)
-    text.split(/(?:[\s.,]|-(?=[a-zA-Z]))+/)
-  end
-
-  # Simplify the text to widen the matches. Currently only downcases the keys
-  def self.simplify(text)
-    if text.length > 2 && text[0] < A_INT && text[1] > A_INT
-      text = (text[0] - DOWNCASE_OFFSET).chr + text[1..-1]
-    else
-      return text
-    end
-  end
-
-  # Given a dictionary structure, find the matches in the text.
-  def self.match(dict, text) #:nodoc:
-
-    if Array === text
-      words = text
-    else
-      words = chunk(text)
-    end
-
-    result = {}
-    words.each_with_index{|word, pos|
-      key = simplify(word)
-      next if dict[key].nil?
-      dict[key].each{|entrie|
-        case
-        when String === entrie
-          result[word] ||= []
-          result[word] << entrie unless result[word].include? entrie
-        when Hash === entrie
-          rec_words = words[(pos + 1)..-1]
-          rec_result = match(entrie, rec_words)
-          rec_result.each{|rec_key, rec_list|
-            composite_key = word + ' ' + rec_key
-            result[composite_key] ||= []
-            result[composite_key] += rec_list
-            result[composite_key].uniq!
-          }
-        end
-      }
-    }
-    result
-  end
-
-  # Add a name to a structure
-  def self.add_name(dict, name, code)
-    if Array === name
-      words = name
-    else
-      words = chunk(name)
-    end
-
-    key = simplify(words.shift)
-    if words.empty?
-      dict[key] ||= []
-      dict[key] << code unless dict[key].include? code
-    else
-      rec_dict = {}
-      add_name(rec_dict, words , code)
-      dict[key] ||= []
-      dict[key] << rec_dict
-    end
-  end
-
-  def self.load(dictionary)
-    dict = {}
-
-    dictionary = File.open(dictionary).read if File.exists? dictionary
-
-    dictionary.each_line{|l|
-      names = l.chomp.split(/\t/)
-      code = names.shift
-      names.each{|name| add_name(dict, name, code) }
-    }
-    dict
-  end
-
-  def initialize(dictionary)
-    @dict = DictionaryNER.load(dictionary)
-  end
-
-  def match(text)
-    DictionaryNER.match(@dict, text)
-  end
-
-end
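
The load method above reads a tab-separated dictionary, one code per line followed by its names, so a minimal 1.2.5-era sketch (the dictionary content here is only illustrative) would be:

    require 'rbbt/ner/dictionaryNER'

    # A filename or the raw tab-separated text is accepted
    dictionary = "TP53\tp53\ttumor protein p53\nCDK2\tcyclin-dependent kinase 2\n"
    ner = DictionaryNER.new(dictionary)
    matches = ner.match("Samples overexpressing tumor protein p53")
    # matches maps each matched name (possibly multi-word) to its list of codes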
data/lib/rbbt/ner/regexpNER.rb
DELETED
@@ -1,70 +0,0 @@
-require 'rbbt/util/open'
-require 'rbbt/util/misc'
-
-class RegExpNER
-
-  def self.match_re(text, res)
-    res = [res] unless Array === res
-
-    res.collect{|re|
-      text.scan(re)
-    }.flatten
-  end
-
-  def self.build_re_old(names, ignorecase=true)
-    names.compact.select{|n| n != ""}.
-      sort{|a,b| b.length <=> a.length}.
-      collect{|n|
-        re = Regexp.quote(n).gsub(/\\?\s/,'\s+')
-      }
-  end
-
-  def self.build_re(names, ignorecase=true)
-    res = names.compact.select{|n| n != ""}.
-      sort{|a,b| b.length <=> a.length}.
-      collect{|n|
-        Regexp.quote(n)
-      }
-
-    /\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/
-  end
-
-
-  def initialize(lexicon, options = {})
-    options = {:flatten => true, :ignorecase => true, :stopwords => nil}.merge options
-
-    options[:stopwords] = $stopwords if $stopwords && (options[:stopwords].nil? || options[:stopwords] == true)
-    options[:stopwords] ||= []
-
-    data = Open.to_hash(lexicon, options)
-
-    @index = {}
-    data.collect{|code, names|
-      next if code.nil? || code == ""
-      if options[:stopwords].any?
-        names = names.select{|n|
-          ! options[:stopwords].include?(options[:ignorecase] ? n.downcase : n)
-        }
-      end
-      @index[code] = RegExpNER.build_re(names, options[:ignorecase])
-    }
-  end
-
-  def match_hash(text)
-    return {} if text.nil? || text == ""
-    matches = {}
-    @index.each{|code, re|
-      RegExpNER.match_re(text, re).each{|match|
-        matches[code] ||= []
-        matches[code] << match
-      }
-    }
-    matches
-  end
-
-  def match(text)
-    match_hash(text)
-  end
-
-end
-
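
The constructor above builds one regular expression per lexicon code, so a 1.2.5-era sketch would look like this (lexicon.tsv is a hypothetical tab-separated file, code first and synonyms after, which is presumably the layout the deleted Open.to_hash parsed):

    require 'rbbt/ner/regexpNER'

    ner = RegExpNER.new('lexicon.tsv')
    matches = ner.match("Some text to scan.")
    # matches is a hash from lexicon code to the list of strings found for it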
data/lib/rbbt/ner/rner.rb
DELETED
@@ -1,227 +0,0 @@
-require 'rbbt'
-require 'rbbt/util/open'
-require 'rbbt/util/misc'
-require 'rbbt/util/simpleDSL'
-
-class NERFeatures < SimpleDSL
-  def self.tokens(text)
-    text.scan(/
-      \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
-      \w-\w*|
-      \w+-[A-Z](?!\w)|
-      \w+|
-      [.,()\/\[\]{}'"+-]
-    /x)
-  end
-
-  def self.reverse(text)
-    tokens(text).reverse.join(" ")
-  end
-
-  def define(name, *args, &block)
-    action = *args[0] || block || /#{name.to_s}s?/i
-    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
-
-    @types[name.to_s] = action
-    @order.push name.to_s
-
-    name.to_s
-  end
-
-  attr_accessor :reverse
-  def initialize(file = nil, reverse = false, &block)
-    @types = {}
-    @order = []
-    @context = []
-    @reverse = reverse
-
-    file ||= File.join(Rbbt.datadir,'ner/config/default.rb') if !file && !block
-
-    super(:define,file, &block)
-  end
-
-  def config
-    @config[:define]
-  end
-
-  def window(positions)
-    @window = positions
-  end
-
-  def context(name, &block)
-    if name.is_a? Array
-      @context += name
-    else
-      @context.push name
-
-      # The block might be wrongly assigned to this function
-      # instead of the actual definition, fix that.
-      if block
-        @types[name] = block
-      end
-    end
-  end
-
-  def direction(dir)
-    if dir.to_sym == :reverse
-      @reverse = true
-    end
-  end
-
-  def features(word)
-    values = [word]
-
-    @order.each{|features|
-      action = @types[features]
-      if action.is_a?(Proc)
-        values.push(action.call(word))
-      else
-        m = action.match(word)
-        if m
-          if m[1]
-            values.push(m[1])
-          else
-            values.push(m != nil)
-          end
-        else
-          values.push(false)
-        end
-      end
-    }
-    values
-  end
-
-  def template(window=nil)
-    window ||= @window || [1,-1]
-    template = ""
-
-    i = 1
-    @order.each{|feat|
-      template += "U#{ feat }: %x[0,#{ i }]\n"
-
-      if @context.include?(feat)
-        window.each{|p|
-          template += "U#{ feat }##{ p}: %x[#{ p },#{ i }]\n"
-        }
-      end
-      i += 1
-    }
-
-    template += "B\n"
-
-    template
-  end
-
-
-  def text_features(text, positive = nil)
-    text = self.class.reverse(text) if @reverse
-    initial = true
-    self.class.tokens(text).collect{|token|
-      features = features(token)
-      if !positive.nil?
-        features << (positive ? (initial ? 1 : 2) : 0)
-        initial = false
-      end
-      features
-    }
-  end
-
-  def tagged_features(text, mentions)
-    mentions ||= []
-    mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
-    re = mentions.collect{|mention|
-      Regexp.quote(mention.gsub(/\s+/,' ')).sub(/\\s/,'\s+')
-    }.join("|")
-
-    positive = false
-    features = []
-    chunks = text.split(/(#{re})/)
-    chunks.each{|t|
-      chunk_features = text_features(t, positive)
-      positive = !positive
-      if @reverse
-        features = chunk_features + features
-      else
-        features = features + chunk_features
-      end
-    }
-    features
-  end
-
-  def train(features, model)
-    tmp_template = TmpFile.tmp_file("template-")
-    Open.write(tmp_template,template)
-
-    cmd = "#{File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')} '#{tmp_template}' '#{features}' '#{model}'"
-    system cmd
-    Open.write(model + '.config',config)
-    FileUtils.rm tmp_template
-  end
-
-end
-
-class NER
-
-  def initialize(model = nil)
-    begin
-      require 'CRFPP'
-    rescue Exception
-      require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
-    end
-
-    model ||= File.join(Rbbt.datadir, + 'ner/model/BC2')
-
-    @parser = NERFeatures.new(model + '.config')
-    @reverse = @parser.reverse
-    @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
-  end
-
-  def extract(text)
-    features = @parser.text_features(text)
-
-    @tagger.clear
-    features.each{|feats|
-      @tagger.add(feats.join(" "))
-    }
-
-    @tagger.parse
-
-    found = []
-    mention = []
-
-    @tagger.size.times{|i|
-      label = @tagger.y(i)
-      word = @tagger.x(i,0)
-
-      if word == ')'
-        mention.push(')') if mention.join =~ /\(/
-        next
-      end
-
-      case label
-      when 1
-        if mention.any? && ( mention.join(" ").is_special? || mention.select{|m| m.is_special?}.any?)
-          found.push(mention)
-          mention = []
-        end
-        mention.push(word)
-      when 2
-        mention.push(word)
-      when 0
-        found.push(mention) if mention.any?
-        mention = []
-      end
-    }
-
-    found << mention if mention.any?
-
-    found.collect{|list|
-      list = list.reverse if @reverse
-      list.join(" ")
-    }
-  end
-
-end
-
-
-
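
Taken together, NERFeatures generates the CRF++ features and template from a simpleDSL config file and NER applies a trained model, so a 1.2.5-era sketch (it assumes CRF++ with its Ruby bindings and the default BC2 model installed under Rbbt.datadir by the removed install tasks):

    require 'rbbt/ner/rner'

    ner = NER.new                   # loads ner/model/BC2 and its .config feature file
    mentions = ner.extract("A sentence mentioning a gene.")
    # mentions is an array of the mention strings labelled by the CRF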