rbbt 1.2.5 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +69 -214
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -245
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -140
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -86
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Ath.Rakefile +0 -55
- data/install_scripts/organisms/Cal.Rakefile +0 -84
- data/install_scripts/organisms/Cel.Rakefile +0 -109
- data/install_scripts/organisms/Hsa.Rakefile +0 -140
- data/install_scripts/organisms/Mmu.Rakefile +0 -77
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/Rno.Rakefile +0 -88
- data/install_scripts/organisms/Sce.Rakefile +0 -66
- data/install_scripts/organisms/Spo.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -252
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -83
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -85
- data/lib/rbbt/sources/gscholar.rb +0 -74
- data/lib/rbbt/sources/organism.rb +0 -241
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -248
- data/lib/rbbt/util/arrayHash.rb +0 -266
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -251
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -35
- data/tasks/install.rake +0 -124
- data/test/rbbt/bow/test_bow.rb +0 -33
- data/test/rbbt/bow/test_classifier.rb +0 -72
- data/test/rbbt/bow/test_dictionary.rb +0 -91
- data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
- data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
- data/test/rbbt/ner/test_abner.rb +0 -17
- data/test/rbbt/ner/test_banner.rb +0 -17
- data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
- data/test/rbbt/ner/test_regexpNER.rb +0 -33
- data/test/rbbt/ner/test_rner.rb +0 -126
- data/test/rbbt/ner/test_rnorm.rb +0 -47
- data/test/rbbt/sources/test_biocreative.rb +0 -38
- data/test/rbbt/sources/test_biomart.rb +0 -31
- data/test/rbbt/sources/test_entrez.rb +0 -49
- data/test/rbbt/sources/test_go.rb +0 -24
- data/test/rbbt/sources/test_organism.rb +0 -59
- data/test/rbbt/sources/test_polysearch.rb +0 -27
- data/test/rbbt/sources/test_pubmed.rb +0 -39
- data/test/rbbt/util/test_arrayHash.rb +0 -257
- data/test/rbbt/util/test_filecache.rb +0 -37
- data/test/rbbt/util/test_index.rb +0 -31
- data/test/rbbt/util/test_misc.rb +0 -20
- data/test/rbbt/util/test_open.rb +0 -110
- data/test/rbbt/util/test_simpleDSL.rb +0 -57
- data/test/rbbt/util/test_tmpfile.rb +0 -21
- data/test/test_helper.rb +0 -4
- data/test/test_rbbt.rb +0 -11
data/lib/rbbt/ner/abner.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rjb'
|
3
|
-
|
4
|
-
# Offers a Ruby interface to the Abner Named Entity Recognition Package
# in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/]. The Java classes
# are reached through Rjb.
class Abner

  # The Java classes are imported once, when this class is loaded.
  @@JFile   = Rjb::import('java.io.File')
  @@Tagger  = Rjb::import('abner.Tagger')
  @@Trainer = Rjb::import('abner.Trainer')

  # Builds the underlying tagger. If +modelfile+ is present it is wrapped in
  # a java.io.File and handed to the tagger so a custom trained model can be
  # used; otherwise the default BioCreative model is used.
  def initialize(modelfile = nil)
    @tagger =
      if modelfile.nil?
        @@Tagger.new(@@Tagger.BIOCREATIVE)
      else
        @@Tagger.new(@@JFile.new(modelfile))
      end
  end

  # Given a chunk of text, it finds all the mentions appearing in it and
  # returns them as an Array of Strings. All mentions are returned,
  # regardless of type, to be coherent with the rest of NER packages in Rbbt.
  def extract(text)
    # getEntities answers a pair: the mention strings at index 0 and their
    # types at index 1. The types are deliberately not used.
    strings = @tagger.getEntities(text)[0]
    strings.collect { |s| s.to_s }
  end

end
|
data/lib/rbbt/ner/banner.rb
DELETED
@@ -1,73 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rjb'
|
3
|
-
|
4
|
-
# Offers a Ruby interface to the Banner Named Entity Recognition Package
# in Java. Banner[http://banner.sourceforge.net/]. The Java classes are
# reached through Rjb.
class Banner

  # The Java classes are imported once, when this class is loaded.
  @@JFile                    = Rjb::import('java.io.File')
  @@SimpleTokenizer          = Rjb::import('banner.tokenization.SimpleTokenizer')
  @@CRFTagger                = Rjb::import('banner.tagging.CRFTagger')
  @@ParenthesisPostProcessor = Rjb::import('banner.processing.ParenthesisPostProcessor')
  @@HeppleTagger             = Rjb::import('dragon.nlp.tool.HeppleTagger')
  @@Sentence                 = Rjb::import('banner.Sentence')
  @@EngLemmatiser            = Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')

  # The parameters are set to default values (all located under
  # Rbbt.datadir); the only one that one might want to change is the
  # modelfile, to point to a custom trained one.
  #
  # modelfile:: path of the trained model
  # lemmadir::  directory handed to the lemmatiser
  # taggerdir:: directory handed to the Hepple tagger
  def initialize(modelfile = File.join(Rbbt.datadir, 'third_party/banner/gene_model.bin'),
                 lemmadir  = File.join(Rbbt.datadir, 'third_party/banner/nlpdata/lemmatiser'),
                 taggerdir = File.join(Rbbt.datadir, 'third_party/banner/nlpdata/tagger'))

    @tokenizer = @@SimpleTokenizer.new

    model  = @@JFile.new(modelfile)
    lemma  = @@EngLemmatiser.new(lemmadir, false, true)
    helper = @@HeppleTagger.new(taggerdir)

    # The next lines are needed to avoid collisions with metaprogramming
    # that could define load (activesupport in particular). RJB seems to
    # call java on method missing, so a Ruby-side +load+ must be removed for
    # the call below to reach the Java method.
    class << @@CRFTagger
      if method_defined? :load
        undef_method :load
      end
    end

    @tagger  = @@CRFTagger.load(model, lemma, helper)
    @parenPP = @@ParenthesisPostProcessor.new()
  end

  # Returns an array with the mentions found in the provided piece of text.
  # Each mention is returned without surrounding whitespace. The +text+
  # argument itself is left untouched.
  def extract(text)
    # Work on a copy: newlines are flattened to spaces and the character |
    # gives an error, so it is replaced by a slash.
    clean = text.gsub(/\n/, ' ').gsub(/\|/, '/')

    sentence = @@Sentence.new(clean)
    @tokenizer.tokenize(sentence)
    @tagger.tag(sentence)
    @parenPP.postProcess(sentence)
    tagged = sentence.getSGML

    # Mentions come back wrapped in <GENE>...</GENE> tags.
    tagged.scan(/<GENE>(.*?)<\/GENE>/).collect { |(mention)| mention.strip }
  end

end
|
71
|
-
|
72
|
-
|
73
|
-
|
@@ -1,98 +0,0 @@
|
|
1
|
-
# This class loads a dictionary of codes with associated names, it then can
# find those names in a string of text. It works word-wise.
#
# The dictionary structure is a Hash keyed by simplified word. Each value is
# an Array whose entries are either a code (String), meaning that a name ends
# at this word, or a nested Hash of the same shape describing the words that
# continue a multi-word name.
class DictionaryNER

  # Retained for backward compatibility; +simplify+ no longer relies on them.
  A_INT = "a"[0]
  DOWNCASE_OFFSET = "A"[0].bytes.first - "a"[0].bytes.first

  require 'rbbt/bow/bow'

  # Divides a string of text into words. Whitespace, periods and commas
  # separate words; a dash separates words only if the second one begins
  # with a letter.
  def self.chunk(text)
    text.split(/(?:[\s.,]|-(?=[a-zA-Z]))+/)
  end

  # Simplify the text to widen the matches. A word longer than two characters
  # that starts with an ASCII uppercase letter followed by an ASCII lowercase
  # letter gets its first letter downcased ("Gene" becomes "gene"). Any other
  # text (acronyms, numbers, short words) is returned unchanged.
  def self.simplify(text)
    capitalized = text.length > 2 && text[0, 1] =~ /[A-Z]/ && text[1, 1] =~ /[a-z]/
    return text unless capitalized

    text[0, 1].downcase + text[1..-1]
  end

  # Given a dictionary structure, find the matches in the text. +text+ may
  # be a String, which is split with +chunk+, or an Array of words. Returns a
  # Hash from the matched text (words joined by a single space) to the list
  # of codes found for it.
  #
  # NOTE(review): for multi-word names the continuation is looked up among
  # all the remaining words, not only the one immediately following; confirm
  # that this is intended.
  def self.match(dict, text) #:nodoc:
    words = Array === text ? text : chunk(text)

    result = {}
    words.each_with_index do |word, pos|
      entries = dict[simplify(word)]
      next if entries.nil?

      entries.each do |entry|
        case entry
        when String
          codes = (result[word] ||= [])
          codes << entry unless codes.include? entry
        when Hash
          # Nested dictionary: try to continue the name with later words.
          match(entry, words[(pos + 1)..-1]).each do |rec_key, rec_list|
            composite_key = word + ' ' + rec_key
            result[composite_key] = ((result[composite_key] || []) + rec_list).uniq
          end
        end
      end
    end
    result
  end

  # Add a name to a structure. +name+ may be a String, which is split with
  # +chunk+, or an Array of words, which is not modified. Names that contain
  # no words are ignored.
  def self.add_name(dict, name, code)
    # Copy array input so that shifting words does not alter the caller's list.
    words = Array === name ? name.dup : chunk(name)
    return if words.empty?

    key = simplify(words.shift)
    dict[key] ||= []
    if words.empty?
      dict[key] << code unless dict[key].include? code
    else
      rec_dict = {}
      add_name(rec_dict, words, code)
      dict[key] << rec_dict
    end
  end

  # Builds the dictionary structure. +dictionary+ is either the path of an
  # existing file or the dictionary content itself; each line holds a code
  # followed by its names, separated by tabs.
  def self.load(dictionary)
    dict = {}

    dictionary = File.read(dictionary) if File.exist? dictionary

    dictionary.each_line do |l|
      names = l.chomp.split(/\t/)
      code = names.shift
      names.each { |name| add_name(dict, name, code) }
    end
    dict
  end

  # Loads the dictionary (a file path or the content itself, see +load+).
  def initialize(dictionary)
    @dict = DictionaryNER.load(dictionary)
  end

  # Finds the dictionary names in +text+, see DictionaryNER.match.
  def match(text)
    DictionaryNER.match(@dict, text)
  end

end
|
data/lib/rbbt/ner/regexpNER.rb
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
require 'rbbt/util/open'
|
2
|
-
require 'rbbt/util/misc'
|
3
|
-
|
4
|
-
# Named entity recognition based on regular expressions. A lexicon that
# associates codes with lists of names is turned into one regular expression
# per code, which is then used to scan texts.
class RegExpNER

  # Pattern that can never match; used for codes left without usable names.
  NO_MATCH = /(?!)/

  # Scans +text+ with +res+, a single regular expression or an Array of
  # them, and returns all the matched strings in one flat Array.
  def self.match_re(text, res)
    res = [res] unless Array === res

    res.collect { |re| text.scan(re) }.flatten
  end

  # Returns an Array with one pattern source (String) per usable name,
  # longest names first. Whitespace in a name matches any run of whitespace.
  # The +ignorecase+ argument is not used.
  def self.build_re_old(names, ignorecase=true)
    usable_names(names).collect { |n| Regexp.quote(n).gsub(/\\?\s/,'\s+') }
  end

  # Builds a single regular expression matching any of +names+ as whole
  # words, trying longer names first. Whitespace in a name matches any run
  # of whitespace. When there is no usable name the result never matches.
  #
  # NOTE(review): +ignorecase+ is accepted but it is not applied to the
  # generated pattern, so matching is case sensitive.
  def self.build_re(names, ignorecase=true)
    quoted = usable_names(names).collect { |n| Regexp.quote(n) }

    # An empty alternation would match the empty string at every word
    # boundary, reporting bogus empty mentions.
    return NO_MATCH if quoted.empty?

    /\b(#{ quoted.join("|").gsub(/\\?\s/,'\s+') })\b/
  end

  # Discards nil and empty names and sorts the rest from longest to shortest.
  def self.usable_names(names)
    names.compact.select { |n| n != "" }.sort { |a, b| b.length <=> a.length }
  end
  private_class_method :usable_names

  # Loads the +lexicon+ with Open.to_hash, passing it the +options+, and
  # builds the index of regular expressions, one per code.
  #
  # Options (defaults in parenthesis):
  # :flatten (true)::    passed on to Open.to_hash
  # :ignorecase (true):: names are downcased before being compared with the
  #                      stopwords
  # :stopwords (nil)::   list of names to discard. With nil or true the
  #                      global $stopwords list is used when it is set;
  #                      otherwise no names are discarded.
  def initialize(lexicon, options = {})
    options = {:flatten => true, :ignorecase => true, :stopwords => nil}.merge options

    stopwords = options[:stopwords]
    stopwords = $stopwords if $stopwords && (stopwords.nil? || stopwords == true)
    # No list available (nil, false, or true without $stopwords): filter nothing.
    stopwords = [] if !stopwords || stopwords == true
    options[:stopwords] = stopwords

    data = Open.to_hash(lexicon, options)

    @index = {}
    data.each do |code, names|
      next if code.nil? || code == ""

      if stopwords.any?
        names = names.select { |n|
          ! stopwords.include?(options[:ignorecase] ? n.downcase : n)
        }
      end
      @index[code] = RegExpNER.build_re(names, options[:ignorecase])
    end
  end

  # Returns a Hash from each code found in +text+ to the Array of strings
  # that matched it. Codes without matches are not included.
  def match_hash(text)
    return {} if text.nil? || text == ""

    matches = {}
    @index.each do |code, re|
      found = RegExpNER.match_re(text, re)
      matches[code] = found unless found.empty?
    end
    matches
  end

  # Same as match_hash.
  def match(text)
    match_hash(text)
  end

end
|
70
|
-
|
data/lib/rbbt/ner/rner.rb
DELETED
@@ -1,227 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/util/open'
|
3
|
-
require 'rbbt/util/misc'
|
4
|
-
require 'rbbt/util/simpleDSL'
|
5
|
-
|
6
|
-
# Describes, through a small DSL built on SimpleDSL, the features computed
# for each token of a text. It produces the feature rows and the template
# handed to the crf_learn program of CRF++.
class NERFeatures < SimpleDSL

  # Splits +text+ into tokens: words (possibly including numbers and dashes)
  # and a set of punctuation characters. Other characters are dropped.
  def self.tokens(text)
    text.scan(/
              \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
              \w-\w*|
              \w+-[A-Z](?!\w)|
              \w+|
              [.,()\/\[\]{}'"+-]
              /x)
  end

  # Returns the tokens of +text+ in reverse order, joined by spaces.
  def self.reverse(text)
    tokens(text).reverse.join(" ")
  end

  # Defines the feature +name+. Its action is the first extra argument, or
  # else the block, or else a case insensitive regular expression built from
  # the name with an optional trailing "s". Returns the name as a String.
  #
  # Raises RuntimeError ("Wrong format") unless the action is a Proc or a
  # Regexp.
  def define(name, *args, &block)
    # No splat here: on Ruby 1.9 and later `x = *value` wraps the value in an
    # Array, which would make every definition fail the check below.
    action = args[0] || block || /#{name.to_s}s?/i
    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))

    @types[name.to_s] = action
    @order.push name.to_s

    name.to_s
  end

  attr_accessor :reverse

  # Loads the feature definitions from +file+ or from the block. When
  # neither is given the default configuration under Rbbt.datadir is used.
  # +reverse+ makes the tokens be processed in reverse order.
  def initialize(file = nil, reverse = false, &block)
    @types   = {}
    @order   = []
    @context = []
    @reverse = reverse

    file ||= File.join(Rbbt.datadir,'ner/config/default.rb') unless file || block

    super(:define,file, &block)
  end

  # The configuration recorded for the :define method.
  def config
    @config[:define]
  end

  # Sets the relative positions used for the context features.
  def window(positions)
    @window = positions
  end

  # Marks the feature +name+, or each feature in an Array of names, as one
  # whose value in the surrounding positions is also included in the
  # template.
  def context(name, &block)
    if name.is_a? Array
      @context += name
    else
      @context.push name

      # The block might be wrongly assigned to this function
      # instead of the actual definition, fix that.
      @types[name] = block if block
    end
  end

  # Enables reverse processing when +dir+ is :reverse.
  def direction(dir)
    @reverse = true if dir.to_sym == :reverse
  end

  # Returns the feature row for +word+: the word itself followed by the
  # value of each feature, in definition order. A Proc feature contributes
  # the result of calling it with the word. A Regexp feature contributes its
  # first capture if there is one, true if it matches without one, and false
  # if it does not match.
  def features(word)
    values = [word]

    @order.each do |name|
      action = @types[name]
      if action.is_a?(Proc)
        values.push(action.call(word))
      else
        m = action.match(word)
        values.push(m ? (m[1] || true) : false)
      end
    end
    values
  end

  # Builds the template text: one unigram line per feature, plus one line
  # per +window+ position for the features marked with +context+, and a
  # final "B" line. +window+ defaults to the positions set with the window
  # method, or [1,-1].
  def template(window=nil)
    window ||= @window || [1,-1]
    lines = ""

    @order.each_with_index do |feat, index|
      column = index + 1 # column 0 of each row holds the token itself
      lines << "U#{ feat }: %x[0,#{ column }]\n"

      next unless @context.include?(feat)
      window.each do |p|
        lines << "U#{ feat }##{ p}: %x[#{ p },#{ column }]\n"
      end
    end

    lines << "B\n"
    lines
  end

  # Returns the feature rows of the tokens of +text+. If +positive+ is not
  # nil a label is appended to each row: 0 when +positive+ is false, and
  # otherwise 1 for the first token and 2 for the rest.
  def text_features(text, positive = nil)
    text = self.class.reverse(text) if @reverse
    initial = true
    self.class.tokens(text).collect do |token|
      row = features(token)
      unless positive.nil?
        row << (positive ? (initial ? 1 : 2) : 0)
        initial = false
      end
      row
    end
  end

  # Returns the labelled feature rows of +text+, where the tokens that are
  # part of any of the +mentions+ are labelled as positive. Whitespace in a
  # mention matches any run of whitespace in the text.
  def tagged_features(text, mentions)
    mentions ||= []
    mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
    re = mentions.collect { |mention|
      # Regexp.quote escapes a space as backslash-space, so the optional
      # backslash has to be consumed as well.
      Regexp.quote(mention.gsub(/\s+/,' ')).gsub(/\\?\s/,'\s+')
    }.join("|")

    # Splitting on a captured pattern alternates unmatched and matched text,
    # beginning with unmatched text.
    positive = false
    rows = []
    text.split(/(#{re})/).each do |chunk|
      chunk_rows = text_features(chunk, positive)
      positive = !positive
      rows = @reverse ? chunk_rows + rows : rows + chunk_rows
    end
    rows
  end

  # Runs crf_learn over the +features+ file to produce +model+, and saves
  # the configuration next to it in a file with the ".config" suffix. The
  # temporary template file is removed even if a step fails.
  def train(features, model)
    tmp_template = TmpFile.tmp_file("template-")
    begin
      Open.write(tmp_template,template)

      # Arguments are passed separately so that no shell is involved and
      # paths with spaces or quotes are handled properly.
      crf_learn = File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')
      system(crf_learn, tmp_template, features.to_s, model.to_s)
      Open.write(model + '.config',config)
    ensure
      FileUtils.rm tmp_template if File.exist?(tmp_template)
    end
  end

end
|
162
|
-
|
163
|
-
# Named entity recognition using a CRF++ model. The features are computed by
# a NERFeatures object configured from the ".config" file that accompanies
# the model.
class NER

  # Loads the CRFPP bindings, the feature definitions and the tagger for
  # +model+, the path of the model file. It defaults to the BC2 model under
  # Rbbt.datadir.
  def initialize(model = nil)
    # The bindings are loaded lazily; if they are not installed the copy
    # under Rbbt.datadir is tried.
    begin
      require 'CRFPP'
    rescue LoadError
      require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
    end

    model ||= File.join(Rbbt.datadir, 'ner/model/BC2')

    @parser  = NERFeatures.new(model + '.config')
    @reverse = @parser.reverse
    @tagger  = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
  end

  # Returns an Array with the mentions found in +text+, each one a String
  # with its tokens joined by single spaces.
  def extract(text)
    features = @parser.text_features(text)

    @tagger.clear
    features.each { |feats| @tagger.add(feats.join(" ")) }
    @tagger.parse

    found   = []
    mention = []

    @tagger.size.times do |i|
      label = @tagger.y(i)
      word  = @tagger.x(i,0)

      # A closing parenthesis is only kept when the current mention has an
      # opening one; it never starts or ends a mention by itself.
      if word == ')'
        mention.push(')') if mention.join.include?('(')
        next
      end

      case label
      when 1
        # A new mention begins. The current one is closed first only when
        # it, or any of its tokens, is special; otherwise it is extended.
        if mention.any? && (mention.join(" ").is_special? || mention.any? { |m| m.is_special? })
          found.push(mention)
          mention = []
        end
        mention.push(word)
      when 2
        mention.push(word)
      when 0
        found.push(mention) if mention.any?
        mention = []
      end
    end

    found << mention if mention.any?

    found.collect do |list|
      list = list.reverse if @reverse
      list.join(" ")
    end
  end

end
|
225
|
-
|
226
|
-
|
227
|
-
|