rbbt 1.0.0
- data/LICENSE +20 -0
- data/README.rdoc +17 -0
- data/bin/rbbt_config +180 -0
- data/install_scripts/classifier/R/classify.R +36 -0
- data/install_scripts/classifier/Rakefile +140 -0
- data/install_scripts/get_abner.sh +2 -0
- data/install_scripts/get_banner.sh +25 -0
- data/install_scripts/get_biocreative.sh +72 -0
- data/install_scripts/get_crf++.sh +26 -0
- data/install_scripts/get_entrez.sh +4 -0
- data/install_scripts/get_go.sh +4 -0
- data/install_scripts/get_polysearch.sh +8 -0
- data/install_scripts/ner/Rakefile +206 -0
- data/install_scripts/ner/config/default.rb +52 -0
- data/install_scripts/norm/Rakefile +218 -0
- data/install_scripts/norm/config/cue_default.rb +10 -0
- data/install_scripts/norm/config/tokens_default.rb +79 -0
- data/install_scripts/norm/functions.sh +21 -0
- data/install_scripts/organisms/Rakefile +25 -0
- data/install_scripts/organisms/cgd.Rakefile +84 -0
- data/install_scripts/organisms/human.Rakefile +145 -0
- data/install_scripts/organisms/mgi.Rakefile +77 -0
- data/install_scripts/organisms/pombe.Rakefile +40 -0
- data/install_scripts/organisms/rake-include.rb +258 -0
- data/install_scripts/organisms/rgd.Rakefile +88 -0
- data/install_scripts/organisms/sgd.Rakefile +66 -0
- data/install_scripts/organisms/tair.Rakefile +54 -0
- data/install_scripts/organisms/worm.Rakefile +109 -0
- data/install_scripts/stopwords +1 -0
- data/install_scripts/wordlists/consonants +897 -0
- data/install_scripts/wordlists/stopwords +1 -0
- data/lib/rbbt/bow/bow.rb +87 -0
- data/lib/rbbt/bow/classifier.rb +118 -0
- data/lib/rbbt/bow/dictionary.rb +218 -0
- data/lib/rbbt/ner/abner.rb +34 -0
- data/lib/rbbt/ner/banner.rb +73 -0
- data/lib/rbbt/ner/regexpNER.rb +62 -0
- data/lib/rbbt/ner/rner.rb +227 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
- data/lib/rbbt/ner/rnorm.rb +142 -0
- data/lib/rbbt/sources/biocreative.rb +75 -0
- data/lib/rbbt/sources/biomart.rb +106 -0
- data/lib/rbbt/sources/entrez.rb +211 -0
- data/lib/rbbt/sources/go.rb +40 -0
- data/lib/rbbt/sources/organism.rb +197 -0
- data/lib/rbbt/sources/polysearch.rb +88 -0
- data/lib/rbbt/sources/pubmed.rb +111 -0
- data/lib/rbbt/util/arrayHash.rb +255 -0
- data/lib/rbbt/util/filecache.rb +72 -0
- data/lib/rbbt/util/index.rb +69 -0
- data/lib/rbbt/util/misc.rb +101 -0
- data/lib/rbbt/util/open.rb +207 -0
- data/lib/rbbt/util/simpleDSL.rb +87 -0
- data/lib/rbbt/util/tmpfile.rb +19 -0
- data/lib/rbbt/version.rb +10 -0
- data/lib/rbbt.rb +86 -0
- data/tasks/install.rake +123 -0
- metadata +114 -0
@@ -0,0 +1 @@
a been get least our them whether about before getting left ourselves then which after being go less out there while again between goes let over these who ago but going like per they whoever all by gone make put this whom almost came got many putting those whose also can gotten may same through why always cannot had maybe saw till will am come has me see to with an could have mine seen too within and did having more shall two without another do he most she unless won't any does her much should until would anybody doing here my so up wouldn't anyhow done him myself some upon yet anyone down his never somebody us you anything each how no someone very your anyway else i none something was are even if not stand we as ever in now such went at every into of sure were away everyone is off take what back everything isn't on than whatever be for it one that what's became from just onto the when because front last or their where
data/lib/rbbt/bow/bow.rb
ADDED
@@ -0,0 +1,87 @@
require 'rbbt'
require 'stemmer'
require 'rbbt/util/misc'

# This module provides methods to extract a bag-of-words (or bag-of-bigrams)
# representation for strings of text, and to produce vector representations
# of that bag of words for a given list of terms. The BOW representation of
# the texts is usually used first to build a Dictionary, and then, with the
# best selection of terms as determined by the Dictionary::TF_IDF.best or
# Dictionary::KL.best methods, to determine the vector representation for
# that text.
module BagOfWords

  # Divide the input string into an array of words (sequences of \w
  # characters). Words are stemmed and filtered to remove stopwords and words
  # with fewer than three characters. The list of stopwords is a global
  # variable defined in 'rbbt/util/misc'.
  def self.words(text)
    return [] if text.nil?
    text.scan(/\w+/).
      collect{|word| word.downcase.stem}.
      select{|word|
        ! $stopwords.include?(word) &&
        word.length > 2 &&
        word =~ /[a-z]/
      }
  end

  # Take the array of words for the text and form all the bigrams
  def self.bigrams(text)
    words = words(text)
    bigrams = []
    lastword = nil

    words.each{|word|
      if lastword
        bigrams << "#{lastword} #{word}"
      end
      lastword = word
    }

    words + bigrams
  end

  # Given an array of terms, return a hash with the number of appearances of
  # each term
  def self.count(terms)
    count = Hash.new(0)
    terms.each{|word| count[word] += 1}
    count
  end

  # Given a string of text, find all the words (or bigrams) and return a
  # hash with their counts
  def self.terms(text, bigrams = true)
    if bigrams
      count(bigrams(text))
    else
      count(words(text))
    end
  end

  # Given a string of text and a list of terms, which may or may not contain
  # bigrams, return an array with one entry per term holding the number of
  # occurrences of that term in the text.
  def self.features(text, terms, bigrams = nil)
    bigrams ||= terms.select{|term| term =~ / /}.any?
    count = bigrams ? count(bigrams(text)) : count(words(text))
    count.values_at(*terms)
  end
end

class String
  # Shortcut for BagOfWords.words(self)
  def words
    BagOfWords.words(self)
  end

  # Shortcut for BagOfWords.bigrams(self)
  def bigrams
    BagOfWords.bigrams(self)
  end
end
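To make the API above concrete, here is a short usage sketch (not part of the gem's files; the sample string is made up, and $stopwords is populated by requiring 'rbbt/util/misc'):

require 'rbbt/bow/bow'

text = "Protein levels are regulated by protein degradation"
text.words                     # stemmed, stopword-filtered word list
text.bigrams                   # the words plus adjacent-word bigrams
BagOfWords.terms(text, false)  # => hash of word counts, e.g. "protein" => 2
# Term lists passed to features should contain stemmed forms:
BagOfWords.features(text, ['protein', 'degrad'])  # counts for a fixed term list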
data/lib/rbbt/bow/classifier.rb
ADDED
@@ -0,0 +1,118 @@
require 'rbbt/bow/bow'
require 'rsruby'

# This class uses R to build and use classification models. It needs the
# 'e1071' R package.
class Classifier

  # Given the path to a features file, which specifies a number of instances
  # along with their classes and features in a tab-separated format, use R
  # to build an SVM model, which is saved to the file given as modelfile.
  def self.create_model(featuresfile, modelfile, dictfile = nil)
    r = RSRuby.instance
    r.source(File.join(Rbbt.datadir, 'classifier/R/classify.R'))
    r.BOW_classification_model(featuresfile, modelfile)

    nil
  end

  attr_reader :terms

  # Loads an R interpreter which loads the SVM model under modelfile.
  def initialize(modelfile)
    @r = RSRuby.instance
    @r.library('e1071')
    @r.source(File.join(Rbbt.datadir, 'classifier/R/classify.R'))

    @r.load(modelfile)

    @model = @r.svm_model
    @terms = @r.eval_R("terms = unlist(attr(attr(svm.model$terms,'factors'),'dimnames')[2])")
  end

  def classify_feature_array(input) #:nodoc:
    @r.assign('input', input)

    @r.eval_R('input = t(as.data.frame(input))')
    @r.eval_R('rownames(input) <- NULL')
    @r.eval_R('colnames(input) <- terms')

    results = @r.eval_R('BOW.classification.classify(svm.model, input, svm.weights)')
    results.sort.collect{|p| p[1]}
  end

  def classify_feature_hash(input) #:nodoc:
    names = []
    features = []
    input.each{|name, feats|
      names << name.to_s
      features << feats
    }

    @r.assign('input', features)
    @r.assign('input.names', names)

    @r.eval_R('input = t(as.data.frame(input))')
    @r.eval_R('rownames(input) <- input.names')
    @r.eval_R('colnames(input) <- terms')

    @r.eval_R('BOW.classification.classify(svm.model, input, svm.weights)')
  end

  def classify_text_array(input) #:nodoc:
    features = input.collect{|text|
      BagOfWords.features(text, @terms)
    }

    classify_feature_array(features)
  end

  def classify_text_hash(input) #:nodoc:
    features = {}
    input.each{|key, text|
      features[key] = BagOfWords.features(text, @terms)
    }

    classify_feature_hash(features)
  end

  # This is a polymorphic method. The input variable may be a single input,
  # in which case the result will be just the class; a hash of inputs, in
  # which case the result will be a hash with the results for each input; or
  # an array, in which case the result is an array of the results in the
  # same order. Each input may also be in the form of a string, in which
  # case it will be transformed into a feature vector, or an array, in which
  # case it will be considered a feature vector itself.
  def classify(input)
    if input.is_a? String
      return classify_text_array([input]).first
    end

    if input.is_a? Hash
      return {} if input.empty?
      if input.values.first.is_a? String
        return classify_text_hash(input)
      elsif input.values.first.is_a? Array
        return classify_feature_hash(input)
      end
    end

    if input.is_a? Array
      return [] if input.empty?
      if input.first.is_a? String
        return classify_text_array(input)
      elsif input.first.is_a? Array
        return classify_feature_array(input)
      end
    end
  end
end
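A minimal sketch of how the polymorphic classify method might be driven (the file paths are hypothetical, and create_model requires a working R installation with the e1071 package):

require 'rbbt/bow/classifier'

# 'features.tsv' and 'svm.model' are hypothetical paths
Classifier.create_model('features.tsv', 'svm.model')

classifier = Classifier.new('svm.model')
classifier.classify("a single chunk of text")              # => one class
classifier.classify(:a => "text one", :b => "text two")    # => hash of classes, one per key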
data/lib/rbbt/bow/dictionary.rb
ADDED
@@ -0,0 +1,218 @@
class Dictionary
  attr_reader :terms
  def initialize
    @terms = Hash.new(0)
  end

  def add(terms, &block)
    terms.each{|term, count|
      @terms[term] += count
    }
  end
end

class Dictionary::TF_IDF
  attr_reader :terms, :docs, :total_terms, :num_docs

  def initialize(options = {})
    @term_limit = {
      :limit => 500_000,
    }.merge(options)[:limit]

    @terms = Hash.new(0)
    @docs = Hash.new(0)
    @num_docs = 0
    @total_terms = 0
  end

  def add(terms)
    if @term_limit && @terms.length > @term_limit
      terms = terms.delete_if{|term, count| !@terms.include? term }
    end

    terms.each{|term, count|
      @terms[term] += count
      @total_terms += count
      @docs[term] += 1
    }
    @num_docs += 1
  end

  def df
    df = Hash.new(0)
    @docs.each{|term, count|
      df[term] = count.to_f / @num_docs
    }
    df
  end

  def tf
    tf = Hash.new(0)
    @terms.each{|term, count|
      tf[term] = count.to_f / @total_terms
    }
    tf
  end

  def idf
    idf = Hash.new(0)
    num_docs = @num_docs.to_f
    @docs.each{|term, count|
      idf[term] = Math::log(num_docs / count)
    }
    idf
  end

  def tf_idf
    tf_idf = Hash.new(0)
    num_docs = @num_docs.to_f
    @docs.each{|term, count|
      tf_idf[term] = @terms[term].to_f / @total_terms * Math::log(num_docs / count)
    }
    tf_idf
  end

  def best(options = {})
    hi, low, limit = {
      :low => 0,
      :hi  => 1,
    }.merge(options).
      values_at(:hi, :low, :limit)

    num_docs = @num_docs.to_f
    best = df.select{|term, value|
      value >= low && value <= hi
    }.collect{|p|
      term     = p.first
      df_value = p.last
      [term,
        @terms[term].to_f / num_docs * Math::log(1.0 / df_value)
      ]
    }
    if limit
      Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
    else
      Hash[*best.flatten]
    end
  end

  def weights(options = {})
    best_terms = best(options).keys
    weights = {}

    num_docs = @num_docs.to_f
    best_terms.each{|term|
      weights[term] = Math::log(num_docs / @docs[term])
    }
    weights
  end
end

class Dictionary::KL
  attr_reader :pos_dict, :neg_dict

  def initialize(options = {})
    @pos_dict = Dictionary::TF_IDF.new(options)
    @neg_dict = Dictionary::TF_IDF.new(options)
  end

  def terms
    (pos_dict.terms.keys + neg_dict.terms.keys).uniq
  end

  def add(terms, c)
    dict = (c == :+ || c == '+' ? @pos_dict : @neg_dict)
    dict.add(terms)
  end

  def kl
    kl = {}
    pos_df = @pos_dict.df
    neg_df = @neg_dict.df

    terms.each{|term|
      pos = pos_df[term]
      neg = neg_df[term]

      pos = 0.000001 if pos == 0
      pos = 0.999999 if pos == 1
      neg = 0.000001 if neg == 0
      neg = 0.999999 if neg == 1

      kl[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
    }
    kl
  end

  def best(options = {})
    hi, low, limit = {
      :low => 0,
      :hi  => 1,
    }.merge(options).
      values_at(:hi, :low, :limit)

    pos_df = @pos_dict.df
    neg_df = @neg_dict.df

    best = {}
    terms.select{|term|
      pos_df[term] >= low && pos_df[term] <= hi ||
      neg_df[term] >= low && neg_df[term] <= hi
    }.each{|term|
      pos = pos_df[term]
      neg = neg_df[term]

      pos = 0.000001 if pos == 0
      pos = 0.999999 if pos == 1
      neg = 0.000001 if neg == 0
      neg = 0.999999 if neg == 1

      best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
    }
    if limit
      Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
    else
      Hash[*best.flatten]
    end
  end

  def weights(options = {})
    best(options)
  end
end

if __FILE__ == $0

  require 'benchmark'
  require 'rbbt/sources/pubmed'
  require 'rbbt/bow/bow'
  require 'progress-meter'

  max = 10000

  pmids = PubMed.query("Homo Sapiens", max)
  Progress.monitor "Get pmids"
  docs = PubMed.get_article(pmids).values.collect{|article| BagOfWords.terms(article.text)}

  dict = Dictionary::TF_IDF.new

  puts "Starting Benchmark"
  puts Benchmark.measure{
    docs.each{|doc|
      dict.add doc
    }
  }
  puts Benchmark.measure{
    dict.weights
  }

  puts dict.terms.length
end
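A sketch of how these dictionaries are meant to be driven (the corpus strings and thresholds below are made up for illustration):

require 'rbbt/bow/bow'
require 'rbbt/bow/dictionary'

dict = Dictionary::TF_IDF.new
["first document text", "second document text"].each{|doc|  # made-up corpus
  dict.add BagOfWords.terms(doc)
}
terms   = dict.best(:low => 0.01, :hi => 0.8, :limit => 100)  # term => score
weights = dict.weights(:limit => 100)                         # term => idf weight

For the two-class Dictionary::KL variant, add takes the class as a second argument: kl_dict.add(terms, :+) for positive documents, and anything else for negative ones.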
data/lib/rbbt/ner/abner.rb
ADDED
@@ -0,0 +1,34 @@
require 'rbbt'
require 'rjb'

# Offers a Ruby interface to the ABNER Named Entity Recognition package,
# written in Java. Abner[http://www.cs.wisc.edu/~bsettles/abner/].
class Abner

  @@JFile   = Rjb::import('java.io.File')
  @@Tagger  = Rjb::import('abner.Tagger')
  @@Trainer = Rjb::import('abner.Trainer')

  # If modelfile is given, a custom trained model is used; otherwise, the
  # default BioCreative model is used.
  def initialize(modelfile = nil)
    if modelfile.nil?
      @tagger = @@Tagger.new(@@Tagger.BIOCREATIVE)
    else
      @tagger = @@Tagger.new(@@JFile.new(modelfile))
    end
  end

  # Given a chunk of text, find all the mentions appearing in it. All
  # mentions found are returned, regardless of type, to stay consistent with
  # the rest of the NER packages in Rbbt.
  def extract(text)
    res = @tagger.getEntities(text)
    types   = res[1]
    strings = res[0]

    return strings.collect{|s| s.to_s}
  end

end
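For reference, a usage sketch (the input sentence is made up; ABNER must have been installed by the gem's install scripts):

require 'rbbt/ner/abner'

ner = Abner.new   # no modelfile selects the default BioCreative model
ner.extract("The expression of p53 was measured")  # => array of mention strings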
data/lib/rbbt/ner/banner.rb
ADDED
@@ -0,0 +1,73 @@
require 'rbbt'
require 'rjb'

# Offers a Ruby interface to the Banner Named Entity Recognition package,
# written in Java. Banner[http://banner.sourceforge.net/].
class Banner

  @@JFile                    = Rjb::import('java.io.File')
  @@SimpleTokenizer          = Rjb::import('banner.tokenization.SimpleTokenizer')
  @@CRFTagger                = Rjb::import('banner.tagging.CRFTagger')
  @@ParenthesisPostProcessor = Rjb::import('banner.processing.ParenthesisPostProcessor')
  @@HeppleTagger             = Rjb::import('dragon.nlp.tool.HeppleTagger')
  @@Sentence                 = Rjb::import('banner.Sentence')
  @@EngLemmatiser            = Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')

  # The parameters are set to default values; the only one that one might
  # want to change is modelfile, to point to a custom trained model.
  def initialize(modelfile = File.join(Rbbt.datadir, 'third_party/banner/gene_model.bin'),
                 lemmadir  = File.join(Rbbt.datadir, 'third_party/banner/nlpdata/lemmatiser'),
                 taggerdir = File.join(Rbbt.datadir, 'third_party/banner/nlpdata/tagger'))

    @tokenizer = @@SimpleTokenizer.new

    model  = @@JFile.new(modelfile)
    lemma  = @@EngLemmatiser.new(lemmadir, false, true)
    helper = @@HeppleTagger.new(taggerdir)

    # The next lines are needed to avoid collisions with metaprogramming
    # that could define a load method (ActiveSupport in particular). RJB
    # seems to dispatch to Java on method_missing.
    class << @@CRFTagger
      if method_defined? :load
        undef_method :load
      end
    end

    @tagger  = @@CRFTagger.load(model, lemma, helper)
    @parenPP = @@ParenthesisPostProcessor.new
  end

  # Returns an array with the mentions found in the provided piece of text.
  def extract(text)
    text.gsub!(/\n/, ' ')
    text.gsub!(/\|/, '/') # The | character triggers an error
    sentence = @@Sentence.new(text)
    @tokenizer.tokenize(sentence)
    @tagger.tag(sentence)
    @parenPP.postProcess(sentence)
    tagged = sentence.getSGML

    res = tagged.scan(/<GENE>.*?<\/GENE>/).
      collect{|r|
        r.match(/<GENE>(.*?)<\/GENE>/)
        mention = $1
        mention.sub!(/^\s*/, '')
        mention.sub!(/\s*$/, '')
        mention
      }
    res
  end

end
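A parallel usage sketch, assuming the Banner model and NLP data files have been installed under Rbbt.datadir (the input sentence is made up):

require 'rbbt/ner/banner'

ner = Banner.new   # default gene model
ner.extract("The expression of BRCA1 was measured")  # => array of gene mentions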
data/lib/rbbt/ner/regexpNER.rb
ADDED
@@ -0,0 +1,62 @@
require 'rbbt/util/open'
require 'rbbt/util/misc'

class RegExpNER

  def self.match_re(text, res)
    res = [res] unless Array === res

    res.collect{|re|
      if text.match(re)
        $1
      else
        nil
      end
    }.compact
  end

  def self.build_re(names, ignorecase = true)
    names.compact.select{|n| n != ""}.
      sort{|a,b| b.length <=> a.length}.
      collect{|n|
        re = Regexp.quote(n).gsub(/\\?\s/,'\s+')
        # Compile case-insensitively only when requested
        if ignorecase
          /(?:^|[^\w])(#{ re })(?:$|[^\w])/i
        else
          /(?:^|[^\w])(#{ re })(?:$|[^\w])/
        end
      }
  end

  def initialize(lexicon, options = {})
    options[:flatten] = true
    options[:ignorecase] = true if options[:ignorecase].nil?
    options[:stopwords] = true if options[:stopwords].nil?

    data = Open.to_hash(lexicon, options)

    @index = {}
    data.each{|code, names|
      next if code.nil? || code == ""
      if options[:stopwords]
        names = names.select{|n|
          ! $stopwords.include?(options[:ignorecase] ? n.downcase : n)
        }
      end
      @index[code] = RegExpNER.build_re(names, options[:ignorecase])
    }
  end

  def match_hash(text)
    matches = {}
    @index.each{|code, re|
      RegExpNER.match_re(text, re).each{|match|
        matches[code] ||= []
        matches[code] << match
      }
    }
    matches
  end

  def match(text)
    match_hash(text).values.flatten
  end

end
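Finally, a usage sketch; 'lexicon.tsv' is a hypothetical tab-separated file mapping an entity code to its names, in the format expected by Open.to_hash:

require 'rbbt/ner/regexpNER'

ner = RegExpNER.new('lexicon.tsv')    # hypothetical lexicon path
ner.match_hash("some input text")     # => { code => [matched names] }
ner.match("some input text")          # => flat array of matched names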